# Importing libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# display every column and every row of a dataframe instead of truncating
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# NOTE(review): json is not referenced in the cells shown in this notebook
import json # library to handle JSON files

# IPython shell escape: runs the conda install command each time this cell executes
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): json_normalize is not referenced in the cells shown in this notebook
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# IPython shell escape: installs folium pinned to version 0.5.0
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Libraries imported.


# Loading data

In [2]:
# CSV with one row per postal code: borough, neighborhood and coordinates.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
csv_path = r'C:\Users\WIN\Desktop\py_project2.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# slicing the original dataframe and creating a new dataframe of the York data.

In [7]:
# Boolean mask marking the rows whose borough is York.
is_york = df['Borough'] == 'York'

# Keep only those rows and renumber them from zero.
york_data = df.loc[is_york].reset_index(drop=True)
york_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
1,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512
2,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",43.691116,-79.476013
3,M6N,York,"Runnymede, The Junction North",43.673185,-79.487262
4,M9N,York,Weston,43.706876,-79.518188


# get the geographical coordinates of York.

In [8]:
address = 'York, Toronto'

# user_agent identifies this application to the Nominatim service.
geolocator = Nominatim(user_agent="to_explorer")

# Allow the public geocoding service more time than geopy's short default
# before giving up on the request.
location = geolocator.geocode(address, timeout=10)

# geocode() returns None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

# Center point reused by the folium maps further down.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of York are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of York are 43.67910515, -79.49118414007154.


# visualizing York and the neighborhoods in it.

In [9]:
# create map of York using latitude and longitude values
map_york = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per neighborhood, with the neighborhood name as popup
for lat, lng, neighborhood in zip(york_data['Latitude'], york_data['Longitude'], york_data['Neighborhood']):
    # keep the name and the Popup object in separate variables rather than
    # rebinding a single name from str to Popup inside the loop
    popup = folium.Popup(neighborhood, parse_html=True)

    # parse_html is a Popup option, not a marker option, so it is only
    # passed to the Popup above
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_york)

# last expression of the cell: render the map
map_york

# Define Foursquare Credentials and Version

In [23]:
import os

# Read the Foursquare credentials from the environment so real keys never have
# to be written into the notebook; the original placeholders are used when the
# variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'ID') # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'SECRET') #  Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Do not echo the secret itself: cell output is saved together with the notebook.
print('CLIENT_SECRET:' + ('<set>' if CLIENT_SECRET else '<missing>'))

Your credentails:
CLIENT_ID: ID
CLIENT_SECRET:SECRET


# Exploring neighborhoods in York: getting up to 100 venues within a radius of 500 meters of each neighborhood.

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500,LIMIT=100):
    """Fetch the venues Foursquare returns around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences with the neighborhood name and its coordinates;
        they are consumed together with ``zip``.
    radius : int, default 500
        Search radius sent to the API as ``radius`` (meters, per the
        notebook's description of this step).
    LIMIT : int, default 100
        Maximum number of venues requested per neighborhood (``limit``).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when no venue
        is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # progress output: one line per neighborhood queried
        print(name)

        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps a stalled connection from
        # hanging the notebook, and error statuses are raised explicitly
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        # a successful response without any group simply means "no venues"
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue may come without categories; record None instead of
            # failing on the [0] lookup
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works for zero rows,
    # where assigning .columns afterwards raises a length-mismatch error
    return pd.DataFrame(venue_rows, columns=column_names)

In [12]:
# Query Foursquare once per York neighborhood; the helper prints each
# neighborhood name as it is processed.
york_venues = getNearbyVenues(
    york_data['Neighborhood'],
    york_data['Latitude'],
    york_data['Longitude'],
)

Humewood-Cedarvale
Caledonia-Fairbanks
Del Ray, Mount Dennis, Keelsdale and Silverthorn
Runnymede, The Junction North
Weston


# checking how many venues were returned for each neighborhood.

In [13]:
# Count the non-null entries of every column within each neighborhood.
venues_by_neighborhood = york_venues.groupby('Neighborhood')
venues_by_neighborhood.count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Caledonia-Fairbanks,4,4,4,4,4,4
"Del Ray, Mount Dennis, Keelsdale and Silverthorn",4,4,4,4,4,4
Humewood-Cedarvale,4,4,4,4,4,4
"Runnymede, The Junction North",3,3,3,3,3,3
Weston,2,2,2,2,2,2


# find out how many unique categories can be curated from all the returned venues

In [14]:
# Distinct venue categories across all returned venues.
distinct_categories = york_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(distinct_categories)))

There are 15 uniques categories.


# Analyze Each Neighborhood

In [15]:
# one hot encoding
york_onehot = pd.get_dummies(york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
york_onehot['Neighborhood'] = york_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [york_onehot.columns[-1]] + list(york_onehot.columns[:-1])
york_onehot = york_onehot[fixed_columns]

york_onehot.head()

Unnamed: 0,Neighborhood,Bar,Brewery,Bus Line,Convenience Store,Discount Store,Field,Grocery Store,Hockey Arena,Park,Playground,Pool,Restaurant,Sandwich Place,Trail,Women's Store
0,Humewood-Cedarvale,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,Humewood-Cedarvale,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,Humewood-Cedarvale,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,Humewood-Cedarvale,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,Caledonia-Fairbanks,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


# Group rows by neighborhood, taking the mean of the frequency of occurrence of each category

In [16]:
# Average the one-hot indicators within each neighborhood, then turn the
# neighborhood index back into an ordinary column.
per_neighborhood = york_onehot.groupby('Neighborhood')
york_grouped = per_neighborhood.mean().reset_index()
york_grouped

Unnamed: 0,Neighborhood,Bar,Brewery,Bus Line,Convenience Store,Discount Store,Field,Grocery Store,Hockey Arena,Park,Playground,Pool,Restaurant,Sandwich Place,Trail,Women's Store
0,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.25,0.0,0.0,0.0,0.25
1,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0
2,Humewood-Cedarvale,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.25,0.0
3,"Runnymede, The Junction North",0.0,0.333333,0.333333,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Weston,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0


# print each neighborhood along with the top 5 most common venues

In [17]:
num_top_venues = 5

for hood in york_grouped['Neighborhood']:
    print("----"+hood+"----")

    # Turn this neighborhood's single row into a two-column table:
    # category name in 'venue', its mean frequency in 'freq'.
    hood_row = york_grouped[york_grouped['Neighborhood'] == hood]
    freq_table = hood_row.T.reset_index()
    freq_table.columns = ['venue','freq']

    # Skip the first entry (the neighborhood name itself), make the
    # frequencies numeric, round them and rank from most to least frequent.
    ranked = (
        freq_table.iloc[1:]
        .astype({'freq': float})
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(ranked.head(num_top_venues))
    print('\n')

----Caledonia-Fairbanks----
           venue  freq
0           Park  0.50
1           Pool  0.25
2  Women's Store  0.25
3            Bar  0.00
4        Brewery  0.00


----Del Ray, Mount Dennis, Keelsdale and Silverthorn----
            venue  freq
0             Bar  0.25
1  Discount Store  0.25
2      Restaurant  0.25
3  Sandwich Place  0.25
4         Brewery  0.00


----Humewood-Cedarvale----
          venue  freq
0         Field  0.25
1  Hockey Arena  0.25
2    Playground  0.25
3         Trail  0.25
4           Bar  0.00


----Runnymede, The Junction North----
               venue  freq
0            Brewery  0.33
1           Bus Line  0.33
2      Grocery Store  0.33
3                Bar  0.00
4  Convenience Store  0.00


----Weston----
               venue  freq
0  Convenience Store   0.5
1               Park   0.5
2                Bar   0.0
3            Brewery   0.0
4           Bus Line   0.0




# Put each neighborhood's most common venues into a pandas dataframe.

In [18]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped; the remaining values are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [19]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # the first three ranks take their suffix from `indicators`, every later
    # rank uses "th"; an explicit bounds check replaces the bare `except:`
    # that would also have hidden unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = york_grouped['Neighborhood']

# fill every row with that neighborhood's categories, most frequent first
for ind in np.arange(york_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(york_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Caledonia-Fairbanks,Park,Women's Store,Pool,Trail,Sandwich Place,Restaurant,Playground,Hockey Arena,Grocery Store,Field
1,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",Sandwich Place,Restaurant,Discount Store,Bar,Women's Store,Trail,Pool,Playground,Park,Hockey Arena
2,Humewood-Cedarvale,Trail,Playground,Hockey Arena,Field,Women's Store,Sandwich Place,Restaurant,Pool,Park,Grocery Store
3,"Runnymede, The Junction North",Grocery Store,Bus Line,Brewery,Women's Store,Trail,Sandwich Place,Restaurant,Pool,Playground,Park
4,Weston,Park,Convenience Store,Women's Store,Trail,Sandwich Place,Restaurant,Pool,Playground,Hockey Arena,Grocery Store


# Finally clustering the neighborhoods.

In [20]:
# set number of clusters
kclusters = 5

york_grouped_clustering = york_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 4, 1, 2, 0])

In [21]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

york_merged = york_data

# merge york_grouped with york_data to add latitude/longitude for each neighborhood
york_merged = york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

york_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M6C,York,Humewood-Cedarvale,43.693781,-79.428191,1,Trail,Playground,Hockey Arena,Field,Women's Store,Sandwich Place,Restaurant,Pool,Park,Grocery Store
1,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512,3,Park,Women's Store,Pool,Trail,Sandwich Place,Restaurant,Playground,Hockey Arena,Grocery Store,Field
2,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",43.691116,-79.476013,4,Sandwich Place,Restaurant,Discount Store,Bar,Women's Store,Trail,Pool,Playground,Park,Hockey Arena
3,M6N,York,"Runnymede, The Junction North",43.673185,-79.487262,2,Grocery Store,Bus Line,Brewery,Women's Store,Trail,Sandwich Place,Restaurant,Pool,Playground,Park
4,M9N,York,Weston,43.706876,-79.518188,0,Park,Convenience Store,Women's Store,Trail,Sandwich Place,Restaurant,Pool,Playground,Hockey Arena,Grocery Store


In [22]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighborhood'], york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters