# Notebook for Toronto Neighborhoods project

In [131]:
# imports
import pandas as pd
import numpy as np
import lxml
import requests
import os
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
#import matplotlib.cm as cm
#import matplotlib.colors as colors
import folium

### 1. Read table from wikipedia

In [2]:
# get table from wikipedia
import pandas as pd
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tables = pd.read_html(url)
neigh = tables[0]
neigh.rename(columns = {'Postal Code': 'PostalCode'}, inplace=True)

In [3]:
# subset to Borough != 'Not assigned'
mask = neigh.Borough != 'Not assigned'
neigh = neigh[mask].reset_index(drop=True)
neigh.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [4]:
# check for Neighbourhood == 'Not assigned'
print(neigh[neigh.Neighbourhood == 'Not assigned'].shape)
print('All Neighbourhoods assigned!')

(0, 3)
All Neighbourhoods assigned!


In [5]:
# check for duplicate PostalCodes
print(neigh['PostalCode'].value_counts().max())
print('No duplicate PostalCodes found!')

1
No duplicate PostalCodes found!


In [6]:
# print the dataframe shape
print(neigh.shape)

(103, 3)


### 2. Add geocoding info to the neighborhoods

In [7]:
# note: the geocoder package did not work so will use the CSV file
geo = pd.read_csv('..\..\data\Geospatial_Coordinates.csv')
geo.rename(columns = {'Postal Code': 'PostalCode'}, inplace=True)
neigh_geo = neigh.merge(geo, on='PostalCode')
neigh_geo.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [8]:
# subset to boroughs containing "Toronto"
mask = neigh_geo.Borough.str.contains('Toronto')
toronto_geo = neigh_geo[mask].reset_index(drop=True)
print(toronto_geo.shape)
toronto_geo.head()

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


### 3. Analyze and cluster neighborhoods

In [9]:
# set foursquare credentials and version
CLIENT_ID = os.getenv('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.getenv('FOURSQUARE_CLIENT_SECRET')
VERSION = '20180605'
LIMIT = 100

In [10]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 
                  'PostalCode Latitude', 
                  'PostalCode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
toronto_venues = getNearbyVenues(names=toronto_geo['PostalCode'],
                                 latitudes=toronto_geo['Latitude'],
                                 longitudes=toronto_geo['Longitude']
                                )
print("All done!")

All done!


In [12]:
print(toronto_venues.shape)
toronto_venues.head()

(1594, 7)


Unnamed: 0,PostalCode,PostalCode Latitude,PostalCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,M5A,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
2,M5A,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [13]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 233 uniques categories.


In [30]:
# derive features with one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add postal code column back to dataframe
toronto_onehot['PostalCode'] = toronto_venues['PostalCode'] 

# move postal code column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

postal_code_counts = toronto_onehot.PostalCode.value_counts().reset_index()
postal_code_counts.columns = ['PostalCode', 'Count']
print(postal_code_counts[['Count']].quantile([0,0.05,0.1,0.25,0.33,0.5,0.67,0.75,0.9,0.95,1]))
toronto_hot_counts = toronto_onehot.merge(postal_code_counts, on='PostalCode')

print(toronto_hot_counts.shape)
toronto_hot_counts.head()

       Count
0.00    2.00
0.05    2.90
0.10    4.00
0.25   16.00
0.33   18.54
0.50   31.00
0.67   43.92
0.75   60.00
0.90  100.00
0.95  100.00
1.00  100.00
(1594, 235)


Unnamed: 0,PostalCode,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio,Count
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,45
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,45
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,45
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,45
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,45


In [105]:
# group by postal code and take mean by venue type
toronto_grouped = toronto_hot_counts.groupby('PostalCode').mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped.head()

(39, 235)


Unnamed: 0,PostalCode,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio,Count
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,5
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,...,0.02381,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.02381,42
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054054,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027027,37
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [33]:
# define function to sort venue type by frequency of occurence
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [122]:
# create the dataframe containint top ten venue types for each postal code
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Count,Health Food Store,Pub,Trail,Neighborhood,Park,Comic Shop,Department Store,Electronics Store,Eastern European Restaurant
1,M4K,Count,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Restaurant,Ice Cream Shop,Dessert Shop,Pub,Caribbean Restaurant
2,M4L,Count,Fast Food Restaurant,Movie Theater,Sandwich Place,Restaurant,Pub,Pizza Place,Pet Store,Park,Liquor Store
3,M4M,Count,Coffee Shop,Brewery,Gastropub,Café,Bakery,American Restaurant,Neighborhood,Cheese Shop,Clothing Store
4,M4N,Count,Park,Swim School,Bus Line,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


In [106]:
# standardize values
mask = toronto_grouped.Count > 19
X = toronto_grouped[mask].drop('PostalCode', 1)
X = np.nan_to_num(X)
cluster_dataset = StandardScaler().fit_transform(X)

In [107]:
# determine best number of clusters
toronto_grouped_clustering = cluster_dataset
n_clus = [2,3,4,5,6,7]

for k in n_clus:
    print("K: ",k)
    # run k-means clustering
    print("Clusters:")
    kmeans = KMeans(n_clusters=k).fit(toronto_grouped_clustering)
    # check cluster labels generated for each row in the dataframe
    print(pd.DataFrame(kmeans.labels_).value_counts())
    # Calculate silhouette_score
    print("Silhouette: ",silhouette_score(toronto_grouped_clustering, kmeans.labels_),"\n")

K:  2
Clusters:
1    16
0     9
dtype: int64
Silhouette:  0.03627715668712578 

K:  3
Clusters:
0    14
2    10
1     1
dtype: int64
Silhouette:  0.03278083533583258 

K:  4
Clusters:
3    11
1     5
0     5
2     4
dtype: int64
Silhouette:  0.028851001591830433 

K:  5
Clusters:
1    12
3     5
2     4
0     3
4     1
dtype: int64
Silhouette:  0.03243467216201001 

K:  6
Clusters:
2    9
0    7
5    6
4    1
3    1
1    1
dtype: int64
Silhouette:  0.03378165590135469 

K:  7
Clusters:
3    9
6    4
5    4
0    3
4    2
1    2
2    1
dtype: int64
Silhouette:  0.03623436313520548 



In [108]:
# run k-means clustering
kmeans = KMeans(n_clusters=4).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print(pd.DataFrame(kmeans.labels_).value_counts())

# Calculate silhouette_score
print("Silhouette: ",silhouette_score(toronto_grouped_clustering, kmeans.labels_),"\n")

1    14
0     9
3     1
2     1
dtype: int64
Silhouette:  0.04339586616155145 



In [129]:
# add clustering labels
labeled = neighborhoods_venues_sorted[mask]
labeled.head()
labeled.insert(0, 'Cluster Labels', kmeans.labels_)
labeled.head()

Unnamed: 0,Cluster Labels,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,M4E,Count,Health Food Store,Pub,Trail,Neighborhood,Park,Comic Shop,Department Store,Electronics Store,Eastern European Restaurant
1,1,M4K,Count,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Restaurant,Ice Cream Shop,Dessert Shop,Pub,Caribbean Restaurant
2,1,M4L,Count,Fast Food Restaurant,Movie Theater,Sandwich Place,Restaurant,Pub,Pizza Place,Pet Store,Park,Liquor Store
3,1,M4M,Count,Coffee Shop,Brewery,Gastropub,Café,Bakery,American Restaurant,Neighborhood,Cheese Shop,Clothing Store
5,1,M4P,Count,Hotel,Park,Department Store,Sandwich Place,Breakfast Spot,Food & Drink Shop,Pizza Place,Gym / Fitness Center,Gastropub


In [130]:
# add clustering labels to toronto_geo
temp = toronto_geo.merge(postal_code_counts, on='PostalCode')
mask = temp.Count > 19
toronto_geo_19 = temp[mask]
toronto_geo_19_merged = toronto_geo_19.merge(labeled, on='PostalCode')
toronto_geo_19_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Count,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,45,1,Count,Coffee Shop,Park,Pub,Bakery,Breakfast Spot,Theater,Restaurant,Café,Yoga Studio
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,31,1,Count,Coffee Shop,Sushi Restaurant,Distribution Center,Portuguese Restaurant,Park,Mexican Restaurant,Japanese Restaurant,Italian Restaurant,Yoga Studio
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,100,0,Count,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Japanese Restaurant,Italian Restaurant,Bubble Tea Shop,Fast Food Restaurant
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,55,0,Count,Coffee Shop,Cocktail Bar,Cheese Shop,Restaurant,Beer Bar,Seafood Restaurant,Farmers Market,Bakery,Japanese Restaurant
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,61,0,Count,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Salad Place,Bubble Tea Shop,Burger Joint,Juice Bar,Ice Cream Shop


In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### 4. Examine Clusters

In [None]:
# cluster 0
mask0 = manhattan_merged['Cluster Labels'] == 0
manhattan_merged.loc[mask0, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

In [None]:
# cluster 1
mask1 = manhattan_merged['Cluster Labels'] == 1
manhattan_merged.loc[mask1, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

In [None]:
# cluster 2
mask2 = manhattan_merged['Cluster Labels'] == 2
manhattan_merged.loc[mask2, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

In [None]:
# cluster 3
manhattan_merged['Cluster Labels'] == 3
manhattan_merged.loc[mask3, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

In [None]:
# cluster 4
manhattan_merged['Cluster Labels'] == 4
manhattan_merged.loc[mask4, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]