In [1]:
import pandas as pd

# Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Parse every HTML table found on the page, using the first row of each as its header.
tables = pd.read_html(io=url, header=0)

In [2]:
# The postcode / borough / neighbourhood table is the first one parsed from the page.
df = tables[0]
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
import numpy as np
# Turn the 'Not assigned' placeholder into a real missing value.
# Reassign instead of using inplace=True: `df` is the same object as `tables[0]`,
# so an in-place replace would silently rewrite the raw parsed table as well.
df = df.replace('Not assigned', np.nan)

#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [4]:
# Keep each neighbourhood that is present; where it is missing, take the borough
# from the same row instead.
has_neighbourhood = df['Neighbourhood'].notna()
df['Neighbourhood'] = df['Neighbourhood'].where(has_neighbourhood, df['Borough'])
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Ignore cells with a borough that is Not assigned.

In [5]:
# Drop the rows whose borough is missing. Reassigning (rather than inplace=True)
# leaves the previously displayed frame untouched and keeps the cell chainable.
df = df.dropna(subset=['Borough'])

#### Combine neighbourhoods with the same postcode.

In [6]:
def _join_neighbourhoods(names):
    """Concatenate the neighbourhood names of one group into a ', '-separated string."""
    return names.str.cat(sep=', ')


# One output row per (Postcode, Borough) pair, kept in order of first appearance.
postcode_groups = df.groupby(['Postcode', 'Borough'], as_index=False, sort=False)
df = postcode_groups.agg({'Neighbourhood': _join_neighbourhoods})
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [7]:
# Load the postcode coordinates and rename the key column so that it matches
# the 'Postcode' column of `df`.
df_latlng = (
    pd.read_csv("http://cocl.us/Geospatial_data")
    .rename(columns={'Postal Code': 'Postcode'})
)

In [8]:
# Look up latitude/longitude by postcode and attach them to every row of `df`.
latlng_by_postcode = df_latlng.set_index('Postcode')
df_toronto = df.join(latlng_by_postcode, on='Postcode', how='left')
df_toronto.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [9]:
# Number of rows (one per postcode) in each borough, largest first.
df_toronto.loc[:, 'Borough'].value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           11
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Queen's Park         2
Mississauga          1
Name: Borough, dtype: int64

#### North York has the highest number of postcode areas (each row is one postcode, which may combine several neighbourhoods).

## Explore North York

In [10]:
# Foursquare credentials are read from the environment so that they never have to
# be typed into the notebook (and scrubbed again before uploading to GitHub).
# Both fall back to '' when the variables are not set.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
# Sent as the `v` query parameter of every Foursquare request.
VERSION = '20191201'

In [11]:
# Keep only the rows whose borough is North York.
in_north_york = df_toronto['Borough'].eq('North York')
df_ny = df_toronto.loc[in_north_york]
df_ny.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
7,M3B,North York,Don Mills North,43.745906,-79.352188
10,M6B,North York,Glencairn,43.709577,-79.445073


#### Function that fetches the nearby Foursquare venues for every neighbourhood passed to it (used below for North York).

In [12]:
import requests

def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    """Query the Foursquare ``venues/explore`` endpoint once per location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in parallel with ``zip``; each triple is one location to query.
    radius : int, default 500
        Forwarded as the ``radius`` query parameter.
    LIMIT : int, default 100
        Forwarded as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but still has these columns) when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status code.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # a timeout keeps one stalled call from hanging the whole notebook, and
        # raise_for_status reports API failures directly instead of as a KeyError
        # on the missing 'groups' key further down
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # tolerate a venue that carries no category instead of raising IndexError
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing `columns` here (rather than assigning them afterwards) also works
    # when `rows` is empty
    return pd.DataFrame(rows, columns=columns)

#### Run the above function on each neighbourhood and create a new dataframe called *ny_venues*

In [13]:
# Query venues once per North York row, using its name and coordinates.
ny_locations = df_ny[['Neighbourhood', 'Latitude', 'Longitude']]
ny_venues = getNearbyVenues(
    ny_locations['Neighbourhood'],
    ny_locations['Latitude'],
    ny_locations['Longitude'],
)
ny_venues.head(n=5)

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [14]:
# Count the distinct venue categories returned across all neighbourhoods.
n_categories = len(ny_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(n_categories))

There are 108 uniques categories.


In [15]:
# One indicator column per venue category, named after the bare category.
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# Append the neighbourhood of each venue as the last column ...
ny_onehot['Neighbourhood'] = ny_venues['Neighbourhood']

# ... then rotate that last column to the front.
fixed_columns = ny_onehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
ny_onehot = ny_onehot[fixed_columns]

ny_onehot.head(n=5)

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,...,Tailor Shop,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Group rows by neighbourhood and calculate the mean of the frequency of occurrence of each category

In [16]:
# Mean of every indicator column per neighbourhood, with 'Neighbourhood' kept as
# an ordinary column rather than as the index.
ny_grouped = ny_onehot.groupby('Neighbourhood', as_index=False).mean()
ny_grouped.head(n=5)

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,...,Tailor Shop,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CFB Toronto, Downsview East",0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Don Mills North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first entry of `row` is skipped (callers pass rows whose first entry is
    the neighbourhood name); the remaining entries are ranked from largest to
    smallest and the index labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in range(num_top_venues):
    # ranks 1-3 take 'st'/'nd'/'rd'; every later rank takes 'th'.
    # An explicit check replaces the former bare `except:`, which would also
    # have hidden any unrelated error raised inside the loop.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = ny_grouped['Neighbourhood']

# fill in the ranked venue categories, one neighbourhood (row) at a time
for ind in range(ny_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Pharmacy,Shopping Mall,Frozen Yogurt Shop,Fried Chicken Joint,Fast Food Restaurant,Diner,Deli / Bodega,Middle Eastern Restaurant,Gas Station
1,Bayview Village,Chinese Restaurant,Bank,Café,Japanese Restaurant,Women's Store,Fast Food Restaurant,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Pub,Butcher,Sandwich Place,Juice Bar,Café,Indian Restaurant,Pizza Place,Pharmacy
3,"CFB Toronto, Downsview East",Airport,Park,Women's Store,Fast Food Restaurant,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
4,Don Mills North,Gym / Fitness Center,Caribbean Restaurant,Café,Japanese Restaurant,Women's Store,Fast Food Restaurant,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant


In [18]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

## Clustering neighbourhoods

In [19]:
# set number of clusters
kclusters = 5

# k-means only takes the numeric category frequencies, not the neighbourhood name
ny_grouped_clustering = ny_grouped.drop('Neighbourhood', axis=1)

# run k-means clustering
# n_init is pinned because the library default differs between scikit-learn
# releases; together with random_state this keeps the labels reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(ny_grouped_clustering)

# check labels
kmeans.labels_[0:10]

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 4])

In [20]:
# add clustering labels
# `insert` raises ValueError if the column already exists, so re-running this cell
# used to fail; overwrite the column when it is already there.
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge neighbourhoods_venues_sorted with df_ny to add latitude/longitude for each neighborhood
# (rows of df_ny with no match in neighbourhoods_venues_sorted get NaN in the added columns)
ny_merged = df_ny.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

ny_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,Park,Food & Drink Shop,Women's Store,Empanada Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,1.0,Pizza Place,Coffee Shop,Hockey Arena,Portuguese Restaurant,French Restaurant,Intersection,Financial or Legal Service,Dog Run,Construction & Landscaping,Convenience Store
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,1.0,Clothing Store,Women's Store,Shoe Store,Boutique,Coffee Shop,Furniture / Home Store,Miscellaneous Shop,Accessories Store,Vietnamese Restaurant,Tea Room
7,M3B,North York,Don Mills North,43.745906,-79.352188,1.0,Gym / Fitness Center,Caribbean Restaurant,Café,Japanese Restaurant,Women's Store,Fast Food Restaurant,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant
10,M6B,North York,Glencairn,43.709577,-79.445073,1.0,Park,Playground,Bakery,Pub,Japanese Restaurant,Electronics Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega


In [21]:
# Neighbourhoods with no row in the venue table have no cluster label after the
# join; give them the first label past the k-means range (kclusters).
# Assign the result back: an in-place fillna on a selected column is chained
# assignment and is not guaranteed to update `ny_merged`.
ny_merged['Cluster Labels'] = ny_merged['Cluster Labels'].fillna(kclusters)

In [22]:
ny_merged['Cluster Labels'] = ny_merged['Cluster Labels'].astype('int')

# create map
latitude = 43.7615
longitude = -79.4111
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
# One colour per label value that occurs, including the extra label given to
# neighbourhoods without venue data. Indexing by `cluster - 1` into only
# `kclusters` colours made label 0 and that extra label share a colour.
n_colors = max(kclusters, int(ny_merged['Cluster Labels'].max()) + 1)
colors_array = cm.rainbow(np.linspace(0, 1, n_colors))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(ny_merged['Latitude'], ny_merged['Longitude'], ny_merged['Neighbourhood'], ny_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

## Visualize clustering results.
### Note that the map doesn't render in Jupyter Notebook on Github. The html version is in the same folder if you want to check it out.

In [23]:
# `display` is imported from its public location; the IPython.core.display path
# is deprecated.
from IPython.display import display

# write a standalone HTML copy of the map next to the notebook, then show it inline
filename = 'cluster_ny'
map_clusters.save(f'{filename}.html')

display(map_clusters)

#### Cluster 1 is the biggest cluster. Let's take a deeper look into it.

In [30]:
# Rows labelled 1: neighbourhood name (column 2) plus every column from position 5
# onward (cluster label and ranked venue categories).
ny_merged.loc[ny_merged['Cluster Labels'].eq(1), ny_merged.columns[[2, *range(5, ny_merged.shape[1])]]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Victoria Village,1,Pizza Place,Coffee Shop,Hockey Arena,Portuguese Restaurant,French Restaurant,Intersection,Financial or Legal Service,Dog Run,Construction & Landscaping,Convenience Store
3,"Lawrence Heights, Lawrence Manor",1,Clothing Store,Women's Store,Shoe Store,Boutique,Coffee Shop,Furniture / Home Store,Miscellaneous Shop,Accessories Store,Vietnamese Restaurant,Tea Room
7,Don Mills North,1,Gym / Fitness Center,Caribbean Restaurant,Café,Japanese Restaurant,Women's Store,Fast Food Restaurant,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant
10,Glencairn,1,Park,Playground,Bakery,Pub,Japanese Restaurant,Electronics Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
13,"Flemingdon Park, Don Mills South",1,Gym,Asian Restaurant,Coffee Shop,Beer Store,Sporting Goods Shop,Fast Food Restaurant,Italian Restaurant,Japanese Restaurant,Discount Store,Dim Sum Restaurant
27,Hillcrest Village,1,Golf Course,Pool,Mediterranean Restaurant,Dog Run,Women's Store,Electronics Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
28,"Bathurst Manor, Downsview North, Wilson Heights",1,Coffee Shop,Pharmacy,Shopping Mall,Frozen Yogurt Shop,Fried Chicken Joint,Fast Food Restaurant,Diner,Deli / Bodega,Middle Eastern Restaurant,Gas Station
33,"Fairview, Henry Farm, Oriole",1,Clothing Store,Fast Food Restaurant,Coffee Shop,Women's Store,Bakery,Japanese Restaurant,Kids Store,Food Court,Toy / Game Store,Jewelry Store
34,"Northwood Park, York University",1,Coffee Shop,Caribbean Restaurant,Bar,Massage Studio,Women's Store,Empanada Restaurant,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant
39,Bayview Village,1,Chinese Restaurant,Bank,Café,Japanese Restaurant,Women's Store,Fast Food Restaurant,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant


#### Neighbourhoods in Cluster 1 are likely populated areas where people live and work.

In [31]:
# Rows labelled 0: neighbourhood name (column 2) plus every column from position 5
# onward (cluster label and ranked venue categories).
ny_merged.loc[ny_merged['Cluster Labels'].eq(0), ny_merged.columns[[2, *range(5, ny_merged.shape[1])]]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,0,Park,Food & Drink Shop,Women's Store,Empanada Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant
40,"CFB Toronto, Downsview East",0,Airport,Park,Women's Store,Fast Food Restaurant,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner


#### Cluster 0 features parks and an airport.

In [32]:
# Rows labelled 2: neighbourhood name (column 2) plus every column from position 5
# onward (cluster label and ranked venue categories).
ny_merged.loc[ny_merged['Cluster Labels'].eq(2), ny_merged.columns[[2, *range(5, ny_merged.shape[1])]]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
45,"Silver Hills, York Mills",2,Cafeteria,Women's Store,Fast Food Restaurant,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner,Discount Store


#### Cluster 2 is a commercial area full of stores and restaurants.

In [33]:
# Rows labelled 3: neighbourhood name (column 2) plus every column from position 5
# onward (cluster label and ranked venue categories).
ny_merged.loc[ny_merged['Cluster Labels'].eq(3), ny_merged.columns[[2, *range(5, ny_merged.shape[1])]]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
50,Humber Summit,3,Pizza Place,Empanada Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner


#### Cluster 3 features a concert hall.

In [34]:
# Rows labelled 4: neighbourhood name (column 2) plus every column from position 5
# onward (cluster label and ranked venue categories).
ny_merged.loc[ny_merged['Cluster Labels'].eq(4), ny_merged.columns[[2, *range(5, ny_merged.shape[1])]]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
57,"Emery, Humberlea",4,Baseball Field,Women's Store,Financial or Legal Service,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner,Discount Store


#### Lastly, Cluster 4 is the sports center of North York.