In [92]:
import numpy as np  # useful for many scientific computing in Python
import pandas as pd # primary data structure library

# Reading Toronto postal code file data into a dataframe

In [93]:
df_tor = pd.read_excel('Toronto_postalcodes.xlsx') 

# Ignore cells with a borough that is Not assigned.

In [94]:
df_tor = df_tor[df_tor.Neighborhood.notnull()]

# Displaying dataframe

In [95]:
df_tor.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


# Print the number of rows

In [96]:
df_tor.shape

(103, 3)

# Read file with coordinates into a dataframe

In [97]:
df_cor = pd.read_csv('Geospatial_Coordinates.csv')
df_cor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Add Latitude & longitude columns to Toronto Dataframe

In [98]:
df_tor['Latitude'] =""

In [99]:
df_tor['Longitude'] =""

In [100]:
df_tor = df_tor.set_index("Postal code").sort_index()

In [101]:
df_tor.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Postal code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,Malvern / Rouge,,
M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,,
M1E,Scarborough,Guildwood / Morningside / West Hill,,
M1G,Scarborough,Woburn,,
M1H,Scarborough,Cedarbrae,,


In [102]:
df_cor = df_cor.set_index("Postal Code").sort_index()

In [103]:
df_cor.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [104]:
df_tor['Latitude'] = np.where(df_tor.index == df_cor.index, df_cor['Latitude'], "")

In [105]:
df_tor['Longitude'] = np.where(df_tor.index == df_cor.index, df_cor['Longitude'], "")

In [109]:
df_tor.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Postal code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,Malvern / Rouge,43.8066863,-79.19435340000001
M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.7845351,-79.16049709999999
M1E,Scarborough,Guildwood / Morningside / West Hill,43.7635726,-79.1887115
M1G,Scarborough,Woburn,43.7709921,-79.21691740000001
M1H,Scarborough,Cedarbrae,43.773136,-79.23947609999999


In [113]:
df_tor = df_tor.reset_index()

In [114]:
df_tor.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.8066863,-79.19435340000001
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.7845351,-79.16049709999999
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.21691740000001
4,M1H,Scarborough,Cedarbrae,43.773136,-79.23947609999999


# Filter neighbourhood by Toronto

In [135]:
df_tor1 = df_tor[df_tor.Borough.str.contains('Toronto',case=False)]
df_tor1.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.67635739999999,-79.2930312
41,M4K,East Toronto,The Danforth West / Riverdale,43.6795571,-79.352188
42,M4L,East Toronto,India Bazaar / The Beaches West,43.6689985,-79.31557159999998
43,M4M,East Toronto,Studio District,43.6595255,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.7280205,-79.3887901


# Import all necessary libraries

In [136]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [142]:
df_tor1.groupby('Borough').count()

Unnamed: 0_level_0,Postal code,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
West Toronto,6,6,6,6


In [155]:
# one hot encoding
df_onehot = pd.get_dummies(df_tor1['Neighborhood'], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
df_onehot['Borough'] = df_tor1['Borough']

# move neighborhood column to the first column
fixed_columns = [df_onehot.columns[-1]] + list(df_onehot.columns[:-1])
df_onehot = df_onehot[fixed_columns]

df_onehot.head()

Unnamed: 0,Borough,Berczy Park,Brockton / Parkdale Village / Exhibition Place,Business reply mail Processing Centre,CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport,Central Bay Street,Christie,Church and Wellesley,Commerce Court / Victoria Hotel,Davisville,Davisville North,Dufferin / Dovercourt Village,First Canadian Place / Underground city,Forest Hill North & West,Garden District / Ryerson,Harbourfront East / Union Station / Toronto Islands,High Park / The Junction South,India Bazaar / The Beaches West,Kensington Market / Chinatown / Grange Park,Lawrence Park,Little Portugal / Trinity,Moore Park / Summerhill East,North Toronto West,Parkdale / Roncesvalles,Queen's Park / Ontario Provincial Government,Regent Park / Harbourfront,Richmond / Adelaide / King,Rosedale,Roselawn,Runnymede / Swansea,St. James Town,St. James Town / Cabbagetown,Stn A PO Boxes,Studio District,Summerhill West / Rathnelly / South Hill / Forest Hill SE / Deer Park,The Annex / North Midtown / Yorkville,The Beaches,The Danforth West / Riverdale,Toronto Dominion Centre / Design Exchange,University of Toronto / Harbord
37,East Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
41,East Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
42,East Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
43,East Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
44,Central Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [157]:
df_grouped =df_onehot.groupby('Borough').mean().reset_index()
df_grouped

Unnamed: 0,Borough,Berczy Park,Brockton / Parkdale Village / Exhibition Place,Business reply mail Processing Centre,CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport,Central Bay Street,Christie,Church and Wellesley,Commerce Court / Victoria Hotel,Davisville,Davisville North,Dufferin / Dovercourt Village,First Canadian Place / Underground city,Forest Hill North & West,Garden District / Ryerson,Harbourfront East / Union Station / Toronto Islands,High Park / The Junction South,India Bazaar / The Beaches West,Kensington Market / Chinatown / Grange Park,Lawrence Park,Little Portugal / Trinity,Moore Park / Summerhill East,North Toronto West,Parkdale / Roncesvalles,Queen's Park / Ontario Provincial Government,Regent Park / Harbourfront,Richmond / Adelaide / King,Rosedale,Roselawn,Runnymede / Swansea,St. James Town,St. James Town / Cabbagetown,Stn A PO Boxes,Studio District,Summerhill West / Rathnelly / South Hill / Forest Hill SE / Deer Park,The Annex / North Midtown / Yorkville,The Beaches,The Danforth West / Riverdale,Toronto Dominion Centre / Design Exchange,University of Toronto / Harbord
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.0,0.0
1,Downtown Toronto,0.052632,0.0,0.0,0.052632,0.052632,0.052632,0.052632,0.052632,0.0,0.0,0.0,0.052632,0.0,0.052632,0.052632,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.052632,0.052632,0.052632,0.052632,0.0,0.0,0.052632,0.052632,0.052632,0.0,0.0,0.0,0.0,0.0,0.052632,0.052632
2,East Toronto,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.2,0.2,0.0,0.0
3,West Toronto,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
num_top_venues = 5

for hood in df_grouped['Borough']:
    print("----"+hood+"----")
    temp = df_grouped[df_grouped['Borough'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    #print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    #print('\n')

----Central Toronto----
----Downtown Toronto----
----East Toronto----
----West Toronto----


In [161]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [163]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Borough'] =df_grouped['Borough']

for ind in np.arange(df_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Roselawn,Davisville,Lawrence Park,The Annex / North Midtown / Yorkville,Summerhill West / Rathnelly / South Hill / For...,Moore Park / Summerhill East,North Toronto West,Forest Hill North & West,Davisville North,Garden District / Ryerson
1,Downtown Toronto,University of Toronto / Harbord,Regent Park / Harbourfront,CN Tower / King and Spadina / Railway Lands / ...,Central Bay Street,Christie,Church and Wellesley,Commerce Court / Victoria Hotel,First Canadian Place / Underground city,Garden District / Ryerson,Harbourfront East / Union Station / Toronto Is...
2,East Toronto,India Bazaar / The Beaches West,The Danforth West / Riverdale,The Beaches,Business reply mail Processing Centre,Studio District,Davisville North,High Park / The Junction South,Harbourfront East / Union Station / Toronto Is...,Garden District / Ryerson,Forest Hill North & West
3,West Toronto,Little Portugal / Trinity,Dufferin / Dovercourt Village,High Park / The Junction South,Parkdale / Roncesvalles,Runnymede / Swansea,Brockton / Parkdale Village / Exhibition Place,Central Bay Street,Christie,Church and Wellesley,Commerce Court / Victoria Hotel


In [165]:
# set number of clusters
kclusters = 4

df_grouped_clustering = df_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 1, 2], dtype=int32)

In [170]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

df_merged = df_tor1

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_merged = df_merged.join(neighborhoods_venues_sorted.set_index('Borough'), on='Borough')

df_merged.head() # check the last columns!

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.67635739999999,-79.2930312,1,India Bazaar / The Beaches West,The Danforth West / Riverdale,The Beaches,Business reply mail Processing Centre,Studio District,Davisville North,High Park / The Junction South,Harbourfront East / Union Station / Toronto Is...,Garden District / Ryerson,Forest Hill North & West
41,M4K,East Toronto,The Danforth West / Riverdale,43.6795571,-79.352188,1,India Bazaar / The Beaches West,The Danforth West / Riverdale,The Beaches,Business reply mail Processing Centre,Studio District,Davisville North,High Park / The Junction South,Harbourfront East / Union Station / Toronto Is...,Garden District / Ryerson,Forest Hill North & West
42,M4L,East Toronto,India Bazaar / The Beaches West,43.6689985,-79.31557159999998,1,India Bazaar / The Beaches West,The Danforth West / Riverdale,The Beaches,Business reply mail Processing Centre,Studio District,Davisville North,High Park / The Junction South,Harbourfront East / Union Station / Toronto Is...,Garden District / Ryerson,Forest Hill North & West
43,M4M,East Toronto,Studio District,43.6595255,-79.340923,1,India Bazaar / The Beaches West,The Danforth West / Riverdale,The Beaches,Business reply mail Processing Centre,Studio District,Davisville North,High Park / The Junction South,Harbourfront East / Union Station / Toronto Is...,Garden District / Ryerson,Forest Hill North & West
44,M4N,Central Toronto,Lawrence Park,43.7280205,-79.3887901,0,Roselawn,Davisville,Lawrence Park,The Annex / North Midtown / Yorkville,Summerhill West / Rathnelly / South Hill / For...,Moore Park / Summerhill East,North Toronto West,Forest Hill North & West,Davisville North,Garden District / Ryerson


In [189]:
df_merged.loc[df_merged['Cluster Labels'] == 0, df_merged.columns[[1] + list(range(4, df_merged.shape[1]))]].head(1)

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
44,Central Toronto,-79.3887901,0,Roselawn,Davisville,Lawrence Park,The Annex / North Midtown / Yorkville,Summerhill West / Rathnelly / South Hill / For...,Moore Park / Summerhill East,North Toronto West,Forest Hill North & West,Davisville North,Garden District / Ryerson


In [190]:
df_merged.loc[df_merged['Cluster Labels'] == 1, df_merged.columns[[1] + list(range(4, df_merged.shape[1]))]].head(1)

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,East Toronto,-79.2930312,1,India Bazaar / The Beaches West,The Danforth West / Riverdale,The Beaches,Business reply mail Processing Centre,Studio District,Davisville North,High Park / The Junction South,Harbourfront East / Union Station / Toronto Is...,Garden District / Ryerson,Forest Hill North & West


In [191]:
df_merged.loc[df_merged['Cluster Labels'] == 2, df_merged.columns[[1] + list(range(4, df_merged.shape[1]))]].head(1)

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
76,West Toronto,-79.4422593,2,Little Portugal / Trinity,Dufferin / Dovercourt Village,High Park / The Junction South,Parkdale / Roncesvalles,Runnymede / Swansea,Brockton / Parkdale Village / Exhibition Place,Central Bay Street,Christie,Church and Wellesley,Commerce Court / Victoria Hotel


In [192]:
df_merged.loc[df_merged['Cluster Labels'] == 3, df_merged.columns[[1] + list(range(4, df_merged.shape[1]))]].head(1)

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
50,Downtown Toronto,-79.37752940000001,3,University of Toronto / Harbord,Regent Park / Harbourfront,CN Tower / King and Spadina / Railway Lands / ...,Central Bay Street,Christie,Church and Wellesley,Commerce Court / Victoria Hotel,First Canadian Place / Underground city,Garden District / Ryerson,Harbourfront East / Union Station / Toronto Is...


In [193]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    brotlipy-0.7.0             |py36h8c4c3a4_1000         346 KB  conda-forge
    chardet-3.0.4              |py36h9f0ad1d_1006         188 KB  conda-forge
    cryptography-2.9.2         |   py36h45558ae_0         613 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    pandas-1.0.3               |   py36h83

In [198]:
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []

       
map_clusters