# Segmenting and Clustering Neighborhoods in Toronto

#### Import libraries

In [3]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json 
from geopy.geocoders import Nominatim
import requests 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 

print('Libraries imported.')

Libraries imported.


#### Download Toronto Dataset

In [4]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

df=pd.read_html(url, header=0)[0]

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Delete all rows with Borough as "Not assigned" and resetting the index

In [5]:
df = df[df.Borough != 'Not assigned']
df.reset_index(inplace = True)
df.drop('index', axis=1,inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Concatenate rows based on Postcode and Borough for the Neighbourhood column

In [6]:
df1 = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x:', '.join(x))
df2 = pd.DataFrame(df1)
df2.reset_index(inplace = True)
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Unable to use the Geocoder Package, therefore I used the provided csv file for the following dataframe

In [7]:
Toronto_df = pd.read_csv("Geospatial_Coordinates.csv")
Toronto_df.rename(columns={'Postal Code':'Postcode'}, inplace=True)
Toronto_df.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
df_inner = pd.merge(Toronto_df, df2, on='Postcode', how='inner')
df_inner = df_inner[['Postcode','Borough','Neighbourhood','Latitude','Longitude']]
df_inner.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Creating clusters

#### Remove all Borough's except those that contain "Toronto"

In [66]:
Torontodf = df_inner[df_inner.Borough.str.contains("Toronto")]
Torontodf

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


#### One hot encoding

In [74]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Torontodf[['Borough']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighbourhood'] = Torontodf['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Neighbourhood,Central Toronto,Downtown Toronto,East Toronto,West Toronto
37,The Beaches,0,0,1,0
41,"The Danforth West, Riverdale",0,0,1,0
42,"The Beaches West, India Bazaar",0,0,1,0
43,Studio District,0,0,1,0
44,Lawrence Park,1,0,0,0


In [72]:
Toronto_grouped = Toronto_onehot.groupby('Neighbourhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighbourhood,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,"Adelaide, King, Richmond",0,1,0,0
1,Berczy Park,0,1,0,0
2,"Brockton, Exhibition Place, Parkdale Village",0,0,0,1
3,Business Reply Mail Processing Centre 969 Eastern,0,0,1,0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0,1,0,0


In [79]:
# set number of clusters
kclusters = 4

Toronto_grouped_clustering = Toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 


array([0, 0, 2, 3, 0, 0, 0, 0, 0, 0], dtype=int32)

#### Merging clusters with Toronto dataframe

In [89]:
#add clustering labels
Torontodf.insert(0, 'Cluster Labels', kmeans.labels_)
Torontodf.head()


Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,3,M4M,East Toronto,Studio District,43.659526,-79.340923
44,0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


#### Create a map

In [91]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Torontodf['Latitude'], Torontodf['Longitude'], Torontodf['Neighbourhood'], Torontodf['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [90]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="T_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Examine Clusters

In [92]:
Torontodf.loc[Torontodf['Cluster Labels'] == 0, Torontodf.columns[[1] + list(range(5, Torontodf.shape[1]))]]

Unnamed: 0,Postcode,Longitude
37,M4E,-79.293031
41,M4K,-79.352188
44,M4N,-79.38879
45,M4P,-79.390197
46,M4R,-79.405678
47,M4S,-79.38879
48,M4T,-79.38316
49,M4V,-79.400049
50,M4W,-79.377529
54,M5B,-79.378937


In [93]:
Torontodf.loc[Torontodf['Cluster Labels'] == 1, Torontodf.columns[[1] + list(range(5, Torontodf.shape[1]))]]

Unnamed: 0,Postcode,Longitude
51,M4X,-79.367675
52,M4Y,-79.38316
53,M5A,-79.360636
57,M5G,-79.387383
63,M5N,-79.416936
65,M5R,-79.405678
66,M5S,-79.400049
69,M5W,-79.374846
82,M6P,-79.464763


In [94]:
Torontodf.loc[Torontodf['Cluster Labels'] == 2, Torontodf.columns[[1] + list(range(5, Torontodf.shape[1]))]]

Unnamed: 0,Postcode,Longitude
42,M4L,-79.315572
55,M5C,-79.375418
61,M5L,-79.379817
64,M5P,-79.411307
67,M5T,-79.400049
70,M5X,-79.38228


In [95]:
Torontodf.loc[Torontodf['Cluster Labels'] == 3, Torontodf.columns[[1] + list(range(5, Torontodf.shape[1]))]]

Unnamed: 0,Postcode,Longitude
43,M4M,-79.340923
78,M6K,-79.428191
83,M6R,-79.456325
84,M6S,-79.48445
87,M7Y,-79.321558
