<h1 align=center><font size = 5><em>Segmenting and Clustering Neighborhoods in Toronto</em></font></h1>

# Part 1

In [1]:
import pandas as pd

### Read the table

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html parses the HTML tables on the page into a list of DataFrames;
# the first entry is the postal-code table used in the rest of the notebook.
read_table = pd.read_html(url, header=[0])
df = read_table[0]

df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Rename columns

In [3]:
# Shorter, space-free column names (American spelling of "Neighborhood").
postal_table_columns = ['PostalCode', 'Borough', 'Neighborhood']
df.columns = postal_table_columns

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [4]:
# Keep only rows whose borough is assigned, then renumber the index from 0.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].reset_index(drop=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Group neighborhoods that share the same postal code

In [5]:
# Collapse the table to one row per postal code. Within each group, the
# distinct non-null values of every other column are joined with ','.
# unique() keeps the values in order of first appearance, so the joined
# string is the same on every run (iterating a set of strings is not, because
# its order depends on string hashing).
df = df.groupby('PostalCode', as_index=False).agg(
    lambda values: ','.join(values.dropna().unique())
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [6]:
# Rows that have a borough but no neighborhood take the borough name instead.
neighborhood_missing = df['Neighborhood'] == 'Not assigned'
df.loc[neighborhood_missing, 'Neighborhood'] = df.loc[neighborhood_missing, 'Borough']

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Number of rows

In [7]:
# shape[0] is the number of rows left after cleaning and grouping.
row_count = df.shape[0]
print("The number of rows is : %.f" % row_count)

The number of rows is : 103


# Part 2

### Read geographical coordinates CSV and rename columns

In [8]:
# CSV of postal codes with their latitude and longitude.
geospatial_csv_url = 'http://cocl.us/Geospatial_data'
coordinates = pd.read_csv(geospatial_csv_url)

# Rename so the key column matches 'PostalCode' in the postal-code table.
coordinate_columns = ['PostalCode', 'Latitude', 'Longitude']
coordinates.columns = coordinate_columns

coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge 2 tables

In [9]:
# Left join: every row of the postal-code table is kept and gains the
# matching latitude/longitude.
dfgeo = pd.merge(df, coordinates, how="left", on="PostalCode")

dfgeo.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Part 3

In [10]:
import numpy as np
import json

!pip install geopy
from geopy.geocoders import Nominatim

import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!pip install folium==0.5.0
import folium

Collecting folium==0.5.0
  Downloading folium-0.5.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 6.4 MB/s eta 0:00:011
[?25hCollecting branca
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Created wheel for folium: filename=folium-0.5.0-py3-none-any.whl size=76240 sha256=394f2a9b15edd3558c6ece28d20cac0a010adcc855ba73be8b78db2f3f97b37a
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/b2/2f/2c/109e446b990d663ea5ce9b078b5e7c1a9c45cca91f377080f8
Successfully built folium
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.5.0


### Find the latitude and longitude of Toronto with geopy

In [11]:
# Geocode the city name; the resulting coordinates are used as the centre of
# the folium maps created further down.
address = 'Toronto'
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match. Stop here with a
# clear message rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {},{}.'.format(latitude,longitude))

The geograpical coordinate of Toronto are 43.6534817,-79.3839347.


### Create a map of Toronto with all neighborhoods

In [12]:
# Base map centred on the Toronto coordinates obtained from the geocoder.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Keyword arguments shared by every neighborhood marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row of dfgeo, with the neighborhood name as popup.
marker_rows = zip(dfgeo['Latitude'], dfgeo['Longitude'], dfgeo['Neighborhood'])
for lat, lng, neighborhood_name in marker_rows:
    popup = folium.Popup(neighborhood_name, parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=popup, **marker_style)
    marker.add_to(map_toronto)

map_toronto

### Keep only the boroughs whose name contains the word "Toronto"

In [13]:
# Distinct borough names present in the merged table.
borough_names = list(dfgeo.Borough.unique())

# Case-insensitive test for the substring "toronto" in each borough name.
borough_with_toronto = [name for name in borough_names if "toronto" in name.lower()]

#new dataframe with boroughs with Toronto
in_toronto_borough = dfgeo['Borough'].isin(borough_with_toronto)
dfgeo = dfgeo[in_toronto_borough].reset_index(drop=True)
dfgeo.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [14]:
#create map with only this Borough
# Fresh map (replaces the previous map_toronto); dfgeo now only contains the
# boroughs with "Toronto" in their name.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Keyword arguments shared by every neighborhood marker.
circle_options = {
    'radius': 5,
    'color': 'blue',
    'fill': True,
    'fill_color': '#3186cc',
    'fill_opacity': 0.7,
    'parse_html': False,
}

# One circle marker per remaining row, with the neighborhood name as popup.
for lat, lng, neighborhood_name in zip(dfgeo['Latitude'], dfgeo['Longitude'], dfgeo['Neighborhood']):
    popup = folium.Popup(neighborhood_name, parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **circle_options).add_to(map_toronto)

map_toronto

### Foursquare Credentials

In [15]:
import os
import warnings

# Credentials are read from the environment so that they are never stored in
# the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell was
# published with the notebook and should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180604'

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'Foursquare credentials are not set: define FOURSQUARE_CLIENT_ID and '
        'FOURSQUARE_CLIENT_SECRET in the environment before querying the API.',
        RuntimeWarning,
    )

### Explore neighborhoods

In [18]:
# I chose to stay in a 500m radius and to limit the number of venues at 100
radius = 500
LIMIT = 100

# Foursquare "explore" endpoint, queried once per postal code.
EXPLORE_URL = "https://api.foursquare.com/v2/venues/explore"

venues = []

for lat, long, post, borough, neighborhood in zip(dfgeo['Latitude'], dfgeo['Longitude'], dfgeo['PostalCode'], dfgeo['Borough'], dfgeo['Neighborhood']):
    # Pass the query as params so requests handles the URL encoding and the
    # credentials are not concatenated into a URL string by hand.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, long),
        'radius': radius,
        'limit': LIMIT,
    }

    # The timeout stops a stalled connection from hanging the notebook;
    # raise_for_status turns an HTTP error into an exception at the request
    # instead of a KeyError while indexing the JSON payload.
    response = requests.get(EXPLORE_URL, params=params, timeout=30)
    response.raise_for_status()

    # A location without recommendations may come back with no groups.
    groups = response.json()['response'].get('groups') or []
    results = groups[0]['items'] if groups else []

    for venue in results:
        details = venue['venue']
        # Keep venues that carry no category (recorded as None) rather than
        # failing with an IndexError on the empty list.
        categories = details.get('categories') or []
        category_name = categories[0]['name'] if categories else None
        venues.append((
            post,
            borough,
            neighborhood,
            lat,
            long,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            category_name))

# make the list a DF
venues_df = pd.DataFrame(venues)
venues_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [19]:
# Give a name to columns
# Column names, in the order the tuples were built in the venue query loop.
venue_column_names = [
    'PostalCode',
    'Borough',
    'Neighborhoods',
    'BoroughLatitude',
    'BoroughLongitude',
    'VenueName',
    'VenueLatitude',
    'VenueLongitude',
    'VenueCategory',
]
venues_df.columns = venue_column_names

print(venues_df.shape)
venues_df.head()

(1624, 9)


Unnamed: 0,PostalCode,Borough,Neighborhoods,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [20]:
# we keep only 4 columns, we don't need the others
# Only the identifying columns and the venue category are needed from here on.
unused_columns = [
    'BoroughLatitude',
    'BoroughLongitude',
    'VenueName',
    'VenueLatitude',
    'VenueLongitude',
]
venues_df2 = venues_df.drop(columns=unused_columns)
venues_df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhoods,VenueCategory
0,M4E,East Toronto,The Beaches,Trail
1,M4E,East Toronto,The Beaches,Health Food Store
2,M4E,East Toronto,The Beaches,Pub
3,M4E,East Toronto,The Beaches,Neighborhood
4,M4K,East Toronto,"The Danforth West, Riverdale",Greek Restaurant


In [21]:
# change VenueCategory to dummies
# One-hot encode the venue category: one indicator column per category, named
# after the category itself (empty prefix and separator).
venues_dummies = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# Build venues_df2 from venues_df (identifying columns + indicator columns)
# instead of concatenating onto venues_df2 and dropping in place, so that
# re-running this cell gives the same result rather than a KeyError.
id_columns = ['PostalCode', 'Borough', 'Neighborhoods']
venues_df2 = pd.concat([venues_df[id_columns], venues_dummies], axis=1)
venues_df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4K,East Toronto,"The Danforth West, Riverdale",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


I group the Neighborhoods and sum the number of venues

In [22]:
# Summing the indicator columns per postal code / borough / neighborhood
# gives the number of venues of each category in that area.
group_keys = ["PostalCode", "Borough", "Neighborhoods"]
venue_counts = venues_df2.groupby(group_keys).sum()
toronto_group = venue_counts.reset_index()

print(toronto_group.shape)
toronto_group

(39, 240)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,M4K,East Toronto,"The Danforth West, Riverdale",0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,M4L,East Toronto,"India Bazaar, The Beaches West",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4M,East Toronto,Studio District,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,M4N,Central Toronto,Lawrence Park,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,M4P,Central Toronto,Davisville North,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,M4S,Central Toronto,Davisville,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,M4T,Central Toronto,"Moore Park, Summerhill East",0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [23]:
# set number of clusters
# set number of clusters
kclusters = 5

# The identifying columns are removed so that only the per-category venue
# counts are passed to the clustering.
identifier_columns = ['PostalCode', 'Borough', 'Neighborhoods']
toronto_group2 = toronto_group.drop(columns=identifier_columns)

# run k-means clustering (random_state fixed so the labels are repeatable)
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(toronto_group2)


#### I group the geo data in the same way, by neighborhood

In [24]:
# Same grouping keys as the venue table; the remaining Latitude/Longitude
# columns are averaged within each group.
geo_keys = ["PostalCode", "Borough", "Neighborhood"]
dfgeo_group = dfgeo.groupby(geo_keys).mean().reset_index()

print(dfgeo_group.shape)
dfgeo_group.head(5)

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


#### I add the cluster number found by K-means to the geo data

In [25]:
# kmeans was fitted on toronto_group (minus its identifying columns), so
# labels_ follows the row order of toronto_group, not of dfgeo_group. Pair
# each label with its postal code and join on that key instead of assigning
# by position: a postal code for which no venue was returned is absent from
# toronto_group and would otherwise shift or break the assignment.
cluster_labels = pd.DataFrame({
    'PostalCode': toronto_group['PostalCode'].values,
    'Cluster': kmeans.labels_,
})

# Drop any Cluster column left by a previous run so the cell can be re-run.
# The inner join keeps only postal codes that actually received a cluster.
dfgeo_group = (
    dfgeo_group
    .drop(columns='Cluster', errors='ignore')
    .merge(cluster_labels, on='PostalCode', how='inner')
)
dfgeo_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,1
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1


#### I display the points of the dataframe, colored by cluster, on a map.

In [26]:
# create map
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# choose the colors for the clusters: kclusters values evenly spaced over the
# rainbow colormap, converted to hex strings for folium
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add points to the map
for lat, lon, post, bor, poi, cluster in zip(dfgeo_group['Latitude'], dfgeo_group['Longitude'], dfgeo_group['PostalCode'], dfgeo_group['Borough'], dfgeo_group['Neighborhood'], dfgeo_group['Cluster']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    # Cluster ids run from 0 to kclusters-1, so they index the color list
    # directly (no reliance on index -1 wrapping around for cluster 0).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters