<h1 align=center>Segmenting and Clustering Neighborhoods in Toronto</h1>

<h2>Import all the necessary library</h2>

In [25]:
# library to handle data in a vectorized manner
import numpy as np 

# library for data analsysis
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# library to handle JSON files
import json 

# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 
import geocoder

# library to handle requests
import requests 
# tranform JSON file into a pandas dataframe
from pandas.io.json import json_normalize 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# map rendering library
import folium 

# lxml library
import lxml
print('Libraries imported.')

Libraries imported.


<h2>Data wrangling from wiki</h2>

In [26]:
# use pandas and lxml to obtain data from wiki
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df=pd.read_html(url, header = 0)[0]
print(df.shape)
df.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [27]:
# replace 'Not assigned' into 'NaN' value
df.replace('Not assigned', np.nan, inplace = True)
print(df.shape)
df.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [28]:
# drop rows with borough value of 'NaN'
df.dropna(subset = ['Borough'], axis = 0, inplace = True)
# reset index because some rows are dropped
df.reset_index(drop = True, inplace = True)
print(df.shape)
df.head()

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [29]:
# group the dataframe by postal code and concatenate the neighborhood
df.groupby(['Postal Code'])['Neighborhood'].apply(lambda x: "{%s}" % ', '.join(x))
print(df.shape)
df.head()

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [30]:
# replace neighborhood with value 'NaN' into borough's value
df['Neighborhood'].replace(np.nan, df['Borough'], inplace = True)
print(df.shape)
df.head()

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


<h2>Shape of the final dataframe</h2>

In [31]:
print(df.shape)

(103, 3)


<h2>As the package takes too much time to run because it's unreliable, data is taken directly from https://cocl.us/Geospatial_data</h2>

In [32]:
# change postal code panda series into list
# postal_code = df['Postal Code'].values.tolist()

# initialize latitude and longitude to None
# lat_lng_coords = [None for i in range(len(postal_code))]

# loop until you get the coordinates for all postal code
# for i in range(0,len(postal_code)-1):
    # while (lat_lng_coords[i] is None):
        # g = geocoder.google('{}, Toronto, Ontario'.format(postal_code[i]))
        # lat_lng_coords[i] = g.latlng

In [33]:
# obtain data from https://cocl.us/Geospatial_data
df_geo_coords = pd.read_csv('https://cocl.us/Geospatial_data')
df_geo_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [34]:
# merge df and df_geo_coords
merged_df = pd.merge(left = df, right = df_geo_coords, how = 'left', left_on = 'Postal Code', right_on = 'Postal Code')
merged_df.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [35]:
# slice merged_df with borough containing the word 'Toronto'
df_toronto = merged_df[merged_df['Borough'].str.contains('Toronto')]
# reset index because some rows are dropped
df_toronto.reset_index(drop = True, inplace = True)
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [36]:
address = 'Toronto, TO'

# in order to define an instance of the geocoder, we need to define a user_agent. I name mine to_explorer
geolocator = Nominatim(user_agent='to_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

<h2>Create a map of Toronto with df_toronto superimposed on top</h2>

In [37]:
# create a map using folium
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 12)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

<h2>Define Foursquare credentials and version</h2>

In [57]:
# input your id, secret, and version into variables
CLIENT_ID = 'GLRCVZP2RKPHMGC4EBE2QWHTYC0FRWUQIXBFXYALHX2Z31LB'
CLIENT_SECRET = 'UIRGABYTQ2XQ1RUC1U3LKVYHJBDCGUZKOWTEFWYPAKDHQZDA'
VERSION = '20170101'
# set limit and radius
LIMIT = 100
radius = 1000

<h2>Create a function to explore each neighborhood</h2>

In [60]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

<h2>Create a new dataframe for venues nearby each neighborhood in Toronto</h2>

In [62]:
toronto_venues = getNearbyVenues(names = df_toronto['Neighborhood'], latitudes = df_toronto['Latitude'], longitudes = df_toronto['Longitude'])
print(toronto_venues.shape)
toronto_venues.head()

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands


KeyError: 'groups'

<h2>One hot encoding</h2>

In [63]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix='',prefix_sep='')
# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']
print(toronto_onehot.shape)
toronto_onehot.head()

NameError: name 'toronto_venues' is not defined

<h2>Group rows by neighborhood</h2>

In [42]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped.head()

NameError: name 'toronto_onehot' is not defined

<h2>Clustering neighborhoods</h2>

In [43]:
# set number of clusters
num_clusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
k_means = KMeans(n_clusters=num_clusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
k_means.labels_[0:30] 

NameError: name 'toronto_grouped' is not defined

<h2>Create a new dataframe that includes the cluster labels</h2>

In [44]:
toronto_merged = df_toronto[['Borough', 'Neighborhood', 'Latitude', 'Longitude']]
toronto_merged.insert(4,'Cluster Labels',k_means.labels_)
toronto_merged.head()

NameError: name 'k_means' is not defined

<h2>Visualize the resulting clusters</h2>

In [45]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(num_clusters)
ys = [i + x + (i*x)**2 for i in range(num_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

KeyError: 'Cluster Labels'