## Segmenting and Clustering Neighborhoods in Toronto

# Part 1

Import pandas

In [26]:
import pandas as pd

Scrape wikipedia page

In [27]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
df = pd.read_html(url)
pd.DataFrame(df)

Unnamed: 0,0
0,Postal Code Borough \ 0 ...
1,...
2,0 1 2 3 4 5 6 7 8 9 10 ...


The page contains 3 dataframes, but we are interested in the first one

In [28]:
df = pd.DataFrame(df[0])
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Remove "Not assingned" rows from Borough

In [29]:
Borough = df[df["Borough"]=="Not assigned"].index
df.drop(Borough, inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Cheking for any duplicates and reseting the index

In [55]:
print(df[df["Postal Code"].duplicated()])
print("")
print(df[df["Neighborhood"]=="Not assigned"])

df = df.reset_index(drop=True)

df.head()

Empty DataFrame
Columns: [Postal Code, Borough, Neighborhood]
Index: []

Empty DataFrame
Columns: [Postal Code, Borough, Neighborhood]
Index: []


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [20]:
df.shape

(103, 3)

# Part 2

If needed download geocoder and wget library and import them

In [63]:
#!pip install geocoder
#!pip install wget
import geocoder
import wget

Download the file from the url using wget

In [69]:
geo = wget.download("http://cocl.us/Geospatial_data")

100% [................................................................................] 2891 / 2891

Read the file as a csv

In [73]:
df_geo = pd.read_csv(geo)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Combine datafram df and df_geo using "Postal Code" as reference

In [75]:
df_comb = pd.merge(df, df_geo, on="Postal Code")
df_comb.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Import every library

In [111]:
import requests
import numpy as np
from sklearn.cluster import KMeans

Select rows with Toronto

In [81]:
df_T = df_comb[df_comb["Borough"].str.contains("Toronto")]
df_T.reset_index(drop=True, inplace=True)
df_T.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


Define Foursquare credentials

In [83]:
CLIENT_ID = 'CQKCKTCWLYNK2PHWO1CS0JR4TGZ0CDGLT5LSFCK0H5SI1CAP' 
CLIENT_SECRET = 'OBE02YAUTK2I0PWH2CVZ0P1AG25KEQCRHXSZBGZSJEAL321N' 
VERSION = '20180604'
LIMIT = 30

Define a function to obtain venues

In [90]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        results = requests.get(url).json()["response"]['groups'][0]['items']

        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [91]:
T_venues = getNearbyVenues(df_T["Neighborhood"],
                         df_T["Latitude"],
                         df_T["Longitude"])

In [92]:
T_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub


In [93]:
T_venues.shape

(864, 7)

Number of unique categories

In [95]:
print('There are {} uniques categories.'.format(len(T_venues['Venue Category'].unique())))

There are 196 uniques categories.


Analysing aach neighborhood using one-hot encoding

In [97]:
T_onehot = pd.get_dummies(T_venues[['Venue Category']], prefix="", prefix_sep="")

T_onehot.rename(columns={'Neighborhood':'Neighborhood (category)'}, inplace=True)
T_onehot['Neighborhood'] = T_venues['Neighborhood'] 

fixed_columns = [T_onehot.columns[-1]] + list(T_onehot.columns[:-1])
T_onehot = T_onehot[fixed_columns]

T_onehot.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
T_onehot.shape

(864, 197)

Grouping rows by neighborhood and taking the mean of the frequency of occurence

In [101]:
T_grouped = T_onehot.groupby('Neighborhood').mean().reset_index()
T_grouped.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,...,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.066667,0.066667,0.133333,0.2,0.066667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333


Top 5 most common venues for each neighborhood

In [102]:
num_top_venues = 5
for hood in T_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = T_grouped[T_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                     venue  freq
0             Cocktail Bar  0.07
1       Seafood Restaurant  0.07
2                 Beer Bar  0.07
3              Coffee Shop  0.07
4  Comfort Food Restaurant  0.03


----Brockton, Parkdale Village, Exhibition Place----
                venue  freq
0                Café  0.14
1      Breakfast Spot  0.09
2         Coffee Shop  0.09
3       Grocery Store  0.05
4  Italian Restaurant  0.05


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                  venue  freq
0    Light Rail Station  0.11
1                  Park  0.05
2         Auto Workshop  0.05
3  Fast Food Restaurant  0.05
4               Brewery  0.05


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                venue  freq
0     Airport Service  0.20
1      Airport Lounge  0.13
2             Airport  0.07
3                 Bar  0.07
4  Airport Food Court  0.07

This funtion sorts venues in descending order

In [104]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Display the top 10 venues for each neighborhood.

In [105]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = T_grouped['Neighborhood']

for ind in np.arange(T_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(T_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Seafood Restaurant,Coffee Shop,Cocktail Bar,Beer Bar,Greek Restaurant,Creperie,Breakfast Spot,Jazz Club,Liquor Store,Museum
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Performing Arts Venue,Bakery,Pet Store,Convenience Store,Climbing Gym,Restaurant,Burrito Place
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Brewery,Butcher,Fast Food Restaurant,Recording Studio,Restaurant,Auto Workshop,Spa,Burrito Place,Pizza Place
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport,Boutique,Rental Car Location,Boat or Ferry,Coffee Shop,Harbor / Marina,Sculpture Garden,Airport Terminal
4,Central Bay Street,Coffee Shop,Italian Restaurant,Japanese Restaurant,Café,Yoga Studio,Park,Sandwich Place,Bubble Tea Shop,Ramen Restaurant,Poke Place


Cluster the neighborhood 

In [108]:
kclusters = 5

T_grouped_clustering = T_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(T_grouped_clustering)

kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Dataframe with cluster and the top 10 venues for each neighborhood.

In [109]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

T_merged = df_T

T_merged = T_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

T_merged.head() 

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Park,Breakfast Spot,Café,Pub,Theater,Bakery,Restaurant,Chocolate Shop,Yoga Studio
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Diner,Sushi Restaurant,Discount Store,Bar,Beer Bar,Smoothie Shop,Sandwich Place,Distribution Center,Yoga Studio
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Café,Coffee Shop,Clothing Store,Theater,Burrito Place,Hotel,Electronics Store,Japanese Restaurant,Sandwich Place,Diner
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Gastropub,Japanese Restaurant,Café,Coffee Shop,Restaurant,Ice Cream Shop,Farmers Market,Italian Restaurant,Hotel,BBQ Joint
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,4,Asian Restaurant,Neighborhood (category),Trail,Health Food Store,Pub,Cuban Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store


Visualize the resulting clusters

In [112]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

latitude=43.654260
longitude=-79.389494
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(T_merged['Latitude'], T_merged['Longitude'], T_merged['Neighborhood'], T_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Examine clusters

In [115]:
T_merged.loc[T_merged['Cluster Labels'] == 0, T_merged.columns[[1] + list(range(5, T_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,0,Coffee Shop,Park,Breakfast Spot,Café,Pub,Theater,Bakery,Restaurant,Chocolate Shop,Yoga Studio
1,Downtown Toronto,0,Coffee Shop,Diner,Sushi Restaurant,Discount Store,Bar,Beer Bar,Smoothie Shop,Sandwich Place,Distribution Center,Yoga Studio
2,Downtown Toronto,0,Café,Coffee Shop,Clothing Store,Theater,Burrito Place,Hotel,Electronics Store,Japanese Restaurant,Sandwich Place,Diner
3,Downtown Toronto,0,Gastropub,Japanese Restaurant,Café,Coffee Shop,Restaurant,Ice Cream Shop,Farmers Market,Italian Restaurant,Hotel,BBQ Joint
5,Downtown Toronto,0,Seafood Restaurant,Coffee Shop,Cocktail Bar,Beer Bar,Greek Restaurant,Creperie,Breakfast Spot,Jazz Club,Liquor Store,Museum
6,Downtown Toronto,0,Coffee Shop,Italian Restaurant,Japanese Restaurant,Café,Yoga Studio,Park,Sandwich Place,Bubble Tea Shop,Ramen Restaurant,Poke Place
7,Downtown Toronto,0,Grocery Store,Café,Park,Italian Restaurant,Candy Store,Restaurant,Athletics & Sports,Baby Store,Diner,Coffee Shop
8,Downtown Toronto,0,Coffee Shop,Café,Pizza Place,Lounge,Hotel,Bar,Speakeasy,Plaza,Bakery,Steakhouse
9,West Toronto,0,Pharmacy,Bakery,Supermarket,Café,Music Venue,Park,Grocery Store,Brewery,Bank,Pizza Place
10,Downtown Toronto,0,Plaza,Hotel,Café,Park,Dance Studio,Bistro,Basketball Stadium,Sporting Goods Shop,IT Services,Ice Cream Shop


In [116]:
T_merged.loc[T_merged['Cluster Labels'] == 1, T_merged.columns[[1] + list(range(5, T_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
29,Central Toronto,1,Park,Yoga Studio,Cuban Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store
33,Downtown Toronto,1,Park,Playground,Trail,Creperie,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store


In [117]:
T_merged.loc[T_merged['Cluster Labels'] == 2, T_merged.columns[[1] + list(range(5, T_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Central Toronto,2,Park,Swim School,Bus Line,Dance Studio,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop


In [118]:
T_merged.loc[T_merged['Cluster Labels'] == 3, T_merged.columns[[1] + list(range(5, T_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,Central Toronto,3,Pool,Garden,Home Service,Yoga Studio,Cuban Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner


In [119]:
T_merged.loc[T_merged['Cluster Labels'] == 4, T_merged.columns[[1] + list(range(5, T_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,East Toronto,4,Asian Restaurant,Neighborhood (category),Trail,Health Food Store,Pub,Cuban Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store
