## Segmenting and Clustering Neighborhoods in Toronto

## Part 1 Create Postal Code and Neighborhood Table
#### Import necessary packages

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json 
from pandas.io.json import json_normalize
import requests 
import folium
from sklearn.cluster import KMeans
%matplotlib inline
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors

#### Load dataframe from url

In [34]:
# Read the first table on the Wikipedia page (first row used as the header)
# and keep only the rows whose borough has been assigned.
# Rows that list several neighbourhoods keep them as one comma-separated string.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
Postal = (
    pd.read_html(url, header=0)[0]
    .loc[lambda table: table['Borough'].ne('Not assigned')]
)
print("The shape of the dataframe is {}".format(Postal.shape))
Postal.head()

The shape of the dataframe is (103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Part 2 Merge the Original Data with Geographic Information (Latitude and Longitude)
#### Use read_csv method to obtain postal code and geo data table

In [35]:
# Download the latitude/longitude table that will be joined on 'Postal Code'.
Geo = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
print("The shape of the dataframe is {}".format(Geo.shape))
Geo.head()

The shape of the dataframe is (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge Postal Code table with Geo data table

In [36]:
# Inner join on the shared 'Postal Code' column: every postal code keeps its
# borough/neighbourhood and gains its latitude and longitude.
Postal_LatLong = pd.merge(Postal, Geo, how='inner', on='Postal Code')
print("The shape of the dataframe is {}".format(Postal_LatLong.shape))
Postal_LatLong.head()

The shape of the dataframe is (103, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part 3 Explore and Cluster Neighbourhoods

#### Foursquare Credentials

In [37]:
import os
import warnings

# Credentials are read from the environment so that secrets are never stored in
# the notebook file. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. Keys that were previously committed in this cell should
# be treated as leaked and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not CLIENT_ID or not CLIENT_SECRET:
    warnings.warn(
        'FOURSQUARE_CLIENT_ID and/or FOURSQUARE_CLIENT_SECRET is not set; '
        'the Foursquare requests below need both values.'
    )
VERSION = '20200815'  # sent as the `v` query parameter of every Foursquare request
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

#### Foursquare Data Retrieval

*This is the function used to get the category of venue*

In [38]:
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        The category list is read from the key 'categories' and, when that
        key is missing, from 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first element of the category list, or None
    when the list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and is allowed to propagate instead of being swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

*This function iteratively calls information from Foursquare using the neighbourhood data we just created in Part 2*

In [39]:
def getnearestvenues(Postal_LatLong,LIMIT,radius):
    """Query the Foursquare 'explore' endpoint once per row of Postal_LatLong.

    Uses the module-level CLIENT_ID, CLIENT_SECRET and VERSION for every
    request, and get_category_type to reduce each venue's category list to a
    single name.

    Parameters
    ----------
    Postal_LatLong : pandas.DataFrame
        The columns at positions 2, 3 and 4 are read as the neighbourhood
        name, its latitude and its longitude.
    LIMIT : int
        Sent as the `limit` query parameter.
    radius : int
        Sent as the `radius` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'lat_Neighbourhood', 'lng_Neighbourhood', 'name', 'categories',
        'lat' and 'lng'. An empty frame with the same columns is returned
        when no location yields any venue.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    explore_url = 'https://api.foursquare.com/v2/venues/explore'
    filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
    output_columns = ['Neighbourhood','lat_Neighbourhood','lng_Neighbourhood','name','categories','lat','lng']

    # Collect the per-neighbourhood frames and concatenate once at the end;
    # this also works when the first neighbourhood has no venues.
    venue_tables = []
    for i in range(Postal_LatLong.shape[0]):
        name,lat,long=Postal_LatLong.iloc[i,2:5]
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, long),
            'radius': radius,
            'limit': LIMIT,
        }
        # A timeout keeps one stalled request from hanging the whole notebook.
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()
        if results['response']['totalResults']==0:
            continue

        venues = results['response']['groups'][0]['items']
        nearby_venues = pd.json_normalize(venues) # flatten JSON
        # filter columns (copy so the column edits below act on an independent frame)
        nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

        # filter the category for each row
        nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

        # clean columns: keep only the last dotted component of each name
        nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
        nearby_venues['Neighbourhood']=name
        nearby_venues['lat_Neighbourhood']=lat
        nearby_venues['lng_Neighbourhood']=long
        venue_tables.append(nearby_venues[output_columns])

    if not venue_tables:
        return pd.DataFrame(columns=output_columns)
    return pd.concat(venue_tables, ignore_index=True)

*This cell applies the function defined in the previous cell to obtain venue information for Toronto.*
*It also creates a copy of the table for manipulation while keeping the original as a backup.*

In [77]:
# Use the LIMIT and radius configured in the credentials cell rather than
# repeating their values here, so the settings live in one place.
Nearby_Venue_Table=getnearestvenues(Postal_LatLong,LIMIT=LIMIT,radius=radius)
# Work on a copy; Nearby_Venue_Table stays untouched as the raw download.
toronto_venues=Nearby_Venue_Table.copy()
toronto_venues.head()

0
1
2
3
4
5
6
7
8
9
10
11


KeyboardInterrupt: 

*This cell uses the groupby method (with Neighbourhood as the key) along with count() to roughly check the quantity of data per neighbourhood*

In [41]:
# Number of non-null entries in every column, per neighbourhood.
toronto_venues.groupby(by='Neighbourhood').count()

Unnamed: 0_level_0,lat_Neighbourhood,lng_Neighbourhood,name,categories,lat,lng
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",6,6,6,6,6,6
"Bathurst Manor, Wilson Heights, Downsview North",22,22,22,22,22,22
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",25,25,25,25,25,25
Berczy Park,58,58,58,58,58,58
"Birch Cliff, Cliffside West",4,4,4,4,4,4
"Brockton, Parkdale Village, Exhibition Place",24,24,24,24,24,24
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",14,14,14,14,14,14
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",15,15,15,15,15,15


*This cell uses get_dummies to build a table of dummy (one-hot) variables for the category of each venue, alongside the venue's neighbourhood*

In [42]:
# one hot encoding of the venue category; dtype=int keeps the indicators as
# 0/1 integers regardless of the pandas version's default dummy dtype
toronto_onehot = pd.get_dummies(toronto_venues[['categories']], prefix="", prefix_sep="", dtype=int)

# add the neighbourhood as the first column; insert() raises ValueError if a
# venue category is itself named 'Neighbourhood' instead of silently
# overwriting that category column
toronto_onehot.insert(0, 'Neighbourhood', toronto_venues['Neighbourhood'])

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


*Then, we group the dummy variable table by neighbourhood and take mean() to calculate the proportion of each category within the neighbourhood*

In [29]:
# Mean of the 0/1 indicators per neighbourhood = share of each venue category.
toronto_grouped = (
    toronto_onehot
    .groupby('Neighbourhood')
    .mean()
    .reset_index()
)
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
1,"Alderwood, Long Branch",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
3,Bayview Village,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
4,"Bedford Park, Lawrence Manor East",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.040000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
5,Berczy Park,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00,0.017241,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
6,"Birch Cliff, Cliffside West",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
7,"Brockton, Parkdale Village, Exhibition Place",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.041667
8,"Business reply mail Processing Centre, South C...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000
9,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.000000,0.066667,0.066667,0.066667,0.133333,0.2,0.066667,0.000000,...,0.00,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000


*This cell prints a report of the five most frequent venue categories within each neighbourhood*

In [57]:
num_top_venues = 5

# Print, for every neighbourhood, its most frequent venue categories.
for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # transpose the neighbourhood's single row so categories become rows
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # drop the first row (the neighbourhood name itself); copy so the column
    # assignment below acts on an independent frame rather than a slice
    temp = temp.iloc[1:].copy()
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0                     Lounge  0.25
1             Breakfast Spot  0.25
2  Latin American Restaurant  0.25
3               Skating Rink  0.25
4          Accessories Store  0.00


----Alderwood, Long Branch----
            venue  freq
0     Pizza Place  0.33
1  Sandwich Place  0.17
2             Gym  0.17
3     Coffee Shop  0.17
4             Pub  0.17


----Bathurst Manor, Wilson Heights, Downsview North----
                       venue  freq
0                       Bank  0.09
1                Coffee Shop  0.09
2                  Gift Shop  0.05
3              Shopping Mall  0.05
4  Middle Eastern Restaurant  0.05


----Bayview Village----
                 venue  freq
0                 Café  0.25
1                 Bank  0.25
2  Japanese Restaurant  0.25
3   Chinese Restaurant  0.25
4                Motel  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0  Italian Restaurant  0.08
1         Coffee Shop  0.08

                venue  freq
0      Sandwich Place  0.25
1                Park  0.25
2            Bus Line  0.25
3   Mobile Phone Shop  0.25
4  Mexican Restaurant  0.00


----Lawrence Manor, Lawrence Heights----
                    venue  freq
0  Furniture / Home Store  0.33
1          Clothing Store  0.25
2       Accessories Store  0.08
3   Vietnamese Restaurant  0.08
4             Event Space  0.08


----Lawrence Park----
               venue  freq
0               Park  0.25
1           Bus Line  0.25
2             Lawyer  0.25
3        Swim School  0.25
4  Accessories Store  0.00


----Leaside----
                    venue  freq
0     Sporting Goods Shop  0.10
1             Coffee Shop  0.10
2            Burger Joint  0.06
3  Furniture / Home Store  0.06
4                    Bank  0.06


----Little Portugal, Trinity----
                   venue  freq
0                    Bar  0.11
1       Asian Restaurant  0.06
2            Coffee Shop  0.06
3  Vietnamese Restaurant  0.04
4          

                venue  freq
0         Pizza Place  0.25
1         Coffee Shop  0.12
2      Discount Store  0.12
3      Sandwich Place  0.12
4  Chinese Restaurant  0.12


----Weston----
                             venue  freq
0                Convenience Store   1.0
1                Accessories Store   0.0
2               Mexican Restaurant   0.0
3              Monument / Landmark   0.0
4  Molecular Gastronomy Restaurant   0.0


----Wexford, Maryvale----
                   venue  freq
0                 Bakery  0.17
1         Breakfast Spot  0.17
2            Auto Garage  0.17
3         Sandwich Place  0.17
4  Vietnamese Restaurant  0.17


----Willowdale, Newtonbrook----
                             venue  freq
0                             Park   1.0
1                Accessories Store   0.0
2               Mexican Restaurant   0.0
3              Monument / Landmark   0.0
4  Molecular Gastronomy Restaurant   0.0


----Willowdale, Willowdale East----
              venue  freq
0  Ramen Re

'York Mills West'

*This cell defines the function to get most common venues*

In [59]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, skipping its first element.

    The first element of `row` is dropped, the remaining values are sorted in
    descending order, and the index labels of the first `num_top_venues`
    entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

*This cell returns the dataframe of 10 most common venues within the neighbourhood*

In [60]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    if ind < len(indicators):
        # the first three ranks take the suffixes listed in `indicators`
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        # every later rank takes 'th'
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# fill each row with that neighbourhood's categories ordered by frequency
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Breakfast Spot,Drugstore,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop
1,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Sandwich Place,Pub,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Frozen Yogurt Shop,Bridal Shop,Sandwich Place,Diner,Restaurant,Deli / Bodega,Middle Eastern Restaurant,Supermarket
3,Bayview Village,Café,Bank,Chinese Restaurant,Japanese Restaurant,Yoga Studio,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Restaurant,Sandwich Place,Coffee Shop,Café,Toy / Game Store,Cupcake Shop,Pizza Place,Pharmacy,Thai Restaurant


*In this cell, we apply KMeans clustering to group the neighbourhoods into a designated number of clusters (5 in this example)*

In [66]:
# set number of clusters
kclusters = 5

# keep only the numeric category-frequency columns; the `columns=` keyword is
# used because newer pandas no longer accepts the axis as a positional argument
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering; n_init is set explicitly so the number of
# initialisations does not depend on the installed scikit-learn's default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 1, 3, 2, 2, 2,
       0, 2, 2, 2, 2, 2, 3, 2, 4, 2, 2, 2, 2, 2, 1, 2, 0, 3, 2, 2, 2, 3,
       2, 2, 2, 0, 3, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2,
       2, 3, 2, 0, 2, 2, 3])

*This cell adds the clustering results to our sorted most-common-venues dataframe and joins it with the neighbourhood-venue dataframe*

In [70]:
# add clustering labels; drop a column left by an earlier run first so this
# cell can be re-executed (insert() raises if the column already exists)
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach each neighbourhood's cluster label and most common venues to every
# venue row of toronto_venues (join returns a new frame; toronto_venues is unchanged)
toronto_merged = toronto_venues.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Neighbourhood,lat_Neighbourhood,lng_Neighbourhood,name,categories,lat,lng,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,Park,43.751976,-79.33214,3,Food & Drink Shop,Park,Yoga Studio,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Drugstore
1,Parkwoods,43.753259,-79.329656,Variety Store,Food & Drink Shop,43.751974,-79.333114,3,Food & Drink Shop,Park,Yoga Studio,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Drugstore
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,Hockey Arena,43.723481,-79.315635,0,Hockey Arena,Portuguese Restaurant,Pizza Place,Coffee Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
3,Victoria Village,43.725882,-79.315572,Tim Hortons,Coffee Shop,43.725517,-79.313103,0,Hockey Arena,Portuguese Restaurant,Pizza Place,Coffee Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
4,Victoria Village,43.725882,-79.315572,Portugril,Portuguese Restaurant,43.725819,-79.312785,0,Hockey Arena,Portuguese Restaurant,Pizza Place,Coffee Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant


*Finally, we create a visualization map using folium, with each venue location coloured by its cluster label*

In [76]:
# centre of the map
latitude=43.73
longitude=-79.4593
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per venue, coloured by the cluster of its neighbourhood
for lat, lon, poi, cluster in zip(toronto_merged['lat'], toronto_merged['lng'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels start at 0, so they index the colour list directly
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters