# Segmenting and Clustering Neighbourhoods in Toronto

# Part 1

Import libraries

In [199]:
import os

import numpy as np
import pandas as pd

In [113]:
!pip install lxml



Read the HTML tables from the Wikipedia URL

In [114]:
# Wikipedia page listing the Canadian postal codes that start with "M"
# (fetched over HTTPS rather than plain HTTP).
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html returns a list holding one DataFrame per HTML table found on the page.
df = pd.read_html(url)

Assign first element to df

In [115]:
# Keep the first table returned by read_html. The isinstance guard makes this
# cell safe to re-run: once `df` is already a DataFrame, df[0] would be a
# column lookup and raise KeyError.
if isinstance(df, list):
    df = df[0]

Replace 'Not assigned' boroughs with NaN and drop the rows that contain NaN values

In [119]:
# Turn boroughs recorded as 'Not assigned' into missing values, then discard
# every row that contains a missing value.
unassigned_to_nan = {'Borough': {'Not assigned': np.nan}}
df = df.replace(to_replace=unassigned_to_nan).dropna()

Group values based on Postal Codes

In [123]:
# Collapse to one row per postal code. For every other column the distinct
# non-null values are joined with ','. Series.unique() keeps first-seen order,
# so the joined text is reproducible between runs (iterating a set of strings
# is not).
df = df.groupby('Postal Code', as_index=False).agg(lambda x: ','.join(x.dropna().unique()))
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Shape of the dataframe

In [124]:
# (number of rows, number of columns) of the grouped frame
df.shape

(103, 3)

DataFrame

In [125]:
# Show the DataFrame itself as the cell output.
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [177]:
# Where the neighbourhood is 'Not assigned', use the borough name instead.
neighbourhood_missing = df['Neighbourhood'].eq('Not assigned')
df.loc[neighbourhood_missing, 'Neighbourhood'] = df.loc[neighbourhood_missing, 'Borough']
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part 2

Read the csv file and assign it to df2

In [200]:
# Load the CSV of coordinates keyed by postal code and preview the first rows.
geospatial_csv = 'https://cocl.us/Geospatial_data'
df2 = pd.read_csv(geospatial_csv)
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Evaluate shapes of both data frames

In [180]:
# Row counts of the postal-code frame and the coordinates frame, compared
# before merging. `df1` is never defined in this notebook; the postal-code
# frame is `df`.
print(df.shape[0])
print(df2.shape[0])

103
103


Merge both data frames

In [185]:
# Attach coordinates to every postal code. `df1` is never defined in this
# notebook, so the postal-code frame `df` is merged instead. Only its three
# base columns are selected, so re-running this cell cannot produce duplicated
# Latitude_x / Latitude_y columns.
df_merge = pd.merge(
    left=df[['Postal Code', 'Borough', 'Neighbourhood']],
    right=df2,
    on='Postal Code',
)
# Later cells read df['Latitude'] and df['Longitude'], so expose the merged
# frame under the name `df` as well.
df = df_merge
df_merge.head(15)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Part 3

Import libraries

In [187]:
import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |      conda_forge           3 KB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    absl-py-0.11.0             |   py37h89c1867_0         168 KB  conda-forge
    aiohttp-3.7.4              |   py37h5e8e339_0         632 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    appdirs-1.4.4              |     p

libdeflate-1.7       | 67 KB     | ##################################### | 100% 
mysql-libs-8.0.23    | 1.8 MB    | ##################################### | 100% 
pyyaml-5.4.1         | 189 KB    | ##################################### | 100% 
tornado-6.1          | 646 KB    | ##################################### | 100% 
secretstorage-3.3.1  | 24 KB     | ##################################### | 100% 
cachetools-4.2.1     | 13 KB     | ##################################### | 100% 
toml-0.10.2          | 18 KB     | ##################################### | 100% 
networkx-2.5         | 1.2 MB    | ##################################### | 100% 
ca-certificates-2020 | 137 KB    | ##################################### | 100% 
xlrd-2.0.1           | 92 KB     | ##################################### | 100% 
libgcc-ng-9.3.0      | 7.8 MB    | ##################################### | 100% 
sqlalchemy-1.3.23    | 1.8 MB    | ##################################### | 100% 
pytest-6.2.2         | 430 K

jupyter_core-4.7.1   | 72 KB     | ##################################### | 100% 
libaec-1.0.4         | 31 KB     | ##################################### | 100% 
sympy-1.7.1          | 10.9 MB   | ##################################### | 100% 
libprotobuf-3.15.5   | 2.5 MB    | ##################################### | 100% 
tqdm-4.59.0          | 77 KB     | ##################################### | 100% 
pyqt-impl-5.12.3     | 5.9 MB    | ##################################### | 100% 
charls-2.2.0         | 138 KB    | ##################################### | 100% 
lerc-2.2.1           | 213 KB    | ##################################### | 100% 
liblapack-3.9.0      | 11 KB     | ##################################### | 100% 
decorator-4.4.2      | 11 KB     | ##################################### | 100% 
brotli-1.0.9         | 389 KB    | ##################################### | 100% 
pandas-1.2.3         | 11.8 MB   | ##################################### | 100% 
brotlipy-0.7.0       | 341 K

zstd-1.4.9           | 431 KB    | ##################################### | 100% 
py-xgboost-1.3.3     | 124 KB    | ##################################### | 100% 
gmp-6.2.1            | 806 KB    | ##################################### | 100% 
giflib-5.2.1         | 77 KB     | ##################################### | 100% 
opt_einsum-3.3.0     | 51 KB     | ##################################### | 100% 
zlib-1.2.11          | 106 KB    | ##################################### | 100% 
multidict-5.1.0      | 67 KB     | ##################################### | 100% 
pysocks-1.7.1        | 27 KB     | ##################################### | 100% 
biopython-1.78       | 2.6 MB    | ##################################### | 100% 
numpy-1.20.1         | 5.8 MB    | ##################################### | 100% 
libpq-13.1           | 2.7 MB    | ##################################### | 100% 
google-auth-1.24.0   | 62 KB     | ##################################### | 100% 
scipy-1.5.3          | 18.5 

This path already exists in the target prefix, and it won't be removed by
an uninstall action in this transaction. The path appears to be coming from
the package 'defaults/linux-64::numpy-base-1.18.5-py37hde5b4d6_0', which is already installed in the prefix.


ClobberError: The package 'conda-forge/linux-64::numpy-1.20.1-py37haa41c4c_0' cannot be installed due to a
path collision for 'lib/python3.7/site-packages/numpy/random/tests/__pycache__/test_randomstate_regression.cpython-37.pyc'.
This path already exists in the target prefix, and it won't be removed by
an uninstall action in this transaction. The path appears to be coming from
the package 'defaults/linux-64::numpy-base-1.18.5-py37hde5b4d6_0', which is already installed in the prefix.


ClobberError: The package 'conda-forge/linux-64::numpy-1.20.1-py37haa41c4c_0' cannot be installed due to a
path collision for 'lib/python3.7/site-packages/numpy/random/tests/__pycache__/test_regression.cpython-37.pyc'.
This path alre

done
Executing transaction: | b'Exception while loading config file /var/pod/.ws/ax-ext/config/wscloud/jupyter_notebook_config.py\n    Traceback (most recent call last):\n      File "/opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages/traitlets/config/application.py", line 737, in _load_config_files\n        config = loader.load_config()\n      File "/opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages/traitlets/config/loader.py", line 616, in load_config\n        self._read_file_as_dict()\n      File "/opt/conda/envs/Python-3.7-main/lib/python3.7/site-packages/traitlets/config/loader.py", line 648, in _read_file_as_dict\n        exec(compile(f.read(), conf_filename, \'exec\'), namespace, namespace)\n      File "/var/pod/.ws/ax-ext/config/wscloud/jupyter_notebook_config.py", line 17, in <module>\n        from cdsax_jupyter_extensions.ax_log import ax_log_request\n    ModuleNotFoundError: No module named \'cdsax_jupyter_extensions\'\nEnabling notebook extension jupyter-js

Create a map with labelled markers

In [188]:
# Base map centred on the same coordinates the cluster map uses further down.
mp = folium.Map(location=[43.6532, -79.3832], zoom_start=9.5)

# add markers to map: one circle per postal code, with a popup reading
# "postal code | neighbourhood | borough"
marker_fields = zip(df['Postal Code'], df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood'])
for pc, lat, lng, borough, neighborhood in marker_fields:
    popup_text = '{} | {} | {} '.format(pc, neighborhood, borough)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html is an option of Popup; it is not a CircleMarker parameter,
        # so it is passed only here.
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(mp)

mp

Define Foursquare credentials

In [201]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises KeyError here instead of failing later inside an API call.
# NOTE(review): the keys that were previously committed in this cell should be
# treated as leaked and regenerated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']  # Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']  # Foursquare Secret
# Not used by any request in this notebook, so it is optional (None when unset).
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN')  # Foursquare Access Token
VERSION = '20180604'  # sent as the `v` query parameter of each request
LIMIT = 10  # sent as the `limit` query parameter of each request

Define a function which gives a data frame of the top 10 venues of each neighbourhood within a 500m radius

In [191]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint for each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Name and coordinates of every location to query; they are consumed
        in parallel with ``zip``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and
    ``LIMIT``. A venue without any category gets ``None`` as its category
    instead of aborting the whole run.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string; the timeout keeps a
        # stalled connection from hanging the notebook indefinitely.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing `columns` to the constructor (instead of assigning them afterwards)
    # also works when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

Apply the defined function

In [192]:
# Look up the nearby venues of every neighbourhood from its coordinates
# (radius left at the function's default).
toronto_venues = getNearbyVenues(
    df['Neighbourhood'],
    df['Latitude'],
    df['Longitude'],
)

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale, Willowdale East
York Mills West
Willowdale, Willowdale West
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto, Broadview North (Old East York)
The Danforth West, 

One hot encoding

In [193]:
# one hot encoding
oh = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
oh['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [oh.columns[-1]] + list(oh.columns[:-1])
oh = oh[fixed_columns]

In [194]:
# Mean of every indicator column per neighbourhood, with the neighbourhood
# name restored as an ordinary column.
per_neighborhood = oh.groupby('Neighborhood')
gp = per_neighborhood.mean().reset_index()

Define a function for the most popular venues

In [195]:
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the ``num_top_venues`` largest values in ``row``.

    The first element of ``row`` is skipped; the remaining values are sorted in
    descending order and the labels of the leading entries are returned as an
    array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    ranked_labels = ranked.index.values
    return ranked_labels[:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# 'Neighborhood', then '1st', '2nd', '3rd', '4th', ... 'Most Common Venue'.
# The suffix is picked with an explicit bounds check instead of a bare
# `except:`, which would also have hidden unrelated errors.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = gp['Neighborhood']

# fill every row with the top categories of the matching row of `gp`
for ind in range(gp.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(gp.iloc[ind, :], num_top_venues)

Use K-Means cluster algorithm to cluster the neighbourhoods

In [196]:
# number of clusters
kclusters = 5

cl = gp.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cl)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 2, 2, 0, 0, 0, 0, 2, 0, 2], dtype=int32)

Merge results with original data frame

In [197]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

merge = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
merge = merge.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood', how='right')

merge.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0,Fast Food Restaurant,Print Shop,Women's Store,Department Store,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,0,Construction & Landscaping,Bar,Women's Store,Diner,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0,Electronics Store,Rental Car Location,Medical Center,Bank,Intersection,Restaurant,Breakfast Spot,Eastern European Restaurant,Drugstore,Donut Shop
3,M1G,Scarborough,Woburn,43.770992,-79.216917,2,Coffee Shop,Pharmacy,Korean BBQ Restaurant,Department Store,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Athletics & Sports,Lounge,Hakka Restaurant,Gas Station,Fried Chicken Joint,Thai Restaurant,Bank,Caribbean Restaurant,Bakery,Dessert Shop


Visualise the clusters on a map of Toronto

In [206]:
# create map
cmap = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(merge['Latitude'], merge['Longitude'], merge['Neighbourhood'], merge['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(cmap)
       
cmap

# Summary

The majority of neighbourhoods belong to cluster 0, and these neighbourhoods are most densely concentrated near the coast.