In [1]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
import lxml

# Toronto Neighbourhoods

The below lines of code use requests and BeautifulSoup to extract the HTML page.

In [2]:
# Get the neighbourhoods page
wiki_page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
wiki_soup = BeautifulSoup(wiki_page.content, 'html.parser')

The below lines of code will convert the scraped HTML page into a Pandas dataframe.

In [3]:
# Put the neighbourhoods into a dataframe
wiki_table = wiki_soup.find('table', class_='wikitable')
postal_codes = pd.read_html(str(wiki_table))[0]
postal_codes

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


The below line of code will remove the any row where the borough is set to 'Not assigned'.

In [4]:
# Remove the codes that are 'not assigned' to a borough
postal_codes = postal_codes[postal_codes['Borough'] != 'Not assigned'].reset_index()[['Postal code', 'Borough', 'Neighborhood']]
postal_codes

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


The below line of code will make the neighbourhood equal to the borough if no neighbourhood is specified.

In [5]:
# Make neighbourhood equal to borough if no neighbourhood given
postal_codes['Neighborhood'] = postal_codes.apply(lambda x: x['Borough'] if x['Neighborhood'] == 'Not assigned' else x['Neighborhood'], axis = 1)
postal_codes.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In the assignment it was asked that "More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.". However the two lines below show that every postal code given is unique - so there is no need to perform this operation.

The only difference is that neighbourhoods are separated by slashes instead of commas, so this is changed in the dataframe using the final line.

In [6]:
print('Total Codes: ', postal_codes['Postal code'].shape[0])
print('Unique Codes:', postal_codes['Postal code'].unique().shape[0])

postal_codes['Neighborhood'] = postal_codes['Neighborhood'].apply(lambda x: x.replace(' / ', ', '))

Total Codes:  103
Unique Codes: 103


This is the final dataframe.

In [7]:
postal_codes.head(12)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


This is the shape of the final dataframe.

In [8]:
postal_codes.shape

(103, 3)

# Latitude and Longitude
The latitude and longitude for each postal code are imported.

In [9]:
lat_long = pd.read_csv('Geospatial_Coordinates.csv')
lat_long

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


The two dataframes are combined using the Postal Code column.

In [10]:
postal_codes.columns = ['Postal Code', 'Borough', 'Neighborhood']
boroughs = postal_codes.merge(lat_long, on='Postal Code')
boroughs

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# Clustering

Only boroughs which contain 'Toronto' are used in this clustering.

In [11]:
toronto_boroughs = boroughs[boroughs['Borough'].str.contains('Toronto')]
toronto_boroughs.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


Import necessary packages.

In [12]:
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

Fitting is done using location data, with 7 clusters.

In [21]:
# Get the features to fit
codes = toronto_boroughs[['Postal Code']]
locations = toronto_boroughs[['Latitude', 'Longitude']]

# Create kmeans
k_means = KMeans(n_clusters = 7, random_state = 3)
k_means.fit(locations)

# Get labels
codes['Labels'] = k_means.labels_
codes.head(39)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Postal Code,Labels
2,M5A,6
4,M7A,5
9,M5B,1
15,M5C,1
19,M4E,0
20,M5E,1
24,M5G,1
25,M6G,2
30,M5H,1
31,M6H,4


The clustering map is below. The boroughs have been clustered using location data
 - purple appears to be central toronto
 - blue appears to be west toronto
 - orange appears to be east toronto
 - yellow and cyan are northern toronto
 - green and red are outer toronto

In [22]:
map_clusters = folium.Map(location=[43.654260, -79.360636], zoom_start=11)

# set color scheme for the clusters
x = np.arange(7)
ys = [i + x + (i*x)**2 for i in range(7)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(locations['Latitude'], locations['Longitude'], codes['Postal Code'], codes['Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters