# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import folium
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors


## Problem #1

### Using a Wikipedia page that contains postal codes and boroughs for neighborhoods in Canada, create a dataframe that contains the necessary data.

Send a request to fetch the Wikipedia page and create a BeautifulSoup object to parse it

In [2]:
# Download the Wikipedia page of "M" postal codes and parse its HTML.
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely if the server does not respond.
page = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

Extract the contents of the page's table and place them in a table contents list

In [3]:
# Build one record (PostalCode, Borough, Neighborhood) per assigned postal code
# from the first <table> element on the page.
table_contents = []
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; report that explicitly
    # rather than failing later with an AttributeError on None.
    raise ValueError('No <table> element found in the downloaded page')

# find_all is the supported method name; findAll is a deprecated alias.
for row in table.find_all('td'):
    span_text = row.span.text
    # Cells marked 'Not assigned' carry no borough/neighborhood data.
    if span_text == 'Not assigned':
        continue

    # The span text has the form "<borough>(<neighborhoods>)".
    parts = span_text.split('(')
    neighborhood = parts[1].strip(')')           # drop the closing parenthesis
    neighborhood = neighborhood.replace(' /', ',')  # " /" separators become commas
    neighborhood = neighborhood.replace(')', ' ')   # any remaining ')' becomes a space
    neighborhood = neighborhood.strip(' ')

    table_contents.append({
        'PostalCode': row.p.text[:3],  # the first three characters are the postal code
        'Borough': parts[0],
        'Neighborhood': neighborhood,
    })

Create a dataframe from the table contents and update borough names to be more succinct

In [4]:
# Map the run-together borough strings produced by the scrape (keys) to
# shorter, readable borough names (values).
borough_renames = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

# One row per scraped record; boroughs not listed above are left unchanged.
df = pd.DataFrame(table_contents)
df['Borough'] = df['Borough'].replace(borough_renames)

View first 5 records of newly created dataframe

In [5]:
# Preview the first rows (head() shows five by default) of the scraped dataframe.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


Display the dimensions of the dataframe

In [6]:
# (number of rows, number of columns) of the scraped dataframe.
df.shape

(103, 3)

## Problem #2

### Create a new dataframe that also has the geospatial coordinates of those neighborhoods present in the initial dataframe

Read the provided 'Geospatial_Coordinates.csv' file into a temporary dataframe. This file contains the geospatial coordinates of the neighborhoods in the initial dataframe.

In [7]:
# Load the coordinates lookup from a CSV given by relative path
# (resolved against the current working directory), then preview it.
coordinates_csv = 'Geospatial_Coordinates.csv'
df_coor = pd.read_csv(coordinates_csv)
df_coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Rename the Postal Code column to match the column name in the initial dataframe

In [8]:
# Rename 'Postal Code' to 'PostalCode' so it matches the key column of df.
# Reassigning (instead of inplace=True) avoids mutating the frame object that
# an earlier cell already displayed.
df_coor = df_coor.rename(columns={'Postal Code': 'PostalCode'})
df_coor

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


Merge the initial dataframe and the coordinates dataframe to create a new dataframe that displays all the necessary data.

In [9]:
# Attach latitude/longitude to each postal code. The join key is named
# explicitly so the merge does not silently depend on whichever column names
# the two frames happen to share; how='inner' keeps only postal codes present
# in both frames.
df_can = pd.merge(df, df_coor, how='inner', on='PostalCode')
df_can.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [10]:
# (number of rows, number of columns) of the merged dataframe.
df_can.shape

(103, 5)

## Problem #3

### Segment and cluster the neighborhoods in Toronto

Create a new dataframe with only the boroughs that contain the word 'Toronto'

In [11]:
# Boolean mask: True for rows whose Borough text contains 'Toronto'.
is_toronto_borough = df_can['Borough'].str.contains('Toronto')

# Keep only those rows and renumber the index from 0.
toronto_data = df_can[is_toronto_borough].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


Display the number of boroughs that contain the word Toronto and the number of neighborhoods in those boroughs

In [12]:
# Count the distinct borough values and the rows (one per postal code) in toronto_data.
borough_count = len(toronto_data['Borough'].unique())
neighborhood_count = toronto_data.shape[0]

summary_template = 'There are {} boroughs that contain the word Toronto and there are {} neighborhoods in those boroughs.'
print(summary_template.format(borough_count, neighborhood_count))

There are 7 boroughs that contain the word Toronto and there are 39 neighborhoods in those boroughs.


Find the geographical coordinates of Toronto, Canada

In [13]:
# Look up the coordinates used to centre the maps below.
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; surface that
    # clearly instead of failing with an AttributeError on the next line.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


Create a map of Toronto, Canada that shows its neighborhoods 

In [14]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row, with the neighborhood name as its popup
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    # parse_html=True is given to the Popup so the label is treated as text, not markup.
    # It is a Popup option only, so it is no longer also passed to CircleMarker.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

Use K-Means clustering to assign each neighborhood to one of 4 mutually exclusive clusters and create labels for each cluster

In [15]:
# Number of clusters to fit.
kclusters = 4

# Cluster on the coordinates only. The two feature columns are selected by
# name rather than by dropping the others, for two reasons:
#  * re-running this cell after 'Cluster Labels' has been added to
#    toronto_data must not feed the old labels back in as a feature;
#  * DataFrame.drop(labels, 1) relies on a positional `axis` argument that
#    recent pandas versions no longer accept.
toronto_cluster = toronto_data[['Latitude', 'Longitude']]

# run k-means clustering; random_state fixes the initialisation and n_init is
# given explicitly so the result does not depend on the library's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_cluster)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

array([3, 3, 3, 0, 3, 3, 1, 3, 1, 0], dtype=int32)

Add the labels back to the Toronto dataframe to identify which cluster each neighborhood belongs to.

In [16]:
# Attach each row's k-means label as a 'Cluster Labels' column
# (kmeans.labels_ is in the same row order as the data that was fitted).
toronto_data = toronto_data.assign(**{'Cluster Labels': kmeans.labels_})
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,3
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,3


Create a map that shows the newly created clusters of Toronto neighborhoods

In [17]:
# Base map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood'], toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (the earlier `cluster-1` sent label 0 to the last colour via negative indexing).
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters