# Segmenting and Clustering neighbourhoods in Toronto

The first task will be to create a dataframe including the neighbourhoods, postal code and Borough. This is done via a wikipedia page. First off the necessary libraries will be installed.

In [25]:
!pip install bs4
!pip install geocoder
import pandas as pd
import numpy  as np
import requests
from bs4 import BeautifulSoup
import matplotlib.cm as cm
import matplotlib.colors as colors
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    attrs-21.2.0               |     pyhd8ed1ab_0          44 KB  conda-forge
    branca-0.4.2               |     pyhd8ed1ab_0          26 KB  conda-forge
    entrypoints-0.3            |  pyhd8ed1ab_1003           8 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    jsonschema-3.2.0           |     pyhd8ed1ab_3          45 KB  conda-forge
    pandas-1.1.5               |   py36h28

First we will download the wikipedia page:

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html_data=requests.get(url)
html_data.encoding

'UTF-8'

Next we will parse the data into a beautiful soup object and find the table and implement it in a data frame.

In [3]:
#Parse the data to beautiful soup
soup = BeautifulSoup(html_data.text)
table_contents=[]
table=soup.find('table')
#Implement the data into a list

for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

#Add the data to a dataframe
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                            'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                           'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                            'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

Now to see what the shape of dataframe is and to prepare the data for merging we rename the postal code column.

In [4]:
df.sort_values(by=['PostalCode'])
df = df.rename(columns={'PostalCode':'Postal Code'})
df.shape


(103, 3)

Next we will try to get the locations of the different neighbourhoods using the csv data, because the geocoder did not work. After completing this we merge the two dataframes.

In [7]:
url ='https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
df_geospatial = pd.read_csv(url)
df_geospatial.sort_values(by=['Postal Code'])

df_positioned = pd.merge(df, df_geospatial, on="Postal Code")
df_positioned

    


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


Now we will cluster and explore the outputs. First we import the necessary libraries

In [13]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library





Next we normalize the data and only use the positional data 

In [17]:
df_positiononly=df_positioned.drop(columns=['Postal Code','Borough','Neighborhood'])
X = df_positiononly.values[:,1:]
X = np.nan_to_num(X)
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset



array([[ 0.69818881],
       [ 0.84388426],
       [ 0.37773518],
       [-0.6993678 ],
       [ 0.07922652],
       [-1.39737754],
       [ 2.09777597],
       [ 0.465121  ],
       [ 0.90216906],
       [ 0.18842596],
       [-0.49568547],
       [-1.62993333],
       [ 2.44798852],
       [ 0.58164715],
       [ 0.81474393],
       [ 0.22482887],
       [-0.32106485],
       [-1.86243118],
       [ 2.15613628],
       [ 1.07704414],
       [ 0.24667041],
       [-0.58298336],
       [ 1.86437197],
       [ 0.3486083 ],
       [ 0.10106496],
       [-0.26285143],
       [ 1.6310228 ],
       [ 0.3486083 ],
       [-0.46658445],
       [ 0.49425098],
       [ 0.1301846 ],
       [-0.46658445],
       [ 1.6310228 ],
       [ 0.523382  ],
       [-0.9320953 ],
       [ 0.61078127],
       [ 0.15930528],
       [-0.23374316],
       [ 1.39772948],
       [ 0.1156253 ],
       [-0.6993678 ],
       [ 0.465121  ],
       [ 0.16112481],
       [-0.32106485],
       [ 1.16449306],
       [ 0

Then we cluster the neighbourhoods.

In [18]:
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cluster_dataset)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 3, 1, 4, 1, 0, 2, 3, 3, 1], dtype=int32)

For the visualization we need to install geocoders and find the coordinates of Toronto

In [21]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronte is {}, {}.'.format(latitude, longitude))

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.12.5  |       ha878542_0         137 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-2.1.0                |     pyhd3deb0d_0          64 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         235 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-2.1.0-pyhd3deb0d_0

The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    pkgs/main::ca-

We add the cluster labels to the dataframe to allow for visualisation.

In [23]:
df_visual=df_positioned
df_visual.insert(0, 'Cluster Labels', kmeans.labels_)
df_visual

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,3,M3A,North York,Parkwoods,43.753259,-79.329656
1,3,M4A,North York,Victoria Village,43.725882,-79.315572
2,1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,4,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,1,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...,...
98,4,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,1,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,3,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,4,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


Then we visualize the data and this shows the neighbourhoods that are closest together according to their geological position.

In [27]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_visual['Latitude'], df_visual['Longitude'], df_visual['Neighborhood'], df_visual['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

A bit simple maybe but it shows the basics ;)