# Part 1: Web Scraping

First I imported the required libraries

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

Then I downloaded the HTML of the Wikipedia page listing the neighborhoods in Toronto and stored it in a soup object.

In [2]:
# Download a fixed historical revision of the Wikipedia page (selected via `oldid`).
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get("https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641", timeout=30)
response.raise_for_status()
html_data = response.text
soup = BeautifulSoup(html_data,"html5lib")

I created a pandas dataframe with the data in the table. I ignored rows where there is no Postal Code or Borough, and in the rows where there is no neighbourhood, I used the name of the borough as the neighbourhood.

In [3]:
# Build one record per table row that has both a postal code and a borough assigned.
not_assigned = 'Not assigned'
table_contents = []
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page.')

for row in table.find_all('tr'):
    # Rows without three <td> cells (e.g. a header row made of <th>) are skipped.
    # This replaces decomposing the first row, which mutated `soup` and removed
    # one more row every time the cell was re-run.
    cells = [td.text.strip() for td in row.find_all('td')]
    if len(cells) < 3:
        continue
    postal_code, borough, neighborhood = cells[:3]
    if not_assigned in (postal_code, borough):
        continue
    table_contents.append({
        'PostalCode': postal_code,
        'Borough': borough,
        # A row with a borough but no neighborhood falls back to the borough name.
        'Neighborhood': borough if neighborhood == not_assigned else neighborhood,
    })

# Explicit columns keep the frame well-formed even if no row qualified.
df = pd.DataFrame(table_contents, columns=['PostalCode', 'Borough', 'Neighborhood'])
# Normalise borough cells whose text was fused with extra descriptive text.
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

Finally I grouped the data by postal code, kept one borough per postal code, and combined the neighbourhoods of each postal code into a single comma-separated entry.

In [4]:
# Collapse the scraped rows to one row per postal code.
grouped_df = df.groupby("PostalCode")

# The aggregations replace the row-by-row loops that wrote through chained indexing
# (result['Borough'][i] = ...), which raises SettingWithCopyWarning and is not
# guaranteed to modify `result` at all.
result = pd.DataFrame({
    # Take the first borough seen for the postal code; unlike list(set(...))[0],
    # this choice is deterministic.
    'Borough': grouped_df["Borough"].first(),
    # Same text as str(list_of_names)[1:-1]: each name in quotes, comma-separated.
    'Neighborhood': grouped_df["Neighborhood"].apply(lambda names: ', '.join(map(repr, names))),
}).reset_index()

result.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"'Rouge', 'Malvern'"
1,M1C,Scarborough,"'Highland Creek', 'Rouge Hill', 'Port Union'"
2,M1E,Scarborough,"'Guildwood', 'Morningside', 'West Hill'"
3,M1G,Scarborough,'Woburn'
4,M1H,Scarborough,'Cedarbrae'


Not to forget to display the shape.

In [5]:
# Report the (rows, columns) shape of the grouped dataframe.
print("The shape of the following dataframe is {}".format(result.shape))

The shape of the following dataframe is (103, 3)


# Part 2: Latitude and Longitude

First I downloaded the data from the csv file provided, I decided to download it to have a better accuracy, then I read the data and stored it in a dataframe.

In [6]:
# Download the coordinates CSV to a local file, then load it.
# requests is used instead of `!wget -q` so that a failed download raises here
# rather than silently printing the success message, and so the cell does not
# depend on wget being installed.
coordinates_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
coordinates_path = 'Geospatial_Coordinates.csv'

coordinates_response = requests.get(coordinates_url, timeout=30)
coordinates_response.raise_for_status()
with open(coordinates_path, 'wb') as csv_file:
    csv_file.write(coordinates_response.content)
print('Data downloaded!')

data = pd.read_csv(coordinates_path)
data

Data downloaded!


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


Then I merged both dataframes and removed the duplicated Postal Code.

In [7]:
# Attach coordinates to each postal code by joining on the postal-code key.
# A keyed merge replaces the positional pd.concat(axis=1), which pairs rows purely
# by index position and would silently attach the wrong coordinates if the two
# frames differed in order or length.
neighbors = (
    result
    .merge(
        data,
        how='left',                # keep every scraped postal code, in its original order
        left_on='PostalCode',
        right_on='Postal Code',
        validate='one_to_one',     # fail loudly if either side repeats a postal code
    )
    .drop(columns='Postal Code')   # duplicate of 'PostalCode' after the join
)
neighbors.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"'Rouge', 'Malvern'",43.806686,-79.194353
1,M1C,Scarborough,"'Highland Creek', 'Rouge Hill', 'Port Union'",43.784535,-79.160497
2,M1E,Scarborough,"'Guildwood', 'Morningside', 'West Hill'",43.763573,-79.188711
3,M1G,Scarborough,'Woburn',43.770992,-79.216917
4,M1H,Scarborough,'Cedarbrae',43.773136,-79.239476


# Part 3: Clustering neighbors

First I imported the used libraries.

In [8]:
from geopy.geocoders import Nominatim 

import requests 
from pandas.io.json import json_normalize 

import matplotlib.cm as cm
import matplotlib.colors as colors

import numpy as np
import folium 

from sklearn.cluster import KMeans

print('Libraries imported.')

Libraries imported.


Then I geocoded the address of Toronto to get its latitude and longitude.

In [9]:
# Look up the coordinates of Toronto; they are used later to centre the map.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; fail with a clear message instead of
# an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


I one-hot encoded the boroughs and the postal codes so they can be used by the clustering algorithm.

In [10]:
# One indicator column per distinct borough and per distinct postal code
# (empty prefix, so the column names are the raw values).
borough_postal_dummies = pd.get_dummies(neighbors[['Borough','PostalCode']], prefix="", prefix_sep="")

# Put the neighborhood name in front of the indicator columns.
neighbors_onehot = pd.concat([neighbors[['Neighborhood']], borough_postal_dummies], axis=1)

neighbors_onehot.head()

Unnamed: 0,Neighborhood,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,...,M9A,M9B,M9C,M9L,M9M,M9N,M9P,M9R,M9V,M9W
0,"'Rouge', 'Malvern'",0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,"'Highland Creek', 'Rouge Hill', 'Port Union'",0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,"'Guildwood', 'Morningside', 'West Hill'",0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,'Woburn',0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,'Cedarbrae',0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Here I trained the KMeans clustering algorithm with the data.

In [11]:
kclusters = 5
# Keep only the indicator columns; the neighborhood name is a label, not a feature.
# `columns=` is used because DataFrame.drop no longer accepts the axis as a
# positional argument in pandas 2.0+ (the old drop('Neighborhood', 1) raises TypeError).
neighbors_grouped_clustering = neighbors_onehot.drop(columns='Neighborhood')

# run k-means clustering
# n_init is set explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(neighbors_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100]

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0,
       1, 1, 1, 1, 1, 2, 2, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 0, 4], dtype=int32)

Here I inserted the cluster labels into the main dataframe.

In [12]:
# Put the cluster label of each row in the first column of the main dataframe.
# DataFrame.insert raises ValueError if the column already exists, so drop a label
# column left over from a previous run to keep this cell re-runnable.
if 'Cluster Labels' in neighbors.columns:
    neighbors = neighbors.drop(columns='Cluster Labels')
neighbors.insert(0, 'Cluster Labels', kmeans.labels_)

neighbors.head()

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,3,M1B,Scarborough,"'Rouge', 'Malvern'",43.806686,-79.194353
1,3,M1C,Scarborough,"'Highland Creek', 'Rouge Hill', 'Port Union'",43.784535,-79.160497
2,3,M1E,Scarborough,"'Guildwood', 'Morningside', 'West Hill'",43.763573,-79.188711
3,3,M1G,Scarborough,'Woburn',43.770992,-79.216917
4,3,M1H,Scarborough,'Cedarbrae',43.773136,-79.239476


Finally I created the folium map with the 5 clusters I decided to use and displayed them.

The map shows how Toronto is distributed. We can identify 5 main zones: the center of the city, which has the greatest density of neighborhoods; the areas surrounding the center; and the 3 outer zones to the north, east and west, which have the lowest density. One remarkable point is an outlier, colored red, which corresponds to a processing centre.

In [14]:
# Base map centred on the Toronto coordinates obtained from the geocoder.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)


# set color scheme for the clusters: one hex colour per cluster, evenly spaced
# along the rainbow colormap (only the length of `ys` is used below)
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# add markers to the map
markers_colors = []
marker_columns = ['Latitude', 'Longitude', 'Neighborhood', 'Cluster Labels']
for lat, lon, poi, cluster in neighbors[marker_columns].itertuples(index=False, name=None):
    # The palette is indexed with cluster-1, so label 0 wraps around to the last colour.
    marker_color = rainbow[cluster-1]
    label = folium.Popup('{} Cluster {}'.format(poi, cluster), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    )
    marker.add_to(map_clusters)

map_clusters