In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
!conda install -c conda-forge folium=0.5.0 --yes
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge


To see the interactive maps, you can also view this notebook via this link:

https://dataplatform.cloud.ibm.com/analytics/notebooks/v2/fb8b32d6-7c1d-4fda-b1f6-365497d8796f/view?access_token=ebcf354359f0ba1b2f98179046d854fd53a0cd0ae202a484c629542adb571bc5


# Part 1: Web scraping for Toronto neighborhood and cleaning the data


In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M"
# and parse it with Beautiful Soup.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not hang the notebook indefinitely on a stalled connection
)
# Fail fast on an HTTP error status instead of silently parsing an error page.
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html.parser')

## The data is in the table
Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [3]:
# Collect the three columns of the first <table> on the page, one list per column.
postalCodeList = []
boroughList = []
neighborhoodList = []

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    # Skip rows without <td> cells and any row that lacks the three cells
    # indexed below, which would otherwise raise an IndexError.
    if len(cells) >= 3:
        # Strip surrounding whitespace/newlines from every cell, not only the last one.
        postalCodeList.append(cells[0].text.strip())
        boroughList.append(cells[1].text.strip())
        neighborhoodList.append(cells[2].text.strip())

In [4]:
# Assemble the three scraped columns into one DataFrame and preview it.
toronto_hood = [
    ('Postal Code', postalCodeList),
    ('Borough', boroughList),
    ('Neighborhood', neighborhoodList),
]
df = pd.DataFrame(dict(toronto_hood))
df.head()

Unnamed: 0,Borough,Neighborhood,Postal Code
0,Not assigned,Not assigned,M1A
1,Not assigned,Not assigned,M2A
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,Harbourfront,M5A


We can see that some rows have no assigned borough ("Not assigned"). We need to remove these rows.

In [5]:
# Keep only the rows whose borough is assigned, renumbering the index from 0.
has_borough = df['Borough'] != 'Not assigned'
df_dropNA = df.loc[has_borough].reset_index(drop=True)
df_dropNA.head()

Unnamed: 0,Borough,Neighborhood,Postal Code
0,North York,Parkwoods,M3A
1,North York,Victoria Village,M4A
2,Downtown Toronto,Harbourfront,M5A
3,Downtown Toronto,Regent Park,M5A
4,North York,Lawrence Heights,M6A


"More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma."

In [6]:
# One row per (postal code, borough) pair; the neighbourhood names of each
# pair are joined into a single comma-separated string.
postal_groups = df_dropNA.groupby(['Postal Code', 'Borough'], as_index=False)
toronto_df_groupedby = postal_groups.agg({'Neighborhood': ','.join})
toronto_df_groupedby.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


"If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough."

In [7]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name,
# then display the rows that were filled in.
na_neigh_rows = toronto_df_groupedby['Neighborhood'].eq('Not assigned')
toronto_df_groupedby['Neighborhood'] = toronto_df_groupedby['Neighborhood'].mask(
    na_neigh_rows, toronto_df_groupedby['Borough'])
toronto_df_groupedby[na_neigh_rows]

Unnamed: 0,Postal Code,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


We now have a dataset we can work with. Let's go!

## Getting geolocation data for the Postal Codes

In [8]:
# Fetch the postal-code coordinates CSV with wget (-q: quiet, -O: output file name).
!wget -q -O "toronto_coordinates.csv" http://cocl.us/Geospatial_data
# NOTE(review): this message is printed whether or not wget succeeded; a failed
# download only surfaces in the read_csv call below.
print('Coordinates downloaded!')
# Load the downloaded file into a DataFrame.
coors = pd.read_csv('toronto_coordinates.csv')

Coordinates downloaded!


In [9]:
# Print the (rows, columns) shape of the coordinates table, then preview its first rows.
print(coors.shape)
coors.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Index both tables by postal code, then keep only the codes present in both,
# placing the coordinate columns next to the borough/neighbourhood columns.
toronto_df_temp, coors_temp = (
    frame.set_index('Postal Code') for frame in (toronto_df_groupedby, coors)
)
toronto_df_coors = pd.concat([toronto_df_temp, coors_temp], axis=1, join='inner')
toronto_df_coors.head()


Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [11]:
# Turn the 'Postal Code' index back into a regular column.
# A new frame is built (rather than resetting toronto_df_coors in place) so that
# toronto_df_coors is left untouched and this cell can be re-run safely.
toronto_clean = toronto_df_coors.rename_axis('Postal Code').reset_index()
toronto_clean.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Explore and cluster the Toronto neighborhoods

In [12]:
address = 'Toronto, Ontario'

# Look up the coordinates of the address with the Nominatim geocoder.
geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; report that explicitly rather
# than failing with an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [13]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# One circle marker per postal code, labelled "Borough (Postal Code): Neighborhood(s)".
marker_columns = ['Latitude', 'Longitude', 'Postal Code', 'Borough', 'Neighborhood']
for lat, lon, post, borough, neigh in zip(*(toronto_clean[col] for col in marker_columns)):
    label = "{} ({}): {}".format(borough, post, neigh)
    # parse_html is a Popup option; it is set here and not on the marker itself.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=popup,
        color='red',
        fill=True,
        fill_color='#4442cc',
        fill_opacity=0.5).add_to(map_toronto)

map_toronto

"Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto "

Now we want to cluster the neighbourhoods. To keep things simple, and since this is just a showcase, we will only use boroughs with "Toronto" in their name.

In [14]:
# Restrict the data to the four boroughs whose name contains "Toronto",
# renumber the rows from 0, and report the size of the result.
toronto_boroughs = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
in_toronto = toronto_clean['Borough'].isin(toronto_boroughs)
toronto_central_df = toronto_clean.loc[in_toronto].reset_index(drop=True)
print(toronto_central_df.shape)
toronto_central_df.head()

(38, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [15]:
# Fresh base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# One circle marker per postal code of the selected boroughs,
# labelled "Borough (Postal Code): Neighborhood(s)".
marker_columns = ['Latitude', 'Longitude', 'Postal Code', 'Borough', 'Neighborhood']
for lat, lon, post, borough, neigh in zip(*(toronto_central_df[col] for col in marker_columns)):
    label = "{} ({}): {}".format(borough, post, neigh)
    # parse_html is a Popup option; it is set here and not on the marker itself.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

## Now we will do clustering with the k-means algorithm
For this, let's say the Mayor of Toronto is unhappy with the current Toronto districts and would like to split the downtown area from the initial 4 boroughs into 7 districts. To do this, he would like to find neighbourhoods that lie close together and group them into the new districts (our clusters).

In [16]:
# Number of clusters (new districts) to form.
kclusters = 7

# Feature matrix for clustering: the coordinates only. The columns are selected
# by name (instead of dropping the others with a positional axis argument, which
# recent pandas versions no longer accept) so that any extra column present in
# toronto_central_df can never leak into the features.
toronto_central_clustering = toronto_central_df.loc[:, ['Latitude', 'Longitude']]

toronto_central_clustering.head()

Unnamed: 0,Latitude,Longitude
0,43.676357,-79.293031
1,43.679557,-79.352188
2,43.668999,-79.315572
3,43.659526,-79.340923
4,43.72802,-79.38879


We will use the coordinates for the clustering.

In [17]:
from sklearn.cluster import KMeans

# Fit k-means on the coordinate columns; random_state is fixed so that the
# cluster labels are the same on every run.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_central_clustering)

# Attach the labels to a NEW frame and sort it by cluster. toronto_central_df
# itself is left unchanged, so earlier cells can be re-run without a stray
# 'Cluster' column or a changed row order.
toronto_central_clustered_df = (
    toronto_central_df
    .assign(Cluster=kmeans.labels_)
    .sort_values(['Cluster'])
)
toronto_central_clustered_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0
11,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675,0
13,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636,0
32,M6J,West Toronto,"Little Portugal,Trinity",43.647927,-79.41975,1


In [18]:
import numpy as np

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, post, bor, poi, cluster in zip(toronto_central_clustered_df['Latitude'], toronto_central_clustered_df['Longitude'],
                                             toronto_central_clustered_df['Postal Code'], toronto_central_clustered_df['Borough'],
                                             toronto_central_clustered_df['Neighborhood'], toronto_central_clustered_df['Cluster']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    # k-means labels run from 0 to kclusters-1, so they index the palette
    # directly (no "-1" offset that would wrap cluster 0 around to the last colour).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

We can see:

Cluster 0 - Upper Eastside Toronto (from Old East, Downtown)

Cluster 1 - An "Upper Toronto Westside" (from old West and Downtown neighbourhoods)

Cluster 2 - A new Central (from Central and Downtown)

Cluster 3 - A part of Downtown stays Downtown

Cluster 4 - There is a clear West Toronto (from old West Toronto neighbourhoods)

Cluster 5 - North Toronto (from old northern Central neighbourhoods)

Cluster 6 - A clear East Toronto (from old East Toronto neighbourhoods)



Of course we can also visualize these new distributions:



In [51]:
# Distinct (borough, cluster) pairs: which old boroughs contribute to each new cluster.
# The index is reset before de-duplicating, so the surviving index values are
# row positions in the cluster-sorted table.
results2 = (
    toronto_central_clustered_df[["Borough", "Cluster"]]
    .reset_index(drop=True)
    .drop_duplicates()
)
results2

Unnamed: 0,Borough,Cluster
0,East Toronto,0
2,Downtown Toronto,0
4,West Toronto,1
6,Downtown Toronto,1
8,Central Toronto,2
9,Downtown Toronto,2
14,Downtown Toronto,3
26,West Toronto,4
29,Central Toronto,5
35,East Toronto,6


As a reminder, this was our old map of Toronto. Have fun with the new districts, Mayor!

In [20]:
# Re-display map_toronto as last assigned by an earlier cell (markers without cluster colouring).
# NOTE(review): the result depends on which cell assigned map_toronto last; confirm with Restart & Run All.
map_toronto