# Segmenting and Clustering Neighborhoods in Toronto

**1. Preparing the environment**

In [1]:
import numpy as np
import pandas as pd
import json
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes
import folium
print('Finished OK.')

Finished OK.


**2. web scraping**

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(page, 'html.parser')
wiki_table = soup.body.table.tbody

In [25]:
#wiki_table

**3. getting data from wiki_table and populating them into a dataframe**

In [4]:
# we will use two helper functions to do this
# First get_table_data() which takes as argument a: <tr>tr_content</tr> tag
# where "tr_content" is multiple <td>td_content</td>
# and appends "td_content" to dataframe row
def get_table_data(tr_tag):
    cells = tr_tag.find_all('td')
    row = []
    for cell in cells:
        if cell.a:            
            if (cell.a.text):
                row.append(cell.a.text)
                continue
        row.append(cell.string.strip())
        
    return row

In [7]:
# Second get_table_rows() which loops over all <tr>tr_content</tr> tags and call get_table_data()
def get_table_rows():    
    data = []  
    for tr in wiki_table.find_all('tr'):
        row = get_table_data(tr)
        if len(row) != 3:
            continue
        data.append(row)        
    
    return data

In [8]:
# Now let's get the first verion of the dataframe
data = get_table_rows()
columns = ['Postcode', 'Borough', 'Neighbourhood']
df = pd.DataFrame(data, columns=columns)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [9]:
df.shape

(288, 3)

**4. Cleaning the data**

**4.1  Ignore cells with a borough that is Not assigned**

In [12]:
df1 = df[df.Borough != 'Not assigned'] # drop all rows with df.Borough = 'Not assigned'
# print(df1.head())
df1 = df1.sort_values(by=['Postcode','Borough']) #sorting
# print(df1.head())
df1.reset_index(inplace=True) # reset index
# print(df1.head())
df1.drop('index',axis=1,inplace=True) # drop column 'index'

df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union


**4.2 combining neighborhoods falling into same postcode. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough**

In [23]:
df_postcodes = df1['Postcode']
df_postcodes.drop_duplicates(inplace=True) #remove duplicate postcodes
# add cloumns 'Borough' and 'Neighbourhood'
df2 = pd.DataFrame(df_postcodes)
df2['Borough'] = '';
df2['Neighbourhood'] = '';
# reset index since after removing duplicates index is altered
df2.reset_index(inplace=True)
# remove original (altered) index
df2.drop('index', axis=1, inplace=True)

# loop over index of df2 and df1 in order to fill the columns 'Borough' and 'Neighbouring' of df2
for i in df2.index:
    for j in df1.index:
        if df2.iloc[i, 0] == df1.iloc[j, 0]: # test if same postcode in df1 and df2
            df2.iloc[i, 1] = df1.iloc[j, 1]  # fill same Borough of df1 in df2
            df2.iloc[i, 2] = df2.iloc[i, 2] + ',' + df1.iloc[j, 2] # add 'Neighbouring' from df1 into df2.
# Since df2[i,2] (i.e 'Neighbouring') will start by ',', we will remove ','            
for i in df2.index:
    s = df2.iloc[i, 2]
    if s[0] == ',':
        s =s [1:]
    df2.iloc[i,2 ] = s
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1C,Scarborough,Malvern
2,M1E,Scarborough,Highland Creek
3,M1G,Scarborough,Rouge Hill
4,M1H,Scarborough,Port Union


In [24]:
df2.shape

(103, 3)