# Segmenting and Clustering Neighborhoods in the city of Toronto, Canada

Get the postal code information from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M and store it in a dataframe.

Import useful libraries.

In [192]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

Retrieve table content from the wikipedia page using BeautifulSoup.

In [193]:
# retrieve table using BeautifulSoup
# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
# The first <table> element of the page is taken as the postal-code table.
table = soup.find_all('table')[0]

### Parsing

##### Assumptions
- table is made of *tr* and *td*
- in each *td* there is a *p* 
- the postal_code is in the *b* tag within *p*  
- the borough and neighborhood info is in *span* within *p*
- if *span* contains an *i*, the cell is "Not assigned" and we skip to the next one
- the first info in *span* is the borough and the remaining text is related to the Neighborhood (which will be parsed later)

In [194]:
# Accumulator for the dataframe: one list per output column.
content = {column: [] for column in ('PostalCode', 'Borough', 'Neighborhood')}

# Visit every <td> of every <tr>, in document order.
for row in table.find_all('tr'):
    for cell in row.find_all('td'):
        paragraph = cell.p

        # The postal code is the bold text inside the cell's <p>.
        code = paragraph.b.get_text()

        # An <i> inside the <span> marks a cell without an assigned borough;
        # only assigned cells are recorded.
        if not paragraph.span.i:
            # The first comma-separated piece of the <span> text is the borough;
            # the remaining pieces are the (still unparsed) neighborhood text.
            pieces = paragraph.span.get_text(separator=',').split(',')
            content['PostalCode'].append(code)
            content['Borough'].append(pieces[0])
            content['Neighborhood'].append(' '.join(str(piece) for piece in pieces[1:]))
            

Create the dataframe.

In [195]:
# Build the dataframe from the column lists collected in `content`.
df = pd.DataFrame(data=content)
# Display the first rows as a quick check of the parsed data.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,( Parkwoods )
1,M4A,North York,( Victoria Village )
2,M5A,Downtown Toronto,( Regent Park / Harbourfront )
3,M6A,North York,( Lawrence Manor / Lawrence Heights )
4,M7A,Queen's Park,/ Ontario Provincial Government


#### List Boroughs and number of associated postal codes

In [196]:
# Group the rows by borough and count the non-null entries in each remaining column.
df.groupby('Borough').count()

Unnamed: 0_level_0,PostalCode,Neighborhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,9,9
Downtown Toronto,18,18
East Toronto,5,5
East York,5,5
Etobicoke,12,12
Mississauga,1,1
North York,24,24
Queen's Park,1,1
Scarborough,17,17
West Toronto,6,6


Fix two boroughs: Mississauga and Queen's Park.

In [197]:
# Show the row(s) whose borough is Mississauga.
df[ df['Borough'] == 'Mississauga' ]

Unnamed: 0,PostalCode,Borough,Neighborhood
76,M7R,Mississauga,Canada Post Gateway Processing Centre (Enclave...


In [198]:
# Index label of the first row whose borough is Mississauga.
idx = df[ df['Borough'] == 'Mississauga' ].index[0]
# Assign through a single .loc call. `df.iloc[idx].Neighborhood = ...` is chained
# assignment: it writes to an intermediate Series that is not guaranteed to be a
# view of df. `idx` is also an index label, so .loc (label-based) is the matching
# accessor rather than .iloc (position-based).
df.loc[idx, 'Neighborhood'] = '(Mississauga)'
# Display the updated row.
df.loc[idx]

PostalCode                M7R
Borough           Mississauga
Neighborhood    (Mississauga)
Name: 76, dtype: object

In [199]:
# Show the row(s) whose borough is Queen's Park.
df[ df['Borough'] == 'Queen\'s Park' ]

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M7A,Queen's Park,/ Ontario Provincial Government


In [200]:
# Index label of the first row whose borough is Queen's Park.
idx = df[ df['Borough'] == 'Queen\'s Park' ].index[0]
# Assign through .loc instead of `df.iloc[idx].<column> = ...`: the chained form
# writes to an intermediate Series that is not guaranteed to be a view of df, and
# `idx` is an index label, which is what .loc expects.
df.loc[idx, 'Neighborhood'] = '( Queen\'s Park )'
df.loc[idx, 'Borough'] = 'Downtown Toronto'
# Display the updated row.
df.loc[idx]

PostalCode                   M7A
Borough         Downtown Toronto
Neighborhood    ( Queen's Park )
Name: 4, dtype: object

#### Fix, Extract and List all Neighborhoods

In [201]:
# Show the row whose neighborhood text equals this exact string
# (note the double space after "mail").
df[ df['Neighborhood'] == 'Business reply mail  Processing Centre 969 Eastern (Enclave of M4L)']

Unnamed: 0,PostalCode,Borough,Neighborhood
100,M7Y,East Toronto,Business reply mail Processing Centre 969 Eas...


In [202]:
# Index label of the first row carrying this exact neighborhood text.
idx = df[ df['Neighborhood'] == 'Business reply mail  Processing Centre 969 Eastern (Enclave of M4L)'].index[0]
# Wrap the whole text in one pair of parentheses and drop the inner pair.
# Assigned through .loc: `df.iloc[idx].Neighborhood = ...` is chained assignment on
# an intermediate Series, and `idx` is an index label rather than a position.
df.loc[idx, 'Neighborhood'] = '( Business reply mail  Processing Centre 969 Eastern Enclave of M4L )'

In [203]:
# Show the row whose neighborhood text equals this exact string.
df[ df['Neighborhood'] == 'Stn A PO Boxes 25 The Esplanade (Enclave of M5E)']

Unnamed: 0,PostalCode,Borough,Neighborhood
92,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade (Enclave of M5E)


In [204]:
# Index label of the first row carrying this exact neighborhood text.
idx = df[ df['Neighborhood'] == 'Stn A PO Boxes 25 The Esplanade (Enclave of M5E)'].index[0]
# Wrap the whole text in one pair of parentheses and drop the inner pair.
# Assigned through .loc: `df.iloc[idx].Neighborhood = ...` is chained assignment on
# an intermediate Series, and `idx` is an index label rather than a position.
df.loc[idx, 'Neighborhood'] = '( Stn A PO Boxes 25 The Esplanade Enclave of M5E )'

In [205]:
def normalize(row: str) -> str:
    """Extract the parenthesised neighborhood text from a raw cell string.

    Text sitting between a closing and a following opening parenthesis is
    first replaced by ' / ' so that adjacent groups are merged into one.
    The parenthesised text is then extracted, the parentheses are removed
    and every '/' is turned into ','. Surrounding whitespace is not trimmed.

    Parameters
    ----------
    row : str
        Raw neighborhood text, e.g. '( Regent Park / Harbourfront )'.

    Returns
    -------
    str
        The extracted text, e.g. ' Regent Park , Harbourfront ', or an empty
        string when `row` contains no matching parenthesised group.
    """
    # Characters allowed between the delimiters: ASCII letters, digits, and the
    # ASCII range from space to '/' (written ` -/`), which covers
    # space ! " # $ % & ' ( ) * + , - . and /.
    # Digits are included so that names such as
    # '( Stn A PO Boxes 25 The Esplanade Enclave of M5E )' are extracted rather
    # than reduced to an empty string.
    chars = r"[a-zA-Z0-9 -/]"

    # Text between ')' and the next '(' is collapsed into a ' / ' separator.
    embed = ''.join(re.findall(r'[)]' + chars + r'+[(]', row))
    if embed:
        row = row.replace(embed, ' / ')

    # Parentheses are themselves part of `chars`, so the match is greedy and
    # runs from an opening '(' to the last reachable ')'.
    neigs = ''.join(re.findall(r'[(]' + chars + r'+[)]', row))
    return neigs.replace('(', '').replace(')', '').replace('/', ',')
# Run normalize on every neighborhood string and store the result back in the
# same column; the function is passed directly rather than through a lambda.
df['Neighborhood'] = df['Neighborhood'].apply(normalize)

In [206]:
# Display the first ten rows after normalization.
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,Garden District Ryerson


In [207]:
# (number of rows, number of columns) of the final dataframe.
df.shape

(103, 3)

In [208]:
# Write the dataframe to a CSV file, leaving out the index column.
df.to_csv('toronto_bor_neigs.csv', index=False)