<strong>Explore and cluster the neighborhoods in Toronto - part 1: obtaining the neighborhoods</strong>

Scrape raw table from source

In [1]:
# Library for opening url and creating 
# requests 
import urllib.request 
# for parsing all the tables present 
# on the website 
from html_table_parser import HTMLTableParser
import pandas as pd 


def url_get_contents(url):
    """Fetch a URL and return the raw bytes of the response body.

    Args:
        url: Address to request; it is passed to ``urllib.request.Request``.

    Returns:
        bytes: The unmodified response body. Decoding is left to the caller.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``HTTPError``, a subclass, is raised for HTTP error statuses).
    """
    req = urllib.request.Request(url=url)
    # The context manager closes the response (and its underlying socket or
    # file handle) even if reading fails, instead of leaving the cleanup to
    # the garbage collector.
    with urllib.request.urlopen(req) as response:
        return response.read()

# Download the Wikipedia page that lists the Canadian postal codes starting
# with "M" and decode the response body as UTF-8 text.
source_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
xhtml = url_get_contents(source_url).decode('utf-8')

# Hand the markup to the table parser; the tables it finds are exposed
# through its `tables` attribute.
p = HTMLTableParser()
p.feed(xhtml)

# Show the first parsed table as the cell output.
p.tables[0]

[['Postal Code', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront'],
 ['M6A', 'North York', 'Lawrence Manor, Lawrence Heights'],
 ['M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government"],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue, Humber Valley Village'],
 ['M1B', 'Scarborough', 'Malvern, Rouge'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills'],
 ['M4B', 'East York', 'Parkview Hill, Woodbine Gardens'],
 ['M5B', 'Downtown Toronto', 'Garden District, Ryerson'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B',
  'Etobicoke',
  'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale'],
 ['M1C', 'Scarborough', '

Obtain the dataframe and clean it

In [2]:
# Build a frame from the scraped rows. The first row holds the column titles
# and every following row is data.
raw_table = pd.DataFrame(p.tables[0])
df = raw_table.iloc[1:].copy()
# Assign a plain list so the columns axis does not inherit the name `0`
# from the header row's Series.
df.columns = raw_table.iloc[0].tolist()

# Remove rows whose borough is 'Not assigned'.
df = df[df['Borough'] != 'Not assigned']

# Set 'Not assigned' neighbourhoods to the borough's name. This must happen
# BEFORE grouping: once the names of a repeated postal code are joined, an
# unassigned entry is buried inside a longer string ('X,Not assigned') and an
# equality test can no longer find it.
unassigned = df['Neighbourhood'] == 'Not assigned'
df = df.assign(Neighbourhood=df['Neighbourhood'].mask(unassigned, df['Borough']))

# Group repeated postal codes: keep the first borough and join the
# neighbourhood names with commas. `reindex` restores the original column
# order after `reset_index` turns the group key back into a column.
df = (df.groupby('Postal Code')
        .agg({'Borough': 'first', 'Neighbourhood': ','.join})
        .reset_index()
        .reindex(columns=df.columns))

df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [3]:
# Persist the cleaned table to disk, leaving out the row index column.
output_csv = 'toronto-neighborhood-cleaned.csv'
df.to_csv(output_csv, index=False)

# Display (rows, columns) as a quick sanity check of what was written.
df.shape

(103, 3)