# Data Preparation for Clustering Neighbourhoods in Toronto

### Extracting data from a webpage into a dataframe

#### Load the libraries and scrape the webpage content

In [1]:
import requests 
from bs4 import BeautifulSoup 
  
# Address of the page to scrape.
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page with a GET request. The timeout (in seconds) keeps the cell
# from waiting forever if the server never answers.
resp = requests.get(url, timeout=30)

if resp.status_code == 200:  # HTTP 200 means the request succeeded
    # Python's built-in HTML parser is enough here; no extra parser package needed.
    webpage = BeautifulSoup(resp.text, 'html.parser')
    print("The web page is successfully opened")
else:
    print("There is error in opening the web page!")
    # Stop here: the following cells read `webpage`, which was not created,
    # so continuing would only fail later with a less helpful NameError.
    raise RuntimeError(
        "GET {} returned HTTP status {}".format(url, resp.status_code)
    )

The web page is successfully opened


#### Isolate the required webpage content to form the initial raw data

In [2]:
import pandas as pd

# Column names come from the first three header cells (<th>) on the page.
# They are stripped so that stray whitespace cannot end up in a column label
# that later cells look up by exact name.
header_cells = webpage.find_all('th')
heading = [cell.text.strip() for cell in header_cells[:3]]

# Every table row (<tr>) on the page is a candidate data row.
rows = webpage.find_all('tr')
print(len(rows))

# Collect the matching rows first and build the dataframe once at the end,
# instead of growing it one row at a time.
records = []
for row in rows[1:]:  # skip the header item
    cells = row.find_all('td')
    # A data row has exactly three cells: postcode, borough, neighbourhood.
    # Counting the <td> cells directly avoids depending on how many
    # whitespace text nodes happen to sit between them.
    if len(cells) != 3:
        continue
    values = [cell.text.strip() for cell in cells]
    # Check the postcode of *this* row: it is three characters long (e.g. M1A).
    if len(values[0]) == 3:
        records.append(values)

df = pd.DataFrame(records, columns=heading)
df.head()

295


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Cleansing the raw data to the target format

In [3]:
# Keep only the rows that have an assigned Borough, then renumber the rows
# from zero without keeping the old index as a column.
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)
print(df.shape)
df.head()

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [4]:
# A row whose Neighbourhood is 'Not assigned' takes the name of its Borough.
# This is done per row, before joining, so the placeholder is also replaced
# when it shares a Postcode with other neighbourhoods.
unassigned = df['Neighbourhood'] == 'Not assigned'
neighbourhood = df['Neighbourhood'].where(~unassigned, df['Borough'])

# Merge the rows that share a Postcode (and Borough) into one row whose
# Neighbourhood is the comma-separated list of names. sort=False keeps the
# groups in order of first appearance.
df = (
    df.assign(Neighbourhood=neighbourhood)
      .groupby(['Postcode', 'Borough'], sort=False)['Neighbourhood']
      .apply(', '.join)
      .reset_index()
)

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


#### Lastly, check the final size of the processed dataframe

In [5]:
# Display the (rows, columns) size of the processed dataframe.
df.shape

(103, 3)