# Segmenting and Clustering Neighborhoods in Toronto

_Import packages_

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

_URL, Request and Response_

In [2]:
# Wikipedia page that is scraped for the postal-code table.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.Session()
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = r.get(url, timeout=10)
# Stop here on a 4xx/5xx status instead of parsing an error page in later cells.
response.raise_for_status()
response

<Response [200]>

_Wrangling HTML With BeautifulSoup_

In [4]:
# Parse the downloaded bytes into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(markup=response.content, features='html.parser')

_Title of the HTML page_

In [5]:
# Text of the page's first <title> element, displayed as the cell output.
title = soup.find('title').string
title

'List of postal codes of Canada: M - Wikipedia'

_Select table_

In [10]:
# Locate the postal-code table by its class attribute.
Canada_table = soup.find('table', {'class': 'wikitable sortable'})
if Canada_table is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError("no 'wikitable sortable' table found in the page; has the page layout changed?")

# All <tr> rows of the table, header row included.
rows = Canada_table.find_all("tr")

# The column count is taken from the <td> cells of the last row; an empty
# table yields zero columns instead of leaving `cells` undefined.
cells = rows[-1].find_all('td') if rows else []
print('the number of columns is:', len(cells)) #number of cols
print('the number of rows is:', len(rows)) #number of rows

the number of columns is: 3
the number of rows is: 181


_Get header_

In [11]:
# Column names are the <th> cells of the first table row, with trailing
# whitespace (such as the newline ending each cell) stripped.
header_cells = rows[0].find_all('th')
header = [cell.get_text().rstrip() for cell in header_cells]
print(header)

['Postal Code', 'Borough', 'Neighbourhood']


_Get tabular data_

In [13]:
# Index of the first row that holds data; the row before it supplies the header.
first_data_raw = 1

# One list of stripped cell texts per table row, in column order.
lst_data_raw = []
for row in rows[first_data_raw:]:
    tds = row.select('td')
    if not tds:
        # A row without <td> cells carries no record; skip it rather than
        # failing on an out-of-range index.
        continue
    lst_data_raw.append([td.text.rstrip() for td in tds])

_Create dictionary_

In [16]:
# Transpose the row-wise records into one tuple of values per column.
datax = list(zip(*lst_data_raw))
# Pair each column name with its tuple of values.
dat = {column_name: values for column_name, values in zip(header, datax)}

_Create data frame_

In [17]:
# Build the frame from the column-name -> values mapping; each key becomes a column.
df = pd.DataFrame(data=dat)

_Clean data frame: drop rows where 'Borough' is not assigned_

In [18]:
# Flag rows whose borough is the 'Not assigned' placeholder, then drop them by
# index label; `df` itself is left untouched.
borough_missing = df['Borough'].eq('Not assigned')
df1 = df.drop(index=df[borough_missing].index)

_Clean data frame: where 'Neighbourhood' is not assigned, fill it with the value from the 'Borough' column_

In [19]:
# Rows whose neighbourhood is the 'Not assigned' placeholder take the borough
# name instead; all other rows keep their neighbourhood unchanged.
# Series.mask is vectorized (no per-row Python lambda) and also works when the
# frame has no rows.
neighbourhood_missing = df1['Neighbourhood'] == 'Not assigned'
df1['Neighbourhood'] = df1['Neighbourhood'].mask(neighbourhood_missing, df1['Borough'])

_Check cleaned data frame shape_

In [20]:
# Shape of the cleaned frame as (rows, columns), displayed as the cell output.
df1.shape

(103, 3)