# Segmenting and Clustering Neighborhoods in Toronto

_Import packages_

In [2]:
import io

import pandas as pd
import requests
from bs4 import BeautifulSoup

_URL, Request and Response_

In [3]:
# Wikipedia list of Canadian postal codes beginning with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# The context manager closes the session's pooled connections once the page
# has been downloaded; the response body is already fully read by then.
with requests.Session() as r:
    # The timeout keeps the cell from hanging on a stalled connection.
    response = r.get(url, timeout=10)

# Fail here on a 4xx/5xx reply instead of parsing an error page further down.
response.raise_for_status()
response

<Response [200]>

_Wrangling HTML With BeautifulSoup_

In [4]:
# Parse the raw response bytes with the 'html.parser' backend.
soup = BeautifulSoup(markup=response.content, features='html.parser')

_Title of the HTML page_

In [5]:
# Text of the document's <title> element.
title = soup.find('title').string
title

'List of postal codes of Canada: M - Wikipedia'

_Select table_

In [6]:
# Locate the postal-code table by its CSS classes.
Canada_table = soup.find('table', {'class': 'wikitable sortable'})
if Canada_table is None:
    # soup.find returns None when nothing matches; stop with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("no 'wikitable sortable' table found on the page")

# Collect the <tr> rows once; later cells read `rows`.
rows = Canada_table.find_all("tr")

# The column count is taken from the last row's <td> cells (the header row
# uses <th> cells, so it cannot be used for this).
cells = rows[-1].find_all('td') if rows else []
print('the number of columns is:', len(cells)) #number of cols
print('the number of rows is:', len(rows)) #number of rows

the number of columns is: 3
the number of rows is: 181


_Get header_

In [7]:
# Column names come from the <th> cells of the table's first row,
# with trailing whitespace stripped.
header_row = rows[0]
header = [cell.get_text().rstrip() for cell in header_row.find_all('th')]
print(header)

['Postal Code', 'Borough', 'Neighbourhood']


_Get tabular data_

In [8]:
# Index of the first data row; row 0 holds the header cells.
first_data_raw = 1

# One list of stripped cell texts per table row.
lst_data_raw = []
for row in rows[first_data_raw:]:
    tds = row.select('td')
    if not tds:
        # A row without <td> cells carries no data; skipping it avoids an
        # IndexError and keeps every record the same shape.
        continue
    lst_data_raw.append([td.text.rstrip() for td in tds])

_Create dictionary_

In [9]:
# Transpose the row-wise records into one tuple per column, then pair each
# column with its header name.
datax = [*zip(*lst_data_raw)]
dat = {name: column for name, column in zip(header, datax)}

_Create data frame_

In [10]:
# Build the frame from the {column name: values} mapping.
df = pd.DataFrame(data=dat)

_Clean data frame: drop rows where 'Borough' is not assigned_

In [11]:
# Keep only rows whose 'Borough' is assigned; the original index labels are
# preserved and the result is an independent copy of `df`.
borough_unassigned = df['Borough'].eq('Not assigned')
df1 = df.loc[~borough_unassigned].copy()

_Clean data frame: where 'Neighbourhood' is 'Not assigned', use the value from the 'Borough' column instead_

In [12]:
# Vectorised fill: wherever 'Neighbourhood' is the placeholder 'Not assigned',
# take the row's 'Borough' value; all other rows are left untouched.
# Unlike a row-wise apply, this also behaves correctly on an empty frame.
neighbourhood_missing = df1['Neighbourhood'] == 'Not assigned'
df1['Neighbourhood'] = df1['Neighbourhood'].mask(neighbourhood_missing, df1['Borough'])

_Check cleaned data frame shape_

In [13]:
# Dimensions of the cleaned frame as (rows, columns).
df1.shape

(103, 3)

# Latitude and Longitude data

In [14]:
# Preview the first rows of the cleaned postal-code frame.
df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


_Import latitude and longitude data_

In [18]:
# CSV with one latitude/longitude pair per postal code.
url_gps = 'https://cocl.us/Geospatial_data'

# Same 10-second timeout as the Wikipedia request, so a stalled download
# cannot hang the notebook.
gps_response = requests.get(url_gps, timeout=10)
# Surface HTTP errors here rather than handing an error page to read_csv.
gps_response.raise_for_status()

# Raw bytes of the download, decoded as UTF-8 text before parsing.
j = gps_response.content
df_gps = pd.read_csv(io.StringIO(j.decode('utf-8')))

In [20]:
# Preview the first rows of the downloaded coordinates frame.
df_gps.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


_Merge dataframes df1 and df_gps using 'Postal Code' column (inner join)_

In [21]:
# Inner join on 'Postal Code': only codes present in both frames are kept.
new_df = df1.merge(df_gps, how='inner', on='Postal Code')

In [22]:
# Preview the first rows of the merged frame.
new_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [23]:
# Dimensions of the merged frame as (rows, columns).
new_df.shape

(103, 5)