## Segmenting and Clustering Neighborhoods in Toronto 

By Arjun Bansil 

In [1]:
import wikipedia 
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import pandas as pd 

In [2]:
website_url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(website_url,'lxml')


In [3]:
table = soup.find('table',{'class':'wikitable sortable'})

In [4]:
rows = table.findAll('tr')
parsed_table_data = []
for row in rows:
    children = row.findChildren(recursive=False)
    row_text = []
    for child in children:
        clean_text = child.text
        clean_text = clean_text.split('&#91;')[0]
        clean_text = clean_text.split('&#160;')[-1]
        clean_text = clean_text.strip()
        row_text.append(clean_text)
        parsed_table_data.append(row_text)
    

In [5]:
predf = parsed_table_data
predf.remove(predf[0])
predf.remove(predf[1])
predf.remove(predf[2])

In [75]:
df = pd.DataFrame(predf)
df.columns = ["PostalCode", "Borough","Neighborhood"]
df.drop(df[df.Borough == "Not assigned"].index,inplace=True)
df = df[df['PostalCode'] != 'Postcode']


In [76]:
for index, row in df.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
df

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M3A,North York,Parkwoods
7,M3A,North York,Parkwoods
8,M3A,North York,Parkwoods
9,M4A,North York,Victoria Village
10,M4A,North York,Victoria Village
11,M4A,North York,Victoria Village
12,M5A,Downtown Toronto,Harbourfront
13,M5A,Downtown Toronto,Harbourfront
14,M5A,Downtown Toronto,Harbourfront
15,M5A,Downtown Toronto,Regent Park


In [77]:
df.shape

(636, 3)

In [78]:
df["Neighborhood"] = df["Neighborhood"].astype(str)+","
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M3A,North York,"Parkwoods,"
7,M3A,North York,"Parkwoods,"
8,M3A,North York,"Parkwoods,"
9,M4A,North York,"Victoria Village,"
10,M4A,North York,"Victoria Village,"


In [79]:
df = df.groupby(['PostalCode','Borough']).sum()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood
PostalCode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Rouge,Rouge,Malvern,Malvern,Malvern,"
M1C,Scarborough,"Highland Creek,Highland Creek,Highland Creek,R..."
M1E,Scarborough,"Guildwood,Guildwood,Guildwood,Morningside,Morn..."
M1G,Scarborough,"Woburn,Woburn,Woburn,"
M1H,Scarborough,"Cedarbrae,Cedarbrae,Cedarbrae,"


In [81]:
df = df.reset_index()
df['Neighborhood'] = df['Neighborhood'].map(lambda x: x.rstrip(','))
df.head()

Unnamed: 0,index,PostalCode,Borough,Neighborhood
0,0,M1B,Scarborough,"Rouge,Rouge,Rouge,Malvern,Malvern,Malvern"
1,1,M1C,Scarborough,"Highland Creek,Highland Creek,Highland Creek,R..."
2,2,M1E,Scarborough,"Guildwood,Guildwood,Guildwood,Morningside,Morn..."
3,3,M1G,Scarborough,"Woburn,Woburn,Woburn"
4,4,M1H,Scarborough,"Cedarbrae,Cedarbrae,Cedarbrae"


In [83]:
df.shape

(103, 4)

# Part 2 

In [84]:
geo = pd.read_csv("https://cocl.us/Geospatial_data")
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [85]:
geo = geo.sort_values('Postal Code',ascending=True)
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [92]:
df = df.rename(columns ={'PostalCode':'Postal Code'})
df.reset_index()
df.drop(df['index'])
df.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood
0,0,M1B,Scarborough,"Rouge,Rouge,Rouge,Malvern,Malvern,Malvern"
1,1,M1C,Scarborough,"Highland Creek,Highland Creek,Highland Creek,R..."
2,2,M1E,Scarborough,"Guildwood,Guildwood,Guildwood,Morningside,Morn..."
3,3,M1G,Scarborough,"Woburn,Woburn,Woburn"
4,4,M1H,Scarborough,"Cedarbrae,Cedarbrae,Cedarbrae"


In [93]:
df2 = pd.concat([df,geo], axis=1)
df2.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood,Postal Code.1,Latitude,Longitude
0,0,M1B,Scarborough,"Rouge,Rouge,Rouge,Malvern,Malvern,Malvern",M1B,43.806686,-79.194353
1,1,M1C,Scarborough,"Highland Creek,Highland Creek,Highland Creek,R...",M1C,43.784535,-79.160497
2,2,M1E,Scarborough,"Guildwood,Guildwood,Guildwood,Morningside,Morn...",M1E,43.763573,-79.188711
3,3,M1G,Scarborough,"Woburn,Woburn,Woburn",M1G,43.770992,-79.216917
4,4,M1H,Scarborough,"Cedarbrae,Cedarbrae,Cedarbrae",M1H,43.773136,-79.239476
