# <b>Segmenting and Clustering Neighborhoods in Toronto</b>

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [3]:
# download url data from wikipedia
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikipedia_page = requests.get(link)

df_raw = pd.read_html(wikipedia_page.content, header=0)[0]
df_new = df_raw[df_raw.Borough != 'Not assigned']

df_new.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [4]:
#Find whether there is a "Not assigned" in Neighbourhood
df_new.loc[df_new.Neighbourhood == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [13]:
#Group Neighborhoods with the same Postcode
df_toronto = df_new.groupby(['Postal Code', 'Borough'])['Neighbourhood'].apply(lambda x: ', '.join(x))
df_toronto = df_toronto.reset_index()
df_toronto.rename(columns = {'Postcode':'PostalCode'}, inplace = True)
df_toronto.rename(columns = {'Neighbourhood':'Neighborhood'}, inplace = True)
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
df_toronto.shape

(103, 3)

In [14]:
url = 'http://cocl.us/Geospatial_data'
geo_df=pd.read_csv(url)
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
df_toronto = pd.merge(geo_df, df_toronto, on='Postal Code')
df_toronto

Unnamed: 0,Postal Code,Latitude,Longitude,Latitude_x,Longitude_x,Borough,Neighborhood,Latitude_y,Longitude_y
0,M1B,43.806686,-79.194353,43.806686,-79.194353,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,43.784535,-79.160497,43.784535,-79.160497,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,43.763573,-79.188711,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,43.770992,-79.216917,43.770992,-79.216917,Scarborough,Woburn,43.770992,-79.216917
4,M1H,43.773136,-79.239476,43.773136,-79.239476,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...,...,...,...,...
98,M9N,43.706876,-79.518188,43.706876,-79.518188,York,Weston,43.706876,-79.518188
99,M9P,43.696319,-79.532242,43.696319,-79.532242,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,43.688905,-79.554724,43.688905,-79.554724,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,43.739416,-79.588437,43.739416,-79.588437,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [19]:
df_toronto = df_toronto[['Postal Code', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']]
df_toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
