# Segmenting and Clustering Neighborhoods in Toronto

## Part 2

In [1]:
#! pip install beautifulsoup4 

from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import numpy as np


Collect and clean data

In [2]:
# The URL carries `oldid` / `direction=prev` query parameters, i.e. it addresses
# a specific historical revision of the article rather than the live page.
url="https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641"

# Make a GET request to fetch the raw HTML content.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook here on an HTTP error instead of
# silently parsing an error page in the cells below.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse the html content with an HTML parser: the page is HTML, whereas
# BeautifulSoup's "xml" feature is intended for XML documents.
# "html.parser" ships with Python, so no extra parser package is required.
soup = BeautifulSoup(html_content, "html.parser")

In [3]:
# Take the first <table> element of the parsed page.
table=soup.find('table')
if table is None:
    # Fail with an explicit message rather than an AttributeError on None below.
    raise ValueError("No <table> element found in the parsed page")

col_name = ['Postalcode','Borough','Neighborhood']

# Collect the stripped text of the <td> cells of every table row.
# Rows that do not yield exactly one value per column are skipped
# (presumably the header row, which would contain no <td> cells).
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(col_name):
        rows.append(row_data)

# Build the frame once from the collected rows instead of appending to it
# row by row, which re-allocates the frame on every insertion.
df = pd.DataFrame(rows, columns=col_name)

In [4]:
# Remove all rows whose Borough is "Not assigned".
# .copy() makes df an independent frame, so the assignment below does not
# write into a filtered slice of the previous frame.
df=df[df['Borough']!='Not assigned'].copy()

# Where the Neighborhood is "Not assigned", use the Borough name instead.
# The assignment is restricted to the 'Neighborhood' column through .loc;
# assigning to df[mask] would target every column of the matched rows.
unnamed = df['Neighborhood']=='Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']

In [5]:
# Collapse rows that share the same Postalcode and Borough into a single row
# whose Neighborhood is the comma-separated list of that group's neighborhoods.
neighborhoods_by_code = df.groupby(['Postalcode', 'Borough'])['Neighborhood']
df = neighborhoods_by_code.apply(', '.join).reset_index()

Load the latitude and longitude of each postal code from the geospatial coordinates CSV file

In [14]:
# Read the geospatial coordinates CSV straight from its URL into a DataFrame.
geo_csv_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
loc_df = pd.read_csv(geo_csv_url)

In [15]:
# Preview the first five rows of the coordinates table.
loc_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the original data frame with the location data frame on the postal code

In [18]:
# Give the coordinates table the same key column name as df, then join the two
# frames on that key (pandas' default join type is used).
loc_df = loc_df.rename(columns={'Postal Code': 'Postalcode'})
loc_merged = loc_df.merge(df, on='Postalcode')

In [17]:
# Keep only the columns of interest, in presentation order, and preview the
# first five rows of the result.
display_cols = ['Postalcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
loc_data = loc_merged[display_cols]
loc_data.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
