# Segmenting and Clustering Neighbourhoods in Toronto

### Adding all the necessary libraries

In [1]:
import csv
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### Scraping the Wikipedia page to extract useful information 

In [2]:
# Download the Wikipedia page listing Canadian postal codes starting with 'M'.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# silently parsing an error page in the cells below.
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()

# `source` is the raw HTML text; `soup` is the parsed document used by later cells.
source = response.text
soup = BeautifulSoup(source,'lxml')

### Extracting the table from the scraped data using 'BeautifulSoup'

In [3]:
# Pick the first <table> whose class attribute is exactly 'wikitable sortable'.
# soup.find returns None when nothing matches.
my_table = soup.find('table', class_='wikitable sortable')

### Extract 'Postal Codes', 'Borough' and 'Neighbourhoods' Data from the HTML table

In [4]:
# All <tr> elements of the table; the first one is skipped below
# (presumably the header row, which has no <td> cells - verify against the page).
tr = my_table.find_all('tr')

# One inner list per table row, holding the raw text of each <td> cell.
# The text is kept as-is, so any surrounding whitespace/newlines are preserved.
row = [
    [data_cell.text for data_cell in table_row.find_all('td')]
    for table_row in tr[1:]
]

### Convert the Nested List into a Pandas Dataframe

In [5]:
# Build a DataFrame from the scraped rows (one inner list per row) and
# label its three columns.
table_columns = ['postal_codes', 'Borough', 'Neighborhoods']
df_table = pd.DataFrame(data=row)
df_table.columns = table_columns

# Preview the first rows.
df_table.head()

Unnamed: 0,postal_codes,Borough,Neighborhoods
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [6]:
# Work on an independent (deep) copy so the scraped table stays untouched.
df_temp = df_table.copy()

### Remove the rows where the value of 'Borough' is 'Not assigned'

In [7]:
# Keep only the rows whose 'Borough' is something other than 'Not assigned'.
has_borough = df_temp['Borough'] != 'Not assigned'
df_temp = df_temp.loc[has_borough]

# Preview the filtered rows (the original row labels are kept).
df_temp.head()

Unnamed: 0,postal_codes,Borough,Neighborhoods
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n


### Combine rows that share the same Postal Code into a single row, joining their Neighbourhoods with commas

In [8]:
# Collapse rows sharing the same (postal code, borough) pair into one row,
# joining their neighbourhood strings with ', '.
neighborhoods_by_code = df_temp.groupby(['postal_codes', 'Borough'])['Neighborhoods']
df_temp_updated = neighborhoods_by_code.agg(', '.join).reset_index()

# Preview the aggregated rows.
df_temp_updated.head()

Unnamed: 0,postal_codes,Borough,Neighborhoods
0,M1B,Scarborough,"Rouge\n, Malvern\n"
1,M1C,Scarborough,"Highland Creek\n, Rouge Hill\n, Port Union\n"
2,M1E,Scarborough,"Guildwood\n, Morningside\n, West Hill\n"
3,M1G,Scarborough,Woburn\n
4,M1H,Scarborough,Cedarbrae\n


### Set the Neighbourhood to the Borough name where the Neighbourhood is 'Not assigned'

In [9]:
# Rows whose neighbourhood is "Not assigned" take the borough name instead.
#
# The scraped neighbourhood text has not been cleaned yet at this point (the
# newline removal happens in a later cell), so the values still carry a
# trailing newline. Strip surrounding whitespace before comparing; a plain
# equality test against "Not assigned" would never match.
not_assigned = df_temp_updated['Neighborhoods'].str.strip() == "Not assigned"

# Index labels of the affected rows (kept under the original name).
idx_to_change = df_temp_updated.index[not_assigned]

# Label-based, vectorised assignment: copy 'Borough' into 'Neighborhoods'
# for the affected rows only.
df_temp_updated.loc[not_assigned, 'Neighborhoods'] = df_temp_updated.loc[not_assigned, 'Borough']


In [10]:
# Preview the first rows after the neighbourhood fallback step.
df_temp_updated.head()

Unnamed: 0,postal_codes,Borough,Neighborhoods
0,M1B,Scarborough,"Rouge\n, Malvern\n"
1,M1C,Scarborough,"Highland Creek\n, Rouge Hill\n, Port Union\n"
2,M1E,Scarborough,"Guildwood\n, Morningside\n, West Hill\n"
3,M1G,Scarborough,Woburn\n
4,M1H,Scarborough,Cedarbrae\n


### Cleaning the Dataset

In [11]:
# Remove every newline character from the neighbourhood strings
# (regex substitution applied inside each string value).
newline_free = df_temp_updated['Neighborhoods'].replace(to_replace='\n', value='', regex=True)
df_temp_updated['Neighborhoods'] = newline_free


In [12]:
# Preview the first rows after removing newlines from 'Neighborhoods'.
df_temp_updated.head()

Unnamed: 0,postal_codes,Borough,Neighborhoods
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [13]:
# (rows, columns) of the cleaned table: one row per postal code/borough pair.
df_temp_updated.shape

(103, 3)

# Adding Latitude and Longitude information

In [14]:
# The code was removed by Watson Studio for sharing.
# NOTE(review): the next cell reads `df_data_1`, which no visible cell defines -
# presumably this hidden cell loaded it (columns 'Postal Code', 'Latitude',
# 'Longitude' per the output below). The notebook cannot be re-run from a
# fresh kernel until that load step is restored - TODO confirm the data source.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
# `df_data_1` is not defined in any visible cell - presumably it comes from the
# hidden cell above; verify before re-running from a fresh kernel.
# Take an independent (deep) copy and rename its three columns so the key
# column is called 'postal_codes'.
location_columns = ['postal_codes', 'latitude', 'longitude']
df_loc = df_data_1.copy()
df_loc.columns = location_columns

# Preview the renamed coordinates table.
df_loc.head()

Unnamed: 0,postal_codes,latitude,longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# Attach latitude/longitude to each postal code by joining on 'postal_codes'
# (pandas' default join type is used). Note that `df_loc` is rebound to the
# merged result, so re-running this cell alone would merge against the
# already-merged frame.
df_loc = df_temp_updated.merge(df_loc, on="postal_codes")

In [17]:
# Preview the merged table: postal code, borough, neighbourhoods, latitude, longitude.
df_loc.head()

Unnamed: 0,postal_codes,Borough,Neighborhoods,latitude,longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
