# Segmenting and Clustering Neighborhoods in Toronto

In [5]:
#----Importing Libraries-------
from io import BytesIO
from pathlib import Path

import pandas as pd
import wikipedia as wp
from bs4 import BeautifulSoup

### Web Scraping 

In [45]:
# Fetch the rendered HTML of the Wikipedia page listing the "M" postal codes.
html = wp.page("List_of_postal_codes_of_Canada:_M").html().encode("UTF-8")

# Passing literal HTML str/bytes to read_html is deprecated since pandas 2.1,
# so the bytes are wrapped in a file-like object. The first table parsed from
# the page is the one used as the postal-code table.
data_postal_codes = pd.read_html(BytesIO(html))[0]

# Keep a local snapshot of the scraped table. The frame's column labels are not
# written; header=False states that explicitly (the previous header=0 only
# worked because 0 is falsy).
data_postal_codes.to_csv('beautifulsoup_pandas.csv', header=False, index=False)
data_postal_codes.head()

Unnamed: 0,0,1,2
0,Postal Code,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [46]:
# Replace the positional column labels (0, 1, 2) with descriptive names.
data_postal_codes = data_postal_codes.rename(
    columns={
        0: 'PostalCode',
        1: 'Borough',
        2: 'Neighborhood',
    }
)
data_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,Postal Code,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Data Cleaning 

In [47]:
# Drop the leftover header row ("Postal Code", "Borough", "Neighborhood") that
# was parsed as data. Selecting the row by its value, rather than dropping
# position 0 in place, keeps this cell idempotent: re-running it no longer
# removes a real postal-code row each time.
is_header_row = data_postal_codes['PostalCode'] == 'Postal Code'
data_postal_codes = data_postal_codes.drop(index=data_postal_codes.index[is_header_row])
data_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [48]:
# Row count of the scraped table before any borough filtering.
len(data_postal_codes)

180

Only rows with an assigned borough are processed. Rows whose borough is "Not assigned" are dropped.

In [49]:
# Collect the index labels of every row whose borough is 'Not assigned' ...
borough_not_assigned = data_postal_codes['Borough'].eq('Not assigned')
ind_drop = data_postal_codes.index[borough_not_assigned]

# ... and build a new frame without them (the original frame is left intact).
new_data_postal_codes = data_postal_codes.drop(index=ind_drop)
new_data_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [50]:
# Row count once the 'Not assigned' boroughs have been removed.
len(new_data_postal_codes)

103

In [51]:
# Number of rows discarded by the 'Not assigned' borough filter.
Total_Rows_Removed = len(data_postal_codes) - len(new_data_postal_codes)
Total_Rows_Removed

77

In [52]:
# Spot-check a single postal code in the filtered frame.
new_data_postal_codes.loc[new_data_postal_codes['PostalCode'].eq('M5A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [53]:
# List any remaining rows whose neighbourhood is literally 'Not assigned'.
new_data_postal_codes.loc[new_data_postal_codes['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [55]:
# Renumber the rows from 0; the previous labels are kept in an 'index' column.
new_data_postal_codes = new_data_postal_codes.reset_index()
new_data_postal_codes.head()

Unnamed: 0,index,PostalCode,Borough,Neighborhood
0,3,M3A,North York,Parkwoods
1,4,M4A,North York,Victoria Village
2,5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,6,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [56]:
# Remove the 'index' column that holds the old row labels.
new_data_postal_codes = new_data_postal_codes.drop(columns=['index'])
new_data_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [57]:
# Sanity check: (rows, columns) of the cleaned postal-code frame.
new_data_postal_codes.shape

(103, 3)

In [58]:
# Read the CSV that holds the latitude/longitude for each postal code.
# The file is expected in the current user's Downloads folder; building the
# path from Path.home() avoids hard-coding one machine's user name.
geospatial_csv = Path.home() / "Downloads" / "Geospatial_Coordinates.csv"
data_lat_long = pd.read_csv(geospatial_csv)
data_lat_long.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [63]:
# Sanity check: (rows, columns) of the coordinates table.
data_lat_long.shape

(103, 3)

In [64]:
# Align the key column's name with the postal-code frame ('PostalCode').
data_lat_long = data_lat_long.rename(columns={'Postal Code': 'PostalCode'})
data_lat_long.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [62]:
# Order the rows by postal code ahead of the merge with the coordinates table.
final_new_data_postal_codes = new_data_postal_codes.sort_values('PostalCode')
final_new_data_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M1B,Scarborough,"Malvern, Rouge"
12,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


In [65]:
# Renumber the rows from 0 after sorting. drop=True discards the old labels
# directly instead of creating an 'index' column and deleting it afterwards,
# and rebinding the name avoids mutating the sorted frame in place.
final_new_data_postal_codes = final_new_data_postal_codes.reset_index(drop=True)
final_new_data_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [66]:
# Sanity check: (rows, columns) of the sorted, re-indexed frame.
final_new_data_postal_codes.shape

(103, 3)

In [69]:
# Distinct borough names present in the cleaned data.
pd.unique(final_new_data_postal_codes['Borough'])

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

In [67]:
# Attach latitude/longitude to each postal code by joining on 'PostalCode'
# (default inner join: only postal codes present in both frames are kept).
merged_df = final_new_data_postal_codes.merge(data_lat_long, on='PostalCode')
merged_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [68]:
# Sanity check: (rows, columns) of the merged frame.
merged_df.shape

(103, 5)