# Project on Clustering of Toronto Neighbourhoods

In [7]:
import pandas as pd
import numpy as np

### Part 1: Data scraping

In [8]:
# import the library we use to open URLs
import urllib.request
# import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup

# Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Open the page inside a context manager so the HTTP response is closed as soon
# as it has been parsed (the original left the connection open).
with urllib.request.urlopen(url) as page:
    # parse the HTML from our URL into the BeautifulSoup parse tree format
    soup = BeautifulSoup(page, "html.parser")

In [9]:
# Use 'find_all' to collect every 'table' element in the HTML and store the
# result in 'all_tables' (not used by the cells visible here).
all_tables = soup.find_all("table")
# Use 'find' to fetch the first table matched by class_='wikitable sortable'.
right_table = soup.find('table', class_='wikitable sortable')
# 'find' returns None when nothing matches; fail here with a clear message
# instead of an AttributeError in the row-extraction cell further down.
if right_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")


In [10]:
# Collect the entries of each column (postal code, borough, neighbourhood)
# from the rows of the scraped table.
ColPC = []
ColBo = []
ColNH = []

# 'find_all' / 'string=' are the current BeautifulSoup names for the deprecated
# 'findAll' / 'text=' spellings and match the 'find_all' call used earlier.
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three <td> cells are data rows; anything else
    # (presumably the header row) is skipped.
    if len(cells) == 3:
        # Keep the first text node of each cell unchanged; trailing newlines
        # are removed later in the data-wrangling cell.
        ColPC.append(cells[0].find(string=True))
        ColBo.append(cells[1].find(string=True))
        ColNH.append(cells[2].find(string=True))

# Build the dataframe from the scraped data in a single constructor call.
dfTor = pd.DataFrame({
    'PostalCode': ColPC,
    'Borough': ColBo,
    'Neighborhood': ColNH,
})
dfTor.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Data wrangling 

In [11]:
# Remove the newline characters left over from the scraped HTML cells first,
# so the 'Not assigned' comparisons below do not depend on a trailing '\n'.
dfTor = dfTor.replace('\n', '', regex=True)

# Drop rows without an assigned borough and renumber the index from 0.
dfTor = dfTor[dfTor.Borough != 'Not assigned'].reset_index(drop=True)

# Check that no remaining borough has an unassigned neighbourhood; the printed
# count tells the reader whether a manual fix-up is still required.
dfTorTest = dfTor[dfTor.Neighborhood == 'Not assigned']
print('Length of test set:' + str(dfTorTest.shape[0]) + ' row index 0? - you are good; row index >0? replace the neighborhood by borough')

# display the final data set
dfTor.head(12)

Length of test set:0 row index 0? - you are good; row index >0? replace the neighborhood by borough


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [12]:
# Report how many rows the cleaned data frame holds.
row_count = dfTor.shape[0]
print('The data set contains of: ' + str(row_count) + ' rows')

The data set contains of: 103 rows


### Longitude and Latitude of Postal Codes

In [13]:
# Load the geospatial coordinates from a prepared CSV file, because the
# geocoder did not return values within a reasonable time.
geo_csv_url = 'https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv'
df_DatLaLo = pd.read_csv(geo_csv_url)
print('Data downloaded and read into a dataframe!')


Data downloaded and read into a dataframe!


In [14]:
# Assign a latitude and longitude to each Toronto postal code by looking the
# code up in the coordinate table. This is a vectorised lookup; it replaces a
# per-row loop that called float() on one-element arrays, a conversion NumPy
# has deprecated.
postal_codes = dfTor['PostalCode']
geo_rows = df_DatLaLo[df_DatLaLo['Postal Code'].isin(postal_codes)]

# Every postal code must match exactly one coordinate row; fail with a clear
# message instead of an obscure conversion error or silent NaN values.
if geo_rows['Postal Code'].duplicated().any():
    raise ValueError('Coordinate table contains duplicate postal codes')
unmatched = postal_codes[~postal_codes.isin(geo_rows['Postal Code'])]
if not unmatched.empty:
    raise ValueError('No coordinates found for postal codes: '
                     + ', '.join(map(str, unmatched.unique())))

geo_by_code = geo_rows.set_index('Postal Code')

# Latitude / longitude columns, kept as plain lists of floats under their
# original names.
ColLa = postal_codes.map(geo_by_code['Latitude']).astype(float).tolist()
ColLo = postal_codes.map(geo_by_code['Longitude']).astype(float).tolist()

# Add the coordinates to the data frame.
# NOTE: dfTorLaLo is the same object as dfTor (no copy is made), so the two new
# columns also appear on dfTor.
dfTorLaLo = dfTor
dfTorLaLo['Latitude'] = ColLa
dfTorLaLo['Longitude'] = ColLo
dfTorLaLo.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
