# Capstone Project: Segmenting and Clustering Neighbourhoods in Toronto

## 1.- Install Libraries and packages

In [3]:
# Gabriel: Install the beautifulsoup package; which needs to be install to use bs4
! pip install beautifulsoup4



In [4]:
# Gabriel: Install the lxml parser which needs to be install to use bs4
! pip install lxml



In [5]:
# Gabriel: Install the html5lib parser which needs to be install to use bs4
! pip install html5lib



In [6]:
# Import libraries
import numpy as np
import pandas as pd

## 2.- Scrape the Data from Wikipedia

In [47]:
# Download the Wikipedia page and parse every HTML table on it into a list of DataFrames
Data = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)
# The first table on the page holds the postal codes with their borough and neighbourhood
Toronto_Data = Data[0]
Toronto_Data

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


## 3.- Ignore cells with a borough that is Not assigned.

In [48]:
# Before removing the rows whose Borough is "Not assigned", take a quick look at a
# summary of the data (count, number of unique values, most frequent value and its frequency)
Toronto_Data.describe()

Unnamed: 0,Postal Code,Borough,Neighbourhood
count,180,180,180
unique,180,11,100
top,M1M,Not assigned,Not assigned
freq,1,77,77


In [49]:
# Tally the rows by whether their Borough equals "Not assigned" (True) or not (False)
na = Toronto_Data['Borough'].eq('Not assigned').value_counts()
na

False    103
True      77
Name: Borough, dtype: int64

In [50]:
# Eliminate the rows whose Borough is "Not assigned".
# The replaced column is assigned back to the DataFrame instead of calling
# replace(..., inplace=True) on Toronto_Data['Borough']: that chained in-place call
# operates on a temporary Series under pandas Copy-on-Write and would leave
# Toronto_Data unchanged, so no rows would be dropped afterwards.
Toronto_Data['Borough'] = Toronto_Data['Borough'].replace('Not assigned', np.nan)  # Replace "Not assigned" with NaN
Toronto_Data.dropna(subset=["Borough"], axis=0, inplace=True)  # Drop the rows whose Borough is NaN
Toronto_Data.describe()  # Check that the "Not assigned" rows are gone: of the original 180 rows, 103 had an assigned borough and 77 did not

Unnamed: 0,Postal Code,Borough,Neighbourhood
count,103,103,103
unique,103,10,99
top,M4G,North York,Downsview
freq,1,24,4


In [51]:
# Double check: tally again how many rows have a Borough equal to "Not assigned"
na = Toronto_Data['Borough'].eq('Not assigned').value_counts()
na  # only a False count is expected once the "Not assigned" rows have been removed

False    103
Name: Borough, dtype: int64

In [52]:
# Display the current contents of Toronto_Data to inspect the result of the filtering
Toronto_Data

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [53]:
# Renumber the rows from 0 in place; the previous index labels are discarded, not kept as a column
Toronto_Data.reset_index(inplace=True, drop=True)
# Preview the first five rows to confirm the new numbering
Toronto_Data.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## 3.- Replace Neighbourhoods that are not assigned with their Borough

In [54]:
# Flag every row whose Neighbourhood is "Not assigned", then tally the True/False flags
neigh_na = Toronto_Data[['Neighbourhood']].eq('Not assigned')
neigh_na.value_counts()

Neighbourhood
False            103
dtype: int64

#### We see that there are no "Not assigned" neighbourhoods to replace

## 4.- Group neighbourhoods by Postal Codes

In [63]:
# Collapse the table to one row per postal code. Within each group, the distinct
# non-null values of every other column are joined with a comma.
# Series.unique() returns the values in order of first appearance, so the joined
# text is identical on every run; iterating a set gives no such ordering guarantee.
Toronto_Data = Toronto_Data.groupby('Postal Code', as_index=False).agg(
    lambda x: ','.join(x.dropna().unique())
)
Toronto_Data  # display the grouped table

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


## 5.- Check the size of the data

In [64]:
# Size of the table as a (rows, columns) tuple
Toronto_Data.shape

(103, 3)