# Toronto Neighbourhood Web Scrapping

### Checking for the new libraries needed

- beautifulsoup4
- lxml
- request

In [1]:

!pip install beautifulsoup4
!pip install lxml
!pip install request

Collecting request
  Downloading https://files.pythonhosted.org/packages/f1/27/7cbde262d854aedf217061a97020d66a63163c5c04e0ec02ff98c5d8f44e/request-2019.4.13.tar.gz
Collecting get (from request)
  Downloading https://files.pythonhosted.org/packages/3f/ef/bb46f77f7220ac1b7edba0c76d810c89fddb24ddd8c08f337b9b4a618db7/get-2019.4.13.tar.gz
Collecting post (from request)
  Downloading https://files.pythonhosted.org/packages/0f/05/bd79da5849ea6a92485ed7029ef97b1b75e55c26bc0ed3a7ec769af666f3/post-2019.4.13.tar.gz
Collecting query_string (from get->request)
  Downloading https://files.pythonhosted.org/packages/12/3c/412a45daf5bea9b1d06d7de41787ec4168001dfa418db7ec8723356b119f/query-string-2019.4.13.tar.gz
Collecting public (from query_string->get->request)
  Downloading https://files.pythonhosted.org/packages/54/4d/b40004cc6c07665e48af22cfe1e631f219bf4282e15fa76a5b6364f6885c/public-2019.4.13.tar.gz
Building wheels for collected packages: request, get, post, query-string, public
  Building wheel

### Importing the libraries

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### Web Scrap phase

In [3]:
#getting the source code from the url
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [5]:
#set the html object
soup = BeautifulSoup(source,'lxml')


In [6]:
#Extracting the table from the source code
table=soup.find('table',class_='wikitable sortable')


In [7]:
#Creating a list to contain the text from the rows of the HTML table
row_test=[]
for row in table.find_all('tr'):
    row_test.append(row.text)


In [8]:
#Removing /n text 
row_test=[value.split('\n') for value in row_test]


In [9]:
#Creating the first df
df=pd.DataFrame(row_test[1:],columns=row_test[0])
#Reshapping the df
df=df[['Postcode','Borough','Neighbourhood']]

### Data Wrangling phase 

In [10]:
#Setting the non assigned values as nan
df.replace("Not assigned",np.nan,inplace=True)

In [11]:
#Removing the nan values
df.dropna(inplace=True)

In [12]:
#Checking the amount of unique postcodes
postcode=list(df['Postcode'].unique())
len(postcode)

102

In [13]:
#Checking the amount of unique borough
borough=list(df['Borough'].unique())
len(borough)

10

In [27]:

o_post=[]
o_boro=[]
o_neig=[]
for i in postcode:
    o_post.append(str(i))
    o_boro.append(list(df[df['Postcode']==str(i)].Borough))
    o_neig.append(list(df[df['Postcode']==str(i)].Neighbourhood))

In [42]:
#Check if there are two borough in the same postcode
[set(sublist) for sublist in o_boro]

[{'North York'},
 {'North York'},
 {'Downtown Toronto'},
 {'North York'},
 {'Downtown Toronto'},
 {'Scarborough'},
 {'North York'},
 {'East York'},
 {'Downtown Toronto'},
 {'North York'},
 {'Etobicoke'},
 {'Scarborough'},
 {'North York'},
 {'East York'},
 {'Downtown Toronto'},
 {'York'},
 {'Etobicoke'},
 {'Scarborough'},
 {'East Toronto'},
 {'Downtown Toronto'},
 {'York'},
 {'Scarborough'},
 {'East York'},
 {'Downtown Toronto'},
 {'Downtown Toronto'},
 {'Scarborough'},
 {'North York'},
 {'North York'},
 {'East York'},
 {'Downtown Toronto'},
 {'West Toronto'},
 {'Scarborough'},
 {'North York'},
 {'North York'},
 {'East York'},
 {'Downtown Toronto'},
 {'West Toronto'},
 {'Scarborough'},
 {'North York'},
 {'North York'},
 {'East Toronto'},
 {'Downtown Toronto'},
 {'West Toronto'},
 {'Scarborough'},
 {'North York'},
 {'North York'},
 {'East Toronto'},
 {'Downtown Toronto'},
 {'North York'},
 {'North York'},
 {'Scarborough'},
 {'North York'},
 {'North York'},
 {'East Toronto'},
 {'North Yor

In [43]:
#Saving the unique values of the borough instead of having a list of lists.
o_boro = [sublist[0] for sublist in o_boro]

In [106]:
#Removing the characters like '',[] and saving it in o_neig as a list
o_neig=[str(value).translate(str.maketrans("'"," ")).strip("[ ]") for value in o_neig]

In [140]:
#Building the final DF and removing the characters ""
df_final=pd.DataFrame({"Postcode" : o_post, "Borough" : o_boro, "Neigbourhood": o_neig})
df_final["Neigbourhood"].replace('\"',"",regex=True,inplace=True)

In [141]:
df_final

Unnamed: 0,Postcode,Borough,Neigbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights , Lawrence Manor"
4,M7A,Downtown Toronto,Queen s Park
5,M1B,Scarborough,"Rouge , Malvern"
6,M3B,North York,Don Mills North
7,M4B,East York,"Woodbine Gardens , Parkview Hill"
8,M5B,Downtown Toronto,"Ryerson , Garden District"
9,M6B,North York,Glencairn


In [142]:
df_final.shape

(102, 3)