In [41]:
import numpy as np 
import pandas as pd
import requests # a package to send http request
from bs4 import BeautifulSoup #BeautifulSoup is a package to parse and work with html file

In [42]:
# Data extraction from wikipedia

wiki_data_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text #return the data of the page in html format
soup = BeautifulSoup(wiki_data_url, 'lxml') #assign the url address to a variable in order to parse it
#print(soup.prettify())
data_table = soup.find('table',{'class':'wikitable sortable'}) #taking the part of the html file that we need
first_column = []
second_column = []
third_column = []
for row in data_table.findAll('tr'): # we need everything between 'tr' and 'td'
    cells = row.findAll('td')
    if len(cells)==3: #3 is the number of columns that we want (Neighborhood, Postal Code, Borough)
        first_column.append(cells[0].find(text=True)) 
        second_column.append(cells[1].find(text=True))
        third_column.append(cells[2].find(text=True))
                                                      #find is a function that extract the data not in a html format in the specific location where cells is in 
                                                      #if we not use find the data in our data frame will be in a html format

df = pd.DataFrame() 
df['Postal Code'] = first_column
df['Borough'] = second_column
df['Neighborhood'] = third_column
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [43]:
# Data cleaning 1 - drop 'not assigned' in 'borough' column
df.drop(df[df['Borough'] == 'Not assigned'].index, inplace = True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [44]:
# Data cleaning 3 - replace neighborhoods with 'not assigned' to corresponds borough
for i,j in zip(df['Borough'], df['Neighborhood']):
    if j == 'Not assigned':
        df.replace(j, i,  inplace = True)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [45]:
# Data cleaning 2 - same postal code analysis
df_groupby = df.groupby(['Postal Code', 'Borough'])['Neighborhood'].apply(','.join)
df_groupby_DF = pd.DataFrame(df_groupby)
df_groupby_DF = df_groupby_DF.reset_index()
df_groupby_DF.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood\n,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae\n


In [46]:
# Number of rows of the data frame after 'cleaning'
df_groupby_DF.shape[0]

103

In [49]:
#extract data and display its header
data_lat_long = pd.read_csv('https://cocl.us/Geospatial_data')
data_lat_long.head()
#adding long and lat coordinates to our grouped data frame
df_groupby_DF['Latitude'] = data_lat_long['Latitude']
df_groupby_DF['Longitude'] = data_lat_long['Longitude']
df_groupby_DF.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476
