# Part 1 - Creating Initial Dataframe

Importing all the libraries

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import geocoder

Fetching the page at the given URL and creating a BeautifulSoup object from its HTML.

In [2]:
# The URL carries an explicit `oldid` revision parameter, i.e. it requests one
# specific revision of the article rather than whatever is current.
URL = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641'

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page in the cells below.
page = requests.get(URL, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')

Finding the table in the html code.

In [3]:
# Grab the first <table> element in the parsed document.
table = soup.find(name='table')

Creating an initial rough table with all the details.

In [4]:
# Build one record (PostalCode, Borough, Neighborhood) per usable table row.
NOT_ASSIGNED = 'Not assigned'

table_contents = []
for row in table.find_all('tr')[1:]:  # skip the first <tr> (presumably the header row)
    # Read the cells once per row and strip surrounding whitespace, so the
    # 'Not assigned' checks do not depend on where the HTML puts newlines.
    cells = [td.text.strip() for td in row.find_all('td')]

    # A row without all three columns cannot yield a record.
    if len(cells) < 3:
        continue

    postal_code, borough, neighborhood = cells[0][:3], cells[1], cells[2]

    # If 'Borough' is 'Not assigned' then we simply ignore the row.
    if borough == NOT_ASSIGNED:
        continue

    # If 'Neighborhood' is 'Not assigned' then we give it the value from 'Borough'.
    if neighborhood == NOT_ASSIGNED:
        neighborhood = borough

    table_contents.append({
        'PostalCode': postal_code,
        'Borough': borough,
        'Neighborhood': neighborhood,
    })

Converting list to dataframes.

In [5]:
# One DataFrame row per parsed record; columns come from the record keys.
df = pd.DataFrame(data=table_contents)

Combining rows that share the same postal code (and borough) into a single row, with their neighborhoods joined by commas.

In [6]:
# Every row in a (PostalCode, Borough) group receives the same comma-separated
# neighborhood string; the now-identical rows are then collapsed to one and the
# index is renumbered from 0.
df = (
    df.assign(
        Neighborhood=df.groupby(['PostalCode', 'Borough'])['Neighborhood']
        .transform(lambda x: ', '.join(x))
    )
    .drop_duplicates()
    # drop=True discards the old index directly instead of materialising it as
    # an 'index' column that then has to be dropped in place.
    .reset_index(drop=True)
)
print(df.head(10))

  PostalCode           Borough                      Neighborhood
0        M3A        North York                         Parkwoods
1        M4A        North York                  Victoria Village
2        M5A  Downtown Toronto         Harbourfront, Regent Park
3        M6A        North York  Lawrence Heights, Lawrence Manor
4        M7A      Queen's Park                      Queen's Park
5        M9A         Etobicoke                  Islington Avenue
6        M1B       Scarborough                    Rouge, Malvern
7        M3B        North York                   Don Mills North
8        M4B         East York   Woodbine Gardens, Parkview Hill
9        M5B  Downtown Toronto          Ryerson, Garden District


Printing the shape.

In [7]:
# Report the (rows, columns) dimensions of the combined table.
print(tuple(df.shape))

(103, 3)


# Part 2 - Finding Coordinates

Loading the latitude and longitude of each postal code from a CSV file.

In [8]:
# Read the postal-code coordinate table from disk and preview its first rows.
geoSpatial = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
print(geoSpatial.head(n=5))

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


Creating two lists holding the latitude and longitude for each row of the table, so that we can add them as columns to the existing table.

In [9]:
# Index the coordinate table by postal code once, instead of scanning the whole
# frame twice for every row of df. keep='first' mirrors the previous behaviour
# of taking the first matching row.
coords_by_code = (
    geoSpatial
    .drop_duplicates(subset='Postal Code', keep='first')
    .set_index('Postal Code')
)

# Fail with the offending codes named, rather than with an opaque
# out-of-bounds error from indexing an empty selection.
missing_codes = sorted(set(df['PostalCode']) - set(coords_by_code.index))
if missing_codes:
    raise KeyError(f"No coordinates found for postal codes: {missing_codes}")

# One value per row of df, in df's row order, ready to be added as columns.
Latitude = df['PostalCode'].map(coords_by_code['Latitude']).tolist()
Longitude = df['PostalCode'].map(coords_by_code['Longitude']).tolist()

In [10]:
# Attach the coordinate lists as new columns; targets are assigned left to
# right, so 'Longitude' is still added before 'Latitude'.
df['Longitude'], df['Latitude'] = Longitude, Latitude

Printing Final Table

In [11]:
# Show the first ten rows of the finished table.
print(df.iloc[:10])

  PostalCode           Borough                      Neighborhood  Longitude  \
0        M3A        North York                         Parkwoods -79.329656   
1        M4A        North York                  Victoria Village -79.315572   
2        M5A  Downtown Toronto         Harbourfront, Regent Park -79.360636   
3        M6A        North York  Lawrence Heights, Lawrence Manor -79.464763   
4        M7A      Queen's Park                      Queen's Park -79.389494   
5        M9A         Etobicoke                  Islington Avenue -79.532242   
6        M1B       Scarborough                    Rouge, Malvern -79.194353   
7        M3B        North York                   Don Mills North -79.352188   
8        M4B         East York   Woodbine Gardens, Parkview Hill -79.309937   
9        M5B  Downtown Toronto          Ryerson, Garden District -79.378937   

    Latitude  
0  43.753259  
1  43.725882  
2  43.654260  
3  43.718518  
4  43.662301  
5  43.667856  
6  43.806686  
7  43.7459