In [36]:
# importing the required libraries
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this filter suppresses every warning for the rest of the
# notebook, including any pandas deprecation or chained-assignment warnings
# that later cells may raise; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# **1** Fetching data from the Wikipedia page <a href='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'>Wikipedia page link</a>

In [37]:
# Download the Wikipedia page and parse it with Beautiful Soup.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [38]:
# Locate the postal-code table in the parsed page.
table = soup.find('table', class_='wikitable sortable')
if table is None:
    # soup.find() returns None when nothing matches; fail here with a clear
    # message rather than with an AttributeError on the first use of `table`.
    raise ValueError("No <table class='wikitable sortable'> found in the page; the page layout may have changed.")

In [39]:
# Create an empty dataframe that only defines the three column labels.
column_names = ['Postal Code', 'Borough', 'Neighbourhood']
toronto_neighbourhood = pd.DataFrame(columns=column_names)
toronto_neighbourhood

Unnamed: 0,Postal Code,Borough,Neighbourhood


# 2 Appending data to dataframe from table


In [40]:
# Parse the table rows into records and build the dataframe in one step.
# Collecting the rows first avoids enlarging the dataframe one .loc
# assignment at a time.
records = []
for row in table.find_all('tr')[1:]:  # [1:] skips the first row with heading
    # Keep only the text before the first newline of each cell;
    # split('\n')[0] returns the whole text when the cell has no newline.
    cells = [cell.text.split('\n')[0] for cell in row.find_all('td')]
    if not cells:
        # A row without <td> cells has no values for the three columns.
        continue
    records.append(cells)
toronto_neighbourhood = pd.DataFrame(records, columns=['Postal Code', 'Borough', 'Neighbourhood'])
toronto_neighbourhood.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# 3 Cleaning the dataframe
### removing rows with borough column as Not assigned.

In [41]:
# Keep only the rows whose borough is assigned and renumber the index from 0.
# reset_index() without inplace returns a new dataframe, so the result is not
# a filtered slice modified in place (which can trigger
# SettingWithCopyWarning when a later cell assigns to one of its columns).
has_borough = toronto_neighbourhood['Borough'] != 'Not assigned'
toronto_neighbourhood = toronto_neighbourhood[has_borough].reset_index(drop=True)
toronto_neighbourhood.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### **combining Neighbourhood with same postcode**

In [42]:
# For every row, compute the comma-separated list of all neighbourhoods that
# share its postal code. transform() returns one value per original row, so
# the row count is unchanged here.
joined_names = (
    toronto_neighbourhood
    .groupby('Postal Code')['Neighbourhood']
    .transform(lambda names: ', '.join(names))
)
# .values assigns by position rather than by index label.
toronto_neighbourhood['Neighbourhood'] = joined_names.values
toronto_neighbourhood.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"


In [43]:
# Drop the fully identical rows that arise and renumber the index from 0.
deduplicated = toronto_neighbourhood.drop_duplicates()
toronto_neighbourhood = deduplicated.reset_index(drop=True)
toronto_neighbourhood.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### **Replace 'Not assigned' Neighbourhoods with their Borough name**

In [44]:
# Where the neighbourhood is exactly 'Not assigned', copy the borough name
# from the same row. Assigning through .loc with a boolean mask writes to the
# dataframe itself; replace(..., inplace=True) on a selected column is chained
# assignment, which does not update the dataframe under pandas Copy-on-Write.
is_unassigned = toronto_neighbourhood['Neighbourhood'] == 'Not assigned'
toronto_neighbourhood.loc[is_unassigned, 'Neighbourhood'] = toronto_neighbourhood.loc[is_unassigned, 'Borough']
toronto_neighbourhood.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [45]:
# Shape of the dataframe as (number of rows, number of columns)
toronto_neighbourhood.shape

(103, 3)

# 4 Obtaining geospatial data
By importing the CSV file from <a href='https://cocl.us/Geospatial_data'>https://cocl.us/Geospatial_data</a>

In [46]:
# Read the coordinates from the CSV file in the working directory and
# preview the first rows.
geospatial_csv_path = 'Geospatial_Coordinates.csv'
lat_long_df = pd.read_csv(geospatial_csv_path)
lat_long_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [47]:
# Shape of the coordinates dataframe as (number of rows, number of columns)
lat_long_df.shape

(103, 3)

### Merging the two dataframes on their **Postal Code** column

In [48]:
# Attach the coordinate columns to each postal code. The join type is spelled
# out (inner is the pandas default), and validate='one_to_one' makes pandas
# raise a MergeError if 'Postal Code' repeats in either dataframe instead of
# silently multiplying rows.
# NOTE(review): an inner join still drops postal codes that are missing from
# either dataframe; compare the shapes before and after to confirm none were lost.
toronto_neighbourhood = pd.merge(
    toronto_neighbourhood,
    lat_long_df,
    how='inner',
    on='Postal Code',
    validate='one_to_one',
)
toronto_neighbourhood.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [49]:
# Shape of the merged dataframe as (number of rows, number of columns)
toronto_neighbourhood.shape

(103, 5)