In [77]:
# importing the required libraries
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings emitted by pandas
# (e.g. about chained assignment) that may point at real problems -- consider
# narrowing it to specific warning categories.
warnings.filterwarnings("ignore")

# **1** Fetching data from the Wikipedia page <a href='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'>List of postal codes of Canada: M</a>

In [78]:
# Download the Wikipedia page and parse its HTML with BeautifulSoup (lxml parser).
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the run on an HTTP error instead of silently parsing
# an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [79]:
# Locate the first <table> element whose class attribute is "wikitable sortable".
table = soup.find('table', attrs={'class': 'wikitable sortable'})

In [80]:
# Start from an empty dataframe that only declares the three expected columns.
column_names = ('Postcode', 'Borough', 'Neighbourhood')
toronto_neighbourhood = pd.DataFrame(columns=list(column_names))
toronto_neighbourhood

Unnamed: 0,Postcode,Borough,Neighbourhood


# 2 Appending data to dataframe from table


In [81]:
# Collect every data row of the table first and build the dataframe once,
# instead of enlarging it one row at a time with .loc (which gets slower as the
# frame grows and makes the cell non-repeatable).
column_names = ['Postcode', 'Borough', 'Neighbourhood']
scraped_rows = []
for table_row in table.find_all('tr'):
    # Keep only the text before the first newline of each cell; split() returns
    # the whole text unchanged when the cell contains no newline.
    cells = [cell.text.split('\n')[0] for cell in table_row.find_all('td')]
    if not cells:
        # Rows without any <td> cell (presumably the <th>-only heading row)
        # carry no data, so they are skipped.
        continue
    scraped_rows.append(cells)
toronto_neighbourhood = pd.DataFrame(scraped_rows, columns=column_names)
toronto_neighbourhood.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# 3 Cleaning the dataframe
### Removing rows whose Borough is 'Not assigned'

In [82]:
# Keep only the rows whose borough is something other than 'Not assigned',
# then renumber the index from 0.
has_borough = toronto_neighbourhood['Borough'] != 'Not assigned'
toronto_neighbourhood = toronto_neighbourhood[has_borough].reset_index(drop=True)
toronto_neighbourhood.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### **Combining neighbourhoods that share the same postcode**

In [83]:
# For every postcode, join all of its neighbourhood names into a single
# comma-separated string and write that string back onto each row of the group.
neighbourhoods_by_postcode = toronto_neighbourhood.groupby('Postcode')['Neighbourhood']
combined_names = neighbourhoods_by_postcode.transform(lambda names: ', '.join(names))
toronto_neighbourhood['Neighbourhood'] = combined_names.values
toronto_neighbourhood.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"


In [84]:
# Drop rows that are exact duplicates of an earlier row, then renumber the index from 0.
unique_rows = toronto_neighbourhood.drop_duplicates()
toronto_neighbourhood = unique_rows.reset_index(drop=True)
toronto_neighbourhood.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### **Replace 'Not assigned' neighbourhoods with their borough name**

In [85]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name of
# the same row.
# The assignment goes through .loc on the dataframe itself: calling
# replace(..., inplace=True) on a column selection is chained assignment and is
# not guaranteed to write back to the dataframe (it does not under pandas
# Copy-on-Write).
unassigned = toronto_neighbourhood['Neighbourhood'] == 'Not assigned'
toronto_neighbourhood.loc[unassigned, 'Neighbourhood'] = toronto_neighbourhood.loc[unassigned, 'Borough']
toronto_neighbourhood.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [86]:
# Shape of the cleaned dataframe as a (rows, columns) tuple
toronto_neighbourhood.shape

(103, 3)