# 1. Wikipedia page to pandas dataframe

### Web scraping with BeautifulSoup library:

In [1]:
# Import requests (HTTP client) and BeautifulSoup (HTML parser):
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page and parse its HTML.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page in the cells below.
r = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
r.raise_for_status()
c = r.content
soup = BeautifulSoup(c, "html.parser")

In [3]:
# Collect the text of every <td> cell of the page, in document order:
data = [cell.text for cell in soup.find_all('td')]

In [4]:
# Keep everything up to and including the "M9Z" entry plus the two cells that
# follow it (the +3 covers the marker cell itself and the next two cells).
# Fail loudly when the marker is absent: otherwise `databis` would be left
# undefined (or stale from an earlier run) and the next cell would break in a
# confusing way.
if "M9Z" not in data:
    raise ValueError(
        'Marker cell "M9Z" not found in the scraped <td> cells; '
        'the page layout may have changed.'
    )
databis = data[:data.index("M9Z") + 3]

In [5]:
# De-interleave the flat list of cells into `num_of_parts` parallel lists:
# list i holds every num_of_parts-th cell starting at offset i.
# The slice step uses num_of_parts (not a second hard-coded 3) so the stride
# and the number of lists can never drift apart.
num_of_parts = 3
newdata = [databis[i::num_of_parts] for i in range(num_of_parts)]

### Presentation of the wanted elements in a pandas dataframe:

In [6]:
import pandas as pd
import numpy as np

# Map each of the three parallel lists to its column name.
page = dict(
    PostalCode=newdata[0],
    Borough=newdata[1],
    Neighborhood=newdata[2],
)

# Build the frame, strip newline characters from every cell, then discard
# the rows whose borough is 'Not assigned'.
df = (
    pd.DataFrame(page)
    .replace(to_replace='\n', value='', regex=True)
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [7]:
# For each PostalCode, join all of its neighborhoods into a single
# comma-separated string, then turn the result back into a dataframe
# with PostalCode as a regular column:
merge = df.groupby('PostalCode')['Neighborhood'].apply(', '.join)
df2 = merge.to_frame().reset_index()
df2.head()

Unnamed: 0,PostalCode,Neighborhood
0,M1B,"Rouge, Malvern"
1,M1C,"Highland Creek, Rouge Hill, Port Union"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [8]:
# Left-join the per-PostalCode joined neighborhoods (df2) onto the original
# rows (df). Both frames have a 'Neighborhood' column, so pandas suffixes
# them: Neighborhood_x is the single value from df, Neighborhood_y is the
# comma-separated list from df2.
df3 = pd.merge(df, df2, how='left', on='PostalCode')
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood_x,Neighborhood_y
0,M3A,North York,Parkwoods,Parkwoods
1,M4A,North York,Victoria Village,Victoria Village
2,M5A,Downtown Toronto,Harbourfront,"Harbourfront, Regent Park"
3,M5A,Downtown Toronto,Regent Park,"Harbourfront, Regent Park"
4,M6A,North York,Lawrence Heights,"Lawrence Heights, Lawrence Manor"


In [9]:
# Keep only the joined neighborhood column and give it back its plain name:
df3 = (
    df3
    .loc[:, ['PostalCode', 'Borough', 'Neighborhood_y']]
    .rename(columns={"Neighborhood_y": "Neighborhood"})
)

In [10]:
# Preview the first rows after the column selection and rename:
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"


In [11]:
# Where the neighborhood is 'Not assigned', fall back to the borough name;
# every other row keeps its neighborhood unchanged:
neighborhood = df3['Neighborhood']
df3['Neighborhood'] = np.where(
    neighborhood.eq('Not assigned'),
    df3['Borough'],
    neighborhood,
)

In [12]:
# Remove repeated rows, keeping the first occurrence of each
# (PostalCode, Borough, Neighborhood) combination:
is_repeat = df3.duplicated(subset=['PostalCode', 'Borough', 'Neighborhood'])
df3 = df3[~is_repeat]

### Display the final shape of the dataframe obtained:

In [13]:
# (number of rows, number of columns) of the final dataframe:
df3.shape

(103, 3)

In [14]:
# Persist df3 to disk for Part 2.
# index=True is the pandas default, spelled out here: the row index is
# written as an unnamed first column of the CSV file.
df3.to_csv('df3.csv', index=True)