# Capstone Project Part 1

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

## Part 1: Getting the table from Wikipedia

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of letting
# the later cells parse an error page.
web = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
web.raise_for_status()

In [3]:
# Parse the downloaded HTML with Python's built-in "html.parser" backend.
bs = BeautifulSoup(markup=web.text, features="html.parser")

In [4]:
# Take the first <table> element of the parsed page.
data_table = bs.find("table")
if data_table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError on the .find_all() calls further down.
    raise ValueError("no <table> element found in the downloaded page")

### get header

In [5]:
# Column names are the whitespace-stripped texts of the table's <th> cells.
header = list(map(lambda cell: cell.text.strip(), data_table.find_all('th')))
print(header)

['Postal Code', 'Borough', 'Neighborhood']


### get table data

In [6]:
# One list of stripped <td> texts per <tr>. A row without <td> cells (such as a
# header row made of <th> cells) yields an empty list.
data = [
    list(map(lambda cell: cell.text.strip(), table_row.find_all('td')))
    for table_row in data_table.find_all('tr')
]

### build pandas dataframe

In [7]:
# Skip the first scraped row, label the columns with the scraped header, and
# turn the "Not assigned" placeholder into NaN so it counts as missing.
df = pd.DataFrame(data[1:], columns=header).replace("Not assigned", np.nan)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
# Summary statistics per column; the "count" row excludes the NaN values
# introduced by replacing "Not assigned".
df.describe()

Unnamed: 0,Postal Code,Borough,Neighborhood
count,180,103,103
unique,180,10,99
top,M3X,North York,Downsview
freq,1,24,4


### clean the dataframe

In [9]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Combine multiple Neighborhoods that share a Postal Code

In [10]:
def combine_string(a: pd.Series, sep: str = ',') -> str:
    """Join every value of a Series into one delimited string.

    Parameters
    ----------
    a : pd.Series
        Values to combine, in Series order.
    sep : str, default ','
        Delimiter placed between consecutive values.

    Returns
    -------
    str
        Each value converted with ``str`` and joined by ``sep``. An empty
        Series gives ``''``.
    """
    # str.join also covers the cases the manual loop could not handle: an
    # empty Series (``a.iloc[0]`` raised IndexError) and a non-string first
    # value (``result += ','`` raised TypeError).
    return sep.join(map(str, a))

# Group the rows by postal code and borough, merge the remaining column of each
# group with combine_string, then turn the group keys back into ordinary columns.
df = (
    df.groupby(by=['Postal Code', 'Borough'])
    .agg(combine_string)
    .reset_index()
)

In [11]:
# Preview the first rows of the grouped frame.
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
# Report how many rows remain after grouping.
print("number of rows: {}".format(len(df.index)))

number of rows: 103
