# Prepping the Toronto Data

### First, fetch the wiki page as HTML using the `requests` module

In [None]:
import requests
# Wikipedia page of Canadian postal codes beginning with "M"; this is the
# page the notebook scrapes.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# requests.get() waits indefinitely unless a timeout is given, so set one.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of handing an error page to
# the parser in the following cells.
response.raise_for_status()
website_as_html = response.text

### Now use _BS4_ to parse the HTML page

In [None]:
from bs4 import BeautifulSoup 
# Parse the downloaded HTML with the lxml parser so it can be searched by tag.
soup = BeautifulSoup(website_as_html, features='lxml')

#### Find the _table_ and then its _tr_ tags to extract the data

In [None]:
# The required data is in the table whose class is 'wikitable sortable'.
my_table = soup.find('table', {'class': 'wikitable sortable'})
if my_table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the row lookup below.
    raise ValueError("No table with class 'wikitable sortable' found in the page")
# find_all is the current spelling of the deprecated findAll alias.
tr_tag_data = my_table.find_all('tr')

In [None]:
# One empty list per column; this dictionary is converted to a dataframe later.
data = {column: [] for column in ('PostalCode', 'Borough', 'Neighborhood')}

#### Loop over all the _tr_ rows and add them to the _data_ dictionary

In [None]:
# Skip the first row, which is the header ('PostalCode', 'Borough', 'Neighborhood').
for tr in tr_tag_data[1:]:
    # Read each cell from its own tag instead of splitting the row text on
    # '\n': the split only lines up when every cell sits on exactly one line.
    cells = [
        cell.get_text().strip()
        for cell in tr.find_all(['td', 'th'], recursive=False)
    ]
    if len(cells) < 3:
        # Not a full (postal code, borough, neighborhood) row; skip it rather
        # than fail with an IndexError.
        continue
    postal_code, borough, neighborhood = cells[:3]

    if borough == 'Not assigned':
        # Postal codes without a borough are left out entirely.
        continue
    if neighborhood == 'Not assigned':
        # No neighborhood listed: use the borough name in its place.
        neighborhood = borough

    data['PostalCode'].append(postal_code)
    data['Borough'].append(borough)
    data['Neighborhood'].append(neighborhood)


## Create the dataframe from the _data_ dictionary

In [None]:
import pandas as pd
# Build the frame from the column lists, then order the rows by postal code
# and renumber the index from 0.
df = pd.DataFrame(data)
toronto_data = (
    df.sort_values(by='PostalCode')
    .reset_index(drop=True)
)

### Merge the rows that share the same _PostalCode_

In [None]:
# Rows sharing both PostalCode and Borough collapse into a single row; the
# remaining column's strings in each group are joined with ', '.
group_keys = ['PostalCode', 'Borough']
grouped = toronto_data.groupby(group_keys, as_index=False)
toronto_data = grouped.agg(', '.join)
toronto_data.head()

## Finally, print the shape of the cleaned dataframe

In [None]:
# Report the (rows, columns) shape of the merged dataframe.
final_shape = toronto_data.shape
print('Shape of Final dataframe is: ', final_shape)