#### Importing Beautiful Soup and the other required libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pandas as pd

In [36]:
website_url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Download the page; the timeout keeps the cell from hanging on a stalled connection
page = requests.get(website_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the table
page.raise_for_status()
# Parse the raw response bytes into an lxml element tree
doc = lh.fromstring(page.content)
# Collect every <tr> element found anywhere in the document
tr_elements = doc.xpath('//tr')

In [37]:
# Count the child cells of each of the first 12 rows
list(map(len, tr_elements[:12]))

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [38]:
# Select every <tr> element of the parsed document again
tr_elements = doc.xpath('//tr')
# Start with no (header, values) pairs and a zeroed column counter
col, i = [], 0

In [39]:
# Build one (header, empty list) pair per cell of the first row.
# `col` is reset here so that re-running this cell does not append the
# headers a second time.
col = []
for i, t in enumerate(tr_elements[0], start=1):
    # Strip surrounding whitespace: the last header cell ends with a newline,
    # which would otherwise become part of the DataFrame column name.
    name = t.text_content().strip()
    print('%d:"%s"' % (i, name))
    col.append((name, []))

1:"Postcode"
2:"Borough"
3:"Neighbourhood
"


In [40]:
# The first row is the header, so the data starts at the second row.
for j in range(1, len(tr_elements)):
    # T is the j-th row
    T = tr_elements[j]

    # Rows of the table being read have exactly 3 cells; stop at the first
    # row of any other size, since it no longer belongs to that table.
    if len(T) != 3:
        break

    # i is the column index of the current cell
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # The first column is always kept as text; in the remaining columns,
        # text that parses as an integer is stored as an int.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not an integer: keep the original text
                pass
        # Append the value to the list of the i-th column
        col[i][1].append(data)

In [41]:
# Number of values collected for each column
[len(values) for (_, values) in col]


[288, 288, 288]

In [42]:
# Turn the (header, values) pairs into a mapping and build the frame from it
Dict = dict(col)
df = pd.DataFrame(Dict)

In [43]:
# Preview the first rows of the newly built frame
df.head()

Unnamed: 0,Borough,Neighbourhood,Postcode
0,Not assigned,Not assigned\n,M1A
1,Not assigned,Not assigned\n,M2A
2,North York,Parkwoods\n,M3A
3,North York,Victoria Village\n,M4A
4,Downtown Toronto,Harbourfront\n,M5A


In [44]:
# Put the columns in Postcode, Borough, Neighbourhood order.
# The positions are looked up by name rather than hard-coded, because the
# initial column order depends on the pandas version (alphabetically sorted
# dict keys in old releases, insertion order in newer ones).
wanted = ['Postcode', 'Borough', 'Neighbourhood']
# Compare on stripped labels so a header with a trailing newline still matches
labels = [str(label).strip() for label in df.columns]
new_order = [labels.index(name) for name in wanted]
df = df[df.columns[new_order]]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [47]:
# Keep only the rows whose Borough is something other than 'Not assigned'
df = df[df['Borough'].ne('Not assigned')]

In [48]:
# (rows, columns) of the frame at this point
df.shape

(211, 3)

In [51]:
# Renumber the rows from 0; drop=True discards the old index instead of keeping it as a column
df = df.reset_index(drop=True)

In [52]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n


In [64]:
# Show the current column labels as an array
df.columns.values

array(['Postcode', 'Borough', 'Neighbourhood'], dtype=object)

In [65]:
# Remove the trailing newline from the Neighbourhood column label.
# Reassigning the result (rather than inplace=True) keeps the cell free of
# in-place mutation; a label that is already clean is left untouched.
df = df.rename(columns={'Neighbourhood\n': 'Neighbourhood'})

In [67]:
# Trim the trailing newline from every Neighbourhood value using the
# vectorized string accessor instead of a per-element lambda
df['Neighbourhood'] = df['Neighbourhood'].str.rstrip('\n')

In [68]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [74]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 
# So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

# iterrows() hands back a copy of each row, so assigning to that copy never
# changes df. Select the affected rows with a boolean mask and write the
# Borough values into them through .loc instead.
unassigned = df['Neighbourhood'] == "Not assigned"
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

In [79]:
# More than one neighborhood can exist in one postal code area. 
# For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. 
# These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

