## First, install beautifulsoup4 if it is not already installed

In [1]:
!pip install beautifulsoup4



## Then install the lxml library

In [2]:
!pip install lxml



In [3]:
import bs4 as bs
import urllib.request

In [4]:
# Download the Wikipedia page and parse it with BeautifulSoup using the lxml parser.
# The `with` block closes the HTTP response as soon as the body has been read,
# rather than leaving the connection open until the object is garbage-collected.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as response:
    sauce = response.read()
soup = bs.BeautifulSoup(sauce, 'lxml')

In [5]:
# Display the page's <title> tag as a quick check that the expected page was downloaded and parsed
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [6]:
# Grab the postal-code table. find() returns the FIRST <table> element in the document.
# NOTE(review): this relies on the postal-code table being the first table on the page - confirm.
table = soup.find('table')

In [7]:
# Collect every table row, i.e. every <tr> tag found inside the table
table_rows =table.find_all('tr')

In [8]:
# Walk over the table rows and print the text of each row's <td> cells as a list
for table_row in table_rows:
    cells = table_row.find_all('td')
    print([cell.text for cell in cells])

[]
['M1A', 'Not assigned', 'Not assigned\n']
['M2A', 'Not assigned', 'Not assigned\n']
['M3A', 'North York', 'Parkwoods\n']
['M4A', 'North York', 'Victoria Village\n']
['M5A', 'Downtown Toronto', 'Harbourfront\n']
['M6A', 'North York', 'Lawrence Heights\n']
['M6A', 'North York', 'Lawrence Manor\n']
['M7A', 'Downtown Toronto', "Queen's Park\n"]
['M8A', 'Not assigned', 'Not assigned\n']
['M9A', "Queen's Park", 'Not assigned\n']
['M1B', 'Scarborough', 'Rouge\n']
['M1B', 'Scarborough', 'Malvern\n']
['M2B', 'Not assigned', 'Not assigned\n']
['M3B', 'North York', 'Don Mills North\n']
['M4B', 'East York', 'Woodbine Gardens\n']
['M4B', 'East York', 'Parkview Hill\n']
['M5B', 'Downtown Toronto', 'Ryerson\n']
['M5B', 'Downtown Toronto', 'Garden District\n']
['M6B', 'North York', 'Glencairn\n']
['M7B', 'Not assigned', 'Not assigned\n']
['M8B', 'Not assigned', 'Not assigned\n']
['M9B', 'Etobicoke', 'Cloverdale\n']
['M9B', 'Etobicoke', 'Islington\n']
['M9B', 'Etobicoke', 'Martin Grove\n']
['M9B', '

In [9]:
# Build a list of lists: one inner list per table row, holding the text of its <td> cells.
# The first <tr> is the header row, so the slice starts at index 1 to exclude it.
listrows = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table_rows[1:]
]
# Show the first 10 rows
listrows[:10]

[['M1A', 'Not assigned', 'Not assigned\n'],
 ['M2A', 'Not assigned', 'Not assigned\n'],
 ['M3A', 'North York', 'Parkwoods\n'],
 ['M4A', 'North York', 'Victoria Village\n'],
 ['M5A', 'Downtown Toronto', 'Harbourfront\n'],
 ['M6A', 'North York', 'Lawrence Heights\n'],
 ['M6A', 'North York', 'Lawrence Manor\n'],
 ['M7A', 'Downtown Toronto', "Queen's Park\n"],
 ['M8A', 'Not assigned', 'Not assigned\n'],
 ['M9A', "Queen's Park", 'Not assigned\n']]

In [10]:
# Import pandas and convert listrows (one inner list per table row) into a DataFrame
import pandas as pd
df = pd.DataFrame(listrows)
df.head()

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [12]:
# The dataframe consists of three columns; name them Postcode, Borough and Neighborhood
df.columns = ['Postcode', 'Borough', 'Neighborhood']
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [13]:
# Helper that strips newline characters ("\n") from a neighborhood name
def cleanNeighborhood(Neighborhood):
    """Return ``Neighborhood`` with every newline character removed.

    Parameters
    ----------
    Neighborhood : str
        Raw cell text, e.g. ``'Parkwoods\\n'``.

    Returns
    -------
    str
        The same text without any ``'\\n'`` characters.
    """
    # Splitting on the newline and re-joining the pieces drops every "\n"
    return ''.join(Neighborhood.split('\n'))

In [14]:
# If a cell has a borough but a 'Not assigned' neighborhood, the neighborhood
# becomes the same as the borough. For example, a row with Borough "Queen's Park"
# and Neighborhood 'Not assigned' ends up with "Queen's Park" in both columns.
def notAssignedMatch(Borough, Neighborhood):
    """Return the neighborhood name, falling back to the borough.

    Parameters
    ----------
    Borough : str
        Borough name of the row.
    Neighborhood : str
        Neighborhood name of the row (already stripped of newlines).

    Returns
    -------
    str
        ``Borough`` when ``Neighborhood`` equals ``'Not assigned'``,
        otherwise ``Neighborhood`` unchanged.
    """
    return Borough if Neighborhood == 'Not assigned' else Neighborhood

In [15]:
# Sanity-check the two helper functions on sample values before applying them to the dataframe.
# The borough is written with double quotes so the apostrophe is kept: 'Queen''s Park'
# would be two adjacent string literals, which Python concatenates into "Queens Park".
print(cleanNeighborhood('Not assigned\n'))
print(notAssignedMatch("Queen's Park", 'Not assigned'))

Not assigned
Queens Park


In [16]:
# Step 1: strip the newline from every Neighborhood value (element-wise on that one column)
df['Neighborhood'] = df['Neighborhood'].apply(cleanNeighborhood)
# Step 2: row by row, replace a 'Not assigned' neighborhood with the row's borough
df['Neighborhood'] = df.apply(
    lambda record: notAssignedMatch(record['Borough'], record['Neighborhood']),
    axis=1,
)

In [18]:
# Check the dataframe: the Neighborhood values should no longer end with "\n"
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [23]:
# Only keep the rows that have an assigned borough; drop rows whose borough is 'Not assigned'.
# The test is made on the Borough column itself (the column the rule is about), so the
# result does not depend on the Neighborhood fallback having been applied beforehand.
df = df[df['Borough'] != 'Not assigned']
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [24]:
# Check the dataframe shape as (rows, columns) before applying the group-by
df.shape

(210, 3)

In [27]:
# Combine rows that share a postcode and borough into a single row whose
# Neighborhood is the comma-separated list of that group's neighborhoods.
neighborhoods_by_code = df.groupby(['Postcode', 'Borough'])['Neighborhood'].agg(','.join)
# Turn the (Postcode, Borough) group keys back into ordinary columns
df_grouped = neighborhoods_by_code.reset_index()
df_grouped.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [29]:
# Final cell: use the .shape attribute to show the (rows, columns) size of the grouped dataframe
df_grouped.shape

(103, 3)