# Coursera Capstone Project

In [1]:
#import required libraries
import pandas as pd
from bs4 import BeautifulSoup as bsp
import requests

In [2]:
# download the Wikipedia page of Canadian postal codes starting with 'M';
# the timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error page
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

#pass this source (and lxml) to BeautifulSoup
soup = bsp(source, 'lxml')

#scrape the webpage for the table of canada neighbourhoods
table = soup.find('table', class_='wikitable sortable')
if table is None:
    # soup.find returns None when nothing matches; stop here with a clear message
    raise ValueError("no table with class 'wikitable sortable' found on the page")

#collect the table column titles into a list
table_columns = [th.text for th in table.find_all('th')]


In [3]:
# the scraped header texts can carry a trailing newline (e.g. 'Neighbourhood\n');
# strip every title rather than overwriting position 2 with a hard-coded name
table_columns = [title.strip() for title in table_columns]
table_columns

['Postcode', 'Borough', 'Neighbourhood']

In [4]:
# build one list of cell texts per <tr> (table row);
# a row that contains no <td> elements contributes an empty list
output_rows = [
    [cell.text for cell in tr.find_all('td')]
    for tr in table.find_all('tr')
]


In [5]:
# load the scraped rows into a DataFrame; columns are named in the next step
can_df = pd.DataFrame(data=output_rows)

In [6]:
# name the columns after the scraped table headers
can_df.columns = table_columns
# row 0 holds no data (presumably it comes from the header <tr>, which has no <td>);
# errors='ignore' keeps this cell re-runnable once that row is already gone
can_df = can_df.drop(index=0, errors='ignore')

In [7]:
# peek at the first five values of the third column
can_df.iloc[:5, 2]

1        Not assigned\n
2        Not assigned\n
3           Parkwoods\n
4    Victoria Village\n
5        Harbourfront\n
Name: Neighbourhood, dtype: object

In [8]:
# the values in the Neighbourhood column end with a '\n' special character;
# .str.strip() removes only surrounding whitespace, so unlike slicing off the
# last character it can never eat a real letter and is safe to run more than once
can_df['Neighbourhood'] = can_df['Neighbourhood'].str.strip()

In [9]:
# show the first ten rows after cleaning the Neighbourhood text
can_df.iloc[:10]

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


In [10]:
# number of rows scraped from the table
len(can_df)

288

In [11]:
# some Postcodes have a Borough but a Neighbourhood of 'Not assigned';
# give those rows their Borough's name as the Neighbourhood
# (a boolean mask updates all matching rows at once instead of looping by position)
has_borough = can_df['Borough'] != 'Not assigned'
no_neighbourhood = can_df['Neighbourhood'] == 'Not assigned'
needs_fill = has_borough & no_neighbourhood
can_df.loc[needs_fill, 'Neighbourhood'] = can_df.loc[needs_fill, 'Borough']

In [12]:
# show the first ten rows after filling unassigned neighbourhoods
can_df.iloc[:10]

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
10,M8A,Not assigned,Not assigned


In [13]:
# several rows can share one Postcode while naming different neighbourhoods;
# collapse them to one row per Postcode: keep the first Borough and
# join the Neighbourhood names into a single comma-separated string
aggregate_func = dict(Borough='first', Neighbourhood=', '.join)
can_df_new = can_df.groupby('Postcode').agg(aggregate_func)


In [14]:
# preview only the first ten grouped rows instead of dumping the whole frame
can_df_new.head(10)

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1A,Not assigned,Not assigned
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"


In [15]:
# move Postcode out of the index and back into a regular column;
# the guard makes re-running this cell a no-op instead of an error
if can_df_new.index.name == 'Postcode':
    can_df_new = can_df_new.reset_index()
can_df_new.shape

(180, 3)

In [16]:
# we need to drop the Postcodes whose Borough is 'Not assigned',
# but first we must collect the index labels of those rows
# (labels rather than positions, because DataFrame.drop looks rows up by label)
unassigned = can_df_new['Borough'] == 'Not assigned'
drop_list = can_df_new.index[unassigned].tolist()
drop_list[0:9]

[0, 18, 19, 20, 21, 22, 23, 24, 33]

In [17]:
# remove the rows whose labels were collected in drop_list
can_df_new = can_df_new.drop(index=drop_list)

In [18]:
# renumber the rows so the count starts from 0;
# the previous row labels are kept as a new 'index' column
can_df_new = can_df_new.reset_index()
can_df_new.head()

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,1,M1B,Scarborough,"Rouge, Malvern"
1,2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,4,M1G,Scarborough,Woburn
4,5,M1H,Scarborough,Cedarbrae


In [19]:
# Let's drop the 'index' column;
# errors='ignore' lets this cell be re-run after the column is already gone
can_df_new = can_df_new.drop(columns='index', errors='ignore')

In [20]:
# finally our toronto neighbourhood!!!
# (tor_neighbor is a second name for can_df_new, not a copy)
tor_neighbor = can_df_new
tor_neighbor.iloc[:15]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [21]:
# final size check: (rows, columns) of the cleaned Toronto table
n_rows, n_cols = tor_neighbor.shape
(n_rows, n_cols)

(103, 3)