## PART 1 - WEB SCRAPING, DATA LOADING & CLEANSING

In [1]:
## Download the Wikipedia page listing the Canadian "M" postal codes and parse it
import requests
from bs4 import BeautifulSoup

# requests applies no timeout by default, so set one to keep the cell from
# hanging on a stalled connection; raise_for_status() stops here on an HTTP
# error response instead of handing an error page to the parser.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()

# NOTE: despite its name, this variable holds the page's HTML text, not a URL.
post_codes_ca_url = response.text
soup = BeautifulSoup(post_codes_ca_url,'lxml')

In [2]:
## Split the table cells into three parallel lists
# The <td> cells under soup.table are read in document order and treated as
# repeating groups of three: postal code, borough, neighborhood.
cell_texts = [cell.text for cell in soup.table.find_all('td')]
codes_list = cell_texts[0::3]
borough_list = cell_texts[1::3]
neighborhood_list = cell_texts[2::3]

In [3]:
## Build the dataframe from the three extracted lists
import pandas as pd

# One column per list; the key order fixes the column order.
toronto_df = pd.DataFrame({
    'Postalcode': codes_list,
    'Borough': borough_list,
    'Neighborhood': neighborhood_list,
})
toronto_df

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n
7,M7A,Downtown Toronto,Queen's Park\n
8,M8A,Not assigned,Not assigned\n
9,M9A,Etobicoke,Islington Avenue\n


### CLEANING DATA

In [4]:
## Collect the index labels of the rows whose Borough is 'Not assigned'
unassigned_borough = toronto_df['Borough'] == "Not assigned"
indexs = toronto_df.index[unassigned_borough]
indexs

Int64Index([  0,   1,   8,  12,  19,  20,  29,  35,  36,  44,  45,  49,  50,
             51,  53,  54,  58,  59,  60,  72,  73,  74,  87,  88,  89, 103,
            104, 105, 119, 120, 135, 136, 147, 148, 154, 160, 161, 166, 174,
            180, 181, 187, 188, 189, 193, 194, 200, 201, 202, 203, 208, 209,
            222, 223, 236, 237, 240, 241, 246, 247, 252, 253, 257, 258, 259,
            260, 262, 263, 273, 274, 275, 276, 277, 278, 279, 280, 286],
           dtype='int64')

In [5]:
## Remove the rows with 'Not assigned' Borough
# Drop by index *label*, with the labels recomputed from the current frame.
# Using labels as positions (toronto_df.index[<labels>]) is only correct while
# the index is the default 0..n-1 range, and a stale label list would fail or
# remove the wrong rows if this cell were run again. Recomputing the mask makes
# a second run a harmless no-op.
toronto_df.drop(index=toronto_df.index[toronto_df['Borough'] == "Not assigned"], inplace=True)

In [6]:
## Renumber the rows from 0 (discarding the old labels),
## then strip the '\n' from every Neighborhood value
toronto_df = toronto_df.reset_index(drop=True)
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].str.replace("\n","")
toronto_df

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [7]:
## Collect the index labels of the rows whose Neighborhood is 'Not assigned'
unassigned_neighborhood = toronto_df['Neighborhood'] == "Not assigned"
indexs_nb = toronto_df.index[unassigned_neighborhood]
indexs_nb

Int64Index([], dtype='int64')

In [8]:
## Give every 'Not assigned' Neighborhood the value of its Borough
# A boolean mask with .loc covers every row no matter how many the frame has
# (no hard-coded row count) and writes into toronto_df itself. Chained indexing
# such as toronto_df.Neighborhood[i] = ... may assign to a temporary copy.
needs_borough_name = toronto_df['Neighborhood'] == 'Not assigned'
toronto_df.loc[needs_borough_name, 'Neighborhood'] = toronto_df.loc[needs_borough_name, 'Borough']

In [9]:
## Display the dataframe after the cleaning steps
toronto_df

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [10]:
## Merge the rows that share a postal code and borough into a single row,
## joining their neighborhoods with commas
neighborhoods_by_code = toronto_df.groupby(['Postalcode','Borough'])['Neighborhood']
toronto_df_Final = neighborhoods_by_code.agg(','.join).reset_index()
toronto_df_Final

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [11]:
## Show the dimensions (rows, columns) of the grouped dataframe
toronto_df_Final.shape

(103, 3)