## Import all necessary Python libraries

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
import folium
import requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors


## Load HTML code into BeautifulSoup object for scraping

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The context manager closes the HTTP response as soon as the body has been parsed.
with urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as html:
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed (and warns), so results can differ between machines.
    bsObj=BeautifulSoup(html.read(), 'html.parser')

# Collect every <td> cell under the first <tbody> of the document as one flat list.
data=bsObj.tbody.find_all('td')

In [3]:
# Column buffers that will be filled from the flat 'data' list of table cells.
Postal_Code, Borough, Neighborhood = [], [], []

# The loop treats the cells as repeating groups of three. A 1-based counter that
# cycles 1 -> 2 -> 3 -> 1 selects which buffer receives the current cell's text.
column_targets = (Postal_Code, Borough, Neighborhood)
column_count = 1

for cell in data:
    column_targets[column_count - 1].append(cell.get_text(strip=True))
    column_count = column_count % 3 + 1

## Create and clean data frame with columns scraped from Wiki-Page

In [4]:
# Assemble the three scraped lists into one data frame in a single constructor call;
# 'columns' pins the left-to-right column order.
df = pd.DataFrame(
    {
        'Postal_Code': Postal_Code,
        'Borough': Borough,
        'Neighborhood Name': Neighborhood,
    },
    columns=['Postal_Code', 'Borough', 'Neighborhood Name'],
)

In [5]:
# Preview the first five rows of the freshly built data frame
df.head(5)

Unnamed: 0,Postal_Code,Borough,Neighborhood Name
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
# Turn the "Not assigned" placeholder into NaN so the affected rows can be dropped later
df = df.replace(to_replace="Not assigned", value=np.nan)
df.head(5)

Unnamed: 0,Postal_Code,Borough,Neighborhood Name
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
# Boolean mask with the same shape as df: True wherever a value is missing (NaN)

no_assign = df.isna()
no_assign.head(5)

Unnamed: 0,Postal_Code,Borough,Neighborhood Name
0,False,True,True
1,False,True,True
2,False,False,False
3,False,False,False
4,False,False,False


In [8]:
# Tally missing (True) versus present (False) entries for every column

for column, null_flags in no_assign.items():
    # One print call emits the column name, its counts, and a blank separator line
    print(column, null_flags.value_counts(), "", sep="\n")

Postal_Code
False    180
Name: Postal_Code, dtype: int64

Borough
False    103
True      77
Name: Borough, dtype: int64

Neighborhood Name
False    103
True      77
Name: Neighborhood Name, dtype: int64



In [9]:
# Remove every row that contains a NaN, then renumber the index from 0

df = df.dropna(axis=0).reset_index(drop=True)


In [10]:
# Verify data frame has no NaN

no_assign_no_na = df.isnull()

# Enforce the check instead of relying on a visual scan of the printed counts:
# stop here if any missing value is still present.
assert not no_assign_no_na.values.any(), "data frame still contains NaN values"

# Print the null / non-null tally per column as a human-readable confirmation
for column in no_assign_no_na.columns.values.tolist():
    print(column)
    print (no_assign_no_na[column].value_counts())
    print("")

Postal_Code
False    103
Name: Postal_Code, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighborhood Name
False    103
Name: Neighborhood Name, dtype: int64



In [11]:
# Preview the first five rows of the data frame now that NaN rows are gone

df.head(5)

Unnamed: 0,Postal_Code,Borough,Neighborhood Name
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
# Dimensions of the data frame as a (rows, columns) tuple
df.shape

(103, 3)