## Import all necessary Python libraries

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
import folium
import requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors


## Load CSV file and create Pandas data frame

In [2]:
# Disabled: the same CSV load is performed further down, in the section
# "Load CSV file containing lat. and long. of postal code zones".
#csv_file = ('https://cocl.us/Geospatial_data')
#df_csv = pd.read_csv(csv_file)

## Load HTML code into BeautifulSoup object for scraping

In [3]:
# Download the wiki page listing Canadian postal codes and parse it.
# The `with` block closes the HTTP response as soon as the body has been read.
with urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as html:
    # Name the parser explicitly: without it Beautiful Soup picks whichever parser
    # happens to be installed (and warns), so results could differ between machines.
    bsObj = BeautifulSoup(html.read(), 'html.parser')

# Collect every <td> cell of the first <tbody> in the page; these cells hold the
# postal codes, boroughs and neighborhoods, in document order.
data = bsObj.tbody.find_all('td')

In [4]:
# Split the flat, document-ordered list of <td> cells in `data` into one list per
# table column. The cells are consumed in groups of three (postal code, borough,
# neighborhood), so taking every third cell starting at offsets 0, 1 and 2 yields
# the three columns for later data frame creation.
# get_text(strip=True) returns the cell text with surrounding whitespace removed.

Postal_Code = [cell.get_text(strip=True) for cell in data[0::3]]
Borough = [cell.get_text(strip=True) for cell in data[1::3]]
Neighborhood = [cell.get_text(strip=True) for cell in data[2::3]]

## Create and clean data frame with columns scraped from Wiki-Page

In [5]:
# Assemble the three scraped column lists into a single data frame,
# one column per list, in the order given here.
df = pd.DataFrame({
    'Postal_Code': Postal_Code,
    'Borough': Borough,
    'Neighborhood Name': Neighborhood,
})

In [6]:
# Preview the first five rows of the freshly scraped data frame
df.head(n=5)

Unnamed: 0,Postal_Code,Borough,Neighborhood Name
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
# Turn every "Not assigned" cell into NaN so that those rows can be removed later
df = df.replace(to_replace="Not assigned", value=np.nan)
df.head(n=5)

Unnamed: 0,Postal_Code,Borough,Neighborhood Name
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
# Build a boolean frame of the same shape as `df`: True where a value is missing

no_assign = df.isna()
no_assign.head(n=5)

Unnamed: 0,Postal_Code,Borough,Neighborhood Name
0,False,True,True
1,False,True,True
2,False,False,False
3,False,False,False
4,False,False,False


In [9]:
# For each column, print its name followed by how many entries are
# missing (True) versus present (False), then a blank separator line

for col_name in no_assign.columns:
    print(col_name, no_assign[col_name].value_counts(), "", sep="\n")

Postal_Code
False    180
Name: Postal_Code, dtype: int64

Borough
False    103
True      77
Name: Borough, dtype: int64

Neighborhood Name
False    103
True      77
Name: Neighborhood Name, dtype: int64



In [10]:
# Remove every row that contains a NaN, then renumber the remaining rows from zero

df = df.dropna(axis=0).reset_index(drop=True)


In [11]:
# Verify data frame has no NaN

no_assign_no_na = df.isnull()

# Fail loudly if any missing value is still present, instead of relying on
# someone reading the printed counts below.
assert not no_assign_no_na.values.any(), "data frame still contains NaN values"

# Print the missing (True) / present (False) tally per column for the reader;
# after the drop only False counts are expected.
for column in no_assign_no_na.columns.values.tolist():
    print(column)
    print (no_assign_no_na[column].value_counts())
    print("")

Postal_Code
False    103
Name: Postal_Code, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighborhood Name
False    103
Name: Neighborhood Name, dtype: int64



In [12]:
# Preview the first five rows of the data frame now that the NaN rows are gone

df.head(n=5)

Unnamed: 0,Postal_Code,Borough,Neighborhood Name
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Load CSV file containing lat. and long. of postal code zones

In [13]:
# Read the CSV file behind the URL below into a Pandas data frame

csv_file = 'https://cocl.us/Geospatial_data'
df_csv = pd.read_csv(filepath_or_buffer=csv_file)

In [14]:
# Preview the first five rows of the geospatial data frame
df_csv.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Prepare both dataframes for later concatenation

In [15]:
# Order the geospatial data frame by its postal code identifiers and renumber its rows from zero
df_csv = df_csv.sort_values(by='Postal Code').reset_index(drop=True)

# Do the same for the neighborhood / boroughs data frame (scraped from the wiki page)
df = df.sort_values(by='Postal_Code').reset_index(drop=True)

In [16]:
# Sanity check: display the geospatial data frame so it can be compared with the neighborhood / boroughs
# data frame shown in the next cell. Both data frames should have the same number of rows and list their
# postal code identifiers in ascending alphabetical order.
df_csv

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [17]:
# Display the neighborhood / boroughs data frame for comparison with the geospatial data frame above.
# Both data frames are expected to have 103 rows, with the same postal code identifier at each row number.
# NOTE(review): the 103-row figure comes from the earlier null-count output; confirm it for the CSV data too.
# NOTE: The postal code columns are purposefully named differently in the two data frames
# ('Postal_Code' here, 'Postal Code' in the geospatial one). This is only so that they can be told apart
# and compared once the two data frames are combined.
df

Unnamed: 0,Postal_Code,Borough,Neighborhood Name
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [18]:
# Combine both dataframes into one single dataframe by matching postal codes.
# Joining on the key columns (instead of gluing the frames side by side by row
# position) keeps every latitude / longitude with its own postal code even if the
# two sources differ in row count or order.
#   - how='left' keeps every row of `df`, in its existing order; a postal code
#     missing from the CSV gets NaN coordinates instead of a neighbor's values.
#   - The key columns have different names, so both ('Postal_Code' and
#     'Postal Code') are kept in the result.
#   - validate='many_to_one' raises pandas' MergeError if a postal code appears
#     more than once in the geospatial frame, which would otherwise duplicate rows.
con_df = pd.merge(
    df,
    df_csv,
    how='left',
    left_on='Postal_Code',
    right_on='Postal Code',
    validate='many_to_one',
)

In [19]:
# Sanity check the resulting dataframe. Both postal code identifier columns ('Postal_Code' and
# 'Postal Code') are still present at this point; 'Postal Code' is dropped in a later cell.
con_df

Unnamed: 0,Postal_Code,Borough,Neighborhood Name,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
...,...,...,...,...,...,...
98,M9N,York,Weston,M9N,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,M9P,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",M9R,43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",M9V,43.739416,-79.588437


In [20]:
# Remove the 'Postal Code' column, leaving 'Postal_Code' as the only postal code identifier
con_df = con_df.drop(columns=['Postal Code'])

In [21]:
# Preview the combined dataframe: the neighborhoods, boroughs and postal code identifiers scraped from the
# wiki-page, together with the latitude / longitude values from the geospatial csv data file.
con_df.head(n=5)

Unnamed: 0,Postal_Code,Borough,Neighborhood Name,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [22]:
# Create a separate dataframe holding only the rows whose borough name contains "Toronto".
#   - regex=False: "Toronto" is matched as a plain substring, not as a regular expression.
#   - na=False: a missing borough counts as "no match" instead of producing a NaN
#     that cannot be used as a boolean mask.
is_toronto = con_df['Borough'].str.contains('Toronto', regex=False, na=False)

# Renumber the rows from zero as part of the same expression; resetting the index
# in place on a filtered slice can raise SettingWithCopyWarning.
Toronto_df = con_df[is_toronto].reset_index(drop=True)
Toronto_df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood Name,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
