# <div align="center">**Doug Delaney**</div>
## <div align="center">*Applied Data Science Capstone - Segmenting and Clustering Neighborhoods in Toronto*</div>
### <div align="center">Coursera_Capstone</div>

## NOTE: This notebook is divided into 3 sections by task.  
- Task 1 = Web scraping
- Task 2 = Cleaning
- Task 3 = Clustering

# Prerequisites: install and load modules

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import random # library for random number generation

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
from IPython.display import display_html

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
# !pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !pip install beautifulsoup4
# !pip install lxml
from bs4 import BeautifulSoup
import lxml

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install folium
import folium # map rendering library

import fnmatch # wildcard match
import re # Regular Expressions

print('Libraries imported.')

Libraries imported.


# Task 1 - Web scraping (and some Cleaning)

## Scraping the Wikipedia page for postal codes of Canada. Postal codes beginning with M are located within the city of Toronto

In [2]:
# Helper functions that emulate VB's Left, Right and Mid string functions (as also found in VBScript, VBA, etc.); mid uses a 1-based offset
def left(str, amount):
    """Return the leading `amount` characters of `str` (VB-style Left).

    `amount` is used directly as the slice end, so an `amount` larger than
    the length returns the whole value and a negative `amount` drops that
    many trailing characters.
    """
    prefix = str[0:amount]
    return prefix

def right(str, amount):
    """Return the trailing `amount` characters of `str` (VB-style Right).

    Parameters
    ----------
    str : sequence (normally a string)
        Value to take the tail of.
    amount : int
        Number of trailing characters wanted.

    Returns
    -------
    The last `amount` characters; the whole value when `amount` exceeds its
    length; an empty value when `amount` is 0.
    """
    if amount == 0:
        # -0 == 0, so str[-0:] would be str[0:], i.e. the WHOLE string.
        # Asking for zero characters must yield an empty result instead.
        return str[:0]
    return str[-amount:]

def mid(str, offset, amount):
    """Return `amount` characters of `str` starting at 1-based `offset` (VB-style Mid).

    The 1-based `offset` is converted to a 0-based slice start; the slice
    simply stops at the end of `str` if fewer than `amount` characters remain.
    """
    start = offset - 1
    stop = start + amount
    return str[start:stop]

In [7]:
# Task 1 pipeline: download the Wikipedia postal-code page, concatenate the
# text of every <p> inside the first table's body, and turn each resulting
# line ("<3-char code><borough>(<neighborhoods>)") into one DataFrame row.
# Lines without any "(" (e.g. 'Not assigned') are deliberately left out.

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Borough names that the page text sometimes glues straight onto the words
# that follow (e.g. "EtobicokeNorthwest(...)", "TorontoStn A ...").
# Tried in this order; the first pattern that matches is the one repaired.
# Raw strings keep "\w" a regex character class, not a string escape.
glued_borough_patterns = [r"Etobicoke\w", r"Toronto\w", r"Mississauga\w"]

column_names = ['Postal Code','Burough','Neighborhoods']    # FOR LATER ['Borough', 'Neighborhood', 'Latitude', 'Longitude']


def _split_glued_borough(text):
    """Insert the missing "(" after a borough name that runs into the next word.

    If none of `glued_borough_patterns` matches, `text` is returned unchanged.
    Otherwise every existing "(" is first turned into " - " (so the inserted
    parenthesis becomes the only one), and a "(" is inserted between the
    borough name and the character that follows it.
    """
    # Search the already-flattened text so the matched substring is taken
    # from the same string that is edited afterwards.
    flattened = text.replace('(', ' - ')
    for pattern in glued_borough_patterns:
        match = re.search(pattern, flattened)
        if match:
            break
    else:
        return text

    glued = match.group(0)                          # e.g. "TorontoS"
    repaired = glued[:-1] + "(" + glued[-1]         # e.g. "Toronto(S"
    flattened = flattened.replace(glued, repaired, 1)
    if repaired == 'Etobicoke(N':
        # Special case: "Northwest" belongs to the borough label, and the
        # original "(" (now " - ") is where the neighborhood list starts.
        flattened = flattened.replace("Etobicoke(Northwest - ", "Etobicoke - Northwest (")
    return flattened


def _parse_postal_row(row):
    """Parse one text line of the table into a record dict.

    Returns a dict keyed by `column_names`, or None when the line has no "("
    and therefore carries no neighborhood (the 'Not assigned' postal codes).
    """
    row = row.rstrip()
    p_code = row[:3]                        # postal code = first 3 characters
    rest = _split_glued_borough(row[3:])    # borough + "(neighborhoods)"

    paren_count = rest.count('(')
    if paren_count == 0:
        # DON'T add 'Not assigned' lines: there is no need to duplicate the
        # borough text into the neighborhood column for them.
        return None

    if paren_count == 1:
        bu, tail = rest.split('(')
        # ")" becomes " - " so that text following the parenthesis
        # ("Don Mills)North") reads "Don Mills - North".
        nh = tail.replace(')', ' - ')
    elif paren_count == 2:
        bu, first, second = rest.split('(')
        nh = first.replace(')', ' - ') + " / " + second.replace(')', '')
    else:
        # Three or more "(": split only at the first one so the unpacking
        # into two names cannot fail; the rest stays in the neighborhood text.
        bu, nh = rest.split('(', 1)
        nh = nh.strip(')')

    if nh.endswith(' - '):
        # The separator left over from the closing ")" is not wanted.
        # Note: this removes every " - " in the text, not only the last one.
        nh = nh.replace(' - ', '')
    bu = bu.replace('East YorkEast Toronto', 'East York - East Toronto')

    return {'Postal Code': p_code, 'Burough': bu, 'Neighborhoods': nh}


# Fail loudly on a hung connection or an HTTP error page instead of parsing it.
response = requests.get(url, timeout=30)
response.raise_for_status()
raw_data = response.text

soup=BeautifulSoup(raw_data,'lxml')
print("soup type: ")
print(type(soup))

table = soup.find_all("table")[0]
table_paras = table.tbody.find_all("p")

# All paragraph texts joined, then split back into one line per entry.
content = ''.join(tag.text for tag in table_paras)
rows = content.splitlines()

# Collect plain dicts and build the frame once; DataFrame.append inside a loop
# is quadratic and is no longer available in pandas 2.x.
records = []
for row in rows:
    record = _parse_postal_row(row)
    if record is not None:
        records.append(record)

df = pd.DataFrame(records, columns=column_names)
df

soup type: 
<class 'bs4.BeautifulSoup'>


Unnamed: 0,Postal Code,Burough,Neighborhoods
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills - North
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [39]:
# Rename the scraped headers to their final names; done in place on df.
column_renames = {'Burough':'Borough','Neighborhoods':'Neighborhood'}
df.rename(columns=column_renames, inplace=True)
df.shape

(103, 3)

In [44]:
# !pip install pgeocode
import pgeocode

# Postal codes to look up, in df row order.
postal_code = df['Postal Code'].tolist()

# Geocoder for country code 'ca'; created once (it was previously built twice).
geo = pgeocode.Nominatim('ca')

# Query all postal codes in one call; the result exposes `latitude` and
# `longitude` attributes (presumably one entry per postal code queried --
# verify against the pgeocode documentation).
location = geo.query_postal_code(postal_code)
latitude = location.latitude
longitude = location.longitude

In [46]:
# Put the two coordinate series side by side as float columns and give the
# columns their capitalised display names.
dfll = (
    pd.concat([latitude, longitude], axis=1)
    .astype(float)
    .rename(columns={'latitude':'Latitude','longitude':'Longitude'})
)
print(dfll.shape)
dfll

(103, 2)


Unnamed: 0,Latitude,Longitude
0,43.7545,-79.33
1,43.7276,-79.3148
2,43.6555,-79.3626
3,43.7223,-79.4504
4,43.6641,-79.3889
5,43.6662,-79.5282
6,43.8113,-79.193
7,43.745,-79.359
8,43.7063,-79.3094
9,43.6572,-79.3783


In [49]:
# Attach the coordinates column-wise to the postal-code table, discard rows
# containing NaN, and renumber the remaining rows from 0.
new_df = (
    pd.concat([df, dfll], axis=1)
    .dropna()
    .reset_index(drop=True)
)
print(new_df.shape)
new_df

(102, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.6555,-79.3626
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889
5,M9A,Etobicoke,Islington Avenue,43.6662,-79.5282
6,M1B,Scarborough,Malvern / Rouge,43.8113,-79.193
7,M3B,North York,Don Mills - North,43.745,-79.359
8,M4B,East York,Parkview Hill / Woodbine Gardens,43.7063,-79.3094
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783


In [50]:
# Look up the coordinates of the city itself; they are used to centre the map.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
T_location = geolocator.geocode(address)
if T_location is None:
    # geocode() yields None when nothing is found; stop with a clear message
    # instead of an AttributeError on the attribute access below.
    raise ValueError('Could not geocode address: {!r}'.format(address))
T_latitude = T_location.latitude
T_longitude = T_location.longitude
print('The geograpical coordinate of Toronto, ON are {}, {}.'.format(T_latitude, T_longitude))

The geograpical coordinate of Toronto, ON are 43.6534817, -79.3839347.


In [68]:
# Base map centred on the coordinates geocoded for Toronto, ON.
map_toronto = folium.Map(location=[T_latitude, T_longitude], zoom_start=10)

# One circle marker per row of new_df.
marker_rows = zip(new_df['Latitude'], new_df['Longitude'], new_df['Borough'], new_df['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    # Popup text: "Borough: <borough>, Neighborhood(s): <neighborhood>"
    borough_part = 'Borough: ' + borough
    neighborhood_part = 'Neighborhood(s): ' + neighborhood
    popup_text = '{}, {}'.format(borough_part, neighborhood_part)
    # Generous max_width for a nicer display of long labels.
    popup = folium.Popup(popup_text, parse_html=True, max_width=2650)
    hover_tip = folium.Tooltip(borough)  # shown when hovering over the marker
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        tooltip=hover_tip,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto