<p><strong><i>IBM Data Science Professional Certificate</i></strong></p>
<p>Applied Data Science Week 3 Project: Neighborhood Segmentation and Clustering</p>
<p>Coursera: <a>https://www.coursera.org/learn/applied-data-science-capstone</a></p>

<p>Explore neighbourhood in Toronto</p>
<p>Technique: Web scrapping, API, Clustering<p>

<H2>Part 1: Scrap wikipage with "Beautifulsoup"</H2>

In [1]:
# import numpy and pandas (dataframe)
import pandas as pd
import numpy as np

# import map rendering libraries
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# import packages for web scrapping: beautifulsoup & requests
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen

In [2]:
# wikipedia site to obtain postal code of Canada
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

html = urlopen(url) # open url
html = BeautifulSoup(html) # use Beautifulsoup to download html data and store into variable "html"
html

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XeHJYwpAMFsAADsDEkMAAADM","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":926306543,"wgRevisionId":926306543,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario

In [3]:
# use .prettify to improve visual of HTML structure

print(html.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XeHJYwpAMFsAADsDEkMAAADM","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":926306543,"wgRevisionId":926306543,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communi

In [4]:
# extract <title> of wikipage (user may see the "title" on webpage tab)

html.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [5]:
# extract the text within <title></title> tag

html.title.text

'List of postal codes of Canada: M - Wikipedia'

In [6]:
# identify <table> with class="wikitable sortable".
# this table contains table header (i.e columns): "postcode","borough" & 'neighbourhood' data
# and data for each column in <tr></tr>

tbl = html.find_all('table', class_="wikitable sortable") 
tbl

[<table class="wikitable sortable">
 <tbody><tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>
 <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>
 <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>
 <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
 </td></tr>
 <tr>
 <td>M6A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
 </td></tr>
 <tr>


In [7]:
# extract header by find_all (find every tag) with <th></th> in the table
headers = tbl[0].find_all('th')
headers

[<th>Postcode</th>, <th>Borough</th>, <th>Neighbourhood
 </th>]

In [8]:
# loop through each item in header, extract the text with .text and remove new line (i.e: \n) with .strip() in list comprehension
column = [i.text.strip() for i in headers]
column

['Postcode', 'Borough', 'Neighbourhood']

In [9]:
data = tbl[0].find_all('td') # extract item in table body (i.e: each cell in the table)
data

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
 </td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
 </td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Manor" title="Lawrence Manor">Lawrence Manor</a>
 </td>, <td>M7A</td>, <td>

In [10]:
total_cell, total_row = len(data), len(data)/3

print('The table contains: {} cells, which mean {} rows'.format(total_cell, total_row))

The table contains: 861 cells, which mean 287.0 rows


In [11]:
col_1 = [] # create list named "col_1"

# loop through cells in "data", start from index 0, with incremental step 3 (i.e. 0,3,6,9...)
for i in range(0, total_cell, 3):
    
    k = data[i].text.strip() # extract text within <td> tag; remove space or \n
    col_1.append(k) # append each item to "col_1"

In [12]:
# check the list of postcode
# the list should contains postcode from M*A, M*B, M*C... to M*Z; * being number 1 to 9

print(col_1, end=' ')

['M1A', 'M2A', 'M3A', 'M4A', 'M5A', 'M6A', 'M6A', 'M7A', 'M8A', 'M9A', 'M1B', 'M1B', 'M2B', 'M3B', 'M4B', 'M4B', 'M5B', 'M5B', 'M6B', 'M7B', 'M8B', 'M9B', 'M9B', 'M9B', 'M9B', 'M9B', 'M1C', 'M1C', 'M1C', 'M2C', 'M3C', 'M3C', 'M4C', 'M5C', 'M6C', 'M7C', 'M8C', 'M9C', 'M9C', 'M9C', 'M9C', 'M1E', 'M1E', 'M1E', 'M2E', 'M3E', 'M4E', 'M5E', 'M6E', 'M7E', 'M8E', 'M9E', 'M1G', 'M2G', 'M3G', 'M4G', 'M5G', 'M6G', 'M7G', 'M8G', 'M9G', 'M1H', 'M2H', 'M3H', 'M3H', 'M3H', 'M4H', 'M5H', 'M5H', 'M5H', 'M6H', 'M6H', 'M7H', 'M8H', 'M9H', 'M1J', 'M2J', 'M2J', 'M2J', 'M3J', 'M3J', 'M4J', 'M5J', 'M5J', 'M5J', 'M6J', 'M6J', 'M7J', 'M8J', 'M9J', 'M1K', 'M1K', 'M1K', 'M2K', 'M3K', 'M3K', 'M4K', 'M4K', 'M5K', 'M5K', 'M6K', 'M6K', 'M6K', 'M7K', 'M8K', 'M9K', 'M1L', 'M1L', 'M1L', 'M2L', 'M2L', 'M3L', 'M4L', 'M4L', 'M5L', 'M5L', 'M6L', 'M6L', 'M6L', 'M7L', 'M8L', 'M9L', 'M1M', 'M1M', 'M1M', 'M2M', 'M2M', 'M3M', 'M4M', 'M5M', 'M5M', 'M6M', 'M6M', 'M6M', 'M6M', 'M7M', 'M8M', 'M9M', 'M9M', 'M1N', 'M1N', 'M2N', 'M3N'

In [13]:
col_2 = [] # create list named "col_2"

# loop through cells in "data", start from index 1, with incremental step 3 (i.e. 1,4,7,10...)
for i in range(1, total_cell, 3):
    
    k = data[i].text.strip() # extract text within <td> tag; remove space or \n
    col_2.append(k) # append each item to "col_2"

In [14]:
# check the list of borough

col_2

['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto',
 'North York',
 'North York',
 "Queen's Park",
 'Not assigned',
 "Queen's Park",
 'Scarborough',
 'Scarborough',
 'Not assigned',
 'North York',
 'East York',
 'East York',
 'Downtown Toronto',
 'Downtown Toronto',
 'North York',
 'Not assigned',
 'Not assigned',
 'Etobicoke',
 'Etobicoke',
 'Etobicoke',
 'Etobicoke',
 'Etobicoke',
 'Scarborough',
 'Scarborough',
 'Scarborough',
 'Not assigned',
 'North York',
 'North York',
 'East York',
 'Downtown Toronto',
 'York',
 'Not assigned',
 'Not assigned',
 'Etobicoke',
 'Etobicoke',
 'Etobicoke',
 'Etobicoke',
 'Scarborough',
 'Scarborough',
 'Scarborough',
 'Not assigned',
 'Not assigned',
 'East Toronto',
 'Downtown Toronto',
 'York',
 'Not assigned',
 'Not assigned',
 'Not assigned',
 'Scarborough',
 'Not assigned',
 'Not assigned',
 'East York',
 'Downtown Toronto',
 'Downtown Toronto',
 'Not assigned',
 'Not assigned',
 'Not assigned',
 'Scarborough',

In [15]:
col_3 = [] # create list named "col_3"

# loop through cells in "data", start from index 2, with incremental step 3 (i.e. 2,5,8,11...)
for i in range(2, total_cell, 3):
    
    k = data[i].text.strip() # extract text within <td> tag; remove space or \n
    col_3.append(k) # append each item to "col_3"

In [16]:
# check the list of neighbourhood

col_3

['Not assigned',
 'Not assigned',
 'Parkwoods',
 'Victoria Village',
 'Harbourfront',
 'Lawrence Heights',
 'Lawrence Manor',
 'Not assigned',
 'Not assigned',
 "Queen's Park",
 'Rouge',
 'Malvern',
 'Not assigned',
 'Don Mills North',
 'Woodbine Gardens',
 'Parkview Hill',
 'Ryerson',
 'Garden District',
 'Glencairn',
 'Not assigned',
 'Not assigned',
 'Cloverdale',
 'Islington',
 'Martin Grove',
 'Princess Gardens',
 'West Deane Park',
 'Highland Creek',
 'Rouge Hill',
 'Port Union',
 'Not assigned',
 'Flemingdon Park',
 'Don Mills South',
 'Woodbine Heights',
 'St. James Town',
 'Humewood-Cedarvale',
 'Not assigned',
 'Not assigned',
 'Bloordale Gardens',
 'Eringate',
 'Markland Wood',
 'Old Burnhamthorpe',
 'Guildwood',
 'Morningside',
 'West Hill',
 'Not assigned',
 'Not assigned',
 'The Beaches',
 'Berczy Park',
 'Caledonia-Fairbanks',
 'Not assigned',
 'Not assigned',
 'Not assigned',
 'Woburn',
 'Not assigned',
 'Not assigned',
 'Leaside',
 'Central Bay Street',
 'Christie',
 '

In [17]:
# initiate a dictionary
tbl_df = {}

for a, b in zip(column, [col_1, col_2, col_3]): # use zip() to combine header for each col_* and data
    tbl_df[a] = b # put header and column into dictionary

In [18]:
print(tbl_df['Postcode'], end=' ') # take a quick look at 'Postcode' within dictionary

['M1A', 'M2A', 'M3A', 'M4A', 'M5A', 'M6A', 'M6A', 'M7A', 'M8A', 'M9A', 'M1B', 'M1B', 'M2B', 'M3B', 'M4B', 'M4B', 'M5B', 'M5B', 'M6B', 'M7B', 'M8B', 'M9B', 'M9B', 'M9B', 'M9B', 'M9B', 'M1C', 'M1C', 'M1C', 'M2C', 'M3C', 'M3C', 'M4C', 'M5C', 'M6C', 'M7C', 'M8C', 'M9C', 'M9C', 'M9C', 'M9C', 'M1E', 'M1E', 'M1E', 'M2E', 'M3E', 'M4E', 'M5E', 'M6E', 'M7E', 'M8E', 'M9E', 'M1G', 'M2G', 'M3G', 'M4G', 'M5G', 'M6G', 'M7G', 'M8G', 'M9G', 'M1H', 'M2H', 'M3H', 'M3H', 'M3H', 'M4H', 'M5H', 'M5H', 'M5H', 'M6H', 'M6H', 'M7H', 'M8H', 'M9H', 'M1J', 'M2J', 'M2J', 'M2J', 'M3J', 'M3J', 'M4J', 'M5J', 'M5J', 'M5J', 'M6J', 'M6J', 'M7J', 'M8J', 'M9J', 'M1K', 'M1K', 'M1K', 'M2K', 'M3K', 'M3K', 'M4K', 'M4K', 'M5K', 'M5K', 'M6K', 'M6K', 'M6K', 'M7K', 'M8K', 'M9K', 'M1L', 'M1L', 'M1L', 'M2L', 'M2L', 'M3L', 'M4L', 'M4L', 'M5L', 'M5L', 'M6L', 'M6L', 'M6L', 'M7L', 'M8L', 'M9L', 'M1M', 'M1M', 'M1M', 'M2M', 'M2M', 'M3M', 'M4M', 'M5M', 'M5M', 'M6M', 'M6M', 'M6M', 'M6M', 'M7M', 'M8M', 'M9M', 'M9M', 'M1N', 'M1N', 'M2N', 'M3N'

In [19]:
tbl_postcode = pd.DataFrame(tbl_df) # create dataframe based on dictionary "tbl_df"
tbl_postcode.head() # a glance of dataframe

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [20]:
# exclude "Not assigned" in "Borough"
tbl = tbl_postcode.query("Borough != 'Not assigned'")
tbl

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [21]:
# create a temporary dataframe to check number of 'Neighbourhood' within each 'postcode'
# use .to_frame() to convert series of object to dataframe
# .sort_values() > sort 'Neighbourhood' in descending order

tbl_temp = tbl.groupby(['Postcode'])['Neighbourhood'].count().to_frame().reset_index(drop=False).sort_values(by='Neighbourhood', ascending=False)
tbl_temp

Unnamed: 0,Postcode,Neighbourhood
91,M8Y,8
101,M9V,8
68,M5V,7
94,M9B,5
92,M8Z,5
49,M4V,5
95,M9C,4
100,M9R,4
14,M1V,4
80,M6M,4


In [22]:
# let's check 'postcode' M8Y to confirm num # of neighourhood

tbl.query("Postcode == 'M8Y'") # result shows 8 neighbourhoods within M8Y, same as above

Unnamed: 0,Postcode,Borough,Neighbourhood
265,M8Y,Etobicoke,Humber Bay
266,M8Y,Etobicoke,King's Mill Park
267,M8Y,Etobicoke,Kingsway Park South East
268,M8Y,Etobicoke,Mimico NE
269,M8Y,Etobicoke,Old Mill South
270,M8Y,Etobicoke,The Queensway East
271,M8Y,Etobicoke,Royal York South East
272,M8Y,Etobicoke,Sunnylea


In [23]:
# extract all 'postcode' with > 1 neighbourhood
# use .query() to select rows with 'Neighbourhood' > 1 record, then select 'Postcode' and convert data into array
# with .values method

multi = tbl_temp.query("Neighbourhood > 1")['Postcode'].values
multi

array(['M8Y', 'M9V', 'M5V', 'M9B', 'M8Z', 'M4V', 'M9C', 'M9R', 'M1V',
       'M6M', 'M1K', 'M1L', 'M5R', 'M3H', 'M6K', 'M6L', 'M5J', 'M5H',
       'M8V', 'M2J', 'M5T', 'M1E', 'M1T', 'M1C', 'M1P', 'M1M', 'M8X',
       'M5B', 'M5K', 'M5L', 'M5M', 'M5P', 'M5S', 'M1B', 'M5X', 'M6A',
       'M6H', 'M6J', 'M6N', 'M4T', 'M6R', 'M6S', 'M8W', 'M9M', 'M6P',
       'M4X', 'M3J', 'M4L', 'M3C', 'M2L', 'M4K', 'M2M', 'M1R', 'M1N',
       'M3K', 'M4B'], dtype=object)

In [24]:
# extract all 'postcode' with only 1 neighbourhood
# use .query() to select rows with 'Neighbourhood' > 1 record, then select 'Postcode' and convert data into array
# with .values method

single = tbl_temp.query("Neighbourhood == 1")['Postcode'].values
single

array(['M2N', 'M7Y', 'M2P', 'M2R', 'M3A', 'M7A', 'M3B', 'M7R', 'M4R',
       'M2K', 'M2H', 'M1W', 'M1S', 'M9A', 'M9L', 'M1J', 'M9N', 'M9P',
       'M1H', 'M1G', 'M1X', 'M6E', 'M6G', 'M4H', 'M4P', 'M4N', 'M4W',
       'M4M', 'M4Y', 'M5A', 'M5C', 'M5E', 'M5G', 'M4J', 'M4G', 'M4S',
       'M4E', 'M5N', 'M4C', 'M4A', 'M3N', 'M3M', 'M5W', 'M3L', 'M6B',
       'M6C', 'M9W'], dtype=object)

In [25]:
# split 'tbl' into table which > 1 'Neighbourhood' within a 'Postcode'

t1 = tbl.loc[tbl.Postcode.isin(multi),:].sort_values(by='Postcode').reset_index(drop=True)
t1

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Port Union
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Highland Creek
5,M1E,Scarborough,Guildwood
6,M1E,Scarborough,West Hill
7,M1E,Scarborough,Morningside
8,M1K,Scarborough,East Birchmount Park
9,M1K,Scarborough,Ionview


In [26]:
# split 'tbl' into table which only 1 'Neighbourhood' within a 'Postcode'

t2 = tbl.loc[tbl.Postcode.isin(single),:].sort_values(by='Postcode').reset_index(drop=True)
t2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1G,Scarborough,Woburn
1,M1H,Scarborough,Cedarbrae
2,M1J,Scarborough,Scarborough Village
3,M1S,Scarborough,Agincourt
4,M1W,Scarborough,L'Amoreaux West
5,M1X,Scarborough,Upper Rouge
6,M2H,North York,Hillcrest Village
7,M2K,North York,Bayview Village
8,M2N,North York,Willowdale South
9,M2P,North York,York Mills West


In [27]:
# initiate 3 lists for 'postcode', 'borough', 'neighbourhood'

codes = []
boroughs = []
neighbours = []

for code in multi:
    
    table = t1.loc[t1.Postcode == code, :] # split 't1' to specific table by 'postcode'
    
    code = np.unique(table.Postcode) # extract unique 'postcode' in the column
    codes.append(code[0])
    
    borough = np.unique(table.Borough) # extract unique 'borough' in the column
    boroughs.append(borough[0])
    
    neighbour = table.Neighbourhood.to_list() # extract all 'neighbourhood' & convert to list format
    neighbour = ', '.join(neighbour) # use .join() method to combine each 'neighbourhood' with ", "
    neighbours.append(neighbour)

In [28]:
neighbours # take a look into result of combined 'neighbourhood'

["Mimico NE, Kingsway Park South East, Old Mill South, Humber Bay, Sunnylea, Royal York South East, The Queensway East, King's Mill Park",
 'Beaumond Heights, Silverstone, Humbergate, Jamestown, Mount Olive, Thistletown, Albion Gardens, South Steeles',
 'Railway Lands, Harbourfront West, Island airport, Bathurst Quay, CN Tower, South Niagara, King and Spadina',
 'Princess Gardens, Martin Grove, Islington, Cloverdale, West Deane Park',
 'The Queensway West, Kingsway Park South West, Mimico NW, South of Bloor, Royal York South West',
 'Forest Hill SE, Rathnelly, Summerhill West, Deer Park, South Hill',
 'Old Burnhamthorpe, Markland Wood, Eringate, Bloordale Gardens',
 'St. Phillips, Richview Gardens, Martin Grove Gardens, Kingsview Village',
 "Agincourt North, Steeles East, L'Amoreaux East, Milliken",
 'Silverthorn, Mount Dennis, Keelesdale, Del Ray',
 'East Birchmount Park, Ionview, Kennedy Park',
 'Clairlea, Golden Mile, Oakridge',
 'Yorkville, The Annex, North Midtown',
 'Wilson Heigh

In [29]:
# a new dataframe for 'Postcode' with > 1 'Neighbourhood'

tbl_multi = pd.DataFrame({'Postcode': codes, 'Borough': boroughs, 'Neighbourhood': neighbours})
tbl_multi

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M8Y,Etobicoke,"Mimico NE, Kingsway Park South East, Old Mill ..."
1,M9V,Etobicoke,"Beaumond Heights, Silverstone, Humbergate, Jam..."
2,M5V,Downtown Toronto,"Railway Lands, Harbourfront West, Island airpo..."
3,M9B,Etobicoke,"Princess Gardens, Martin Grove, Islington, Clo..."
4,M8Z,Etobicoke,"The Queensway West, Kingsway Park South West, ..."
5,M4V,Central Toronto,"Forest Hill SE, Rathnelly, Summerhill West, De..."
6,M9C,Etobicoke,"Old Burnhamthorpe, Markland Wood, Eringate, Bl..."
7,M9R,Etobicoke,"St. Phillips, Richview Gardens, Martin Grove G..."
8,M1V,Scarborough,"Agincourt North, Steeles East, L'Amoreaux East..."
9,M6M,York,"Silverthorn, Mount Dennis, Keelesdale, Del Ray"


In [30]:
# combine 'tbl_multi' with 't2' (table with single neighbourhood for each postcode) with concat()

new_df = pd.concat([tbl_multi, t2], axis=0).sort_values(by='Postcode').reset_index(drop=True)
print(new_df.head())

  Postcode      Borough                           Neighbourhood
0      M1B  Scarborough                          Rouge, Malvern
1      M1C  Scarborough  Port Union, Rouge Hill, Highland Creek
2      M1E  Scarborough       Guildwood, West Hill, Morningside
3      M1G  Scarborough                                  Woburn
4      M1H  Scarborough                               Cedarbrae


In [31]:
# check any row has 'Neighbourhood' that is 'Not assigned'

new_df.query("Neighbourhood == 'Not assigned'")

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


In [32]:
# instruction of assignment: any record in Neighbourhood with "Not assigned" will be same as name of 'Borough'
# in this case, 'Postcode' M7A >> Borough & Neighbourhood is "Queen's Park"

new_df.loc[new_df.Postcode == 'M7A', 'Neighbourhood'] = "Queen's Park" # assign new value
new_df.query("Postcode == 'M7A'") # check M7A again

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [33]:
row, column = new_df.shape # use .shape to indicate number of rows as per assignment requirement

print("Postcode table consists of {} rows & {} columns".format(row, column))

Postcode table consists of 103 rows & 3 columns


<H2>Part 2: Retrieve Latitude/Longitude with "Geopy"</H2>

In [34]:
# import library for Geocoding
import geopy
import geopandas

In [35]:
# trial n error: search geo-location of Toronto based on "Borough"
# note: not requirement of assignment

geo_lat = {}
geo_lon = {}

geo = geopy.Nominatim(user_agent="Detector", timeout=20)

for x in np.unique(new_df.Borough):
    
    loc = geo.geocode("{}, Toronto, Ontario".format(x))
    
    geo_lat[x] = loc.latitude
    geo_lon[x] = loc.longitude

In [36]:
# quick check on latitude

geo_lat

{'Central Toronto': 43.6449033,
 'Downtown Toronto': 43.6563221,
 'East Toronto': 43.626243,
 'East York': 43.699971,
 'Etobicoke': 43.67145915,
 'Mississauga': 43.6677248,
 'North York': 43.7543263,
 "Queen's Park": 43.6606092,
 'Scarborough': 43.773077,
 'West Toronto': 43.6449033,
 'York': 43.67910515}

In [37]:
# quick check on longitude

geo_lon

{'Central Toronto': -79.3818364,
 'Downtown Toronto': -79.3809161,
 'East Toronto': -79.396962,
 'East York': -79.3325199626159,
 'Etobicoke': -79.5524920661167,
 'Mississauga': -79.586436,
 'North York': -79.4491169663959,
 "Queen's Park": -79.3905725,
 'Scarborough': -79.257774,
 'West Toronto': -79.3818364,
 'York': -79.4911841400715}

In [38]:
# instruction given by assignment: to extract latitude & longitutde based on "Postcode"
# API failed to search for geo-location for M1E and other postcodes.
# will use csv file provided in coursera instead

geo = geopy.Nominatim(user_agent="Detector", timeout=50)

for x, y in zip(new_df.Postcode[0:5], new_df.Borough[0:5]):
    
    loc = geo.geocode("{}, {}, Toronto, Canada".format(x, y))
    print("{}, {}: latitude {}, longitude {}".format(x, y, loc.latitude, loc.longitude))

M1B, Scarborough: latitude 43.773077, longitude -79.257774
M1C, Scarborough: latitude 43.773077, longitude -79.257774


AttributeError: 'NoneType' object has no attribute 'latitude'

In [39]:
# import "Geospatial_Coordinates.csv"

geo_data = pd.read_csv("Geospatial_Coordinates.csv")
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [40]:
geo_data.info() # Geospatial_Coordinates.csv has 103 rows (i.e. same as new_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postal Code    103 non-null object
Latitude       103 non-null float64
Longitude      103 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [41]:
# all postcodes in Geospatial_Coordinates.csv match with postcodes in new_df
# both files have 103 records

geo_df = geo_data.loc[geo_data["Postal Code"].isin(new_df.Postcode.values), :]
geo_df.shape

(103, 3)

In [42]:
# combine new_df & geo_df with .merge() on Postcode/Postal Code columns
# note: to remove "Postcode"

geo_tbl = new_df.merge(geo_df, left_on="Postcode", right_on="Postal Code")
geo_tbl.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, West Hill, Morningside",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [43]:
geo_toronto = geo_tbl.iloc[:,[3,1,2,4,5]] # exclude column with index = 0 ("Postcode") and re-arrange column names
geo_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, West Hill, Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [44]:
geo_toronto.shape

(103, 5)

<H2>Part 3: Explore Toronto</H2>

In [45]:
# use geopy to obtain latitude/longitude of Toronto.
# coordinates will be used for map visualization

address = 'Toronto, Ontario'

geolocator = geopy.Nominatim(user_agent="ny_explorer", timeout=30)
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto 43.653963, -79.387207.


In [46]:
# import map rendering library

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files

In [47]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map based on coordinates of postcode
for lat, lng, neighbourhood in zip(geo_toronto['Latitude'], geo_toronto['Longitude'], geo_toronto['Neighbourhood']):
    label = '{}'.format(neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [48]:
# Define Foursquare Credentials and Version

CLIENT_ID = 'BTXEHYDBF0CI5H3NRC3OFYWQ3RZ341ANCJ31XKQ4YH2EQJ3Q' # your Foursquare ID
CLIENT_SECRET = 'IO52MQU0B4FSOO0IARL2QCWOO3W0WVDRBZN5B2JANN2ZAVSB' # your Foursquare Secret
VERSION = '20161225' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: BTXEHYDBF0CI5H3NRC3OFYWQ3RZ341ANCJ31XKQ4YH2EQJ3Q
CLIENT_SECRET:IO52MQU0B4FSOO0IARL2QCWOO3W0WVDRBZN5B2JANN2ZAVSB


### Let's explore Top 5 venues in each postcode within 500 meters radius

In [49]:
# define limit = 5 (limit to 5 venues only) & radius = 500 (meters)
LIMIT = 5
radius = 500

location_list = [] # initiate a list to store data from Foursquare API requests

for neighbourhood, latitude, longitude in zip(geo_toronto.Neighbourhood, geo_toronto.Latitude, geo_toronto.Longitude):
    
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
    
    data = requests.get(url).json()
    
    # use len() to check if any data within "items" (len = 0 if nothing)
    length = len(data['response']['groups'][0]['items'])
    if length == 0:
        continue # skip the row if nothing is found
    else:
        venue = data['response']['groups'][0]['items'][0]['venue']
        
        # extract info within 'venue'
        name = venue['name']
        lat = venue['location']['lat']
        lon = venue['location']['lng']
        cat = venue['categories'][0]['name']
        
        location_list.append([(neighbourhood, latitude, longitude, name, lat, lon, cat)])

In [50]:
temp = pd.DataFrame(x for row in location_list for x in row)
temp.columns = ['Neighbourhood','N_Latitude','N_Longitude','Venue','V_Latitude','V_Longitude','category']
temp.head()

Unnamed: 0,Neighbourhood,N_Latitude,N_Longitude,Venue,V_Latitude,V_Longitude,category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, West Hill, Morningside",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,Woburn,43.770992,-79.216917,Starbucks,43.770037,-79.221156,Coffee Shop
4,Cedarbrae,43.773136,-79.239476,Federick Restaurant,43.774697,-79.241142,Hakka Restaurant


In [51]:
print("{} nearby locations downloaded for {} neighbourhood.".format(len(temp.Venue), len(geo_toronto.Neighbourhood)))

101 nearby locations downloaded for 103 neighbourhood.


In [52]:
cat = pd.get_dummies(temp.category) # one hot encoding with get_dummies()
df_01 = pd.concat([temp[['Neighbourhood']], cat], axis=1) # combine neighbourhood & category tables
df_01.head()

Unnamed: 0,Neighbourhood,Airport,Arts & Crafts Store,Bakery,Bank,Bar,Baseball Field,Boutique,Breakfast Spot,Brewery,...,Rental Car Location,Restaurant,Sandwich Place,Skating Rink,Sports Bar,Steakhouse,Theme Restaurant,Toy / Game Store,Trail,Warehouse Store
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Port Union, Rouge Hill, Highland Creek",0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Guildwood, West Hill, Morningside",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Woburn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Cedarbrae,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
temp['category'].value_counts()

Park                    10
Coffee Shop              4
Bakery                   4
Café                     4
Restaurant               4
Fast Food Restaurant     3
Liquor Store             3
Pizza Place              3
Chinese Restaurant       3
Italian Restaurant       3
Baseball Field           3
Bar                      2
Bank                     2
Airport                  2
Trail                    2
Sandwich Place           2
Grocery Store            2
Brewery                  2
Playground               2
Burger Joint             1
Department Store         1
Deli / Bodega            1
Greek Restaurant         1
Steakhouse               1
Ramen Restaurant         1
Sports Bar               1
Fish & Chips Shop        1
Motel                    1
Garden                   1
Clothing Store           1
Toy / Game Store         1
Dog Run                  1
Skating Rink             1
Golf Course              1
Rental Car Location      1
Theme Restaurant         1
Warehouse Store          1
A

<H2>Part 4: Clustering neighbourhood</H2>

In [54]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [55]:
df_02 = df_01.drop('Neighbourhood', axis=1)

n_group = 6 # we will group neighbourhoods into 6 clusters

# run k-means clustering
kmeans = KMeans(n_clusters=n_group, random_state=0).fit(df_02)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 5, 0, 0, 0, 0, 0, 4], dtype=int32)

In [56]:
# add clustering labels
df_01.insert(1, 'label', kmeans.labels_)

In [57]:
df_01.head()

Unnamed: 0,Neighbourhood,label,Airport,Arts & Crafts Store,Bakery,Bank,Bar,Baseball Field,Boutique,Breakfast Spot,...,Rental Car Location,Restaurant,Sandwich Place,Skating Rink,Sports Bar,Steakhouse,Theme Restaurant,Toy / Game Store,Trail,Warehouse Store
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Port Union, Rouge Hill, Highland Creek",0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Guildwood, West Hill, Morningside",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Woburn,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Cedarbrae,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
toronto_merged = pd.merge(geo_toronto, df_01, on='Neighbourhood', how='right')
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,label,Airport,Arts & Crafts Store,Bakery,Bank,...,Rental Car Location,Restaurant,Sandwich Place,Skating Rink,Sports Bar,Steakhouse,Theme Restaurant,Toy / Game Store,Trail,Warehouse Store
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1E,Scarborough,"Guildwood, West Hill, Morningside",43.763573,-79.188711,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(n_group)
ys = [i + x + (i*x)**2 for i in range(n_group)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters