# Segmenting and Clustering Neighborhood (Part 1: Getting the Dataframe Ready)

## First, install beautifulsoup4 if it is not already installed

In [1]:
!pip install beautifulsoup4



## Then install the lxml library

In [2]:
!pip install lxml



In [3]:
import bs4 as bs
import urllib.request

In [4]:
#Download the Wikipedia page and parse it with BeautifulSoup using the lxml parser.
#The with-block closes the HTTP response as soon as its body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as response:
    sauce = response.read()
soup = bs.BeautifulSoup(sauce, 'lxml')

In [5]:
#Sanity check: display the page's <title> element to confirm the expected page was parsed
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [6]:
#Grab the postal-code table. find() returns the first <table> element in the document;
#the author relies on the page having only one table.
table = soup.find('table')

In [7]:
#Collect every <tr> element (table row) inside the table
table_rows =table.find_all('tr')

In [8]:
#Walk every table row and print the text of its <td> cells.
#Rows without any <td> cell (such as the header row) print as an empty list.
for tr in table_rows:
    print([cell.text for cell in tr.find_all('td')])

[]
['M1A', 'Not assigned', 'Not assigned\n']
['M2A', 'Not assigned', 'Not assigned\n']
['M3A', 'North York', 'Parkwoods\n']
['M4A', 'North York', 'Victoria Village\n']
['M5A', 'Downtown Toronto', 'Harbourfront\n']
['M6A', 'North York', 'Lawrence Heights\n']
['M6A', 'North York', 'Lawrence Manor\n']
['M7A', 'Downtown Toronto', "Queen's Park\n"]
['M8A', 'Not assigned', 'Not assigned\n']
['M9A', "Queen's Park", 'Not assigned\n']
['M1B', 'Scarborough', 'Rouge\n']
['M1B', 'Scarborough', 'Malvern\n']
['M2B', 'Not assigned', 'Not assigned\n']
['M3B', 'North York', 'Don Mills North\n']
['M4B', 'East York', 'Woodbine Gardens\n']
['M4B', 'East York', 'Parkview Hill\n']
['M5B', 'Downtown Toronto', 'Ryerson\n']
['M5B', 'Downtown Toronto', 'Garden District\n']
['M6B', 'North York', 'Glencairn\n']
['M7B', 'Not assigned', 'Not assigned\n']
['M8B', 'Not assigned', 'Not assigned\n']
['M9B', 'Etobicoke', 'Cloverdale\n']
['M9B', 'Etobicoke', 'Islington\n']
['M9B', 'Etobicoke', 'Martin Grove\n']
['M9B', '

In [9]:
#Build a list of lists holding the <td> text of every table row.
#table_rows[1:] skips the first row, which is the header.
listrows = [
    [cell.text for cell in tr.find_all('td')]
    for tr in table_rows[1:]
]
#Show the first 10 rows
listrows[:10]

[['M1A', 'Not assigned', 'Not assigned\n'],
 ['M2A', 'Not assigned', 'Not assigned\n'],
 ['M3A', 'North York', 'Parkwoods\n'],
 ['M4A', 'North York', 'Victoria Village\n'],
 ['M5A', 'Downtown Toronto', 'Harbourfront\n'],
 ['M6A', 'North York', 'Lawrence Heights\n'],
 ['M6A', 'North York', 'Lawrence Manor\n'],
 ['M7A', 'Downtown Toronto', "Queen's Park\n"],
 ['M8A', 'Not assigned', 'Not assigned\n'],
 ['M9A', "Queen's Park", 'Not assigned\n']]

In [10]:
# Import pandas and convert the list of row lists into a DataFrame.
# No column names are passed here, so the columns get default integer labels.
import pandas as pd
df = pd.DataFrame(listrows)
df.head()

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [12]:
#The dataframe consists of three columns, which are named here: Postcode, Borough and Neighborhood
#(note that the postal code column is labelled 'Postcode').
df.columns = ['Postcode', 'Borough', 'Neighborhood']
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [13]:
#Helper that removes the newline characters left at the end of the scraped cell text
def cleanNeighborhood(Neighborhood):
    """Return ``Neighborhood`` with every newline character removed.

    Parameters
    ----------
    Neighborhood : str
        Raw text of a neighborhood cell, typically ending in a newline.

    Returns
    -------
    str
        The same text without any newline characters.
    """
    return Neighborhood.replace('\n', '')

In [14]:
#If a row has a borough but a 'Not assigned' neighborhood, the neighborhood takes the
#borough's name. Example from the scraped table: postcode M9A has borough "Queen's Park"
#and neighborhood 'Not assigned', so both columns end up as "Queen's Park".
def notAssignedMatch(Borough, Neighborhood):
    """Pick the neighborhood name to use for a row.

    Parameters
    ----------
    Borough : str
        Borough name of the row.
    Neighborhood : str
        Neighborhood name of the row (already stripped of newlines).

    Returns
    -------
    str
        ``Borough`` when ``Neighborhood`` is exactly ``'Not assigned'``,
        otherwise ``Neighborhood`` unchanged.
    """
    return Borough if Neighborhood == 'Not assigned' else Neighborhood

In [15]:
#Check these two functions before applying them to the dataframe.
#The borough is written as a double-quoted literal: 'Queen''s Park' is two adjacent
#string literals that Python concatenates into "Queens Park" (no apostrophe).
print(cleanNeighborhood ('Not assigned\n'))
print(notAssignedMatch ("Queen's Park", 'Not assigned'))

Not assigned
Queens Park


In [16]:
#Apply the two helpers to the dataframe: first strip the newlines from every
#neighborhood, then replace 'Not assigned' neighborhoods with the row's borough
df['Neighborhood'] = df['Neighborhood'].map(cleanNeighborhood)
df['Neighborhood'] = [
    notAssignedMatch(borough, neighborhood)
    for borough, neighborhood in zip(df['Borough'], df['Neighborhood'])
]

In [18]:
#Check the dataframe: the Neighborhood values should no longer end in a newline
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [23]:
#Only keep the rows that have an assigned borough; rows whose borough is 'Not assigned'
#are ignored. The filter tests the Borough column itself, as the rule states, instead of
#relying on the Neighborhood column having been back-filled from the borough.
df = df[df['Borough'] != 'Not assigned']
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [24]:
#Check the dataframe shape (rows, columns) before grouping the rows by postcode
df.shape

(210, 3)

In [27]:
#Collapse the rows that share a postcode and borough into one row whose
#Neighborhood value is the comma-separated list of their neighborhoods
df_grouped = (
    df.groupby(['Postcode', 'Borough'])['Neighborhood']
    .agg(','.join)
    .reset_index()
)
df_grouped.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [29]:
#Final check for Part 1: show the shape (rows, columns) of the grouped dataframe
df_grouped.shape

(103, 3)

# Segmenting and Clustering Neighborhood (Part 2: Apply Geocoding to Get the Latitude and Longitude of Each Neighborhood)


## In this example I am using the pgeocode library for geocoding instead of the Google API

In [46]:
!pip install pgeocode

Collecting pgeocode
  Downloading https://files.pythonhosted.org/packages/f4/1e/d35805c59f167751fccb3fc7093fb2e45ee1e9e4a057e5d74da926ef9518/pgeocode-0.2.0-py2.py3-none-any.whl
Installing collected packages: pgeocode
Successfully installed pgeocode-0.2.0


In [53]:
import pgeocode
#Try the Canadian postcode lookup on one postcode.
#The postcode is queried once and both coordinates are read from the same record.
nomi = pgeocode.Nominatim('ca')
m1b_record = nomi.query_postal_code("M1B")
print(m1b_record.latitude)
print(m1b_record.longitude)

43.8113
-79.193


In [56]:
# import geocoder
import geocoder 
#create a function for computing lat long
import functools

@functools.lru_cache(maxsize=None)
def _postcodeLookup(country):
    """Build the pgeocode lookup for ``country`` once and reuse it on later calls."""
    return pgeocode.Nominatim(country)

def computeLatLong (Postcode):
    """Return the (latitude, longitude) pgeocode reports for a Canadian postcode.

    Parameters
    ----------
    Postcode : str
        Postcode to look up, e.g. ``'M5G'``.

    Returns
    -------
    tuple
        ``(latitude, longitude)``, each rounded to 6 decimal places. Both values
        are NaN when pgeocode has no coordinates for the postcode.
    """
    # One shared lookup object and a single query per call, instead of building a
    # new Nominatim and querying it twice for every postcode.
    record = _postcodeLookup('ca').query_postal_code(Postcode)
    return round(record.latitude, 6), round(record.longitude, 6)

In [57]:
#Try the helper on a single postcode
computeLatLong ('M5G')

(43.6564, -79.386)

In [85]:
#Try a postcode for which the lookup yields no coordinates (the result is (nan, nan))
computeLatLong ('M7R')

(nan, nan)

In [99]:
#Compute (latitude, longitude) for every postcode; result_type='expand' spreads the
#returned tuple over two columns, which become Latitude and Longitude
lat_long = df_grouped.apply(
    lambda row: computeLatLong(row['Postcode']),
    axis=1,
    result_type='expand',
)
df_grouped[['Latitude', 'Longitude']] = lat_long

In [100]:
#Preview the first rows with the new Latitude and Longitude columns
df_grouped.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.8113,-79.193
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.7878,-79.1564
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.7712,-79.2144
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389


In [101]:
#Display the whole dataframe to inspect the coordinates of every postcode
df_grouped

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.8113,-79.193
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.7878,-79.1564
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.7712,-79.2144
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389
5,M1J,Scarborough,Scarborough Village,43.7464,-79.2323
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.7298,-79.2639
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.7122,-79.2843
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.7247,-79.2312
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.6952,-79.2646


In [90]:
#After inspecting the data, the author found that pgeocode did not return coordinates for Mississauga (postcode M7R).
#Let's try the geopy package to get the latitude/longitude for it instead.
!pip install geopy
from geopy.geocoders import Nominatim
# NOTE(review): the user_agent is still the placeholder text; it should probably name this application.
geolocator = Nominatim(user_agent="specify_your_app_name_here")
location = geolocator.geocode("Mississauga, ON M7R")
print(location)

None


In [116]:
#geopy could not find the location either, so the missing coordinates are replaced with an
#approximate location read off Google Maps: 43.587889, -79.657896.
#fillna replaces the row-wise apply and removes the dependency on numpy, which is only
#imported in a later cell (np.isnan here fails when the notebook is run top to bottom).
#Every row with a missing coordinate receives the same fallback value.
FALLBACK_LATITUDE = 43.587889
FALLBACK_LONGITUDE = -79.657896
df_grouped['Latitude'] = df_grouped['Latitude'].fillna(FALLBACK_LATITUDE)
df_grouped['Longitude'] = df_grouped['Longitude'].fillna(FALLBACK_LONGITUDE)
df_grouped

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.8113,-79.193
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.7878,-79.1564
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.7712,-79.2144
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389
5,M1J,Scarborough,Scarborough Village,43.7464,-79.2323
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.7298,-79.2639
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.7122,-79.2843
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.7247,-79.2312
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.6952,-79.2646


In [123]:
#The coordinates obtained through geocoding were judged not precise enough for mapping,
#so the latitude/longitude values provided in the CSV file are used instead.
#The CSV location can be overridden with the GEOSPATIAL_CSV environment variable;
#the original local path remains the default.
import os
geo_csv_path = os.environ.get('GEOSPATIAL_CSV', 'C:/Users/Shafiul Azam/Desktop/Geospatial_Coordinates.csv')
df_latlong = pd.read_csv(geo_csv_path)

#Rename the CSV key column so that it matches the 'Postcode' column of the existing dataframe
df_latlong = df_latlong.rename(columns={"Postal Code": "Postcode"})

#Drop the geocoded coordinates before merging in the CSV ones. errors='ignore' keeps this
#cell re-runnable: on a second run the columns are already gone and drop would raise KeyError.
df_grouped = df_grouped.drop(columns=['Latitude', 'Longitude'], errors='ignore')

#Left merge on Postcode keeps every grouped row, even one without a match in the CSV
df_merge = df_grouped.merge(df_latlong, how='left', on='Postcode')
df_merge

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


# Segmenting and Clustering Neighborhood (Part 3: Apply Clustering Analysis)

In [67]:
#Summarise the grouped dataframe: number of distinct boroughs and number of rows.
# NOTE(review): the second figure is the row count of df_grouped (one row per
# postcode/borough group), which the message reports as neighborhoods.
borough_count = len(df_grouped['Borough'].unique())
row_count = df_grouped.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 11 boroughs and 103 neighborhoods.


In [70]:
import numpy as np # library to handle data in a vectorized manner

# pandas (library for data analysis) is already imported as pd in an earlier cell,
# so the import below stays commented out
#import pandas as pd
# Passing None removes the display limit on columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!pip install geopy

Collecting geopy
  Downloading https://files.pythonhosted.org/packages/80/93/d384479da0ead712bdaf697a8399c13a9a89bd856ada5a27d462fb45e47b/geopy-1.20.0-py2.py3-none-any.whl (100kB)
Collecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.20.0


In [71]:
from geopy.geocoders import Nominatim

In [72]:
#Create the geopy Nominatim geocoder.
# NOTE(review): the user_agent is still the placeholder text; it should probably name this application.
geolocator = Nominatim(user_agent="specify_your_app_name_here")

In [73]:
#Geocode a free-text query that combines the city, province and a postcode
location = geolocator.geocode("Toronto, ON M1B")

In [74]:
#Print the coordinates of the geocoded location as a (latitude, longitude) tuple
print((location.latitude, location.longitude))

(43.653963, -79.387207)


In [77]:
! pip install folium

Collecting folium
  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.10.1


In [82]:
#Geocode the city itself; its coordinates are used to centre the map
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
#geocode() returns None when nothing matches the query; fail with a clear message
#instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [124]:
# folium is installed above but never imported in the notebook; import it here so
# that this cell also works on a fresh kernel
import folium

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of df_merge, with the neighborhood names as the popup text
for lat, lng, label in zip(df_merge['Latitude'], df_merge['Longitude'], df_merge['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
# display the map as the cell output
map_toronto