# Segmenting and Clustering Neighborhoods in Toronto
Applied Data Science Capstone week 3 project

In [1]:
# Installs
! pip3 install lxml
! pip3 install html5lib
! pip install geocoder
! pip install pgeocode
! pip install folium==0.5.0



In [15]:
# imports
import lxml
import pandas as pd
import numpy as np
import pgeocode
from unicodedata import normalize

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import math
print('Libraries imported.')

Libraries imported.


Read html page and get the zipcode table.

In [3]:
table = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
postalDF = table[0]
postalDF.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Clean data

In [4]:
# Only process the cells that have an assigned borough.  Ignore cells with a borough that is Not assigned.
postalDF = postalDF[postalDF.Borough != 'Not assigned'].copy()

#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
postalDF.loc[(postalDF['Neighbourhood'] == 'Not assigned', 'Neighbourhood')] = postalDF['Borough']
postalDF.reset_index(drop = False, inplace=True)

# The dataframe will consist of three columns: PostalCode, Borough, Neighborhood
postalDF.rename(columns={"Postal Code":"PostalCode", "Neighbourhood":"Neighborhood"}, inplace=True)

duplicatedDF = postalDF[postalDF.duplicated(subset=['PostalCode'], keep=False)]
sortedDF = duplicatedDF.sort_values(by=['PostalCode'])

postalDF.shape

(103, 4)

More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma.

In [5]:
# The wiki page didn't have duplicate zipcode like the lab assumes.  I had to add some throughout to test this code.  I removed my code that added the duplicates.  This isn't efficient, but it works.
if len(sortedDF.index) > 0:
    uniquePostalCodes = sortedDF.PostalCode.unique();
    print("unique values: ", uniquePostalCodes)
    for postal_code in uniquePostalCodes:
        startIndex = 0;
        duplicateIndexes = []
        neighborhood = ''
        for index, row in sortedDF.iterrows():
            if neighborhood == '' and row.PostalCode != postal_code:
                continue
            if neighborhood == '' and row.PostalCode == postal_code:
                startIndex = row.iloc[0]
                neighborhood = row.Neighborhood
                continue
            if (row.PostalCode == postal_code):
                duplicateIndexes.append(row.iloc[0])
                neighborhood += ", "
                neighborhood += row.Neighborhood
            else:
                break
        postalDF.at[startIndex, 'Neighborhood'] = neighborhood
        for index in duplicateIndexes:
            postalDF.drop([0, index])
postalDF.head()   

Unnamed: 0,index,PostalCode,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
postalDF.shape

(103, 4)

## Creating Data Frame with Lat and Long

In [7]:
# Add Latitude and Longitude columns
postalLatLonDF = postalDF.copy()
postalLatLonDF['Latitude'] = np.nan
postalLatLonDF['Longitude'] = np.nan
postalLatLonDF.head()

Unnamed: 0,index,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,2,M3A,North York,Parkwoods,,
1,3,M4A,North York,Victoria Village,,
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


I could not get geocoder to work, so I ended upp using a different library.  I iterate through the postalLatLonDF, get the postal code, call query_postal_code and get a datframe with 

In [8]:
nomi = pgeocode.Nominatim('ca')
for index, row in postalLatLonDF.iterrows():
    zipRow = nomi.query_postal_code(row.PostalCode)
    postalLatLonDF.at[index, 'Latitude'] = zipRow.latitude
    postalLatLonDF.at[index, 'Longitude'] = zipRow.longitude

postalLatLonDF.head()

Unnamed: 0,index,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,2,M3A,North York,Parkwoods,43.7545,-79.33
1,3,M4A,North York,Victoria Village,43.7276,-79.3148
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


## Part 3, map it

In [14]:
nan_values = postalLatLonDF.isna()
nan_columns = nan_values.any()
columns_with_nan = postalLatLonDF.columns[nan_columns].tolist()
print(columns_with_nan)
print(nan_values)

['Latitude', 'Longitude']
     index  PostalCode  Borough  Neighborhood  Latitude  Longitude
0    False       False    False         False     False      False
1    False       False    False         False     False      False
2    False       False    False         False     False      False
3    False       False    False         False     False      False
4    False       False    False         False     False      False
..     ...         ...      ...           ...       ...        ...
98   False       False    False         False     False      False
99   False       False    False         False     False      False
100  False       False    False         False     False      False
101  False       False    False         False     False      False
102  False       False    False         False     False      False

[103 rows x 6 columns]


In [21]:
toronto_latitude = 43.6532
toronto_longitude = -79.3832
toronto_map = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=15) # generate map centred around Ecco


# add Ecco as a red circle mark
folium.CircleMarker(
    [toronto_latitude, toronto_longitude],
    radius=150,
    popup='Ecco',
    fill=True,
    color='red',
    fill_color='red',
    fill_opacity=0.6
    ).add_to(toronto_map)


# add popular spots to the map as blue circle markers
for lat, lng, label in zip(postalLatLonDF.Latitude, postalLatLonDF.Longitude, postalLatLonDF.PostalCode):
    if not math.isnan(lat) and not math.isnan(lng):
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            fill=True,
            color='blue',
            fill_color='blue',
            fill_opacity=0.6
            ).add_to(toronto_map)

# display map
toronto_map

I decided to place a dot at the center of each zip code.  Near the center of toronoto, by the lake, the postal code areas were smaller than the areas farther from the lake.  The north eastern area seems the be the least populated, hence the larger area for the postal code M1X.