# Exploring / Segmenting Neighborhoods - Toronto

We will be exploring various neighborhoods in the Toronto area as part of this exercise, using clustering to determine information about how they are related.

Let's begin by importing all of our dependencies for our project:

In [1]:
!pip install beautifulsoup4
!pip install lxml
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 


from IPython.display import display_html
import pandas as pd
import numpy as np
    
# library for transforming a JSON file into a pandas DataFrame
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Folium installed')
print('Libraries imported.')

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 14.1MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.8.2 soupsieve-1.9.5
Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/68/30/affd16b77edf9537f5be051905f33527021e20d563d013e8c42c7fd01949/lxml-4.4.2-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 20.1MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.4.2
Solving environment: done


  current version: 4.5.11
  lates

# Using Beautiful Soup to Scrape for Toronto Postal Codes

Let's use this space to prep the code for scraping the Wiki site for Toronto Postal Codes...

In [2]:
# Download the Wikipedia page listing the Toronto ("M") postal codes.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# surfaces HTTP errors here instead of letting an error page be parsed below.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
print(soup.title)

# soup.table is the first <table> element of the document. Fail loudly when the
# page has none; otherwise str(None) would be handed on as if it were HTML.
if soup.table is None:
    raise ValueError('No <table> element found in the downloaded page')
tab = str(soup.table)
display_html(tab, raw=True)

<title>List of postal codes of Canada: M - Wikipedia</title>


Postcode,Borough,Neighborhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,Lawrence Heights
M6A,North York,Lawrence Manor
M7A,Downtown Toronto,Queen's Park
M8A,Not assigned,Not assigned
M9A,Queen's Park,Not assigned


# Conversion of HTML data to a DataFrame

In [3]:
# Convert HTML to DF
from io import StringIO

# read_html returns a list with one DataFrame per <table> it finds.
# The markup is wrapped in a file-like object because newer pandas releases
# deprecate passing a literal HTML string directly.
dfs = pd.read_html(StringIO(tab))
df = dfs[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Let's process and clean the data

In [4]:
# Eliminate rows where Borough is 'Not assigned'
df1 = df[df['Borough'] != 'Not assigned']

# Replace neighborhoods that are 'Not assigned' with the name of their Borough.
# This is done per row, BEFORE combining, so a placeholder that shares a postal
# code with real neighborhoods cannot survive inside the joined string.
neighborhood_filled = df1['Neighborhood'].where(df1['Neighborhood'] != 'Not assigned', df1['Borough'])

# Combine the neighborhoods with same Postal Code into one comma-separated string.
# sort=False keeps the postal codes in the order they appear in the source table.
df2 = (
    df1.assign(Neighborhood=neighborhood_filled)
       .groupby(['Postcode', 'Borough'], sort=False)['Neighborhood']
       .agg(', '.join)
       .reset_index()
)

df2

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


# What is the shape of the data frame?

In [5]:
# Dimensions of the cleaned postal-code table, as a (rows, columns) tuple
df2.shape

(103, 3)

# Let's begin working with the provided GeoSpatial Data

We will load the geospatial data (latitude and longitude per postal code) and merge it with the Toronto postal codes.

In [6]:
# Read the CSV of coordinates per postal code straight from its URL
toronto_lat_lon = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
# Preview the first rows
toronto_lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
# Align the key column name with df2. The frame is rebound to a renamed copy
# rather than mutated in place.
toronto_lat_lon = toronto_lat_lon.rename(columns={'Postal Code': 'Postcode'})

# Inner join on the postal code. validate='one_to_one' makes pandas raise a
# MergeError if either side has duplicate postal codes, instead of silently
# multiplying rows.
df3 = pd.merge(df2, toronto_lat_lon, how='inner', on='Postcode', validate='one_to_one')
df3.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


# We will be working with neighborhoods whose borough name contains the word "York" (since I am from NYC)

In [8]:
# Boolean mask: True where the borough name contains the literal text 'York'
is_york_borough = df3['Borough'].str.contains('York', regex=False)

# Keep only the matching rows (the original index labels are preserved)
df4 = df3.loc[is_york_borough]
df4

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
10,M6B,North York,Glencairn,43.709577,-79.445073
13,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.340923
14,M4C,East York,Woodbine Heights,43.695344,-79.318389
16,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
21,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512


In [9]:
# Base map centred on the given coordinates
map_of_york = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# One circle marker per row of df4, with a "neighborhood, borough" popup
york_rows = zip(df4['Latitude'], df4['Longitude'], df4['Borough'], df4['Neighborhood'])
for lat, lng, borough, neighborhood in york_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_of_york)

map_of_york

In [10]:
# Let's cluster the neighborhoods using K-Means

k = 5

# Cluster on the coordinates only. Selecting the two columns explicitly (rather
# than dropping the others) keeps a previously added 'Cluster Labels' column out
# of the features if this cell is run more than once.
toronto_clustering = df4[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_clustering)

# Make the cell re-runnable: discard labels left over from an earlier run, since
# insert() raises if the column already exists. drop() also returns a new frame,
# so the labels are added to an independent copy.
df4 = df4.drop(columns='Cluster Labels', errors='ignore')
df4.insert(0, 'Cluster Labels', kmeans.labels_)
df4

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighborhood,Latitude,Longitude
0,1,M3A,North York,Parkwoods,43.753259,-79.329656
1,3,M4A,North York,Victoria Village,43.725882,-79.315572
3,0,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
7,1,M3B,North York,Don Mills North,43.745906,-79.352188
8,3,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
10,0,M6B,North York,Glencairn,43.709577,-79.445073
13,3,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.340923
14,3,M4C,East York,Woodbine Heights,43.695344,-79.318389
16,0,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
21,0,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512


In [11]:
# create map
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, neighborhood, cluster in zip(df4['Latitude'], df4['Longitude'], df4['Neighborhood'], df4['Cluster Labels']):
    # Show the neighborhood name with its cluster; it was zipped in but never displayed before.
    label = folium.Popup('{} Cluster {}'.format(neighborhood, cluster), parse_html=True)
    # KMeans labels run from 0 to k-1, so they index the palette directly.
    # The previous `cluster-1` made cluster 0 wrap around to the last color.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters