# Week 2 Assignment
## Segmenting and Clustering Neighborhoods in Toronto
### All 3 parts are in this notebook


## PART 1

Import pandas and numpy

In [1]:
import pandas as pd
import numpy as np

Installing lxml

In [2]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


Reading data from source and creating dataframe

In [3]:
# Parse the HTML tables on the Wikipedia postal-code page and keep the first one.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(wiki_url)
df = wiki_tables[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## PART 1 - Process 1: Filtering Boroughs that are 'Not assigned'

In [4]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
borough_is_assigned = df['Borough'].ne('Not assigned')
df_borough_notna = df.loc[borough_is_assigned]
df_borough_notna.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Checking whether any Postal Code appears more than once in the dataframe

In [5]:
# Count the rows per postal code, largest count first, so any duplicated
# code would appear at the top of the result.
postal_code_column = df_borough_notna['Postal Code']
df_PC_Count = postal_code_column.value_counts(sort=True, ascending=False)
df_PC_Count

M5N    1
M1H    1
M5T    1
M5L    1
M1C    1
      ..
M4P    1
M6C    1
M8V    1
M4K    1
M5M    1
Name: Postal Code, Length: 103, dtype: int64

No Postal Codes are listed more than once

Although none are found, below is the step. 

## PART 1 - Process 2: Where a Postal Code spans more than one row, combine its Neighbourhoods into one row separated by commas

In [6]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighbourhood is the ', '-joined text of the original rows.
neighbourhoods_by_code = df_borough_notna.groupby(['Postal Code', 'Borough'])['Neighbourhood']
df_groupby = neighbourhoods_by_code.apply(', '.join).reset_index()
df_groupby.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Checking if any Neighborhoods are 'Not assigned'

In [7]:
# Select the rows whose Neighbourhood is still the 'Not assigned' placeholder.
neighbourhood_unassigned = df_groupby['Neighbourhood'].eq('Not assigned')
df_neigh_na = df_groupby.loc[neighbourhood_unassigned]
df_neigh_na.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood


No rows have 'Not assigned' as their Neighbourhood

Although none are found, the replacement step is shown below.

## PART 1 - Process 3: Replace 'Not assigned' Neighbourhoods with the Borough name

In [8]:
# Wherever the Neighbourhood is the 'Not assigned' placeholder, substitute the
# row's Borough; every other Neighbourhood is kept unchanged.
needs_borough_name = df_groupby['Neighbourhood'] == 'Not assigned'
df_groupby['Neighbourhood'] = np.where(
    needs_borough_name,
    df_groupby['Borough'],
    df_groupby['Neighbourhood'],
)
df_groupby.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Displaying number of rows in dataframe

## PART 1 - Process 4: Shape of dataframe

In [9]:
# Report the dimensions of the grouped frame as a (rows, columns) tuple.
df_groupby.shape

(103, 3)

# ------------------------------------
# PART 2
## Geocoding Postal Codes

Geocoder

In [11]:
pip install geocoder

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 5.2 MB/s eta 0:00:011
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Note: you may need to restart the kernel to use updated packages.


In [23]:
import geocoder # import geocoder

Downloading Postal Code Lat Lon file 

In [24]:
# Load the CSV that pairs each postal code with a latitude and longitude.
geospatial_csv = 'http://cocl.us/Geospatial_data'
df_latlon = pd.read_csv(geospatial_csv)
df_latlon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## PART 2 - Process 1: Add Lat Long to dataframe

In [25]:
# Attach Latitude/Longitude to each postal-code row. This is an inner join, so
# only postal codes present in both frames are kept.
df_wlatlon = df_groupby.merge(df_latlon, how='inner', on='Postal Code')
df_wlatlon.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [47]:
# List the distinct borough names present after the merge.
df_wlatlon['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

# ------------------------------------
# Part 3
## Visual Map

Filtering Toronto Borough

In [72]:
# Keep only the rows whose Borough name contains the text "Toronto".
mentions_toronto = df_wlatlon['Borough'].str.contains("Toronto")
Toronto_df = df_wlatlon.loc[mentions_toronto]
Toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Install Folium for mapping

In [30]:
pip install folium

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 2.7 MB/s  eta 0:00:01
[?25hCollecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
Note: you may need to restart the kernel to use updated packages.


In [76]:
import folium

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans 


## Map of Toronto Postal Codes

In [44]:
# Base map at fixed coordinates (presumably central Toronto -- confirm).
map_toronto = folium.Map(location=[43.6487, -79.38544], zoom_start=10)

# Add one circle marker per row of Toronto_df; the popup text is
# "<neighbourhood>, <borough>".
for lat, lng, borough, neighbourhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Borough'], Toronto_df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html belongs to folium.Popup only. The original also passed
    # parse_html=False to CircleMarker, where it was silently swallowed by
    # **kwargs and had no effect, so it is dropped here.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

## Clustering Toronto Neighborhoods

In [51]:
# Distinct borough names among the Toronto rows.
Toronto_boroughs = Toronto_df['Borough'].unique()
Toronto_boroughs

array(['East Toronto', 'Central Toronto', 'Downtown Toronto',
       'West Toronto'], dtype=object)

## Part 3 - Process 1: Since there are 4 Boroughs in Toronto, 4 clusters will be created. k clusters will be set to 4.

In [73]:
# set number of clusters
# set number of clusters
kclusters = 4

# Cluster on the coordinates only. The two feature columns are selected by name
# rather than dropping the others with the positional `axis` argument
# (`drop([...], 1)`), which pandas 2.0 no longer accepts. Selecting by name also
# keeps the features correct if extra columns such as 'Cluster Labels' have
# been added to Toronto_df before this cell is re-run.
toronto_grouped_clustering = Toronto_df[['Latitude', 'Longitude']]

# run k-means clustering; n_init is pinned to 10 (the long-standing scikit-learn
# default) so results do not depend on the installed version's default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_


array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 3, 1], dtype=int32)

Adding Cluster labels to dataframe

In [74]:
# Work on a copy. A plain assignment would only alias Toronto_df, so insert()
# would also add the column to Toronto_df and raise "already exists" the next
# time this cell is run.
Toronto_df_merged = Toronto_df.copy()
# Put the k-means label of each row in the first column.
Toronto_df_merged.insert(0, 'Cluster Labels', kmeans.labels_)
Toronto_df_merged

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,1,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,1,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,1,M4M,East Toronto,Studio District,43.659526,-79.340923
44,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,2,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,2,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


## Part 3 - Process 2: Generate maps to Visualize the neighborhoods clustered together

In [78]:
# create map
map_clusters = folium.Map(location=[43.6487, -79.38544], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_df_merged['Latitude'], Toronto_df_merged['Longitude'], Toronto_df_merged['Neighbourhood'], Toronto_df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
