**Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.**



**Imports**

In [9]:
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from sklearn.cluster import KMeans

**Getting the table**

In [10]:
# Parse every HTML table on the Wikipedia page into a list of dataframes
df_p = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

# The table used in this assignment is the first one in that list
df_d = df_p[0]

# Drop the rows whose borough is "Not assigned"
has_borough = df_d["Borough"] != "Not assigned"
df_f = df_d[has_borough].reset_index(drop=True)

# Where the neighbourhood is "Not assigned", fall back to the borough name,
# then rename "Postal Code" / "Neighbourhood" to "PostalCode" / "Neighborhood"
neighbourhood_missing = df_f["Neighbourhood"] == "Not assigned"
df_f = df_f.assign(
    Neighbourhood=df_f["Neighbourhood"].mask(neighbourhood_missing, df_f["Borough"])
).rename(columns={"Postal Code": "PostalCode", "Neighbourhood": "Neighborhood"})

# Preview the first 12 rows
df_f.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


**Shape**

In [11]:
# Show the dimensions of the cleaned dataframe as (number of rows, number of columns)
df_f.shape

(103, 3)

**Merge the two datasets together**


In [12]:
# Load the latitude/longitude table and apply the same column renaming
# as the neighbourhood table so both share the "PostalCode" key
df_ll = pd.read_csv('http://cocl.us/Geospatial_data').rename(
    columns={"Postal Code": "PostalCode", "Neighbourhood": "Neighborhood"}
)

# Join the coordinates onto the neighbourhood table by postal code
df_n = df_f.merge(df_ll, on="PostalCode")

# Preview the first 12 rows
df_n.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


**Get the data with Borough containing "Toronto"**

In [13]:
# Build a boolean mask of the rows whose borough name contains "Toronto",
# then keep only those rows with a fresh 0-based index
is_toronto = df_n['Borough'].astype(str).str.contains("Toronto")
df_toronto = df_n.loc[is_toronto].reset_index(drop=True)

**Visualize Neighborhoods with a map**

In [14]:
# Create the base map. The variable is deliberately not named `map`: that
# would shadow Python's built-in map() for every cell that runs afterwards.
map_toronto = folium.Map(location=[43.654260, -79.360636], zoom_start=10)

# Add one circle marker per neighbourhood, with a "<neighbourhood>, <borough>" popup
for lat, lng, bor, nbh in zip(df_toronto["Latitude"], df_toronto["Longitude"], df_toronto["Borough"], df_toronto["Neighborhood"]):
    # parse_html is a Popup option, so it is passed to the Popup only
    popup = folium.Popup(f'{nbh}, {bor}', parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Display the map as the cell output
map_toronto

**Clustering Neighborhoods**

In [15]:
# Number of clusters, and the seed that makes the k-means result repeatable
N_CLUSTERS = 5
KMEANS_SEED = 0

# Cluster on the coordinates only. The feature columns are selected explicitly
# (instead of dropping the non-feature ones) so that re-running this cell does
# not feed the "labels" column added below back into the clustering input.
df_clustered = df_toronto[["Latitude", "Longitude"]]

# Fit k-means. random_state fixes the initialisation so labels are the same on
# every run; n_init is explicit because its default differs between
# scikit-learn versions.
k_means = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=KMEANS_SEED).fit(df_clustered)

# Store the cluster label of each row in a "labels" column
df_toronto["labels"] = k_means.labels_

# Build a view of the dataframe with "labels" as the first column
fixed_columns = ["labels"] + [col for col in df_toronto.columns if col != "labels"]
df_toronto_n = df_toronto[fixed_columns]

**Visualize clustered Map**

In [16]:
# Create the base map for the clustered view
map_clusters = folium.Map(location=[43.654260, -79.360636], zoom_start=11)

# Build one hex colour per cluster. Labels are 0-based integers, so the
# highest label + 1 is the number of colours needed.
n_clusters = int(df_toronto_n['labels'].max()) + 1
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, n_clusters))]

# Add one circle marker per neighbourhood, coloured by its cluster label
for lat, lon, poi, cluster in zip(df_toronto_n['Latitude'], df_toronto_n['Longitude'], df_toronto_n['Neighborhood'], df_toronto_n['labels']):
    # Index the palette directly with the 0-based label (the previous
    # `cluster-1` only worked for label 0 through negative-index wraparound)
    cluster_color = rainbow[cluster]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

# Display the map as the cell output
map_clusters