In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Display options: None removes the limit, so every column and every row of a
# displayed DataFrame is rendered. Use .head() on large frames to keep output short.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# transform JSON data into a pandas dataframe
# json_normalize is a top-level pandas function since pandas 1.0; the old
# pandas.io.json location is deprecated, so it is only used as a fallback
# for older pandas versions.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib colour maps and colour conversion (used to colour the cluster markers)
import matplotlib.cm as cm
import matplotlib.colors as colors

# k-means implementation used in the clustering section
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if folium is not installed in the environment yet
import folium # map rendering library

# Simple confirmation that every import above succeeded
print('Libraries imported.')


Libraries imported.


In [68]:
# The neighbourhood table was prepared by hand in Excel and saved as a
# ';'-separated CSV instead of being scraped.
df = pd.read_csv("Coursera_Capstone_Neigborhoods.csv", sep=";")

# Keep only the rows whose borough is not marked "Not assigned".
borough_assigned = ~df["Borough"].str.contains("Not assigned")
df = df[borough_assigned]
df.head(15)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M2L,North York,"York Mills, Silver Hills"
1,M2P,North York,York Mills West
2,M4C,East York,Woodbine Heights
3,M1G,Scarborough,Woburn
4,M2R,North York,"Willowdale, Willowdale West"
5,M2N,North York,"Willowdale, Willowdale East"
6,M2M,North York,"Willowdale, Newtonbrook"
7,M1R,Scarborough,"Wexford, Maryvale"
8,M9N,York,Weston
9,M9P,Etobicoke,Westmount


In [69]:
# (rows, columns) of the table after the "Not assigned" boroughs were removed
df.shape

(103, 3)

### Creating Dataframe

In [70]:
# Coordinates table; it is joined to df on the 'Postal Code' column in the following cell
coordinates = pd.read_csv("Geospatial_Coordinates.csv")

In [71]:
# Attach the coordinate columns to every neighbourhood by joining on the postal code.
# Coordinate columns left over from a previous run of this cell are dropped first,
# so re-running the cell does not create suffixed duplicates (Latitude_x / Latitude_y).
coordinate_columns = coordinates.columns.difference(['Postal Code'])
df = (
    df.drop(columns=coordinate_columns, errors='ignore')
      .merge(coordinates, on='Postal Code')  # default inner join: rows without a match are dropped
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M2L,North York,"York Mills, Silver Hills",43.75749,-79.374714
1,M2P,North York,York Mills West,43.752758,-79.400049
2,M4C,East York,Woodbine Heights,43.695344,-79.318389
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M2R,North York,"Willowdale, Willowdale West",43.782736,-79.442259


In [72]:
# Look up the coordinates of the city; they are used as the centre of the maps below.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a clear
    # message instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


## Clustering

### Map visualization

In [74]:
# create map of Toronto using latitude and longitude values
toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per neighbourhood; the popup text is "<neighbourhood>, <borough>"
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is an option of the Popup, so it is passed there only
    # (it was previously also handed to CircleMarker, where it is not a marker option)
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(toronto)

toronto

### Only boroughs whose name contains "Toronto"

In [81]:
# Keep only the boroughs whose name contains "Toronto".
# .copy() makes df_toronto an independent frame, so the in-place changes made to it
# later (inserting the cluster labels) never act on a slice of df.
df_toronto = df[df.Borough.str.contains("Toronto")].copy()
df_toronto.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
13,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049
14,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
17,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
18,M4E,East Toronto,The Beaches,43.676357,-79.293031
19,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678


### Clustering Neighborhoods

In [92]:
# set number of clusters
kclusters = 5

# Cluster on the coordinates only. Selecting the feature columns by name (instead of
# dropping the others with a positional axis argument) works on current pandas and
# keeps the 'Cluster Labels' column out of the features when this cell is re-run.
toronto_grouped_clusters = df_toronto[['Latitude', 'Longitude']]

# run k-means clustering
# random_state=0 makes the result reproducible; n_init=10 pins the number of
# initialisations explicitly so the result does not depend on the scikit-learn default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clusters)

# check cluster labels generated for the first ten rows
kmeans.labels_[0:10]

array([2, 0, 4, 4, 2, 1, 4, 0, 0, 0])

### Insert cluster labels into df_toronto

In [95]:
# Put the cluster label of every neighbourhood in the first column.
# A 'Cluster Labels' column from a previous run is removed first, because
# DataFrame.insert raises ValueError when the column already exists.
df_toronto = df_toronto.drop(columns='Cluster Labels', errors='ignore')
df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)
df_toronto.head(5)

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
13,4,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049
14,0,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
17,2,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
18,2,M4E,East Toronto,The Beaches,43.676357,-79.293031
19,4,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678


In [96]:
# Map of the selected neighbourhoods, one circle per neighbourhood, coloured by cluster.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood'], df_toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to kclusters - 1, so they index the colour list directly
    # (the former `cluster-1` made label 0 wrap around to the last colour).
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters