# Segmenting and Clustering Neighborhoods in Toronto
## Week 3 Applied Data Science Capstone - Question 3

Explore and cluster the neighborhoods in Toronto. 

### Importing all the required libraries, loading the saved postcode dataframe and establishing the Foursquare credentials

In [1]:
import requests 
import pandas as pd 
import numpy as np

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium 

from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

print('Folium installed')
print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported.


In [2]:
# Read the saved postcode table and discard the 'Unnamed: 0' column
# (presumably the index written out when the frame was saved; confirm against Q2.csv).
csv_file = "Q2.csv"
df2 = pd.read_csv(csv_file).drop(columns='Unnamed: 0')
df2.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [1]:
import os

# Credentials are read from the environment so real keys never have to be saved
# in the notebook; '###' stays as the placeholder when a variable is not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '###') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '###') # your Foursquare Secret
VERSION = '20180604'  # sent to the API as the `v` query parameter
LIMIT = 30  # sent to the API as the `limit` query parameter
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Only the length of the secret is shown, so the value is not stored in the cell output.
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

Your credentails:
CLIENT_ID: ###
CLIENT_SECRET:###


### Finding Toronto's Coordinates and plotting neighborhoods

In [4]:
# Resolve the city name to coordinates; these centre every map drawn below.
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of the City of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of the City of Toronto are 43.653963, -79.387207.


In [5]:
# Base map centred on the geocoded city coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One small circle per row of df2, with a "<neighborhood>, <borough>" popup.
marker_rows = df2[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_rows.itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=popup,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

### Keeping only the boroughs whose name includes 'Toronto' and plotting them

In [6]:
# Boolean mask: True for rows whose borough name contains the substring 'Toronto'.
is_toronto_borough = df2['Borough'].str.contains('Toronto')
df3 = df2[is_toronto_borough]

df3.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [7]:
# Same base map as before, this time showing only the rows kept in df3.
map_toronto2 = folium.Map(location=[latitude, longitude], zoom_start=10)

# Shared styling for every marker on this map.
marker_style = dict(
    radius=3,
    color='green',
    fill=True,
    fill_color='#3199cc',
    fill_opacity=0.3,
    parse_html=False,
)

for lat, lng, borough, neighborhood in zip(df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighborhood']):
    text = '{}, {}'.format(neighborhood, borough)
    folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(text, parse_html=True),
        **marker_style
    ).add_to(map_toronto2)

map_toronto2

### Defining a function to acquire the Foursquare data and build a dataframe

In [8]:
def getVenues(postcodes, neighborhoods, latitudes, longitudes, radius=500, timeout=10):
    """Query the Foursquare `venues/explore` endpoint once per neighborhood.

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT settings.

    Parameters
    ----------
    postcodes, neighborhoods, latitudes, longitudes : iterable
        Parallel sequences; they are zipped together, one entry per location.
    radius : int, default 500
        Forwarded to the API as the `radius` query parameter.
    timeout : float, default 10
        Seconds to wait for each HTTP response before `requests` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue, carrying the postcode, neighborhood and
        neighborhood coordinates it was found for, plus the venue name,
        coordinates and first category. The frame is empty (but still has all
        eight columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    column_names = ['Postcode',
                    'Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue Name',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for pc, name, lat, lng in zip(postcodes, neighborhoods, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout stops one stalled request from hanging the whole notebook,
        # and raise_for_status surfaces API errors before the JSON is indexed.
        response = requests.get(endpoint, params=params, timeout=timeout)
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        for item in items:
            venue = item['venue']
            # A venue without categories gets None rather than raising an IndexError.
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                pc,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names to the constructor keeps an empty result valid;
    # assigning them afterwards fails when there are no rows at all.
    return pd.DataFrame(rows, columns=column_names)

In [9]:
# Fetch the venues for every postcode kept in df3.
toronto_venues = getVenues(
    postcodes=df3['Postcode'],
    neighborhoods=df3['Neighborhood'],
    latitudes=df3['Latitude'],
    longitudes=df3['Longitude'],
)
toronto_venues.head()

Unnamed: 0,Postcode,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,M5A,"Harbourfront,Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,"Harbourfront,Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,"Harbourfront,Regent Park",43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,M5A,"Harbourfront,Regent Park",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,"Harbourfront,Regent Park",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


### Creating a dataframe of venue categories per postcode and then grouping the data by postcode

In [10]:
# One indicator column per venue category, one row per venue.
category_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
category_dummies['Postcode'] = toronto_venues['Postcode']

# Put 'Postcode' first, followed by the category columns in their existing order.
category_columns = [name for name in category_dummies.columns if name != 'Postcode']
toronto_category = category_dummies[['Postcode'] + category_columns]
toronto_category.shape

(828, 193)

In [11]:

# Mean of each indicator column per postcode, i.e. the share of that postcode's
# venues falling in each category; 'Postcode' stays a regular column.
toronto_grouping = toronto_category.groupby('Postcode', as_index=False).mean()
toronto_grouping.head()



Unnamed: 0,Postcode,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### We analyse only the top 5 venues of each neighborhood to capture the 'essence' of each and arrange the data frame to reflect this

In [12]:

top_venues = 5

# Build the column names: 'Postcode' followed by 'Top 1st Venue', 'Top 2nd Venue', ...
indexes = ['st', 'nd', 'rd']  # suffixes for ranks 1-3; every later rank uses 'th'
columns = ['Postcode']
for nb in range(top_venues):
    # Explicit bounds check instead of a bare `except`, which would also have
    # hidden any unrelated error raised inside the loop.
    suffix = indexes[nb] if nb < len(indexes) else 'th'
    columns.append('Top {}{} Venue'.format(nb + 1, suffix))

# Start from an empty frame with the summary columns, then fill in the
# postcodes; the 'Top ... Venue' columns are populated afterwards.
summary_top_venues = pd.DataFrame(columns=columns).assign(Postcode=toronto_grouping['Postcode'])

def return_top_venues(groupings, top_venues):
    """Return the labels of the `top_venues` largest entries of one row.

    `groupings` is a Series whose first entry is skipped (the callers pass a
    row that starts with the postcode); the remaining values are sorted in
    descending order and the index labels of the leading ones are returned
    as an array.
    """
    ranked = groupings.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:top_venues]

# Fill the 'Top ... Venue' columns one postcode row at a time.
for row_pos in range(len(toronto_grouping)):
    postcode_row = toronto_grouping.iloc[row_pos]
    summary_top_venues.iloc[row_pos, 1:] = return_top_venues(postcode_row, top_venues)

summary_top_venues.head()

Unnamed: 0,Postcode,Top 1st Venue,Top 2nd Venue,Top 3rd Venue,Top 4th Venue,Top 5th Venue
0,M4E,Astrologer,Grocery Store,Health Food Store,Pub,Neighborhood
1,M4K,Greek Restaurant,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Pub
2,M4L,Sandwich Place,Ice Cream Shop,Pub,Board Shop,Coffee Shop
3,M4M,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery
4,M4N,Park,Lake,Swim School,Bus Line,Yoga Studio


## Clustering

The point of this exercise is to get an overall view of the Toronto neighborhoods, so the number of clusters is kept small (only 3) to try to capture the big similarities and the outliers.

### Running K-means clustering with 3 clusters

In [13]:
from sklearn.cluster import KMeans
kclusters = 3

# Feature matrix: the per-postcode category means without the identifier column.
# `columns=` replaces the positional axis argument, which newer pandas no longer accepts.
toronto_grouped_clustering = toronto_grouping.drop(columns='Postcode')

# n_init is pinned so the fit does not depend on the scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

### Merging all the data to create a final summary of Postcodes, Venues and Clusters

In [16]:
# Attach the fitted cluster label to each postcode row.
summary_top_venues['Cluster label'] = kmeans.labels_

# Bring in borough, neighborhood and coordinates; validate= makes the merge
# fail if 'Postcode' is not unique on both sides.
summary_toronto_clustered = summary_top_venues.merge(
    df3,
    how='left',
    on='Postcode',
    validate="1:1",
)
summary_toronto_clustered.head()

Unnamed: 0,Postcode,Top 1st Venue,Top 2nd Venue,Top 3rd Venue,Top 4th Venue,Top 5th Venue,Cluster label,Borough,Neighborhood,Latitude,Longitude
0,M4E,Astrologer,Grocery Store,Health Food Store,Pub,Neighborhood,2,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,Greek Restaurant,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Pub,2,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,Sandwich Place,Ice Cream Shop,Pub,Board Shop,Coffee Shop,2,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery,2,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Park,Lake,Swim School,Bus Line,Yoga Studio,0,Central Toronto,Lawrence Park,43.72802,-79.38879


### Plotting the clusters in 3 different colors

In [17]:
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, neigh, pc, cluster in zip(summary_toronto_clustered['Latitude'], summary_toronto_clustered['Longitude'], summary_toronto_clustered['Neighborhood'],summary_toronto_clustered['Postcode'], summary_toronto_clustered['Cluster label']):
    label = folium.Popup(str(neigh) + '(' + str(pc) + '): Cluster ' + str(cluster), parse_html=True)
    # Index the palette with the label itself: the earlier `cluster-1` offset only
    # worked because cluster 0 wrapped around to the last colour.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Conclusion
The data used was just for the Toronto named neighbourhoods. 

As it was a preliminary analysis, the decision was taken to use the top 5 venues of each neighbourhood and cluster them in 3 groups. 
The majority of the postcodes fall into one cluster. 
One cluster has only one postcode, which looks like an outlier. Its most common venue is a garden (Roselawn). 
The remaining cluster has only 4 postcodes, all on the north side of town.

Further analysis is required to explore these clusters in more detail.