# Assignment for segmenting neighborhoods of Toronto, CA

In [1]:
#libraries that might need to be installed
'''!pip3 install geopy
!pip3 install folium
!pip3 install numpy==1.16.2'''
#Note that numpy version 1.16.2 is the version that was running on IBM
#After installing folium, my local numpy installation was corrupted
#Pinning numpy was a quick fix; any damage to other dependencies remains to be seen

'!pip3 install geopy\n!pip3 install folium\n!pip3 install numpy==1.16.2'

In [59]:
#import standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#import html/scraping libraries
import requests
from bs4 import BeautifulSoup
import json
from pandas.io.json import json_normalize

#import mapping libraries
import geopy
import folium
from geopy.geocoders import Nominatim
from IPython.display import Image

#import colors
import matplotlib.cm as cm
import matplotlib.colors as colors

#import clustering libraries
from sklearn.cluster import KMeans, DBSCAN

## In this section, we will scrape and clean the data to produce the correct dataframe

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page with a timeout and fail loudly on an HTTP error, so that an
# error page is never parsed as if it were the postal-code table.
wiki_response = requests.get(url, timeout=30)
wiki_response.raise_for_status()

t_wiki = BeautifulSoup(wiki_response.text,'lxml') #get soup

In [4]:
cols = ['Postcode',"Borough",'Neighbourhood']#create columns

# the first <tbody> is the postal-code table; skip its header row
locs = t_wiki.find_all('tbody')[0].find_all('tr')[1:]

# collect the <td> cells of every row once, then read each column from them
cells_per_row = [row.find_all('td') for row in locs]
pID = [cells[0].text for cells in cells_per_row]          #list of postal ids
bID = [cells[1].text for cells in cells_per_row]          #list of borough names
nID = [cells[2].text.strip() for cells in cells_per_row]  #list of neighbourhoods

In [5]:
#create a data frame from the three scraped lists, columns ordered as in `cols`
t_df = pd.DataFrame(
    {'Postcode': pID, 'Borough': bID, 'Neighbourhood': nID},
    columns=cols,
)
    

In [6]:
t_df.shape

(287, 3)

In [7]:
#Clean data frame to specifications
#This code ignores area without a borough assignment
#.copy() gives an independent frame, so the column assignment below does not
#write into a slice of the scraped table (avoids SettingWithCopyWarning)
t_df = t_df[t_df['Borough'] != 'Not assigned'].copy()

#This code will assign missing neighborhood names with borough names
unnamed = t_df['Neighbourhood'] == 'Not assigned'
t_df.loc[unnamed, 'Neighbourhood'] = t_df.loc[unnamed, 'Borough']

#Finally, this code will group neighbourhoods by postal code and join the
#distinct values of each column. dict.fromkeys() removes duplicates while
#keeping first-seen order, so the joined names are the same on every run
#(the order of a set of strings can differ between interpreter sessions)
t_df = (
    t_df.groupby('Postcode')
        .agg(lambda x: ', '.join(dict.fromkeys(x)))
        .reset_index()
)

In [8]:
print(t_df.shape)

(103, 3)


In [9]:
t_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Highland Creek, Port Union"
2,M1E,Scarborough,"Guildwood, West Hill, Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Setting Latitude and Longitude coordinates

In [10]:
import geocoder

# Probe the Google geocoder with the first postal code only.
pcodes = t_df.Postcode
g = geocoder.google('{}, Toronto, Ontario'.format(pcodes[0]))
# The recorded result is REQUEST_DENIED, so `g` is not used for coordinates;
# the following cell loads them from a CSV file instead.
g

<[REQUEST_DENIED] Google - Geocode [empty]>

In [11]:
latlong_df = pd.read_csv('Geospatial_Coordinates.csv')
latlong_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Look the coordinates up by postal code instead of copying the columns across
# by row position: positional assignment is only correct while both frames
# list exactly the same postal codes in exactly the same order.
coords_by_postcode = latlong_df.set_index('Postal Code')
postcode_key = t_df['Postcode'].str.strip()  # tolerate stray whitespace from scraping
t_df['Latitude'] = postcode_key.map(coords_by_postcode['Latitude'])
t_df['Longitude'] = postcode_key.map(coords_by_postcode['Longitude'])
t_df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Highland Creek, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, West Hill, Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Exploration and Clustering

In this section, I will explore and cluster the neighborhoods of Toronto.
To start, I will explore all of Toronto, then narrow my focus as the analysis goes on.

First, we will get the coordinates of the center of Toronto.

In [13]:
address = "Toronto, ON"

geolocator = Nominatim(user_agent="extra_lime")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop here with a clear message
# rather than failing with an AttributeError on the attribute access below
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Torono are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Torono are 43.653963, -79.387207.


In [66]:
# create map of Toronto centered on the latitude and longitude values from the Nominatim lookup
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per postal code, labelled "<neighbourhoods>, <borough>"
for lat, lng, borough, neighborhood in zip(t_df['Latitude'], t_df['Longitude'], t_df['Borough'], t_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is a Popup option (the label is rendered as escaped text);
    # it is not a CircleMarker option, so it is no longer passed to the marker
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=10,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5).add_to(toronto_map)

toronto_map

The names of the neighborhoods will be revealed with a click on each circle

For a closer look, I will compare only the neighborhoods in and around downtown Toronto.

In [15]:
# boroughs in and around downtown that the rest of the analysis keeps
neighborhood_list = ['Downtown Toronto','East Toronto','West Toronto','East York','Central Toronto']
in_selected_borough = t_df['Borough'].isin(neighborhood_list)
dt_t = t_df.loc[in_selected_borough]
dt_t.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
35,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
36,M4C,East York,Woodbine Heights,43.695344,-79.318389
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
38,M4G,East York,Leaside,43.70906,-79.363452
39,M4H,East York,Thorncliffe Park,43.705369,-79.349372


In [16]:
#lets make a map of toronto but just with the downtown and surrounding neighborhoods, we can use the same lat,long
# create map of downtown Toronto using the same center coordinates, zoomed in one level
dt_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# add one circle marker per selected postal code, labelled with its neighbourhood names
for lat, lng, label in zip(dt_t['Latitude'], dt_t['Longitude'], dt_t['Neighbourhood']):
    # parse_html is a Popup option (the label is rendered as escaped text);
    # it is not a CircleMarker option, so it is no longer passed to the marker
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=10,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.3).add_to(dt_map)

dt_map

Now that we have our neighborhoods selected, we can get foursquare data about each neighborhood and segment it.

In [17]:
#this cell reads the foursquare api id, secret and version from a local JSON file
with open('foursquare.json') as f:
    fs = f.read()
creds = json.loads(fs)
CLIENT_ID, CLIENT_SECRET, VERSION = (
    creds[key] for key in ('CLIENT_ID', 'CLIENT_SECRET', "VERSION")
)


We can explore the first neighborhood in the dataframe

In [18]:
dt_t.iloc[0].Neighbourhood

'Parkview Hill, Woodbine Gardens'

In [19]:
#since this postal code lists two neighborhoods, we pick one by position (index 1, which is 'Woodbine Gardens' in the recorded run, not Parkview Hill)

In [20]:
# second comma-separated name in the first row (still carries its leading space)
neighborhood_name = dt_t.iloc[0]['Neighbourhood'].split(',')[1]

In [21]:
neighborhood_name.strip()

'Woodbine Gardens'

In [22]:
first_area = dt_t.iloc[0]  # first row of the downtown selection
ph_latitude = first_area.Latitude # neighborhood latitude value
ph_longitude = first_area.Longitude # neighborhood longitude value

# second comma-separated name of that row, surrounding whitespace removed
ph_name = first_area.Neighbourhood.split(',')[1].strip() # neighborhood name

summary = 'Latitude and longitude values of {} are {}, {}.'.format(ph_name, ph_latitude, ph_longitude)
print(summary)

Latitude and longitude values of Woodbine Gardens are 43.7063972, -79.309937.


In [23]:
#Lets get up to 100 venues within 500 m of the coordinates selected above
LIMIT=100
radius=500
# values in the order the URL template's placeholders expect them
explore_args = (CLIENT_ID, CLIENT_SECRET, VERSION, ph_latitude, ph_longitude, radius, LIMIT)
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(*explore_args)

In [24]:
# call the explore endpoint built above and decode the JSON body
results = requests.get(url).json()
# displays the whole decoded payload (the saved output below is cut off)
results

{'meta': {'code': 200, 'requestId': '5e31fc79a2e538001cf4d56a'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': "O'Connor - Parkview",
  'headerFullLocation': "O'Connor - Parkview, Toronto",
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 13,
  'suggestedBounds': {'ne': {'lat': 43.7108972045, 'lng': -79.30372360313615},
   'sw': {'lat': 43.701897195499996, 'lng': -79.31615039686386}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b5a3842f964a52023b528e3',
       'name': 'Jawny Bakers',
       'location': {'address': "804 O'Connor Dr",
        'crossStreet': 'St Clair E',
        'lat': 43.705782646822,
        'lng': -79.31291304477831,
        'labeledLatLngs':

In [25]:
#Borrow function from foursquare lab
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of a venue's first category, or None if it has none.

    `row` is any object indexable by key (a dict, a pandas Series, ...) that
    holds the list of category dicts under 'categories' or, failing that,
    under 'venue.categories'.
    """
    # Only a missing 'categories' key should trigger the fallback. The former
    # bare `except:` also swallowed unrelated errors (and KeyboardInterrupt).
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [26]:
# venue items of the first group in the explore response
venues = results['response']['groups'][0]['items']

# flatten the JSON and keep only the name, categories and coordinates columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues)
nearby_venues = nearby_venues.loc[:, filtered_columns]

# replace each category list with the name of its first category
first_category = nearby_venues.apply(get_category_type, axis=1)
nearby_venues['venue.categories'] = first_category

# keep only the last dotted component of every column name
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Jawny Bakers,Gastropub,43.705783,-79.312913
1,East York Gymnastics,Gym / Fitness Center,43.710654,-79.309279
2,Shoppers Drug Mart,Pharmacy,43.705933,-79.312825
3,TD Canada Trust,Bank,43.70574,-79.31227
4,Pizza Pizza,Pizza Place,43.705159,-79.31313


In [27]:
#lets see how many venues were retireved
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

13 venues were returned by Foursquare.


In [28]:
#function to extract venues for each neighborhood in toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500,LIMIT=100):
    """Query the Foursquare explore endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables, consumed in parallel
        Label and coordinates of each location to search around.
    radius : search radius passed to the API (default 500).
    LIMIT : maximum number of venues requested per location (default 100).

    Returns
    -------
    pandas.DataFrame with one row per returned venue and the columns
    'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
    'Venue Latitude', 'Venue Longitude', 'Venue Category'. The frame is empty
    (but keeps these columns) when no venue is returned at all. 'Venue Category'
    is None for a venue that has no categories.

    Raises
    ------
    requests.HTTPError if the API answers with an error status.

    Reads the module-level CLIENT_ID, CLIENT_SECRET and VERSION and prints each
    location name as it is processed.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; surface HTTP errors here instead of as a
        # KeyError while digging into an error payload
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come back with an empty category list
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor keeps the schema for an empty result
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [30]:
dt_t_venues = getNearbyVenues(names=dt_t['Neighbourhood'],
                                   latitudes=dt_t['Latitude'],
                                   longitudes=dt_t['Longitude']
                                  )

Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Summerhill West, South Hill, Rathnelly, Forest Hill SE, Deer Park
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Richmond, King, Adelaide
Toronto Islands, Union Station, Harbourfront East
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
North Midtown, Yorkville, The Annex
Harbord, University of Toronto
Grange Park, Kensington Market, Chinatown
Bathurst Quay, Island airport, King and Spadina, South Niagara, CN Tower, Railway Lands, Harbourfront West
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Trinity, Li

In [31]:
#check the size of the DF
print(dt_t_venues.shape)
dt_t_venues.head()

(1776, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Jawny Bakers,43.705783,-79.312913,Gastropub
1,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,East York Gymnastics,43.710654,-79.309279,Gym / Fitness Center
2,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Shoppers Drug Mart,43.705933,-79.312825,Pharmacy
3,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,TD Canada Trust,43.70574,-79.31227,Bank
4,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Pizza Pizza,43.705159,-79.31313,Pizza Place


In [32]:
#Check how many venues were returned by neighborhood
dt_t_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Quay, Island airport, King and Spadina, South Niagara, CN Tower, Railway Lands, Harbourfront West",17,17,17,17,17,17
Berczy Park,55,55,55,55,55,55
Business Reply Mail Processing Centre 969 Eastern,15,15,15,15,15,15
"Cabbagetown, St. James Town",43,43,43,43,43,43
Central Bay Street,83,83,83,83,83,83
Christie,18,18,18,18,18,18
Church and Wellesley,82,82,82,82,82,82
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,36,36,36,36,36,36
Davisville North,8,8,8,8,8,8


In [33]:
#See how many unique categories there are for the given neighborhoods
print('There are {} uniques categories.'.format(len(dt_t_venues['Venue Category'].unique())))

There are 235 uniques categories.


In [34]:
#Lets analyze each neighborhood
# one hot encoding: one indicator column per venue category, no name prefix
t_onehot = pd.get_dummies(dt_t_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe (it lands in the last position)
t_onehot['Neighbourhood'] = dt_t_venues['Neighborhood']

# move that last column to the front, leaving the other columns in order
*leading_columns, last_column = t_onehot.columns
fixed_columns = [last_column] + leading_columns
t_onehot = t_onehot[fixed_columns]

t_onehot.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Parkview Hill, Woodbine Gardens",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Parkview Hill, Woodbine Gardens",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Parkview Hill, Woodbine Gardens",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Parkview Hill, Woodbine Gardens",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Parkview Hill, Woodbine Gardens",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
t_onehot.shape

(1776, 236)

In [36]:
#group by neighborhood and get mean frequency of each category
t_grouped = t_onehot.groupby('Neighbourhood').mean().reset_index()
t_grouped

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Bathurst Quay, Island airport, King and Spadin...",0.0,0.058824,0.058824,0.058824,0.117647,0.117647,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,...,0.0,0.012048,0.0,0.0,0.0,0.0,0.012048,0.0,0.0,0.012048
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.012195,0.0,0.0,0.0,0.0,0.0,0.0,0.012195,0.0,...,0.0,0.0,0.0,0.0,0.012195,0.0,0.0,0.012195,0.0,0.012195
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
t_grouped.shape

(44, 236)

In [38]:
#Print out top categories for each neighborhood
num_top_venues = 5

for hood in t_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # take the row(s) for this neighbourhood and transpose, so that every
    # column of t_grouped becomes one row of `temp`
    temp = t_grouped[t_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # drop the first row: it comes from the 'Neighbourhood' column, not a category
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    # show the highest frequencies first, limited to num_top_venues rows
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Quay, Island airport, King and Spadina, South Niagara, CN Tower, Railway Lands, Harbourfront West----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3             Plane  0.06
4       Coffee Shop  0.06


----Berczy Park----
            venue  freq
0     Coffee Shop  0.07
1    Cocktail Bar  0.05
2        Beer Bar  0.04
3      Steakhouse  0.04
4  Farmers Market  0.04


----Business Reply Mail Processing Centre 969 Eastern----
              venue  freq
0        Comic Shop  0.07
1     Auto Workshop  0.07
2  Recording Studio  0.07
3        Restaurant  0.07
4        Skate Park  0.07


----Cabbagetown, St. James Town----
                venue  freq
0         Coffee Shop  0.07
1  Italian Restaurant  0.05
2         Pizza Place  0.05
3                 Pub  0.05
4              Bakery  0.05


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.14
1  Italian Restaurant  0.06
2        Burger Joint  0.04


In [39]:
#function to sort in order of most popular

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, ignoring its first entry.

    The first element of `row` is skipped, the remaining values are sorted in
    descending order, and the index labels of the first `num_top_venues` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [40]:
#create dataframe for top 10 venues in each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 take their suffix from `indicators`, every later rank gets 'th';
    # an explicit bounds check replaces the former bare `except:` fallback
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = t_grouped['Neighbourhood']

# fill the venue columns of each row with that neighbourhood's top categories
for ind in np.arange(t_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(t_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Quay, Island airport, King and Spadin...",Airport Lounge,Airport Service,Airport Terminal,Boutique,Sculpture Garden,Bar,Coffee Shop,Boat or Ferry,Plane,Harbor / Marina
1,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Steakhouse,Bakery,Farmers Market,Seafood Restaurant,Cheese Shop,Café,Restaurant
2,Business Reply Mail Processing Centre 969 Eastern,Recording Studio,Auto Workshop,Brewery,Light Rail Station,Spa,Farmers Market,Fast Food Restaurant,Burrito Place,Restaurant,Skate Park
3,"Cabbagetown, St. James Town",Coffee Shop,Italian Restaurant,Café,Restaurant,Bakery,Pub,Chinese Restaurant,Pizza Place,Grocery Store,Pharmacy
4,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Juice Bar,Burger Joint,Café,Japanese Restaurant,Ice Cream Shop,Spa,Bar


## Using KMeans and DBSCAN to cluster neighborhoods

In [41]:
#Use silhouette score to determine best number of clusters
from sklearn.metrics import silhouette_score

# keep only the numeric category frequencies. `columns=` is used because the
# positional axis argument of drop() was deprecated in pandas 1.3 and removed in 2.0
t_grouped_clustering = t_grouped.drop(columns='Neighbourhood')

# fit k-means for k = 2..9 and report the silhouette score of each labelling
for i in range(2,10):
    kmeans = KMeans(n_clusters=i, random_state=0).fit(t_grouped_clustering)
    print('N.Clusters= {} -- Sil.Score {}'.format(i,silhouette_score(t_grouped_clustering,kmeans.labels_)))

N.Clusters= 2 -- Sil.Score 0.6400273731634717
N.Clusters= 3 -- Sil.Score 0.4045475267648364
N.Clusters= 4 -- Sil.Score 0.4188203302152573
N.Clusters= 5 -- Sil.Score 0.4120175219839336
N.Clusters= 6 -- Sil.Score 0.4043325486200782
N.Clusters= 7 -- Sil.Score 0.05402822115891504
N.Clusters= 8 -- Sil.Score 0.39795971617242126
N.Clusters= 9 -- Sil.Score 0.06700275726811665


It seems that the fewer the clusters, the higher the score. Let's try DBSCAN.

In [42]:
# DBSCAN with only metric and algorithm set explicitly; all other settings are left at their defaults
db = DBSCAN(metric="euclidean",algorithm='ball_tree')
db.fit(t_grouped_clustering)
# NOTE(review): len(db.labels_) is the number of labelled rows (44 in the
# recorded output, same as t_grouped), not the number of clusters found —
# confirm which of the two was meant to be reported here.
silhouette_score(t_grouped_clustering,db.labels_), len(db.labels_)

(0.47459087665430394, 44)

Through trial and error, I have decided to try 3, 5, and 7 clusters. Fewer than 3 clusters does not seem to be enough and 5 clusters seems reasonable, while 7 clusters could reveal narrower similarities between neighborhoods.

In [43]:
# run k-means clustering for each cluster number
kmeans3 = KMeans(n_clusters=3, random_state=0).fit(t_grouped_clustering)
kmeans5 = KMeans(n_clusters=5, random_state=0).fit(t_grouped_clustering)
kmeans7 = KMeans(n_clusters=7, random_state=0).fit(t_grouped_clustering)

#add each label group to DF
# insert() raises ValueError if the column already exists, so first drop any
# label columns left over from an earlier run of this cell (safe to re-run)
label_columns = ['Cluster3 Labels', 'Cluster5 Labels', 'Cluster7 Labels']
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns=label_columns, errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster3 Labels', kmeans3.labels_)
neighborhoods_venues_sorted.insert(0, 'Cluster5 Labels', kmeans5.labels_)
neighborhoods_venues_sorted.insert(0, 'Cluster7 Labels', kmeans7.labels_)

# attach the cluster labels and top venues to the downtown frame (which holds
# latitude/longitude), matching rows on the neighbourhood name
t_merged = dt_t.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

t_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster7 Labels,Cluster5 Labels,Cluster3 Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
35,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,6,0,1,Fast Food Restaurant,Pizza Place,Athletics & Sports,Bus Line,Breakfast Spot,Pet Store,Intersection,Bank,Pharmacy,Gym / Fitness Center
36,M4C,East York,Woodbine Heights,43.695344,-79.318389,6,0,1,Curling Ice,Pharmacy,Park,Beer Store,Video Store,Skating Rink,Cosmetics Shop,Dance Studio,Asian Restaurant,Yoga Studio
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,6,0,1,Pub,Health Food Store,Trail,Neighborhood,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Electronics Store,Department Store,Doner Restaurant
38,M4G,East York,Leaside,43.70906,-79.363452,6,0,1,Coffee Shop,Sporting Goods Shop,Burger Joint,Furniture / Home Store,Sushi Restaurant,Juice Bar,Shopping Mall,Bike Shop,Electronics Store,Bank
39,M4H,East York,Thorncliffe Park,43.705369,-79.349372,6,0,1,Indian Restaurant,Yoga Studio,Supermarket,Bank,Burger Joint,Coffee Shop,Discount Store,Fast Food Restaurant,Gas Station,Gym


In [44]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster7 Labels,Cluster5 Labels,Cluster3 Labels,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,6,0,1,"Bathurst Quay, Island airport, King and Spadin...",Airport Lounge,Airport Service,Airport Terminal,Boutique,Sculpture Garden,Bar,Coffee Shop,Boat or Ferry,Plane,Harbor / Marina
1,0,0,1,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Steakhouse,Bakery,Farmers Market,Seafood Restaurant,Cheese Shop,Café,Restaurant
2,6,0,1,Business Reply Mail Processing Centre 969 Eastern,Recording Studio,Auto Workshop,Brewery,Light Rail Station,Spa,Farmers Market,Fast Food Restaurant,Burrito Place,Restaurant,Skate Park
3,6,0,1,"Cabbagetown, St. James Town",Coffee Shop,Italian Restaurant,Café,Restaurant,Bakery,Pub,Chinese Restaurant,Pizza Place,Grocery Store,Pharmacy
4,0,0,1,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Juice Bar,Burger Joint,Café,Japanese Restaurant,Ice Cream Shop,Spa,Bar


Now we have all the data we need to map our different neighborhood clusters.

### Map of 3 clusters

In [45]:
# create map
map_clusters3 = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced cividis sample per cluster
n_clusters = 3
colors_array = cm.cividis(np.linspace(0, 1, n_clusters))
mako = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(t_merged['Latitude'], t_merged['Longitude'], t_merged['Neighbourhood'], t_merged['Cluster3 Labels']):
    # rows without a match in the join carry no label and cannot be coloured
    if pd.isna(cluster):
        continue
    cluster = int(cluster)  # labels turn into floats when the column contains NaN
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels are 0-based, so index the palette directly; `cluster-1` sent
    # cluster 0 to the last colour through negative indexing
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=mako[cluster],
        fill=True,
        fill_color=mako[cluster],
        fill_opacity=0.3).add_to(map_clusters3)

map_clusters3

This doesn't show many different kinds of neighborhoods. Let's try with more clusters.

In [47]:
# create map
map_clusters5 = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced cividis sample per cluster
n_clusters = 5
colors_array = cm.cividis(np.linspace(0, 1, n_clusters))
mako = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(t_merged['Latitude'], t_merged['Longitude'], t_merged['Neighbourhood'], t_merged['Cluster5 Labels']):
    # rows without a match in the join carry no label and cannot be coloured
    if pd.isna(cluster):
        continue
    cluster = int(cluster)  # labels turn into floats when the column contains NaN
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels are 0-based, so index the palette directly; `cluster-1` sent
    # cluster 0 to the last colour through negative indexing
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=mako[cluster],
        fill=True,
        fill_color=mako[cluster],
        fill_opacity=0.3).add_to(map_clusters5)

map_clusters5

There is still not much diversity among the neighborhoods; most seem to be in cluster 0.

In [48]:
# create map
map_clusters7 = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced Dark2 sample per cluster
n_clusters = 7
colors_array = cm.Dark2(np.linspace(0, 1, n_clusters))
mako = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(t_merged['Latitude'], t_merged['Longitude'], t_merged['Neighbourhood'], t_merged['Cluster7 Labels']):
    # rows without a match in the join carry no label and cannot be coloured
    if pd.isna(cluster):
        continue
    cluster = int(cluster)  # labels turn into floats when the column contains NaN
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels are 0-based, so index the palette directly; `cluster-1` sent
    # cluster 0 to the last colour through negative indexing
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=mako[cluster],
        fill=True,
        fill_color=mako[cluster],
        fill_opacity=0.3).add_to(map_clusters7)

map_clusters7

It looks like 7 clusters is starting to show some diversity (at least among the neighborhoods centered on downtown).
Let's examine each of the 7 clusters.

In [52]:
# rows labelled 0 in the 7-cluster run: column 1 plus every column from position 5 on, without the 3- and 5-cluster labels
t_merged.loc[t_merged['Cluster7 Labels'] == 0, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]].drop(['Cluster3 Labels','Cluster5 Labels'],axis=1)

Unnamed: 0,Borough,Cluster7 Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
41,East Toronto,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Restaurant,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Fruit & Vegetable Store,Pub,Pizza Place
49,Central Toronto,0,Coffee Shop,Pub,Pizza Place,Fried Chicken Joint,Liquor Store,Sports Bar,Supermarket,Sushi Restaurant,Restaurant,Light Rail Station
52,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Gastropub,Hotel,Burger Joint,Men's Store,Café
53,Downtown Toronto,0,Coffee Shop,Pub,Café,Park,Bakery,Restaurant,Mexican Restaurant,Hotel,Chocolate Shop,Ice Cream Shop
54,Downtown Toronto,0,Coffee Shop,Clothing Store,Cosmetics Shop,Bakery,Japanese Restaurant,Café,Ramen Restaurant,Tea Room,Electronics Store,Pizza Place
55,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Breakfast Spot,Cocktail Bar,Beer Bar,Cosmetics Shop,Bakery,Italian Restaurant
56,Downtown Toronto,0,Coffee Shop,Cocktail Bar,Beer Bar,Steakhouse,Bakery,Farmers Market,Seafood Restaurant,Cheese Shop,Café,Restaurant
57,Downtown Toronto,0,Coffee Shop,Italian Restaurant,Sandwich Place,Juice Bar,Burger Joint,Café,Japanese Restaurant,Ice Cream Shop,Spa,Bar
58,Downtown Toronto,0,Coffee Shop,Bar,Steakhouse,Café,Cosmetics Shop,Bakery,Asian Restaurant,Restaurant,Thai Restaurant,Gym
59,Downtown Toronto,0,Coffee Shop,Aquarium,Hotel,Café,Italian Restaurant,Brewery,Fried Chicken Joint,Restaurant,Scenic Lookout,Baseball Stadium


In [53]:
# rows labelled 1 in the 7-cluster run: column 1 plus every column from position 5 on, without the 3- and 5-cluster labels
t_merged.loc[t_merged['Cluster7 Labels'] == 1, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]].drop(['Cluster3 Labels','Cluster5 Labels'],axis=1)

Unnamed: 0,Borough,Cluster7 Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
63,Central Toronto,1,Garden,Yoga Studio,Department Store,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [54]:
# rows labelled 2 in the 7-cluster run: column 1 plus every column from position 5 on, without the 3- and 5-cluster labels
t_merged.loc[t_merged['Cluster7 Labels'] == 2, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]].drop(['Cluster3 Labels','Cluster5 Labels'],axis=1)

Unnamed: 0,Borough,Cluster7 Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
50,Downtown Toronto,2,Park,Playground,Trail,Deli / Bodega,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
64,Central Toronto,2,Park,Jewelry Store,Trail,Sushi Restaurant,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


In [55]:
# rows labelled 3 in the 7-cluster run: column 1 plus every column from position 5 on, without the 3- and 5-cluster labels
t_merged.loc[t_merged['Cluster7 Labels'] == 3, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]].drop(['Cluster3 Labels','Cluster5 Labels'],axis=1)

Unnamed: 0,Borough,Cluster7 Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
40,East York,3,Coffee Shop,Convenience Store,Park,Yoga Studio,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


In [56]:
# rows labelled 4 in the 7-cluster run: column 1 plus every column from position 5 on, without the 3- and 5-cluster labels
t_merged.loc[t_merged['Cluster7 Labels'] == 4, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]].drop(['Cluster3 Labels','Cluster5 Labels'],axis=1)

Unnamed: 0,Borough,Cluster7 Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
48,Central Toronto,4,Playground,Summer Camp,Farmers Market,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [57]:
# rows labelled 5 in the 7-cluster run: column 1 plus every column from position 5 on, without the 3- and 5-cluster labels
t_merged.loc[t_merged['Cluster7 Labels'] == 5, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]].drop(['Cluster3 Labels','Cluster5 Labels'],axis=1)

Unnamed: 0,Borough,Cluster7 Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
44,Central Toronto,5,Park,Swim School,Bus Line,Dim Sum Restaurant,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


In [58]:
# rows labelled 6 in the 7-cluster run: column 1 plus every column from position 5 on, without the 3- and 5-cluster labels
t_merged.loc[t_merged['Cluster7 Labels'] == 6, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]].drop(['Cluster3 Labels','Cluster5 Labels'],axis=1)

Unnamed: 0,Borough,Cluster7 Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
35,East York,6,Fast Food Restaurant,Pizza Place,Athletics & Sports,Bus Line,Breakfast Spot,Pet Store,Intersection,Bank,Pharmacy,Gym / Fitness Center
36,East York,6,Curling Ice,Pharmacy,Park,Beer Store,Video Store,Skating Rink,Cosmetics Shop,Dance Studio,Asian Restaurant,Yoga Studio
37,East Toronto,6,Pub,Health Food Store,Trail,Neighborhood,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Electronics Store,Department Store,Doner Restaurant
38,East York,6,Coffee Shop,Sporting Goods Shop,Burger Joint,Furniture / Home Store,Sushi Restaurant,Juice Bar,Shopping Mall,Bike Shop,Electronics Store,Bank
39,East York,6,Indian Restaurant,Yoga Studio,Supermarket,Bank,Burger Joint,Coffee Shop,Discount Store,Fast Food Restaurant,Gas Station,Gym
42,East Toronto,6,Park,Sandwich Place,Food & Drink Shop,Pet Store,Pub,Movie Theater,Burrito Place,Burger Joint,Brewery,Liquor Store
43,East Toronto,6,Café,Coffee Shop,Gastropub,Brewery,Italian Restaurant,Bakery,American Restaurant,Yoga Studio,Park,Seafood Restaurant
45,Central Toronto,6,Department Store,Dog Run,Sandwich Place,Gym,Breakfast Spot,Park,Hotel,Food & Drink Shop,Diner,Discount Store
46,Central Toronto,6,Coffee Shop,Sporting Goods Shop,Yoga Studio,Italian Restaurant,Salon / Barbershop,Restaurant,Park,Miscellaneous Shop,Mexican Restaurant,Fast Food Restaurant
47,Central Toronto,6,Sandwich Place,Dessert Shop,Pizza Place,Gym,Café,Coffee Shop,Sushi Restaurant,Italian Restaurant,Deli / Bodega,Indian Restaurant


Overall, the clustering provides some interesting results. What we can learn from this is that there are a TON of cafes, coffee shops, pubs, and bars in the downtown Toronto area!