<h1> Segmenting and Clustering Neighborhoods in Toronto

<h2> Part 1: Scraping Wikipedia page for Toronto Boroughs

Import all necessary packages

In [49]:
#!conda install -c conda-forge bs4 --yes
#!conda install -c conda-forge lxml --yes
#!conda install -c conda-forge requests --yes
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
from pandas.io.json import json_normalize

<h3> Creating the dataframe with BeautifulSoup4

In [3]:
#create variable for url
result = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

#create variable for the content of said url
src = result.content

#create soup item for the content
soup = BeautifulSoup(src, 'html.parser')

#since what we are looking for is the dataframe on the site we have to know what type of html class it is. you can inspect the page to find the html code that refers to the dataframe.
#it happens to be a table type, so we tell BeautifulSoup we want to find anything with the table tag
table = soup.find('table')

#now we need the table rows which are the 'tr' tag in html
table_rows = table.find_all('tr')

#create a list to store the values
tor_list = []

#for loop to find all 'td' tags as that is the data for the rows. If td has no text is is removed and not added to the list
for tr in table_rows:
    td = tr.find_all('td')
    if td:
        row = [i.text.strip() for i in td]
        tor_list.append(row)

#create dataframe from the list with the columns Postal Code, Borough, and Neighborhood(s)
tor_df = pd.DataFrame(data=tor_list, columns=['Postal Code', 'Borough', 'Neighborhood(s)'])

#remove all \n in the dataset
tor_df = tor_df.replace(r'\n', '', regex=True)

#replace all empty values with NaN
tor_df = tor_df.replace(r'^\s*$', np.NaN, regex=True)

#drop all rows with NaN in them
tor_df.dropna(axis=0, how='any', inplace=True)

#reset index of dataframe
tor_df.reset_index(drop=True, inplace=True)

#show head of dataframe
tor_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood(s)
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [4]:
#get size of dataframe
tor_df.shape

(103, 3)

<h3> Creating the dataframe with Pandas

In [5]:
#assign html to variable, set na_values to NaN, with no header
tor_df_pd = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', na_values='NAN', header=0)

#previous code imports all of the tables on the page, chose the first one
tor_df_pd = tor_df_pd[0]

#drop NaN valued rows in dataframe
tor_df_pd.dropna(axis=0, inplace=True)

#reset index of rows
tor_df_pd.reset_index(drop=True, inplace=True)

#print dataframe
tor_df_pd.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
#get the size of the dataframe
tor_df_pd.shape

(103, 3)

<h2> Part 2: Adding Longitude and Latitude to Dataframe

In [12]:
#import coordinates
coords = pd.read_csv('Geospatial_Coordinates.csv')

#merge the two datasets based on shared Postal Code column
merge_tor = pd.merge(left=tor_df, right=coords, left_on='Postal Code', right_on='Postal Code')

#print dataset
merge_tor.head()

Unnamed: 0,Postal Code,Borough,Neighborhood(s),Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


<h2>Part 3: Clustering Toronto Neighborhoods

<h5>Set geolocator to Toronto, ON

In [21]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


<h5>Create a map of Toronto through Folium

In [75]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(merge_tor['Latitude'], merge_tor['Longitude'], merge_tor['Borough'], merge_tor['Neighborhood(s)']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [38]:
merge_tor["Borough"].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

<h5>I subsetted the data based on Downtown Toronto as i felt like it would have the most points.

In [None]:
#subeset data based on what borough you want
dt_tor = merge_tor[merge_tor['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
dt_tor.head()

<h5>Change the geolocator to Downtown Toronto

In [25]:
address = 'Downtown Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Downtown Toronto are 43.6563221, -79.3809161.


<h5>Create map of Downtown Toronto

In [81]:
# create map of Manhattan using latitude and longitude values
map_dt_tor = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, label in zip(dt_tor['Latitude'], dt_tor['Longitude'], dt_tor['Neighborhood(s)']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dt_tor)  
    
map_dt_tor

<h3> Access Foursquare API for Downtown Toronto neighborhood data

In [27]:
#set up variables to be used later to parse Foursquare API
CLIENT_ID = 'XLJGJXBQZONSQSFRBCSXYPKHJMJRSCQDD5ZH5IVXV0ZKYWFK' # your Foursquare ID
CLIENT_SECRET = 'O1DLZTBHTGQZGIXRSLZX2IKKLBCDSFHRPIDXP5GFTXKOCRYX' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: XLJGJXBQZONSQSFRBCSXYPKHJMJRSCQDD5ZH5IVXV0ZKYWFK
CLIENT_SECRET:O1DLZTBHTGQZGIXRSLZX2IKKLBCDSFHRPIDXP5GFTXKOCRYX


<h5> Pick a neighborhood from the overall dataset, I chose UT because I like that school.

In [31]:
dt_tor.loc[11, 'Neighborhood(s)']

'University of Toronto, Harbord'

In [33]:
neighborhood_latitude = dt_tor.loc[11, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = dt_tor.loc[11, 'Longitude'] # neighborhood longitude value

neighborhood_name = dt_tor.loc[11, 'Neighborhood(s)'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of University of Toronto, Harbord are 43.6626956, -79.4000493.


<h5>Grab neighborhood data from Foursquare, Top 100 venues within 500 meters of UT

In [34]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=XLJGJXBQZONSQSFRBCSXYPKHJMJRSCQDD5ZH5IVXV0ZKYWFK&client_secret=O1DLZTBHTGQZGIXRSLZX2IKKLBCDSFHRPIDXP5GFTXKOCRYX&v=20180605&ll=43.6626956,-79.4000493&radius=500&limit=100'

<h5>Function that extracts the category of the venue

In [82]:
results = requests.get(url).json()

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

<h5>Create a dataframe from the JSON file

In [50]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Yasu,Japanese Restaurant,43.662837,-79.403217
1,Rasa,Restaurant,43.662757,-79.403988
2,The Dessert Kitchen,Dessert Shop,43.662823,-79.402746
3,Piano Piano,Italian Restaurant,43.662949,-79.402898
4,Her Father's Cider Bar + Kitchen,Beer Bar,43.662448,-79.404703


In [51]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

34 venues were returned by Foursquare.


<h5>Create a function to repeat the same process to all the neighborhoods in Downtown Toronto

In [52]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

<h5>Code to run the above function on each neighborhood and create a new dataframe for Downtown Toronto venues

In [54]:
dt_tor_venues = getNearbyVenues(names=dt_tor['Neighborhood(s)'],
                                   latitudes=dt_tor['Latitude'],
                                   longitudes=dt_tor['Longitude']
                                  )

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


In [55]:
print(dt_tor_venues.shape)
dt_tor_venues.head()

(1202, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


<h5>Group dataframe by neighborhood and get the counts

In [56]:
dt_tor_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,55,55,55,55,55,55
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",17,17,17,17,17,17
Central Bay Street,63,63,63,63,63,63
Christie,17,17,17,17,17,17
Church and Wellesley,74,74,74,74,74,74
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",100,100,100,100,100,100
"Kensington Market, Chinatown, Grange Park",55,55,55,55,55,55


<h3>Analyze the neighborhoods

<h5>create dummy variables for each type of venue and add them to a dataframe

In [57]:
# one hot encoding
dt_tor_onehot = pd.get_dummies(dt_tor_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
dt_tor_onehot['Neighborhood'] = dt_tor_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [dt_tor_onehot.columns[-1]] + list(dt_tor_onehot.columns[:-1])
dt_tor_onehot = dt_tor_onehot[fixed_columns]

dt_tor_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<h5>Change the measurement of the amount of each type of venue in each neighborhood to the average occurance

In [58]:
dt_tor_grouped = dt_tor_onehot.groupby('Neighborhood').mean().reset_index()
dt_tor_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.058824,0.058824,0.058824,0.117647,0.117647,0.117647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.015873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015873,0.0,0.015873,0.0,0.0,0.0,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.027027,0.013514,0.0,0.0,0.0,0.0,0.0,0.0,0.013514,...,0.013514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013514,0.0
5,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0
6,"First Canadian Place, Underground city",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0
7,"Garden District, Ryerson",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0
8,"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0
9,"Kensington Market, Chinatown, Grange Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.036364,0.0,0.0,0.054545,0.018182,0.0,0.0


<h3>Getting top venues for Downtown Toronto

In [59]:
num_top_venues = 5

for hood in dt_tor_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = dt_tor_grouped[dt_tor_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2            Beer Bar  0.04
3                Café  0.04
4  Seafood Restaurant  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3   Harbor / Marina  0.06
4               Bar  0.06


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.17
1  Italian Restaurant  0.08
2                Café  0.05
3      Sandwich Place  0.05
4     Thai Restaurant  0.03


----Christie----
                venue  freq
0       Grocery Store  0.24
1                Café  0.18
2                Park  0.12
3  Athletics & Sports  0.06
4          Baby Store  0.06


----Church and Wellesley----
                 venue  freq
0  Japanese Restaurant  0.07
1     Sushi Restaurant  0.07
2          Coffee Shop  0.07
3       

<h5>Create function that makes a dataframe of each neighborhood and its respective common venues from most common to least

In [60]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [64]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = dt_tor_grouped['Neighborhood']

for ind in np.arange(dt_tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dt_tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Restaurant,Seafood Restaurant,Beer Bar,Cheese Shop,Café,Irish Pub,Park
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Plane,Coffee Shop,Sculpture Garden,Boat or Ferry,Rental Car Location,Bar,Boutique
2,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Salad Place,Japanese Restaurant,Thai Restaurant,Ice Cream Shop,Burger Joint,Bubble Tea Shop
3,Christie,Grocery Store,Café,Park,Coffee Shop,Baby Store,Athletics & Sports,Italian Restaurant,Candy Store,Diner,Nightclub
4,Church and Wellesley,Japanese Restaurant,Sushi Restaurant,Coffee Shop,Restaurant,Yoga Studio,Hotel,Men's Store,Café,Mediterranean Restaurant,Pub


<h3>Cluster the neighborhoods of Downtown Toronto

In [86]:
# set number of clusters
kclusters = 5

dt_tor_grouped_clustering = dt_tor_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dt_tor_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 4, 2, 0, 0, 0, 0, 0, 0], dtype=int32)

In [87]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

dt_tor_merged = dt_tor

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
dt_tor_merged = dt_tor_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood(s)')

dt_tor_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood(s),Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Café,Theater,Ice Cream Shop,French Restaurant,Health Food Store
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4,Coffee Shop,Sushi Restaurant,Yoga Studio,Creperie,Smoothie Shop,Beer Bar,Sandwich Place,Burrito Place,Café,College Auditorium
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Cosmetics Shop,Bubble Tea Shop,Japanese Restaurant,Restaurant,Italian Restaurant,Café,Middle Eastern Restaurant,Diner
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Cocktail Bar,American Restaurant,Gastropub,Gym,Lingerie Store,Restaurant,Italian Restaurant,Seafood Restaurant
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Bakery,Restaurant,Seafood Restaurant,Beer Bar,Cheese Shop,Café,Irish Pub,Park


<h3>Create a map of the Downtown Toronto clusters

In [88]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dt_tor_merged['Latitude'], dt_tor_merged['Longitude'], dt_tor_merged['Neighborhood(s)'], dt_tor_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<h3>Explore each cluster

<h5>Cluster 0

In [96]:
dt_tor_merged.loc[dt_tor_merged['Cluster Labels'] == 0, dt_tor_merged.columns[[2] + list(range(5, dt_tor_merged.shape[1]))]]

Unnamed: 0,Neighborhood(s),Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Regent Park, Harbourfront",0,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Café,Theater,Ice Cream Shop,French Restaurant,Health Food Store
2,"Garden District, Ryerson",0,Clothing Store,Coffee Shop,Cosmetics Shop,Bubble Tea Shop,Japanese Restaurant,Restaurant,Italian Restaurant,Café,Middle Eastern Restaurant,Diner
3,St. James Town,0,Café,Coffee Shop,Cocktail Bar,American Restaurant,Gastropub,Gym,Lingerie Store,Restaurant,Italian Restaurant,Seafood Restaurant
4,Berczy Park,0,Coffee Shop,Cocktail Bar,Bakery,Restaurant,Seafood Restaurant,Beer Bar,Cheese Shop,Café,Irish Pub,Park
7,"Richmond, Adelaide, King",0,Coffee Shop,Café,Restaurant,Deli / Bodega,Gym,Hotel,Thai Restaurant,Clothing Store,Seafood Restaurant,Pizza Place
8,"Harbourfront East, Union Station, Toronto Islands",0,Coffee Shop,Aquarium,Hotel,Café,Scenic Lookout,Sporting Goods Shop,Brewery,Restaurant,Italian Restaurant,Fried Chicken Joint
9,"Toronto Dominion Centre, Design Exchange",0,Coffee Shop,Café,Hotel,Restaurant,Salad Place,Seafood Restaurant,Japanese Restaurant,Italian Restaurant,American Restaurant,Beer Bar
10,"Commerce Court, Victoria Hotel",0,Coffee Shop,Café,Restaurant,Hotel,Gym,American Restaurant,Italian Restaurant,Deli / Bodega,Seafood Restaurant,Japanese Restaurant
11,"University of Toronto, Harbord",0,Café,Restaurant,Bar,Italian Restaurant,Japanese Restaurant,Bookstore,Bakery,Yoga Studio,Beer Bar,Sandwich Place
12,"Kensington Market, Chinatown, Grange Park",0,Café,Bakery,Vietnamese Restaurant,Mexican Restaurant,Coffee Shop,Dessert Shop,Vegetarian / Vegan Restaurant,Gaming Cafe,Bar,Belgian Restaurant


<h5>Cluster 1

In [97]:
dt_tor_merged.loc[dt_tor_merged['Cluster Labels'] == 1, dt_tor_merged.columns[[2] + list(range(5, dt_tor_merged.shape[1]))]]

Unnamed: 0,Neighborhood(s),Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Rosedale,1,Park,Playground,Trail,Cupcake Shop,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner


<h5>Cluster 2

In [98]:
dt_tor_merged.loc[dt_tor_merged['Cluster Labels'] == 2, dt_tor_merged.columns[[2] + list(range(5, dt_tor_merged.shape[1]))]]

Unnamed: 0,Neighborhood(s),Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Christie,2,Grocery Store,Café,Park,Coffee Shop,Baby Store,Athletics & Sports,Italian Restaurant,Candy Store,Diner,Nightclub


<h5>Cluster 3

In [99]:
dt_tor_merged.loc[dt_tor_merged['Cluster Labels'] == 3, dt_tor_merged.columns[[2] + list(range(5, dt_tor_merged.shape[1]))]]

Unnamed: 0,Neighborhood(s),Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,"CN Tower, King and Spadina, Railway Lands, Har...",3,Airport Lounge,Airport Service,Airport Terminal,Plane,Coffee Shop,Sculpture Garden,Boat or Ferry,Rental Car Location,Bar,Boutique


<h5>Cluster 4

In [100]:
dt_tor_merged.loc[dt_tor_merged['Cluster Labels'] == 4, dt_tor_merged.columns[[2] + list(range(5, dt_tor_merged.shape[1]))]]

Unnamed: 0,Neighborhood(s),Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,"Queen's Park, Ontario Provincial Government",4,Coffee Shop,Sushi Restaurant,Yoga Studio,Creperie,Smoothie Shop,Beer Bar,Sandwich Place,Burrito Place,Café,College Auditorium
5,Central Bay Street,4,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Salad Place,Japanese Restaurant,Thai Restaurant,Ice Cream Shop,Burger Joint,Bubble Tea Shop
