
Import everything we need before we start

In [2]:
# import libraries

# time for performance measures
import time

# numpy for data vectors
import numpy as np

# pandas for data analysis
import pandas as pd
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

from collections import Counter

# transform json into pandas dataframe
from pandas.io.json import json_normalize

# json
import json

# requests

!pip install lxml

import requests
import urllib
import lxml.html




Part 1: Postal Code Data
We are using a wikipedia table of postal codes in Toronto: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [3]:
# Download the Wikipedia page of Toronto ("M") postal codes and load its first table.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)  # bounded wait so a stalled connection cannot hang the notebook
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page as data
html = response.content
df = pd.read_html(html)[0]  # take the first table found in the page
df.columns = ['PostalCode', 'Borough', 'Neighborhood']  # rename columns

In [18]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [4]:
# Exclude postal codes whose borough is 'Not assigned'. The .copy() makes the result an
# independent frame, so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()

# Where the neighborhood is missing (NaN) or the literal 'Not assigned', use the borough instead.
neighborhood_missing = df['Neighborhood'].isna() | (df['Neighborhood'] == 'Not assigned')
df['Neighborhood'] = np.where(neighborhood_missing, df['Borough'], df['Neighborhood'])

In [5]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Collapse to one row per postal code, collecting the distinct borough and
# neighborhood values seen for each code into sets.
df = df.groupby('PostalCode').agg({
    'Borough': set,
    'Neighborhood': set
    }).reset_index()

# report how many postal codes carry 1, 2, ... distinct boroughs
borough_count = Counter(len(names) for names in df['Borough'])
for k, v in borough_count.items():
    print('%i postal code(s) with %i borough(s) each.' % (v, k))
print()

# report how many postal codes carry 1, 2, ... distinct neighborhoods
neighbor_count = Counter(len(names) for names in df['Neighborhood'])
for k, v in neighbor_count.items():
    print('%i postal code(s) with %i neighborhood(s) each.' % (v, k))
print()

# Turn each set back into a delimited string (e.g. 'Regent Park, Harbourfront').
# Sets have no stable iteration order, so sort first to get the same string on every run.
df['Borough'] = df['Borough'].apply(lambda names: ', '.join(sorted(names)))
df['Neighborhood'] = df['Neighborhood'].apply(lambda names: ', '.join(sorted(names)))

103 postal code(s) with 1 borough(s) each.

103 postal code(s) with 1 neighborhood(s) each.



In [7]:
# remember how many postal codes remain; later cells compare the geocoder results against it
pc_count = len(df.index)
print(pc_count, 'rows')

103 rows


Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

In [9]:
# import geocoder to get lat, long of neighborhoods
try:
    import geocoder
except (ImportError, ModuleNotFoundError): #install only if necessary
    !conda install -c conda-forge geocoder --yes 
    print('geocoder installed.')
    import geocoder
print('geocoder imported.')

geocoder imported.


In [10]:
# Look up latitude/longitude for every postal code with geocoder's Google provider.

# stop conditions, so a failing service cannot make the lookup retry forever
calls = 0
call_limit = 2500
t1 = time.perf_counter()
t2 = 0.0
t_limit = 60.0
error = None

geo_data = []
# loop through all postal codes
for postal_code in df['PostalCode']:

    # no coordinates yet for this postal code
    lat_lng_coords = None

    # retry until the coordinates arrive or one of the stop conditions is met
    while lat_lng_coords is None and not error:
        # query the postal code of the current iteration (not a fixed one)
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        calls += 1
        t2 = time.perf_counter()
        if str(g)[1:17] == '[REQUEST_DENIED]':  # stop if request denied
            error = 'Geocoder request denied. '
        if calls == call_limit:  # stop if call limit reached
            error = 'Geocoder known call limit (%i) reached. ' % call_limit
        if t2 - t1 > t_limit:  # stop if time limit exceeded
            error = 'Time limit (%0.2fs) exceeded. ' % t_limit

    if error:  # if any stop condition was reached, stop looping postal codes
        print(error)
        break

    # the while loop only ends without an error once coordinates were returned
    geo_data.append({
        'PostalCode': postal_code,
        'Latitude': lat_lng_coords[0],
        'Longitude': lat_lng_coords[1]})

# convert to dataframe; explicit columns keep the schema even when nothing was found
df_geo = pd.DataFrame(geo_data, columns=['PostalCode', 'Latitude', 'Longitude'])

coord_count = df_geo.shape[0]
print('Geocoder found %i out of %i postal code coordinates.' % (coord_count, pc_count))
print('Geocoder made %i calls calls over %0.2fs.' % (calls, t2-t1))

Geocoder request denied. 
Geocoder found 0 out of 103 postal code coordinates.
Geocoder made 1 calls calls over 0.09s.


If the geocoder doesn't work, I'll just read the csv and merge it to my postal code dataframe

In [11]:
# Fallback: when the geocoder did not return coordinates for every postal code,
# replace df_geo with the coordinates read from the published CSV file.
if coord_count != pc_count:
    # read csv
    df_geo = pd.read_csv('https://cocl.us/Geospatial_data')
    # rename the columns so the key column is called 'PostalCode', as in df
    # NOTE(review): assumes the CSV has exactly these three columns in this order - confirm
    df_geo.columns=('PostalCode','Latitude','Longitude')

In [13]:
# attach the coordinates to the postal code table; no key is given, so the
# merge joins on the column names that the two frames have in common
df = df.merge(df_geo)
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


You can decide to work with only boroughs that contain the word Toronto.

I'll use str.contains to slice the dataframe and create a new dataframe.

In [16]:
# nominatim to convert an address into latitude and longitude values
try:
    from geopy.geocoders import Nominatim 
except (ImportError, ModuleNotFoundError): #install only if necessary
    !conda install -c conda-forge geopy --yes 
    from geopy.geocoders import Nominatim

# Matplotlib for plotting
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as py

# import k-means from clustering stage
from sklearn.cluster import KMeans

# folium for visualizing maps
try:
    import folium
except (ImportError, ModuleNotFoundError): #install only if necessary
    !conda install -c conda-forge folium=0.5.0 --yes
    import folium # plotting library

In [17]:
# keep the rows whose borough name contains 'Toronto' and renumber them from 0
toronto_data = df.loc[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
# show how many postal codes each of the remaining boroughs has
print(toronto_data['Borough'].value_counts())
toronto_data.head()

Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Use geopy library to get the latitude and longitude values of Toronto.

In [18]:
# Resolve the city's own coordinates; the maps below are centred on them.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="explorer") #define a user_agent.
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; fail with a clear message here
    # rather than with an AttributeError on the attribute access below
    raise ValueError('Nominatim returned no result for %r' % address)
latitude = location.latitude
longitude = location.longitude
print('The coordinates of %s are %0.6f %0.6f.' % (address, latitude, longitude))

The coordinates of Toronto, Ontario are 43.653482 -79.383935.


Create the map with folium, labeling each neighborhood

In [30]:
# create a map centred on the coordinates found for Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add one circle marker per postal code, with a popup naming it
for postalcode, borough, neighborhood, lat, lng in zip(toronto_data['PostalCode'],
                                                       toronto_data['Borough'],
                                                       toronto_data['Neighborhood'],
                                                       toronto_data['Latitude'],
                                                       toronto_data['Longitude']):
    label = '%s, (%s), %s' % (postalcode, neighborhood, borough)
    # parse_html is an option of the popup, so it is set here and not on the marker
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

Use a function to get the top venues within a radius of each postal code in Toronto

In [46]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=10):
    """Collect the venues Foursquare's ``venues/explore`` endpoint returns around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated together with ``zip``; each triple is one origin label and
        the coordinates to search around.
    radius : int, default 500
        Forwarded as the ``radius`` query parameter.
    limit : int, default 10
        Forwarded as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Origin', 'Origin Latitude',
        'Origin Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. Empty (but with these columns) when no venue is found
        or no location is given.

    Raises
    ------
    KeyError
        If FOURSQUARE_CLIENT_ID or FOURSQUARE_CLIENT_SECRET is not set in the
        environment, or the response lacks the expected keys.
    requests.HTTPError
        If the API answers with an HTTP error status.
    IndexError
        If the response has no groups or a venue has no categories.
    """
    # local import so this cell does not depend on the notebook's import cell
    import os

    # credentials come from the environment; they must never be written into the notebook
    client_id = os.environ['FOURSQUARE_CLIENT_ID']
    client_secret = os.environ['FOURSQUARE_CLIENT_SECRET']

    columns = ['Origin',
               'Origin Latitude',
               'Origin Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and encode the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': client_id,
                'client_secret': client_secret,
                'v': '20180604',
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,  # bounded wait so one stalled request cannot hang the loop
        )
        response.raise_for_status()  # surface HTTP errors instead of a KeyError further down
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        rows.extend(
            (name,
             lat,
             lng,
             v['venue']['name'],
             v['venue']['location']['lat'],
             v['venue']['location']['lng'],
             v['venue']['categories'][0]['name'])  # first listed category only
            for v in results)

    # passing the columns here keeps the schema intact even when there are no rows
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)


Now run the above function on each neighborhood and create a new dataframe

In [47]:
# query the venues around every Toronto postal code, labelling each result with its postal code
toronto_venues = getNearbyVenues(
    toronto_data['PostalCode'],
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    radius=500,
    limit=100,
)



M4E
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6G
M6H
M6J
M6K
M6P
M6R
M6S
M7A
M7Y


In [48]:
print(toronto_venues.shape)
toronto_venues.head()

(1606, 7)


Unnamed: 0,Origin,Origin Latitude,Origin Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop


Hmm, Neighborhood doesn't seem like it should be a venue category. Let's remove it.

In [49]:
# discard the venues categorised as 'Neighborhood' and renumber the remaining rows from 0
toronto_venues = toronto_venues.loc[
    toronto_venues['Venue Category'].ne('Neighborhood')
].reset_index(drop=True)
toronto_venues.head()

Unnamed: 0,Origin,Origin Latitude,Origin Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4K,43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop
4,M4K,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [50]:
# Count the venues that were returned for each postal code.
toronto_venues.groupby('Origin')[['Venue']].count()

Unnamed: 0_level_0,Venue
Origin,Unnamed: 1_level_1
M4E,3
M4K,42
M4L,22
M4M,39
M4N,3
M4P,8
M4R,21
M4S,38
M4T,2
M4V,16


Let's find out how many unique categories can be curated from all the returned venues

In [51]:
# report the number of distinct categories, then list how often each one occurs
venue_categories = toronto_venues['Venue Category']
print('There are %i uniques categories across %i venues.' % (
    len(venue_categories.unique()),
    len(toronto_venues.index)))

venue_categories.value_counts()

There are 233 uniques categories across 1602 venues.


Coffee Shop              143
Café                      91
Restaurant                57
Italian Restaurant        42
Park                      35
                        ... 
Hospital                   1
Stationery Store           1
Ethiopian Restaurant       1
Bus Line                   1
College Arts Building      1
Name: Venue Category, Length: 233, dtype: int64

In [53]:
# Some of these categories could be grouped into broader categories, like 'Restaurant'.
# They are grouped roughly by keeping the first and the last word of each category,
# e.g. 'Chinese Restaurant' becomes 'Chinese' and 'Restaurant'. A one-word category
# yields the same word twice.
category_words = toronto_venues['Venue Category'].str.split()  # split words

# one [first word, last word] pair per venue, on the same index as the venues
categories = pd.Series(
    [[words[0], words[-1]] for words in category_words],
    index=category_words.index)

# spread each pair over two columns, then stack the columns so every word gets its own row
categories = categories.apply(pd.Series).stack()
categories.index = categories.index.droplevel(1)  # remove the index level created by stack
categories.name = 'Categories'  # merge requires a named series or a dataframe

# join every word back to its venue through the shared index
grouped_categories = pd.merge(toronto_venues, categories, left_index=True, right_index=True)
grouped_categories.head()

Unnamed: 0,Origin,Origin Latitude,Origin Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Categories
0,M4E,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail,Trail
0,M4E,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail,Trail
1,M4E,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store,Health
1,M4E,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store,Store
2,M4E,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub,Pub


We can see that the grouped categories contain some larger groups, such as 'Restaurant', 'Shop', 'Café', 'Bar' and 'Store'. 'Coffee Shop', however, was already consistently classified and did not benefit from the grouping.

In [54]:
grouped_categories['Categories'].value_counts()

Restaurant    441
Shop          261
Café          182
Coffee        143
Bar           111
             ... 
Site            1
Home            1
Gas             1
Other           1
Boat            1
Name: Categories, Length: 257, dtype: int64

In [56]:

grouped_categories.groupby(['Categories','Venue Category']).size()

Categories   Venue Category       
Accessories  Accessories Store         1
Afghan       Afghan Restaurant         1
Airport      Airport                   2
             Airport Food Court        1
             Airport Gate              1
                                      ..
Vietnamese   Vietnamese Restaurant     7
Wine         Wine Bar                  8
Women's      Women's Store             1
Workshop     Auto Workshop             1
Yoga         Yoga Studio              12
Length: 421, dtype: int64


Analyze Each Neighborhood

The rest follows the New York lab pretty closely, except that my unit is postal code instead of neighborhood.

In [57]:
# one indicator column per grouped category word
toronto_onehot = pd.get_dummies(grouped_categories[['Categories']], prefix="", prefix_sep="")

# append the postal code of each row as a new last column
toronto_onehot['PostalCode'] = grouped_categories['Origin']

# reorder so that the last column (the postal code) comes first
*leading_columns, last_column = toronto_onehot.columns
fixed_columns = [last_column] + leading_columns
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,PostalCode,Accessories,Afghan,Airport,American,Antique,Aquarium,Area,Art,Arts,...,Travel,Truck,Vegetarian,Venue,Video,Vietnamese,Wine,Women's,Workshop,Yoga
0,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
toronto_onehot.shape

(3204, 258)

Next, let's group rows by postal code, taking the mean of the frequency of occurrence of each category

In [59]:
# average the indicator columns per postal code, keeping the postal code as a regular column
toronto_grouped = toronto_onehot.groupby('PostalCode', as_index=False).mean()
toronto_grouped

Unnamed: 0,PostalCode,Accessories,Afghan,Airport,American,Antique,Aquarium,Area,Art,Arts,...,Travel,Truck,Vegetarian,Venue,Video,Vietnamese,Wine,Women's,Workshop,Yoga
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.011905,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011905
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.025641,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,0.0,0.012821
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381
7,M4S,0.0,0.0,0.0,0.0,0.0,0.0,0.013158,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0


In [60]:
toronto_grouped.shape

(39, 258)

Let's put each postal code with the top venues in a pandas dataframe
First, let's write a function to sort the venues in descending order.

In [61]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, ignoring its first entry.

    The first entry of ``row`` is skipped, the remaining values are sorted in
    descending order, and the index labels of the first ``num_top_venues`` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

In [62]:
# Build a dataframe holding the top 10 venue categories for each postal code.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    # the first three ranks take 'st', 'nd', 'rd'; every later rank takes 'th'
    # (an explicit test, so that no unrelated error is swallowed by an except clause)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per postal code
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['PostalCode'] = toronto_grouped['PostalCode']

# fill the rank columns of each row from the matching row of toronto_grouped
for ind in np.arange(toronto_grouped.shape[0]):
    venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Pub,Trail,Store,Health,Falafel,Eastern,Electronics,Entertainment,Ethiopian,Event
1,M4K,Restaurant,Shop,Greek,Store,Italian,Coffee,Trail,Pub,Lounge,Furniture
2,M4L,Restaurant,Park,Shop,Place,Pub,Steakhouse,Brewery,Sandwich,Store,Fast
3,M4M,Restaurant,Café,Shop,Bakery,Store,Bar,Brewery,Gastropub,Coffee,Park
4,M4N,Park,Line,Swim,School,Bus,Ethiopian,Eastern,Electronics,Entertainment,Yoga


Cluster Neighborhoods
Run k-means to group the postal codes into 5 clusters

In [63]:
# set number of clusters
kclusters = 5

# cluster on the category frequencies only; the keyword form of drop() works on
# current pandas, which no longer accepts the axis as a positional argument
toronto_grouped_clustering = toronto_grouped.drop(columns='PostalCode')

# run k-means clustering; random_state and n_init are fixed so that a re-run gives
# the same labels and does not depend on the installed scikit-learn's n_init default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for the first rows in the dataframe
kmeans.labels_[0:10]

array([0, 3, 3, 3, 1, 1, 3, 3, 2, 3], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [64]:
# add clustering labels as the first column; dropping a column left over from an
# earlier run first makes this cell safe to re-run (insert() raises on an existing name)
venues_sorted = venues_sorted.drop(columns='Cluster Labels', errors='ignore')
venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join venues_sorted onto toronto_data by postal code, so that every postal code
# carries its borough, neighborhood and coordinates next to its cluster and top venues
toronto_merged = toronto_data.join(venues_sorted.set_index('PostalCode'), on='PostalCode')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Pub,Trail,Store,Health,Falafel,Eastern,Electronics,Entertainment,Ethiopian,Event
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,3,Restaurant,Shop,Greek,Store,Italian,Coffee,Trail,Pub,Lounge,Furniture
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,3,Restaurant,Park,Shop,Place,Pub,Steakhouse,Brewery,Sandwich,Store,Fast
3,M4M,East Toronto,Studio District,43.659526,-79.340923,3,Restaurant,Café,Shop,Bakery,Store,Bar,Brewery,Gastropub,Coffee,Park
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1,Park,Line,Swim,School,Bus,Ethiopian,Eastern,Electronics,Entertainment,Yoga


In [65]:
# start from an empty map centred on Toronto
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# build a palette of hex colours by sampling the Set1 colour map at 9 points
x = np.arange(kclusters)
colors_array = cm.Set1(np.linspace(0, 1, 9)) # Set1 color map has 9 colors
color_map = list(map(colors.rgb2hex, colors_array))

# columns read for each marker, in the order in which the loop unpacks them
markers_colors = []
marker_columns = [
    'Latitude',
    'Longitude',
    'PostalCode',
    'Borough',
    'Neighborhood',
    'Cluster Labels',
    '1st Most Common Venue',
    '2nd Most Common Venue',
]

# draw one circle per postal code, coloured by its cluster
for lat, lon, postalcode, borough, neighbor, cluster, first, second in zip(
        *(toronto_merged[column] for column in marker_columns)):
    cluster_color = color_map[cluster%9]
    # the popup shows the cluster number counted from 1
    label = folium.Popup(
        '%s (%s: %s). \nCluster %i. \n%s and %s' % (postalcode, borough, neighbor, cluster+1, first, second),
        parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7)
    marker.add_to(map_clusters)

map_clusters


5. Examine Clusters

In [66]:
#Cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Pub,Trail,Store,Health,Falafel,Eastern,Electronics,Entertainment,Ethiopian,Event


In [67]:
#Cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1,Park,Line,Swim,School,Bus,Ethiopian,Eastern,Electronics,Entertainment,Yoga
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,Hotel,Store,Gym,Park,Food,Department,Place,Sandwich,Shop,Breakfast
23,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307,1,Park,Trail,Store,Sushi,Restaurant,Jewelry,Ethiopian,Eastern,Electronics,Entertainment
27,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442,1,Airport,Service,Bar,Boutique,Lounge,Terminal,Shop,Marina,Boat,Location
30,M6G,Downtown Toronto,Christie,43.669542,-79.422564,1,Store,Café,Grocery,Park,Restaurant,Nightclub,Diner,Candy,Shop,Baby
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,1,Bakery,Pharmacy,Bar,Park,Bank,Café,Supermarket,Pool,Brewery,Restaurant
38,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558,1,Restaurant,Garden,Park,Place,Spa,Brewery,Studio,Burrito,Skate,Shop


In [68]:
#Cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,2,Playground,Park,Falafel,Eastern,Electronics,Entertainment,Ethiopian,Event,Yoga,Doner
10,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,2,Park,Playground,Trail,Yoga,Event,Eastern,Electronics,Entertainment,Ethiopian,Falafel


In [69]:
#Cluster 4
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,3,Restaurant,Shop,Greek,Store,Italian,Coffee,Trail,Pub,Lounge,Furniture
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,3,Restaurant,Park,Shop,Place,Pub,Steakhouse,Brewery,Sandwich,Store,Fast
3,M4M,East Toronto,Studio District,43.659526,-79.340923,3,Restaurant,Café,Shop,Bakery,Store,Bar,Brewery,Gastropub,Coffee,Park
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,3,Restaurant,Shop,Store,Clothing,Café,Diner,Spa,Coffee,Park,Rental
7,M4S,Central Toronto,Davisville,43.704324,-79.38879,3,Restaurant,Shop,Place,Gym,Pizza,Café,Sandwich,Dessert,Park,Italian
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,3,Restaurant,Pub,Shop,Supermarket,Coffee,Bank,Bagel,Pizza,Liquor,Fried
11,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,3,Restaurant,Store,Café,Pub,Park,Shop,Pharmacy,Place,Bakery,Market
12,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,3,Restaurant,Shop,Coffee,Bar,Store,Japanese,Sushi,Pub,Hotel,Gastropub
13,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3,Shop,Coffee,Restaurant,Bakery,Park,Pub,Store,Café,Theater,Hotel
14,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,3,Restaurant,Shop,Store,Clothing,Coffee,Café,Theater,Hotel,Diner,Bookstore


In [70]:
#Cluster 5
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,M5N,Central Toronto,Roselawn,43.711695,-79.416936,4,Garden,Service,Home,Event,Donut,Eastern,Electronics,Entertainment,Ethiopian,Falafel
