In [2]:
import json # library to handle JSON files
import os # read API credentials from environment variables
import time

import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## Creating a dataframe of Toronto neighborhoods from Wikipedia

In [3]:
html_string='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
# pd.read_html returns one DataFrame per HTML table found at the URL;
# index 0 selects the first table, which is used as the postal-code table here.
df = pd.read_html(html_string)[0]
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [5]:
# Keep only the postal codes whose Borough is something other than 'Not assigned',
# and renumber the surviving rows from 0.
has_borough = df['Borough'].ne('Not assigned')
df1 = df.loc[has_borough].reset_index(drop=True)
df1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [65]:
# Turn the " /" separator between neighborhood names into "," (e.g. "A / B" -> "A, B").
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on a column selection: that chained in-place form is not guaranteed to write through to
# df1 (it is a no-op under pandas Copy-on-Write). Re-running this cell is harmless.
df1['Neighborhood'] = df1['Neighborhood'].replace({' /': ','}, regex=True)

# Show fully duplicated rows as the cell output; an empty frame means there are none.
df1[df1.duplicated()]

In [67]:
df1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [66]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Postal code   103 non-null    object
 1   Borough       103 non-null    object
 2   Neighborhood  103 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [220]:
df1[df1['Neighborhood'].duplicated()] #There're duplicated Neighborhoods with different Postal code!

Unnamed: 0,Postal code,Borough,Neighborhood
13,M3C,North York,Don Mills
46,M3L,North York,Downsview
53,M3M,North York,Downsview
60,M3N,North York,Downsview
72,M2R,North York,Willowdale


In [277]:
# (rows, columns) of the cleaned table. The previous bare `df1.reset_index()` call was
# dropped: its result was never assigned, so it had no effect on df1.
df1.shape

(103, 3)

## Extracting latitude and longitude from postal codes in Toronto

### API key for Google Maps (to hide)

In [2]:
# Google Maps Geocoding API key. Read from the environment so the secret never has to be
# typed into (or committed with) the notebook; falls back to the empty string as before.
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', '')

In [83]:
# Geocode every postal code in df1 with the Google Maps Geocoding API.
# `address` ends up holding one {"latitude", "longitude"} dict per row of df1, in row order;
# entries are None when the API returns no match.
GEOCODE_ENDPOINT = "https://maps.googleapis.com/maps/api/geocode/json"

address = []

for postal_code in df1.iloc[:, 0]:  # column 0 is the postal code
    complete_address = '{}, Toronto, Ontario'.format(postal_code)
    print(complete_address)

    # Let requests build the query string so spaces and commas are URL-encoded.
    query = {'address': complete_address}
    if API_KEY:  # an empty key is treated the same as "no key"
        query['key'] = API_KEY

    results = requests.get(GEOCODE_ENDPOINT, params=query, timeout=30)
    results = results.json()
    time.sleep(1)  # stay well below the API rate limit

    # Error payloads may lack the 'results' key entirely; treat that like "no match".
    matches = results.get('results') or []
    if not matches:
        output = {
            "latitude": None,
            "longitude": None,
        }
    else:
        coords = matches[0].get('geometry', {}).get('location', {})
        output = {
            "latitude": coords.get('lat'),
            "longitude": coords.get('lng'),
        }

    address.append(output)
  

M3A, Toronto, Ontario
M4A, Toronto, Ontario
M5A, Toronto, Ontario
M6A, Toronto, Ontario
M7A, Toronto, Ontario
M9A, Toronto, Ontario
M1B, Toronto, Ontario
M3B, Toronto, Ontario
M4B, Toronto, Ontario
M5B, Toronto, Ontario
M6B, Toronto, Ontario
M9B, Toronto, Ontario
M1C, Toronto, Ontario
M3C, Toronto, Ontario
M4C, Toronto, Ontario
M5C, Toronto, Ontario
M6C, Toronto, Ontario
M9C, Toronto, Ontario
M1E, Toronto, Ontario
M4E, Toronto, Ontario
M5E, Toronto, Ontario
M6E, Toronto, Ontario
M1G, Toronto, Ontario
M4G, Toronto, Ontario
M5G, Toronto, Ontario
M6G, Toronto, Ontario
M1H, Toronto, Ontario
M2H, Toronto, Ontario
M3H, Toronto, Ontario
M4H, Toronto, Ontario
M5H, Toronto, Ontario
M6H, Toronto, Ontario
M1J, Toronto, Ontario
M2J, Toronto, Ontario
M3J, Toronto, Ontario
M4J, Toronto, Ontario
M5J, Toronto, Ontario
M6J, Toronto, Ontario
M1K, Toronto, Ontario
M2K, Toronto, Ontario
M3K, Toronto, Ontario
M4K, Toronto, Ontario
M5K, Toronto, Ontario
M6K, Toronto, Ontario
M1L, Toronto, Ontario
M2L, Toron

In [84]:
# `address` is the list of {"latitude", "longitude"} dicts built by the geocoding loop,
# one per row of df1. Giving the coordinate frame df1's index makes the row-for-row pairing
# explicit and raises immediately if the two lengths ever differ.
# (The former `.apply(pd.Series)` was a no-op on an already-built DataFrame.)
coordinates = pd.DataFrame(address, index=df1.index)
df2 = pd.concat([df1, coordinates], axis=1)
df2.head()

Unnamed: 0,Postal code,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [86]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Postal code   103 non-null    object 
 1   Borough       103 non-null    object 
 2   Neighborhood  103 non-null    object 
 3   latitude      103 non-null    float64
 4   longitude     103 non-null    float64
dtypes: float64(2), object(3)
memory usage: 4.1+ KB


In [221]:
df2[df2['Neighborhood'].duplicated()] #Duplicated are different locations but same neighborhood

Unnamed: 0,Postal code,Borough,Neighborhood,latitude,longitude
13,M3C,North York,Don Mills,43.7259,-79.340923
46,M3L,North York,Downsview,43.739015,-79.506944
53,M3M,North York,Downsview,43.728496,-79.495697
60,M3N,North York,Downsview,43.761631,-79.520999
72,M2R,North York,Willowdale,43.782736,-79.442259


## Explore and cluster the neighborhoods in Toronto

In [89]:
neighborhoods=df2.copy()

In [297]:
len(neighborhoods['Postal code'].unique())

103

In [298]:
len(neighborhoods['Neighborhood'].unique())

98

In [299]:
# Number of distinct boroughs, and number of rows (one row per postal code).
borough_count = len(neighborhoods['Borough'].unique())
row_count = neighborhoods.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods (for the same postal code too).'.format(borough_count, row_count))

The dataframe has 10 boroughs and 103 neighborhoods (for the same postal code too).


In [91]:
# Look up the coordinates of Toronto itself; they are used to centre the maps below.
# NOTE: this rebinds `address`, which the geocoding cell earlier used for its list of
# per-postal-code coordinates.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next line.
    raise ValueError('Nominatim could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [96]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# The geocoding step stores None for postal codes it could not resolve; skip those rows
# because a marker cannot be placed at a missing coordinate.
located = neighborhoods.dropna(subset=['latitude', 'longitude'])

# add one circle marker per postal code, with "<neighborhood>, <borough>" as the popup text
for lat, lng, borough, neighborhood in zip(located['latitude'], located['longitude'], located['Borough'], located['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html belongs to Popup (above), so it is no longer passed to CircleMarker
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

### Foursquare credentials (to hide)

In [1]:
# Foursquare credentials are read from the environment so they are never stored in the
# notebook; each falls back to the empty string, matching the previous placeholders.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = os.environ.get('FOURSQUARE_VERSION', '') # Foursquare API version

In [300]:
def getNearbyVenues(postal_codes, names, latitudes, longitudes, radius=500, limit=None):
    """Query Foursquare's venues/explore endpoint around each location.

    Parameters
    ----------
    postal_codes, names, latitudes, longitudes : iterables of equal length
        One entry per location; they are walked in parallel with ``zip``.
    radius : number, default 500
        Value sent as the ``radius`` query parameter.
    limit : int, optional
        Value sent as the ``limit`` query parameter. When omitted, the notebook-level
        ``LIMIT`` global is used if it exists, otherwise 100.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postal code', 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame has these columns even when
        no venue was returned at all.

    Notes
    -----
    Reads the globals ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION`` and prints each
    postal code / name as it is processed.
    """
    column_names = ['Postal code',
                    'Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    if limit is None:
        # Backward compatible with callers that set a module-level LIMIT before calling.
        limit = globals().get('LIMIT', 100)

    venue_rows = []
    for postal_code, name, lat, lng in zip(postal_codes, names, latitudes, longitudes):
        print(postal_code, name)

        # Let requests assemble and URL-encode the query string.
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request
        results = requests.get('https://api.foursquare.com/v2/venues/explore',
                               params=query).json()["response"]['groups'][0]['items']

        # keep only the relevant fields of each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                postal_code,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact for zero rows.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [305]:
LIMIT = 100
radius = 1500  # widened from the original 500 m so that venues are found for all neighborhoods

# Arguments are passed in the function's declared order:
# postal_codes, names, latitudes, longitudes.
toronto_venues = getNearbyVenues(
    neighborhoods['Postal code'],
    neighborhoods['Neighborhood'],
    neighborhoods['latitude'],
    neighborhoods['longitude'],
    radius=radius,
)

M3A Parkwoods
M4A Victoria Village
M5A Regent Park, Harbourfront
M6A Lawrence Manor, Lawrence Heights
M7A Queen's Park, Ontario Provincial Government
M9A Islington Avenue
M1B Malvern, Rouge
M3B Don Mills
M4B Parkview Hill, Woodbine Gardens
M5B Garden District, Ryerson
M6B Glencairn
M9B West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
M1C Rouge Hill, Port Union, Highland Creek
M3C Don Mills
M4C Woodbine Heights
M5C St. James Town
M6C Humewood-Cedarvale
M9C Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
M1E Guildwood, Morningside, West Hill
M4E The Beaches
M5E Berczy Park
M6E Caledonia-Fairbanks
M1G Woburn
M4G Leaside
M5G Central Bay Street
M6G Christie
M1H Cedarbrae
M2H Hillcrest Village
M3H Bathurst Manor, Wilson Heights, Downsview North
M4H Thorncliffe Park
M5H Richmond, Adelaide, King
M6H Dufferin, Dovercourt Village
M1J Scarborough Village
M2J Fairview, Henry Farm, Oriole
M3J Northwood Park, York University
M4J East Toronto
M5J Harbourfront East,

In [306]:
print(toronto_venues.shape)
toronto_venues.head()

(6900, 8)


Unnamed: 0,Postal code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,M3A,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,M3A,Parkwoods,43.753259,-79.329656,Donalda Golf & Country Club,43.752816,-79.342741,Golf Course
3,M3A,Parkwoods,43.753259,-79.329656,LCBO,43.757774,-79.314257,Liquor Store
4,M3A,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café


In [307]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 344 uniques categories.


In [308]:
len(toronto_venues['Postal code'].unique())

103

In [309]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

In [310]:
toronto_onehot.info(verbose=True) #Take care, there's a venue called 'Neighborhood'!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6900 entries, 0 to 6899
Data columns (total 344 columns):
 #   Column                                    Dtype
---  ------                                    -----
 0   ATM                                       uint8
 1   Accessories Store                         uint8
 2   Afghan Restaurant                         uint8
 3   Airport                                   uint8
 4   Airport Lounge                            uint8
 5   American Restaurant                       uint8
 6   Amphitheater                              uint8
 7   Animal Shelter                            uint8
 8   Antique Shop                              uint8
 9   Aquarium                                  uint8
 10  Arcade                                    uint8
 11  Argentinian Restaurant                    uint8
 12  Art Gallery                               uint8
 13  Art Museum                                uint8
 14  Arts & Crafts Store                    

In [311]:
toronto_onehot.rename(columns={'Neighborhood':'Neighborhood category'},inplace=True)

In [312]:
toronto_onehot.shape

(6900, 344)

In [313]:
toronto_onehot.head()

Unnamed: 0,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [314]:
# add neighborhood column back to dataframe
toronto_onehot['Postal code'] = toronto_venues['Postal code'] 
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

In [315]:
toronto_onehot.shape

(6900, 346)

In [316]:
toronto_onehot.head()

Unnamed: 0,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit,Postal code,Neighborhood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,M3A,Parkwoods
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,M3A,Parkwoods
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,M3A,Parkwoods
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,M3A,Parkwoods
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,M3A,Parkwoods


In [317]:
# Move the two identifier columns to the front. They are selected by name rather than by
# position (last two columns) so that re-running this cell leaves the frame unchanged
# instead of pulling two venue-category columns to the front.
id_columns = ['Postal code', 'Neighborhood']
fixed_columns = id_columns + [col for col in toronto_onehot.columns if col not in id_columns]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()


Unnamed: 0,Postal code,Neighborhood,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,...,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [318]:
# Per postal code, the mean of each one-hot column, i.e. the share of that postal code's
# venues falling in each category. The text column 'Neighborhood' is dropped first:
# older pandas silently discarded it, while pandas >= 2.0 raises when asked to average strings.
toronto_grouped = (
    toronto_onehot
    .drop(columns='Neighborhood')
    .groupby('Postal code')
    .mean()
    .reset_index()
)
toronto_grouped

Unnamed: 0,Postal code,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,M1B,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.029412,0.0,0.000000,0.0,0.294118
1,M1C,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000
2,M1E,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000
3,M1G,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000
4,M1H,0.0,0.0,0.0,0.0,0.0,0.015385,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015385,0.015385,0.0,0.015385,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M9N,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000
99,M9P,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000
100,M9R,0.0,0.0,0.0,0.0,0.0,0.046512,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.023256,0.000000,0.0,0.000000,0.0,0.000000
101,M9V,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000


In [319]:
toronto_grouped.shape

(103, 345)

In [320]:
num_top_venues = 5

# For every postal code, print its most frequent venue categories.
for hood in toronto_grouped['Postal code']:
    print("----" + hood + "----")

    # Transpose the single matching row so that categories become rows.
    profile = toronto_grouped.loc[toronto_grouped['Postal code'] == hood].T.reset_index()
    profile.columns = ['venue', 'freq']

    # Row 0 holds the transposed 'Postal code' cell, not a category; skip it.
    profile = profile.iloc[1:].astype({'freq': float}).round({'freq': 2})

    ranked = profile.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----M1B----
                  venue  freq
0           Zoo Exhibit  0.29
1  Fast Food Restaurant  0.09
2            Restaurant  0.06
3           Pizza Place  0.06
4                Bakery  0.03


----M1C----
            venue  freq
0            Park   0.2
1  Breakfast Spot   0.1
2           Hotel   0.1
3             Gym   0.1
4   Grocery Store   0.1


----M1E----
                  venue  freq
0           Pizza Place  0.10
1           Coffee Shop  0.06
2                  Bank  0.06
3  Fast Food Restaurant  0.06
4        Breakfast Spot  0.06


----M1G----
         venue  freq
0  Coffee Shop  0.15
1  Pizza Place  0.12
2     Pharmacy  0.09
3         Park  0.06
4         Bank  0.06


----M1H----
            venue  freq
0     Coffee Shop  0.08
1      Restaurant  0.05
2     Gas Station  0.05
3  Clothing Store  0.05
4  Sandwich Place  0.05


----M1J----
            venue  freq
0  Sandwich Place  0.14
1        Pharmacy  0.11
2  Ice Cream Shop  0.07
3            Bank  0.07
4     Coffee Shop  0.07


4  Caribbean Restaurant  0.05


----M6C----
                venue  freq
0         Coffee Shop  0.07
1                Bank  0.06
2                Café  0.06
3  Italian Restaurant  0.06
4         Pizza Place  0.06


----M6E----
                    venue  freq
0             Coffee Shop  0.10
1      Italian Restaurant  0.08
2             Pizza Place  0.05
3  Furniture / Home Store  0.05
4    Caribbean Restaurant  0.04


----M6G----
                venue  freq
0                Café  0.12
1         Coffee Shop  0.06
2  Italian Restaurant  0.04
3   Indian Restaurant  0.04
4   Korean Restaurant  0.03


----M6H----
                venue  freq
0                Café  0.14
1  Italian Restaurant  0.09
2         Coffee Shop  0.08
3                 Bar  0.07
4              Bakery  0.06


----M6J----
                venue  freq
0                Café  0.08
1              Bakery  0.05
2        Cocktail Bar  0.05
3  Italian Restaurant  0.05
4          Restaurant  0.05


----M6K----
                venue 

In [321]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, highest value first.

    The first element of ``row`` is skipped (callers pass rows whose first field is the
    'Postal code' identifier). At most ``num_top_venues`` labels are returned, as the
    array taken from the sorted Series' index.
    """
    return (
        row.iloc[1:]
        .sort_values(ascending=False)
        .index.values[:num_top_venues]
    )

In [334]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(position):
    """Format a 1-based rank as an English ordinal: 1st, 2nd, 3rd, 4th, 11th, 21st, ..."""
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
    if position % 10 in (1, 2, 3) and position % 100 not in (11, 12, 13):
        return '{}{}'.format(position, indicators[position % 10 - 1])
    return '{}th'.format(position)


# create columns according to number of top venues
# (explicit suffix rule instead of a bare `except:` around an out-of-range list lookup)
columns = ['Postal code'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# Rank the categories of every postal code, then build the frame in one go rather than
# filling an empty frame row by row.
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Postal code', toronto_grouped['Postal code'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Postal code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Zoo Exhibit,Fast Food Restaurant,Restaurant,Pizza Place,Women's Store,Coffee Shop,Movie Theater,Supermarket,Big Box Store,Caribbean Restaurant
1,M1C,Park,Grocery Store,Italian Restaurant,Burger Joint,Neighborhood category,Gym,Gym / Fitness Center,Breakfast Spot,Hotel,Department Store
2,M1E,Pizza Place,Juice Bar,Coffee Shop,Fast Food Restaurant,Bank,Breakfast Spot,Burger Joint,Train Station,Beer Store,Supermarket
3,M1G,Coffee Shop,Pizza Place,Pharmacy,Fast Food Restaurant,Park,Indian Restaurant,Bank,Chinese Restaurant,Sandwich Place,Grocery Store
4,M1H,Coffee Shop,Restaurant,Gas Station,Clothing Store,Sandwich Place,Indian Restaurant,Bakery,Bank,Pharmacy,Bus Station


In [335]:
# set number of clusters
kclusters = 5

# Feature matrix: every column except the identifier.
# `columns=` is used because passing the axis positionally (`drop('Postal code', 1)`)
# was deprecated and then removed in pandas 2.0.
toronto_grouped_clustering = toronto_grouped.drop(columns='Postal code')

# run k-means clustering
# n_init is pinned to 10 (the long-standing default) so results do not shift with the
# scikit-learn version; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([4, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 3, 2, 2, 1, 2, 0,
       0, 2, 2, 2, 0, 2, 1, 2, 0, 1, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2])

In [336]:
# add clustering labels
# Drop any 'Cluster Labels' column left by a previous run first: DataFrame.insert raises
# if the column already exists, which made this cell fail when executed twice.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [337]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Postal code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,4,M1B,Zoo Exhibit,Fast Food Restaurant,Restaurant,Pizza Place,Women's Store,Coffee Shop,Movie Theater,Supermarket,Big Box Store,Caribbean Restaurant
1,1,M1C,Park,Grocery Store,Italian Restaurant,Burger Joint,Neighborhood category,Gym,Gym / Fitness Center,Breakfast Spot,Hotel,Department Store
2,2,M1E,Pizza Place,Juice Bar,Coffee Shop,Fast Food Restaurant,Bank,Breakfast Spot,Burger Joint,Train Station,Beer Store,Supermarket
3,2,M1G,Coffee Shop,Pizza Place,Pharmacy,Fast Food Restaurant,Park,Indian Restaurant,Bank,Chinese Restaurant,Sandwich Place,Grocery Store
4,2,M1H,Coffee Shop,Restaurant,Gas Station,Clothing Store,Sandwich Place,Indian Restaurant,Bakery,Bank,Pharmacy,Bus Station


In [338]:
# Attach the cluster label and ranked venue columns from neighborhoods_venues_sorted to
# `neighborhoods` (which carries Borough, Neighborhood, latitude and longitude),
# matching rows on 'Postal code'. DataFrame.join keeps every row of `neighborhoods`.
toronto_merged = neighborhoods.join(neighborhoods_venues_sorted.set_index('Postal code'), on='Postal code')

In [340]:
toronto_merged.head()

Unnamed: 0,Postal code,Borough,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2,Pharmacy,Coffee Shop,Bank,Gas Station,Supermarket,Fast Food Restaurant,Café,Medical Supply Store,Shop & Service,Bus Stop
1,M4A,North York,Victoria Village,43.725882,-79.315572,2,Coffee Shop,Gym,Fast Food Restaurant,Grocery Store,Middle Eastern Restaurant,Shoe Store,Electronics Store,Bus Line,BBQ Joint,Gourmet Shop
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Café,Park,Diner,Restaurant,Japanese Restaurant,Pub,Bar,Farmers Market,Italian Restaurant
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,2,Clothing Store,Fast Food Restaurant,Restaurant,Coffee Shop,Furniture / Home Store,Sandwich Place,Dessert Shop,Vietnamese Restaurant,Latin American Restaurant,Toy / Game Store
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Café,Park,Restaurant,Japanese Restaurant,Gastropub,Comic Shop,Diner,Plaza,Middle Eastern Restaurant


In [348]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Postal codes with no match in the venue table come out of the join with a NaN label
# (which also turns the whole column into floats). Skip those rows and cast the rest
# back to int so they can be used as list indices.
clustered = toronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['latitude'], clustered['longitude'], clustered['Neighborhood'], clustered['Cluster Labels'].astype(int)):
    # popups show 1-based cluster numbers
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster+1), parse_html=True)
    # Index cluster-1 is kept from the original color assignment; label 0 wraps around
    # to the last color, so every cluster still gets its own color.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Examining clusters

### Cluster 1

In [342]:
# Rows with k-means label 0 (shown as "Cluster 1"): column 1 (Borough) plus every column from position 5 onward.
cluster_columns = toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]]
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(0), cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,0,Coffee Shop,Café,Park,Diner,Restaurant,Japanese Restaurant,Pub,Bar,Farmers Market,Italian Restaurant
4,Downtown Toronto,0,Coffee Shop,Café,Park,Restaurant,Japanese Restaurant,Gastropub,Comic Shop,Diner,Plaza,Middle Eastern Restaurant
7,North York,0,Coffee Shop,Japanese Restaurant,Bank,Restaurant,Pizza Place,Burger Joint,Italian Restaurant,Pharmacy,Liquor Store,Electronics Store
9,Downtown Toronto,0,Coffee Shop,Café,Gastropub,Japanese Restaurant,Restaurant,Theater,Diner,Farmers Market,Seafood Restaurant,Burrito Place
15,Downtown Toronto,0,Coffee Shop,Café,Hotel,Restaurant,Plaza,Farmers Market,Seafood Restaurant,Bakery,Italian Restaurant,Concert Hall
16,York,0,Coffee Shop,Pizza Place,Bank,Italian Restaurant,Café,Caribbean Restaurant,Sandwich Place,Grocery Store,Indian Restaurant,Restaurant
19,East Toronto,0,Coffee Shop,Pub,Grocery Store,Breakfast Spot,Japanese Restaurant,Beach,Ice Cream Shop,Sandwich Place,BBQ Joint,Bakery
20,Downtown Toronto,0,Coffee Shop,Café,Hotel,Seafood Restaurant,Restaurant,Plaza,Farmers Market,Concert Hall,Vegetarian / Vegan Restaurant,Liquor Store
21,York,0,Coffee Shop,Italian Restaurant,Furniture / Home Store,Pizza Place,Grocery Store,Café,Bakery,Caribbean Restaurant,Sushi Restaurant,Sandwich Place
23,East York,0,Coffee Shop,Indian Restaurant,Bakery,Restaurant,Grocery Store,Electronics Store,Thai Restaurant,Sandwich Place,Sushi Restaurant,Supermarket


### Cluster 2

In [343]:
# Rows with k-means label 1 (shown as "Cluster 2"): column 1 (Borough) plus every column from position 5 onward.
cluster_columns = toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]]
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(1), cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Etobicoke,1,Bank,Grocery Store,Shopping Mall,Pharmacy,Bakery,Bus Line,Liquor Store,Café,Park,Bus Stop
11,Etobicoke,1,Park,Hotel,Sandwich Place,Coffee Shop,Electronics Store,Gym,Transportation Service,Grocery Store,Restaurant,Intersection
12,Scarborough,1,Park,Grocery Store,Italian Restaurant,Burger Joint,Neighborhood category,Gym,Gym / Fitness Center,Breakfast Spot,Hotel,Department Store
28,North York,1,Park,Coffee Shop,Pizza Place,Bank,Gas Station,Sushi Restaurant,Gift Shop,Restaurant,Bus Line,Dog Run
39,North York,1,Gas Station,Trail,Park,Bank,Grocery Store,Chinese Restaurant,Skating Rink,Café,Restaurant,Japanese Restaurant
46,North York,1,Park,Bank,Shopping Mall,Plaza,Pizza Place,Tea Room,Fast Food Restaurant,Electronics Store,Ethiopian Restaurant,Event Space
50,North York,1,Park,Electronics Store,Asian Restaurant,Vietnamese Restaurant,Bakery,Coffee Shop,Mexican Restaurant,Skating Rink,Shopping Mall,Latin American Restaurant
51,Scarborough,1,Park,Harbor / Marina,Ice Cream Shop,Discount Store,Beach,Sandwich Place,Grocery Store,Pizza Place,Fast Food Restaurant,Pharmacy
58,Scarborough,1,Park,Restaurant,College Stadium,General Entertainment,Gym,Skating Rink,Diner,Café,Thai Restaurant,Golf Course
88,Etobicoke,1,Park,Grocery Store,Café,Bakery,Indian Restaurant,Pharmacy,Bar,Restaurant,General Entertainment,Beer Store


### Cluster 3

In [345]:
# Rows with k-means label 2 (shown as "Cluster 3"): column 1 (Borough) plus every column from position 5 onward.
cluster_columns = toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]]
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(2), cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,2,Pharmacy,Coffee Shop,Bank,Gas Station,Supermarket,Fast Food Restaurant,Café,Medical Supply Store,Shop & Service,Bus Stop
1,North York,2,Coffee Shop,Gym,Fast Food Restaurant,Grocery Store,Middle Eastern Restaurant,Shoe Store,Electronics Store,Bus Line,BBQ Joint,Gourmet Shop
3,North York,2,Clothing Store,Fast Food Restaurant,Restaurant,Coffee Shop,Furniture / Home Store,Sandwich Place,Dessert Shop,Vietnamese Restaurant,Latin American Restaurant,Toy / Game Store
8,East York,2,Pizza Place,Pharmacy,Park,Fast Food Restaurant,Coffee Shop,Athletics & Sports,Brewery,Soccer Stadium,Bank,Gastropub
10,North York,2,Coffee Shop,Fast Food Restaurant,Grocery Store,Bank,Caribbean Restaurant,Restaurant,Gas Station,Pizza Place,Fried Chicken Joint,Sandwich Place
13,North York,2,Coffee Shop,Restaurant,Gym,Japanese Restaurant,Middle Eastern Restaurant,Sandwich Place,Park,Beer Store,Italian Restaurant,Bank
14,East York,2,Pizza Place,Coffee Shop,Pharmacy,Park,Bank,Bar,Thai Restaurant,Breakfast Spot,Sandwich Place,Gastropub
17,Etobicoke,2,Baseball Field,Coffee Shop,Pizza Place,Convenience Store,Bank,Grocery Store,Mexican Restaurant,Farmers Market,Shopping Plaza,Garden
18,Scarborough,2,Pizza Place,Juice Bar,Coffee Shop,Fast Food Restaurant,Bank,Breakfast Spot,Burger Joint,Train Station,Beer Store,Supermarket
22,Scarborough,2,Coffee Shop,Pizza Place,Pharmacy,Fast Food Restaurant,Park,Indian Restaurant,Bank,Chinese Restaurant,Sandwich Place,Grocery Store


### Cluster 4

In [346]:
# Rows with k-means label 3 (shown as "Cluster 4"): column 1 (Borough) plus every column from position 5 onward.
cluster_columns = toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]]
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(3), cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
95,Scarborough,3,Donut Shop,Farm,National Park,Zoo Exhibit,Filipino Restaurant,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant


### Cluster 5

In [347]:
# Rows with k-means label 4 (shown as "Cluster 5"): column 1 (Borough) plus every column from position 5 onward.
cluster_columns = toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]]
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(4), cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Scarborough,4,Zoo Exhibit,Fast Food Restaurant,Restaurant,Pizza Place,Women's Store,Coffee Shop,Movie Theater,Supermarket,Big Box Store,Caribbean Restaurant
