# Clustering and Segmenting Toronto Neighborhoods

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'lxml')

## Scrape wikipedia page for table

In [2]:
my_table = soup.find('table',{'class':'wikitable sortable'})

In [3]:
table_rows = my_table.findAll('tr')

In [4]:
header = ['PostalCode', 'Borough', 'Neighborhood']
header

['PostalCode', 'Borough', 'Neighborhood']

In [5]:
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
neighborhoods = pd.DataFrame(l, columns=header)

In [6]:
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


## Clean up data

In [7]:
# Remove newlines in Neighborhood column
neighborhoods['Neighborhood'] = neighborhoods['Neighborhood'].str.replace('\n', '')
neighborhoods.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [8]:
# Change all the unknowns to a common value
neighborhoods = neighborhoods.replace('Not assigned', None)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,,
2,M2A,,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [9]:
# Assign all unknown Neighborhood's with the Borough Value
neighborhoods['Neighborhood'].fillna(neighborhoods['Borough'], inplace=True)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,,
2,M2A,,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [10]:
# Drop all rows with unassigned Borough
neighborhoods.dropna(how='any', axis='index', inplace=True)
neighborhoods.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [11]:
# Group by Postal Code
neighborhoods = neighborhoods.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(lambda x: ','.join(x.astype(str))).reset_index()
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Output Shape & Rows

In [12]:
neighborhoods.shape

(178, 3)

In [13]:
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


# Get Postal Code Coordinates

### Coordinate Library is too unreliable, so code has been commented out

In [94]:
!pip install --upgrade geocoder
import geocoder

!pip install geopy
import geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files

Requirement already up-to-date: geocoder in c:\users\baileyduncan\anaconda3\lib\site-packages (1.38.1)


### Using a csv file with the coordinates for Toronto Postal Codes

In [17]:
coordinates = pd.read_csv('Geospatial_Coordinates.csv')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# Merge coordinates with neighorhoods where the postalcodes match up
neighborhoods = pd.merge(coordinates, neighborhoods, how='left', left_on=['Postal Code'], right_on=['PostalCode'])

In [19]:
# Drop Duplicate Column
neighborhoods.drop(['PostalCode'], axis=1, inplace=True)

In [20]:
neighborhoods.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge,Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.

Just make sure:

to add enough Markdown cells to explain what you decided to do and to report any observations you make.
to generate maps to visualize your neighborhoods and how they cluster together.

## Lets map Toronto to see what the neighborhoods look like:

In [54]:
def generate_map(dataframe, longitude, latitude):
        map = folium.Map(location=[latitude, longitude], zoom_start=11)

        # add markers to map
        for lat, lng, borough, neighborhood in zip(dataframe['Latitude'], dataframe['Longitude'], dataframe['Borough'], dataframe['Neighborhood']):
            label = '{}, {}'.format(neighborhood, borough)
            label = folium.Popup(label, parse_html=True)
            folium.CircleMarker(
                [lat, lng],
                radius=5,
                popup=label,
                color='blue',
                fill=True,
                fill_color='#3186cc',
                fill_opacity=0.7,
                parse_html=False).add_to(map)  
            
        return map

In [71]:
latitude = 43.72
longitude = -79.347

# create map of New York using latitude and longitude values
map_neighborhoods = generate_map(neighborhoods, longitude, latitude)
map_neighborhoods

## What if we just pick the boroughs with "Toronto" in it...

In [84]:
toronto_borough = neighborhoods[neighborhoods['Borough'].str.contains('Toronto')]
toronto_borough.reset_index(drop=True, inplace=True)
toronto_borough.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
0,M4E,43.676357,-79.293031,East Toronto,The Beaches
1,M4K,43.679557,-79.352188,East Toronto,"The Danforth West,Riverdale"
2,M4L,43.668999,-79.315572,East Toronto,"The Beaches West,India Bazaar"
3,M4M,43.659526,-79.340923,East Toronto,Studio District
4,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park


In [80]:
map_to = generate_map(toronto_borough, longitude, latitude)
map_to

In [81]:
# @hidden_cell
# FOURSQUARE API CREDENTIALS
CLIENT_ID = '5N1I2ZBKG55NFLJQELOZI3XTLBUXXCF3FKTJQSZ2WXPZBWR5' # your Foursquare ID
CLIENT_SECRET = 'P5MPRQBUI4MRPYUYQHFHDT0CWEF3D3U1WTKBGKGTK0VXAIBU' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

## Lets ask FourSquare for the venues around our first Borough

In [82]:
neighborhood_latitude = toronto_borough.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_borough.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_borough.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [85]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=5N1I2ZBKG55NFLJQELOZI3XTLBUXXCF3FKTJQSZ2WXPZBWR5&client_secret=P5MPRQBUI4MRPYUYQHFHDT0CWEF3D3U1WTKBGKGTK0VXAIBU&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [86]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5cd625a36a607121240f4377'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ad4c062f964a52011f820e3',
       'name': 'The Big Carrot Natural Food Market',
       'location': {'address': '125 Southwood Dr',
        'lat': 43.678879,
        'lng': -79.297734,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.678879,
          'lng': -79.297734}],
        'distance': 471,
        'postalCode': 'M4E 0B8',
   

In [87]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [None]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

In [95]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
1,Grover Pub and Grub,Pub,43.679181,-79.297215
2,St-Denis Studios Inc.,Music Venue,43.675031,-79.288022
3,Upper Beaches,Neighborhood,43.680563,-79.292869


## Nice. Now we can use a function to get venues for all of our neighborhoods

In [160]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [161]:
toronto_venues = getNearbyVenues(names=toronto_borough['Neighborhood'],
                                   latitudes=toronto_borough['Latitude'],
                                   longitudes=toronto_borough['Longitude']
                                  )

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie
Dovercourt Village,Dufferin
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
High Park,The Junction South
Parkdale,Roncesvall

In [162]:
print(toronto_venues.shape)
toronto_venues.head()

(3082, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Tori's Bakeshop,43.672114,-79.290331,Vegetarian / Vegan Restaurant
1,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,Indie Movie Theater
2,The Beaches,43.676357,-79.293031,The Beech Tree,43.680493,-79.288846,Gastropub
3,The Beaches,43.676357,-79.293031,Ed's Real Scoop,43.67263,-79.287993,Ice Cream Shop
4,The Beaches,43.676357,-79.293031,Beaches Bake Shop,43.680363,-79.289692,Bakery


In [163]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Berczy Park,100,100,100,100,100,100
"Brockton,Exhibition Place,Parkdale Village",100,100,100,100,100,100
Business Reply Mail Processing Centre 969 Eastern,49,49,49,49,49,49
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",16,16,16,16,16,16
"Cabbagetown,St. James Town",38,38,38,38,38,38
Central Bay Street,100,100,100,100,100,100
"Chinatown,Grange Park,Kensington Market",100,100,100,100,100,100
Christie,100,100,100,100,100,100
Church and Wellesley,100,100,100,100,100,100


In [164]:
print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 277 unique categories.


In [165]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(3082, 277)


Unnamed: 0,Zoo,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Lets get the most common venues per neighborhood

In [166]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped

(38, 277)


Unnamed: 0,Neighborhood,Zoo,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,0.01,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.020408,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,...,0.01,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01
7,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.07,0.01,0.0,0.03,0.0,0.01,0.0,0.0,0.02
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.02,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0
9,Church and Wellesley,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,...,0.0,0.01,0.01,0.01,0.01,0.0,0.0,0.01,0.01,0.01


In [167]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [168]:
import numpy as np

In [169]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Café,Hotel,Coffee Shop,American Restaurant,Theater,Pizza Place,Steakhouse,Sushi Restaurant,Gastropub,Japanese Restaurant
1,Berczy Park,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Bakery,Beer Bar,Park,Cocktail Bar,Gastropub
2,"Brockton,Exhibition Place,Parkdale Village",Coffee Shop,Café,Restaurant,Bar,Furniture / Home Store,Tibetan Restaurant,Bakery,Gift Shop,Park,Caribbean Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Park,Coffee Shop,Brewery,Pizza Place,Italian Restaurant,Sushi Restaurant,Pet Store,New American Restaurant,Pub,French Restaurant
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Coffee Shop,Harbor / Marina,Café,Garden,Track,Scenic Lookout,Sculpture Garden,Dog Run,Airport,Airport Lounge


### Now that we have some venues for each neighborhood, lets try and cluster them

In [170]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 0, 3, 1, 1, 1, 1, 1])

In [173]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_borough

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,43.676357,-79.293031,East Toronto,The Beaches,0,Pub,Coffee Shop,Pizza Place,Breakfast Spot,Beach,Japanese Restaurant,Bar,Sandwich Place,Salon / Barbershop,Burger Joint
1,M4K,43.679557,-79.352188,East Toronto,"The Danforth West,Riverdale",0,Greek Restaurant,Coffee Shop,Café,Pub,Ice Cream Shop,Fast Food Restaurant,Pizza Place,Yoga Studio,Discount Store,Sandwich Place
2,M4L,43.668999,-79.315572,East Toronto,"The Beaches West,India Bazaar",0,Indian Restaurant,Coffee Shop,Café,Beach,Light Rail Station,Fast Food Restaurant,Restaurant,Bakery,Burrito Place,Sandwich Place
3,M4M,43.659526,-79.340923,East Toronto,Studio District,1,Coffee Shop,Bar,Café,American Restaurant,Bakery,Vietnamese Restaurant,Italian Restaurant,Diner,Sandwich Place,Brewery
4,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park,2,College Quad,Gym / Fitness Center,College Gym,Coffee Shop,Park,Café,Trail,Bookstore,Gym Pool,Gym


In [189]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Now we can see what the clusters represent...

### Cluster 1: 

In [175]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,43.676357,0,Pub,Coffee Shop,Pizza Place,Breakfast Spot,Beach,Japanese Restaurant,Bar,Sandwich Place,Salon / Barbershop,Burger Joint
1,43.679557,0,Greek Restaurant,Coffee Shop,Café,Pub,Ice Cream Shop,Fast Food Restaurant,Pizza Place,Yoga Studio,Discount Store,Sandwich Place
2,43.668999,0,Indian Restaurant,Coffee Shop,Café,Beach,Light Rail Station,Fast Food Restaurant,Restaurant,Bakery,Burrito Place,Sandwich Place
5,43.712751,0,Coffee Shop,Fast Food Restaurant,Italian Restaurant,Café,Gym,Restaurant,Sushi Restaurant,Pizza Place,Dessert Shop,Yoga Studio
7,43.704324,0,Coffee Shop,Italian Restaurant,Sushi Restaurant,Café,Pizza Place,Gym,Indian Restaurant,Fast Food Restaurant,Restaurant,Dessert Shop
8,43.689574,0,Coffee Shop,Italian Restaurant,Park,Grocery Store,Bagel Shop,Café,Sandwich Place,Pub,Gym,Pizza Place
9,43.686412,0,Coffee Shop,Park,Sushi Restaurant,Gym / Fitness Center,Italian Restaurant,Grocery Store,Pizza Place,Pub,Spa,Convenience Store
13,43.65426,0,Coffee Shop,Café,Italian Restaurant,Theater,Restaurant,Park,Breakfast Spot,Bakery,Diner,Pub
22,43.711695,0,Sushi Restaurant,Coffee Shop,Bank,Pharmacy,Italian Restaurant,Japanese Restaurant,Asian Restaurant,Café,Clothing Store,Bagel Shop
23,43.696948,0,Café,Park,Coffee Shop,Gym / Fitness Center,Italian Restaurant,Bakery,Bank,Japanese Restaurant,Sushi Restaurant,Trail


### Cluster 1: 

In [176]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,43.659526,1,Coffee Shop,Bar,Café,American Restaurant,Bakery,Vietnamese Restaurant,Italian Restaurant,Diner,Sandwich Place,Brewery
6,43.715383,1,Italian Restaurant,Coffee Shop,Sporting Goods Shop,Mexican Restaurant,Restaurant,Diner,Café,Skating Rink,Sushi Restaurant,Baseball Field
11,43.667967,1,Park,Diner,Gastropub,Japanese Restaurant,Café,Coffee Shop,Taiwanese Restaurant,Dance Studio,Jewelry Store,Deli / Bodega
12,43.66586,1,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Café,Park,Ramen Restaurant,Bubble Tea Shop,Grocery Store
14,43.657162,1,Coffee Shop,Middle Eastern Restaurant,Café,Italian Restaurant,Clothing Store,Tea Room,Ramen Restaurant,Diner,Gastropub,Restaurant
15,43.651494,1,Coffee Shop,Café,Restaurant,Hotel,Bakery,Cosmetics Shop,Italian Restaurant,Seafood Restaurant,American Restaurant,Gastropub
16,43.644771,1,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Bakery,Beer Bar,Park,Cocktail Bar,Gastropub
17,43.657952,1,Coffee Shop,Café,Italian Restaurant,Bubble Tea Shop,Clothing Store,Ramen Restaurant,Japanese Restaurant,Bar,Park,Pizza Place
18,43.650571,1,Café,Hotel,Coffee Shop,American Restaurant,Theater,Pizza Place,Steakhouse,Sushi Restaurant,Gastropub,Japanese Restaurant
19,43.640816,1,Coffee Shop,Café,Hotel,Italian Restaurant,Aquarium,Brewery,Park,Scenic Lookout,Restaurant,Fried Chicken Joint


### Cluster 2: Student Ammenities

In [177]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,43.72802,2,College Quad,Gym / Fitness Center,College Gym,Coffee Shop,Park,Café,Trail,Bookstore,Gym Pool,Gym


### Cluster 3: Nature & Scenery

In [178]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
27,43.628947,3,Coffee Shop,Harbor / Marina,Café,Garden,Track,Scenic Lookout,Sculpture Garden,Dog Run,Airport,Airport Lounge


### Cluster 4: 

In [179]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,43.679563,4,Coffee Shop,Park,Grocery Store,Bank,Hostel,Athletics & Sports,Playground,Convenience Store,Office,Sandwich Place


## Save to CSV

In [181]:
# Save to CSV File
toronto_merged.to_csv("toronto_merged.csv")