# Peer-graded assignment for the Applied Data Science Capstone course on Coursera - Segmenting and Clustering Neighborhoods in Toronto #

## Part 1: Preparing the dataframe with the neighborhoods in Toronto ##

### Installing the necessary packages ###

In [1]:
import pandas as pd
import numpy as np
!pip install lxml



### Scraping of the data from a wikipedia page and transforming it into a pandas dataframe ###

In [2]:
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') # creates a list with data from the html page
data_canada = data[0] # creates a dataframe from the list

### Cleaning the dataframe ###

In [3]:
data_canada.rename(columns={'Postcode':'PostalCode', 'Neighbourhood':'Neighborhood'}, inplace=True) # renames columns
data_canada = data_canada[data_canada.Borough != 'Not assigned'] # removes the cells with a 'Borough' that is "Not assigned"
data_canada.reset_index(inplace=True) # resets the index values
data_canada.drop('index', axis=1, inplace=True) # removes the old index values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [4]:
# If a cell has a 'Borough', but a "Not assigned" 'Neighborhood', the 'Neighborhood' will be the same as the 'Borough':

idx_to_change = data_canada.loc[data_canada['Neighborhood'] == "Not assigned"].index # creates a list of indices of the cells where the value of 'Neighborhood' column is "Not assigned"
for i in idx_to_change:
    data_canada.iloc[i, 2] = data_canada.iloc[i, 1] # iterates the list of indicies and set Column 3 to the value of Column 2 at the given index


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
# If rows have the same 'PostalCode' and 'Borough' values, they will be merged together: 
data_canada = data_canada.groupby(['PostalCode','Borough'])['Neighborhood'].apply(list)
data_canada = data_canada.sample(frac=1).reset_index()
data_canada['Neighborhood']= data_canada['Neighborhood'].str.join(', ')

# Now the data frame looks like this:
data_canada

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3C,North York,"Flemingdon Park, Don Mills South"
1,M1N,Scarborough,"Birch Cliff, Cliffside West"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,M3L,North York,Downsview West
4,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station"
5,M5N,Central Toronto,Roselawn
6,M6G,Downtown Toronto,Christie
7,M8W,Etobicoke,"Alderwood, Long Branch"
8,M1E,Scarborough,"Guildwood, Morningside, West Hill"
9,M5H,Downtown Toronto,"Adelaide, King, Richmond"


### The shape of the ready dataframe ###

In [6]:
data_canada.shape

(103, 3)

## Part 2: Adding the latitude and the longitude coordinates of each neighborhood to the dataframe

### Creating a dataframe containing the postal code, latitude and longitude of each neighborhood from the provided csv file ###

In [7]:
geo_data = pd.read_csv('http://cocl.us/Geospatial_data')
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging the dataframe from the Part 1 with the new dataframe containing the latitude and longitude coordinates ###

In [8]:
data_merged = pd.merge(data_canada, geo_data, left_on='PostalCode', right_on='Postal Code') # merges the two dataframes based on the common postal code

### Removing the second column 'Postal Code' ###

In [9]:
data_merged.drop('Postal Code', axis=1, inplace=True)

### The dataframe containing the neighborhoods with the latitude and longitude coordinates is now ready ###

In [10]:
data_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3C,North York,"Flemingdon Park, Don Mills South",43.725900,-79.340923
1,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
3,M3L,North York,Downsview West,43.739015,-79.506944
4,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752
5,M5N,Central Toronto,Roselawn,43.711695,-79.416936
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M8W,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
8,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
9,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568


## Part 3: Exploring and clustering the neighborhoods in Toronto ##

In this part, we will use Foursquare API to explore neighborhoods in Toronto. We will use the explore function to get the most common venue categoriesin each neighborhood, and then use this feature to group neighborhoods into clusters with the k-means clustering algorithm. At the end, we will use the Folium library to visualize the neighborhoods in Toronto and their emerging clusters.

### Downloading all the additionnal dependencies that we will need in this part ###

In [11]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


### Using geopy library to get the latitude and longitude values of Toronto ###

In [12]:
# Defining an instance of the geocoder
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer") # need to define a user_agent; here it will be "toronto_explorer"
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


### Creating a map of Toronto with neighborhoods superimposed on top ###

In [13]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(data_merged['Latitude'], data_merged['Longitude'], data_merged['Borough'], data_merged['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Defining Foursquare Credentials and Version ###

In [14]:
CLIENT_ID = '55MPXSTGUABHT31JTO4LNQH44YXH0NGZTMIV05NJ1KKESMS3' # my Foursquare ID
CLIENT_SECRET = 'B1L0E1CPY1FIXPO1YXEAM5AC5MYHNNM1D1GXQVINITQOVNGJ' # my Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

My credentails:
CLIENT_ID: 55MPXSTGUABHT31JTO4LNQH44YXH0NGZTMIV05NJ1KKESMS3
CLIENT_SECRET:B1L0E1CPY1FIXPO1YXEAM5AC5MYHNNM1D1GXQVINITQOVNGJ


### Exploring one neighborhood in the dataframe ###

Getting the name of the n-th neighborhood in the dataframe

In [17]:
data_merged.loc[0, 'Neighborhood']

'Flemingdon Park, Don Mills South'

Getting the neighborhood's latitude and longitude values

In [18]:
neighborhood_latitude = data_merged.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = data_merged.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = data_merged.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Flemingdon Park, Don Mills South are 43.72589970000001, -79.340923.


Getting the top 100 venues that are in East Toronto within a radius of 500 meters

In [19]:
# We need to create the GET request URL, which will be named url
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=55MPXSTGUABHT31JTO4LNQH44YXH0NGZTMIV05NJ1KKESMS3&client_secret=B1L0E1CPY1FIXPO1YXEAM5AC5MYHNNM1D1GXQVINITQOVNGJ&v=20180605&ll=43.72589970000001,-79.340923&radius=500&limit=100'

Sending the request and examining the results

In [20]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5de911dcc8cff245ee67cead'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Sunnybrook - York Mills',
  'headerFullLocation': 'Sunnybrook - York Mills, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 21,
  'suggestedBounds': {'ne': {'lat': 43.73039970450001,
    'lng': -79.33470758059177},
   'sw': {'lat': 43.721399695500004, 'lng': -79.34713841940824}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bdaff7463c5c9b67bcb2568',
       'name': 'Sorento Restaurant',
       'location': {'address': '900 Don Mills Rd.',
        'lat': 43.72657509457231,
        'lng': -79.34198930569546,
        'labeledLatLngs': [{'label':

We first need to create the get_category_type function, that will extract the information from the items key

In [21]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Cleaning the json and structuring it into a pandas dataframe

In [22]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Sorento Restaurant,Italian Restaurant,43.726575,-79.341989
1,Oomomo,Discount Store,43.726429,-79.343283
2,Fitness Connection,Gym,43.727473,-79.341707
3,Tilley Endurables,Clothing Store,43.727033,-79.342926
4,Swiss Chalet Rotisserie & Grill,Restaurant,43.726737,-79.341403


Showing the number of venues returned by Foursquare

In [23]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

21 venues were returned by Foursquare.


### Exploring the venues in all the neighborhoods in Toronto ###

We first create a function that repeats the same process to all the neighborhoods of Toronto.

In [24]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

The following code will run the above function on each neighborhood and create a new dataframe callded toronto_venues

In [25]:
toronto_venues = getNearbyVenues(names=data_merged['Neighborhood'],
                                   latitudes=data_merged['Latitude'],
                                   longitudes=data_merged['Longitude']
                                  )

Flemingdon Park, Don Mills South
Birch Cliff, Cliffside West
Highland Creek, Rouge Hill, Port Union
Downsview West
Harbourfront East, Toronto Islands, Union Station
Roselawn
Christie
Alderwood, Long Branch
Guildwood, Morningside, West Hill
Adelaide, King, Richmond
Dovercourt Village, Dufferin
First Canadian Place, Underground city
Parkwoods
Hillcrest Village
Parkdale, Roncesvalles
Caledonia-Fairbanks
Newtonbrook, Willowdale
Upper Rouge
Stn A PO Boxes 25 The Esplanade
Don Mills North
Clarks Corners, Sullivan, Tam O'Shanter
Glencairn
St. James Town
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
The Junction North, Runnymede
L'Amoreaux West
Maryvale, Wexford
Woburn
Leaside
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Church and Wellesley
Bayview Village
High Park, The Junction South
Silver Hills, York Mills
Humber Bay, King's Mill Park, Kingsway Park South East, Mimico NE, Old Mill South, The Queensway East, 

Checking the size of the resulting dataframe

In [26]:
print(toronto_venues.shape)
toronto_venues.head()

(2244, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Flemingdon Park, Don Mills South",43.7259,-79.340923,Sorento Restaurant,43.726575,-79.341989,Italian Restaurant
1,"Flemingdon Park, Don Mills South",43.7259,-79.340923,Oomomo,43.726429,-79.343283,Discount Store
2,"Flemingdon Park, Don Mills South",43.7259,-79.340923,Fitness Connection,43.727473,-79.341707,Gym
3,"Flemingdon Park, Don Mills South",43.7259,-79.340923,Tilley Endurables,43.727033,-79.342926,Clothing Store
4,"Flemingdon Park, Don Mills South",43.7259,-79.340923,Swiss Chalet Rotisserie & Grill,43.726737,-79.341403,Restaurant


Checking the number of venues which were returned for each neighborhood

In [27]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,5,5,5,5,5,5
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",3,3,3,3,3,3
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",8,8,8,8,8,8
"Alderwood, Long Branch",11,11,11,11,11,11
"Bathurst Manor, Downsview North, Wilson Heights",22,22,22,22,22,22
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",22,22,22,22,22,22
Berczy Park,56,56,56,56,56,56
"Birch Cliff, Cliffside West",4,4,4,4,4,4


How many unique categories can be curated from all the returned values?

In [28]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 277 uniques categories.


### Analyzing each neighborhood ##

In [30]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
#fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
#toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The new dataframe size

In [31]:
toronto_onehot.shape

(2244, 277)

Grouping rows by neighborhoods and by taking the mean of the frequency of occurence of each category

In [32]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.030000,...,0.0,0.020000,0.00,0.000000,0.000000,0.000000,0.010000,0.000000,0.0,0.000000
1,Agincourt,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,"Alderwood, Long Branch",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.00,0.045455,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
6,Bayview Village,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
7,"Bedford Park, Lawrence Manor East",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.045455,...,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
8,Berczy Park,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.017857,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
9,"Birch Cliff, Cliffside West",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


The new size

In [33]:
toronto_grouped.shape

(100, 277)

Printing each neighborhood along with the top 5 most common venues

In [34]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
             venue  freq
0      Coffee Shop  0.07
1             Café  0.05
2  Thai Restaurant  0.04
3       Steakhouse  0.04
4       Restaurant  0.03


----Agincourt----
                       venue  freq
0             Clothing Store   0.2
1                     Lounge   0.2
2             Breakfast Spot   0.2
3  Latin American Restaurant   0.2
4               Skating Rink   0.2


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
               venue  freq
0               Park  0.33
1         Playground  0.33
2   Sculpture Garden  0.33
3  Accessories Store  0.00
4        Men's Store  0.00


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                  venue  freq
0            Beer Store  0.12
1  Fast Food Restaurant  0.12
2              Pharmacy  0.12
3           Pizza Place  0.12
4          Liquor Store  0.12


----Alderwood, Long Branch----
            venue  freq
0

### Creating a dataframe displaying the top 10 venues for each neighborhood ###

In [35]:
# This function will sort the values in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [36]:
# Creating the new dataframe and displaying the top 10 venues for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Thai Restaurant,Bar,Sushi Restaurant,Asian Restaurant,Burger Joint,Restaurant,Bakery
1,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Breakfast Spot,Clothing Store,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Sculpture Garden,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Yoga Studio
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Liquor Store,Sandwich Place,Beer Store,Fried Chicken Joint,Pharmacy,Fast Food Restaurant,Dumpling Restaurant,Drugstore
4,"Alderwood, Long Branch",Pizza Place,Gym,Pool,Skating Rink,Coffee Shop,Pub,Pharmacy,Athletics & Sports,Dance Studio,Sandwich Place


### Clustering neighborhoods ###

Running k-means to cluster the neighborhoods into 5 clusters

In [37]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 2, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Creating a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [38]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

data_merged = data_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood', how='right')

data_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.340923,0,Gym,Coffee Shop,Asian Restaurant,Beer Store,Discount Store,Restaurant,Dim Sum Restaurant,Sandwich Place,Italian Restaurant,Sporting Goods Shop
1,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,0,Café,General Entertainment,College Stadium,Skating Rink,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,0,Moving Target,Bar,Yoga Studio,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
3,M3L,North York,Downsview West,43.739015,-79.506944,2,Grocery Store,Park,Bank,Shopping Mall,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
4,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,0,Coffee Shop,Hotel,Aquarium,Café,Restaurant,Fried Chicken Joint,Italian Restaurant,Scenic Lookout,Brewery,Music Venue


### Creating the map to visualize the resulting clusters ###

In [39]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(data_merged['Latitude'], data_merged['Longitude'], data_merged['Neighborhood'], data_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters