# Coursera IBM Data Science Capstone

### In this project I will segment a city into different Neighborhoods using the geographical coordinates of the center of each Neighborhood, and then using a combination of location data and machine learning to cluster it.

In [1]:
# importing libaries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

## Segmenting and Clustering Neighberhood

### Scraping Neighborhoods from wiki

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":890001695,"wgRevisionId":890001695,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

In [3]:
table_elements = soup.find_all('td')

In [4]:
#building lists with list-comprehension

#add every 3 element to list postcode, starting from element 0
postcode = [table_elements[x].text for x in range(0, len(table_elements), 3)]

#add every 3 element to list borough, starting from element 1
borough = [table_elements[x].text for x in range(1, len(table_elements), 3)]

#add every 3 element to list Neighborhood, starting from element 2
Neighborhood = [table_elements[x].text for x in range(2, len(table_elements), 3)]

In [5]:
#checking the length of the lists
len_post = len(postcode)
len_bor = len(borough)
len_nei = len(Neighborhood)
print(f'len postcode= {len_post}, len borough= {len_bor}, len neighberhood= {len_nei}')

len postcode= 300, len borough= 299, len neighberhood= 299


In [6]:
#deleting unnecessary lines 
del postcode[-12:]
del borough [-11:]
del Neighborhood[-11:]

In [7]:
#making the list neighberhood nicer
neighberhood = [word.replace('\n', '') for word in Neighborhood]

In [8]:
len_post = len(postcode)
len_bor = len(borough)
len_nei = len(Neighborhood)
print(f'len postcode= {len_post}, len borough= {len_bor}, len Neighborhood= {len_nei}')

len postcode= 288, len borough= 288, len Neighborhood= 288


### Putting the data into a pandas dataframe

In [9]:
#first we need a dictionary:
data = {'postcode': postcode, 'borough': borough, 'Neighborhood': Neighborhood}
#and her comes the dataframe :-) :
df = pd.DataFrame(data)
df.tail()

Unnamed: 0,postcode,borough,Neighborhood
283,M8Z,Etobicoke,Mimico NW\n
284,M8Z,Etobicoke,The Queensway West\n
285,M8Z,Etobicoke,Royal York South West\n
286,M8Z,Etobicoke,South of Bloor\n
287,M9Z,Not assigned,Not assigned\n


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 3 columns):
postcode        288 non-null object
borough         288 non-null object
Neighborhood    288 non-null object
dtypes: object(3)
memory usage: 6.8+ KB


In [11]:
#dropping "not assigned" in borough
df.borough.replace('Not assigned', np.NaN, inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,postcode,borough,Neighborhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n


In [12]:
df[df['Neighborhood'].str.contains('Not')]

Unnamed: 0,postcode,borough,Neighborhood
8,M7A,Queen's Park,Not assigned\n


In [13]:
df.Neighborhood.replace('Not assigned', "Queen's Park", inplace=True)

In [14]:
len(df.postcode.unique())

103

In [15]:
# combining rows with the same postalcde
#into one row with the Neighborhoods separated with a comma
df_grouped = df.groupby(['postcode','borough'])['Neighborhood'].apply(', '.join).reset_index()
df_grouped.head()

Unnamed: 0,postcode,borough,Neighborhood
0,M1B,Scarborough,"Rouge\n, Malvern\n"
1,M1C,Scarborough,"Highland Creek\n, Rouge Hill\n, Port Union\n"
2,M1E,Scarborough,"Guildwood\n, Morningside\n, West Hill\n"
3,M1G,Scarborough,Woburn\n
4,M1H,Scarborough,Cedarbrae\n


In [16]:
df_grouped.shape

(103, 3)

### Adding Lattitude and Longitude to the Dataframe

In [17]:
# # Test with the suggested methoed from coursera

# import geocoder # import geocoder

# postcodes = df['postcode']

# lat = []
# lon = []

# attempts = 1

# for postcode in postcodes:
#     # initialize your variable to None
#     lat_lng_coords = None

#     # loop until you get the coordinates
#     while(lat_lng_coords is None) and attempts < 5:
#         g = geocoder.google(f'{postcode}, Toronto, Ontario')
#         print('g = ',g)
#         lat_lng_coords = g.latlng
#         print('lat_lng_coords= ', lat_lng_coords)
#         attempts += 1
        

#     latitude = lat_lng_coords[0]
#     longitude = lat_lng_coords[1]
    
#     lat.append(latitude)
#     lon.append(longitude)

# lat

In [18]:
# #Found an other library (https://github.com/symerio/pgeocode) 
# #(I got the results that I wanted, but 
# #I couldn't work with the given datatype(type(nomi)=pgeocode.Nominatim))

# import pgeocode

# df['postcode'] = "5CA " + df['postcode']

# postcode_list = df['postcode'].values.tolist()

# nomi = pgeocode.Nominatim('CA')
# nomi.query_postal_code(postcode_list)


In [19]:
# Importing Data from csv file
geo_data = pd.read_csv('Geospatial_Coordinate.csv')
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
#combining the tables
geo_data.rename(columns={'Postal Code': 'postcode'}, inplace=True)
merged_data =pd.merge(df_grouped, geo_data, on='postcode')

In [21]:
merged_data.head()

Unnamed: 0,postcode,borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge\n, Malvern\n",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek\n, Rouge Hill\n, Port Union\n",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n, Morningside\n, West Hill\n",43.763573,-79.188711
3,M1G,Scarborough,Woburn\n,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476


In [22]:
merged_data.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
postcode        103 non-null object
borough         103 non-null object
Neighborhood    103 non-null object
Latitude        103 non-null float64
Longitude       103 non-null float64
dtypes: float64(2), object(3)
memory usage: 4.8+ KB


## Segmenting and Clustering Neighborhoods in Toronto

In [23]:
# importing libaries

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [24]:
# Use geopy library to get the latitude and longitude values of Toronto
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(f'The geograpical coordinate of Toronto are: latitude= {latitude}, longitude= {longitude}.')

The geograpical coordinate of Toronto are: latitude= 43.653963, longitude= -79.387207.


In [25]:
# create a map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, Neighborhood in zip(merged_data['Latitude'], merged_data['Longitude'], merged_data['borough'], merged_data['Neighborhood']):
    label = '{}, {}'.format(Neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [26]:
# @hidden_cell
# Define Foursquare Credentials and Version
CLIENT_ID = '5ZKENUDBF3GUOSFG2RGUDJA0CJMNABWPOWOP3MA5Q25KPD1H' # your Foursquare ID
CLIENT_SECRET = 'NFQUCXAHACFKSAVQ21BLFRKZFWGAGFF0BJJWLJKT20YRUMES' # your Foursquare Secret
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5ZKENUDBF3GUOSFG2RGUDJA0CJMNABWPOWOP3MA5Q25KPD1H
CLIENT_SECRET:NFQUCXAHACFKSAVQ21BLFRKZFWGAGFF0BJJWLJKT20YRUMES


### Explore Neighborhoods in Toronto

In [27]:
# a function to get the top 100 venues for each Neighborhood within a radius of 500 meters

LIMIT = 100 # limit of number of venues returned by Foursquare API

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [28]:
# run the above function on each Neighborhood and create a new dataframe

toronto_venues = getNearbyVenues(names=merged_data['Neighborhood'],
                                   latitudes=merged_data['Latitude'],
                                   longitudes=merged_data['Longitude']
                                  )



Rouge
, Malvern

Highland Creek
, Rouge Hill
, Port Union

Guildwood
, Morningside
, West Hill

Woburn

Cedarbrae

Scarborough Village

East Birchmount Park
, Ionview
, Kennedy Park

Clairlea
, Golden Mile
, Oakridge

Cliffcrest
, Cliffside
, Scarborough Village West

Birch Cliff
, Cliffside West

Dorset Park
, Scarborough Town Centre
, Wexford Heights

Maryvale
, Wexford

Agincourt

Clarks Corners
, Sullivan
, Tam O'Shanter

Agincourt North
, L'Amoreaux East
, Milliken
, Steeles East

L'Amoreaux West

Upper Rouge

Hillcrest Village

Fairview
, Henry Farm
, Oriole

Bayview Village

Silver Hills
, York Mills

Newtonbrook
, Willowdale

Willowdale South

York Mills West

Willowdale West

Parkwoods

Don Mills North

Flemingdon Park
, Don Mills South

Bathurst Manor
, Downsview North
, Wilson Heights

Northwood Park
, York University

CFB Toronto
, Downsview East

Downsview West

Downsview Central

Downsview Northwest

Victoria Village

Woodbine Gardens
, Parkview Hill

Woodbine Heights

Th

In [29]:
toronto_venues.shape

(2244, 7)

In [30]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge\n, Malvern\n",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek\n, Rouge Hill\n, Port Union\n",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood\n, Morningside\n, West Hill\n",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,"Guildwood\n, Morningside\n, West Hill\n",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood\n, Morningside\n, West Hill\n",43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa


In [31]:
toronto_venues.Neighborhood = [word.replace('\n', '') for word in toronto_venues.Neighborhood]

In [32]:
#Let's check how many venues were returned for each Neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,4,4,4,4,4,4
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",3,3,3,3,3,3
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",9,9,9,9,9,9
"Alderwood, Long Branch",10,10,10,10,10,10
"Bathurst Manor, Downsview North, Wilson Heights",19,19,19,19,19,19
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",23,23,23,23,23,23
Berczy Park,57,57,57,57,57,57
"Birch Cliff, Cliffside West",4,4,4,4,4,4


In [33]:
#Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 274 uniques categories.


### Analyze Each Neighborhood

In [34]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add Neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move Neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
toronto_onehot.shape

(2244, 274)

In [36]:
# grouping rows by Neighborhood and by taking the mean of 
# the frequency of occurrence of each category

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.010000,0.000000,0.000000,0.000000,0.000000,0.010000,0.000000,0.000000
1,Agincourt,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,"Alderwood, Long Branch",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.052632,0.000000,0.000000,0.000000,0.000000,0.000000
6,Bayview Village,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,"Bedford Park, Lawrence Manor East",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,Berczy Park,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.017544,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,"Birch Cliff, Cliffside West",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [37]:
# each Neighborhood along with the top 5 most common venues

num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2           Steakhouse  0.04
3  American Restaurant  0.04
4      Thai Restaurant  0.04


----Agincourt----
                venue  freq
0        Skating Rink  0.25
1      Sandwich Place  0.25
2              Lounge  0.25
3      Breakfast Spot  0.25
4  Mexican Restaurant  0.00


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                venue  freq
0          Playground  0.33
1         Coffee Shop  0.33
2                Park  0.33
3         Yoga Studio  0.00
4  Mexican Restaurant  0.00


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                 venue  freq
0        Grocery Store  0.22
1             Pharmacy  0.11
2           Beer Store  0.11
3       Sandwich Place  0.11
4  Fried Chicken Joint  0.11


----Alderwood, Long Branch----
                venue  freq
0         Pizza

                        venue  freq
0              Massage Studio  0.25
1                        Park  0.25
2  Construction & Landscaping  0.25
3                      Bakery  0.25
4                 Yoga Studio  0.00


----East Birchmount Park, Ionview, Kennedy Park----
               venue  freq
0   Department Store  0.25
1        Bus Station  0.25
2     Discount Store  0.25
3        Coffee Shop  0.25
4  Mobile Phone Shop  0.00


----East Toronto----
                             venue  freq
0                             Park  0.50
1                    Metro Station  0.25
2                Convenience Store  0.25
3                      Yoga Studio  0.00
4  Molecular Gastronomy Restaurant  0.00


----Emery, Humberlea----
                        venue  freq
0  Construction & Landscaping   0.5
1              Baseball Field   0.5
2                 Yoga Studio   0.0
3          Mexican Restaurant   0.0
4         Monument / Landmark   0.0


----Fairview, Henry Farm, Oriole----
                 

4  Molecular Gastronomy Restaurant   0.0


----Silver Hills, York Mills----
                             venue  freq
0                        Cafeteria   0.5
1                             Park   0.5
2               Mexican Restaurant   0.0
3              Monument / Landmark   0.0
4  Molecular Gastronomy Restaurant   0.0


----St. James Town----
            venue  freq
0     Coffee Shop  0.08
1      Restaurant  0.05
2            Café  0.05
3           Hotel  0.05
4  Cosmetics Shop  0.04


----Stn A PO Boxes 25 The Esplanade----
                venue  freq
0         Coffee Shop  0.11
1          Restaurant  0.04
2                Café  0.04
3  Seafood Restaurant  0.03
4               Hotel  0.03


----Studio District----
                 venue  freq
0                 Café  0.11
1          Coffee Shop  0.08
2  American Restaurant  0.05
3               Bakery  0.05
4   Italian Restaurant  0.05


----The Annex, North Midtown, Yorkville----
            venue  freq
0     Coffee Shop  0.12
1    

In [38]:
# putting that data into a pandas df
# first: function to sort the venues

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [39]:
# creating the new df and displaying the top 5 venues

num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
Neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
Neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    Neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

Neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Thai Restaurant,American Restaurant,Steakhouse
1,Agincourt,Lounge,Breakfast Spot,Sandwich Place,Skating Rink,Dim Sum Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Coffee Shop,Playground,Dog Run,Deli / Bodega
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Beer Store,Fried Chicken Joint,Fast Food Restaurant,Pharmacy
4,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Athletics & Sports,Skating Rink


## Clustering Neighborhoods

In [40]:
# Run k-means to cluster the Neighborhood into 5 clusters.
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 1, 2, 2, 2, 2, 2, 2, 2])

In [41]:
# create a new dataframe that includes the cluster 
# as well as the top 10 venues for each Neighborhood

# add clustering labels
Neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = merged_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each Neighborhood
toronto_merged = toronto_merged.join(Neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

KeyError: 'Neighborhood'

## Visualization of the Clusters