# Segmenting and Clustering neighborhoods in Toronto

## 1. Importing librairies and ressources

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes 
# uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# 
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# 
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# 
# import k-means from clustering stage
from sklearn.cluster import KMeans
# 
# !conda install -c conda-forge folium=0.5.0 --yes 
# uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [2]:
# import data
df = pd.read_csv('Wiki_data_Toronto.csv')
print df.shape
df.head()

(287, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 2. Transform datafram as acceptance criteriae 
### A) remove Postal code not assigned to a Borough
    The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
    Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
    

In [3]:
# Check type of data and make sure there are type object (string)
print df.dtypes
#df = df.astype('string')
df['Borough']= df['Borough'].astype(str)
print df.dtypes
# remove the ones that are not assigned
# df1 = df[df.Neighborhood != "Not assigned"]
df1 = df[~df['Borough'].str.contains("Not assigned")]

# Drop a row by condition
print df1.shape
df1.head()
#print df1

Postcode        object
Borough         object
Neighborhood    object
dtype: object
Postcode        object
Borough         object
Neighborhood    object
dtype: object
(210, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### B) where Neighborhood is 'not assigned', give the name of the borough
If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

In [4]:
# break down data set in two : one with assigned neighborood and one with non assigned neighborood
df2a = df1[~df['Neighborhood'].str.contains("Not assigned")] 
df2b = df1[df['Neighborhood'].str.contains("Not assigned")]
# for the one with non assigned neighborood, copy the name of the Borough
df2b['Neighborhood'] = df2b['Borough']
print df2b
# concatenate the two split. Sort by index to keep original order. Reset index
df2 = pd.concat([df2a, df2b]).sort_index().reset_index(drop=True)#, inplace=True)
print df2.head(10)

  Postcode       Borough  Neighborhood
9      M9A  Queen's Park  Queen's Park
  Postcode           Borough       Neighborhood
0      M3A        North York         Parkwoods 
1      M4A        North York  Victoria Village 
2      M5A  Downtown Toronto      Harbourfront 
3      M6A        North York  Lawrence Heights 
4      M6A        North York    Lawrence Manor 
5      M7A  Downtown Toronto      Queen's Park 
6      M9A      Queen's Park       Queen's Park
7      M1B       Scarborough             Rouge 
8      M1B       Scarborough           Malvern 
9      M3B        North York   Don Mills North 


  
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### C) Merge duplicated postal code and borough into one row
More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.


In [5]:
# combine postcode so there are unique
df3 = df2.groupby(['Postcode','Borough'], sort = False).agg(lambda x: ','.join(x))
df3.reset_index(level=['Postcode','Borough'], inplace=True) 

# show results
print df3.head()
print '\n'
print 'shape of the postal, borough and neigborhood dataframe is ', df3.shape
print 'note, there was {} unique postcode in orginal dataset (without nan assigned borough)'.format(len(pd.unique(df1.Postcode)))


  Postcode           Borough                       Neighborhood
0      M3A        North York                         Parkwoods 
1      M4A        North York                  Victoria Village 
2      M5A  Downtown Toronto                      Harbourfront 
3      M6A        North York  Lawrence Heights ,Lawrence Manor 
4      M7A  Downtown Toronto                      Queen's Park 


shape of the postal, borough and neigborhood dataframe is  (103, 3)
note, there was 103 unique postcode in orginal dataset (without nan assigned borough)


...

## 3. Getting coordinate for each post code.
Note, since there is reliability problem with the  geopy.geocoders, we'll get directly the coordinate from an extarnal csv files..


In [6]:
## 3. Get lattitude and longitude for each neighborhood
#address = 'Parkwoods, North York, M3A'
#address = '102 North End Ave, New York, NY'
#
#geolocator = Nominatim(user_agent="foursquare_agent")
#location = geolocator.geocode(address)
#latitude = location.latitude
#longitude = location.longitude
#print(latitude, longitude)

#address = 'Toronto, CA'
#
#geolocator = Nominatim(user_agent="ca_explorer")
#
##geolocator = Nominatim(user_agent="foursquare_agent")
#location = geolocator.geocode(address)
#
#latitude = location.latitude
#
#longitude = location.longitude
#
#print('The geographical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# import geocoder # import geocoder
# 
# # initialize your variable to None
# lat_lng_coords = None
# 
# # loop until you get the coordinates
# postal_code = df3.Postcode.unique()
# 
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#   lat_lng_coords = g.latlng
# 
# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

## A) Import coordinate from extenal csv files

In [7]:
# 3. bis get coordinate from external csv files 
## import data
df_coord = pd.read_csv('Geospatial_Coordinates.csv')
print df_coord.shape
df_coord.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## B) Merge coordinate data frame and borough data frame (df3)

In [8]:
df4 = df3.merge(df_coord, left_on='Postcode', right_on='Postal Code') #'inner', df_coord, )
df4.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights ,Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,M7A,43.662301,-79.389494


## C) drop Postal code columns as it is the same as postcode

In [9]:
## drop Postal Code since it is the same value as Postcode.
df5 = df4.drop(['Postal Code'], axis=1)
df5.head(200)


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights ,Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,M1B,Scarborough,"Rouge ,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens ,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson ,Garden District",43.657162,-79.378937


# 4. Explore and cluster the neighborhoods in Toronto
Here, we'll just focus on M3A Postcode. Note, the analysis could be done for each postcode or even combine by borough (e.g. North York = results of M3A and M4A)

Define Foursquare Credentials and version from a separate private files

In [10]:
secrets = json.load(open('4S_secret.json'))
CLIENT_ID = secrets['CLIENT_ID']
CLIENT_SECRET = secrets['CLIENT_SECRET']
VERSION = secrets['VERSION']
print VERSION

print('Secret Credential loaded')

20180605
Secret Credential loaded


Let's explore the first Postcode in our dataframe 'M3A'

In [11]:
print 'exploring postcode ', df5.iloc[0]['Postcode']
print df5.iloc[0]

exploring postcode  M3A
Postcode               M3A
Borough         North York
Neighborhood    Parkwoods 
Latitude           43.7533
Longitude         -79.3297
Name: 0, dtype: object


Now, let's get the top 100 venues that are in this postcode within a radius of 500 meters

In [12]:
import requests
latitude = df5.iloc[0]['Latitude']
longitude = df5.iloc[0]['Longitude']
print latitude
print longitude
radius = 500
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
#

43.7532586
-79.3296565


In [13]:
results = requests.get(url).json()
results

{u'meta': {u'code': 200, u'requestId': u'5e36e085882fc7001b613478'},
 u'response': {u'groups': [{u'items': [{u'reasons': {u'count': 0,
       u'items': [{u'reasonName': u'globalInteractionReason',
         u'summary': u'This spot is popular',
         u'type': u'general'}]},
      u'referralId': u'e-0-4e8d9dcdd5fbbbb6b3003c7b-0',
      u'venue': {u'categories': [{u'icon': {u'prefix': u'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/park_',
          u'suffix': u'.png'},
         u'id': u'4bf58dd8d48988d163941735',
         u'name': u'Park',
         u'pluralName': u'Parks',
         u'primary': True,
         u'shortName': u'Park'}],
       u'id': u'4e8d9dcdd5fbbbb6b3003c7b',
       u'location': {u'address': u'Toronto',
        u'cc': u'CA',
        u'city': u'Toronto',
        u'country': u'Canada',
        u'distance': 245,
        u'formattedAddress': [u'Toronto', u'Toronto ON', u'Canada'],
        u'labeledLatLngs': [{u'label': u'display',
          u'lat': 43.75197604605557

In [14]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [15]:
venues = results['response']['groups'][0]['items']
# print ("venue", venues)
nearby_venues = json_normalize(venues) # flatten JSON
# print ('nearby_venues', nearby_venues)
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114
2,Bella Vita Catering & Private Chef Service,BBQ Joint,43.756651,-79.331524


# 5 Analyze one borough
There is no much data for one postcode, so instead, let's look for all venue in one borough = many postcode
We will look further for Nort York
## 5a. Create a dataframe with North York postcode, neighboroud and coordinate

In [16]:
df_NorthYork = df5[df5.Borough == 'North York']
print df_NorthYork.head()

   Postcode     Borough                       Neighborhood   Latitude  \
0       M3A  North York                         Parkwoods   43.753259   
1       M4A  North York                  Victoria Village   43.725882   
3       M6A  North York  Lawrence Heights ,Lawrence Manor   43.718518   
7       M3B  North York                   Don Mills North   43.745906   
10      M6B  North York                         Glencairn   43.709577   

    Longitude  
0  -79.329656  
1  -79.315572  
3  -79.464763  
7  -79.352188  
10 -79.445073  


## 5b. get all venue for each postcode / coordinate in a radius of 500


In [17]:
radius = 500
LIMIT = 100
Results_list = list()
for i in range (0, len(df_NorthYork.Postcode)):
    latitude = df5.iloc[i]['Latitude']
    longitude = df5.iloc[i]['Longitude']
    # print latitude
    # print longitude
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
    Results_list.append(requests.get(url).json())
#

## 5c. put all results in a nice dataframe containing postcode/borough/... and venues

In [18]:
#tobecontinued try to get a dataframe like in coursera 
nearby_venues_list = list()
for results, postcode, borough, neighborhood, pclat, pclng in zip(Results_list, df_NorthYork.Postcode,
                                                                  df_NorthYork.Borough, df_NorthYork.Neighborhood,
                                                                 df_NorthYork.Latitude, df_NorthYork.Longitude):
    venues = results['response']['groups'][0]['items']
    # print ("venue", venues)
    nearby_venues = json_normalize(venues) # flatten JSON
    # print ('nearby_venues', nearby_venues)
    # filter columns
    filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
    # print 'toto'
    # print nearby_venues
    # check if results dataframe is empty 
    if nearby_venues.empty:
        print 'empty'
    else:
        nearby_venues =nearby_venues.loc[:, filtered_columns]

        # filter the category for each row
        nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

        # clean columns
        nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
        Postcode = [postcode] * len(nearby_venues.index) 
        Borough = [borough] * len(nearby_venues.index)
        Neighborhood = [neighborhood] * len(nearby_venues.index)
        nearby_venues['Postcode'] = Postcode
        nearby_venues['Borough'] = Borough
        nearby_venues['Neighborhood'] = Neighborhood
        nearby_venues['Neighborhood Latitude'] = pclat
        nearby_venues['Neighborhood Longitude'] = pclng
    # Store results 
    nearby_venues_list.append(nearby_venues)
nearby_venues_list[0]



empty
empty


Unnamed: 0,name,categories,lat,lng,Postcode,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
0,Brookbanks Park,Park,43.751976,-79.33214,M3A,North York,Parkwoods,43.753259,-79.329656
1,Variety Store,Food & Drink Shop,43.751974,-79.333114,M3A,North York,Parkwoods,43.753259,-79.329656
2,Bella Vita Catering & Private Chef Service,BBQ Joint,43.756651,-79.331524,M3A,North York,Parkwoods,43.753259,-79.329656


In [19]:
# let's group everything in one dataframe instead of a list of dataframe
nearby_venues_df = pd.DataFrame() # nearby_venues_list[0]
for nv in nearby_venues_list:
    nearby_venues_df= nearby_venues_df.append(nv)

# reset index
nearby_venues_df.reset_index(drop =True, inplace=True)
nearby_venues_df.head(5)

Unnamed: 0,name,categories,lat,lng,Postcode,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
0,Brookbanks Park,Park,43.751976,-79.33214,M3A,North York,Parkwoods,43.753259,-79.329656
1,Variety Store,Food & Drink Shop,43.751974,-79.333114,M3A,North York,Parkwoods,43.753259,-79.329656
2,Bella Vita Catering & Private Chef Service,BBQ Joint,43.756651,-79.331524,M3A,North York,Parkwoods,43.753259,-79.329656
3,Victoria Village Arena,Hockey Arena,43.723481,-79.315635,M4A,North York,Victoria Village,43.725882,-79.315572
4,Tim Hortons,Coffee Shop,43.725517,-79.313103,M4A,North York,Victoria Village,43.725882,-79.315572


In [20]:
# lets reorganize the dataframe
#print nearby_venues_df['Postcode']
nearby_venues_df = nearby_venues_df.rename(columns={
    "name": "Venue",
    "lat": "Venue Latitude",
    "lng": "Venue Longitude",
    "categories": "Venue Category"
})
nearby_venues_df.head()

Unnamed: 0,Venue,Venue Category,Venue Latitude,Venue Longitude,Postcode,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
0,Brookbanks Park,Park,43.751976,-79.33214,M3A,North York,Parkwoods,43.753259,-79.329656
1,Variety Store,Food & Drink Shop,43.751974,-79.333114,M3A,North York,Parkwoods,43.753259,-79.329656
2,Bella Vita Catering & Private Chef Service,BBQ Joint,43.756651,-79.331524,M3A,North York,Parkwoods,43.753259,-79.329656
3,Victoria Village Arena,Hockey Arena,43.723481,-79.315635,M4A,North York,Victoria Village,43.725882,-79.315572
4,Tim Hortons,Coffee Shop,43.725517,-79.313103,M4A,North York,Victoria Village,43.725882,-79.315572


In [21]:
# let's reorder 
nearby_venues_df = nearby_venues_df[['Postcode', 'Borough', 'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
                                     'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']]
# Neighborhood 	Neighborhood Latitude 	Neighborhood Longitude 	Venue 	Venue Latitude 	Venue Longitude 	Venue Category
nearby_venues_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,North York,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,North York,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,M3A,North York,Parkwoods,43.753259,-79.329656,Bella Vita Catering & Private Chef Service,43.756651,-79.331524,BBQ Joint
3,M4A,North York,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,M4A,North York,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


## 5D. Analyze Each Neighborhood / Postcode of North York
### 5D1. refactor the data frame

In [22]:
# one hot encoding
NorthYork_nearby_venues_df_onehot = pd.get_dummies(nearby_venues_df[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
NorthYork_nearby_venues_df_onehot['Neighborhood'] = nearby_venues_df['Neighborhood'] 
NorthYork_nearby_venues_df_onehot['Postcode'] = nearby_venues_df['Postcode'] 

# move neighborhood column to the first column
col_list =  list(NorthYork_nearby_venues_df_onehot.columns)
col_list.pop(col_list.index('Neighborhood'))
col_list.pop(col_list.index('Postcode'))
fixed_columns = ['Neighborhood', 'Postcode'] + col_list

NorthYork_nearby_venues_df_onehot = NorthYork_nearby_venues_df_onehot[fixed_columns]

NorthYork_nearby_venues_df_onehot.head()

Unnamed: 0,Neighborhood,Postcode,Accessories Store,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Theater,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Parkwoods,M3A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,M3A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,M3A,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,M4A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,M4A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 5D.2 Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category


In [23]:
NorthYork_grouped = NorthYork_nearby_venues_df_onehot.groupby('Neighborhood').mean().reset_index()
NorthYork_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bagel Shop,...,Theater,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,"Bathurst Manor ,Downsview North ,Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park ,Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Don Mills North,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.090909,0.0
4,"Downsview ,North Park ,Upwood Park",0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0


### 5D.3 Let's create a dataframe containing the most common venue by neighborhood

In [24]:
# reusing ibm courses function (DP0701EN-3-3-2-Neighborhoods-New-York-py-v1.0)
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [25]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = NorthYork_grouped['Neighborhood']

for ind in np.arange(NorthYork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(NorthYork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(5)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor ,Downsview North ,Wilson Heights",Café,Gym / Fitness Center,Japanese Restaurant,Caribbean Restaurant,Basketball Court,Baseball Field,Department Store,Eastern European Restaurant,Discount Store,Diner
1,Bayview Village,Park,Japanese Restaurant,Asian Restaurant,Pub,Yoga Studio,Curling Ice,Diner,Dim Sum Restaurant,Dessert Shop,Department Store
2,"Bedford Park ,Lawrence Manor East",Rental Car Location,Breakfast Spot,Electronics Store,Pizza Place,Intersection,Medical Center,Mexican Restaurant,Yoga Studio,Dessert Shop,Department Store
3,Don Mills North,Clothing Store,Furniture / Home Store,Boutique,Event Space,Miscellaneous Shop,Coffee Shop,Accessories Store,Women's Store,Vietnamese Restaurant,Gourmet Shop
4,"Downsview ,North Park ,Upwood Park",Skating Rink,Curling Ice,Park,Video Store,Beer Store,Pharmacy,Athletics & Sports,Dance Studio,Cosmetics Shop,Diner


### 5D.4 Cluster neighborood
We'll use K-means method to clusterize Neighborhood.
We'll use the dataframe containing all venue type frequencies

In [26]:
# number of cluster, K
K = 5

NorthYork_grouped_clustering = NorthYork_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=K, random_state=0).fit(NorthYork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:50] 

array([1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 2, 1, 1, 1, 1, 4, 0, 1, 1, 1, 3])

In [27]:
# now merge the results of the clustering to our dataframe containing the 10 most common venue
# add clustering labels
if 'Cluster Labels' in neighborhoods_venues_sorted:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted['Cluster Labels'] = neighborhoods_venues_sorted['Cluster Labels'].astype(int, inplace=True)
print neighborhoods_venues_sorted['Cluster Labels'].dtype
NorthYork_merged = df_NorthYork

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
NorthYork_merged = NorthYork_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# remove nan row 
print NorthYork_merged.head(50)
NorthYork_merged.dropna(axis=0, inplace=True) # M3C has no results 

# force Cluster label to be int
NorthYork_merged['Cluster Labels'] = NorthYork_merged['Cluster Labels'].astype(int, inplace=True)

# show final cluster
NorthYork_merged.head() 

int32
   Postcode     Borough                                      Neighborhood  \
0       M3A  North York                                        Parkwoods    
1       M4A  North York                                 Victoria Village    
3       M6A  North York                 Lawrence Heights ,Lawrence Manor    
7       M3B  North York                                  Don Mills North    
10      M6B  North York                                        Glencairn    
13      M3C  North York                 Flemingdon Park ,Don Mills South    
27      M2H  North York                                Hillcrest Village    
28      M3H  North York  Bathurst Manor ,Downsview North ,Wilson Heights    
33      M2J  North York                     Fairview ,Henry Farm ,Oriole    
34      M3J  North York                  Northwood Park ,York University    
39      M2K  North York                                  Bayview Village    
40      M3K  North York                      CFB Toronto ,Downsview Ea

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4,Park,BBQ Joint,Food & Drink Shop,Curling Ice,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Dance Studio
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,Coffee Shop,Pizza Place,Hockey Arena,Portuguese Restaurant,Intersection,Creperie,Diner,Dim Sum Restaurant,Dessert Shop,Department Store
3,M6A,North York,"Lawrence Heights ,Lawrence Manor",43.718518,-79.464763,1,Coffee Shop,Park,Café,Pub,Bakery,Mexican Restaurant,Restaurant,Yoga Studio,Electronics Store,Event Space
7,M3B,North York,Don Mills North,43.745906,-79.352188,1,Clothing Store,Furniture / Home Store,Boutique,Event Space,Miscellaneous Shop,Coffee Shop,Accessories Store,Women's Store,Vietnamese Restaurant,Gourmet Shop
10,M6B,North York,Glencairn,43.709577,-79.445073,1,Coffee Shop,Gym,Park,Fast Food Restaurant,Portuguese Restaurant,Nightclub,Music Venue,Mexican Restaurant,Juice Bar,Italian Restaurant


### 5.E Map the different cluster

In [28]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(K)
ys = [i + x + (i*x)**2 for i in range(K)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(NorthYork_merged['Latitude'], NorthYork_merged['Longitude'], NorthYork_merged['Neighborhood'], NorthYork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# 5 Observation
- Most neighborood fall in the same category : cluster 1 
- There is only one neighborood for each other clusters
