# Coursera IBM Data Science Capstone

### In this project I will segment a city into different Neighborhoods using the geographical coordinates of the center of each Neighborhood, and then using a combination of location data and machine learning to cluster it.

In [1]:
# importing libaries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests


## Segmenting and Clustering Neighberhood

### Scraping Neighborhoods from wiki

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
# print(soup.prettify())

In [3]:
table_elements = soup.find_all('td')

In [4]:
#building lists with list-comprehension

#add every 3 element to list postcode, starting from element 0
postcode = [table_elements[x].text for x in range(0, len(table_elements), 3)]

#add every 3 element to list borough, starting from element 1
borough = [table_elements[x].text for x in range(1, len(table_elements), 3)]

#add every 3 element to list Neighborhood, starting from element 2
Neighborhood = [table_elements[x].text for x in range(2, len(table_elements), 3)]

In [5]:
#checking the length of the lists
len_post = len(postcode)
len_bor = len(borough)
len_nei = len(Neighborhood)
# print(f'len postcode= {len_post}, len borough= {len_bor}, len neighberhood= {len_nei}')

In [6]:
#deleting unnecessary lines 
del postcode[-12:]
del borough [-11:]
del Neighborhood[-11:]

In [7]:
#making the list neighberhood nicer
neighberhood = [word.replace('\n', '') for word in Neighborhood]

In [8]:
len_post = len(postcode)
len_bor = len(borough)
len_nei = len(Neighborhood)
# print(f'len postcode= {len_post}, len borough= {len_bor}, len Neighborhood= {len_nei}')

### Putting the data into a pandas dataframe

In [9]:
#first we need a dictionary:
data = {'postcode': postcode, 'borough': borough, 'Neighborhood': Neighborhood}
#and heer comes the dataframe :-) :
df = pd.DataFrame(data)
df.tail()

Unnamed: 0,postcode,borough,Neighborhood
283,M8Z,Etobicoke,Mimico NW\n
284,M8Z,Etobicoke,The Queensway West\n
285,M8Z,Etobicoke,Royal York South West\n
286,M8Z,Etobicoke,South of Bloor\n
287,M9Z,Not assigned,Not assigned\n


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 3 columns):
postcode        288 non-null object
borough         288 non-null object
Neighborhood    288 non-null object
dtypes: object(3)
memory usage: 6.8+ KB


In [11]:
#dropping "not assigned" in borough
df.borough.replace('Not assigned', np.NaN, inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,postcode,borough,Neighborhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n


In [12]:
df[df['Neighborhood'].str.contains('Not')]

Unnamed: 0,postcode,borough,Neighborhood
8,M7A,Queen's Park,Not assigned\n


In [13]:
df.Neighborhood.replace('Not assigned', "Queen's Park", inplace=True)

In [14]:
len(df.postcode.unique())

103

In [15]:
# combining rows with the same postalcde
#into one row with the Neighborhoods separated with a comma
df_grouped = df.groupby(['postcode','borough'])['Neighborhood'].apply(', '.join).reset_index()
df_grouped.head()

Unnamed: 0,postcode,borough,Neighborhood
0,M1B,Scarborough,"Rouge\n, Malvern\n"
1,M1C,Scarborough,"Highland Creek\n, Rouge Hill\n, Port Union\n"
2,M1E,Scarborough,"Guildwood\n, Morningside\n, West Hill\n"
3,M1G,Scarborough,Woburn\n
4,M1H,Scarborough,Cedarbrae\n


In [16]:
df_grouped.shape

(103, 3)

### Adding Lattitude and Longitude to the Dataframe

In [17]:
# # Test with the suggested methoed from coursera

# import geocoder # import geocoder

# postcodes = df['postcode']

# lat = []
# lon = []

# attempts = 1

# for postcode in postcodes:
#     # initialize your variable to None
#     lat_lng_coords = None

#     # loop until you get the coordinates
#     while(lat_lng_coords is None) and attempts < 5:
#         g = geocoder.google(f'{postcode}, Toronto, Ontario')
#         print('g = ',g)
#         lat_lng_coords = g.latlng
#         print('lat_lng_coords= ', lat_lng_coords)
#         attempts += 1
        

#     latitude = lat_lng_coords[0]
#     longitude = lat_lng_coords[1]
    
#     lat.append(latitude)
#     lon.append(longitude)

# lat

In [18]:
# #Found an other library (https://github.com/symerio/pgeocode) 
# #(I got the results that I wanted, but 
# #I couldn't work with the given datatype(type(nomi)=pgeocode.Nominatim))

# import pgeocode

# df['postcode'] = "5CA " + df['postcode']

# postcode_list = df['postcode'].values.tolist()

# nomi = pgeocode.Nominatim('CA')
# nomi.query_postal_code(postcode_list)


In [19]:
# # # Importing Data from csv file with IBM Cloud
# Deleted because of the api keys


In [20]:
# Importing Data from csv file
geo_data = pd.read_csv('Geospatial_Coordinate.csv')
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
#combining the tables
geo_data.rename(columns={'Postal Code': 'postcode'}, inplace=True)
merged_data =pd.merge(df_grouped, geo_data, on='postcode')

In [22]:
merged_data.head()

Unnamed: 0,postcode,borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge\n, Malvern\n",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek\n, Rouge Hill\n, Port Union\n",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n, Morningside\n, West Hill\n",43.763573,-79.188711
3,M1G,Scarborough,Woburn\n,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476


In [23]:
merged_data.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
postcode        103 non-null object
borough         103 non-null object
Neighborhood    103 non-null object
Latitude        103 non-null float64
Longitude       103 non-null float64
dtypes: float64(2), object(3)
memory usage: 4.8+ KB


## Segmenting and Clustering Neighborhoods in Toronto

In [24]:
# importing libaries

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [25]:
# Use geopy library to get the latitude and longitude values of Toronto
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
# print(f'The geograpical coordinate of Toronto are: latitude= {latitude}, longitude= {longitude}.')

In [26]:
# create a map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, Neighborhood in zip(merged_data['Latitude'], merged_data['Longitude'], merged_data['borough'], merged_data['Neighborhood']):
    label = '{}, {}'.format(Neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [27]:
# @hidden_cell
# Define Foursquare Credentials and Version
CLIENT_ID = '' # Foursquare ID
CLIENT_SECRET =  '' #  Foursquare Secret
VERSION = '20180605'

### Explore Neighborhoods in Toronto

In [28]:
# a function to get the top 100 venues for each Neighborhood within a radius of 500 meters

LIMIT = 100 # limit of number of venues returned by Foursquare API

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [29]:
# run the above function on each Neighborhood and create a new dataframe

toronto_venues = getNearbyVenues(names=merged_data['Neighborhood'],
                                   latitudes=merged_data['Latitude'],
                                   longitudes=merged_data['Longitude']
                                  )



Rouge
, Malvern

Highland Creek
, Rouge Hill
, Port Union

Guildwood
, Morningside
, West Hill

Woburn

Cedarbrae

Scarborough Village

East Birchmount Park
, Ionview
, Kennedy Park

Clairlea
, Golden Mile
, Oakridge

Cliffcrest
, Cliffside
, Scarborough Village West

Birch Cliff
, Cliffside West

Dorset Park
, Scarborough Town Centre
, Wexford Heights

Maryvale
, Wexford

Agincourt

Clarks Corners
, Sullivan
, Tam O'Shanter

Agincourt North
, L'Amoreaux East
, Milliken
, Steeles East

L'Amoreaux West

Upper Rouge

Hillcrest Village

Fairview
, Henry Farm
, Oriole

Bayview Village

Silver Hills
, York Mills

Newtonbrook
, Willowdale

Willowdale South

York Mills West

Willowdale West

Parkwoods

Don Mills North

Flemingdon Park
, Don Mills South

Bathurst Manor
, Downsview North
, Wilson Heights

Northwood Park
, York University

CFB Toronto
, Downsview East

Downsview West

Downsview Central

Downsview Northwest

Victoria Village

Woodbine Gardens
, Parkview Hill

Woodbine Heights

Th

In [30]:
toronto_venues.shape

(2238, 7)

In [31]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge\n, Malvern\n",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek\n, Rouge Hill\n, Port Union\n",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Highland Creek\n, Rouge Hill\n, Port Union\n",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,"Guildwood\n, Morningside\n, West Hill\n",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Guildwood\n, Morningside\n, West Hill\n",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [32]:
toronto_venues.Neighborhood = [word.replace('\n', '') for word in toronto_venues.Neighborhood]

In [33]:
#Let's check how many venues were returned for each Neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,4,4,4,4,4,4
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",3,3,3,3,3,3
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",10,10,10,10,10,10
"Alderwood, Long Branch",10,10,10,10,10,10
"Bathurst Manor, Downsview North, Wilson Heights",18,18,18,18,18,18
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",24,24,24,24,24,24
Berczy Park,55,55,55,55,55,55
"Birch Cliff, Cliffside West",4,4,4,4,4,4


In [34]:
#Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 272 uniques categories.


### Analyze Each Neighborhood

In [35]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add Neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move Neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
toronto_onehot.shape

(2238, 272)

In [37]:
# grouping rows by Neighborhood and by taking the mean of 
# the frequency of occurrence of each category

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.010000,0.000000,0.000000,0.000000,0.000000,0.010000,0.000000,0.000000
1,Agincourt,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,"Alderwood, Long Branch",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.055556,0.000000,0.000000,0.000000,0.000000,0.000000
6,Bayview Village,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,"Bedford Park, Lawrence Manor East",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,Berczy Park,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.018182,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,"Birch Cliff, Cliffside West",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [38]:
# each Neighborhood along with the top 5 most common venues

num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2  American Restaurant  0.04
3      Thai Restaurant  0.04
4           Steakhouse  0.04


----Agincourt----
            venue  freq
0    Skating Rink  0.25
1  Sandwich Place  0.25
2          Lounge  0.25
3  Breakfast Spot  0.25
4   Metro Station  0.00


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                venue  freq
0                Park  0.33
1          Playground  0.33
2    Asian Restaurant  0.33
3         Yoga Studio  0.00
4  Mexican Restaurant  0.00


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                 venue  freq
0        Grocery Store   0.2
1          Pizza Place   0.2
2             Pharmacy   0.1
3           Beer Store   0.1
4  Fried Chicken Joint   0.1


----Alderwood, Long Branch----
            venue  freq
0     Pizza Place   0.2
1             Pub  

                             venue  freq
0                   Baseball Field   0.5
1       Construction & Landscaping   0.5
2                      Yoga Studio   0.0
3  Molecular Gastronomy Restaurant   0.0
4       Modern European Restaurant   0.0


----Fairview, Henry Farm, Oriole----
                  venue  freq
0        Clothing Store  0.11
1  Fast Food Restaurant  0.10
2           Coffee Shop  0.08
3            Food Court  0.03
4                Bakery  0.03


----First Canadian Place, Underground city----
                 venue  freq
0          Coffee Shop  0.09
1                 Café  0.09
2                Hotel  0.04
3  American Restaurant  0.03
4                  Bar  0.03


----Flemingdon Park, Don Mills South----
              venue  freq
0       Coffee Shop  0.10
1               Gym  0.10
2  Asian Restaurant  0.10
3        Beer Store  0.10
4    Discount Store  0.05


----Forest Hill North, Forest Hill West----
              venue  freq
0             Trail  0.25
1              

                             venue  freq
0                       Playground   1.0
1                      Yoga Studio   0.0
2                    Metro Station   0.0
3  Molecular Gastronomy Restaurant   0.0
4       Modern European Restaurant   0.0


----St. James Town----
            venue  freq
0     Coffee Shop  0.07
1            Café  0.06
2           Hotel  0.05
3      Restaurant  0.05
4  Cosmetics Shop  0.04


----Stn A PO Boxes 25 The Esplanade----
                venue  freq
0         Coffee Shop  0.11
1                Café  0.04
2          Restaurant  0.04
3  Italian Restaurant  0.03
4               Hotel  0.03


----Studio District----
                venue  freq
0                Café  0.10
1         Coffee Shop  0.08
2  Italian Restaurant  0.05
3         Music Store  0.05
4           Gastropub  0.05


----The Annex, North Midtown, Yorkville----
            venue  freq
0            Café  0.13
1  Sandwich Place  0.13
2     Coffee Shop  0.13
3     Pizza Place  0.09
4  Cosmetics Sh

In [68]:
# putting that data into a pandas df
# first: function to sort the venues

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [69]:
# creating the new df and displaying the top 5 venues

num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
Neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
Neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    Neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

Neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,American Restaurant,Thai Restaurant,Steakhouse
1,Agincourt,Lounge,Skating Rink,Sandwich Place,Breakfast Spot,Electronics Store
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Asian Restaurant,College Stadium,Colombian Restaurant
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Fried Chicken Joint,Beer Store,Coffee Shop
4,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Skating Rink,Sandwich Place,Dance Studio


## Clustering Neighborhoods

In [70]:
# Run k-means to cluster the Neighborhood into 5 clusters.
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 2, 0, 0, 0, 3, 0, 0, 0])

In [71]:
# create a new dataframe that includes the cluster 
# as well as the top 10 venues for each Neighborhood

# adjust the Neighborhood column in the merged_data dataframe
merged_data.Neighborhood = [word.replace('\n', '') for word in merged_data.Neighborhood]

# add clustering labels
Neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = merged_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each Neighborhood
toronto_merged = toronto_merged.join(Neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head(200) # check the last columns!

Unnamed: 0,postcode,borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,4.0,Fast Food Restaurant,Department Store,Event Space,Ethiopian Restaurant,Empanada Restaurant
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,0.0,Bar,History Museum,Women's Store,Diner,Discount Store
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0.0,Electronics Store,Rental Car Location,Spa,Intersection,Pizza Place
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.0,Coffee Shop,Korean Restaurant,Women's Store,Drugstore,Diner
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,Athletics & Sports,Bank,Caribbean Restaurant,Bakery,Thai Restaurant
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,1.0,Playground,Women's Store,Drugstore,Dim Sum Restaurant,Diner
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029,0.0,Bus Station,Coffee Shop,Discount Store,Chinese Restaurant,Department Store
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577,0.0,Bakery,Park,Intersection,Fast Food Restaurant,Metro Station
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476,0.0,Motel,Movie Theater,American Restaurant,Women's Store,Department Store
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,0.0,Café,General Entertainment,College Stadium,Skating Rink,Dessert Shop


In [72]:
toronto_merged.dropna(inplace=True)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

In [73]:
toronto_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99 entries, 0 to 102
Data columns (total 11 columns):
postcode                 99 non-null object
borough                  99 non-null object
Neighborhood             99 non-null object
Latitude                 99 non-null float64
Longitude                99 non-null float64
Cluster Labels           99 non-null int32
1st Most Common Venue    99 non-null object
2nd Most Common Venue    99 non-null object
3rd Most Common Venue    99 non-null object
4th Most Common Venue    99 non-null object
5th Most Common Venue    99 non-null object
dtypes: float64(2), int32(1), object(8)
memory usage: 8.9+ KB


## Visualization of the Clusters

In [75]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters