In [30]:
# Uncomment to install libraries

# !conda install -c conda-forge beautifulsoup4 --yes
# !conda install -c conda-forge lxml --yes
# !conda install -c conda-forge requests --yes
# !conda install -c conda-forge geocoder --yes
# !conda install -c conda-forge folium=0.5.0 --yes

### Import packages

In [31]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import urllib.request
import geocoder # library to convert address to latitude, longitude
import folium # map rendering library
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans # the most popular clustering algorithm
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

## 1. Get the contents of a wiki page for the postal codes of Canada

In [32]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = urllib.request.urlopen(url)
data = response.read()      # a 'bytes' object
text = data.decode('utf-8') # a 'str'

Use BeautifulSoup to load the contents of the page

In [33]:
soup = BeautifulSoup(text, 'lxml')
#soup # uncomment to see the whole page as text

In [34]:
# access title of page
title = soup.title.text
title

'List of postal codes of Canada: M - Wikipedia'

In [35]:
# access specific table (first table with this class name)
table = soup.find('table', class_='wikitable sortable')
# table # uncomment to see the whole table as text

## 2. Tranform the data into a *pandas* dataframe

#### Handle columns

In [36]:
column_names = []
table_head = table.find('tr') # first row of the table
for header in table_head.find_all('th'): # loop through all elements of header
    column_names.append(header.text.strip('\n')) # define the dataframe columns
    
# instantiate the dataframe
df = pd.DataFrame(columns=column_names)
df

Unnamed: 0,Postcode,Borough,Neighbourhood


#### Handle rows

In [37]:
postcodes, boroughs, neighbourhoods = [], [], []
for row in table.find_all('tr')[1:]: # for each row except the first one
    for i, element in enumerate(row.find_all('td')):
        if (i==0):
            postcodes.append(element.text)
        elif (i==1):
            boroughs.append(element.text)
        elif (i==2):
            neighbourhoods.append(element.text.strip('\n').strip(']'))

In [38]:
for i in range(len(postcodes)):
    postcode = postcodes[i]
    borough = boroughs[i]
    neighbourhood = neighbourhoods[i]
    
    # insert rows in dataframe
    df = df.append({'Postcode' : postcode,
                    'Borough' : borough,
                    'Neighbourhood' : neighbourhood}, ignore_index=True)
    
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [39]:
df.shape

(289, 3)

Remove rows with incomplete information

In [40]:
df = df[df['Borough'] != 'Not assigned']
df = df[df.Neighbourhood != 'Not assigned']
df = df.reset_index()
df.drop(['index'], axis=1, inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Join neighbourhoods (with a comma) that have the same postcode and borough

In [41]:
df = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(lambda x: "%s" % ', '.join(x))

df = df.to_frame()
df = df.reset_index()
df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [42]:
df.shape

(102, 3)

## 3. Get coordinates of postal codes

In [43]:
# create two columns
df['Latitude'] = ""
df['Longitude'] = ""

In [44]:
# needs some minutes to complete

for index, row in df.iterrows():

    postal_code = row['Postcode']

    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]

    df['Latitude'][index] = latitude
    df['Longitude'][index] = longitude
    
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395


#### Create a map of Toronto with neighbourhoods.

In [45]:
# Get Toronto coordinates
lat_lng_coords = None

while(lat_lng_coords is None): # we have to be persistent, as this package doesn't always respond
    g = geocoder.google('Toronto, Ontario')
    lat_lng_coords = g.latlng

    toronto_latitude = lat_lng_coords[0]
    toronto_longitude = lat_lng_coords[1]

print('The geograpical coordinate of Toronto are {}, {}.'.format(toronto_latitude, toronto_longitude))    

The geograpical coordinate of Toronto are 43.653226, -79.3831843.


In [46]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

## 4. Getting started with Foursquare

#### Define Foursquare Credentials and Version

Register/Log in to Foursquare and under Developers tab, find your credentials. Insert them below.

In [47]:
# Enter your credentials. Sign up/Login to foursquare to obtain them. Go to developers tab.

#CLIENT_ID = 'your-client-ID' # your Foursquare ID
#CLIENT_SECRET = 'your-client-secret' # your Foursquare Secret
#VERSION = '20180605' # Foursquare API version

#print('Your credentails:')
#print('CLIENT_ID: ' + CLIENT_ID)
#print('CLIENT_SECRET:' + CLIENT_SECRET)

In [75]:
# @hidden_cell

#### Let's explore the first neighborhood in our dataframe.

In [49]:
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Rouge, Malvern are 43.8066863, -79.1943534.


#### Now, let's get the top 100 venues that are in Rouge, Malvern within a radius of 500 meters.

In [50]:
#LIMIT = 100 # limit of number of venues returned by Foursquare API
#radius = 500 # define radius in meters

# create URL
#url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
#    CLIENT_ID, 
#    CLIENT_SECRET, 
#    VERSION, 
#    neighborhood_latitude, 
#    neighborhood_longitude, 
#    radius, 
#    LIMIT)

#url

In [76]:
# @hidden_cell

In [52]:
# Send the GET request and examine the resutls
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5b9804e2dd57972b7eb5701d'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4d669cba83865481c948fa53-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/spa_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d1ed941735',
         'name': 'Spa',
         'pluralName': 'Spas',
         'primary': True,
         'shortName': 'Spa'}],
       'id': '4d669cba83865481c948fa53',
       'location': {'address': '8130 Sheppard Ave E',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'Morningside Ave',
        'distance': 595,
        'formattedAddress': ['8130 Sheppard Ave E (Morningside Ave)',
         'Toronto ON M1B 3W3',
         'Canada'],
        'labeledLatLngs': [{'label': 'd

All the information is in the items key.

In [53]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Clean the json and structure it into a *pandas* dataframe.

In [54]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # transform in pandas dataframe

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(20)

Unnamed: 0,name,categories,lat,lng
0,Images Salon & Spa,Spa,43.802283,-79.198565
1,Caribbean Wave,Caribbean Restaurant,43.798558,-79.195777
2,Harvey's,Fast Food Restaurant,43.800106,-79.198258
3,Wendy's,Fast Food Restaurant,43.802008,-79.19808
4,Wendy's,Fast Food Restaurant,43.807448,-79.199056
5,Tim Hortons,Coffee Shop,43.802,-79.198169
6,Staples Morningside,Paper / Office Supplies Store,43.800285,-79.196607
7,Lee Valley,Hobby Shop,43.803161,-79.199681
8,Bus Stop: 85 & 116,Bus Station,43.802198,-79.199389
9,Tim Hortons / Esso,Coffee Shop,43.80166,-79.199133


In [55]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

17 venues were returned by Foursquare.


## 5. Explore Neighborhoods in Toronto

#### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [56]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [57]:
toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West, Steeles West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The D

In [58]:
print(toronto_venues.shape)
toronto_venues.head()

(2197, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge, Malvern",43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet,43.767697,-79.189914,BBQ Joint


Let's check how many venues were returned for each neighborhood

In [59]:
toronto_venues['Neighborhood'].value_counts()

Ryerson, Garden District                                                                                                                  100
First Canadian Place, Underground city                                                                                                    100
Adelaide, King, Richmond                                                                                                                  100
Chinatown, Grange Park, Kensington Market                                                                                                 100
Commerce Court, Victoria Hotel                                                                                                            100
St. James Town                                                                                                                            100
Design Exchange, Toronto Dominion Centre                                                                                                  100
Harbou

So some neighbours don't have any venues next to them. So lets remove them from the initial dataframe.

In [60]:
final = []

for index, row in toronto_venues.iterrows():
    final.append(row['Neighborhood'])

for index, row in df.iterrows():
    if (row['Neighbourhood'] not in set(final)):
        print(row['Neighbourhood'])
        df = df[df['Neighbourhood'] != row['Neighbourhood']]

df.shape

Upper Rouge
Silver Hills, York Mills
Newtonbrook, Willowdale
Islington Avenue


(98, 5)

Let's find out how many unique categories can be curated from all the returned venues

In [61]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 272 uniques categories.


## 6. Analyze Each Neighborhood

In [62]:
#create new dataframe

# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [63]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.010000,0.000000,0.000000,0.000000,0.0,0.010000,0.0,0.000000
1,Agincourt,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
4,"Alderwood, Long Branch",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.058824,0.000000,0.0,0.000000,0.0,0.000000
6,Bayview Village,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
7,"Bedford Park, Lawrence Manor East",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
8,Berczy Park,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
9,"Birch Cliff, Cliffside West",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000


#### Let's print each neighborhood along with the top 5 most common venues

In [64]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.07
1  American Restaurant  0.04
2           Steakhouse  0.04
3                 Café  0.04
4      Thai Restaurant  0.04


----Agincourt----
                venue  freq
0      Sandwich Place  0.17
1  Chinese Restaurant  0.17
2              Lounge  0.17
3          Print Shop  0.17
4      Breakfast Spot  0.17


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                      venue  freq
0                Playground   0.5
1                      Park   0.5
2            Massage Studio   0.0
3            Medical Center   0.0
4  Mediterranean Restaurant   0.0


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                  venue  freq
0         Grocery Store  0.25
1  Fast Food Restaurant  0.12
2            Beer Store  0.12
3   Fried Chicken Joint  0.12
4        Sandwich Place  0.12


----Alderwood, Long Branch----
 


----East Toronto----
                       venue  freq
0                       Park   0.5
1          Convenience Store   0.5
2  Middle Eastern Restaurant   0.0
3                      Motel   0.0
4        Monument / Landmark   0.0


----Emery, Humberlea----
                       venue  freq
0             Baseball Field   1.0
1                Yoga Studio   0.0
2  Middle Eastern Restaurant   0.0
3                      Motel   0.0
4        Monument / Landmark   0.0


----Fairview, Henry Farm, Oriole----
                  venue  freq
0        Clothing Store  0.12
1  Fast Food Restaurant  0.07
2           Coffee Shop  0.06
3            Shoe Store  0.04
4         Women's Store  0.03


----First Canadian Place, Underground city----
                 venue  freq
0          Coffee Shop  0.11
1                 Café  0.07
2                Hotel  0.06
3           Restaurant  0.05
4  American Restaurant  0.04


----Flemingdon Park, Don Mills South----
                 venue  freq
0           Beer 

#### Let's put that into a *pandas* dataframe

In [65]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [66]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,American Restaurant,Thai Restaurant,Steakhouse,Clothing Store,Burger Joint,Bakery,Bar,Restaurant
1,Agincourt,Breakfast Spot,Lounge,Skating Rink,Sandwich Place,Chinese Restaurant,Print Shop,Donut Shop,Diner,Discount Store,Dog Run
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Playground,Park,Women's Store,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Fried Chicken Joint,Sandwich Place,Fast Food Restaurant,Beer Store,Pharmacy,Deli / Bodega,Electronics Store,Eastern European Restaurant
4,"Alderwood, Long Branch",Pizza Place,Pub,Gym,Coffee Shop,Athletics & Sports,Dance Studio,Sandwich Place,Bank,Diner,Discount Store
5,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Frozen Yogurt Shop,Shopping Mall,Sandwich Place,Bridal Shop,Diner,Bank,Restaurant,Supermarket,Deli / Bodega
6,Bayview Village,Chinese Restaurant,Café,Japanese Restaurant,Bank,Diner,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Women's Store
7,"Bedford Park, Lawrence Manor East",Italian Restaurant,Fast Food Restaurant,Juice Bar,Sushi Restaurant,Coffee Shop,Hardware Store,Pizza Place,Restaurant,Butcher,Café
8,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Bakery,Beer Bar,Farmers Market,Café,Steakhouse,Cheese Shop,Restaurant
9,"Birch Cliff, Cliffside West",College Stadium,Café,Skating Rink,General Entertainment,Women's Store,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant


## 7. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [67]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 4, 2, 4, 0, 0, 4, 0, 0, 4])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [68]:
toronto_merged = df

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.tail() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
97,M9N,York,Weston,43.7069,-79.5182,0,Park,Women's Store,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
98,M9P,Etobicoke,Westmount,43.6963,-79.5322,0,Pizza Place,Intersection,Playground,Chinese Restaurant,Sandwich Place,Coffee Shop,Gluten-free Restaurant,Gift Shop,Eastern European Restaurant,Dumpling Restaurant
99,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.6889,-79.5547,4,Pizza Place,Mobile Phone Shop,Park,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
100,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.7394,-79.5884,4,Grocery Store,Pizza Place,Fried Chicken Joint,Sandwich Place,Fast Food Restaurant,Beer Store,Pharmacy,Deli / Bodega,Electronics Store,Eastern European Restaurant
101,M9W,Etobicoke,Northwest,43.7067,-79.5941,2,Drugstore,Rental Car Location,Bar,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant


Finally, let's visualize the resulting clusters

In [69]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

__[CLICK HERE](https://github.com/Deffro/cluster-neighborhoods/blob/master/map-clusters.jpg)__ to see the map and the clusters.

## 5. Examine Clusters

#### Cluster 1

In [70]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,0,Fast Food Restaurant,Print Shop,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
4,Scarborough,0,Hakka Restaurant,Bank,Fried Chicken Joint,Thai Restaurant,Caribbean Restaurant,Bakery,Athletics & Sports,Dog Run,Doner Restaurant,Donut Shop
5,Scarborough,0,Playground,Construction & Landscaping,Business Service,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
7,Scarborough,0,Bus Line,Bakery,Soccer Field,Bus Station,Ice Cream Shop,Intersection,Park,Empanada Restaurant,Electronics Store,Eastern European Restaurant
8,Scarborough,0,Motel,Movie Theater,Skating Rink,American Restaurant,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
11,Scarborough,0,Auto Garage,Smoke Shop,Middle Eastern Restaurant,Sandwich Place,Breakfast Spot,Women's Store,Donut Shop,Dog Run,Doner Restaurant,Dumpling Restaurant
15,Scarborough,0,Fast Food Restaurant,Chinese Restaurant,Coffee Shop,Thrift / Vintage Store,Pizza Place,Burger Joint,Sandwich Place,Breakfast Spot,Japanese Restaurant,Pharmacy
18,North York,0,Clothing Store,Fast Food Restaurant,Coffee Shop,Shoe Store,Women's Store,Tea Room,Sporting Goods Shop,Bakery,Bus Station,Cosmetics Shop
22,North York,0,Restaurant,Ramen Restaurant,Sandwich Place,Coffee Shop,Pizza Place,Café,Steakhouse,Indonesian Restaurant,Fast Food Restaurant,Japanese Restaurant
23,North York,0,Bank,Park,Women's Store,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant


#### Cluster 2

In [71]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
77,West Toronto,1,Bar,Café,Restaurant,Coffee Shop,Men's Store,Vietnamese Restaurant,Asian Restaurant,Bakery,Cocktail Bar,French Restaurant


#### Cluster 3

In [72]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Scarborough,2,Electronics Store,Medical Center,Breakfast Spot,Rental Car Location,BBQ Joint,Mexican Restaurant,Women's Store,Donut Shop,Discount Store,Dog Run
13,Scarborough,2,Pizza Place,Fried Chicken Joint,Shopping Mall,Italian Restaurant,Rental Car Location,Thai Restaurant,Chinese Restaurant,Noodle House,Fast Food Restaurant,Coffee Shop
43,East Toronto,2,Café,Coffee Shop,Bakery,Italian Restaurant,Gastropub,American Restaurant,Sandwich Place,Stationery Store,Bar,Fish Market
76,West Toronto,2,Supermarket,Bakery,Gym / Fitness Center,Pharmacy,Park,Music Venue,Café,Middle Eastern Restaurant,Discount Store,Brewery
95,North York,2,Pizza Place,Empanada Restaurant,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
101,Etobicoke,2,Drugstore,Rental Car Location,Bar,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant


#### Cluster 4

In [73]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
29,North York,3,Bar,Coffee Shop,Massage Studio,Furniture / Home Store,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop


#### Cluster 5

In [74]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Scarborough,4,Bar,History Museum,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Women's Store
3,Scarborough,4,Coffee Shop,Insurance Office,Korean Restaurant,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
6,Scarborough,4,Chinese Restaurant,Hobby Shop,Discount Store,Train Station,Convenience Store,Department Store,Coffee Shop,Electronics Store,Empanada Restaurant,Ethiopian Restaurant
9,Scarborough,4,College Stadium,Café,Skating Rink,General Entertainment,Women's Store,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant
10,Scarborough,4,Indian Restaurant,Pet Store,Vietnamese Restaurant,Latin American Restaurant,Gaming Cafe,Chinese Restaurant,Donut Shop,Dim Sum Restaurant,Diner,Discount Store
12,Scarborough,4,Breakfast Spot,Lounge,Skating Rink,Sandwich Place,Chinese Restaurant,Print Shop,Donut Shop,Diner,Discount Store,Dog Run
14,Scarborough,4,Playground,Park,Women's Store,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
17,North York,4,Pool,Golf Course,Mediterranean Restaurant,Dog Run,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant
19,North York,4,Chinese Restaurant,Café,Japanese Restaurant,Bank,Diner,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Women's Store
24,North York,4,Pizza Place,Grocery Store,Coffee Shop,Butcher,Pharmacy,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Department Store
