# Setup

In [250]:
import pandas as pd
import numpy as np
import requests as re
import csv

# Part 1: Set up table

In [251]:
# create empty csv file
table_csv = open('table_csv.csv','w')
csv_headers = csv.writer(table_csv)
csv_headers.writerow(['Postcode', 'Borough','Neighbourhood'])

32

# Scraping

In [252]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


### Remove "Not assigned" boroughs

In [253]:
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [254]:
# Count rows
len(df)

210

### Combine Neighbourhood names if in same Borough

In [255]:
# Create new table for combined Neighbourhood names
df_comb = df.groupby(['Postcode', 'Borough'], as_index = False).agg(lambda x: ",".join(x))
print(len(df_comb))
df_comb.head(10)

103


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### Give "Not assigned" neighbourhoods name of Borough

In [256]:
# Find any Neighbourhoods with 'Not assigned' value
NANb = df_comb[df_comb['Neighbourhood'] == 'Not assigned']
NANb

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Not assigned


In [257]:
# Replace 'Not assigned' values
for index, row in df_comb.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']

# Check if Queen's Park has changed
df_comb.iloc[93,:]

Postcode                  M9A
Borough          Queen's Park
Neighbourhood    Queen's Park
Name: 93, dtype: object

### Dataframe shape

In [258]:
df_comb.shape

(103, 3)

# Part 2: Lat/Longs

In [259]:
latlong = pd.read_csv('http://cocl.us/Geospatial_data')
latlong

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [260]:
# Change column name to match for merge
latlong.rename(columns={'Postal Code': 'Postcode'}, inplace=True)
latlong.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [261]:
# Merge dataframes
df_latlong = df_comb.merge(latlong, on='Postcode', how='left')
df_latlong.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part 3: Explore and cluster boroughs

In [262]:
import folium
from geopy.geocoders import Nominatim
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Create initial map

In [263]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_latlong['Latitude'], df_latlong['Longitude'], df_latlong['Borough'], df_latlong['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Use Foursquare credentials

In [264]:
CLIENT_ID = 'ZLTURHWLSBTGEBULQHC0UVBPWOGXUHZGXSGY2V5ZM5U0JJTC' # your Foursquare ID
CLIENT_SECRET = 'KIBCZYSPTZ53I5BDAEKCMV3DIGWIAJVS11TBOQYU1TZCM300' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: ZLTURHWLSBTGEBULQHC0UVBPWOGXUHZGXSGY2V5ZM5U0JJTC
CLIENT_SECRET:KIBCZYSPTZ53I5BDAEKCMV3DIGWIAJVS11TBOQYU1TZCM300


## Find Borough to compare

In [265]:
# New Toronto in Etobicoke looks nice!  (bottom borough)
df_eto = df_latlong.loc[df_latlong['Borough'] == 'Etobicoke',:].reset_index()
df_eto.drop('index', axis=1, inplace=True)
df_eto

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321
1,M8W,Etobicoke,"Alderwood,Long Branch",43.602414,-79.543484
2,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
3,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout...",43.636258,-79.498509
4,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw...",43.628841,-79.520999
5,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar...",43.650943,-79.554724
6,M9C,Etobicoke,"Bloordale Gardens,Eringate,Markland Wood,Old B...",43.643515,-79.577201
7,M9P,Etobicoke,Westmount,43.696319,-79.532242
8,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
9,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


In [266]:
# Get lat/long of New Toronto
NT_lat = df_eto.loc[0, 'Latitude']
NT_long = df_eto.loc[0, 'Longitude']
NT_name = df_eto.loc[0, 'Neighbourhood']
print('Lat is {}, Long is {}, neighbourhood is {}.'.format(NT_lat, NT_long, NT_name))

Lat is 43.6056466, Long is -79.50132070000001, neighbourhood is Humber Bay Shores,Mimico South,New Toronto.


### Get request from Foursquare

In [267]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius in meters

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    NT_lat, 
    NT_long, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=ZLTURHWLSBTGEBULQHC0UVBPWOGXUHZGXSGY2V5ZM5U0JJTC&client_secret=KIBCZYSPTZ53I5BDAEKCMV3DIGWIAJVS11TBOQYU1TZCM300&v=20180605&ll=43.6056466,-79.50132070000001&radius=500&limit=100'

In [268]:
# Get request
results = re.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e276851760a7f001ba321da'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 14,
  'suggestedBounds': {'ne': {'lat': 43.6101466045, 'lng': -79.49511771930959},
   'sw': {'lat': 43.6011465955, 'lng': -79.50752368069043}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b119977f964a520488023e3',
       'name': 'LCBO',
       'location': {'address': '2762 Lake Shore Blvd W',
        'crossStreet': 'btwn 1st & 2nd St',
        'lat': 43.60228082768786,
        'lng': -79.4993016827402,
        'labeledLatLngs': [{'label': 'display',
          'lat':

In [269]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### Clean json into pandas dataframe

In [270]:
from pandas.io.json import json_normalize
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print('Found {} venues.'.format(nearby_venues.shape[0]))
nearby_venues.head()

Found 14 venues.


Unnamed: 0,name,categories,lat,lng
0,LCBO,Liquor Store,43.602281,-79.499302
1,Domino's Pizza,Pizza Place,43.601583,-79.500905
2,New Toronto Fish & Chips,Restaurant,43.601849,-79.503281
3,Delicia Bakery & Pastry,Bakery,43.601403,-79.503012
4,Lucky Dice Restaurant,Café,43.601392,-79.503056


## Explore all neighbourhoods in Etobicoke

In [271]:
# Function that pulls and cleans vendors within radius info for all Etobicoke neighbourhoods

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = re.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [272]:
# List and count Neighbourhoods in Etobicoke
df_eto_hood = getNearbyVenues(names=df_eto['Neighbourhood'],
                                   latitudes=df_eto['Latitude'],
                                   longitudes=df_eto['Longitude']
                                  )
# Venue count
print('Venue count is {}.'.format(df_eto_hood.shape[0]))

Humber Bay Shores,Mimico South,New Toronto
Alderwood,Long Branch
The Kingsway,Montgomery Road,Old Mill North
Humber Bay,King's Mill Park,Kingsway Park South East,Mimico NE,Old Mill South,The Queensway East,Royal York South East,Sunnylea
Kingsway Park South West,Mimico NW,The Queensway West,Royal York South West,South of Bloor
Cloverdale,Islington,Martin Grove,Princess Gardens,West Deane Park
Bloordale Gardens,Eringate,Markland Wood,Old Burnhamthorpe
Westmount
Kingsview Village,Martin Grove Gardens,Richview Gardens,St. Phillips
Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown
Northwest
Venue count is 74.


In [273]:
# Check top 5 rows of df_eto_hood
df_eto_hood.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,LCBO,43.602281,-79.499302,Liquor Store
1,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,Domino's Pizza,43.601583,-79.500905,Pizza Place
2,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,New Toronto Fish & Chips,43.601849,-79.503281,Restaurant
3,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,Delicia Bakery & Pastry,43.601403,-79.503012,Bakery
4,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,Lucky Dice Restaurant,43.601392,-79.503056,Café


### Etobicoke venue count by Neighbourhood

In [274]:
df_eto_hood.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown",11,11,11,11,11,11
"Alderwood,Long Branch",9,9,9,9,9,9
"Bloordale Gardens,Eringate,Markland Wood,Old Burnhamthorpe",8,8,8,8,8,8
"Cloverdale,Islington,Martin Grove,Princess Gardens,West Deane Park",1,1,1,1,1,1
"Humber Bay Shores,Mimico South,New Toronto",14,14,14,14,14,14
"Humber Bay,King's Mill Park,Kingsway Park South East,Mimico NE,Old Mill South,The Queensway East,Royal York South East,Sunnylea",1,1,1,1,1,1
"Kingsview Village,Martin Grove Gardens,Richview Gardens,St. Phillips",4,4,4,4,4,4
"Kingsway Park South West,Mimico NW,The Queensway West,Royal York South West,South of Bloor",13,13,13,13,13,13
Northwest,3,3,3,3,3,3
"The Kingsway,Montgomery Road,Old Mill North",2,2,2,2,2,2


### Unique categories from all venues

In [275]:
print('Unique category count: {}'.format(len(df_eto_hood['Venue Category'].unique())))

Unique category count: 42


## Set up dataframe with dummy variables for clustering

In [276]:
# one hot encoding
df_onehot = pd.get_dummies(df_eto_hood[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
df_onehot['Neighbourhood'] = df_eto_hood['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [df_onehot.columns[-1]] + list(df_onehot.columns[:-1])
df_onehot = df_onehot[fixed_columns]

df_onehot.head()

Unnamed: 0,Neighbourhood,Bakery,Bar,Baseball Field,Beer Store,Burger Joint,Burrito Place,Bus Line,Business Service,Café,...,Pub,Rental Car Location,Restaurant,River,Sandwich Place,Shopping Plaza,Skating Rink,Supplement Shop,Tanning Salon,Wings Joint
0,"Humber Bay Shores,Mimico South,New Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Humber Bay Shores,Mimico South,New Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Humber Bay Shores,Mimico South,New Toronto",0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,"Humber Bay Shores,Mimico South,New Toronto",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Humber Bay Shores,Mimico South,New Toronto",0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [277]:
# Compare column count of unique categories to dataframe columns. Should be one more than unique category count.
if df_onehot.shape[1] - 1 ==  len(df_eto_hood['Venue Category'].unique()):
    print('Column count checks out with {} venue categories.'.format(df_onehot.shape[1] - 1))
else:
    print('Check columns')

Column count checks out with 42 venue categories.


## Group Neighbourhoods and get mean of venue frequency

In [278]:
df_ven_mean = df_onehot.groupby('Neighbourhood').mean().reset_index()
df_ven_mean

Unnamed: 0,Neighbourhood,Bakery,Bar,Baseball Field,Beer Store,Burger Joint,Burrito Place,Bus Line,Business Service,Café,...,Pub,Rental Car Location,Restaurant,River,Sandwich Place,Shopping Plaza,Skating Rink,Supplement Shop,Tanning Salon,Wings Joint
0,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0
1,"Alderwood,Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.111111,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.0,0.0
2,"Bloordale Gardens,Eringate,Markland Wood,Old B...",0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.125,...,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0
3,"Cloverdale,Islington,Martin Grove,Princess Gar...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Humber Bay Shores,Mimico South,New Toronto",0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.142857,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Humber Bay,King's Mill Park,Kingsway Park Sout...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Kingsview Village,Martin Grove Gardens,Richvie...",0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
7,"Kingsway Park South West,Mimico NW,The Queensw...",0.076923,0.0,0.0,0.0,0.076923,0.076923,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.076923,0.076923,0.076923
8,Northwest,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"The Kingsway,Montgomery Road,Old Mill North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0


### Top 5 venues for each Neighbourhood

In [279]:
num_top_venues = 5

for hood in df_ven_mean['Neighbourhood']:
    print("----"+hood+"----")
    temp = df_ven_mean[df_ven_mean['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown----
                 venue  freq
0          Pizza Place  0.18
1        Grocery Store  0.18
2       Discount Store  0.09
3  Japanese Restaurant  0.09
4           Beer Store  0.09


----Alderwood,Long Branch----
          venue  freq
0   Pizza Place  0.22
1           Pub  0.11
2          Pool  0.11
3  Skating Rink  0.11
4           Gym  0.11


----Bloordale Gardens,Eringate,Markland Wood,Old Burnhamthorpe----
               venue  freq
0  Convenience Store  0.12
1         Beer Store  0.12
2     Shopping Plaza  0.12
3           Pharmacy  0.12
4               Café  0.12


----Cloverdale,Islington,Martin Grove,Princess Gardens,West Deane Park----
                       venue  freq
0                 Print Shop   1.0
1                     Bakery   0.0
2               Liquor Store   0.0
3         Mexican Restaurant   0.0
4  Middle Eastern Restaurant   0.0


----Humber Bay Shores,Mimico So

### Put info into pandas df and show by top venue count

In [280]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [281]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = df_ven_mean['Neighbourhood']

for ind in np.arange(df_ven_mean.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_ven_mean.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Grocery Store,Pizza Place,Pharmacy,Fried Chicken Joint,Beer Store,Sandwich Place,Japanese Restaurant,Fast Food Restaurant,Discount Store,Flower Shop
1,"Alderwood,Long Branch",Pizza Place,Pub,Coffee Shop,Pharmacy,Pool,Gym,Sandwich Place,Skating Rink,Beer Store,Chinese Restaurant
2,"Bloordale Gardens,Eringate,Markland Wood,Old B...",Coffee Shop,Shopping Plaza,Beer Store,Liquor Store,Café,Convenience Store,Pizza Place,Pharmacy,Wings Joint,Fried Chicken Joint
3,"Cloverdale,Islington,Martin Grove,Princess Gar...",Print Shop,Chinese Restaurant,Grocery Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store,Convenience Store,Coffee Shop
4,"Humber Bay Shores,Mimico South,New Toronto",Café,Hobby Shop,Pizza Place,Business Service,Fast Food Restaurant,Flower Shop,Fried Chicken Joint,Gym,Liquor Store,Mexican Restaurant


## Cluster Neighbourhoods into groups

In [282]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 3

df_grouped_clustering = df_ven_mean.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ #[0:10]

array([1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 1])

In [283]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster', kmeans.labels_)

df_clus = df_eto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_clus = df_clus.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

df_clus.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,1,Café,Hobby Shop,Pizza Place,Business Service,Fast Food Restaurant,Flower Shop,Fried Chicken Joint,Gym,Liquor Store,Mexican Restaurant
1,M8W,Etobicoke,"Alderwood,Long Branch",43.602414,-79.543484,1,Pizza Place,Pub,Coffee Shop,Pharmacy,Pool,Gym,Sandwich Place,Skating Rink,Beer Store,Chinese Restaurant
2,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944,1,River,Park,Wings Joint,Coffee Shop,Grocery Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store
3,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout...",43.636258,-79.498509,2,Baseball Field,Wings Joint,Coffee Shop,Gym,Grocery Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store
4,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw...",43.628841,-79.520999,1,Wings Joint,Tanning Salon,Burger Joint,Burrito Place,Convenience Store,Discount Store,Fast Food Restaurant,Grocery Store,Gym,Hardware Store


## Create map of clusters

In [284]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_clus['Latitude'], df_clus['Longitude'], df_clus['Neighbourhood'], df_clus['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [285]:
df_clus.loc[df_clus['Cluster'] == 0, df_clus.columns[[1] + list(range(5, df_clus.shape[1]))]]

Unnamed: 0,Borough,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Etobicoke,0,Print Shop,Chinese Restaurant,Grocery Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store,Convenience Store,Coffee Shop


In [286]:
df_clus.loc[df_clus['Cluster'] == 1, df_clus.columns[[1] + list(range(5, df_clus.shape[1]))]]

Unnamed: 0,Borough,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Etobicoke,1,Café,Hobby Shop,Pizza Place,Business Service,Fast Food Restaurant,Flower Shop,Fried Chicken Joint,Gym,Liquor Store,Mexican Restaurant
1,Etobicoke,1,Pizza Place,Pub,Coffee Shop,Pharmacy,Pool,Gym,Sandwich Place,Skating Rink,Beer Store,Chinese Restaurant
2,Etobicoke,1,River,Park,Wings Joint,Coffee Shop,Grocery Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store
4,Etobicoke,1,Wings Joint,Tanning Salon,Burger Joint,Burrito Place,Convenience Store,Discount Store,Fast Food Restaurant,Grocery Store,Gym,Hardware Store
6,Etobicoke,1,Coffee Shop,Shopping Plaza,Beer Store,Liquor Store,Café,Convenience Store,Pizza Place,Pharmacy,Wings Joint,Fried Chicken Joint
7,Etobicoke,1,Pizza Place,Chinese Restaurant,Intersection,Sandwich Place,Middle Eastern Restaurant,Discount Store,Coffee Shop,Fried Chicken Joint,Flower Shop,Fast Food Restaurant
8,Etobicoke,1,Pizza Place,Sandwich Place,Bus Line,Mobile Phone Shop,Wings Joint,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store
9,Etobicoke,1,Grocery Store,Pizza Place,Pharmacy,Fried Chicken Joint,Beer Store,Sandwich Place,Japanese Restaurant,Fast Food Restaurant,Discount Store,Flower Shop
10,Etobicoke,1,Bar,Drugstore,Rental Car Location,Wings Joint,Coffee Shop,Grocery Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Discount Store


In [287]:
df_clus.loc[df_clus['Cluster'] == 2, df_clus.columns[[1] + list(range(5, df_clus.shape[1]))]]

Unnamed: 0,Borough,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Etobicoke,2,Baseball Field,Wings Joint,Coffee Shop,Gym,Grocery Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store
