# 1. Scraping Toronto data from Wiki and cleaning

In [5]:
import pandas as pd

# Read every HTML table on the Wikipedia page and keep the first one.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(url, header=0)  # header=0: the first table row holds the column names
df = wiki_tables[0]
print(df.shape)
df.head()

(287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
# Keep only the rows that have an assigned borough; rows whose borough is "Not assigned" are discarded.
unassigned_rows = df.index[df["Borough"] == "Not assigned"]
df = df.drop(unassigned_rows, axis = 0)
print(df.shape)
df.head()

(210, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [7]:
# Rows sharing the same postcode and borough are combined into one row;
# their neighbourhood names are joined into a single comma-separated string.
df_grp = df.groupby(["Postcode", "Borough"], as_index = False).agg(', '.join)
print(df_grp.shape)
df_grp.head()

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
# A row with a borough but a "Not assigned" neighbourhood takes the borough name as its neighbourhood.

import numpy as np
unassigned = df_grp['Neighbourhood'] == "Not assigned"
df_grp.loc[unassigned, 'Neighbourhood'] = df_grp.loc[unassigned, 'Borough']

In [15]:
# Spot check: show the rows whose borough is Queen's Park.
df_grp.loc[df_grp["Borough"] == "Queen's Park"]

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Queen's Park


In [22]:
# Preview the first ten rows of the grouped table.
df_grp.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [20]:
# Number of rows of the grouped table in each borough.
df_grp.groupby('Borough').count()

Unnamed: 0_level_0,Postcode,Neighbourhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,9,9
Downtown Toronto,19,19
East Toronto,5,5
East York,5,5
Etobicoke,11,11
Mississauga,1,1
North York,24,24
Queen's Park,1,1
Scarborough,17,17
West Toronto,6,6


In [23]:
# Shape of the grouped table at the end of the cleaning step.
print(df_grp.shape)

(103, 3)


# 2. Getting coordinates of each neighborhood

### Trying to get coordinates via geocoder

In [None]:
!conda install -c conda-forge geopy --yes

In [None]:
!pip install geocoder

In [None]:
!pip install --upgrade pip

In [38]:
import geocoder # import geocoder

In [None]:
# Look up the coordinates of every postcode with geocoder.google.

# Upper bound on lookups per postcode, so a service that never answers cannot hang the notebook.
MAX_GEOCODE_ATTEMPTS = 5

# Work on a copy: a plain assignment would only alias df_grp, so the columns added
# below would also appear on df_grp and clash with the coordinate join done later.
df_coord = df_grp.copy()

latitudes = []
longitudes = []

for postal_code in df_grp["Postcode"]:
    lat_lng_coords = None

    # retry a bounded number of times until coordinates are returned
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break

    if not lat_lng_coords:
        # no answer after all attempts: record missing values and move on
        lat_lng_coords = (float('nan'), float('nan'))

    latitudes.append(lat_lng_coords[0])
    longitudes.append(lat_lng_coords[1])
    print('code: {}, lat: {}, lng: {}'.format(postal_code, lat_lng_coords[0], lat_lng_coords[1]))

df_coord["Latitude"] = latitudes
df_coord["Longitude"] = longitudes

print(df_coord.shape)
df_coord.head()

The geocoder call above did not return any coordinates, so that code is not used in the rest of the notebook; it is kept here for reference only.

### Getting coordinates via csv

In [56]:
# Load the CSV of postal-code coordinates from the given URL and preview it.
coord = pd.read_csv('http://cocl.us/Geospatial_data')
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [59]:
# Give the postal-code column the same name as in df_grp so the two dataframes can be joined on it.
coord = coord.rename(columns={"Postal Code": "Postcode"})
coord.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [None]:
# Attach the coordinates to each grouped row by looking up its postcode.
coord_by_postcode = coord.set_index('Postcode')
df_coord = df_grp.join(coord_by_postcode, on='Postcode')

In [61]:
# Preview the joined table (postcode, borough, neighbourhoods, latitude, longitude).
df_coord.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# 3. Exploring and clustering the neighborhoods in Toronto

### A. Visualizing the dataset

In [66]:
# choosing boroughs that contain the word Toronto
# na=False: a missing borough counts as "no match" instead of producing an invalid mask.
# .copy(): make df_toronto an independent dataframe, so that modifying it later
# does not raise SettingWithCopyWarning.
df_toronto = df_coord[df_coord["Borough"].str.contains("Toronto", na=False)].copy()

list(df_toronto["Borough"].unique())

['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']

In [67]:
# Size and first rows of the Toronto-only selection.
print(df_toronto.shape)
df_toronto.head()

(39, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [68]:
# Report how many distinct boroughs and how many rows (postcodes) were selected.
borough_count = len(df_toronto['Borough'].unique())
postcode_count = df_toronto.shape[0]
print('The dataframe has {} boroughs and {} postcodes.'.format(borough_count, postcode_count))

The dataframe has 4 boroughs and 39 postcodes.


In [70]:
# Use geopy library to get the latitude and longitude values of Toronto.
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing is found; stop with a clear message
    # instead of an AttributeError on the lines below.
    raise RuntimeError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [72]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

Collecting package metadata (repodata.json): done
Solving environment: - 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/osx-64::anaconda==5.2.0=py36_3
  - defaults/osx-64::astropy==3.0.2=py36h917ab60_1
  - defaults/osx-64::bkcharts==0.2=py36h073222e_0
  - defaults/osx-64::blaze==0.11.3=py36h02e7a37_0
  - defaults/osx-64::bokeh==0.12.16=py36_0
  - defaults/osx-64::bottleneck==1.2.1=py36hbd380ad_0
  - defaults/osx-64::dask==0.17.5=py36_0
  - defaults/osx-64::datashape==0.5.4=py36hfb22df8_0
  - defaults/osx-64::h5py==2.7.1=py36ha8ecd60_2
  - defaults/osx-64::imageio==2.3.0=py36_0
  - defaults/osx-64::matplotlib==2.2.2=py36ha7267d0_0
  - defaults/osx-64::mkl_fft==1.0.1=py36h917ab60_0
  - defaults/osx-64::mkl_random==1.0.1=py36h78cc56f_0
  - defaults/osx-64::numba==0.38.0=py36h1702cab_0
  - defaults/osx-64::numexpr==2.6.5=py36h057f876_0
  - defaults/osx-64::numpy==1.14.3=py36h9bb19eb_1
  - default

In [78]:
# Draw a map of Toronto with one circle marker per postcode of df_toronto.

# base map centred on the Toronto coordinates obtained above
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# one marker per row; the popup shows borough, postcode and neighbourhood names
marker_fields = df_toronto[['Latitude', 'Longitude', 'Borough', 'Postcode', 'Neighbourhood']]
for lat, lng, borough, postcode, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup_text = '{}, {}, {}'.format(borough, postcode, neighborhood)
    popup = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a CircleMarker one; kept as in the original - confirm
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

In [152]:
# Define Foursquare Credentials and Version
# The credentials are read from environment variables so they never have to be
# written into the notebook; when a variable is unset the value is an empty string.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604'

In [90]:
# Let's explore the first row of df_toronto.
df_toronto.iloc[0]

Postcode                  M4E
Borough          East Toronto
Neighbourhood     The Beaches
Latitude              43.6764
Longitude             -79.293
Name: 37, dtype: object

In [92]:
# Latitude, longitude and neighbourhood name(s) of the first row of df_toronto.
neighborhood_latitude, neighborhood_longitude, neighborhood_name = (
    df_toronto[column].iloc[0] for column in ("Latitude", "Longitude", "Neighbourhood")
)

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [94]:
# Now, let's get up to 100 venues within a radius of 500 meters of the neighborhood selected above.

import requests # library to handle requests

radius = 500
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)

# timeout: do not wait forever on a stalled connection
response = requests.get(url, timeout=30)
# surface an HTTP error status here rather than as a confusing failure when the JSON is read later
response.raise_for_status()
results = response.json()

In [95]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide a list of category dicts under the key 'categories'
        or, if that key is absent, under 'venue.categories'.

    Returns
    -------
    The 'name' of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; any other error is a real problem
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [96]:
#json to df

# json_normalize is exposed at the top level of newer pandas releases;
# fall back to the older pandas.io.json location when it is not.
try:
    from pandas import json_normalize # tranform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# replace each row's list of categories by the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last part of each dotted column name
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Glen Stewart Park,Park,43.675278,-79.294647
3,Grover Pub and Grub,Pub,43.679181,-79.297215
4,Upper Beaches,Neighborhood,43.680563,-79.292869


In [97]:
# Number of venues found for this first neighborhood.
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.


### B. Explore venues in Toronto

Create a function that repeats the same process for all the neighborhoods (i.e. postcodes) in Toronto.

In [109]:
def getNearbyVenues(boroughs, postcodes, neighborhoods, latitudes, longitudes, radius=500):
    """Query the Foursquare 'explore' endpoint once per location and collect the venues.

    Parameters
    ----------
    boroughs, postcodes, neighborhoods, latitudes, longitudes : iterables
        Parallel sequences; element i of each describes location i.
    radius : int, default 500
        Value sent as the 'radius' request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the location fields followed by the
        venue name, coordinates and first category name. 'Venue Category' is
        None for a venue without categories. The frame has the nine named
        columns even when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    column_names = ['Borough',
                    'Postcode',
                    'Neighborhood(s)',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for borough, postcode, neighborhood, lat, lng in zip(boroughs, postcodes, neighborhoods, latitudes, longitudes):
        print(postcode, neighborhood)

        # make the GET request; the query string is built (and escaped) by requests
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information of each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                borough,
                postcode,
                neighborhood,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may have no category at all
                categories[0]['name'] if categories else None))

    # passing the column names here keeps the result well-formed even with zero rows
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return nearby_venues

In [110]:
# Run getNearbyVenues on every row of df_toronto and store the result in a new dataframe called toronto_venues.


toronto_venues = getNearbyVenues(boroughs = df_toronto["Borough"],
                                 postcodes = df_toronto['Postcode'],
                                 neighborhoods = df_toronto['Neighbourhood'],
                                 latitudes = df_toronto['Latitude'],
                                 longitudes = df_toronto['Longitude']
                                  )

M4E The Beaches
M4K The Danforth West, Riverdale
M4L The Beaches West, India Bazaar
M4M Studio District
M4N Lawrence Park
M4P Davisville North
M4R North Toronto West
M4S Davisville
M4T Moore Park, Summerhill East
M4V Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
M4W Rosedale
M4X Cabbagetown, St. James Town
M4Y Church and Wellesley
M5A Harbourfront
M5B Ryerson, Garden District
M5C St. James Town
M5E Berczy Park
M5G Central Bay Street
M5H Adelaide, King, Richmond
M5J Harbourfront East, Toronto Islands, Union Station
M5K Design Exchange, Toronto Dominion Centre
M5L Commerce Court, Victoria Hotel
M5N Roselawn
M5P Forest Hill North, Forest Hill West
M5R The Annex, North Midtown, Yorkville
M5S Harbord, University of Toronto
M5T Chinatown, Grange Park, Kensington Market
M5V CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
M5W Stn A PO Boxes 25 The Esplanade
M5X First Canadian Place, Underground city
M6G Christie
M6H

In [111]:
# Size and first rows of the collected venues table.
print(toronto_venues.shape)
toronto_venues.head()

(1717, 9)


Unnamed: 0,Borough,Postcode,Neighborhood(s),Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,East Toronto,M4E,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,East Toronto,M4E,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,East Toronto,M4E,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
3,East Toronto,M4E,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
4,East Toronto,M4E,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


In [117]:
# Number of venue rows collected for each neighborhood group.
toronto_venues.groupby('Neighborhood(s)').count()

Unnamed: 0_level_0,Borough,Postcode,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood(s),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100,100,100
Berczy Park,58,58,58,58,58,58,58,58
"Brockton, Exhibition Place, Parkdale Village",24,24,24,24,24,24,24,24
Business Reply Mail Processing Centre 969 Eastern,18,18,18,18,18,18,18,18
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",14,14,14,14,14,14,14,14
"Cabbagetown, St. James Town",48,48,48,48,48,48,48,48
Central Bay Street,85,85,85,85,85,85,85,85
"Chinatown, Grange Park, Kensington Market",86,86,86,86,86,86,86,86
Christie,18,18,18,18,18,18,18,18
Church and Wellesley,85,85,85,85,85,85,85,85


In [114]:
# Number of distinct venue categories among all collected venues.
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 229 uniques categories.


### C. Analyze Each Neighborhood (Postcode)

In [119]:
# One-hot encode the venue category: one indicator column per category, one row per venue.
toronto_onehot = pd.get_dummies(toronto_venues['Venue Category'])

# Put the neighborhood name(s) of each venue in front of the indicator columns.
toronto_onehot.insert(0, 'Neighborhood(s)', toronto_venues['Neighborhood(s)'])

toronto_onehot.head()

Unnamed: 0,Neighborhood(s),Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Transportation Service,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
# Shape of the one-hot table (rows, columns).
toronto_onehot.shape

(1717, 230)

In [121]:
# Average the indicator columns per neighborhood group: each value becomes the
# share of that group's venues that belong to the category.
toronto_grouped = toronto_onehot.groupby('Neighborhood(s)', as_index=False).mean()
toronto_grouped.head()

Unnamed: 0,Neighborhood(s),Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Transportation Service,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.071429,0.071429,0.071429,0.142857,0.071429,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
# Shape of the grouped frequency table (rows, columns).
toronto_grouped.shape

(39, 230)

In [123]:
# Helper that ranks the venue categories of one row from most to least frequent.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest values of `row`.

    The first element of `row` is skipped (it holds the neighborhood name, not
    a frequency); the remaining values are sorted in descending order and the
    index labels of the leading entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [124]:
# Now let's create the new dataframe and display the top 10 venues for each neighborhood.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# '1st', '2nd', '3rd' use the suffixes above, every later rank uses 'th'
columns = ['Neighborhood(s)']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# rank the categories of every row of toronto_grouped
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]

# create a new dataframe: neighborhood name(s) followed by the ranked categories
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood(s)', toronto_grouped['Neighborhood(s)'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood(s),1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Bar,Breakfast Spot,Hotel,Restaurant,Asian Restaurant,Thai Restaurant,Cosmetics Shop
1,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Steakhouse,Café,Bakery,Farmers Market,Beer Bar,Seafood Restaurant,Bistro
2,"Brockton, Exhibition Place, Parkdale Village",Nightclub,Coffee Shop,Breakfast Spot,Café,Bakery,Convenience Store,Italian Restaurant,Stadium,Intersection,Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Fast Food Restaurant,Park,Pizza Place,Recording Studio,Restaurant,Butcher,Burrito Place,Brewery,Skate Park
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Terminal,Airport Lounge,Boutique,Bar,Coffee Shop,Boat or Ferry,Harbor / Marina,Sculpture Garden,Airport Service,Airport Gate


### D. Clustering Neighborhoods

Run k-means to cluster the neighborhoods into 5 clusters.

In [132]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5
# drop the name column so only the category frequencies are clustered;
# the `columns=` keyword replaces the positional axis argument, which newer pandas rejects
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood(s)')

# run k-means clustering (random_state fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the last ten rows of the dataframe
kmeans.labels_[-10:]

array([1, 0, 0, 0, 0, 0, 0, 3, 0, 0], dtype=int32)

In [133]:
# Distinct cluster labels produced by k-means.
set(kmeans.labels_)

{0, 1, 2, 3, 4}

In [134]:
# Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# add clustering labels (overwrite them when this cell is run a second time,
# because inserting an already existing column raises an error)
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# renaming Neighbourhood column to be able to join 2 dataframes on it;
# rename() returns a new dataframe, so df_toronto itself keeps its column names
toronto_merged = df_toronto.rename(columns={"Neighbourhood": "Neighborhood(s)"})

# add the cluster label and top venues to each postcode row (which already holds latitude/longitude)
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood(s)'), on='Neighborhood(s)')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [136]:
# Size and first rows of the merged table.
print(toronto_merged.shape)
toronto_merged.head() # check the last columns: cluster label and top venues

(39, 16)


Unnamed: 0,Postcode,Borough,Neighborhood(s),Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,Pub,Park,Neighborhood,Health Food Store,Trail,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Ice Cream Shop,Yoga Studio,Bubble Tea Shop,Sports Bar,Spa,Juice Bar
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Pet Store,Pub,Burger Joint,Burrito Place,Liquor Store,Sandwich Place,Fast Food Restaurant,Italian Restaurant,Intersection,Fish & Chips Shop
43,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,American Restaurant,Bakery,Brewery,Italian Restaurant,Gastropub,Yoga Studio,Fish Market,Pet Store
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Lake,Park,Swim School,Bus Line,Dim Sum Restaurant,Yoga Studio,Discount Store,Falafel Restaurant,Event Space,Ethiopian Restaurant


In [140]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: kclusters evenly spaced colours of the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, bor, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Neighborhood(s)'], toronto_merged['Cluster Labels']):
    # a row without a match in the join has no cluster label: nothing to colour
    if pd.isna(cluster):
        continue
    # missing labels turn the whole column into floats; list indexing needs an int
    cluster = int(cluster)
    label = folium.Popup('Borough: ' + str(bor) + ' Neigh: ' + str(poi) + ' Cluster: ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster-1 is kept from the original: cluster 0 wraps around to the last colour
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### E. Examine Clusters

Examine each cluster and determine the venue categories that distinguish it from the others. Based on these defining categories, a name can be assigned to each cluster.

In [141]:
# Cluster 0: show the borough (column 1) plus the cluster label and top-venue columns (column 5 onwards).
summary_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
in_cluster = toronto_merged['Cluster Labels'] == 0
toronto_merged.loc[in_cluster, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
41,East Toronto,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Ice Cream Shop,Yoga Studio,Bubble Tea Shop,Sports Bar,Spa,Juice Bar
42,East Toronto,0,Pet Store,Pub,Burger Joint,Burrito Place,Liquor Store,Sandwich Place,Fast Food Restaurant,Italian Restaurant,Intersection,Fish & Chips Shop
43,East Toronto,0,Café,Coffee Shop,American Restaurant,Bakery,Brewery,Italian Restaurant,Gastropub,Yoga Studio,Fish Market,Pet Store
44,Central Toronto,0,Lake,Park,Swim School,Bus Line,Dim Sum Restaurant,Yoga Studio,Discount Store,Falafel Restaurant,Event Space,Ethiopian Restaurant
46,Central Toronto,0,Clothing Store,Coffee Shop,Sporting Goods Shop,Yoga Studio,Mexican Restaurant,Shoe Store,Salon / Barbershop,Restaurant,Rental Car Location,Park
47,Central Toronto,0,Sandwich Place,Dessert Shop,Pizza Place,Gym,Italian Restaurant,Café,Sushi Restaurant,American Restaurant,Coffee Shop,Transportation Service
49,Central Toronto,0,Coffee Shop,Pub,American Restaurant,Sushi Restaurant,Supermarket,Fried Chicken Joint,Sports Bar,Pizza Place,Liquor Store,Vietnamese Restaurant
51,Downtown Toronto,0,Coffee Shop,Café,Pizza Place,Park,Restaurant,Italian Restaurant,Pub,Bakery,Sandwich Place,Butcher
52,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Bubble Tea Shop,Burger Joint,Men's Store,Café,Mediterranean Restaurant
53,Downtown Toronto,0,Coffee Shop,Pub,Bakery,Park,Mexican Restaurant,Breakfast Spot,Café,Yoga Studio,Farmers Market,Restaurant


=> coffee shop cluster

In [142]:
# Cluster 1: show the borough (column 1) plus the cluster label and top-venue columns (column 5 onwards).
summary_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
in_cluster = toronto_merged['Cluster Labels'] == 1
toronto_merged.loc[in_cluster, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
63,Central Toronto,1,Garden,Yoga Studio,Dim Sum Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


=> garden cluster

In [143]:
# Cluster 2: show the borough (column 1) plus the cluster label and top-venue columns (column 5 onwards).
summary_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
in_cluster = toronto_merged['Cluster Labels'] == 2
toronto_merged.loc[in_cluster, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
48,Central Toronto,2,Playground,Trail,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


=> playground cluster

In [144]:
# Cluster 3: show the borough (column 1) plus the cluster label and top-venue columns (column 5 onwards).
summary_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
in_cluster = toronto_merged['Cluster Labels'] == 3
toronto_merged.loc[in_cluster, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,East Toronto,3,Pub,Park,Neighborhood,Health Food Store,Trail,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop
50,Downtown Toronto,3,Park,Playground,Trail,Yoga Studio,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
64,Central Toronto,3,Park,Jewelry Store,Trail,Sushi Restaurant,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


=> park cluster

In [145]:
# Cluster 4: show the borough (column 1) plus the cluster label and top-venue columns (column 5 onwards).
summary_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
in_cluster = toronto_merged['Cluster Labels'] == 4
toronto_merged.loc[in_cluster, summary_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
45,Central Toronto,4,Gym,Breakfast Spot,Park,Food & Drink Shop,Hotel,Department Store,Dog Run,Sandwich Place,Discount Store,Event Space


=> gym cluster