## --> Please scroll down for tasks 2 and 3


In [126]:
# install WebScraper, HTML parser, request library
# import requests, pandas
!pip install beautifulsoup4
from bs4 import BeautifulSoup
!pip install lxml
!pip install requests
import requests
import pandas as pd
pd.set_option("precision",8)
import numpy as np

Requirement not upgraded as not directly required: beautifulsoup4 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: lxml in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests)
Requirement not upgraded as not directly required: idna<2.7,>=2.5 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests)
Requirement not upgraded as not directly required: urllib3<1.23,>=1.21.1 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests)
Requirement not upgraded as not directly required: certifi>=2017.4.17 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests)


## Task 1: Scrape and preprocess Toronto neighborhood data from wikipedia

In [127]:
# Define source and scrape webpage
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source,'lxml')

# generate empty dataframe
df_tnbh = pd.DataFrame(columns=['PostalCode','Borough','Neighborhood'])

# iterate through all cells of the table and extract data
for i, cell in enumerate (soup.find_all('td')):
    try:
        # headline of the cell is the postal code
        # entry of the cell collects all boroughs and neiborhoods
        # first_link is an auxiliary variable used for clearly separating boroughs and neighborhoods
        headline = cell.p.b.text
        entry = cell.span.text
        try:
            first_link = cell.span.a.text
        except:
            first_link = '@'
        
        # process data only of cells with real data
        if entry != 'Not assigned':

            # data conditioning, e.g. removal of blanks, commas etc.
            entry_aux = entry.replace(' ','')
            first_link_aux = first_link.replace(' ','')            
            if entry_aux.startswith(first_link_aux): 
                entry = entry[:len(first_link)] + "/" + entry[len(first_link):]
            entry = entry.replace('(','/')
            entry = entry.replace(')','/')
            entry = entry.replace('//','/')
            if (entry[-1] == '/') or (entry[-1] == ' '):
                entry = entry[:-1]
            if (entry[-1] == '/') or (entry[-1] == ' '):
                entry = entry[:-1]
            
            # separate boroughs and individual neighborhoods
            entry_1 = entry.split('/')
            entry_2 = entry_1[1:]
            entry_3 = ''
            if len(entry_2[0]) == 1: 
                entry_3 = entry_1[0]
            else:
                for entr in entry_2:
                    if entr[0] == ' ':
                        entr = entr[0:]
                    if entr[-1] == ' ':
                        entr = entr[:-1]
                    entry_3 = entry_3 + ', ' + entr
                entry_3 = entry_3[2:]

            # write cell data to dataframe row
            df_tnbh.loc[i]=[headline,entry_1[0],entry_3]
        
    except:
        pass

df_tnbh.reset_index (inplace= True)
df_tnbh.drop (labels='index', axis=1, inplace=True)
df_tnbh.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,"Don Mills, North"
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [128]:
df_tnbh.shape

(103, 3)

## Task 2: Add location data from Google Geocoding API

In [129]:
# Google API key
API_key = 'AIzaSyASiIfKbBCvncSaTTxO5X9pNBd4j-5lVsM'

# Add 2 empty columns to the dataframe for geodata
df_tnbh['Latitude'] =''
df_tnbh['Longitude'] = ''

# iterate through rows of dataframe
for i, row in df_tnbh.iterrows():
    
    # searchterm ist "Toronto" + the postal code of interest
    searchterm = 'Toronto ' + row ['PostalCode']
    
    # construct URL to make API call
    url = 'https://maps.googleapis.com/maps/api/geocode/json?&address={}&key={}'.format(searchterm, API_key)

    # request to Google API
    response = requests.get(url).json() # get response
    geographical_data = response['results'][0]['geometry']['location']
    
    # write data to dataframe
    df_tnbh.iloc[i,3] = geographical_data['lat']
    df_tnbh.iloc[i,4] = geographical_data['lng']
    
df_tnbh.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,"Don Mills, North",43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Task 3: Clustering of Toronto neighborhoods

#### I'm basically doing the same ananlysis as in the lab of this week, only looking at Neibhborhoods in Boroughs that include "Toronto". So I want to find out which neighborhoods are most similar with respect to their venues

In [130]:
# Import folium, sklearn, matplotlib

!conda install -c conda-forge folium=0.5.0 --yes
import folium

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge


In [131]:
# New dataframe with only boroughs containing "Toronto"

df_tbr = df_tnbh[df_tnbh['Borough'].str.contains("Toronto")]
df_tbr.shape

(38, 5)

In [132]:
# New dataframe containing individual neighborhoods

# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
# instantiate the dataframe
df_toronto = pd.DataFrame(columns=column_names)

for i, row in df_tbr.iterrows():
    entry_1 = row['Neighborhood'].split(',')
    for entry_2 in entry_1:
        if len(entry_2) < 25 and len(row['Borough'])<25:
            df_toronto = df_toronto.append({'Borough': row['Borough'], 'Neighborhood': entry_2}, ignore_index = True) 
        
# Check for duplicates and drop
df_toronto.drop_duplicates(subset='Neighborhood', keep='first', inplace=True)
df_toronto.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,Regent Park,,
1,Downtown Toronto,Harbourfront,,
2,Downtown Toronto,Garden District,,
3,Downtown Toronto,Ryerson,,
4,Downtown Toronto,St. James Town,,


In [133]:
# Get geodata for neighborhoods from Google API

# Google API key
API_key = 'AIzaSyASiIfKbBCvncSaTTxO5X9pNBd4j-5lVsM'

# iterate through rows of dataframe
for i, row in df_toronto.iterrows():
    
    # searchterm ist "Toronto" + the postal code of interest
    searchterm = 'Toronto ' + row ['Neighborhood']
    
    # construct URL to make API call
    url = 'https://maps.googleapis.com/maps/api/geocode/json?&address={}&key={}'.format(searchterm, API_key)

    # request to Google API
    response = requests.get(url).json() # get response
    geographical_data = response['results'][0]['geometry']['location']
    
    # write data to dataframe
    row['Latitude'] = geographical_data['lat']
    row['Longitude'] = geographical_data['lng']
    
df_toronto.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,Regent Park,43.660323,-79.362044
1,Downtown Toronto,Harbourfront,43.640552,-79.378937
2,Downtown Toronto,Garden District,43.656898,-79.376122
3,Downtown Toronto,Ryerson,43.657658,-79.378802
4,Downtown Toronto,St. James Town,43.670867,-79.373306


In [134]:
# Get geodata of Toronto
    
# searchterm ist "Toronto" + the postal code of interest
searchterm = 'Toronto'
    
# construct URL to make API call
url = 'https://maps.googleapis.com/maps/api/geocode/json?&address={}&key={}'.format(searchterm, API_key)

# request to Google API
response = requests.get(url).json() # get response
geographical_data = response['results'][0]['geometry']['location']
toronto_lat = geographical_data['lat']
toronto_lng = geographical_data['lng']

In [135]:
# Create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[toronto_lat, toronto_lng], zoom_start=11)

# Add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [136]:
# Set Foursquare credentials
CLIENT_ID = 'W52YQTZIGFNTF5TK2DDCDRRGRKJZF12ORH10G0UAG5MIMAYW' # your Foursquare ID
CLIENT_SECRET = 'IOLIX4VL3TJDULWIPA5LMSXWBRUPIPVL4GJTCMBPHOMS0QLG' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [138]:
# Function to get location data from Foursquare for all neighborhoods
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    limit=100
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [139]:
# Run the above function on each neighborhood and create a new dataframe toronto_venues
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

Regent Park
  Harbourfront
Garden District
 Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond
  Adelaide
  King
Dufferin
  Dovercourt Village
Harbourfront East
  Union Station
  Toronto Islands
Little Portugal
  Trinity
The Danforth West
  Riverdale
Toronto Dominion Centre
  Design Exchange
Brockton
  Parkdale Village
  Exhibition Place
India Bazaar
  The Beaches West
Commerce Court
  Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park
  The Junction South
North Toronto West
The Annex
  North Midtown
  Yorkville
Parkdale
  Roncesvalles
Davisville
University of Toronto
  Harbord
Runnymede
  Swansea
Moore Park
  Summerhill East
Kensington Market
  Chinatown
  Grange Park
Summerhill West
  Rathnelly
  South Hill
  Forest Hill SE
  Deer Park
CN Tower
  King and Spadina
  Railway Lands
  Harbourfront West
  Bathurst Quay
  South Niagara
  Island airport
Rosedale
 Enclave of M5E
  Cabbagetown
First Canadi

In [140]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Regent Park,43.6603228,-79.3620439,Daniels Spectrum,43.66013679,-79.36180783,Performing Arts Venue
1,Regent Park,43.6603228,-79.3620439,Regent Park Aquatic Centre,43.66060031,-79.36139163,Pool
2,Regent Park,43.6603228,-79.3620439,Sumach Espresso,43.65813541,-79.35951549,Coffee Shop
3,Regent Park,43.6603228,-79.3620439,Sukhothai,43.65844447,-79.36568085,Thai Restaurant
4,Regent Park,43.6603228,-79.3620439,Thai To Go,43.663418,-79.36071,Thai Restaurant


In [141]:
# number of venues for each neighborhood
toronto_venues.groupby('Neighborhood').count().head()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Bathurst Quay,23,23,23,23,23,23
Cabbagetown,52,52,52,52,52,52
Chinatown,100,100,100,100,100,100
Deer Park,42,42,42,42,42,42


### Further analzye Neighborhoods

In [142]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborh.'] = toronto_venues['Neighborhood']

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborh.,Accessories Store,Adult Boutique,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborh.').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborh.,Accessories Store,Adult Boutique,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
1,Bathurst Quay,0.0,0.0,0.0,0.04347826,0.0,0.0,0.0,0.04347826,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cabbagetown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Chinatown,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.03,0.0
4,Deer Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.02380952,0.0,0.0,0.0,0.0,0.0,0.02380952,0.0


In [144]:
# Print each neighborhood along with the top 5 most common venues

num_top_venues = 5

for hood in toronto_grouped['Neighborh.']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborh.'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----  Adelaide----
             venue  freq
0            Hotel  0.06
1      Coffee Shop  0.06
2             Café  0.04
3  Thai Restaurant  0.03
4       Restaurant  0.03


----  Bathurst Quay----
          venue  freq
0   Coffee Shop  0.17
1          Café  0.09
2          Park  0.09
3  Dance Studio  0.04
4        Tunnel  0.04


----  Cabbagetown----
                 venue  freq
0           Restaurant  0.08
1          Coffee Shop  0.08
2                 Café  0.06
3          Pizza Place  0.06
4  Japanese Restaurant  0.04


----  Chinatown----
                   venue  freq
0     Chinese Restaurant  0.05
1                    Bar  0.05
2                   Café  0.04
3  Vietnamese Restaurant  0.04
4            Art Gallery  0.03


----  Deer Park----
                venue  freq
0         Coffee Shop  0.12
1    Sushi Restaurant  0.07
2                Café  0.05
3  Light Rail Station  0.05
4                 Gym  0.05


----  Design Exchange----
                 venue  freq
0          Coffee Sh

In [145]:
# Function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [146]:
# New dataframe with top 10 venues for each neighborhood

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborh.']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborh.'] = toronto_grouped['Neighborh.']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborh.,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Hotel,Café,Pizza Place,Japanese Restaurant,Restaurant,Steakhouse,Bar,Thai Restaurant,Beer Bar
1,Bathurst Quay,Coffee Shop,Park,Café,Mexican Restaurant,Tunnel,Garden,Caribbean Restaurant,Diner,Sculpture Garden,Japanese Restaurant
2,Cabbagetown,Coffee Shop,Restaurant,Café,Pizza Place,Pub,Japanese Restaurant,Italian Restaurant,Gastropub,Diner,Bakery
3,Chinatown,Bar,Chinese Restaurant,Vietnamese Restaurant,Café,Coffee Shop,Arts & Crafts Store,Yoga Studio,French Restaurant,Art Gallery,Caribbean Restaurant
4,Deer Park,Coffee Shop,Sushi Restaurant,Bagel Shop,Pizza Place,Gym,Pub,Café,Light Rail Station,Middle Eastern Restaurant,Fried Chicken Joint
5,Design Exchange,Coffee Shop,Café,Restaurant,Hotel,American Restaurant,Deli / Bodega,Seafood Restaurant,Steakhouse,Gastropub,Bakery
6,Dovercourt Village,Art Gallery,Café,Fast Food Restaurant,Bus Line,Coffee Shop,Park,Bar,Electronics Store,Donut Shop,Dumpling Restaurant
7,Exhibition Place,Park,Athletics & Sports,Soccer Stadium,Racetrack,Café,Theme Park,Train Station,Arts & Crafts Store,Convenience Store,Hockey Arena
8,Forest Hill SE,Pizza Place,Food & Drink Shop,Bagel Shop,Bank,Korean Restaurant,Sushi Restaurant,Optical Shop,Gift Shop,Trail,Frozen Yogurt Shop
9,Grange Park,Chinese Restaurant,Café,Japanese Restaurant,Coffee Shop,Sandwich Place,Ramen Restaurant,Ice Cream Shop,Vietnamese Restaurant,French Restaurant,Dumpling Restaurant


### Cluster Neighborhoods

In [113]:
# Set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborh.', 1)

# Run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 2, 2, 2, 0, 2, 2], dtype=int32)

In [147]:
# New dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

toronto_merged = df_toronto

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborh.'), on='Neighborhood')

toronto_merged.head()


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,Regent Park,43.660323,-79.362044,2,Coffee Shop,Thai Restaurant,Indian Restaurant,Fast Food Restaurant,Pharmacy,Pet Store,Sushi Restaurant,Pool,Performing Arts Venue,Restaurant
1,Downtown Toronto,Harbourfront,43.640552,-79.378937,2,Coffee Shop,Boat or Ferry,Café,Pizza Place,Park,Hotel,Thai Restaurant,Theater,Sushi Restaurant,Music Venue
2,Downtown Toronto,Garden District,43.656898,-79.376122,2,Coffee Shop,Clothing Store,Café,Restaurant,Movie Theater,Middle Eastern Restaurant,Theater,Tea Room,Japanese Restaurant,Ramen Restaurant
3,Downtown Toronto,Ryerson,43.657658,-79.378802,2,Coffee Shop,Clothing Store,Café,Italian Restaurant,Japanese Restaurant,Cosmetics Shop,Burger Joint,Pizza Place,Bar,Sandwich Place
4,Downtown Toronto,St. James Town,43.670867,-79.373306,2,Indian Restaurant,Pizza Place,Coffee Shop,Metro Station,Bar,Library,Market,Food & Drink Shop,Pharmacy,Filipino Restaurant


In [148]:
# Create map
map_clusters = folium.Map(location=[toronto_lat, toronto_lng], zoom_start=11)

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [149]:
# Examine cluster 1 as an example
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Central Bay Street,Coffee Shop,Clothing Store,Café,Italian Restaurant,Burger Joint,Tea Room,Bubble Tea Shop,Sandwich Place,Bakery,Pizza Place
22,Design Exchange,Coffee Shop,Café,Restaurant,Hotel,American Restaurant,Deli / Bodega,Seafood Restaurant,Steakhouse,Gastropub,Bakery
23,Brockton,Coffee Shop,Vietnamese Restaurant,Bar,Fast Food Restaurant,Shopping Mall,Grocery Store,Bakery,Clothing Store,Chinese Restaurant,Shoe Store
40,Yorkville,Italian Restaurant,Boutique,Café,Coffee Shop,Hotel,French Restaurant,Restaurant,Clothing Store,Spa,Japanese Restaurant
45,Harbord,Korean Restaurant,Coffee Shop,Café,Japanese Restaurant,Park,Deli / Bodega,Theater,Mexican Restaurant,Pizza Place,Spa
60,Railway Lands,Coffee Shop,Sports Bar,Hotel,Café,Italian Restaurant,Scenic Lookout,Park,Aquarium,Theater,Brewery
61,Harbourfront West,Coffee Shop,Boat or Ferry,Café,Pizza Place,Park,Hotel,Thai Restaurant,Theater,Sushi Restaurant,Music Venue
