In [7]:
# The code was removed by Watson Studio for sharing.

# Project: Segmenting and Clustering Neighborhoods in the city of Toronto, Canada
## Applied Data Science Capstone Week3_Project

### Eduardo Suarez

In [4]:
# Import necessary libraries

import pandas as pd
import numpy as np

# used pgeocode to get latitude and longitude. Geocoder is very unstable
!pip install pgeocode
import pgeocode

import json

!pip install geopy
from geopy.geocoders import Nominatim

import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!pip install folium
import folium


Collecting pgeocode
  Downloading pgeocode-0.3.0-py3-none-any.whl (8.5 kB)
Installing collected packages: pgeocode
Successfully installed pgeocode-0.3.0


# Part One: Scraping and Dataframe

In [71]:
# Fetch the Toronto ("M") postal code listing from Wikipedia.
# pd.read_html parses the tables found on the page and returns them as a list of DataFrames.

df = pd.read_html(io='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

df

[    Postal Code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 ..          ...               ...   
 175         M5Z      Not assigned   
 176         M6Z      Not assigned   
 177         M7Z      Not assigned   
 178         M8Z         Etobicoke   
 179         M9Z      Not assigned   
 
                                          Neighbourhood  
 0                                         Not assigned  
 1                                         Not assigned  
 2                                            Parkwoods  
 3                                     Victoria Village  
 4                            Regent Park, Harbourfront  
 ..                                                 ...  
 175                                       Not assigned  
 176                                       Not assigned  
 177                

In [72]:
# read_html gave us a list of tables; the postal code table is its first entry.

pc_toronto = pd.DataFrame(data=df[0])

pc_toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [73]:
# Discard the rows where both Borough and Neighbourhood are 'Not assigned'.
# Equivalently: keep a row as soon as at least one of the two fields carries real data.

df_toronto = pc_toronto[
    (pc_toronto['Borough'] != 'Not assigned') | (pc_toronto['Neighbourhood'] != 'Not assigned')
].reset_index(drop=True)

df_toronto.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [74]:
# Write the cleaned table as 'toronto.csv' through the project helper (overwriting any earlier copy)

project.save_data(
    data=df_toronto.to_csv(index=False),
    file_name='toronto.csv',
    overwrite=True,
)


{'file_name': 'toronto.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'clusteringneighborhoodstoronto-donotdelete-pr-z6zo4nyn9kmd3m',
 'asset_id': 'c392d950-25f5-4456-a4a1-b5e13a549f9c'}

In [8]:
# Load the saved CSV file back in as a pandas DataFrame

neigh_tor = pd.read_csv(filepath_or_buffer=project.get_file('toronto.csv'))

neigh_tor

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [77]:
# Dimensions of the reloaded table as (rows, columns)
neigh_tor.shape

(103, 3)

# Part Two: Get latitude and longitude coordinates of each neighborhood. 

In [47]:
# Try the pgeocode library on a single code before geocoding the whole table.
# 'M3A' serves as the sample postal code.

nomi = pgeocode.Nominatim(country='ca') # country Canada
data_canada = nomi.query_postal_code("M3A")

data_canada

postal_code                                                     M3A
country_code                                                     CA
place_name        North York (York Heights / Victoria Village / ...
state_name                                                  Ontario
state_code                                                       ON
county_name                                             North York 
county_code                                                     NaN
community_name                                                  NaN
community_code                                                  NaN
latitude                                                    43.7545
longitude                                                    -79.33
accuracy                                                          1
Name: 0, dtype: object

In [9]:
# Prepare an empty DataFrame that will receive the latitude and longitude of each neighborhood.

# Column layout of that table
column_names = [
    'PostalCode',
    'PlaceName',
    'Latitude',
    'Longitude',
]
lat_long_neig = pd.DataFrame(columns=column_names)

lat_long_neig

Unnamed: 0,PostalCode,PlaceName,Latitude,Longitude


In [10]:
# Second, look up every postal code and collect the results into the DataFrame

# A single geocoder is reused for all lookups instead of constructing one per iteration.
nomi = pgeocode.Nominatim('ca')

# Gather plain records and build the frame once at the end: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
geo_records = []
for postal in neigh_tor['Postal Code']:
    lat_long_CA = nomi.query_postal_code(postal)
    geo_records.append({'PostalCode': lat_long_CA['postal_code'],
                        'PlaceName': lat_long_CA['place_name'],
                        'Latitude': lat_long_CA['latitude'],
                        'Longitude': lat_long_CA['longitude']})

# Explicit column list keeps the layout stable even when there are no records.
lat_long_neig = pd.DataFrame(geo_records, columns=['PostalCode', 'PlaceName', 'Latitude', 'Longitude'])

lat_long_neig # We take a look of the data,
              # We can notice that there are few differences in the neighborhood names between the Wikipedia and pgeocode data sets
              # It seems that geocode is more up to date
    

Unnamed: 0,PostalCode,PlaceName,Latitude,Longitude
0,M3A,North York (York Heights / Victoria Village / ...,43.7545,-79.3300
1,M4A,North York (Sweeney Park / Wigmore Park),43.7276,-79.3148
2,M5A,Downtown Toronto (Regent Park / Port of Toronto),43.6555,-79.3626
3,M6A,North York (Lawrence Manor / Lawrence Heights),43.7223,-79.4504
4,M7A,Queen's Park Ontario Provincial Government,43.6641,-79.3889
...,...,...,...,...
98,M8X,Etobicoke (The Kingsway / Montgomery Road / Ol...,43.6518,-79.5076
99,M4Y,Downtown Toronto (Church and Wellesley),43.6656,-79.3830
100,M7Y,East Toronto Business Reply Mail Processing Ce...,43.7804,-79.2505
101,M8Y,Etobicoke (Old Mill South / King's Mill Park /...,43.6325,-79.4939


In [11]:
# We do an exploration of the data (EDA)
# include='all' asks describe() to summarise every column, not only the numeric ones

lat_long_neig.describe(include='all')

Unnamed: 0,PostalCode,PlaceName,Latitude,Longitude
count,103,102,102.0,102.0
unique,103,102,,
top,M9N,Scarborough (Steeles West / L'Amoreaux West),,
freq,1,1,,
mean,,,43.706716,-79.393987
std,,,0.053028,0.096185
min,,,43.6021,-79.5909
25%,,,43.6611,-79.45175
50%,,,43.70395,-79.3888
75%,,,43.7491,-79.33775


In [12]:
# As we can see, Place Name, Latitude, Longitude have missing values (102 instead of 103)

# Let's identify which rows contain the missing data.
# The axis is passed by keyword: pandas 2.x no longer accepts it positionally in any().
nan_row = lat_long_neig[lat_long_neig.isna().any(axis=1)]
nan_row


Unnamed: 0,PostalCode,PlaceName,Latitude,Longitude
76,M7R,,,


In [13]:
# Only one row (postal code M7R) is missing data, so we fill in its coordinates with values
# found by searching for the zip code on the internet. We only need Latitude and Longitude.

# fillna with a per-column mapping returns a new frame that is assigned back explicitly.
# An in-place fillna on a column selection is not guaranteed to update the parent frame,
# and it returns None, so the result is not stored in a throwaway variable any more.
lat_long_neig = lat_long_neig.fillna({'Latitude': 43.636966, 'Longitude': -79.615819})

lat_long_neig.describe(include='all')


Unnamed: 0,PostalCode,PlaceName,Latitude,Longitude
count,103,102,103.0,103.0
unique,103,102,,
top,M9N,Scarborough (Steeles West / L'Amoreaux West),,
freq,1,1,,
mean,,,43.706039,-79.396141
std,,,0.053213,0.098176
min,,,43.6021,-79.615819
25%,,,43.65885,-79.4577
50%,,,43.702,-79.3889
75%,,,43.7482,-79.3387


In [14]:
# Third, place the Latitude and Longitude columns next to the neighborhood table
# (column-wise concatenation, rows matched on the index)

lat_log_TO = pd.concat([neigh_tor, lat_long_neig[['Latitude', 'Longitude']]], axis=1)

lat_log_TO

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.3300
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.6518,-79.5076
99,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.3830
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.7804,-79.2505
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.6325,-79.4939


# Part Three: Explore and cluster the neighborhoods in Toronto. 

## Explore neighborhoods and venues in Toronto

In [15]:
# Look up the latitude and longitude of Toronto with the Nominatim geocoder

address = 'TORONTO, TO'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.65238435, -79.38356765.


In [16]:
# Draw Toronto with Folium and drop one circle marker per neighborhood

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

marker_fields = lat_log_TO[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    # popup text reads "<neighbourhood>, <borough>"
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='black',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

In [17]:
# Restrict the segmentation and clustering to the neighborhoods whose Borough is 'Downtown Toronto'

toronto_boro = lat_log_TO.loc[lat_log_TO['Borough'].eq('Downtown Toronto')].reset_index(drop=True)
toronto_boro.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
3,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756
4,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754
5,M5G,Downtown Toronto,Central Bay Street,43.6564,-79.386
6,M6G,Downtown Toronto,Christie,43.6683,-79.4205
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6496,-79.3833
8,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.623,-79.3936
9,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.6469,-79.3823


In [18]:
# Look up the geographical coordinates of Downtown Toronto (replaces the city-wide latitude/longitude)

address = 'Downtown Toronto, TO'

geolocator = Nominatim(user_agent="toro_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409.


In [19]:
# Plot the Downtown Toronto neighborhoods, one circle marker each

map_downtown = folium.Map(location=[latitude, longitude], zoom_start=11)

downtown_points = zip(toronto_boro['Latitude'], toronto_boro['Longitude'], toronto_boro['Neighbourhood'])
for lat, lng, label in downtown_points:
    # the neighbourhood name becomes the popup of its marker
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_downtown)

map_downtown

## Next, we use the Foursquare API to explore the neighborhoods and segment them. First, let's define the Foursquare credentials and version.

In [46]:
# The code was removed by Watson Studio for sharing.

In [21]:
# Let's create a function to get nearby venues in neighborhoods.

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood name and coordinates; they are consumed in parallel.
    radius : int, default 500
        Value sent as the ``radius`` query parameter of each request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when nothing is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT,
    which must be defined before the function is called.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting it by hand
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; a timeout keeps a stalled connection from hanging the notebook
        response = requests.get(endpoint, params=query, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category would otherwise raise IndexError
                categories[0]['name'] if categories else None))

    # Passing the columns here (rather than assigning them afterwards) also works for zero rows
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [22]:
# Apply getNearbyVenues to every Downtown Toronto neighborhood; the result is stored in Downtown_venues.

Downtown_venues = getNearbyVenues(
    toronto_boro['Neighbourhood'],
    toronto_boro['Latitude'],
    toronto_boro['Longitude'],
)


Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


In [23]:
# Inspect the collected venues: overall dimensions first, then the first 100 rows

print(Downtown_venues.shape)
Downtown_venues.iloc[:100]

(1161, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.6555,-79.3626,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.6555,-79.3626,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.6555,-79.3626,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
3,"Regent Park, Harbourfront",43.6555,-79.3626,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
4,"Regent Park, Harbourfront",43.6555,-79.3626,The Yoga Lounge,43.655515,-79.364955,Yoga Studio
...,...,...,...,...,...,...,...
95,"Garden District, Ryerson",43.6572,-79.3783,Ali Baba's - Yonge north of Dundas,43.657867,-79.381176,Middle Eastern Restaurant
96,"Garden District, Ryerson",43.6572,-79.3783,Kabul Express,43.656691,-79.376643,Middle Eastern Restaurant
97,"Garden District, Ryerson",43.6572,-79.3783,Magic Tailor,43.653742,-79.379745,Clothing Store
98,"Garden District, Ryerson",43.6572,-79.3783,SEPHORA,43.653527,-79.380154,Cosmetics Shop


In [24]:
# How many venues for each Neighborhood
# count() reports the number of non-null values per column within each neighborhood group

Downtown_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,94,94,94,94,94,94
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",57,57,57,57,57,57
Central Bay Street,57,57,57,57,57,57
Christie,11,11,11,11,11,11
Church and Wellesley,78,78,78,78,78,78
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",4,4,4,4,4,4
"Kensington Market, Chinatown, Grange Park",49,49,49,49,49,49


In [25]:
# Count the distinct venue categories found across all returned venues

print('There are {} uniques categories.'.format(Downtown_venues['Venue Category'].unique().size))

There are 178 uniques categories.


In [26]:
# Analyse each neighborhood.

# Step 1: one-hot encode the venue category, giving one indicator column per category
downtown_onehot = pd.get_dummies(Downtown_venues[['Venue Category']], prefix="", prefix_sep="")

# Step 2: attach the neighborhood of each venue (it lands in the last column)
downtown_onehot['Neighbourhood'] = Downtown_venues['Neighborhood']

# Step 3: reorder the columns so that the neighborhood comes first
fixed_columns = [downtown_onehot.columns[-1], *downtown_onehot.columns[:-1]]
downtown_onehot = downtown_onehot.loc[:, fixed_columns]

downtown_onehot.head()

Unnamed: 0,Neighbourhood,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,Bagel Shop,Bakery,...,Thrift / Vintage Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [27]:
# Checking size of the new dataframe as (rows, columns)

downtown_onehot.shape

(1161, 179)

In [28]:
# Collapse the one-hot rows to one row per neighborhood; the mean of each indicator
# column is the frequency of occurrence of that category within the neighborhood

downtown_grouped = downtown_onehot.groupby('Neighbourhood', as_index=False).mean()
downtown_grouped


Unnamed: 0,Neighbourhood,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,Bagel Shop,Bakery,...,Thrift / Vintage Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,Berczy Park,0.010638,0.021277,0.0,0.0,0.0,0.0,0.0,0.010638,0.042553,...,0.0,0.0,0.010638,0.0,0.0,0.0,0.0,0.0,0.0,0.010638
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.035088,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017544,0.017544
2,Central Bay Street,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017544,0.017544,0.0,0.017544,0.0,0.0,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012821,0.0,0.0,0.0,0.0,0.025641
5,"Commerce Court, Victoria Hotel",0.03,0.01,0.0,0.0,0.03,0.0,0.0,0.0,0.01,...,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0
6,"First Canadian Place, Underground city",0.03,0.01,0.0,0.0,0.03,0.0,0.0,0.0,0.01,...,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0
7,"Garden District, Ryerson",0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.0,0.0,0.0,0.01,0.01,0.0,0.01,0.0,0.0,0.0
8,"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Kensington Market, Chinatown, Grange Park",0.0,0.020408,0.0,0.040816,0.0,0.0,0.0,0.0,0.040816,...,0.0,0.0,0.061224,0.0,0.040816,0.0,0.020408,0.0,0.0,0.0


In [29]:
# Let's check the new size of the dataframe as (rows, columns)

downtown_grouped.shape

(19, 179)

In [30]:
# Quick look: print the 5 most frequent venue categories of every neighborhood.

num_top_venues = 5

for hood in downtown_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # the single row of this neighborhood, turned on its side: one line per column
    temp = downtown_grouped.loc[downtown_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # the first line holds the neighborhood name itself, so it is left out
    temp = temp.iloc[1:].astype({'freq': float}).round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.11
1               Hotel  0.05
2                Café  0.04
3  Seafood Restaurant  0.04
4              Bakery  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                venue  freq
0         Coffee Shop  0.07
1                Café  0.05
2  Italian Restaurant  0.05
3                 Bar  0.05
4              Bakery  0.04


----Central Bay Street----
                       venue  freq
0                Coffee Shop  0.23
1                       Café  0.05
2                Pizza Place  0.04
3            Bubble Tea Shop  0.04
4  Middle Eastern Restaurant  0.04


----Christie----
           venue  freq
0  Grocery Store  0.27
1           Café  0.27
2    Candy Store  0.09
3     Baby Store  0.09
4           Park  0.09


----Church and Wellesley----
                  venue  freq
0           Coffee Shop  0.06
1   Japanese Restaurant  0.06
2      Sus

In [31]:
# Get the 10 most common venues and put them into a pandas dataframe

# First, a helper that ranks the venue categories of one row in descending order

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped, the remaining entries are ordered from
    largest to smallest value, and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values


In [32]:
# Now let's create the new dataframe (top 10 venues for each neighborhood)

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
# Ranks 1-3 take their suffix from `indicators`; every later rank gets 'th'.
# The explicit bounds check replaces a bare `except:` that would also have
# swallowed unrelated errors.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe: rank the categories of every neighborhood row, then
# build the table in one step instead of filling an empty frame cell by cell
top_venue_rows = [
    return_most_common_venues(downtown_grouped.iloc[pos, :], num_top_venues)
    for pos in range(downtown_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns[1:], index=downtown_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', downtown_grouped['Neighbourhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Hotel,Café,Bakery,Seafood Restaurant,Cocktail Bar,Restaurant,Beer Bar,Japanese Restaurant,Deli / Bodega
1,"CN Tower, King and Spadina, Railway Lands, Har...",Coffee Shop,Café,Italian Restaurant,Bar,Grocery Store,Park,Speakeasy,Gym / Fitness Center,French Restaurant,Restaurant
2,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Pizza Place,Bubble Tea Shop,Sandwich Place,Middle Eastern Restaurant,Restaurant,Donut Shop,Plaza
3,Christie,Grocery Store,Café,Playground,Candy Store,Park,Baby Store,Coffee Shop,Electronics Store,Fish Market,Fast Food Restaurant
4,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Fast Food Restaurant,Restaurant,Gay Bar,Men's Store,Mediterranean Restaurant,Hotel,Grocery Store


## Cluster Neighborhoods 

In [33]:
# Run k-means to cluster the neighborhood into 5 clusters.

# set number of clusters
kclusters = 5

# Keep only the numeric frequency columns. `columns=` is used because the positional
# axis argument of DataFrame.drop is no longer accepted by pandas 2.x.
downtownto_grouped_clustering = downtown_grouped.drop(columns='Neighbourhood')

# run k-means clustering
# n_init is pinned to 10 (the long-standing default) so the result does not depend on
# which default the installed scikit-learn version uses; random_state fixes the seed.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(downtownto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]


array([0, 0, 4, 3, 0, 0, 0, 0, 1, 0], dtype=int32)

In [34]:
# Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# add clustering labels
# A label column left over from an earlier run is dropped first, so that re-running
# this cell does not fail because 'Cluster Labels' already exists.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge the labelled venue table into toronto_boro (which carries latitude/longitude),
# matching on the neighborhood name; join returns a new frame, toronto_boro is unchanged
downtownto_merged = toronto_boro.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

downtownto_merged.head() # check the last columns!


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626,4,Coffee Shop,Breakfast Spot,Yoga Studio,Beer Store,Restaurant,Electronics Store,Spa,Bakery,Thai Restaurant,Theater
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889,0,Sushi Restaurant,Escape Room,Burrito Place,Diner,Martial Arts School,Mexican Restaurant,Fast Food Restaurant,Coffee Shop,Ethiopian Restaurant,Gym
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783,0,Coffee Shop,Clothing Store,Japanese Restaurant,Café,Cosmetics Shop,Hotel,Middle Eastern Restaurant,Furniture / Home Store,Ramen Restaurant,Pizza Place
3,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756,0,Coffee Shop,Café,Seafood Restaurant,American Restaurant,Gastropub,Cocktail Bar,Italian Restaurant,Lingerie Store,Clothing Store,Cosmetics Shop
4,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754,0,Coffee Shop,Hotel,Café,Bakery,Seafood Restaurant,Cocktail Bar,Restaurant,Beer Bar,Japanese Restaurant,Deli / Bodega


In [35]:
# let's visualize the resulting clusters

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
# Rows that found no match in the join have no cluster label (NaN); they are skipped
# because a missing label cannot be mapped to a color.
clustered = downtownto_merged.dropna(subset=['Cluster Labels'])
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'],
                                  clustered['Neighbourhood'], clustered['Cluster Labels']):
    # the label column may be float (e.g. when other rows had NaN); list indexing needs an int
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],  # cluster 0 therefore takes the last color of the list
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Examine Clusters 

## Cluster 0 (Hotels, Restaurants, Food, Coffee, Gym, Bar, Pizza, Bakery, Park)

In [36]:
# Rows labelled cluster 0: Borough (column 1) plus the cluster label and ranked venue columns (column 5 onwards)
downtownto_merged[downtownto_merged['Cluster Labels'] == 0].iloc[:, [1] + list(range(5, downtownto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,0,Sushi Restaurant,Escape Room,Burrito Place,Diner,Martial Arts School,Mexican Restaurant,Fast Food Restaurant,Coffee Shop,Ethiopian Restaurant,Gym
2,Downtown Toronto,0,Coffee Shop,Clothing Store,Japanese Restaurant,Café,Cosmetics Shop,Hotel,Middle Eastern Restaurant,Furniture / Home Store,Ramen Restaurant,Pizza Place
3,Downtown Toronto,0,Coffee Shop,Café,Seafood Restaurant,American Restaurant,Gastropub,Cocktail Bar,Italian Restaurant,Lingerie Store,Clothing Store,Cosmetics Shop
4,Downtown Toronto,0,Coffee Shop,Hotel,Café,Bakery,Seafood Restaurant,Cocktail Bar,Restaurant,Beer Bar,Japanese Restaurant,Deli / Bodega
7,Downtown Toronto,0,Café,Coffee Shop,Gym,Restaurant,Hotel,American Restaurant,Salad Place,Sushi Restaurant,Thai Restaurant,Steakhouse
9,Downtown Toronto,0,Hotel,Coffee Shop,Café,Salad Place,Japanese Restaurant,Seafood Restaurant,American Restaurant,Asian Restaurant,Breakfast Spot,Concert Hall
10,Downtown Toronto,0,Coffee Shop,Hotel,Café,Restaurant,Gym,Japanese Restaurant,Steakhouse,Seafood Restaurant,Salad Place,American Restaurant
11,Downtown Toronto,0,Café,Bakery,Bookstore,Japanese Restaurant,Beer Bar,Dessert Shop,Noodle House,Moving Target,Comfort Food Restaurant,Pub
12,Downtown Toronto,0,Café,Vegetarian / Vegan Restaurant,Coffee Shop,Mexican Restaurant,Caribbean Restaurant,Bakery,Grocery Store,Farmers Market,Arts & Crafts Store,Vietnamese Restaurant
13,Downtown Toronto,0,Coffee Shop,Café,Italian Restaurant,Bar,Grocery Store,Park,Speakeasy,Gym / Fitness Center,French Restaurant,Restaurant


## Cluster 1 (Electronics, Park, Grocery)

In [38]:
# Rows labelled cluster 1: Borough (column 1) plus the cluster label and ranked venue columns (column 5 onwards)
downtownto_merged[downtownto_merged['Cluster Labels'] == 1].iloc[:, [1] + list(range(5, downtownto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Downtown Toronto,1,Café,Harbor / Marina,Park,Music Venue,Yoga Studio,Electronics Store,Fish Market,Fast Food Restaurant,Farmers Market,Event Space


## Cluster 2 (Playground, Market, Grocery)

In [40]:
# Rows labelled cluster 2: Borough (column 1) plus the cluster label and ranked venue columns (column 5 onwards)
downtownto_merged[downtownto_merged['Cluster Labels'] == 2].iloc[:, [1] + list(range(5, downtownto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,2,Playground,Candy Store,Park,Grocery Store,Eastern European Restaurant,Fish Market,Fast Food Restaurant,Farmers Market,Event Space,Ethiopian Restaurant


## Cluster 3 (Grocery, Park, Shop, Cafe)

In [41]:
# Rows labelled cluster 3: Borough (column 1) plus the cluster label and ranked venue columns (column 5 onwards)
downtownto_merged[downtownto_merged['Cluster Labels'] == 3].iloc[:, [1] + list(range(5, downtownto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Downtown Toronto,3,Grocery Store,Café,Playground,Candy Store,Park,Baby Store,Coffee Shop,Electronics Store,Fish Market,Fast Food Restaurant


## Cluster 4 (Coffee, Restaurants, Bar, Spa, Store, Gym, Hotel)

In [42]:
# Rows labelled cluster 4: Borough (column 1) plus the cluster label and ranked venue columns (column 5 onwards)
downtownto_merged[downtownto_merged['Cluster Labels'] == 4].iloc[:, [1] + list(range(5, downtownto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,4,Coffee Shop,Breakfast Spot,Yoga Studio,Beer Store,Restaurant,Electronics Store,Spa,Bakery,Thai Restaurant,Theater
5,Downtown Toronto,4,Coffee Shop,Café,Italian Restaurant,Pizza Place,Bubble Tea Shop,Sandwich Place,Middle Eastern Restaurant,Restaurant,Donut Shop,Plaza
15,Downtown Toronto,4,Coffee Shop,Restaurant,Gym,Hotel,Café,Sporting Goods Shop,Italian Restaurant,Deli / Bodega,Japanese Restaurant,Sports Bar


# Conclusion

## Cluster 0 (Hotels, Restaurants, Food, Coffee, Gym, Bar, Pizza, Bakery) has the greatest variety of venues, making it a good place to move to!