# **Segmenting and Clustering Neighborhoods in Toronto**

**Downloading all needed dependencies:**

In [167]:
import numpy as np
import pandas as pd
!pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests
import urllib.request
from bs4 import SoupStrainer



**Parsing Html document with BeautifulSoup:**

In [168]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(URL)
html_content = r.text
only_table_tags = SoupStrainer("table")
soup = BeautifulSoup(html_content, "html.parser", parse_only=only_table_tags) # parses all tables from html document
soup = soup.find_all('table')[0] # parses only the first table
print(soup.prettify()) # prints the organized tree

<table class="wikitable sortable">
 <tbody>
  <tr>
   <th>
    Postal Code
   </th>
   <th>
    Borough
   </th>
   <th>
    Neighborhood
   </th>
  </tr>
  <tr>
   <td>
    M1A
   </td>
   <td>
    Not assigned
   </td>
   <td>
   </td>
  </tr>
  <tr>
   <td>
    M2A
   </td>
   <td>
    Not assigned
   </td>
   <td>
   </td>
  </tr>
  <tr>
   <td>
    M3A
   </td>
   <td>
    North York
   </td>
   <td>
    Parkwoods
   </td>
  </tr>
  <tr>
   <td>
    M4A
   </td>
   <td>
    North York
   </td>
   <td>
    Victoria Village
   </td>
  </tr>
  <tr>
   <td>
    M5A
   </td>
   <td>
    Downtown Toronto
   </td>
   <td>
    Regent Park, Harbourfront
   </td>
  </tr>
  <tr>
   <td>
    M6A
   </td>
   <td>
    North York
   </td>
   <td>
    Lawrence Manor, Lawrence Heights
   </td>
  </tr>
  <tr>
   <td>
    M7A
   </td>
   <td>
    Downtown Toronto
   </td>
   <td>
    Queen's Park, Ontario Provincial Government
   </td>
  </tr>
  <tr>
   <td>
    M8A
   </td>
   <td>
    Not assigned

**Creating a Pandas Dataframe:**

In [169]:
new_table = pd.DataFrame(columns=['Postal Code', 'Borough', 'Neighborhood'])
new_table

Unnamed: 0,Postal Code,Borough,Neighborhood


**Filling the dataframe with BeautifulSoup object's data:**

In [170]:
row_marker = 0
for row in soup.find_all('tr'): # loops through all soup object's table rows
    column_marker = 0 # identifies the column number
    columns = row.find_all('td') 
    listt = [0, 0, 0] # will be filled with a row's content and transformed to a new dataframe
    for column in columns: # loops through each row's column values
        listt[column_marker] = column.get_text().replace('\n','').strip() # fills the listt array with column values
        column_marker += 1
    df = pd.DataFrame([listt], columns=['Postal Code', 'Borough', 'Neighborhood']) # creates new dataframe containing one row
    new_table = new_table.append(df) # appends the new dataframe to the original
    row_marker += 1
new_table.reset_index(inplace = True, drop = True)
new_table.drop(new_table.index[0], inplace = True)
new_table.reset_index(inplace = True, drop = True)
new_table

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


**Removing rows which contain a "Not assigned" borough value:**

In [171]:
new_table = new_table[new_table['Borough'] != 'Not assigned']
new_table

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


**Making sure there aren't repeated postal codes in different rows:**

In [172]:
len(new_table['Postal Code'].unique())

103

**Making sure there aren't neighborhoods with "Not assigned" or "" values:**

In [173]:
new_table[new_table['Neighborhood']=='Not assigned']

Unnamed: 0,Postal Code,Borough,Neighborhood


In [174]:
new_table[new_table['Neighborhood']=='']

Unnamed: 0,Postal Code,Borough,Neighborhood


**The organized dataframe looks like:**

In [175]:
new_table.reset_index(inplace = True, drop = True)
new_table

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


# **Getting geographical coordinates of neighborhoods**

**Downloading the CSV file:**

In [176]:
!wget -q -O 'toronto_data.csv' https://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


**Reading and loading the data into a pandas dataframe:**

In [177]:
df_lat_lon = pd.read_csv('toronto_data.csv')
df_lat_lon

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


**Merging both dataframes:**

In [178]:
merged_df = pd.merge(new_table, df_lat_lon, how='inner', on=['Postal Code'])
merged_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# **Clustering North York's Neighborhoods**

First, I'm going to cluster North York's neighborhoods. Later, I'll replicate the process for all Toronto neighborhoods. 

**Creating the dataframe with only North York information:**

In [179]:
NY_df = merged_df[merged_df['Borough'] == 'North York'].reset_index(drop = True)
NY_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
3,M3B,North York,Don Mills,43.745906,-79.352188
4,M6B,North York,Glencairn,43.709577,-79.445073
5,M3C,North York,Don Mills,43.7259,-79.340923
6,M2H,North York,Hillcrest Village,43.803762,-79.363452
7,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259
8,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
9,M3J,North York,"Northwood Park, York University",43.76798,-79.487262


**Downloading needed dependencies:**

In [180]:
import json
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



**Getting geographical coordinates of North York:**

In [181]:
address = 'North York, Toronto'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North York are 43.7543263, -79.44911696639593.


**Visualizing North York Neighborhoods:**

In [182]:
# create map of North York using latitude and longitude values
map_NY = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(NY_df['Latitude'], NY_df['Longitude'], NY_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NY)  
    
map_NY

**Defining Foursquare credentials and version:**

In [183]:
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

**Defining function that creates a new dataframe with nearby venues data for each neighborhood:**

In [184]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

**Running the function and collecting venues and categories:**

In [185]:
NY_venues = getNearbyVenues(names=NY_df['Neighborhood'],
                                   latitudes=NY_df['Latitude'],
                                   longitudes=NY_df['Longitude']
                                  )
print(NY_venues.shape)
NY_venues.head()

(237, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Parkwoods,43.753259,-79.329656,GreenWin pool,43.756232,-79.333842,Pool
4,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


**Counting how many venues were returned for each neighborhood:**

In [186]:
gp_neighborhoods = NY_venues.groupby('Neighborhood').count()
gp_neighborhoods

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor, Wilson Heights, Downsview North",19,19,19,19,19,19
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",23,23,23,23,23,23
Don Mills,27,27,27,27,27,27
Downsview,16,16,16,16,16,16
"Fairview, Henry Farm, Oriole",64,64,64,64,64,64
Glencairn,4,4,4,4,4,4
Hillcrest Village,4,4,4,4,4,4
Humber Summit,1,1,1,1,1,1
"Humberlea, Emery",1,1,1,1,1,1


As there are some neighborhoods with a very low number of venues, I'm going to remove those which have less than four. Therefore, the clustering will be done with neighborhoods with more than four.

**Removing neighborhoods with less than four venues associated:**

In [187]:
row_marker = 0
NY_venues.set_index('Neighborhood', inplace = True)
for index in gp_neighborhoods.index.tolist():
    if gp_neighborhoods.iloc[row_marker, 1] < 4:
        NY_venues.drop(index = index, axis = 0, inplace = True)
    row_marker += 1
NY_venues.reset_index(inplace = True)
NY_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Parkwoods,43.753259,-79.329656,GreenWin pool,43.756232,-79.333842,Pool
4,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
...,...,...,...,...,...,...,...
230,"Willowdale, Willowdale West",43.782736,-79.442259,Shoppers Drug Mart,43.784847,-79.446028,Pharmacy
231,"Willowdale, Willowdale West",43.782736,-79.442259,RBC Royal Bank,43.783894,-79.446603,Bank
232,"Willowdale, Willowdale West",43.782736,-79.442259,Tim Hortons,43.780940,-79.444231,Coffee Shop
233,"Willowdale, Willowdale West",43.782736,-79.442259,Price Chopper,43.783237,-79.446339,Grocery Store


**Setting dummy variables for venue categories:**

In [188]:
# one hot encoding
NY_onehot = pd.get_dummies(NY_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
NY_onehot['Neighborhood'] = NY_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [NY_onehot.columns[-1]] + list(NY_onehot.columns[:-1])
NY_onehot = NY_onehot[fixed_columns]

print('The new dataframe contains {} rown and {} columns'.format(NY_onehot.shape[0], NY_onehot.shape[1]))
NY_onehot.head()

The new dataframe contains 235 rown and 99 columns


Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,...,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Grouping neighborhoods by venue category frequencies:**

In [189]:
NY_grouped = NY_onehot.groupby('Neighborhood').mean().reset_index()
NY_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,...,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Women's Store
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,0.0,...,0.052632,0.052632,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.086957,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.043478
3,Don Mills,0.0,0.0,0.0,0.0,0.074074,0.037037,0.0,0.0,0.0,...,0.037037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Downsview,0.0,0.0625,0.0,0.0,0.0,0.0625,0.0,0.0625,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Fairview, Henry Farm, Oriole",0.0,0.0,0.015625,0.0,0.015625,0.0,0.03125,0.03125,0.015625,...,0.0,0.0,0.015625,0.0,0.015625,0.015625,0.015625,0.0,0.0,0.015625
6,Glencairn,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Lawrence Manor, Lawrence Heights",0.090909,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.090909
9,"North Park, Maple Leaf Park, Upwood Park",0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Defining a function that sorts the venues in descending order:**

In [190]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

**Creating the new dataframe and displaying the top 4 venues for each neighborhood:**

In [191]:
num_top_venues = 4

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = NY_grouped['Neighborhood']

for ind in np.arange(NY_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(NY_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Gas Station,Shopping Mall
1,Bayview Village,Japanese Restaurant,Chinese Restaurant,Café,Bank
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sushi Restaurant,Coffee Shop,Sandwich Place
3,Don Mills,Gym,Beer Store,Japanese Restaurant,Coffee Shop
4,Downsview,Grocery Store,Park,Airport,Construction & Landscaping


**Run *k*-means to cluster the neighborhood into 3 clusters:**

In [192]:
# set number of clusters
kclusters = 3

NY_grouped_clustering = NY_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, init = "k-means++", n_init = 20).fit(NY_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 0, 0, 1, 0, 1, 0, 0, 1], dtype=int32)

**Creating a new dataframe that includes the cluster as well as the top 4 venues for each neighborhood:**

In [193]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

NY_merged = NY_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
NY_merged = NY_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood', how = 'inner')

NY_merged.reset_index(drop = True, inplace = True)
NY_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1,Park,Food & Drink Shop,Pool,Bus Stop
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,French Restaurant,Financial or Legal Service,Portuguese Restaurant,Hockey Arena
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,Women's Store,Miscellaneous Shop,Arts & Crafts Store,Boutique
3,M3B,North York,Don Mills,43.745906,-79.352188,0,Gym,Beer Store,Japanese Restaurant,Coffee Shop
4,M3C,North York,Don Mills,43.7259,-79.340923,0,Gym,Beer Store,Japanese Restaurant,Coffee Shop
5,M6B,North York,Glencairn,43.709577,-79.445073,1,Park,Asian Restaurant,Pub,Japanese Restaurant
6,M2H,North York,Hillcrest Village,43.803762,-79.363452,0,Golf Course,Mediterranean Restaurant,Pool,Dog Run
7,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259,0,Coffee Shop,Bank,Gas Station,Shopping Mall
8,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,0,Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant
9,M3J,North York,"Northwood Park, York University",43.76798,-79.487262,0,Coffee Shop,Furniture / Home Store,Massage Studio,Caribbean Restaurant


**Visualizing the resulting clusters:**

In [194]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(NY_merged['Latitude'], NY_merged['Longitude'], NY_merged['Neighborhood'], NY_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

**Examining cluster 1:**

In [195]:
NY_merged.loc[NY_merged['Cluster Labels'] == 0, NY_merged.columns[[2] + list(range(5, NY_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
1,Victoria Village,0,French Restaurant,Financial or Legal Service,Portuguese Restaurant,Hockey Arena
2,"Lawrence Manor, Lawrence Heights",0,Women's Store,Miscellaneous Shop,Arts & Crafts Store,Boutique
3,Don Mills,0,Gym,Beer Store,Japanese Restaurant,Coffee Shop
4,Don Mills,0,Gym,Beer Store,Japanese Restaurant,Coffee Shop
6,Hillcrest Village,0,Golf Course,Mediterranean Restaurant,Pool,Dog Run
7,"Bathurst Manor, Wilson Heights, Downsview North",0,Coffee Shop,Bank,Gas Station,Shopping Mall
8,"Fairview, Henry Farm, Oriole",0,Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant
9,"Northwood Park, York University",0,Coffee Shop,Furniture / Home Store,Massage Studio,Caribbean Restaurant
16,"Bedford Park, Lawrence Manor East",0,Italian Restaurant,Sushi Restaurant,Coffee Shop,Sandwich Place
17,"Willowdale, Willowdale East",0,Ramen Restaurant,Pizza Place,Sushi Restaurant,Coffee Shop


**Examining cluster 2:**

In [196]:
NY_merged.loc[NY_merged['Cluster Labels'] == 1, NY_merged.columns[[2] + list(range(5, NY_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
0,Parkwoods,1,Park,Food & Drink Shop,Pool,Bus Stop
5,Glencairn,1,Park,Asian Restaurant,Pub,Japanese Restaurant
11,Downsview,1,Grocery Store,Park,Airport,Construction & Landscaping
12,Downsview,1,Grocery Store,Park,Airport,Construction & Landscaping
13,Downsview,1,Grocery Store,Park,Airport,Construction & Landscaping
14,Downsview,1,Grocery Store,Park,Airport,Construction & Landscaping
15,"North Park, Maple Leaf Park, Upwood Park",1,Park,Massage Studio,Construction & Landscaping,Bakery
18,York Mills West,1,Park,Convenience Store,Bank,Women's Store


**Examining cluster 3:**

In [197]:
NY_merged.loc[NY_merged['Cluster Labels'] == 2, NY_merged.columns[[2] + list(range(5, NY_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
10,Bayview Village,2,Japanese Restaurant,Chinese Restaurant,Café,Bank


# **Clustering Toronto Neighborhoods**

**Getting geographical coordinates of North York:**

In [198]:
address = 'Toronto'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


**Visualizing Toronto Neighborhoods:**

In [199]:
# create map of North York using latitude and longitude values
map_TR = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, label in zip(merged_df['Latitude'], merged_df['Longitude'], merged_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_TR)  
    
map_TR

**Running the function and collecting venues and categories:**

In [200]:
TR_venues = getNearbyVenues(names=merged_df['Neighborhood'],
                                   latitudes=merged_df['Latitude'],
                                   longitudes=merged_df['Longitude']
                                  )
print(TR_venues.shape)
TR_venues.head()

(2112, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Parkwoods,43.753259,-79.329656,GreenWin pool,43.756232,-79.333842,Pool
4,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


**Counting how many venues were returned for each neighborhood:**

In [201]:
gp_neighborhoods = TR_venues.groupby('Neighborhood').count()
gp_neighborhoods

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",10,10,10,10,10,10
"Bathurst Manor, Wilson Heights, Downsview North",19,19,19,19,19,19
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",23,23,23,23,23,23
...,...,...,...,...,...,...
"Willowdale, Willowdale East",33,33,33,33,33,33
"Willowdale, Willowdale West",6,6,6,6,6,6
Woburn,4,4,4,4,4,4
Woodbine Heights,8,8,8,8,8,8


As there are some neighborhoods with a very low number of venues, I'm going to remove those which have less than four. Therefore, the clustering will be done with neighborhoods with more than four.

**Removing neighborhoods with less than four venues associated:**

In [202]:
row_marker = 0
TR_venues.set_index('Neighborhood', inplace = True)
for index in gp_neighborhoods.index.tolist():
    if gp_neighborhoods.iloc[row_marker, 1] < 4:
        TR_venues.drop(index = index, axis = 0, inplace = True)
    row_marker += 1
TR_venues.reset_index(inplace = True)
TR_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Parkwoods,43.753259,-79.329656,GreenWin pool,43.756232,-79.333842,Pool
4,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
...,...,...,...,...,...,...,...
2073,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Jim & Maria's No Frills,43.631152,-79.518617,Grocery Store
2074,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Once Upon A Child,43.631075,-79.518290,Kids Store
2075,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Value Village,43.631269,-79.518238,Thrift / Vintage Store
2076,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Kingsway Boxing Club,43.627254,-79.526684,Gym


**Setting dummy variables for venue categories:**

In [203]:
# one hot encoding
TR_onehot = pd.get_dummies(TR_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
TR_onehot['Neighborhood'] = TR_venues['Neighborhood'] 

# move neighborhood column to the first column
venue_columns = list(TR_onehot.columns)
venue_columns.remove('Neighborhood')
fixed_columns = ['Neighborhood'] + venue_columns
TR_onehot = TR_onehot[fixed_columns]

print('The new dataframe contains {} rown and {} columns'.format(TR_onehot.shape[0], TR_onehot.shape[1]))
TR_onehot.head()

The new dataframe contains 2078 rown and 265 columns


Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Grouping neighborhoods by venue category frequencies:**

In [204]:
TR_grouped = TR_onehot.groupby('Neighborhood').mean().reset_index()
TR_grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.052632,0.000000,0.0,0.0,0.0,0.000000,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.043478,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,"Willowdale, Willowdale East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.030303,0.0,0.0,0.0,0.000000,0.0
74,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
75,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
76,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.125000,0.000000,0.0,0.0,0.0,0.000000,0.0


**Creating the new dataframe and displaying the top 4 venues for each neighborhood:**

In [205]:
num_top_venues = 4

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = TR_grouped['Neighborhood']

for ind in np.arange(TR_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(TR_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
0,Agincourt,Latin American Restaurant,Lounge,Skating Rink,Breakfast Spot
1,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Dance Studio
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Pharmacy,Ice Cream Shop
3,Bayview Village,Café,Japanese Restaurant,Bank,Chinese Restaurant
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sushi Restaurant,Coffee Shop,Sandwich Place


**Run *k*-means to cluster the neighborhood into 5 clusters:**

In [206]:
# set number of clusters
kclusters = 5

TR_grouped_clustering = TR_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, init = "k-means++", n_init = 100).fit(TR_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20] 

array([4, 3, 3, 2, 3, 3, 4, 4, 3, 3, 3, 4, 3, 4, 3, 3, 3, 3, 0, 3],
      dtype=int32)

**Creating a new dataframe that includes the cluster as well as the top 4 venues for each neighborhood:**

In [207]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

TR_merged = merged_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
TR_merged = TR_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood', how = 'inner')

TR_merged.reset_index(drop = True, inplace = True)
TR_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Park,Pool,Bus Stop,Food & Drink Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,3,Portuguese Restaurant,Hockey Arena,Coffee Shop,Intersection
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,3,Coffee Shop,Bakery,Park,Pub
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,3,Accessories Store,Arts & Crafts Store,Furniture / Home Store,Event Space
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,3,Coffee Shop,Yoga Studio,Diner,Park
...,...,...,...,...,...,...,...,...,...,...
77,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,3,Café,Coffee Shop,Restaurant,Italian Restaurant
78,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.382280,3,Coffee Shop,Café,Gym,Restaurant
79,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,3,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant
80,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,3,Yoga Studio,Pizza Place,Light Rail Station,Smoke Shop


**Visualizing the resulting clusters:**

In [208]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(TR_merged['Latitude'], TR_merged['Longitude'], TR_merged['Neighborhood'], TR_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

**Examining cluster 1:**

In [209]:
TR_merged.loc[TR_merged['Cluster Labels'] == 0, TR_merged.columns[[2] + list(range(5, TR_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
0,Parkwoods,0,Park,Pool,Bus Stop,Food & Drink Shop
9,Glencairn,0,Park,Pub,Asian Restaurant,Japanese Restaurant
42,"North Park, Maple Leaf Park, Upwood Park",0,Park,Massage Studio,Bakery,Construction & Landscaping
48,Lawrence Park,0,Park,Swim School,Bus Line,Dim Sum Restaurant
51,York Mills West,0,Park,Convenience Store,Bank,Dessert Shop
52,Davisville North,0,Gym,Hotel,Department Store,Sandwich Place
53,"Forest Hill North & West, Forest Hill Road Park",0,Park,Trail,Bus Line,Sushi Restaurant
62,"Kingsview Village, St. Phillips, Martin Grove ...",0,Park,Pizza Place,Sandwich Place,Mobile Phone Shop
74,Rosedale,0,Park,Trail,Playground,Curling Ice


**Examining cluster 2:**

In [210]:
TR_merged.loc[TR_merged['Cluster Labels'] == 1, TR_merged.columns[[2] + list(range(5, TR_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
16,Woburn,1,Coffee Shop,Indian Restaurant,Korean Restaurant,Distribution Center


**Examining cluster 3:**

In [211]:
TR_merged.loc[TR_merged['Cluster Labels'] == 2, TR_merged.columns[[2] + list(range(5, TR_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
31,Bayview Village,2,Café,Japanese Restaurant,Bank,Chinese Restaurant


**Examining cluster 4:**

In [212]:
TR_merged.loc[TR_merged['Cluster Labels'] == 3, TR_merged.columns[[2] + list(range(5, TR_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
1,Victoria Village,3,Portuguese Restaurant,Hockey Arena,Coffee Shop,Intersection
2,"Regent Park, Harbourfront",3,Coffee Shop,Bakery,Park,Pub
3,"Lawrence Manor, Lawrence Heights",3,Accessories Store,Arts & Crafts Store,Furniture / Home Store,Event Space
4,"Queen's Park, Ontario Provincial Government",3,Coffee Shop,Yoga Studio,Diner,Park
5,Don Mills,3,Gym,Asian Restaurant,Japanese Restaurant,Restaurant
6,Don Mills,3,Gym,Asian Restaurant,Japanese Restaurant,Restaurant
7,"Parkview Hill, Woodbine Gardens",3,Pizza Place,Breakfast Spot,Pharmacy,Intersection
8,"Garden District, Ryerson",3,Clothing Store,Coffee Shop,Japanese Restaurant,Cosmetics Shop
11,St. James Town,3,Coffee Shop,Café,Cocktail Bar,Gastropub
12,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",3,Park,Coffee Shop,Cosmetics Shop,Beer Store


**Examining cluster 5:**

In [213]:
TR_merged.loc[TR_merged['Cluster Labels'] == 4, TR_merged.columns[[2] + list(range(5, TR_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue
10,Woodbine Heights,4,Skating Rink,Park,Beer Store,Pharmacy
13,"Guildwood, Morningside, West Hill",4,Mexican Restaurant,Medical Center,Electronics Store,Breakfast Spot
14,The Beaches,4,Trail,Pub,Health Food Store,Yoga Studio
19,Christie,4,Grocery Store,Café,Park,Baby Store
20,Cedarbrae,4,Gas Station,Thai Restaurant,Fried Chicken Joint,Bank
25,"Dufferin, Dovercourt Village",4,Pharmacy,Bakery,Park,Bank
38,"Brockton, Parkdale Village, Exhibition Place",4,Café,Breakfast Spot,Nightclub,Coffee Shop
39,"Golden Mile, Clairlea, Oakridge",4,Bus Line,Bakery,Park,Ice Cream Shop
46,"Birch Cliff, Cliffside West",4,Café,College Stadium,Skating Rink,General Entertainment
49,"Runnymede, The Junction North",4,Caribbean Restaurant,Pizza Place,Breakfast Spot,Brewery
