# Segmenting/Clustering Neighborhoods in Toronto - 3

## Exploring and clustering the neighborhoods in Toronto. 

### Import all necessary libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis

!pip install geocoder==1.5.0
from geopy.geocoders import Nominatim 

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip -q install folium
import folium # map rendering library



In [2]:
toronto_info = pd.read_csv('toronto_info') 
toronto_info = toronto_info.drop(columns=['Unnamed: 0'])
toronto_info.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.66179,-79.38939


### Map of Toronto with all the boroughs shown

In [3]:
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The coordinates of Toronto are 43.6534817, -79.3839347.


In [4]:
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_info['Latitude'], toronto_info['Longitude'], toronto_info['Borough'], toronto_info['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='red',
        fill=True,
        fill_color='#8FBC8F',
        fill_opacity=1,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

In [5]:
york_data = toronto_info[toronto_info['Borough'].str.contains("York")].reset_index(drop=True)
york_data

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M6A,North York,"Lawrence Manor , Lawrence Heights",43.723265,-79.451211
3,M3B,North York,Don Mills,43.7489,-79.35722
4,M4B,East York,"Parkview Hill , Woodbine Gardens",43.707193,-79.311529
5,M6B,North York,Glencairn,43.707279,-79.4475
6,M3C,North York,Don Mills,43.722143,-79.352023
7,M4C,East York,Woodbine Heights,43.68974,-79.308507
8,M6C,York,Humewood-Cedarvale,43.69173,-79.430013
9,M6E,York,Caledonia-Fairbanks,43.689118,-79.45065


### Map of York with all the listed Neighborhoods shown

In [6]:
address1 = 'York, Toronto'

geolocator1 = Nominatim(user_agent="york_explorer")
location1 = geolocator1.geocode(address1)
latitude1 = location1.latitude
longitude1 = location1.longitude
print('The geograpical coordinate of York, Toronto are {}, {}.'.format(latitude1, longitude1))

The geograpical coordinate of York, Toronto are 43.67910515, -79.49118414007154.


In [7]:
york_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(york_data['Latitude'], york_data['Longitude'], york_data['Borough'], york_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='black',
        fill=True,
        fill_color='#8FBC8F',
        fill_opacity=1,
        parse_html=False).add_to(york_map)  
    
york_map

### Foursquare credentials required to pull location info

In [8]:
CLIENT_ID = 'V35HKRZWAEPT1RHUM1O1P3VORCFKLQ4ER4HFYIHILEQ0KMIY' # your Foursquare ID
CLIENT_SECRET = '5BMUD2Z4FZTS3YVC2FWRKU4UQZQ42ZFWCKAPDIMT3ZXIM5YZ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Foursquare credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Foursquare credentials:
CLIENT_ID: V35HKRZWAEPT1RHUM1O1P3VORCFKLQ4ER4HFYIHILEQ0KMIY
CLIENT_SECRET:5BMUD2Z4FZTS3YVC2FWRKU4UQZQ42ZFWCKAPDIMT3ZXIM5YZ


### Now, let's get the top 70 venues that are in York within a radius of 1000 meters.

In [9]:
# Create the GET request URL. Name your URL url.
LIMIT = 70
radius = 1000

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude1, longitude1, VERSION, radius, LIMIT)
url


'https://api.foursquare.com/v2/venues/explore?client_id=V35HKRZWAEPT1RHUM1O1P3VORCFKLQ4ER4HFYIHILEQ0KMIY&client_secret=5BMUD2Z4FZTS3YVC2FWRKU4UQZQ42ZFWCKAPDIMT3ZXIM5YZ&ll=43.67910515,-79.49118414007154&v=20180605&radius=1000&limit=70'

In [10]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5eaa04db02a1725a9d82bc6e'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Mount Dennis',
  'headerFullLocation': 'Mount Dennis, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 15,
  'suggestedBounds': {'ne': {'lat': 43.68810515900001,
    'lng': -79.47876300026026},
   'sw': {'lat': 43.670105140999986, 'lng': -79.50360527988282}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bfd86544cf820a1bd98ecf4',
       'name': 'Smythe Park',
       'location': {'address': '61 Black Creek Blvd',
        'lat': 43.67621543710062,
        'lng': -79.49573972502156,
        'labeledLatLngs': [{'label': 'display',
          'lat'

In [11]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [12]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Smythe Park,Park,43.676215,-79.49574
1,Super Coffee,Coffee Shop,43.686861,-79.4896
2,Tim Hortons,Coffee Shop,43.677096,-79.495507
3,Pizza Pizza,Pizza Place,43.676602,-79.494358
4,Just 4 Fun Sporting Club,Hockey Arena,43.680908,-79.502735


In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

15 venues were returned by Foursquare.


In [15]:
york_venues = getNearbyVenues(names=york_data['Neighborhood'],
                                   latitudes=york_data['Latitude'],
                                   longitudes=york_data['Longitude']
                                  )

Parkwoods
Victoria Village
Lawrence Manor , Lawrence Heights
Don Mills
Parkview Hill , Woodbine Gardens
Glencairn
Don Mills
Woodbine Heights
Humewood-Cedarvale
Caledonia-Fairbanks
Leaside
Hillcrest Village
Bathurst Manor , Wilson Heights , Downsview North
Thorncliffe Park
Fairview , Henry Farm , Oriole
Northwood Park , York University
East Toronto
Bayview Village
Downsview
York Mills , Silver Hills
Downsview
North Park , Maple Leaf Park , Upwood Park
Humber Summit
Willowdale , Newtonbrook
Downsview
Bedford Park , Lawrence Manor East
Del Ray , Mount Dennis , Keelsdale and Silverthorn
Humberlea , Emery
Willowdale
Downsview
Runnymede , The Junction North
Weston
York Mills West
Willowdale


### Grouping venues by neighborhood

In [16]:
york_venues_grouped = york_venues.groupby('Neighborhood').count()
york_venues_grouped

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor , Wilson Heights , Downsview North",21,21,21,21,21,21
Bayview Village,2,2,2,2,2,2
"Bedford Park , Lawrence Manor East",21,21,21,21,21,21
Caledonia-Fairbanks,8,8,8,8,8,8
"Del Ray , Mount Dennis , Keelsdale and Silverthorn",2,2,2,2,2,2
Don Mills,9,9,9,9,9,9
Downsview,28,28,28,28,28,28
East Toronto,1,1,1,1,1,1
"Fairview , Henry Farm , Oriole",50,50,50,50,50,50
Glencairn,9,9,9,9,9,9


### Let's find out how many unique categories can be curated from all the returned venues  

In [17]:
print('There are {} uniques categories.'.format(len(york_venues['Venue Category'].unique())))

There are 132 uniques categories.


### Analyzing each Neighborhood by venue category

In [18]:
# one hot encoding
york_category = pd.get_dummies(york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
york_category['Neighborhood'] = york_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [york_category.columns[-1]] + list(york_category.columns[:-1])
york_category = york_category[fixed_columns]

york_category.head()

Unnamed: 0,Neighborhood,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Baby Store,Bagel Shop,Bakery,...,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Video Store,Vietnamese Restaurant,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Analysis of how frequently each category occurs

In [19]:
york_cat_freq = york_category.groupby('Neighborhood').mean().reset_index()
york_cat_freq

Unnamed: 0,Neighborhood,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Baby Store,Bagel Shop,Bakery,...,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Video Store,Vietnamese Restaurant,Women's Store,Yoga Studio
0,"Bathurst Manor , Wilson Heights , Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park , Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0
4,"Del Ray , Mount Dennis , Keelsdale and Silvert...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Don Mills,0.0,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0
6,Downsview,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035714,0.0,0.0
7,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Fairview , Henry Farm , Oriole",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.04,0.0,0.02,0.02,0.0,0.02,0.0,0.0,0.04,0.0
9,Glencairn,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Let's print each neighborhood along with the top 5 most common venues

In [20]:

num_top_venues = 5

for neighborhood in york_cat_freq['Neighborhood']:
    print("----"+neighborhood+"----")
    temp = york_cat_freq[york_cat_freq['Neighborhood'] == neighborhood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor , Wilson Heights , Downsview North----
            venue  freq
0     Coffee Shop  0.10
1            Bank  0.10
2  Ice Cream Shop  0.05
3     Bridal Shop  0.05
4           Diner  0.05


----Bayview Village----
                        venue  freq
0  Construction & Landscaping   0.5
1                       Trail   0.5
2                        Park   0.0
3          Mexican Restaurant   0.0
4   Middle Eastern Restaurant   0.0


----Bedford Park , Lawrence Manor East----
                venue  freq
0  Italian Restaurant  0.10
1         Coffee Shop  0.10
2      Sandwich Place  0.10
3     Thai Restaurant  0.05
4         Sports Club  0.05


----Caledonia-Fairbanks----
                venue  freq
0                Park  0.25
1                 Gym  0.12
2              Bakery  0.12
3  Mexican Restaurant  0.12
4          Beer Store  0.12


----Del Ray , Mount Dennis , Keelsdale and Silverthorn----
                        venue  freq
0  Construction & Landscaping   0.5
1           

### Function to filter the results by the most common venue

In [21]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Filtering by the Top 3 most common venues

In [22]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
york_venues_sorted = pd.DataFrame(columns=columns)
york_venues_sorted['Neighborhood'] = york_cat_freq['Neighborhood']

for ind in np.arange(york_cat_freq.shape[0]):
    york_venues_sorted.iloc[ind, 1:] = return_most_common_venues(york_cat_freq.iloc[ind, :], num_top_venues)

york_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,"Bathurst Manor , Wilson Heights , Downsview North",Coffee Shop,Bank,Supermarket
1,Bayview Village,Construction & Landscaping,Trail,Discount Store
2,"Bedford Park , Lawrence Manor East",Italian Restaurant,Coffee Shop,Sandwich Place
3,Caledonia-Fairbanks,Park,Women's Store,Sporting Goods Shop
4,"Del Ray , Mount Dennis , Keelsdale and Silvert...",Coffee Shop,Construction & Landscaping,Convenience Store


### CLustering the neighborhoods

In [23]:
# set number of clusters
kclusters = 5

york_grouped_clustering = york_cat_freq.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 4, 1, 1, 4, 1, 1, 3, 1, 1], dtype=int32)

### Assigning cluster labels

In [24]:
# add clustering labels
york_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

york_merged = york_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
york_merged = york_merged.join(york_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

york_merged # check the last columns!

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M3A,North York,Parkwoods,43.752935,-79.335641,1,Construction & Landscaping,Food & Drink Shop,Park
1,M4A,North York,Victoria Village,43.728102,-79.31189,1,Coffee Shop,Portuguese Restaurant,Pizza Place
2,M6A,North York,"Lawrence Manor , Lawrence Heights",43.723265,-79.451211,1,Clothing Store,Furniture / Home Store,Cosmetics Shop
3,M3B,North York,Don Mills,43.7489,-79.35722,1,Athletics & Sports,Spa,Restaurant
4,M4B,East York,"Parkview Hill , Woodbine Gardens",43.707193,-79.311529,1,Fast Food Restaurant,Pizza Place,Gym / Fitness Center
5,M6B,North York,Glencairn,43.707279,-79.4475,1,Pizza Place,Gas Station,Pub
6,M3C,North York,Don Mills,43.722143,-79.352023,1,Athletics & Sports,Spa,Restaurant
7,M4C,East York,Woodbine Heights,43.68974,-79.308507,1,Pharmacy,Gas Station,Breakfast Spot
8,M6C,York,Humewood-Cedarvale,43.69173,-79.430013,1,Park,Trail,Grocery Store
9,M6E,York,Caledonia-Fairbanks,43.689118,-79.45065,1,Park,Women's Store,Sporting Goods Shop


### Create map

In [25]:
york_map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighborhood'], york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(york_map_clusters)
       
york_map_clusters