## Segmenting and Clustering Neighborhoods in Toronto
#### Scrape the Wikipedia page to explore, segment, and cluster the neighborhoods

In [1]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
from IPython.display import Image 
from IPython.core.display import HTML 
from bs4 import BeautifulSoup
from urllib.request import urlopen

### 1. Scraping data from a website

In [2]:
#using panda to read the HTML data from Wikipedia
table=pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [3]:
#creating a table
#selecting the first table from the HTML file 
data= pd.DataFrame(table[0])
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### 2. Data cleaning and structuring

In [4]:
#filtering the Borough feature and dropping each row that contains 'Not assigned'
indexname=data[data['Borough']== 'Not assigned'].index
data.drop(indexname, inplace=True)
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
type(data)

pandas.core.frame.DataFrame

In [6]:
#grouping the Neighbourhood by postal code feature
data.groupby(['Postal Code'],as_index=False)
data.reset_index(drop=True, inplace=True)
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
#checks if nan value exist in the dataframe
data.isnull().values.any()

False

In [8]:
data.shape

(103, 3)

### 3. Data Exploration

In [9]:
# transform list-likes cells in Neighbourhood column to individual rows
data1=data.assign(Neighbourhood=data.Neighbourhood.str.split(',')).explode('Neighbourhood')
data1['Neighbourhood'] #check number of Neighbourhoods

0                      Parkwoods
1               Victoria Village
2                    Regent Park
2                   Harbourfront
3                 Lawrence Manor
                 ...            
102                    Mimico NW
102           The Queensway West
102               South of Bloor
102     Kingsway Park South West
102        Royal York South West
Name: Neighbourhood, Length: 217, dtype: object

In [10]:
# check how many neighborhoods each Borough has 
data1.groupby('Borough').count()

Unnamed: 0_level_0,Postal Code,Neighbourhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,18,18
Downtown Toronto,39,39
East Toronto,8,8
East York,7,7
Etobicoke,47,47
Mississauga,1,1
North York,38,38
Scarborough,38,38
West Toronto,13,13
York,8,8


### 4. Adding Features

In [11]:
from geopy.geocoders import Nominatim 
from sklearn.cluster import KMeans
import folium

In [12]:
data1['address']=data1['Neighbourhood']+','+ 'Toronto, Canada'

In [13]:
from geopy.extra.rate_limiter import RateLimiter
# create locator that holds the Geocoding service, Nominatim
locator = Nominatim(user_agent='Toronto_explorer')
# conveneint function to delay between geocoding calls: RateLimiter
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
# create location column
data1['location']=data1['address'].apply(geocode)
# create longitude, latitude and altitude from location column (returns tuple)
data1['point']=data1['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# split point column into latitude, longitude and altitude columns
data1[['Latitude', 'Longitude', 'altitude']]= pd.DataFrame(data1['point'].tolist(), index=data1.index)
data1=data1.drop(['location', 'point', 'altitude', 'address'], axis=1)

In [14]:
data1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7588,-79.320197
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457
2,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015
3,M6A,North York,Lawrence Manor,43.722079,-79.437507


In [15]:
data1.isnull().values.any()

True

In [16]:
data1=data1.dropna(subset=['Latitude', 'Longitude'], axis=0)
data1.isnull().values.any()

False

In [17]:
data1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7588,-79.320197
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457
2,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015
3,M6A,North York,Lawrence Manor,43.722079,-79.437507


In [18]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [19]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude,longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, Neighbourhood in zip(data1['Latitude'], data1['Longitude'], data1['Borough'], data1['Neighbourhood']):
    label = '{}, {}'.format(Neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### 5. Extracting data from Foursquare using its API
#### Define Foursquare Credentials and Version

In [20]:
CLIENT_ID = 'FM24JFR1QZWPVUMV5SSJM1VQEVWUNCWHMWSKRFATRNGXEPVA' # your Foursquare ID
CLIENT_SECRET = 'CAGCDDF02NDTJI4VFJJQ0O3P0G3L4W3LG10DCTK4J3BR2LSO' # your Foursquare Secret
VERSION = '20200805' # Foursquare API version

In [21]:
# Get the neighborhood's latitude and longitude values
address = "Queen's Park, Downtown Toronto"
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print("The geograpical coordinate of Queen's Park are {}, {}.".format(latitude, longitude))

The geograpical coordinate of Queen's Park are 43.663217, -79.38629.


In [22]:
# Creating request URL to get the venues near Queen's Park
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=FM24JFR1QZWPVUMV5SSJM1VQEVWUNCWHMWSKRFATRNGXEPVA&client_secret=CAGCDDF02NDTJI4VFJJQ0O3P0G3L4W3LG10DCTK4J3BR2LSO&v=20200805&ll=43.663217,-79.38629&radius=500&limit=100'

In [23]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f2ad477af4413115c618957'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Bay Street Corridor',
  'headerFullLocation': 'Bay Street Corridor, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 37,
  'suggestedBounds': {'ne': {'lat': 43.667717004500005,
    'lng': -79.38008107398377},
   'sw': {'lat': 43.6587169955, 'lng': -79.39249892601623}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '594ff53c2be42528bcc1bdb7',
       'name': 'T-Swirl Crepe',
       'location': {'address': '510 Yonge Street',
        'lat': 43.663452,
        'lng': -79.384125,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.66

In [24]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [25]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  nearby_venues = json_normalize(venues) # flatten JSON


Unnamed: 0,name,categories,lat,lng
0,T-Swirl Crepe,Creperie,43.663452,-79.384125
1,Curry's Art Store Ltd.,Arts & Crafts Store,43.662838,-79.383732
2,Nando's,Portuguese Restaurant,43.661728,-79.386391
3,Burrito Bandidos,Burrito Place,43.662962,-79.383956
4,Bar Volo,Beer Bar,43.665462,-79.385692


In [26]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

37 venues were returned by Foursquare.


### 7. Explore Neighborhoods in Toronto

In [27]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [28]:
toronto_venues = getNearbyVenues(names=data1['Neighbourhood'],
                                   latitudes=data1['Latitude'],
                                   longitudes=data1['Longitude']
                                  )


Parkwoods
Victoria Village
Regent Park
 Harbourfront
Lawrence Manor
 Lawrence Heights
Queen's Park
Islington Avenue
 Humber Valley Village
Malvern
 Rouge
Don Mills
 Woodbine Gardens
Garden District
 Ryerson
Glencairn
West Deane Park
 Princess Gardens
 Islington
 Cloverdale
Rouge Hill
 Port Union
 Highland Creek
Don Mills
Woodbine Heights
St. James Town
Eringate
 Bloordale Gardens
 Old Burnhamthorpe
 Markland Wood
Guildwood
 Morningside
 West Hill
The Beaches
Berczy Park
Woburn
Leaside
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor
 Wilson Heights
 Downsview North
Thorncliffe Park
Richmond
 Adelaide
 King
Dufferin
 Dovercourt Village
Scarborough Village
Fairview
 Henry Farm
 Oriole
Northwood Park
 York University
East Toronto
 Broadview North (Old East York)
Harbourfront East
 Union Station
 Toronto Islands
Little Portugal
 Trinity
Kennedy Park
 Ionview
 East Birchmount Park
Bayview Village
Downsview
The Danforth West
 Riverdale
Toronto Dominion Centre
 Design Exchange
Brockton
 Pa

In [29]:
print(toronto_venues.shape)
toronto_venues.head()

(5856, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7588,-79.320197,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.7588,-79.320197,LCBO,43.757774,-79.314257,Liquor Store
2,Parkwoods,43.7588,-79.320197,Shoppers Drug Mart,43.760857,-79.324961,Pharmacy
3,Parkwoods,43.7588,-79.320197,Petro-Canada,43.75795,-79.315187,Gas Station
4,Parkwoods,43.7588,-79.320197,TD Canada Trust,43.757569,-79.314976,Bank


In [30]:
#check how many venues were returned for each neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Agincourt North,25,25,25,25,25,25
Bathurst Quay,24,24,24,24,24,24
Bloordale Gardens,8,8,8,8,8,8
Broadview North (Old East York),8,8,8,8,8,8
...,...,...,...,...,...,...
Willowdale,147,147,147,147,147,147
Woburn,22,22,22,22,22,22
Woodbine Heights,8,8,8,8,8,8
York Mills,16,16,16,16,16,16


In [31]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Service,American Restaurant,Animal Shelter,Antique Shop,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
toronto_onehot.shape

(5856, 329)

In [33]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped


Unnamed: 0,Neighborhood,Yoga Studio,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Service,American Restaurant,Animal Shelter,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,Adelaide,0.000000,0.000,0.0,0.0,0.0,0.000000,0.000000,0.03,0.0,...,0.0,0.01,0.0,0.0,0.000000,0.0,0.0,0.01,0.0,0.0
1,Agincourt North,0.000000,0.000,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,...,0.0,0.00,0.0,0.0,0.040000,0.0,0.0,0.00,0.0,0.0
2,Bathurst Quay,0.000000,0.000,0.0,0.0,0.0,0.041667,0.041667,0.00,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.0
3,Bloordale Gardens,0.000000,0.000,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.0
4,Broadview North (Old East York),0.000000,0.000,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,Willowdale,0.020408,0.000,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.0
183,Woburn,0.000000,0.000,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,...,0.0,0.00,0.0,0.0,0.045455,0.0,0.0,0.00,0.0,0.0
184,Woodbine Heights,0.000000,0.125,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.0
185,York Mills,0.000000,0.000,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,...,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.00,0.0,0.0


In [34]:
toronto_grouped.shape

(187, 329)

In [35]:
# print each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Adelaide----
            venue  freq
0     Coffee Shop  0.07
1            Café  0.06
2             Gym  0.05
3      Restaurant  0.05
4  Cosmetics Shop  0.04


---- Agincourt North----
                 venue  freq
0                 Bank  0.08
1   Chinese Restaurant  0.08
2             Pharmacy  0.04
3           Beer Store  0.04
4  Sporting Goods Shop  0.04


---- Bathurst Quay----
                  venue  freq
0           Coffee Shop  0.17
1                  Café  0.12
2                  Park  0.08
3                Tunnel  0.04
4  Caribbean Restaurant  0.04


---- Bloordale Gardens----
               venue  freq
0  Convenience Store  0.25
1               Bank  0.12
2     Sandwich Place  0.12
3       Intersection  0.12
4         Print Shop  0.12


---- Broadview North (Old East York)----
                  venue  freq
0        Ice Cream Shop  0.12
1          Intersection  0.12
2         Grocery Store  0.12
3        Discount Store  0.12
4  Other Great Outdoors  0.12


---- Cabbagetown

                   venue  freq
0  Vietnamese Restaurant  0.19
1                 Bakery  0.09
2     Chinese Restaurant  0.09
3            Coffee Shop  0.06
4     Light Rail Station  0.06


---- Roncesvalles----
            venue  freq
0            Café  0.08
1       Gift Shop  0.05
2       Bookstore  0.05
3    Gourmet Shop  0.05
4  Breakfast Spot  0.05


---- Rouge----
                     venue  freq
0                     Park   0.5
1     Fast Food Restaurant   0.5
2              Yoga Studio   0.0
3             Night Market   0.0
4  New American Restaurant   0.0


---- Royal York South East----
                venue  freq
0         Coffee Shop  0.09
1      Breakfast Spot  0.06
2        Dessert Shop  0.06
3                 Pub  0.06
4  Italian Restaurant  0.06


---- Royal York South West----
                venue  freq
0         Coffee Shop  0.09
1      Breakfast Spot  0.06
2        Dessert Shop  0.06
3                 Pub  0.06
4  Italian Restaurant  0.06


---- Ryerson----
          

                venue  freq
0  Italian Restaurant  0.09
1         Coffee Shop  0.07
2                Café  0.04
3  Mexican Restaurant  0.04
4          Restaurant  0.04


----Humber Summit----
                        venue  freq
0  Construction & Landscaping  0.25
1                        Park  0.25
2                      Bakery  0.25
3                         Gym  0.25
4                    Pie Shop  0.00


----Humberlea----
                venue  freq
0         Gas Station  0.25
1   Convenience Store  0.25
2      Baseball Field  0.25
3  Italian Restaurant  0.25
4       Moving Target  0.00


----India Bazaar----
               venue  freq
0  Indian Restaurant  0.23
1      Grocery Store  0.09
2               Café  0.09
3            Brewery  0.03
4                Gym  0.03


----Islington Avenue----
                venue  freq
0        Food Service  0.25
1       Smoothie Shop  0.25
2         Coffee Shop  0.25
3      Sandwich Place  0.25
4  Miscellaneous Shop  0.00


----Kennedy Park----
 

In [36]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [37]:
# create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] =toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Gym,Restaurant,Cosmetics Shop,Clothing Store,Seafood Restaurant,Hotel,Gastropub,Japanese Restaurant
1,Agincourt North,Bank,Chinese Restaurant,Liquor Store,Fast Food Restaurant,Juice Bar,Beer Store,Frozen Yogurt Shop,Fried Chicken Joint,Sporting Goods Shop,Spa
2,Bathurst Quay,Coffee Shop,Café,Park,Sculpture Garden,Caribbean Restaurant,Ramen Restaurant,Bank,Garden,Diner,Grocery Store
3,Bloordale Gardens,Convenience Store,Intersection,Sandwich Place,Donut Shop,Coffee Shop,Bank,Print Shop,Ethiopian Restaurant,Doner Restaurant,Dumpling Restaurant
4,Broadview North (Old East York),Theater,Park,Discount Store,Grocery Store,Other Great Outdoors,Bus Line,Intersection,Ice Cream Shop,Farmers Market,Eastern European Restaurant


## Cluster Neighborhoods


In [38]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [39]:
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,
       0, 0, 3, 0, 3, 3, 3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 3, 3, 0,
       0, 4, 4, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 3, 0, 0, 0, 0, 0, 2, 3, 0, 3, 0, 0, 0, 4, 3, 0, 0, 0, 0, 2,
       3, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 3, 3, 0, 0, 0, 0, 3, 0, 0], dtype=int32)

In [40]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = data1

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.7588,-79.320197,0.0,Caribbean Restaurant,Liquor Store,Gas Station,Discount Store,Chinese Restaurant,Laundry Service,Coffee Shop,Shopping Mall,Bank,Pharmacy
1,M4A,North York,Victoria Village,43.732658,-79.311189,3.0,Thai Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Park,Bus Line,Ethiopian Restaurant,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
2,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457,0.0,Coffee Shop,Thai Restaurant,Performing Arts Venue,Pool,Electronics Store,Food Truck,Beer Store,Sushi Restaurant,Restaurant,Pub
2,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015,0.0,Coffee Shop,Café,Hotel,Restaurant,Pizza Place,Italian Restaurant,Bank,Plaza,Gym,Steakhouse
3,M6A,North York,Lawrence Manor,43.722079,-79.437507,3.0,Kids Store,Bank,Electronics Store,Park,Doctor's Office,Ethiopian Restaurant,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant


In [41]:
neighborhoods_venues_sorted

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Adelaide,Coffee Shop,Café,Gym,Restaurant,Cosmetics Shop,Clothing Store,Seafood Restaurant,Hotel,Gastropub,Japanese Restaurant
1,0,Agincourt North,Bank,Chinese Restaurant,Liquor Store,Fast Food Restaurant,Juice Bar,Beer Store,Frozen Yogurt Shop,Fried Chicken Joint,Sporting Goods Shop,Spa
2,0,Bathurst Quay,Coffee Shop,Café,Park,Sculpture Garden,Caribbean Restaurant,Ramen Restaurant,Bank,Garden,Diner,Grocery Store
3,0,Bloordale Gardens,Convenience Store,Intersection,Sandwich Place,Donut Shop,Coffee Shop,Bank,Print Shop,Ethiopian Restaurant,Doner Restaurant,Dumpling Restaurant
4,0,Broadview North (Old East York),Theater,Park,Discount Store,Grocery Store,Other Great Outdoors,Bus Line,Intersection,Ice Cream Shop,Farmers Market,Eastern European Restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...
182,0,Willowdale,Coffee Shop,Japanese Restaurant,Grocery Store,Gym,Fried Chicken Joint,Pharmacy,Pizza Place,Korean Restaurant,Restaurant,Sandwich Place
183,0,Woburn,Fast Food Restaurant,Discount Store,Coffee Shop,Bank,Department Store,Gym,Big Box Store,Toy / Game Store,Sandwich Place,Hardware Store
184,3,Woodbine Heights,Skating Rink,Park,Athletics & Sports,Bus Stop,Dance Studio,Pharmacy,ATM,Filipino Restaurant,Flea Market,Donut Shop
185,0,York Mills,Coffee Shop,Gym,Restaurant,French Restaurant,Thai Restaurant,Sandwich Place,Pub,Gym / Fitness Center,Business Service,Optical Shop


In [42]:
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int, errors='ignore')
toronto_merged

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.758800,-79.320197,0.0,Caribbean Restaurant,Liquor Store,Gas Station,Discount Store,Chinese Restaurant,Laundry Service,Coffee Shop,Shopping Mall,Bank,Pharmacy
1,M4A,North York,Victoria Village,43.732658,-79.311189,3.0,Thai Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Park,Bus Line,Ethiopian Restaurant,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
2,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457,0.0,Coffee Shop,Thai Restaurant,Performing Arts Venue,Pool,Electronics Store,Food Truck,Beer Store,Sushi Restaurant,Restaurant,Pub
2,M5A,Downtown Toronto,Harbourfront,43.640080,-79.380150,0.0,Coffee Shop,Café,Hotel,Restaurant,Pizza Place,Italian Restaurant,Bank,Plaza,Gym,Steakhouse
3,M6A,North York,Lawrence Manor,43.722079,-79.437507,3.0,Kids Store,Bank,Electronics Store,Park,Doctor's Office,Ethiopian Restaurant,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,M8Y,Etobicoke,Royal York South East,43.648183,-79.511296,0.0,Coffee Shop,Sushi Restaurant,Dessert Shop,Bank,Pub,Italian Restaurant,Breakfast Spot,Mobile Phone Shop,French Restaurant,Restaurant
102,M8Z,Etobicoke,Mimico NW,43.616677,-79.496805,4.0,Bakery,Bar,Performing Arts Venue,Skating Rink,Event Space,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant
102,M8Z,Etobicoke,The Queensway West,43.623618,-79.514764,0.0,Restaurant,Coffee Shop,Italian Restaurant,Sporting Goods Shop,Yoga Studio,Bar,Chinese Restaurant,Movie Theater,BBQ Joint,Gourmet Shop
102,M8Z,Etobicoke,South of Bloor,43.658360,-79.443350,0.0,Bar,Café,Sandwich Place,Bakery,Dive Bar,Caribbean Restaurant,Cocktail Bar,Diner,Italian Restaurant,Coffee Shop


In [43]:
toronto_merged['Cluster Labels'].isin(['nan']).any().any()

True

In [44]:
toronto_merged=toronto_merged.dropna(subset=['Cluster Labels'], axis=0)
toronto_merged['Cluster Labels'].isin(['nan']).any().any()

False

In [45]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    cluster=int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

In [46]:
# cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,0.0,Caribbean Restaurant,Liquor Store,Gas Station,Discount Store,Chinese Restaurant,Laundry Service,Coffee Shop,Shopping Mall,Bank,Pharmacy
2,Downtown Toronto,0.0,Coffee Shop,Thai Restaurant,Performing Arts Venue,Pool,Electronics Store,Food Truck,Beer Store,Sushi Restaurant,Restaurant,Pub
2,Downtown Toronto,0.0,Coffee Shop,Café,Hotel,Restaurant,Pizza Place,Italian Restaurant,Bank,Plaza,Gym,Steakhouse
3,North York,0.0,Clothing Store,Restaurant,Coffee Shop,Women's Store,Jewelry Store,Toy / Game Store,American Restaurant,Sushi Restaurant,Sandwich Place,Bakery
4,Downtown Toronto,0.0,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Chinese Restaurant,Japanese Restaurant,Bubble Tea Shop,Thai Restaurant,French Restaurant,Portuguese Restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...
101,Etobicoke,0.0,Garden Center,Gym,Eastern European Restaurant,Sushi Restaurant,Coffee Shop,Women's Store,Doner Restaurant,Donut Shop,Dumpling Restaurant,Egyptian Restaurant
101,Etobicoke,0.0,Coffee Shop,Sushi Restaurant,Dessert Shop,Bank,Pub,Italian Restaurant,Breakfast Spot,Mobile Phone Shop,French Restaurant,Restaurant
102,Etobicoke,0.0,Restaurant,Coffee Shop,Italian Restaurant,Sporting Goods Shop,Yoga Studio,Bar,Chinese Restaurant,Movie Theater,BBQ Joint,Gourmet Shop
102,Etobicoke,0.0,Bar,Café,Sandwich Place,Bakery,Dive Bar,Caribbean Restaurant,Cocktail Bar,Diner,Italian Restaurant,Coffee Shop


In [47]:
# cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1,toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
85,Scarborough,1.0,Playground,Women's Store,Event Space,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store
89,Etobicoke,1.0,Playground,Women's Store,Event Space,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store
90,Scarborough,1.0,Playground,Women's Store,Event Space,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store


In [48]:
# cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Scarborough,2.0,Park,Fast Food Restaurant,Event Space,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store
12,Scarborough,2.0,Park,Event Space,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Ethiopian Restaurant
17,Etobicoke,2.0,Park,Electronics Store,Event Space,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Ethiopian Restaurant
34,North York,2.0,Park,Baseball Field,Women's Store,Event Space,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store
45,North York,2.0,Park,Middle Eastern Restaurant,Women's Store,Event Space,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store
77,Etobicoke,2.0,Park,Event Space,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Ethiopian Restaurant
82,Scarborough,2.0,Park,Convenience Store,Gas Station,Event Space,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store
95,Scarborough,2.0,Park,Fast Food Restaurant,Event Space,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store


In [49]:
# cluster 4
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,3.0,Thai Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Park,Bus Line,Ethiopian Restaurant,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
3,North York,3.0,Kids Store,Bank,Electronics Store,Park,Doctor's Office,Ethiopian Restaurant,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
5,Etobicoke,3.0,Skating Rink,Park,Bus Stop,Bakery,Convenience Store,Fish Market,Flea Market,Doner Restaurant,Donut Shop,Dumpling Restaurant
8,East York,3.0,Home Service,Park,Bakery,Coffee Shop,Women's Store,Ethiopian Restaurant,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
14,East York,3.0,Skating Rink,Park,Athletics & Sports,Bus Stop,Dance Studio,Pharmacy,ATM,Filipino Restaurant,Flea Market,Donut Shop
17,Etobicoke,3.0,Dog Run,Flower Shop,Gas Station,Park,Fish & Chips Shop,Electronics Store,Flea Market,Doner Restaurant,Donut Shop,Dumpling Restaurant
17,Etobicoke,3.0,Piano Bar,Baseball Field,Park,Golf Course,Women's Store,Electronics Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
23,East York,3.0,Sandwich Place,Convenience Store,Japanese Restaurant,Park,Ethiopian Restaurant,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
30,Downtown Toronto,3.0,Beer Store,Ice Cream Shop,Park,Falafel Restaurant,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store
33,North York,3.0,Tennis Court,Lawyer,Park,Women's Store,Ethiopian Restaurant,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant


In [50]:
# cluster 5
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
34,North York,4.0,Gas Station,Bakery,Falafel Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Ethiopian Restaurant,Event Space
88,Etobicoke,4.0,Bakery,Bar,Performing Arts Venue,Skating Rink,Event Space,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant
101,Etobicoke,4.0,Bakery,Bar,Performing Arts Venue,Skating Rink,Event Space,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant
102,Etobicoke,4.0,Bakery,Bar,Performing Arts Venue,Skating Rink,Event Space,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant
