## Load libraries and set other needed environmental components 

In [1]:
# Gather resources
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')


Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.12.5          |   py36h5fab9bb_1         143 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-2.1.0                |     pyhd3deb0d_0          64 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         240 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-2.1.0-pyhd3deb0d_0

The following packages will be UPDATED:

  certifi                          2020.12.5-py36h5fab9bb_0 --> 202

In [2]:
!pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/bd/78/56a7c88a57d0d14945472535d0df9fb4bbad7d34ede658ec7961635c790e/lxml-4.6.2-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 6.2MB/s eta 0:00:01     |████                            | 686kB 6.2MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.6.2


## Obtain the following from the table of Toronto postal codes
## Postal Code, Borough and Neighbourhood

In [3]:
# Scrape every HTML table on the Wikipedia page; the postal-code table is the first one.
postal_code_page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page_tables = pd.read_html(postal_code_page)
Canada_temp1 = page_tables[0]
Canada_temp1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## To use the geographical coordinates of each postal code effectively, we first remove the rows whose Borough is “Not assigned”

In [4]:
# Keep only the postal codes whose Borough has actually been assigned.
assigned_borough = Canada_temp1['Borough'].ne("Not assigned")
Canada_temp2 = Canada_temp1[assigned_borough]
Canada_temp2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# Sanity check: list any remaining row whose Neighbourhood is still unassigned.
unassigned_hood = Canada_temp2['Neighbourhood'] == 'Not assigned'
contain_values_post = Canada_temp2.loc[unassigned_hood]
print(contain_values_post)

# Canada_postal is a second name for the same frame (no copy is made).
Canada_postal = Canada_temp2
Canada_postal.shape

Empty DataFrame
Columns: [Postal Code, Borough, Neighbourhood]
Index: []


(103, 3)

In [6]:
# END OF SECTION 1

## Section 2:  merge the latitude and the longitude coordinates of each neighborhood into a new dataframe

## Obtain the geographical coordinates of each postal code

In [7]:
import pandas as pd

# Postal code -> latitude/longitude lookup table, read from a remote CSV.
geo_csv_url = 'http://cocl.us/Geospatial_data'
G_Canada_postal = pd.read_csv(geo_csv_url)
G_Canada_postal.shape

(103, 3)

In [8]:
G_Canada_postal.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Join two collections of information to make the completed rows in a dataframe

In [9]:
# Left join: keep every row of Canada_postal and attach its coordinates by postal code.
Canada_merged = Canada_postal.merge(G_Canada_postal, how='left', on='Postal Code')
Canada_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [10]:
Canada_merged.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999


In [11]:
# END OF SECTION 2:

## Section 3:  Explore and cluster the neighborhoods in Toronto

## Drop the postal codes from the pandas dataframe, since we will be working primarily with Boroughs and Neighbourhoods

In [12]:
# Remove the postal code column by keyword: the positional form drop(label, 1)
# is deprecated and is no longer accepted by pandas 2.x.
neighborhoods = Canada_merged.drop(columns='Postal Code')
neighborhoods.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,North York,Parkwoods,43.753259,-79.329656
1,North York,Victoria Village,43.725882,-79.315572
2,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Extract only those rows that contain the word “Toronto” in the Borough

In [13]:
# Boolean mask: True where the Borough name includes "Toronto".
in_toronto = neighborhoods['Borough'].str.contains('Toronto')
Toronto_neighborhoods = neighborhoods.loc[in_toronto]
Toronto_neighborhoods.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
2,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,Downtown Toronto,St. James Town,43.651494,-79.375418
19,East Toronto,The Beaches,43.676357,-79.293031


In [14]:
# Report the number of distinct boroughs and the row count for the full table
# and for the Toronto-only subset.
canada_boroughs = len(neighborhoods['Borough'].unique())
toronto_boroughs = len(Toronto_neighborhoods['Borough'].unique())

print('The Canada dataframe has {} boroughs and {} neighborhoods.'.format(
    canada_boroughs, len(neighborhoods)))
print('The Toronto dataframe has {} boroughs and {} neighborhoods.'.format(
    toronto_boroughs, len(Toronto_neighborhoods)))

The Canada dataframe has 10 boroughs and 103 neighborhoods.
The Toronto dataframe has 4 boroughs and 39 neighborhoods.


## Use geopy library to get the latitude and longitude values of Canada and Toronto.

In [15]:
# A single geocoder client serves both look-ups.
geolocator = Nominatim(user_agent="Canada_explorer")

lookups = [
    ('Canada', 'The geograpical coordinate of Canada are {}, {}.'),
    ('Toronto, Canada', 'The geograpical coordinate of Toronto are {}, {}.'),
]

# Toronto is queried last, so address/location/latitude/longitude end up
# holding Toronto's values once the loop finishes.
for address, message in lookups:
    location = geolocator.geocode(address)
    latitude, longitude = location.latitude, location.longitude
    print(message.format(latitude, longitude))

The geograpical coordinate of Canada are 61.0666922, -107.9917071.
The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


## Create a map of Toronto with neighborhoods superimposed on top.

In [16]:
map_Toronto  = folium.Map(location=[latitude, longitude], zoom_start=10)

In [17]:
# Add one circle marker per Toronto neighbourhood to the map; the popup shows
# "<neighbourhood>, <borough>".
marker_fields = Toronto_neighborhoods[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_Toronto)


After mapping, we will store a screenshot, for later use.

In [18]:
map_Toronto 

In [19]:
# Renumber the Toronto rows from 0 (the filtered frame kept its original row labels),
# discarding the old index instead of keeping it as a column.
Toronto_data = Toronto_neighborhoods.reset_index(drop=True)
Toronto_data.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,Downtown Toronto,St. James Town,43.651494,-79.375418
4,East Toronto,The Beaches,43.676357,-79.293031


## Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them

In [20]:
# The code was removed by Watson Studio for sharing.

In [21]:
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

Let's explore the first neighborhood in our dataframe.

In [22]:
# Pull the name and coordinates of the first row (label 0) of Toronto_data.
first_row = 0
neighborhood_name = Toronto_data.at[first_row, 'Neighbourhood']
neighborhood_latitude = Toronto_data.at[first_row, 'Latitude']
neighborhood_longitude = Toronto_data.at[first_row, 'Longitude']

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


Now, let's get up to 100 places of worship (churches, Kingdom Halls, prayer rooms, synagogues and temples) within a radius of 500 meters of the first neighborhood.

In [23]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # value sent as the API's radius parameter

# Comma-separated Foursquare category IDs sent as categoryId:
# Church, Kingdom Hall, Prayer Room, Synagogue, Temple
# (The earlier free-text value 'Church, Prayer' was a dead assignment: it was
# overwritten before ever being used, so it has been removed.)
category = '4bf58dd8d48988d132941735,5744ccdfe4b0c0459246b4ac,52e81612bcbc57f1066b7a41,4bf58dd8d48988d139941735,4bf58dd8d48988d13a941735'

# Build the venues/explore request around the first neighbourhood's coordinates.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&categoryId={}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    category,
    radius,
    LIMIT)

In [24]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60132a6b1ba8f361b8769067'},
 'response': {'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'query': 'church',
  'totalResults': 6,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b6f550bf964a520d7eb2ce3',
       'name': 'Little Trinity Anglican Church',
       'location': {'address': '425 King St. E.',
        'crossStreet': 'at Trinity St.',
        'lat': 43.65309196622969,
        'lng': -79.36193660953931,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.65309196622969,
          'lng': -79.3619366095

Borrowing the get_category_type function from the Foursquare lab.

In [25]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping-like (dict or pandas Series)
        Must provide the category list under the key 'categories' or, failing
        that, under 'venue.categories'.

    Returns
    -------
    str or None
        The 'name' of the first category, or None when the record has no
        categories (empty list, or a missing value that is not a list).

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and is allowed to surface instead of being swallowed.
        categories_list = row['venue.categories']

    # A non-sequence placeholder (None/NaN) and an empty list both mean "no category".
    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

Now we are ready to clean the json and structure it into a pandas dataframe.

In [26]:
# Venue items of the first result group in the API response.
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON into columns. pd.json_normalize is the supported entry
# point; the older pandas.io.json.json_normalize import is deprecated and emits
# a FutureWarning.
nearby_venues = pd.json_normalize(venues)

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# reduce each row's category list to the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component, e.g. 'venue.location.lat' -> 'lat'
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))
nearby_venues.head()

6 venues were returned by Foursquare.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Little Trinity Anglican Church,Church,43.653092,-79.361937
1,St. Paul's Basilica,Church,43.655857,-79.36306
2,The Good Shepperd,Church,43.656312,-79.362624
3,Berkeley Church,Event Space,43.655123,-79.365873
4,St. George's Church,Church,43.657828,-79.363312


Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [27]:
def getNearbyVenues(names, latitudes, longitudes, category, radius=500):
    """Query Foursquare ``venues/explore`` around each neighbourhood and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighbourhood names and coordinates, iterated in parallel with ``zip``.
    category : str
        Value sent as the ``categoryId`` query parameter.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        When no venue is returned at all, an empty frame with these columns.

    Notes
    -----
    Reads CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT from the notebook globals
    and prints each neighbourhood name as it is processed.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&categoryId={}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            category,
            radius,
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category used to raise IndexError here.
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema even when
    # `rows` is empty; assigning .columns afterwards raised ValueError in that case.
    return pd.DataFrame(rows, columns=columns)

In [28]:
# Comma-separated Foursquare category IDs forwarded as the `category` argument.
worship_category_ids = '4bf58dd8d48988d132941735,5744ccdfe4b0c0459246b4ac,52e81612bcbc57f1066b7a41,4bf58dd8d48988d139941735,4bf58dd8d48988d13a941735'

# Collect venues for every Toronto neighbourhood (default radius).
Toronto_venues = getNearbyVenues(
    Toronto_data['Neighbourhood'],
    Toronto_data['Latitude'],
    Toronto_data['Longitude'],
    worship_category_ids,
)

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
R

In [29]:
print(Toronto_venues.shape)
Toronto_venues.head()

(75, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Little Trinity Anglican Church,43.653092,-79.361937,Church
1,"Regent Park, Harbourfront",43.65426,-79.360636,St. Paul's Basilica,43.655857,-79.36306,Church
2,"Regent Park, Harbourfront",43.65426,-79.360636,The Good Shepperd,43.656312,-79.362624,Church
3,"Regent Park, Harbourfront",43.65426,-79.360636,Berkeley Church,43.655123,-79.365873,Event Space
4,"Regent Park, Harbourfront",43.65426,-79.360636,St. George's Church,43.657828,-79.363312,Church


Let's check how many venues were returned for each neighborhood

In [30]:
Toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Bay Street,2,2,2,2,2,2
Church and Wellesley,11,11,11,11,11,11
"Commerce Court, Victoria Hotel",3,3,3,3,3,3
Davisville,2,2,2,2,2,2
"Dufferin, Dovercourt Village",1,1,1,1,1,1
"First Canadian Place, Underground city",1,1,1,1,1,1
"Garden District, Ryerson",8,8,8,8,8,8
"Harbourfront East, Union Station, Toronto Islands",1,1,1,1,1,1
"High Park, The Junction South",2,2,2,2,2,2
"India Bazaar, The Beaches West",3,3,3,3,3,3


In [31]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 12 uniques categories.


## Apply the k-means clustering algorithm and associated tailoring tools

In [32]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Neighborhood,Bookstore,Breakfast Spot,Church,Event Space,Gay Bar,Intersection,Monument / Landmark,Music Venue,Office,Park,Residential Building (Apartment / Condo),Spiritual Center
0,"Regent Park, Harbourfront",0,0,1,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,1,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,1,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,1,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,1,0,0,0,0,0,0,0,0,0


In [33]:
Toronto_onehot.shape

(75, 13)

Next, let's group the rows by neighborhood and take the mean of each category column, which gives the frequency of occurrence of each category within the neighborhood.

In [34]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped

Unnamed: 0,Neighborhood,Bookstore,Breakfast Spot,Church,Event Space,Gay Bar,Intersection,Monument / Landmark,Music Venue,Office,Park,Residential Building (Apartment / Condo),Spiritual Center
0,Central Bay Street,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Church and Wellesley,0.090909,0.090909,0.272727,0.0,0.454545,0.0,0.0,0.0,0.0,0.0,0.090909,0.0
2,"Commerce Court, Victoria Hotel",0.0,0.0,0.666667,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
3,Davisville,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Dufferin, Dovercourt Village",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"First Canadian Place, Underground city",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Garden District, Ryerson",0.0,0.0,0.875,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0
7,"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,"High Park, The Junction South",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"India Bazaar, The Beaches West",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
Toronto_grouped.shape

(28, 13)

Let's print each neighborhood along with the top 5 most common venues

In [36]:
num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Slice out this neighbourhood and transpose so each column becomes a row.
    hood_profile = Toronto_grouped.loc[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    hood_profile.columns = ['venue','freq']
    # Row 0 carries the 'Neighborhood' label itself, not a category frequency.
    hood_profile = hood_profile.iloc[1:]
    hood_profile['freq'] = hood_profile['freq'].astype(float)
    ranked = (
        hood_profile
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(ranked.head(num_top_venues))
    print('\n')

----Central Bay Street----
            venue  freq
0          Church   1.0
1       Bookstore   0.0
2  Breakfast Spot   0.0
3     Event Space   0.0
4         Gay Bar   0.0


----Church and Wellesley----
                                      venue  freq
0                                   Gay Bar  0.45
1                                    Church  0.27
2                                 Bookstore  0.09
3                            Breakfast Spot  0.09
4  Residential Building (Apartment / Condo)  0.09


----Commerce Court, Victoria Hotel----
            venue  freq
0          Church  0.67
1    Intersection  0.33
2       Bookstore  0.00
3  Breakfast Spot  0.00
4     Event Space  0.00


----Davisville----
            venue  freq
0          Church   1.0
1       Bookstore   0.0
2  Breakfast Spot   0.0
3     Event Space   0.0
4         Gay Bar   0.0


----Dufferin, Dovercourt Village----
            venue  freq
0          Church   1.0
1       Bookstore   0.0
2  Breakfast Spot   0.0
3     Event S

In [37]:
# Let's put that into a pandas dataframe
# First, let's write a function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped (in this notebook it holds the
    neighbourhood name); the remaining entries are ranked from the highest
    value to the lowest and the index labels of the leading ones are returned
    as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]


In [38]:
# Now let's create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10


def _ordinal(n):
    """Return ``n`` followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    # 10-20 always take 'th' (this covers 11th, 12th, 13th); otherwise the last digit decides.
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# One label column followed by one column per rank. The explicit helper replaces
# a bare `except:` that was used as control flow and would have produced
# "21th"/"22th" for larger values of num_top_venues.
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

# fill each row with that neighbourhood's categories, most frequent first
for ind in range(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Bay Street,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
1,Church and Wellesley,Gay Bar,Church,Residential Building (Apartment / Condo),Breakfast Spot,Bookstore,Spiritual Center,Park,Office,Music Venue,Monument / Landmark
2,"Commerce Court, Victoria Hotel",Church,Intersection,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Gay Bar,Event Space
3,Davisville,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
4,"Dufferin, Dovercourt Village",Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space


In [40]:
# More wrangling to match the terminology across dataframes
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so the in-place rename below is also visible through neighborhoods_venues_sorted.
neighbourhoods_venues_sorted=neighborhoods_venues_sorted
# Rename the 'Neighborhood' column to the 'Neighbourhood' spelling, in place.
neighbourhoods_venues_sorted.rename(columns={'Neighborhood':'Neighbourhood'},inplace=True)
neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Bay Street,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
1,Church and Wellesley,Gay Bar,Church,Residential Building (Apartment / Condo),Breakfast Spot,Bookstore,Spiritual Center,Park,Office,Music Venue,Monument / Landmark
2,"Commerce Court, Victoria Hotel",Church,Intersection,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Gay Bar,Event Space
3,Davisville,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
4,"Dufferin, Dovercourt Village",Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space


## Cluster Neighborhoods

In [41]:
# Run k-means to cluster the neighborhood into 5 clusters.
# set number of clusters
kclusters = 5

# Feature matrix: every column except the neighbourhood label. The keyword form
# replaces the deprecated positional drop(label, 1), which pandas 2.x rejects.
Toronto_grouped_clustering = Toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering
# n_init is pinned to 10 (the long-standing default) so the result does not
# change with the default introduced by newer scikit-learn releases.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(Toronto_grouped_clustering)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

array([1, 0, 4, 1, 1, 1, 1, 3, 1, 1], dtype=int32)

In [42]:
# Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
# add clustering labels
# Add the clustering labels as the first column. DataFrame.insert raises
# ValueError if the column already exists, so on a re-run of this cell the
# existing column is overwritten instead.
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and ranked venue columns to each Toronto neighbourhood
# (with its borough and coordinates). Neighbourhoods absent from the venue table
# get NaN in the added columns.
Toronto_merged = Toronto_data.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

Toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4.0,Church,Park,Event Space,Spiritual Center,Residential Building (Apartment / Condo),Office,Music Venue,Monument / Landmark,Intersection,Gay Bar
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1.0,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1.0,Church,Office,Spiritual Center,Residential Building (Apartment / Condo),Park,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
3,Downtown Toronto,St. James Town,43.651494,-79.375418,1.0,Church,Residential Building (Apartment / Condo),Spiritual Center,Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
4,East Toronto,The Beaches,43.676357,-79.293031,1.0,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space


In [43]:
Toronto_merged1 = Toronto_merged[Toronto_merged['Cluster Labels'].isna()]
Toronto_merged1

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Downtown Toronto,Berczy Park,43.644771,-79.373306,,,,,,,,,,,
7,Downtown Toronto,Christie,43.669542,-79.422564,,,,,,,,,,,
14,West Toronto,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191,,,,,,,,,,,
18,Central Toronto,Lawrence Park,43.72802,-79.38879,,,,,,,,,,,
19,Central Toronto,Roselawn,43.711695,-79.416936,,,,,,,,,,,
20,Central Toronto,Davisville North,43.712751,-79.390197,,,,,,,,,,,
21,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307,,,,,,,,,,,
25,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325,,,,,,,,,,,
32,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442,,,,,,,,,,,
33,Downtown Toronto,Rosedale,43.679563,-79.377529,,,,,,,,,,,


In [44]:
# pandas delete NaN rows from the candidate dataframe
# Keep only the neighbourhoods that received a cluster label. The explicit copy
# makes the result an independent frame, so assigning to its columns in a later
# cell does not trigger SettingWithCopyWarning.
Toronto_merged = Toronto_merged[Toronto_merged['Cluster Labels'].notna()].copy()
Toronto_merged.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4.0,Church,Park,Event Space,Spiritual Center,Residential Building (Apartment / Condo),Office,Music Venue,Monument / Landmark,Intersection,Gay Bar
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1.0,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1.0,Church,Office,Spiritual Center,Residential Building (Apartment / Condo),Park,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
3,Downtown Toronto,St. James Town,43.651494,-79.375418,1.0,Church,Residential Building (Apartment / Condo),Spiritual Center,Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
4,East Toronto,The Beaches,43.676357,-79.293031,1.0,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space


## Make sure the 'Cluster Labels' column has an integer dtype
If it does not, cast it to `int`.

In [45]:
Toronto_merged.dtypes

Borough                    object
Neighbourhood              object
Latitude                  float64
Longitude                 float64
Cluster Labels            float64
1st Most Common Venue      object
2nd Most Common Venue      object
3rd Most Common Venue      object
4th Most Common Venue      object
5th Most Common Venue      object
6th Most Common Venue      object
7th Most Common Venue      object
8th Most Common Venue      object
9th Most Common Venue      object
10th Most Common Venue     object
dtype: object

In [46]:
# Cast the label column through DataFrame.astype and rebind the name, rather
# than assigning into a column of a frame produced by boolean filtering
# (which can raise SettingWithCopyWarning).
Toronto_merged = Toronto_merged.astype({'Cluster Labels': int})
Toronto_merged.dtypes

Borough                    object
Neighbourhood              object
Latitude                  float64
Longitude                 float64
Cluster Labels              int64
1st Most Common Venue      object
2nd Most Common Venue      object
3rd Most Common Venue      object
4th Most Common Venue      object
5th Most Common Venue      object
6th Most Common Venue      object
7th Most Common Venue      object
8th Most Common Venue      object
9th Most Common Venue      object
10th Most Common Venue     object
dtype: object

## Use the Folium library to visualize the candidate neighborhoods, in their respective clusters

In [51]:
# set color scheme for the clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster.
# (The former x / ys helpers were only used for their length, which equals kclusters.)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighbourhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The cluster - 1 offset is kept as-is: label 0 wraps around to the last
    # colour via negative indexing, so every cluster still gets its own colour.
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)


Now, we will render the map and save a screenshot

In [52]:
# Display the Folium map built in the previous cell as this cell's output.
map_clusters

Now we can examine each cluster and identify the venue categories that distinguish it from the others. The cells below are titled starting from "Cluster 1", while the stored `Cluster Labels` values start at 0.

In [53]:
# Cluster 1 -- rows whose 'Cluster Labels' value equals 0.
# Show the column at position 1 plus every column from position 5 onward.
Toronto_merged.loc[
    Toronto_merged['Cluster Labels'].eq(0),
    Toronto_merged.columns[[1, *range(5, Toronto_merged.shape[1])]]
]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
31,"Summerhill West, Rathnelly, South Hill, Forest...",Spiritual Center,Church,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
37,Church and Wellesley,Gay Bar,Church,Residential Building (Apartment / Condo),Breakfast Spot,Bookstore,Spiritual Center,Park,Office,Music Venue,Monument / Landmark


In [54]:
# Cluster 2 -- rows whose 'Cluster Labels' value equals 1.
# Show the column at position 1 plus every column from position 5 onward.
Toronto_merged.loc[
    Toronto_merged['Cluster Labels'].eq(1),
    Toronto_merged.columns[[1, *range(5, Toronto_merged.shape[1])]]
]


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,"Queen's Park, Ontario Provincial Government",Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
2,"Garden District, Ryerson",Church,Office,Spiritual Center,Residential Building (Apartment / Condo),Park,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
3,St. James Town,Church,Residential Building (Apartment / Condo),Spiritual Center,Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
4,The Beaches,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
6,Central Bay Street,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
9,"Dufferin, Dovercourt Village",Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
13,"Toronto Dominion Centre, Design Exchange",Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
15,"India Bazaar, The Beaches West",Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
17,Studio District,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
22,"High Park, The Junction South",Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space


In [55]:
# Cluster 3 -- rows whose 'Cluster Labels' value equals 2.
# Show the column at position 1 plus every column from position 5 onward.
Toronto_merged.loc[
    Toronto_merged['Cluster Labels'].eq(2),
    Toronto_merged.columns[[1, *range(5, Toronto_merged.shape[1])]]
]


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,"Little Portugal, Trinity",Event Space,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Intersection,Gay Bar,Church


In [56]:
# Cluster 4 -- rows whose 'Cluster Labels' value equals 3.
# Show the column at position 1 plus every column from position 5 onward.
Toronto_merged.loc[
    Toronto_merged['Cluster Labels'].eq(3),
    Toronto_merged.columns[[1, *range(5, Toronto_merged.shape[1])]]
]


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,"Harbourfront East, Union Station, Toronto Islands",Monument / Landmark,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Intersection,Gay Bar,Event Space,Church


In [57]:
# Cluster 5 -- rows whose 'Cluster Labels' value equals 4.
# Show the column at position 1 plus every column from position 5 onward.
Toronto_merged.loc[
    Toronto_merged['Cluster Labels'].eq(4),
    Toronto_merged.columns[[1, *range(5, Toronto_merged.shape[1])]]
]


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Regent Park, Harbourfront",Church,Park,Event Space,Spiritual Center,Residential Building (Apartment / Condo),Office,Music Venue,Monument / Landmark,Intersection,Gay Bar
8,"Richmond, Adelaide, King",Church,Office,Spiritual Center,Residential Building (Apartment / Condo),Park,Music Venue,Monument / Landmark,Intersection,Gay Bar,Event Space
12,"The Danforth West, Riverdale",Church,Music Venue,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Monument / Landmark,Intersection,Gay Bar,Event Space
16,"Commerce Court, Victoria Hotel",Church,Intersection,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Gay Bar,Event Space
34,Stn A PO Boxes,Intersection,Church,Spiritual Center,Residential Building (Apartment / Condo),Park,Office,Music Venue,Monument / Landmark,Gay Bar,Event Space
