In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

import json # library to handle JSON files

import requests # library to handle requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import matplotlib.cm as cm
import matplotlib.colors as colors # Matplotlib and associated plotting modules

from sklearn.cluster import KMeans # import k-means from clustering stage

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim  # convert an address into latitude and longitude values

#!conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup # website scraping libraries and packages in Python from BeautifulSoup 

import requests

print("Libraries imported.")


Libraries imported.


In [2]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(data, 'html.parser')

In [3]:
# find all the rows of the table
soup.find('table').find_all('tr')

# for each row of the table, find all the table data
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')

In [4]:
postalCodeList = []
boroughList = []
neighborhoodList = []

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCodeList.append(cells[0].text)
        boroughList.append(cells[1].text)
        neighborhoodList.append(cells[2].text.rstrip('\n'))
        
toronto_df = pd.DataFrame({"PostalCode": postalCodeList,
                           "Borough": boroughList,
                           "Neighborhood": neighborhoodList})

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
toronto_df.shape

(287, 3)

### Remove Not assigned

In [6]:
toronto_df = toronto_df[toronto_df.Borough != "Not assigned"].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [7]:
toronto_df.shape

(210, 3)

### Group Neighborhoods

In [8]:
toronto_df = toronto_df.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Replace Not Assigned Neighborhood

In [9]:
toronto_df.loc[toronto_df['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood
93,M9A,Queen's Park,Not assigned


In [10]:
toronto_df['Neighborhood'].replace(['Not assigned'], "Queen's Park")

0                                         Rouge, Malvern
1                 Highland Creek, Rouge Hill, Port Union
2                      Guildwood, Morningside, West Hill
3                                                 Woburn
4                                              Cedarbrae
5                                    Scarborough Village
6            East Birchmount Park, Ionview, Kennedy Park
7                        Clairlea, Golden Mile, Oakridge
8        Cliffcrest, Cliffside, Scarborough Village West
9                            Birch Cliff, Cliffside West
10     Dorset Park, Scarborough Town Centre, Wexford ...
11                                     Maryvale, Wexford
12                                             Agincourt
13               Clarks Corners, Sullivan, Tam O'Shanter
14     Agincourt North, L'Amoreaux East, Milliken, St...
15                                       L'Amoreaux West
16                                           Upper Rouge
17                             

In [11]:
toronto_df.loc[toronto_df['Borough'] == "Queen's Park"]

Unnamed: 0,PostalCode,Borough,Neighborhood
93,M9A,Queen's Park,Not assigned


In [12]:
toronto_df.shape

(103, 3)

In [13]:
value_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]
toronto_df2=toronto_df[toronto_df.PostalCode.isin(value_list)]
toronto_df3=toronto_df2.reset_index()
toronto_df3 = toronto_df3[['PostalCode','Borough','Neighborhood']]
toronto_df3

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1J,Scarborough,Scarborough Village
2,M1R,Scarborough,"Maryvale, Wexford"
3,M2H,North York,Hillcrest Village
4,M4B,East York,"Woodbine Gardens, Parkview Hill"
5,M4G,East York,Leaside
6,M4M,East Toronto,Studio District
7,M5A,Downtown Toronto,Harbourfront
8,M5G,Downtown Toronto,Central Bay Street
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo..."


### Read the coordinates

In [14]:
coordinates = pd.read_csv('https://cocl.us/Geospatial_data')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the Toronto Df with the Coords

In [15]:
coordinates.columns = ['PostalCode', 'Latitude','Longitude']
toronto_df_with_Coords = toronto_df3.merge(coordinates, on="PostalCode", how="left")
toronto_df_with_Coords

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
2,M1R,Scarborough,"Maryvale, Wexford",43.750072,-79.295849
3,M2H,North York,Hillcrest Village,43.803762,-79.363452
4,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
5,M4G,East York,Leaside,43.70906,-79.363452
6,M4M,East Toronto,Studio District,43.659526,-79.340923
7,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
8,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442


### Neighborhood in Toronto

In [16]:
borough_names = list(toronto_df_with_Coords.Borough.unique())

borough_with_toronto = []

for x in borough_names:
    if "toronto" in x.lower():
        borough_with_toronto.append(x)
        
borough_with_toronto

['East Toronto', 'Downtown Toronto']


### create a new DataFrame with only boroughs that contain the word Toronto

In [17]:
toronto = toronto_df_with_Coords[toronto_df_with_Coords['Borough'].isin(borough_with_toronto)].reset_index(drop=True)
print(toronto.shape)
toronto.head()

(4, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4M,East Toronto,Studio District,43.659526,-79.340923
1,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
2,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
3,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442


In [None]:
#conda install folium --yes

In [23]:
#conda install --override-channels -c main -c conda-forge folium=0.5.0 --yes


In [19]:
!conda install --override-channels -c conda-forge folium=0.5.0 --yes
import folium



Solving environment: ...working... done

## Package Plan ##

  environment location: E:\Anaconda

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    psycopg2-2.8.4             |   py36hb32ad35_1         160 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    libssh2-1.8.2              |       h642c060_2         186 KB  conda-forge
    curl-7.68.0                |       h4496350_0         118 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         1.1 MB

The following NEW packages will be INSTALLED:

    altair:                 4.0.1-py_0              conda-forge
    branca:                 0.4.0-py_0              conda-forge
    conda-packa


psycopg2-2.8.4       | 160 KB    |            |   0% 
psycopg2-2.8.4       | 160 KB    | 7          |   7% 
psycopg2-2.8.4       | 160 KB    | ########## | 100% 

altair-4.0.1         | 575 KB    |            |   0% 
altair-4.0.1         | 575 KB    | ##7        |  27% 
altair-4.0.1         | 575 KB    | ########1  |  81% 
altair-4.0.1         | 575 KB    | ########7  |  88% 
altair-4.0.1         | 575 KB    | #########6 |  96% 
altair-4.0.1         | 575 KB    | ########## | 100% 

libssh2-1.8.2        | 186 KB    |            |   0% 
libssh2-1.8.2        | 186 KB    | ########## | 100% 

curl-7.68.0          | 118 KB    |            |   0% 
curl-7.68.0          | 118 KB    | ########## | 100% 

folium-0.5.0         | 45 KB     |            |   0% 
folium-0.5.0         | 45 KB     | ########## | 100% 


In [24]:
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [25]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### FOURSQUARE API

In [28]:
CLIENT_ID = 'xxxxxxxxxx' # your Foursquare ID
CLIENT_SECRET = 'xxxxxxxxxx'  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: xxxxxxxxxx
CLIENT_SECRET:xxxxxxxxxx


In [31]:
toronto.loc[0, 'Neighborhood']

'Studio District'

In [32]:
neighborhood_latitude = toronto.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Studio District are 43.6595255, -79.340923.


In [33]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=OYKQTSLK4CCFCUNN0XNJXHXIO5GPMGVWQVHOPSDTHW44C1J3&client_secret=XJA1WJKV0QIKERAU3PSHE0AYPYRFO33PPAIJAGPAX02NHBMB&v=20180605&ll=43.6595255,-79.340923&radius=500&limit=100'

In [34]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e515b14c94979001bae09f6'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Leslieville',
  'headerFullLocation': 'Leslieville, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 42,
  'suggestedBounds': {'ne': {'lat': 43.6640255045, 'lng': -79.33471445573701},
   'sw': {'lat': 43.6550254955, 'lng': -79.347131544263}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ad7e958f964a520001021e3',
       'name': "Ed's Real Scoop",
       'location': {'address': '920 Queen St. E',
        'crossStreet': 'btwn Logan Ave. & Morse St.',
        'lat': 43.660655832455014,
        'lng': -79.3420187548006,
        'labeledLatLngs': 

In [35]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [36]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Ed's Real Scoop,Ice Cream Shop,43.660656,-79.342019
1,Leslieville Pumps,Sandwich Place,43.660892,-79.340626
2,Queen Books,Bookstore,43.660651,-79.342267
3,Te Aro,Coffee Shop,43.661373,-79.338577
4,Hooked,Fish Market,43.660407,-79.343257


How many venues were returned by Foursquare?

In [37]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

42 venues were returned by Foursquare.


### Explore Neighboors in Toronto

In [38]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [39]:
toronto_venues = getNearbyVenues(names=toronto['Neighborhood'],
                                   latitudes=toronto['Latitude'],
                                   longitudes=toronto['Longitude']
                                  )

Studio District
Harbourfront
Central Bay Street
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara


In [41]:
print(toronto_venues.shape)
toronto_venues.head()

(187, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Studio District,43.659526,-79.340923,Ed's Real Scoop,43.660656,-79.342019,Ice Cream Shop
1,Studio District,43.659526,-79.340923,Leslieville Pumps,43.660892,-79.340626,Sandwich Place
2,Studio District,43.659526,-79.340923,Queen Books,43.660651,-79.342267,Bookstore
3,Studio District,43.659526,-79.340923,Te Aro,43.661373,-79.338577,Coffee Shop
4,Studio District,43.659526,-79.340923,Hooked,43.660407,-79.343257,Fish Market


In [42]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",17,17,17,17,17,17
Central Bay Street,82,82,82,82,82,82
Harbourfront,46,46,46,46,46,46
Studio District,42,42,42,42,42,42


In [43]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 93 uniques categories.


### Analyze Each Neighborhood

In [44]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,...,Speakeasy,Stationery Store,Steakhouse,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Thrift / Vintage Store,Vegetarian / Vegan Restaurant,Wine Bar
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
toronto_onehot.shape

(187, 93)

In [46]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Speakeasy,Stationery Store,Steakhouse,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Thrift / Vintage Store,Vegetarian / Vegan Restaurant,Wine Bar
0,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Central Bay Street,0.012195,0.0,0.0,0.0,0.0,0.0,0.0,0.012195,0.0,...,0.012195,0.0,0.012195,0.012195,0.012195,0.02439,0.0,0.0,0.012195,0.012195
2,Harbourfront,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021739,...,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0
3,Studio District,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,...,0.0,0.02381,0.0,0.0,0.0,0.02381,0.0,0.02381,0.0,0.02381


In [47]:
toronto_grouped.shape

(4, 93)

In [54]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
                 venue  freq
0      Airport Service  0.18
1       Airport Lounge  0.12
2     Airport Terminal  0.12
3  Rental Car Location  0.06
4              Airport  0.06


----Central Bay Street----
                 venue  freq
0          Coffee Shop  0.17
1   Italian Restaurant  0.05
2         Burger Joint  0.04
3            Juice Bar  0.04
4  Japanese Restaurant  0.04


----Harbourfront----
            venue  freq
0     Coffee Shop  0.15
1             Pub  0.07
2          Bakery  0.07
3            Park  0.07
4  Breakfast Spot  0.04


----Studio District----
                 venue  freq
0                 Café  0.10
1          Coffee Shop  0.07
2  American Restaurant  0.05
3              Brewery  0.05
4   Italian Restaurant  0.05




In [55]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [57]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Lounge,Airport Terminal,Coffee Shop,Boutique,Rental Car Location,Plane,Sculpture Garden,Harbor / Marina,Boat or Ferry
1,Central Bay Street,Coffee Shop,Italian Restaurant,Juice Bar,Japanese Restaurant,Sandwich Place,Burger Joint,Ice Cream Shop,Gym / Fitness Center,Salad Place,Department Store
2,Harbourfront,Coffee Shop,Pub,Bakery,Park,Mexican Restaurant,Café,Breakfast Spot,Theater,Restaurant,Hotel
3,Studio District,Café,Coffee Shop,American Restaurant,Bakery,Italian Restaurant,Brewery,Gastropub,Wine Bar,Diner,Latin American Restaurant


### Clustering

In [59]:
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop(["Neighborhood"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 1, 2, 1])

In [60]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Café,Coffee Shop,American Restaurant,Bakery,Italian Restaurant,Brewery,Gastropub,Wine Bar,Diner,Latin American Restaurant
1,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,2,Coffee Shop,Pub,Bakery,Park,Mexican Restaurant,Café,Breakfast Spot,Theater,Restaurant,Hotel
2,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1,Coffee Shop,Italian Restaurant,Juice Bar,Japanese Restaurant,Sandwich Place,Burger Joint,Ice Cream Shop,Gym / Fitness Center,Salad Place,Department Store
3,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,0,Airport Service,Airport Lounge,Airport Terminal,Coffee Shop,Boutique,Rental Car Location,Plane,Sculpture Garden,Harbor / Marina,Boat or Ferry


In [61]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

In [63]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Downtown Toronto,0,Airport Service,Airport Lounge,Airport Terminal,Coffee Shop,Boutique,Rental Car Location,Plane,Sculpture Garden,Harbor / Marina,Boat or Ferry


In [64]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,1,Café,Coffee Shop,American Restaurant,Bakery,Italian Restaurant,Brewery,Gastropub,Wine Bar,Diner,Latin American Restaurant
2,Downtown Toronto,1,Coffee Shop,Italian Restaurant,Juice Bar,Japanese Restaurant,Sandwich Place,Burger Joint,Ice Cream Shop,Gym / Fitness Center,Salad Place,Department Store


In [65]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,2,Coffee Shop,Pub,Bakery,Park,Mexican Restaurant,Café,Breakfast Spot,Theater,Restaurant,Hotel
