# Toronto Neighborhoods Analytics Project

## First, get the data

In [30]:
#install Beautiful Soup and requests for Web Scaping
!pip install BeautifulSoup4
!pip install requests



In [57]:
#imports
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

#get html from wiki page and create soup object
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(source.text, 'lxml')

#using soup object, iterate the .wikitable to get the data from the HTML page and store it into a list
data = []
columns = []
table = soup.find(class_='wikitable')
for index, tr in enumerate(table.find_all('tr')):
    section = []
    for td in tr.find_all(['th','td']):
        section.append(td.text.rstrip())
    
    #First row of data is the header
    if (index == 0):
        columns = section
    else:
        data.append(section)

#convert list into Pandas DataFrame
canada_df = pd.DataFrame(data = data,columns = columns)
canada_df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [58]:
#Remove Boroughs that are 'Not assigned'
canada_df = canada_df[canada_df['Borough'] != 'Not assigned']
canada_df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [59]:
canada_df.shape

(103, 3)

In [60]:
canada_df['Neighborhood'].replace("Not assigned", canada_df["Borough"],inplace=True)
canada_df.head(10)

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge
11,M3B,North York,Don Mills
12,M4B,East York,Parkview Hill / Woodbine Gardens
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [63]:
canada_df.shape

(103, 3)

## Part 2 Merge data

In [68]:
!wget -q -O 'Geospatial_data.csv' http://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [69]:
Geospatial_data = pd.read_csv('Geospatial_data.csv')
Geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [71]:
Geospatial_data.rename(columns={'Postal Code': 'Postal code'}, inplace=True)

In [73]:
data1 = pd.merge(canada_df, Geospatial_data, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)

data1.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [75]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
Postal code     103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
Latitude        103 non-null float64
Longitude       103 non-null float64
dtypes: float64(2), object(3)
memory usage: 4.8+ KB


In [76]:
data1.to_csv('sorted_geoloc.csv')

## Part 3 Analytics data

In [79]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1f             |       h516909a_0         2.1 MB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    ------------------------------------------------------------
                       

In [80]:
import pandas as pd

import json

data1.to_json(path_or_buf='geo_toronto.json', orient='table')

In [81]:
with open('geo_toronto.json') as json_data:
    Toronto_data = json.load(json_data)

In [82]:
neighborhoods_data = Toronto_data['data']
neighborhoods_data[0]

{'index': 0,
 'Postal code': 'M1B',
 'Borough': 'Scarborough',
 'Neighborhood': 'Malvern / Rouge',
 'Latitude': 43.8066863,
 'Longitude': -79.1943534}

In [83]:
data1.info()
data1.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
Postal code     103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
Latitude        103 non-null float64
Longitude       103 non-null float64
dtypes: float64(2), object(3)
memory usage: 9.8+ KB


(103, 5)

In [84]:
data1.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [85]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(data1['Borough'].unique()),
        data1.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In [86]:
address = 'Adelaide'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Adelaide are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Adelaide are -34.9281805, 138.5999312.


In [87]:
!conda install -c conda-forge folium=0.5.0 

Solving environment: done

# All requested packages already installed.



## Ready to generate maps, open them on your browser

In [91]:
map_toronto_neighborhoods = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(data1['Latitude'], data1['Longitude'], data1['Borough'], data1['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_neighbourhoods)  
    
map_toronto_neighbourhoods

map_toronto_neighbourhoods.save("map_toronto_neighbourhoods.html")

#open map_toronto_neighbourhoods.html in browser
#if you cannot generate the maps open PGA_map_*.html from the zip file

In [92]:
address = 'York, Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of York, Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinates of York, Toronto are 43.67910515, -79.49118414007154.


In [94]:
york_data = data1[data1['Borough'] == 'York'].reset_index(drop=True)
york_data

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
1,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512
2,M6M,York,Del Ray / Mount Dennis / Keelsdale and Silvert...,43.691116,-79.476013
3,M6N,York,Runnymede / The Junction North,43.673185,-79.487262
4,M9N,York,Weston,43.706876,-79.518188


In [95]:
york_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
Postal code     5 non-null object
Borough         5 non-null object
Neighborhood    5 non-null object
Latitude        5 non-null float64
Longitude       5 non-null float64
dtypes: float64(2), object(3)
memory usage: 280.0+ bytes


In [97]:
map_york_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(york_data['Latitude'], york_data['Longitude'], york_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_york_toronto)  
    
map_york_toronto

map_york_toronto.save("map_york_toronto.html")

In [98]:
neighbourhood_latitude = york_data.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = york_data.loc[0, 'Longitude'] # neighborhood longitude value

neighbourhood_name = york_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Humewood-Cedarvale are 43.6937813, -79.42819140000002.


In [102]:
LIMIT = 100

radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            'LVR251M4E2M5F5SUBZZNISUPIIRXAC2AV1P1POTZUDSBVDP2', 
            '0VJC2SHNRFK5GEYLEGBQ43PEOI2YDLW4T4ZG1QGSI1P5T3AM', 
            '20200409', 
            lat, 
            lng, 
            radius, 
            LIMIT)

In [103]:
york_results = requests.get(url).json()
#york_results

In [106]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [107]:
york_venues = york_results['response']['groups'][0]['items']
    
york_nearby_venues = json_normalize(york_venues) # flatten JSON

# filter columns
york_filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
york_nearby_venues = york_nearby_venues.loc[:, york_filtered_columns]

# filter the category for each row
york_nearby_venues['venue.categories'] = york_nearby_venues.apply(get_category_type, axis=1)

# clean columns
york_nearby_venues.columns = [col.split(".")[-1] for col in york_nearby_venues.columns]

york_nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Grattan Park,Park,43.706222,-79.521705


In [108]:
york_nearby_venues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 4 columns):
name          1 non-null object
categories    1 non-null object
lat           1 non-null float64
lng           1 non-null float64
dtypes: float64(2), object(2)
memory usage: 112.0+ bytes


In [110]:
print('{} venues were returned by Foursquare.'.format(york_nearby_venues.shape[0]))

1 venues were returned by Foursquare.


In [111]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
              'LVR251M4E2M5F5SUBZZNISUPIIRXAC2AV1P1POTZUDSBVDP2', 
              '0VJC2SHNRFK5GEYLEGBQ43PEOI2YDLW4T4ZG1QGSI1P5T3AM', 
              '20200409', 
               lat, 
               lng, 
               radius, 
               LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [112]:
york_venues = getNearbyVenues(names=york_data['Neighborhood'],
                                   latitudes=york_data['Latitude'],
                                   longitudes=york_data['Longitude']
                                  )

Humewood-Cedarvale
Caledonia-Fairbanks
Del Ray / Mount Dennis / Keelsdale and Silverthorn
Runnymede / The Junction North
Weston


In [113]:
york_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Park,43.692535,-79.428705,Field
1,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Ravine,43.690188,-79.426106,Trail
2,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Dog Park,43.692036,-79.429491,Dog Run
3,Humewood-Cedarvale,43.693781,-79.428191,Phil White Arena,43.691303,-79.431761,Hockey Arena
4,Caledonia-Fairbanks,43.689026,-79.453512,Nairn Park,43.690654,-79.4563,Park


In [114]:
york_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Caledonia-Fairbanks,4,4,4,4,4,4
Del Ray / Mount Dennis / Keelsdale and Silverthorn,4,4,4,4,4,4
Humewood-Cedarvale,4,4,4,4,4,4
Runnymede / The Junction North,4,4,4,4,4,4
Weston,1,1,1,1,1,1


In [115]:
print('There are {} uniques categories.'.format(len(york_venues['Venue Category'].unique())))

There are 15 uniques categories.


In [116]:
york_onehot = pd.get_dummies(york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
york_onehot['Neighborhood'] = york_venues['Neighborhood'] 

# move neighborhood column to the first column
york_fixed_columns = [york_onehot.columns[-1]] + list(york_onehot.columns[:-1])
york_onehot = york_onehot[york_fixed_columns]

york_onehot.head()

Unnamed: 0,Neighborhood,Brewery,Bus Line,Coffee Shop,Discount Store,Dog Run,Field,Grocery Store,Hockey Arena,Market,Park,Pizza Place,Restaurant,Sandwich Place,Trail,Women's Store
0,Humewood-Cedarvale,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,Humewood-Cedarvale,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,Humewood-Cedarvale,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,Humewood-Cedarvale,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,Caledonia-Fairbanks,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [117]:
york_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 16 columns):
Neighborhood      17 non-null object
Brewery           17 non-null uint8
Bus Line          17 non-null uint8
Coffee Shop       17 non-null uint8
Discount Store    17 non-null uint8
Dog Run           17 non-null uint8
Field             17 non-null uint8
Grocery Store     17 non-null uint8
Hockey Arena      17 non-null uint8
Market            17 non-null uint8
Park              17 non-null uint8
Pizza Place       17 non-null uint8
Restaurant        17 non-null uint8
Sandwich Place    17 non-null uint8
Trail             17 non-null uint8
Women's Store     17 non-null uint8
dtypes: object(1), uint8(15)
memory usage: 471.0+ bytes


In [119]:
york_grouped = york_onehot.groupby('Neighborhood').mean().reset_index()
york_grouped.head()

Unnamed: 0,Neighborhood,Brewery,Bus Line,Coffee Shop,Discount Store,Dog Run,Field,Grocery Store,Hockey Arena,Market,Park,Pizza Place,Restaurant,Sandwich Place,Trail,Women's Store
0,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.0,0.0,0.25
1,Del Ray / Mount Dennis / Keelsdale and Silvert...,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0
2,Humewood-Cedarvale,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0
3,Runnymede / The Junction North,0.25,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0
4,Weston,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [120]:
york_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 16 columns):
Neighborhood      5 non-null object
Brewery           5 non-null float64
Bus Line          5 non-null float64
Coffee Shop       5 non-null float64
Discount Store    5 non-null float64
Dog Run           5 non-null float64
Field             5 non-null float64
Grocery Store     5 non-null float64
Hockey Arena      5 non-null float64
Market            5 non-null float64
Park              5 non-null float64
Pizza Place       5 non-null float64
Restaurant        5 non-null float64
Sandwich Place    5 non-null float64
Trail             5 non-null float64
Women's Store     5 non-null float64
dtypes: float64(15), object(1)
memory usage: 720.0+ bytes


In [121]:
num_top_venues = 3

for hood in york_grouped['Neighborhood']:
    print("----"+hood+"----")
    york_temp = york_grouped[york_grouped['Neighborhood'] == hood].T.reset_index()
    york_temp.columns = ['venue','freq']
    york_temp = york_temp.iloc[1:]
    york_temp['freq'] = york_temp['freq'].astype(float)
    york_temp = york_temp.round({'freq': 2})
    print(york_temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Caledonia-Fairbanks----
           venue  freq
0           Park  0.50
1         Market  0.25
2  Women's Store  0.25


----Del Ray / Mount Dennis / Keelsdale and Silverthorn----
            venue  freq
0     Coffee Shop  0.25
1  Discount Store  0.25
2      Restaurant  0.25


----Humewood-Cedarvale----
          venue  freq
0       Dog Run  0.25
1         Field  0.25
2  Hockey Arena  0.25


----Runnymede / The Junction North----
           venue  freq
0        Brewery  0.25
1       Bus Line  0.25
2  Grocery Store  0.25


----Weston----
      venue  freq
0      Park   1.0
1   Brewery   0.0
2  Bus Line   0.0




In [122]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [123]:
num_top_venues = 17

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
york_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)

york_neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue


In [124]:
york_neighborhoods_venues_sorted['Neighborhood'] = york_grouped['Neighborhood']

york_neighborhoods_venues_sorted.head(2)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue
0,Caledonia-Fairbanks,,,,,,,,,,,,,,,,,
1,Del Ray / Mount Dennis / Keelsdale and Silvert...,,,,,,,,,,,,,,,,,


In [133]:
for ind in np.arange(york_grouped.shape[0]):
    york_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(york_grouped.iloc[ind, -2:], num_top_venues)

york_neighborhoods_venues_sorted.head(2)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue
0,Caledonia-Fairbanks,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store
1,Del Ray / Mount Dennis / Keelsdale and Silvert...,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store


In [135]:
# set number of clusters
kclusters = 2

york_grouped_clustering = york_grouped.drop('Neighborhood', 1)

# run k-means clustering
york_kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
york_kmeans.labels_[0:5]

array([0, 1, 1, 1, 0], dtype=int32)

In [137]:
york_neighborhoods_venues_sorted.insert(0, 'Cluster Labels', york_kmeans.labels_)

york_merged = york_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
york_merged = york_merged.join(york_neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

york_merged

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue
0,M6C,York,Humewood-Cedarvale,43.693781,-79.428191,1,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store
1,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512,0,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store
2,M6M,York,Del Ray / Mount Dennis / Keelsdale and Silvert...,43.691116,-79.476013,1,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store
3,M6N,York,Runnymede / The Junction North,43.673185,-79.487262,1,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store
4,M9N,York,Weston,43.706876,-79.518188,0,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store


In [138]:
# create map
york_map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighborhood'], york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(york_map_clusters)
       
york_map_clusters

york_map_clusters.save("york_map_clusters.html")

#open york_map_clusters.html in browser
#if you cannot generate the maps open PGA_map_*.html from the zip file

In [139]:
york_merged.loc[york_merged['Cluster Labels'] == 0, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue
1,York,0,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store
4,York,0,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store


In [140]:
york_merged.loc[york_merged['Cluster Labels'] == 1, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue
0,York,1,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store
2,York,1,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store
3,York,1,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store,Women's Store
