# Toronto Postal Codes Clustering

Installing needed libraries and packages.

In [408]:
!pip install beautifulsoup4
!pip install requests
!pip install pgeocode
!pip install geopy
!pip install folium

import requests as req
from bs4 import BeautifulSoup
import pandas as pd
import pgeocode
import numpy as np
from geopy.geocoders import Nominatim
import folium
from pandas.io.json import json_normalize

print('Libraries are good.')

Libraries are good.


Use "requests" library to connect to wiki page and retrieve html, then use "BeautifulSoup" to parse the html code into a "BeautifulSoup" object.

In [748]:
# wp for web page
wp = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# wp1 for web page html
wp1 = req.get(wp)

#OPTIONAL check return from request.get()
#print(wp1.content[:100])

# wp2 BeautifulSoup Object
wp2 = BeautifulSoup(wp1.content, 'html.parser')

Get the table of interest from the "Beautiful Soup" object.

**NOTE:** The page has only one table, if it had more we would have to use **find_all()** to scrape all the tables and then identify our table of interest.

In [749]:
#Retrieve Table
WikiTable = wp2.find('table')

Find the column headers from WikiTable, create a list of the column headers as string objects, and create a "pandas" data frame with list of column headers.

In [750]:
ColumnHeaders = WikiTable.find_all('th')
# ColumnHeaders2 is a list of Column Headers as string objects
ColumnHeaders2 = [ColumnHeaders[0].text.rstrip(),\
                  ColumnHeaders[1].text.rstrip(),\
                  ColumnHeaders[2].text.rstrip()]

# Create DataFrame, df_TNs = data frame of Toronto Neighborhoods
df_TNs = pd.DataFrame(columns=ColumnHeaders2)

Reads a row from WikiTable into the df_TNs dataframe, if the Neighbourhood column is "Not assigned," it is given the Borough value and neighbourhoods are combined into common postal codes.

In [751]:
# Find all the rows from the table, "WikiTable" 
RowTags = WikiTable.find_all('tr')

# l is List of Postal Codes added to Dataframe
l=[]

#Initialize loop at 1 to skip the "headers" row
RowNumberWiki = 1

# Row to add to in Dataframe
RowAddedDF = 1

#288 is the length of the table of interest, "WikiTable"
while RowNumberWiki<288:
    Row = RowTags[RowNumberWiki].find_all('td')
    RowString = [Row[0].text.rstrip(),\
              Row[1].text.rstrip(),\
              Row[2].text.rstrip()]
    #  If neighborhood is "Not assigned",
    #  It is replaced with the "Borough" name.
    if RowString[2]=='Not assigned':
        Row[2]=Row[1]
    #  If Postal Code already exists, the new neighborhood
    #  is added to the existing "Postcode" neighborhood 
    if RowString[0] in l:
        NH_new = RowString[2]
        NH_old = df_TNs.loc[df_TNs['Postcode']== RowString[0],\
                            'Neighbourhood'].values[0]
        NH_updated = NH_new + ', ' + NH_old
        df_TNs.loc[df_TNs['Postcode'] == RowString[0],\
                   'Neighbourhood'] = NH_updated
    #  If "Postcode" is not in dataframe, it's Postcode is
    #  noted into into the list of postcodes in dataframe
    if RowString[0] not in l:
        l.append(RowString[0])
        if RowString[1]!= 'Not assigned':
            df_TNs.loc[RowAddedDF]=list(RowString)
            RowAddedDF = RowAddedDF + 1        
    #  Add one to row count        
    RowNumberWiki = RowNumberWiki+1

### This is the data frame for the first submission for the week 3 project of the Applied Data Science Capstone class.

In [752]:
df_TNs.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,"Lawrence Manor, Lawrence Heights"
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,"Malvern, Rouge"
8,M3B,North York,Don Mills North
9,M4B,East York,"Parkview Hill, Woodbine Gardens"
10,M5B,Downtown Toronto,"Garden District, Ryerson"


Retrieve latitude and longitude coordinates for each postal code, and create a list of postal codes for which either a latitude or longitude is not returned.

**Note:** The geocoder package recommended in the assignment (geocoder.readthedocs.io/index.html) would not work for me, so I used the pgeocode library instead.  It failed for only one of the Toronto post codes, in which case I used the .csv file provided by the assignment.

In [753]:
nomi=pgeocode.Nominatim('CA')

lats=[]
lons=[]
l_errors=[]
a=1
while a<len(df_TNs)+1:
    x=nomi.query_postal_code(df_TNs['Postcode'][a])
    lat = x['latitude']
    lats.append(lat)
    lon = x['longitude']
    lons.append(lon)
    if np.isnan(lat)==True or np.isnan(lon)==True:
        l_errors.append(df_TNs['Postcode'][a])
    a=a+1
df_TNs['latitude']=lats
df_TNs['longitude']=lons

For post codes that pgeocode failed to return a latitude or longitude, their values are found in the .csv file provided by the Week 3 assignment "Segementing and Clustering Neighborhoods in Toronto," and added to the Toronto Neighborhoods dataframe.

In [754]:
for i in l_errors:
    other_path = 'http://cocl.us/Geospatial_data'
    df_PCsAlt = pd.read_csv(other_path, header=0)
    lat_csv=df_PCsAlt[df_PCsAlt['Postal Code'].str\
                         .match(l_errors[0])]['Latitude'].values[0]
    lat_csv=round(lat_csv,4)
    lon_csv=df_PCsAlt[df_PCsAlt['Postal Code'].str\
                         .match(l_errors[0])]['Longitude'].values[0]
    lon_csv=round(lon_csv,4)
    df_TNs.loc[df_TNs['Postcode'] == l_errors[0], 'latitude']=lat_csv
    df_TNs.loc[df_TNs['Postcode']==l_errors[0], 'longitude']=lon_csv

### This is the data frame for the second submission for the week 3 project of the Applied Data Science Capstone class.

In [755]:
df_TNs.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude
1,M3A,North York,Parkwoods,43.7545,-79.33
2,M4A,North York,Victoria Village,43.7276,-79.3148
3,M5A,Downtown Toronto,Harbourfront,43.6555,-79.3626
4,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
5,M7A,Downtown Toronto,Queen's Park,43.6641,-79.3889
6,M9A,Queen's Park,Not assigned,43.6662,-79.5282
7,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
8,M3B,North York,Don Mills North,43.745,-79.359
9,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094
10,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783


A centroid is found to be used to generate a map of Toronto.

In [756]:
geolocator = Nominatim(user_agent="WikiTableScrape2")
T_gc= geolocator.geocode('Toronto, Ontario, Canada ')
T_lat=T_gc.latitude
T_lon=T_gc.longitude

Create a map of Toronto with Post Code centroid markers, with labels for the Post Code and Borough.  It did not make sense to include neighborhood markers as centroids are based on Post Codes.

In [757]:
TR_map = folium.Map(location=[T_lat, T_lon], zoom_start=10)

a=1
while a <= len(df_TNs):
    lat = df_TNs['latitude'][a]
    lon = df_TNs['longitude'][a]
    folium.CircleMarker(
       [lat, lon],
       radius=3, # define how big you want the circle markers to be
       color='yellow',
       fill=True,
       fill_color='blue',
       fill_opacity=0.6,
       parse_html=False).add_to(TR_map)
    a=a+1
    
for lat, lng, borough, postCodeArea in zip(df_TNs['latitude'],\
                                           df_TNs['longitude'],\
                                           df_TNs['Borough'],\
                                           df_TNs['Postcode']):
    label = '{}, {}'.format(postCodeArea, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(TR_map) 

TR_map    

Creating variables for FourSquare API credentials.

In [809]:
CLIENT_ID = 'xxxxxxxxxxx'
CLIENT_SECRET = 'xxxxxxxxxxxxxx'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: xxxxxxxxxxx
CLIENT_SECRET:xxxxxxxxxxxxxx


Initiate the Postcode, latitude, and longitude column of the first row from the Toronto Postcodes dataframe, as variables to pass as parameters in the foursquare url to retrieve data from the foursquare api.

In [759]:
PC_lat = df_TNs.loc[1, 'latitude']
PC_lon = df_TNs.loc[1, 'longitude']

PC = df_TNs.loc[1, 'Postcode'] 

print('Latitude and longitude values of {} are {}, {}.'\
      .format(PC,PC_lat,PC_lon)) 


Latitude and longitude values of M3A are 43.7545, -79.33.


Formating the foursquare URL with required parameters defined as variables previously.

In [760]:
LIMIT = 100
radius = 500 

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    PC_lat, 
    PC_lon, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=UO5NFQ5E5U0RKH2NQUBVWKKHNBAQR0YR25TE2R0M14A3L4NJ&client_secret=4PQ5T4N2FWF0JL3UZPULYRNHQYED522KAKA1CXQHAI3BLKEU&v=20180605&ll=43.7545,-79.33&radius=500&limit=100'

Using the "requests" and "json" libraries to return the foursquare.com query as a json file, i.e. a dictionary

In [761]:
results = req.get(url).json()


In [762]:
#results

In [763]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Get a list of resulting venues from foursquare.com, convert to a pandas dataframe, define pandas dataframe with just the columns of interest, go through each row retrieve the category of venue, finally clean the column names.

In [764]:
PC_venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(PC_venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories',\
                    'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues\
                                .apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for\
                         col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,GreenWin pool,Pool,43.756232,-79.333842
2,Variety Store,Food & Drink Shop,43.751974,-79.333114


In [765]:
print('{} venues were returned by Foursquare.'\
      .format(nearby_venues.shape[0]))

3 venues were returned by Foursquare.


Defining a function to retrieve a list of the top 100 venues within 500 meters of each postCode centroid.

In [766]:
def getNearbyVenues(postCode, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(postCode, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = req.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode', 
                  'Postcode Latitude', 
                  'Postcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Create a dataframe that contains all the venues returned for all the Toronto post codes.

In [767]:
PC_venues=getNearbyVenues(postCode=df_TNs['Postcode'],
                                   latitudes=df_TNs['latitude'],
                                   longitudes=df_TNs['longitude'])

Create a data frame that contains the number of venues returned by the FourSquare API for each Toronto post code.

In [768]:
PC_Venue_Count = PC_venues.groupby('Postcode').count()
PC_Venue_Count.head()

Unnamed: 0_level_0,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1B,1,1,1,1,1,1
M1C,1,1,1,1,1,1
M1E,32,32,32,32,32,32
M1G,1,1,1,1,1,1
M1H,3,3,3,3,3,3


From the dataframe based on all the venues returned for all the post codes, retrieve the number of unique (or types) venues throughout all of the post codes.

In [769]:
print('There are {} uniques categories.'\
      .format(len(PC_venues['Venue Category'].unique())))

There are 260 uniques categories.


Creating a data frame where the columns are the Venue Categories returned from the list of venues.

In [770]:
TOR_onehot = pd.get_dummies(PC_venues[['Venue Category']],\
                            prefix="", prefix_sep="")

In [771]:
TOR_onehot.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Auto Garage,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [772]:
TOR_onehot.shape

(2283, 260)

In [773]:
# add neighborhood column back to dataframe
TOR_onehot['Postcode'] = PC_venues['Postcode'] 
TOR_onehot.head()


Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Auto Garage,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Postcode
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M3A
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M3A
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M3A
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M4A
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M4A


In [774]:
# move neighborhood column to the first column
fixed_columns = [TOR_onehot.columns[-1]] + list(TOR_onehot.columns[:-1])
TOR_onehot = TOR_onehot[fixed_columns]
TOR_onehot.head()

Unnamed: 0,Postcode,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Analyze each postal code

Grouping each neighborhood by the mean.

In [775]:
TOR_grouped = TOR_onehot.groupby('Postcode').mean().reset_index()

In [776]:
for postCode in TOR_grouped['Postcode']:
    #print("----"+postCode+"----")
    temp = TOR_grouped[TOR_grouped['Postcode'] ==\
                       postCode].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq']=temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    #print(temp.sort_values('freq', ascending=False)\
          #.reset_index(drop=True).head(num_top_venues))
    #print('\n')

In [777]:
TOR_grouped.head()

Unnamed: 0,Postcode,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [778]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [779]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']


In [780]:
# create columns according to number of top venues
columns = ['Postcode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1,\
                                                   indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

In [781]:
columns

['Postcode',
 '1st Most Common Venue',
 '2nd Most Common Venue',
 '3rd Most Common Venue',
 '4th Most Common Venue',
 '5th Most Common Venue',
 '6th Most Common Venue',
 '7th Most Common Venue',
 '8th Most Common Venue',
 '9th Most Common Venue',
 '10th Most Common Venue']

In [782]:
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postcode'] =\
                                    TOR_grouped['Postcode']

In [783]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,,,,,,,,,,
1,M1C,,,,,,,,,,
2,M1E,,,,,,,,,,
3,M1G,,,,,,,,,,
4,M1H,,,,,,,,,,


Need this loop to only return venues with freq greater than 0

In [784]:
for ind in np.arange(TOR_grouped.shape[0]):
    yjh=TOR_grouped.iloc[ind,0]
    hsc=PC_Venue_Count.loc[yjh,'Venue']
    if hsc<num_top_venues:
        num_top_venues = hsc
    else: num_top_venues = 10
    neighborhoods_venues_sorted.iloc[ind, 1:num_top_venues+1] =\
    return_most_common_venues\
    (TOR_grouped.iloc[ind, :], num_top_venues)

In [785]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Home Service,,,,,,,,,
1,M1C,Bar,Yoga Studio,Food Court,Food & Drink Shop,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field
2,M1E,Pizza Place,Pharmacy,Fast Food Restaurant,Coffee Shop,Liquor Store,Electronics Store,Supermarket,Fried Chicken Joint,Sports Bar,Breakfast Spot
3,M1G,Korean Restaurant,,,,,,,,,
4,M1H,Construction & Landscaping,Trail,Lounge,Yoga Studio,Field,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant


Identifing Post Codes that FourSquare returned less than three venues.

In [711]:
PC_less3 = neighborhoods_venues_sorted\
           [neighborhoods_venues_sorted['3rd Most Common Venue'].isnull()]

In [786]:
PC_less3.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Home Service,,,,,,,,,
3,M1G,Korean Restaurant,,,,,,,,,
5,M1J,Grocery Store,,,,,,,,,
11,M1R,Convenience Store,Auto Garage,,,,,,,,
16,M2H,Residential Building (Apartment / Condo),Park,,,,,,,,


In [787]:
PC20_list = PC_less3['Postcode'].to_list()
print(PC20_list)

['M1B', 'M1G', 'M1J', 'M1R', 'M2H', 'M2L', 'M3L', 'M3M', 'M4N', 'M5N', 'M6L', 'M6P', 'M8V', 'M9L']


In [788]:
# Get names of indexes for which column Age has value 30
indexNames = PC_less3.index
 

        

In [789]:
indexNames

Int64Index([0, 3, 5, 11, 16, 19, 30, 31, 43, 62, 78, 81, 87, 95], dtype='int64')

Deleting Post Code rows that contain less than three venues for clustering.

In [790]:
# Delete these row indexes from dataFrame
TOR_grouped.drop(indexNames , inplace=True)
#neighborhoods_venues_sorted.drop(indexNames , inplace=True)

In [791]:
neighborhoods_venues_sorted.drop(indexNames , inplace=True)

In [792]:
TOR_grouped.head()

Unnamed: 0,Postcode,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M1K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M1L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [793]:
from sklearn.cluster import KMeans

In [794]:
# set number of clusters
kclusters = 5

TOR_grouped_clustering = TOR_grouped.drop('Postcode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(TOR_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [795]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)


In [796]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,3,M1C,Bar,Yoga Studio,Food Court,Food & Drink Shop,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field
2,0,M1E,Pizza Place,Pharmacy,Fast Food Restaurant,Coffee Shop,Liquor Store,Electronics Store,Supermarket,Fried Chicken Joint,Sports Bar,Breakfast Spot
4,0,M1H,Construction & Landscaping,Trail,Lounge,Yoga Studio,Field,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant
6,0,M1K,Coffee Shop,Hobby Shop,Intersection,Bus Line,Metro Station,Pharmacy,Chinese Restaurant,Light Rail Station,Sandwich Place,Bank
7,0,M1L,Bus Line,Bakery,Intersection,Bus Station,Soccer Field,Park,Metro Station,Coffee Shop,Field,Falafel Restaurant


In [797]:
TOR_merged = df_TNs

In [798]:
TOR_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude
1,M3A,North York,Parkwoods,43.7545,-79.33
2,M4A,North York,Victoria Village,43.7276,-79.3148
3,M5A,Downtown Toronto,Harbourfront,43.6555,-79.3626
4,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
5,M7A,Downtown Toronto,Queen's Park,43.6641,-79.3889


In [799]:
TOR_merged = TOR_merged\
.join(neighborhoods_venues_sorted\
      .set_index('Postcode'), on='Postcode')


In [800]:
TOR_merged.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
99,M8X,Etobicoke,"Old Mill North, Montgomery Road, The Kingsway",43.6518,-79.5076,0.0,Breakfast Spot,Sushi Restaurant,Burger Joint,Pub,Business Service,Liquor Store,Coffee Shop,Bank,Bakery,Restaurant
100,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.383,0.0,Coffee Shop,Japanese Restaurant,Gay Bar,Restaurant,Sushi Restaurant,Pizza Place,Mediterranean Restaurant,Café,Fast Food Restaurant,Men's Store
101,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.7804,-79.2505,0.0,Coffee Shop,Yoga Studio,Japanese Restaurant,Martial Arts Dojo,Wine Bar,Sushi Restaurant,Restaurant,Deli / Bodega,Department Store,Breakfast Spot
102,M8Y,Etobicoke,"Sunnylea, Royal York South East, The Queensway...",43.6325,-79.4939,4.0,Home Service,Park,Baseball Field,,,,,,,
103,M8Z,Etobicoke,"South of Bloor, Royal York South West, The Que...",43.6256,-79.5231,0.0,Burrito Place,Italian Restaurant,Social Club,Burger Joint,Fish & Chips Shop,Middle Eastern Restaurant,Liquor Store,Sandwich Place,Sushi Restaurant,Thai Restaurant


Need a loop to go through above dataframe and check for NaN in the top venue column.  If NaN drop row (for now)

In [801]:
modDf = TOR_merged.dropna(how='any', subset=['Cluster Labels'])
modDf.head()      

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M3A,North York,Parkwoods,43.7545,-79.33,2.0,Food & Drink Shop,Pool,Park,,,,,,,
2,M4A,North York,Victoria Village,43.7276,-79.3148,0.0,Hockey Arena,Portuguese Restaurant,Coffee Shop,Intersection,Park,Pizza Place,,,,
3,M5A,Downtown Toronto,Harbourfront,43.6555,-79.3626,0.0,Coffee Shop,Breakfast Spot,Yoga Studio,Bakery,Dance Studio,Pub,Restaurant,Electronics Store,Event Space,Mexican Restaurant
4,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,0.0,Clothing Store,Coffee Shop,Shoe Store,Cosmetics Shop,Electronics Store,Sandwich Place,Sushi Restaurant,Men's Store,Toy / Game Store,Bakery
5,M7A,Downtown Toronto,Queen's Park,43.6641,-79.3889,0.0,Gym,Burrito Place,Italian Restaurant,Japanese Restaurant,Beer Bar,Portuguese Restaurant,Sushi Restaurant,Ramen Restaurant,Coffee Shop,College Theater


Need to convert 'Cluster Labels' from float to int.

In [802]:
modDf['Cluster Labels']=modDf['Cluster Labels'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [803]:
modDf.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M3A,North York,Parkwoods,43.7545,-79.33,2,Food & Drink Shop,Pool,Park,,,,,,,
2,M4A,North York,Victoria Village,43.7276,-79.3148,0,Hockey Arena,Portuguese Restaurant,Coffee Shop,Intersection,Park,Pizza Place,,,,
3,M5A,Downtown Toronto,Harbourfront,43.6555,-79.3626,0,Coffee Shop,Breakfast Spot,Yoga Studio,Bakery,Dance Studio,Pub,Restaurant,Electronics Store,Event Space,Mexican Restaurant
4,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,0,Clothing Store,Coffee Shop,Shoe Store,Cosmetics Shop,Electronics Store,Sandwich Place,Sushi Restaurant,Men's Store,Toy / Game Store,Bakery
5,M7A,Downtown Toronto,Queen's Park,43.6641,-79.3889,0,Gym,Burrito Place,Italian Restaurant,Japanese Restaurant,Beer Bar,Portuguese Restaurant,Sushi Restaurant,Ramen Restaurant,Coffee Shop,College Theater


In [804]:
# create map
map_clusters = folium.Map(location=[T_lat, T_lon], zoom_start=11)

In [805]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [806]:
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

In [807]:
markers_colors = []
for lat, lon, poi, cluster in zip(modDf['latitude'],\
                                  modDf['longitude'],\
                                  modDf['Postcode'],\
                                  modDf['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

In [808]:
map_clusters

In [735]:
modDf.loc[modDf['Cluster Labels'] == 0,\
                     modDf.columns[[0] +\
                     list(range(5, modDf.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
41,M3K,0,Food Court,Airport,Snack Place,Coffee Shop,,,,,,


In [736]:
modDf.loc[modDf['Cluster Labels'] == 1,\
                     modDf.columns[[0] +\
                     list(range(5, modDf.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M4A,1,Hockey Arena,Portuguese Restaurant,Coffee Shop,Intersection,Park,Pizza Place,,,,
3,M5A,1,Coffee Shop,Breakfast Spot,Yoga Studio,Bakery,Dance Studio,Pub,Restaurant,Electronics Store,Event Space,Mexican Restaurant
4,M6A,1,Clothing Store,Coffee Shop,Shoe Store,Cosmetics Shop,Electronics Store,Sandwich Place,Sushi Restaurant,Men's Store,Toy / Game Store,Bakery
5,M7A,1,Gym,Burrito Place,Italian Restaurant,Japanese Restaurant,Beer Bar,Portuguese Restaurant,Sushi Restaurant,Ramen Restaurant,Coffee Shop,College Theater
9,M4B,1,Pizza Place,Fast Food Restaurant,Café,Bus Line,Bank,Intersection,Gym / Fitness Center,Pet Store,Pharmacy,Gastropub
10,M5B,1,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Japanese Restaurant,Lingerie Store,Pizza Place,Bakery,Tea Room,Thai Restaurant
11,M6B,1,Pizza Place,Grocery Store,Fast Food Restaurant,Sushi Restaurant,Latin American Restaurant,Gas Station,Rental Car Location,Japanese Restaurant,Italian Restaurant,Mediterranean Restaurant
12,M9B,1,Pizza Place,Tea Room,Coffee Shop,Chinese Restaurant,Sandwich Place,Construction & Landscaping,Hawaiian Restaurant,Doner Restaurant,Fish & Chips Shop,Field
15,M4C,1,Spa,Convenience Store,Beer Store,Video Store,Asian Restaurant,,,,,
16,M5C,1,Coffee Shop,Café,Restaurant,Bakery,Seafood Restaurant,Beer Bar,Breakfast Spot,Clothing Store,Cosmetics Shop,Hotel


In [737]:
modDf.loc[modDf['Cluster Labels'] == 2,\
                     modDf.columns[[0] +\
                     list(range(5, modDf.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
86,M1V,2,Pharmacy,Sushi Restaurant,Field,,,,,,,


In [738]:
modDf.loc[modDf['Cluster Labels'] == 3,\
                     modDf.columns[[0] +\
                     list(range(5, modDf.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M3A,3,Food & Drink Shop,Pool,Park,,,,,,,
6,M9A,3,Pharmacy,Skating Rink,Bank,Park,Grocery Store,Farmers Market,,,,
8,M3B,3,Yoga Studio,Pool,Park,Basketball Court,Flower Shop,Flea Market,Fish Market,Food,Eastern European Restaurant,Field
14,M3C,3,River,Gym,Park,,,,,,,
17,M6C,3,Hockey Arena,Trail,Playground,Tennis Court,Park,Field,Deli / Bodega,Grocery Store,,
20,M4E,3,Neighborhood,Trail,Gastropub,Bakery,Health Food Store,Park,Pub,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant
22,M6E,3,Park,Sporting Goods Shop,Women's Store,Beer Store,Market,Mexican Restaurant,Bakery,Gym,Fast Food Restaurant,Event Space
36,M4J,3,Park,Convenience Store,Rental Car Location,Massage Studio,Coffee Shop,Electronics Store,,,,
37,M5J,3,Music Venue,Harbor / Marina,Park,Café,,,,,,
40,M2K,3,Flower Shop,Gas Station,Park,Trail,,,,,,


In [739]:
modDf.loc[modDf['Cluster Labels'] == 4,\
                     modDf.columns[[0] +\
                     list(range(5, modDf.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
53,M2M,4,Playground,Bus Line,Food Court,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field,Fast Food Restaurant


In [740]:
modDf.loc[modDf['Cluster Labels'] == 5,\
                     modDf.columns[[0] +\
                     list(range(5, modDf.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,M1C,5,Bar,Yoga Studio,Food Court,Food & Drink Shop,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field
