# Torronto Exploration

# 1. Initial information gathering

## 1.1 Installs the required libraries

In [1]:
!pip install beautifulsoup4



In [2]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import json
import xml

## 1.2 Initiates the initial working dictionary

In [3]:
nei = {"PostalCode":[], "Borough":[], "Neighborhood": []}

## 1.3 Opens all the html information

In [4]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
source = requests.get(url).text
soup = BeautifulSoup(source, "lxml")
# print(soup.prettify())

## 1.4 Identifies the table

In [5]:
my_table = soup.find("table", class_="wikitable sortable") # the entire table of the information
# print (my_table.prettify())
# print (type(my_table))

## 1.5 Creates an initial data frame

In [6]:
# tr-->rows, td-->cells, th-->headings
# headings = my_table.find('th', class_='headerSort')
# print (headings)
for row in my_table.find_all("tr"):
    c = 0
    for cell in row.find_all("td"):
        c = c + 1
        if c == 1:
            nei["PostalCode"].append(cell.text)
        elif c == 2:    
            nei["Borough"].append(cell.text)
        else:    
            nei["Neighborhood"].append(cell.text)

n = pd.DataFrame(nei)
n.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n
7,M6A,North York,Lawrence Manor\n
8,M7A,Queen's Park,Not assigned\n
9,M8A,Not assigned,Not assigned\n


## 1.6 Drops Boroughs that are not assigned

In [7]:
counter = 0
index = []
for row in n["Borough"]:
    # print(row, counter)
    if row == "Not assigned":
        index.append(counter)
    counter = counter + 1

for item in index:
    n.iloc[item, 0] = np.nan
    n.iloc[item, 1] = np.nan
    n.iloc[item, 2] = np.nan

n.dropna(inplace=True)
n.reset_index(inplace=True, drop=True)

n.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n
5,M6A,North York,Lawrence Manor\n
6,M7A,Queen's Park,Not assigned\n
7,M9A,Etobicoke,Islington Avenue\n
8,M1B,Scarborough,Rouge\n
9,M1B,Scarborough,Malvern\n


## 1.7 Check for repeated PostalCodes and format

In [8]:
neig = {"postalCode":[], "borough":[], "neighborhood": []}
counter = 0
for index, row in n.iterrows():
    # print(row['PostalCode'], row['Borough'], row['Neighborhood'])
    if row["PostalCode"] not in neig["postalCode"]:
        neig["postalCode"].append(row["PostalCode"])
        neig["borough"].append(row["Borough"])
        neig["neighborhood"].append(row["Neighborhood"])
    else:
        idx = neig["postalCode"].index(row["PostalCode"])
        new = ', '.join([neig["neighborhood"][idx], row["Neighborhood"]])
        neig["neighborhood"][idx] = new
    counter = counter + 1
    

final = pd.DataFrame(neig)
final = final.replace('\n','', regex=True)
final = final.replace('Not assigned', "Queen's Park", regex=True)
final.head(20)  

final.shape

(103, 3)

## 1.8 Get the PostCodes

In [9]:
df_ps = pd.read_csv("https://cocl.us/Geospatial_data", dtype='str')
print(df_ps.shape == final.shape)
final["Latitude"] = "def1" 
final["Longitude"] = "def2"

True


## 1.9 Prepare the dataframe and transfer the information

In [10]:
final.sort_values(by=['postalCode'], inplace=True)
df_ps.sort_values(by=['Postal Code'], inplace=True)
final["Latitude"] = df_ps["Latitude"].astype(np.float16)
final["Longitude"] = df_ps["Longitude"].astype(np.float16)
final.reset_index(inplace=True, drop=True)
final

Unnamed: 0,postalCode,borough,neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.71875,-79.2500
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.78125,-79.2500
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.78125,-79.3750
3,M1G,Scarborough,Woburn,43.78125,-79.4375
4,M1H,Scarborough,Cedarbrae,43.75000,-79.3750
5,M1J,Scarborough,Scarborough Village,43.71875,-79.5000
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.71875,-79.3750
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.71875,-79.3750
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.65625,-79.3750
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.65625,-79.3750


# 2 Exploration of Venues

## 2.1 Install missing libraries

In [11]:
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

In [12]:
print('There are', len(final['borough'].unique()), 'unique Boroughs')
df_g=final.groupby(['Latitude','Longitude']).size().reset_index().rename(columns={0:'count'})
print ('There are', df_g['count'].sum(), 'unique neighborhoods and', df_g.shape[0], 'uniques postcodes')

There are 11 unique Boroughs
There are 103 unique neighborhoods and 41 uniques postcodes


## 2.2 Get coordinates for Torronto and plot an initial Folium map

In [13]:
address = 'Toronto, TO'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [14]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
# map_toronto

In [15]:
for lat, lng, post, neighborhood in zip(final['Latitude'], final['Longitude'], final['postalCode'], final['neighborhood']):
    label = '{}, {}'.format(neighborhood, post)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

## 2.3 Connect to the Foresquare API

In [16]:
CLIENT_ID = '4CGUVMWQ1FGZVTQEXEPXU3CL4IXJKV3G3T0ZS532SL0IHXGC'
CLIENT_SECRET = 'GRZY0GHY1GGFYMVXNVO305LSFOR5QFT2REOR5FQ2EL0I5UC0'
VERSION = '20180605'

## 2.3 Explore the first neighborhood

In [17]:
#the neighborhood details
nei_name = final.loc[0, 'neighborhood']
nei_lat = final.loc[0, 'Latitude']
nei_long = final.loc[0, 'Longitude']
print('Latitude and longitude values of {} are {}, {}.'.format(nei_name, nei_lat, nei_long))

#the call parameters
LIMIT = 100 
radius = 5000
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    nei_lat, 
    nei_long, 
    radius, 
    LIMIT)
results = requests.get(url).json()

#define function to get venue category
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#cleans the .json to a dataframe     
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

#prints the gathered information
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))
nearby_venues.head()

Latitude and longitude values of Rouge, Malvern are 43.71875, -79.25.
100 venues were returned by Foursquare.


Unnamed: 0,name,categories,lat,lng
0,Scarborough Bluffs,Beach,43.70778,-79.237239
1,Bluffers Park,Park,43.705848,-79.234313
2,Rosetta McLain Gardens,Park,43.697019,-79.255739
3,Baran's Turkish Restaurant & Bar,Turkish Restaurant,43.728978,-79.280811
4,PetSmart,Pet Store,43.730081,-79.282082


## 2.4 Explore all neighborhoods in Toronto

In [18]:
#defines function 
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    #compiles the information into a dataset
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [19]:
#calls the above function 
toronto_venues = getNearbyVenues(final['neighborhood'],final['Latitude'],final['Longitude'])

#prints the gathered information
print(toronto_venues.shape)
toronto_venues.groupby('Neighborhood').count()
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))
toronto_venues.head()

(1742, 7)
There are 143 uniques categories.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.71875,-79.25,Southern Accent,43.718853,-79.250806,Cajun / Creole Restaurant
1,"Rouge, Malvern",43.71875,-79.25,Working Dog Saloon,43.719311,-79.249792,Sports Bar
2,"Rouge, Malvern",43.71875,-79.25,Pizza Nova,43.718773,-79.251142,Pizza Place
3,"Rouge, Malvern",43.71875,-79.25,KFC,43.723232,-79.249682,Fast Food Restaurant
4,"Rouge, Malvern",43.71875,-79.25,Scarborough GO Station,43.717221,-79.254748,Train Station


## 2.5 Analyze each neighborhood

In [20]:
#performs one-hot encoding on the toronto_venues dataframe
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

#add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

#move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

#prints the gathered information
print(toronto_onehot.shape)
toronto_onehot.head()

(1742, 144)


Unnamed: 0,Neighborhood,African Restaurant,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,BBQ Joint,Bakery,...,Thai Restaurant,Theater,Theme Park,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Women's Store
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [21]:
#aggregates the one-hot encoded venues per neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

#prints the gathered information
print(toronto_grouped.shape)
toronto_grouped

(97, 144)


Unnamed: 0,Neighborhood,African Restaurant,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,BBQ Joint,Bakery,...,Thai Restaurant,Theater,Theme Park,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Women's Store
0,"Adelaide, King, Richmond",0.000000,0.0,0.000000,0.000000,0.0,0.000,0.0,0.000000,0.200000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
1,"Agincourt North, L'Amoreaux East, Milliken, St...",0.000000,0.0,0.000000,0.000000,0.0,0.000,0.0,0.018182,0.000000,...,0.000000,0.036364,0.0,0.000000,0.000000,0.0,0.0,0.018182,0.000000,0.000000
2,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.000000,0.0,0.000000,0.000000,0.0,0.000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.111111
3,"Alderwood, Long Branch",0.000000,0.0,0.000000,0.000000,0.0,0.000,0.0,0.000000,0.166667,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
4,"Bathurst Manor, Downsview North, Wilson Heights",0.000000,0.0,0.000000,0.000000,0.0,0.000,0.0,0.000000,0.200000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
5,Bayview Village,0.000000,0.0,0.000000,0.000000,0.0,0.000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
6,"Bedford Park, Lawrence Manor East",0.000000,0.0,0.000000,0.000000,0.0,0.000,0.0,0.018182,0.000000,...,0.000000,0.036364,0.0,0.000000,0.000000,0.0,0.0,0.018182,0.000000,0.000000
7,Berczy Park,0.000000,0.0,0.000000,0.000000,0.0,0.000,0.5,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
8,"Birch Cliff, Cliffside West",0.000000,0.0,0.000000,0.000000,0.0,0.000,0.0,0.018182,0.000000,...,0.000000,0.036364,0.0,0.000000,0.000000,0.0,0.0,0.018182,0.000000,0.000000
9,"Bloordale Gardens, Eringate, Markland Wood, Ol...",0.000000,0.0,0.000000,0.000000,0.0,0.000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000


In [22]:
#print each neighborhood along with the top 5 most common venues
num_top_venues = 5
for hood in toronto_grouped['Neighborhood']:
    print("----",hood,"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Adelaide, King, Richmond ----
                        venue  freq
0  Construction & Landscaping   0.2
1                      Bakery   0.2
2        Other Great Outdoors   0.2
3                        Park   0.2
4           Convenience Store   0.2


---- Agincourt North, L'Amoreaux East, Milliken, Steeles East ----
                       venue  freq
0                Coffee Shop  0.11
1             Clothing Store  0.09
2                 Restaurant  0.05
3       Fast Food Restaurant  0.04
4  Middle Eastern Restaurant  0.04


---- Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown ----
                  venue  freq
0  Fast Food Restaurant  0.33
1         Women's Store  0.11
2           Coffee Shop  0.11
3         Shopping Mall  0.11
4        Clothing Store  0.11


---- Alderwood, Long Branch ----
               venue  freq
0              Hotel  0.17
1  Electronics Store  0.17
2       Intersection  0.17
3  Convenience Store  0.1

                       venue  freq
0                Coffee Shop  0.11
1             Clothing Store  0.09
2                 Restaurant  0.05
3       Fast Food Restaurant  0.04
4  Middle Eastern Restaurant  0.04


---- Fairview, Henry Farm, Oriole ----
          venue  freq
0          Park   0.4
1  Soccer Field   0.2
2    Food Truck   0.2
3           Gym   0.2
4        Office   0.0


---- First Canadian Place, Underground city ----
                  venue  freq
0    Salon / Barbershop  0.33
1           Pizza Place  0.33
2  Caribbean Restaurant  0.33
3  Pakistani Restaurant  0.00
4           Music Venue  0.00


---- Flemingdon Park, Don Mills South ----
                venue  freq
0         Coffee Shop  0.14
1            Pharmacy  0.14
2  Italian Restaurant  0.14
3                Park  0.14
4        Noodle House  0.14


---- Forest Hill North, Forest Hill West ----
             venue  freq
0      Music Venue  0.17
1    Boat or Ferry  0.17
2  Harbor / Marina  0.17
3        BBQ Joint  0.17


                  venue  freq
0     Convenience Store   0.2
1   American Restaurant   0.2
2      Asian Restaurant   0.2
3  Fast Food Restaurant   0.2
4           Golf Course   0.2


---- Scarborough Village ----
                   venue  freq
0  Vietnamese Restaurant  0.17
1            Pizza Place  0.17
2               Pharmacy  0.17
3       Business Service  0.17
4                 Bakery  0.17


---- Silver Hills, York Mills ----
         venue  freq
0   Food Court   0.2
1         Pool   0.2
2    Juice Bar   0.2
3         Park   0.2
4  Coffee Shop   0.2


---- St. James Town ----
               venue  freq
0  Food & Drink Shop   0.2
1     Baseball Field   0.2
2       Intersection   0.2
3               Park   0.2
4      Tattoo Parlor   0.2


---- Stn A PO Boxes 25 The Esplanade ----
                         venue  freq
0           Italian Restaurant  0.13
1               Ice Cream Shop  0.07
2                   Beer Store  0.07
3            Polish Restaurant  0.07
4  Eastern European R

In [23]:
#function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

#new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Bakery,Construction & Landscaping,Convenience Store,Park,Other Great Outdoors,Discount Store,Farmers Market,Falafel Restaurant,Electronics Store,Eastern European Restaurant
1,"Agincourt North, L'Amoreaux East, Milliken, St...",Coffee Shop,Clothing Store,Restaurant,Café,Fast Food Restaurant,Pizza Place,Theater,Diner,Tea Room,Middle Eastern Restaurant
2,"Albion Gardens, Beaumond Heights, Humbergate, ...",Fast Food Restaurant,Women's Store,Clothing Store,Coffee Shop,Shopping Mall,Food Court,Grocery Store,Dive Bar,Falafel Restaurant,Electronics Store
3,"Alderwood, Long Branch",Hotel,Intersection,Convenience Store,Electronics Store,Bakery,Pharmacy,Grocery Store,Greek Restaurant,Concert Hall,Construction & Landscaping
4,"Bathurst Manor, Downsview North, Wilson Heights",Bakery,Construction & Landscaping,Convenience Store,Park,Other Great Outdoors,Discount Store,Farmers Market,Falafel Restaurant,Electronics Store,Eastern European Restaurant
5,Bayview Village,Juice Bar,Coffee Shop,Pool,Park,Food Court,Diner,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dive Bar
6,"Bedford Park, Lawrence Manor East",Coffee Shop,Clothing Store,Restaurant,Café,Fast Food Restaurant,Pizza Place,Theater,Diner,Tea Room,Middle Eastern Restaurant
7,Berczy Park,Pool,Auto Garage,Women's Store,Discount Store,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dive Bar
8,"Birch Cliff, Cliffside West",Coffee Shop,Clothing Store,Restaurant,Café,Fast Food Restaurant,Pizza Place,Theater,Diner,Tea Room,Middle Eastern Restaurant
9,"Bloordale Gardens, Eringate, Markland Wood, Ol...",Curling Ice,Campground,Golf Course,Women's Store,Discount Store,Farmers Market,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dive Bar


# 3 Clustering of venues

## 3.1 Dataframe preparation

In [24]:
final['neighborhood'] = final['neighborhood'].astype(str)
toronto_grouped['Neighborhood'] = toronto_grouped['Neighborhood'].astype(str)

check = []
for item in toronto_grouped['Neighborhood']:
    check.append(str(item))

indx = 0
for item in final['neighborhood']:
    item=str(item)
    if item not in check:
        final.iloc[indx, 0] = np.nan
        final.iloc[indx, 1] = np.nan
        final.iloc[indx, 2] = np.nan
        final.iloc[indx, 3] = np.nan
        final.iloc[indx, 4] = np.nan
    indx = indx + 1

final.dropna(inplace=True)
final.reset_index(inplace=True, drop=True)
    
print(len(final['neighborhood'])==len(toronto_grouped['Neighborhood']))

True


## 3.2 Clustering process

In [38]:
#set number of clusters
cls = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

#run k-means clustering
kmeans = KMeans(n_clusters=cls, random_state=0).fit(toronto_grouped_clustering)

#check cluster labels generated for each row in the dataframe
labels = kmeans.labels_
print(len(final["neighborhood"]), len(toronto_grouped['Neighborhood']), len(labels) )

97 97 97


In [39]:
toronto_merged = final

#add clustering labels
toronto_merged['Cluster Labels'] = labels

#merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
#toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
#toronto_merged.head() # check the last columns!

In [41]:
toronto_merged.head()

Unnamed: 0,postalCode,borough,neighborhood,Latitude,Longitude,Cluster Labels
0,M1B,Scarborough,"Rouge, Malvern",43.71875,-79.25,1
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.78125,-79.25,1
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.78125,-79.375,1
3,M1H,Scarborough,Cedarbrae,43.75,-79.375,1
4,M1J,Scarborough,Scarborough Village,43.71875,-79.5,1


## 3.3 Clustering visualisation

In [45]:
#create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

#set color scheme for the clusters
x = np.arange(cls)
ys = [i+x+(i*x)**2 for i in range(cls)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
         [lat, lon],
         radius=5,
         popup=label,
         color=rainbow[cluster-1],
         fill=True,
         fill_color=rainbow[cluster-1],
         fill_opacity=0.7).add_to(map_clusters)
       
map_clusters