# First phase notebook: Segmenting and Clustering Neighborhoods in Toronto
TOC to be completed later

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import requests
from pandas.io.json import json_normalize
from dotenv import load_dotenv
from pathlib import Path
import os

## 1st step, importing the dataset
in this step the dataset is read using pandas library. Then its 5 first row printed. The desired table is stored in the first table of url.

In [2]:
# importing dataset
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(url)[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## 2nd step, cleaning and forming the dataset
According to the provided instruction, unique postal codes are analysed. Becasue the number of unique codes are the same of the current recodes, there is no need of merging or combining rows. In the next stage, Borough without assigned values are deleted. Then neigbourhoods without assigned value are investigated. Becasue there are no rows with such a specification, no cell is replaced with its borough. Finally, the shape of the dataset is printed and the last 5 rows are shown.

In [3]:
# cleaning and forming the dataset
print('The dataset includes {} records with {} unique postal codes \n'.format(len(df) , len(df['Postal Code'].unique())))
# igonring cells that Borough is not assigned
df = df[df['Borough'] != 'Not assigned']
df.reset_index(inplace = True, drop = True)
print('Aftering deleting rows without assigned boroughs, the number of records reduced to {} \n'.format(len(df)))
# assigning Borough to Neighbourhood where Neighbourhood is 'Not assigned'
n_na_neighbour = df['Neighbourhood'][df['Neighbourhood'] == 'Not assigned'].count()
print('After correcting NA boroughs, {} neighbourhoods found without assigned value \n'.format(n_na_neighbour))
print('the final shape of the dataset is {} \n'.format(df.shape))
df.tail()

The dataset includes 180 records with 180 unique postal codes 

Aftering deleting rows without assigned boroughs, the number of records reduced to 103 

After correcting NA boroughs, 0 neighbourhoods found without assigned value 

the final shape of the dataset is (103, 3) 



Unnamed: 0,Postal Code,Borough,Neighbourhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


## 3rd step, transforming the database
In the next phase of the project, each neighbourhood's coordinates should be found. So, having their name stored in a single cell is not desirable. The ideal form of dataset is having neighbourhood name in a cell, preferably set as index.

In [4]:
# creating a dataset setting each neighbourhood in one row
dfn = pd.DataFrame(columns = ['Postal Code', 'Borough', 'Neighbourhood'])
for nn in range(0, len(df) - 1):
    borough = df['Borough'].iloc[nn]
    post_code = df['Postal Code'].iloc[nn]
    neighbourhoods = df['Neighbourhood'].iloc[nn].split(', ')
    for neighbourhood in neighbourhoods: 
        dfn_add = pd.DataFrame({'Borough': [borough], 'Postal Code': [post_code], 'Neighbourhood' : [neighbourhood]})
        dfn = dfn.append(dfn_add, ignore_index=True)
print('the dataset includes {} neighbourhoods \n'.format(len(dfn)))
dfn.tail()

the dataset includes 212 neighbourhoods 



Unnamed: 0,Postal Code,Borough,Neighbourhood
207,M8Y,Etobicoke,Humber Bay
208,M8Y,Etobicoke,Mimico NE
209,M8Y,Etobicoke,The Queensway East
210,M8Y,Etobicoke,Royal York South East
211,M8Y,Etobicoke,Kingsway Park South East


## 4th step, finding coordinates
According to the provided instructions of the assignment, geocoder is used in a while loop to find the corresponding long/lat of each rows in the newly transformed dataset. Unfortunately, it has not ended to any plausible result. So, I used instead geopy which made it possible. Two columns have been added to the new dataset.
there are several differences which made the code into work:
1. using geopy, Nominatim
2. passing GeocoderTimedOut for avoiding errors of timing out
3. setting a search limit for a neighbourhood
4. using sleep of 1 sec for avoiding server runtime limit block
5. passing a random symbolic password
6. random ordering of address 
<br>

Finally, geocoder fails to locate some neibourhoods. These records should be handled manually.

In [12]:
import geopy, random
from time import sleep
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

In [13]:
def do_geocode(address):
    geopy = Nominatim(user_agent="aron.shirazi@gmail.com")
    try:
        sleep(1)
        return geopy.geocode(address)
    except GeocoderTimedOut:
        return do_geocode(address)

dfn['latitude'] = 'NA'
dfn['longitude'] = 'NA'
max_try = 10
for nn in range(0, len(dfn)):
    neighbourhood = dfn['Neighbourhood'].iloc[nn]
    location = None
    count = 0
    while (location == None) & (count < max_try):
        password = ''.join(random.choice(['#', '$', '%', '@', '*', '-', '&', '~', '!']) for i in range(8))
        address_list = [neighbourhood, 'Toronto', 'Ontario', password]
        order = ''.join(random.sample(['0', '1', '2', '3'], 4))
        n0 = int(order[0]); n1 = int(order[1]); n2 = int(order[2]); n3 = int(order[3])
        address = '{}, {}, {}, {}'.format(address_list[n0], address_list[n1], address_list[n2], address_list[n3])
        location = do_geocode(address)
        count += 1
    if location is not None:
        print('{}, coordinates found for {}'.format(nn, neighbourhood))
        dfn['latitude'].iloc[nn] = location.latitude
        dfn['longitude'].iloc[nn] = location.longitude
    else:
        print('{}, coordinates not found for {}'.format(nn, neighbourhood))

0, coordinates found for Parkwoods
1, coordinates found for Victoria Village
2, coordinates found for Regent Park
3, coordinates found for Harbourfront
4, coordinates found for Lawrence Manor
5, coordinates found for Lawrence Heights
6, coordinates found for Queen's Park
7, coordinates not found for Ontario Provincial Government
8, coordinates found for Islington Avenue
9, coordinates found for Humber Valley Village
10, coordinates found for Malvern
11, coordinates found for Rouge
12, coordinates found for Don Mills
13, coordinates found for Parkview Hill
14, coordinates found for Woodbine Gardens
15, coordinates found for Garden District
16, coordinates found for Ryerson
17, coordinates found for Glencairn
18, coordinates found for West Deane Park
19, coordinates found for Princess Gardens
20, coordinates found for Martin Grove
21, coordinates found for Islington
22, coordinates found for Cloverdale
23, coordinates found for Rouge Hill
24, coordinates found for Port Union
25, coordina

In [14]:
# finding unlocated neighbourhoods to set the m manually
dfn['latitude'][~dfn['latitude'].apply(np.isreal)] = '0'
dfn['longitude'][~dfn['longitude'].apply(np.isreal)] = '0'
dfn['latitude'] = dfn['latitude'].astype('float', errors='ignore')
dfn['longitude'] = dfn['longitude'].astype('float', errors='ignore')
dfo = pd.read_csv('Geospatial_Coordinates.csv')
for nn in dfn[dfn['longitude'] == 0].index:
    dfn['latitude'].iloc[nn] = float(dfo['Latitude'][dfo['Postal Code'] == dfn['Postal Code'].iloc[nn]])
    dfn['longitude'].iloc[nn] = float(dfo['Longitude'][dfo['Postal Code'] == dfn['Postal Code'].iloc[nn]])
dfn.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Postal Code,Borough,Neighbourhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7588,-79.320197
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M5A,Downtown Toronto,Regent Park,43.659279,-79.366135
3,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015
4,M6A,North York,Lawrence Manor,43.722079,-79.437507
5,M6A,North York,Lawrence Heights,43.726544,-79.457791
6,M7A,Downtown Toronto,Queen's Park,43.667662,-79.394698
7,M7A,Downtown Toronto,Ontario Provincial Government,43.662301,-79.389494
8,M9A,Etobicoke,Islington Avenue,43.679484,-79.538909
9,M9A,Etobicoke,Humber Valley Village,43.666472,-79.524314


## 5th step, plotting locations
in this step, found coordinates of neighbourhoods are plotted along with their attached names. The important point is South Niagara is omitted from the plot becasue it is far away from other neighbourhoods and makes our plot unnecessay large. Another point is, initial zoom command in folium is not used, instead a more efficient method of fit_bound has been utilised.

In [15]:
# importing the library
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
# finding the center of map for illustration purpuse
df_loc = dfn[dfn['Neighbourhood'] != 'South Niagara'] # South Niagra is far away and makes our analysis inefficient so it is omitted
center_lat = df_loc['latitude'].mean()
center_lon = df_loc['longitude'].mean()
# to set boundaries of folium
lat_min = df_loc['latitude'].min()
lat_max = df_loc['latitude'].max()
lon_min = df_loc['longitude'].min()
lon_max = df_loc['longitude'].max()

In [19]:
map_toronto = folium.Map(location=[center_lat, center_lon], width=800, height=600)
map_toronto.fit_bounds([[lat_min, lon_min], [lat_max, lon_max]])
# add markers to map
for lat, lng, label in zip(df_loc['latitude'], df_loc['longitude'], df_loc['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

## 6th step, finding venues
After taking out coordinates of neighbourhoods, it is time to extract the specifications of registered venues. To do so, Foursquare API service is used. There are 4 corresonding steps introduced in the following:
1. defining API credentials by using dot env. in this method credentials are savved in a .env file which set to be ignored by Github in the time of publication in .gitignore file.
2. defining two main functions: the first function find venues aroud a specified location by passing lat/lon. The limit is set to 100 and the radius is 500m by default. The second function, extract venues specification stored in the retrieved JSON file.
3. exploring neighbourhoods' venues by runing two functions along all extracted coordinates in the former step. A new dataset is generated here which stores specifications of venues.
4. analysing the venues dataset which starts by finding how many venues found per neighbourhood. Then the number of unique venues is calculated as well as their categories.
5. plotting found venues imposed on neighbourhoods' plot to see the disturbution of them.

In [20]:
# Defining Foursquare Credentials and Version
# importing credentials
load_dotenv()
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)
CLIENT_ID = os.getenv("Foursquare_CLIENT_ID")
CLIENT_SECRET = os.getenv("Foursquare_CLIENT_SECRET")
VERSION = '20180605' # Foursquare API version

In [24]:
# defining to main functions
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

def find_venue(lat, lon, limit = 100, radius = 500):
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    lat, 
    lon, 
    radius, 
    limit)
    results = requests.get(url).json()
    try:
        venues = results['response']['groups'][0]['items']
    except:
        venues = []
    nearby_venues = None
    if len(venues) > 0:
        nearby_venues = pd.json_normalize(venues) # flatten JSON
        # filter columns
        filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
        nearby_venues =nearby_venues.loc[:, filtered_columns]
        # filter the category for each row
        nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
        # clean columns
        nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
    return nearby_venues

In [25]:
df_venues = pd.DataFrame(columns = ['name', 'categories', 'lat', 'lng'])
nn = 0
for name, lat, lng in zip(df_loc['Neighbourhood'], df_loc['latitude'], df_loc['longitude']):
    df_tr = find_venue(lat, lng)
    if df_tr is None: 
        len_found = 0
    else: 
        len_found = len(df_tr)
        df_tr['neighbourhood'] = name
    print('{}, venues of {} explored at lat: {} and long: {}, with {} venues'.format(nn, name, lat, lng, len_found))
    df_venues = pd.concat([df_venues, df_tr])
    nn += 1
df_venues.reset_index(inplace = True, drop = True)
print('venues of Toronto are explored, the dataset shape is {} \n'.format(df_venues.shape))
df_venues

0, venues of Parkwoods explored at lat: 43.7587999 and long: -79.3201966, with 14 venues
1, venues of Victoria Village explored at lat: 43.732658 and long: -79.3111892, with 4 venues
2, venues of Regent Park explored at lat: 43.6592794 and long: -79.366135, with 31 venues
3, venues of Harbourfront explored at lat: 43.6400801 and long: -79.3801495, with 100 venues
4, venues of Lawrence Manor explored at lat: 43.7220788 and long: -79.4375067, with 5 venues
5, venues of Lawrence Heights explored at lat: 43.7265441 and long: -79.4577911, with 38 venues
6, venues of Queen's Park explored at lat: 43.66766165 and long: -79.39469797351879, with 100 venues
7, venues of Ontario Provincial Government explored at lat: 43.6623015 and long: -79.3894938, with 35 venues
8, venues of Islington Avenue explored at lat: 43.6794838 and long: -79.5389092, with 4 venues
9, venues of Humber Valley Village explored at lat: 43.6664717 and long: -79.5243136, with 5 venues
10, venues of Malvern explored at lat: 4

Unnamed: 0,name,categories,lat,lng,neighbourhood
0,Allwyn's Bakery,Caribbean Restaurant,43.759840,-79.324719,Parkwoods
1,LCBO,Liquor Store,43.757774,-79.314257,Parkwoods
2,Shoppers Drug Mart,Pharmacy,43.760857,-79.324961,Parkwoods
3,Petro-Canada,Gas Station,43.757950,-79.315187,Parkwoods
4,TD Canada Trust,Bank,43.757569,-79.314976,Parkwoods
...,...,...,...,...,...
6533,Swiss Chalet,Restaurant,43.647888,-79.508356,Kingsway Park South East
6534,Tim Hortons,Coffee Shop,43.646678,-79.513700,Kingsway Park South East
6535,Rogers,Mobile Phone Shop,43.647080,-79.511550,Kingsway Park South East
6536,Enterprise Rent-A-Car,Rental Car Location,43.646860,-79.515110,Kingsway Park South East


In [26]:
print('There are {} uniques categories \n'.format(len(df_venues['categories'].unique())))
print('There are {} uniques venues \n'.format(len(df_venues['name'].unique())))

There are 341 uniques categories 

There are 2505 uniques venues 



In [30]:
# plotting venues along their neighbourhoods
map_venue_toronto = folium.Map(location=[center_lat, center_lon], width=800, height=600)
map_venue_toronto.fit_bounds([[lat_min, lon_min], [lat_max, lon_max]])
# add markers to map for neighbourhoods
for lat, lng, label in zip(df_loc['latitude'], df_loc['longitude'], df_loc['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius= 10,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_venue_toronto)
# add markers to map for venues
for lat, lng, label in zip(df_venues['lat'], df_venues['lng'], df_venues['name']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=0.2,
        popup=label,
        color='green',
        fill=False,
        fill_color='#31cc67',
        fill_opacity=0.5,
        parse_html=False).add_to(map_venue_toronto)
map_venue_toronto

## 7th step, analysing each neighbourhood
in this step neighbourhoods are analysed by finding their top venues and the corresponding frequency. Two datasets are developed, the first one stores venues names along with the frequency, and the second one includes only top 10 venues' names for each neighbourhood. this part is consisted of four steps:
1. establishing onehot dataset
2. grouping the dataset by its neighbourhood
3. developing the first dataset, top5 with frequency
4. developing the second dataset, top10 without frequency

In [36]:
# one hot encoding
toronto_onehot = pd.get_dummies(df_venues[['categories']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
toronto_onehot['neighbourhood'] = df_venues['neighbourhood'] 
# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
print('shape of neighbourhood-venues dataset is {}'.format(toronto_onehot.shape))
toronto_onehot.head()

shape of neighbourhood-venues dataset is (6538, 342)


Unnamed: 0,neighbourhood,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,...,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# grouping neighbourhood-venues dataset by its neighbourhood to find densities
toronto_grouped = toronto_onehot.groupby('neighbourhood').mean().reset_index()
print('the shape of neighbourhood- all venues dataset is {}'.format(toronto_gropued.shape))
toronto_grouped

the shape of neighbourhood- all venues dataset is (198, 342)


Unnamed: 0,neighbourhood,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,...,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Adelaide,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.01,0.000000,0.00,0.0,0.00,0.0,0.0
1,Agincourt,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.076923,0.0,0.0,0.00,0.000000,0.00,0.0,0.00,0.0,0.0
2,Agincourt North,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.035714,0.0,0.0,0.00,0.035714,0.00,0.0,0.00,0.0,0.0
3,Albion Gardens,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00,0.000000,0.00,0.0,0.00,0.0,0.0
4,Alderwood,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00,0.000000,0.00,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,Woodbine Heights,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00,0.000000,0.00,0.0,0.00,0.0,0.0
194,York Mills,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00,0.000000,0.00,0.0,0.00,0.0,0.0
195,York Mills West,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00,0.000000,0.00,0.0,0.00,0.0,0.0
196,York University,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00,0.000000,0.00,0.0,0.00,0.0,0.0


In [58]:
# creating a dataset of top 5 venues with the frequency
num_top = 5
nei_ven_top5 = pd.DataFrame(np.nan, index=range(0, len(toronto_grouped)), columns = ['neighbourhood', 'top1', 'top2', 'top3', 'top4', 'top5'])
nn = 0
for hood in toronto_grouped['neighbourhood']:
    temp = toronto_grouped[toronto_grouped['neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)  
    temp = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    nei_ven_top5['neighbourhood'].iloc[nn] = hood
    nei_ven_top5['top1'].iloc[nn] = [{'venue': temp['venue'].iloc[0], 'freq': temp['freq'].iloc[0]}]
    nei_ven_top5['top2'].iloc[nn] = [{'venue': temp['venue'].iloc[1], 'freq': temp['freq'].iloc[1]}]
    nei_ven_top5['top3'].iloc[nn] = [{'venue': temp['venue'].iloc[2], 'freq': temp['freq'].iloc[2]}]
    nei_ven_top5['top4'].iloc[nn] = [{'venue': temp['venue'].iloc[3], 'freq': temp['freq'].iloc[3]}]
    nei_ven_top5['top5'].iloc[nn] = [{'venue': temp['venue'].iloc[4], 'freq': temp['freq'].iloc[4]}]
    nn += 1
nei_ven_top5.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,neighbourhood,top1,top2,top3,top4,top5
0,Adelaide,"[{'venue': 'Coffee Shop', 'freq': 0.07}]","[{'venue': 'Café', 'freq': 0.05}]","[{'venue': 'American Restaurant', 'freq': 0.04}]","[{'venue': 'Gym', 'freq': 0.04}]","[{'venue': 'Restaurant', 'freq': 0.04}]"
1,Agincourt,"[{'venue': 'Chinese Restaurant', 'freq': 0.153...","[{'venue': 'Cantonese Restaurant', 'freq': 0.0...","[{'venue': 'Coffee Shop', 'freq': 0.0769230769...","[{'venue': 'Shopping Mall', 'freq': 0.07692307...","[{'venue': 'Food Court', 'freq': 0.07692307692..."
2,Agincourt North,"[{'venue': 'Bank', 'freq': 0.07142857142857142}]","[{'venue': 'Chinese Restaurant', 'freq': 0.071...","[{'venue': 'Bakery', 'freq': 0.071428571428571...","[{'venue': 'Restaurant', 'freq': 0.03571428571...","[{'venue': 'Sporting Goods Shop', 'freq': 0.03..."
3,Albion Gardens,"[{'venue': 'Grocery Store', 'freq': 0.18181818...","[{'venue': 'Fast Food Restaurant', 'freq': 0.0...","[{'venue': 'Caribbean Restaurant', 'freq': 0.0...","[{'venue': 'Hardware Store', 'freq': 0.0909090...","[{'venue': 'Sandwich Place', 'freq': 0.0909090..."
4,Alderwood,"[{'venue': 'Pizza Place', 'freq': 0.25}]","[{'venue': 'Pool', 'freq': 0.125}]","[{'venue': 'Skating Rink', 'freq': 0.125}]","[{'venue': 'Pub', 'freq': 0.125}]","[{'venue': 'Gym', 'freq': 0.125}]"


In [68]:
# creating a dataset of top 10 venues without the frequency
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['neighbourhood'] = toronto_grouped['neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Gym,Restaurant,American Restaurant,Cosmetics Shop,Italian Restaurant,Seafood Restaurant,Gastropub,Clothing Store
1,Agincourt,Chinese Restaurant,Hong Kong Restaurant,Coffee Shop,Food Court,Shopping Mall,Korean Restaurant,Asian Restaurant,Train Station,Restaurant,Rental Car Location
2,Agincourt North,Bakery,Bank,Chinese Restaurant,Sporting Goods Shop,Fast Food Restaurant,Frozen Yogurt Shop,Fried Chicken Joint,Beer Store,Liquor Store,Spa
3,Albion Gardens,Grocery Store,Fast Food Restaurant,Pizza Place,Liquor Store,Caribbean Restaurant,Pharmacy,Hardware Store,Beer Store,Sandwich Place,Fried Chicken Joint
4,Alderwood,Pizza Place,Pool,Skating Rink,Pub,Sandwich Place,Coffee Shop,Gym,Zoo Exhibit,Dumpling Restaurant,Doctor's Office


## 8th step, clustring neighbourhoods
This is the last stage where neighbourhoods are classified and plotted. This process is accomplished through three steps:
1. clustring neighbourhoods by use of kmean which is an unsupervised machine learning method. The clustering is applied on the density based dataset which each venue in a neighbourhood holds a frequency.
2. creating a dataset for passing to plotting section. it includes main columns of postal code, boroguh, neighbourhood, lat, lon, and first 10 common venues.
3. plotting the venues using folium library, in which each cluster is colour coded. <br>

note:
- some neighbourhoods are omitted from kmeans input becasue no venues found around them, so they could not be clustered. As a result, when in the final stage the datasets are merged they hold nan value for their cluster. in plotting section, these neighbourhoods skipped.

In [62]:
# importing libraries
from sklearn.cluster import KMeans
kclusters = 5
toronto_grouped_clustering = toronto_grouped.drop('neighbourhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
# check cluster labels generated for each row in the dataframe
print('The 10 first neighbourhoods clustring is ', kmeans.labels_[0:10])

The 10 first neighbourhoods clustring is  [1 1 1 1 1 1 4 1 1 1]


In [69]:
# developing a dataset includes 10 first venues along with borough, neighbourhood, lat and lon
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = dfn

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.7588,-79.320197,1.0,Bus Line,Coffee Shop,Bank,Gas Station,Liquor Store,Caribbean Restaurant,Discount Store,Chinese Restaurant,Electronics Store,Laundry Service
1,M4A,North York,Victoria Village,43.732658,-79.311189,1.0,Middle Eastern Restaurant,Mediterranean Restaurant,Spa,Thai Restaurant,Ethiopian Restaurant,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant
2,M5A,Downtown Toronto,Regent Park,43.659279,-79.366135,1.0,Pharmacy,Restaurant,Coffee Shop,Café,Grocery Store,Bus Stop,Beer Store,Food Truck,Food & Drink Shop,Fast Food Restaurant
3,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015,1.0,Coffee Shop,Café,Hotel,Restaurant,Pizza Place,Italian Restaurant,Fried Chicken Joint,Sports Bar,Music Venue,Steakhouse
4,M6A,North York,Lawrence Manor,43.722079,-79.437507,4.0,Electronics Store,Kids Store,Bank,Park,Doctor's Office,Zoo Exhibit,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant


In [106]:
# importing libraries
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[center_lat, center_lon], width=800, height=600)
map_clusters.fit_bounds([[lat_min, lon_min], [lat_max, lon_max]])

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    if ~np.isnan(cluster):
        cluster = int(cluster)
        label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[cluster-1],
            fill=True,
            fill_color=rainbow[cluster-1],
            fill_opacity=0.7).add_to(map_clusters)
        
map_clusters

# END of CODE
Please send your inquiries to aron.shirazi (at) gmail.com