# Battle of Neighbourhoods in Toronto
This notebook presents a preliminary analysis of how Toronto neighbourhoods differ in their venues.

### Import and clean data sets

In [1]:
# import packages
import os

import requests
import pandas as pd
import numpy as np

In [2]:
# fetch every HTML table found on the Wikipedia page of Toronto ("M") postal codes
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

# the first table returned is the one used as the working dataframe
df = pd.DataFrame(data[0])

# keep only rows whose neighbourhood is assigned, then renumber the rows from zero
df = df.loc[df['Neighbourhood'] != 'Not assigned'].reset_index(drop=True)
print(df.shape)
df.head()

(210, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [3]:
# load the CSV of coordinates and give its three columns the names used in df,
# so that 'Postcode' can serve as the join key
location = pd.read_csv('http://cocl.us/Geospatial_data')
location.columns = ['Postcode', 'Latitude', 'Longitude']

# attach latitude / longitude to every row of df that shares a postcode
df = df.merge(location, on="Postcode")
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


Find the geographical location of Toronto

In [4]:
# import packages
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library

# geocode the city name with Nominatim to obtain Toronto's coordinates
# NOTE: this rebinds `location`, which previously held the coordinates dataframe
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)

# geocode() gives back None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Accessing Foursquare API to obtain venues information

In [5]:
# Settings for accessing the Foursquare API.
# Credentials are taken from the environment so that real keys never have to be
# saved inside the notebook; the original placeholder strings remain as fallbacks.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'your Foursquare ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'your Foursquare Secret')
VERSION = '20190405'  # sent as the `v` query parameter of each request
LIMIT = 100  # sent as the `limit` query parameter of each request

In [6]:
# define a function that requests the nearby venues of each neighbourhood
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Query the Foursquare venues/explore endpoint around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel iterables describing the neighbourhoods. They are zipped
        together, so iteration stops at the shortest one.
    radius : int, default 1000
        Sent as the ``radius`` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        When no venue is returned (or the inputs are empty) the frame is
        empty but still carries these seven columns.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an error status code.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and encode the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang forever on an unresponsive server
        )
        # surface HTTP errors directly instead of a KeyError on the payload
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue may come without any category; record None in that case
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # passing the columns here keeps the frame well-formed even with zero rows
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [9]:
# collect the nearby venues of every neighbourhood listed in df
toronto_venues = getNearbyVenues(
    names=df['Neighbourhood'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)
print(toronto_venues.shape)
toronto_venues.head()

(9227, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
3,Parkwoods,43.753259,-79.329656,A&W Canada,43.760643,-79.326865,Fast Food Restaurant
4,Parkwoods,43.753259,-79.329656,Bruno's valu-mart,43.746086,-79.324978,Grocery Store


In [10]:
# count the distinct venue categories among all returned venues
print('There are {} uniques categories.'.format(toronto_venues['Venue Category'].nunique(dropna=False)))

There are 331 uniques categories.


### Analyze top venues in neighbourhoods

In [11]:
# one-hot encode the venue categories: one indicator column per category
tor_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# re-attach the neighbourhood of each venue; the new column is appended last
tor_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# rotate that last column to the front
fixed_columns = tor_onehot.columns[-1:].tolist() + tor_onehot.columns[:-1].tolist()
tor_onehot = tor_onehot.loc[:, fixed_columns]

# average the indicators per neighbourhood, i.e. the frequency of occurrence
# of each category among that neighbourhood's venues
Toronto_grouped = tor_onehot.groupby('Neighbourhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Agincourt North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Albion Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Alderwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# helper that ranks the venue categories of one neighbourhood row
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped (it is not part of the ranking);
    the remaining entries are sorted by value in descending order and the
    index labels of the first ``num_top_venues`` of them are returned as an
    array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [13]:
# create the new dataframe and display the top 10 venues for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# column headers: '1st', '2nd', '3rd', then '<n>th' for every later rank
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check rather than a bare `except:`, which would also
    # swallow unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# rank the categories of every neighbourhood first ...
top_venues = [
    return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in np.arange(Toronto_grouped.shape[0])
]

# ... then assemble the dataframe in one step instead of filling it row by row
neighbourhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=Toronto_grouped.index)
neighbourhoods_venues_sorted.insert(0, 'Neighbourhood', Toronto_grouped['Neighbourhood'])

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Café,Coffee Shop,Hotel,Theater,Sushi Restaurant,American Restaurant,Concert Hall,Restaurant,Gastropub,Gym
1,Agincourt,Chinese Restaurant,Shopping Mall,Pizza Place,Sandwich Place,Coffee Shop,Supermarket,Caribbean Restaurant,Bakery,Seafood Restaurant,Motorcycle Shop
2,Agincourt North,Chinese Restaurant,Korean Restaurant,Bakery,Pizza Place,Pharmacy,Park,Gym,Caribbean Restaurant,Shop & Service,BBQ Joint
3,Albion Gardens,Pizza Place,Grocery Store,Construction & Landscaping,Coffee Shop,Discount Store,Sandwich Place,Fried Chicken Joint,Park,Beer Store,Japanese Restaurant
4,Alderwood,Discount Store,Pizza Place,Pharmacy,Grocery Store,Trail,Moroccan Restaurant,Skating Rink,Shopping Mall,Donut Shop,Liquor Store


### Apply DBSCAN method to cluster similar neighbourhoods

In [14]:
# import packages
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler
# NOTE(review): this call returns a RandomState instance (shown as the cell output)
# and the result is discarded, so it does not appear to seed anything — confirm intent.
sklearn.utils.check_random_state(1000)

<mtrand.RandomState at 0x1db3b946ea0>

In [15]:
# join the per-neighbourhood category frequencies with one row of df per neighbourhood
df2 = Toronto_grouped.merge(df.drop_duplicates(subset='Neighbourhood'), on='Neighbourhood')

# move the last four columns (those contributed by df) to the front
fixed_columns = df2.columns[-4:].tolist() + df2.columns[:-4].tolist()
df2 = df2.loc[:, fixed_columns]

# everything from the sixth column onwards is used as features, standardised per column
Clus_dataSet = StandardScaler().fit_transform(df2.iloc[:, 5:])

In [16]:
# fit DBSCAN on the standardised features and keep the label of every row
db = DBSCAN(eps=25, min_samples=2).fit(Clus_dataSet)
labels = db.labels_

# boolean mask that is True at the indices DBSCAN reports as core samples
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# number of distinct labels, with and without the -1 label
clusterNum = len(set(labels))
realClusterNum = clusterNum - (1 if -1 in labels else 0)

# neighbourhood name and coordinates together with the assigned label
Clus_df = df2.loc[:, ['Neighbourhood', 'Latitude', 'Longitude']]
Clus_df["Clus_Db"] = labels
Clus_org = Clus_df

# exclude cluster -1 (i.e. outliers)
Clus_df = Clus_df.loc[Clus_df['Clus_Db'] > -1]

# number of clusters left after the exclusion
kclusters = Clus_df['Clus_Db'].nunique()
print('Number of clusters = ', kclusters)

Number of clusters =  6


In [17]:
# Plot clusters on map

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map centred on the geocoded coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: one colour per cluster, evenly spaced
# along the rainbow colormap
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]  # only len(ys) == kclusters is used below
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Clus_df['Latitude'], Clus_df['Longitude'], Clus_df['Neighbourhood'], 
                                  Clus_df['Clus_Db']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # the labels kept in Clus_df start at 0, so index the palette directly;
    # `cluster-1` made cluster 0 wrap around to the last colour
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Find top venues in each cluster

In [18]:
# find top venues in each cluster: attach every neighbourhood's ranked venue
# categories to its cluster label
df3 = pd.merge(Clus_df,neighbourhoods_venues_sorted, on="Neighbourhood")

# create the new dataframe and display the top 10 venues for each cluster
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# column headers: '1st', '2nd', '3rd', then '<n>th' for every later rank
columns = ['Clus_Db']
for ind in np.arange(num_top_venues):
    # explicit bounds check rather than a bare `except:`, which would also
    # swallow unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# loop over all clusters and collect one row per cluster
cluster_rows = []
for num in np.arange(kclusters):
    c0 = df3[df3['Clus_Db'] == num]
    # count how often each category appears anywhere in the ranked venue
    # columns (everything after the first four columns) of this cluster
    c0_top = pd.Series(c0.iloc[:,4:].values.flatten()).value_counts()
    top = list(c0_top.index.values[:num_top_venues])
    # pad with NaN when the cluster has fewer distinct categories than requested
    top += [np.nan] * (num_top_venues - len(top))
    cluster_rows.append([num] + top)

# build the dataframe in one step instead of assigning row by row
c_sorted = pd.DataFrame(cluster_rows, columns=columns)

c_sorted

Unnamed: 0,Clus_Db,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Coffee Shop,Park,Pizza Place,Café,Sandwich Place,Bakery,Italian Restaurant,Grocery Store,Fast Food Restaurant,Restaurant
1,1,Accessories Store,Juice Bar,Coffee Shop,Bakery,Sports Club,Bank,Italian Restaurant,Sushi Restaurant,Fast Food Restaurant,Boutique
2,2,Hotel,Indian Restaurant,Bakery,Tibetan Restaurant,Coffee Shop,Café,Vegetarian / Vegan Restaurant,Restaurant,Furniture / Home Store,Bar
3,3,Pub,Diner,Breakfast Spot,Park,Coffee Shop,Café,Sushi Restaurant,Restaurant,Italian Restaurant,Theater
4,4,Men's Store,Italian Restaurant,Bakery,Asian Restaurant,Coffee Shop,Café,Vegetarian / Vegan Restaurant,Restaurant,Cocktail Bar,Bar
5,5,Pub,Museum,Bakery,Grocery Store,Coffee Shop,Café,Vegetarian / Vegan Restaurant,Restaurant,Italian Restaurant,Pizza Place
