<h1><b>Cannadian Neighbourhood Clustering</b></h1>

In [1]:
import pandas as pd
import numpy as np
import json
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from geopy.geocoders import Nominatim

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
import re

print('Libraries imported.')

Libraries imported.


In [2]:
html_data = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [3]:
canada_df = pd.DataFrame(html_data[0])

<h2>Cleaning the data
   

In [4]:
print(canada_df.columns)

Index(['Postal code', 'Borough', 'Neighborhood'], dtype='object')


<h3> Removing all Not Assigned Boroughs 

In [5]:
canada_df = canada_df[canada_df.Borough != 'Not assigned']

<h3> Grouping Neighborhoods by , instead of /

In [6]:
canada_df['Neighborhood'] = canada_df['Neighborhood'].apply(lambda x : re.sub(' / ',',',x))

<h3> Replacing Not assigned Neighborhoods with boroughs

In [7]:
canada_df['Neighborhood'] = canada_df['Neighborhood'].where((canada_df['Neighborhood'] != 'Not assigned'), canada_df['Borough'])

<h3> Resetting Index

In [8]:
canada_df.reset_index(drop=True,inplace=True)
canada_df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park,Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern,Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill,Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [9]:
canada_df.shape

(103, 3)

<h2> Using csv file provided to add latitude and logitude information

In [10]:
coord = pd.read_csv('Geospatial_Coordinates.csv')
coord.set_index(['Postal Code'], inplace=True)

In [11]:
canada_df.set_index(['Postal code'], inplace=True)

In [12]:
canada_lat_long = canada_df.join(coord,how='left')
canada_lat_long.reset_index(inplace=True)
canada_lat_long

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park,Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill,Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


<h2> Clustering

In [13]:
address = 'Toronto, Canada'

locator = Nominatim(user_agent='myGeocoder')
location = locator.geocode(address)

print("The latitude and logitude values for cannada are {} and {}".format(location.latitude,location.longitude))

The latitude and logitude values for cannada are 43.6534817 and -79.3839347


In [14]:
canada_map = folium.Map([location.latitude,location.longitude],zoom_start=11)
for neigh, lat,long in zip(canada_lat_long['Neighborhood'],canada_lat_long['Latitude'],canada_lat_long['Longitude']):
    label = folium.Popup(neigh, parse_html=True)
    folium.CircleMarker([lat,long], popup=label, parse_html=True).add_to(canada_map)

canada_map

In [15]:
CLIENT_ID = 'GMINQGQGTEJRL5PV23OJH45IFASRCIYGCBKOJICSTLV431MH' # your Foursquare ID
CLIENT_SECRET = 'LL0FGH4MLG4BLKLCYVGPRDCJBQVSLE2DVO21HPKZCZ4A5LQL' # your Foursquare Secret
VERSION = '20200415'
LIMIT = 30

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: GMINQGQGTEJRL5PV23OJH45IFASRCIYGCBKOJICSTLV431MH
CLIENT_SECRET:LL0FGH4MLG4BLKLCYVGPRDCJBQVSLE2DVO21HPKZCZ4A5LQL


<h3>The goal of my analysis is to find out Neighborhoods most similar to "The Beaches"</h3>
    <p> Using the Foursquare API to get the necessery information

In [16]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [18]:
cannada_venues = getNearbyVenues(names = canada_lat_long['Neighborhood'], latitudes=
                                 canada_lat_long['Latitude'], longitudes=canada_lat_long['Longitude'])

KeyError: 'groups'

In [None]:
cannada_venues.groupby('Neighborhood').count()

In [None]:
print('There are {} uniques categories.'.format(len(cannada_venues['Venue Category'].unique())))

<b>Using venue catagories, performing transformations

In [None]:
cannada_onehot = pd.get_dummies(cannada_venues['Venue Category'])

In [None]:
cannada_onehot['Neighborhood'] = cannada_venues['Neighborhood']
cannada_onehot = cannada_onehot.groupby('Neighborhood').mean()

In [None]:
x_pred = cannada_onehot.reset_index()
x_pred = x_pred[x_pred['Neighborhood']=='The Beaches']
x_pred

In [None]:
cannada_onehot.drop('The Beaches', inplace=True)

In [None]:
cannada_onehot.reset_index(inplace=True)

In [None]:
cannada_onehot.head()

In [None]:
cannada_onehot.shape

In [None]:
num_top_venues = 5

for hood in cannada_onehot['Neighborhood']:
    print("----"+hood+"----")
    temp = cannada_onehot[cannada_onehot['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

<b> Making it more visually pleasing and putting it into a DataFrame 

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = cannada_onehot['Neighborhood']

for ind in np.arange(cannada_onehot.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(cannada_onehot.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

<h3>Starting the process of Clustering

In [None]:
k = 8

main_df = cannada_onehot.iloc[:,1:]

clus = KMeans(k,random_state=5)

clus.fit(main_df)

print(clus.labels_)

In [None]:
neighborhoods_venues_sorted.insert(0,'Cluster Labels',clus.labels_)
canada_main = canada_lat_long

canada_main = canada_main.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on=['Neighborhood'])

print(canada_main.shape)
canada_main.dropna()
canada_main.shape

In [None]:
canada_main.dropna(inplace=True)

In [None]:
canada_main

In [None]:
# create map
map_clusters = folium.Map(location=[lat, long], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(canada_main['Latitude'], canada_main['Longitude'], canada_main['Neighborhood'], canada_main['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    cluster=int(cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<h1>Now to predict which Neighborhoods are similar to 'The Beaches'

In [None]:
x_pred 

In [None]:
neigh_name = x_pred.iloc[:,0]

pred_label = clus.predict(x_pred.iloc[:,1:])
print(pred_label)

<p>Now that we know which cluster The Beaches belong to, let us find out Neighborhoods that are similar to it

In [None]:
similar_neigh = canada_main[canada_main['Cluster Labels'] == pred_label[0]]

print(similar_neigh['Neighborhood'])

In [None]:
similar_map = folium.Map([lat,long], zoom_start=12)

for lat,long,neigh in zip(similar_neigh['Latitude'],similar_neigh['Longitude'],similar_neigh['Neighborhood']):
    label = folium.Popup(str('neigh'),parse_html=True)
    folium.CircleMarker(
    [lat,long],
    radius = 3,
    color='blue',
    popup=label,
    fill=True,
    fill_color='blue'
    ).add_to(similar_map)
    

In [None]:
l1= folium.Popup('The Beaches',parse_html=True)
canada_lat_long.set_index('Neighborhood',inplace=True)
latitude=canada_lat_long.loc['The Beaches']['Latitude']
longitude = canada_lat_long.loc['The Beaches']['Longitude']

In [None]:
folium.CircleMarker(
    [latitude,longitude],
    radius=5,
    color='red',
    popup=l1,
    fill=True,
    fill_color='red'
).add_to(similar_map)

similar_map

In [None]:
response[meta]