In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from bs4 import BeautifulSoup
import requests

## Trimming Neighborhood table

In [10]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
source = BeautifulSoup(url.text, 'html.parser')

table = source.find('table')
table_rows = table.find_all('tr')

l = []

for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    row = [x[:-1] for x in row]
    l.append(row)
    
df = pd.DataFrame(l, columns=["Postal Code", "Borough", "Neighborhood"])
df = df.drop(df[df.Borough == "Not assigned"].index)
df = df.drop(df.index[0])

df = df.assign(Neighborhood=df['Neighborhood'].str.split(',')).explode('Neighborhood')
df = df.drop_duplicates(subset='Neighborhood')

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Manor
...,...,...,...
179,M8Z,Etobicoke,Mimico NW
179,M8Z,Etobicoke,The Queensway West
179,M8Z,Etobicoke,South of Bloor
179,M8Z,Etobicoke,Kingsway Park South West


## Merging existing table with Lat-Long

In [12]:
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')

df = pd.merge(df, geo_df, on="Postal Code", how='inner')

df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.654260,-79.360636
3,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
4,M6A,North York,Lawrence Manor,43.718518,-79.464763
...,...,...,...,...,...
204,M8Z,Etobicoke,Mimico NW,43.628841,-79.520999
205,M8Z,Etobicoke,The Queensway West,43.628841,-79.520999
206,M8Z,Etobicoke,South of Bloor,43.628841,-79.520999
207,M8Z,Etobicoke,Kingsway Park South West,43.628841,-79.520999


## Clustering

In [13]:
CLIENT_ID = 'XA5E1330BXZ0UXUSUDRNZJZY5IIOVIX1LYKZIHKLS3FN0ZRE' # Foursquare ID
CLIENT_SECRET = '53YH5U3AWFEUYTNY1PUFEEPYHC425WLF02GYG21ODSAIMT43' # Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        Limit = 100
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            Limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
Toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

In [16]:
tor_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tor_onehot['Neighborhood'] = Toronto_venues['Neighborhood']


Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
tor_grouped = tor_onehot.groupby('Neighborhood').mean().reset_index()

In [18]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = tor_grouped['Neighborhood']

for ind in np.arange(tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Lawrence Park,Clothing Store,Coffee Shop,Yoga Studio
1,Adelaide,Coffee Shop,Café,Gym
2,Agincourt North,Park,Playground,Distribution Center
3,Albion Gardens,Grocery Store,Pizza Place,Pharmacy
4,Bathurst Quay,Airport Service,Airport Lounge,Airport Terminal


In [34]:
val = list(tor_grouped.columns.values)
#val.remove('Italian Restaurant')
#val.remove('Pizza Place')
print(val)

['Neighborhood', 'Accessories Store', 'Afghan Restaurant', 'Airport', 'Airport Food Court', 'Airport Gate', 'Airport Lounge', 'Airport Service', 'Airport Terminal', 'American Restaurant', 'Antique Shop', 'Aquarium', 'Art Gallery', 'Art Museum', 'Arts & Crafts Store', 'Asian Restaurant', 'Athletics & Sports', 'Auto Garage', 'Auto Workshop', 'BBQ Joint', 'Baby Store', 'Bagel Shop', 'Bakery', 'Bank', 'Bar', 'Baseball Field', 'Baseball Stadium', 'Basketball Court', 'Basketball Stadium', 'Beach', 'Bed & Breakfast', 'Beer Bar', 'Beer Store', 'Belgian Restaurant', 'Bike Rental / Bike Share', 'Bike Shop', 'Bistro', 'Board Shop', 'Boat or Ferry', 'Bookstore', 'Boutique', 'Brazilian Restaurant', 'Breakfast Spot', 'Brewery', 'Bridal Shop', 'Bubble Tea Shop', 'Building', 'Burger Joint', 'Burrito Place', 'Bus Line', 'Bus Station', 'Business Service', 'Butcher', 'Cafeteria', 'Café', 'Cajun / Creole Restaurant', 'Camera Store', 'Candy Store', 'Caribbean Restaurant', 'Cheese Shop', 'Chinese Restaurant

In [52]:
print(clust_neigh.shape)
print(neighborhoods_venues_sorted.shape)

(199, 4)
(199, 4)


In [55]:
t2 = tor_grouped
#t2.rename(columns={"Italian Restaurant": "Itar", "Pizza Place": "Pizza"})
#test = t2['Itar']
#print(test)

df_2 = t2[['Neighborhood', 'Italian Restaurant', 'Pizza Place']].copy()
#df_2

In [56]:
kclusters = 3

tor_grouped_clustering = df_2.drop('Neighborhood', 1)
#print(tor_grouped_clustering)


# run k-means clustering
kmeans = KMeans(init="k-means++", n_clusters=kclusters, n_init=10)
kmeans.fit(tor_grouped_clustering)
#kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_grouped_clustering)
clustercenters = kmeans.cluster_centers_
#print(clustercenters)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       2, 1, 0, 2, 1, 1, 0, 1, 1, 1, 2, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 1, 1, 2, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 1, 0, 1, 2,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1,
       1])

In [57]:
clust_neigh = neighborhoods_venues_sorted
clust_neigh.insert(0, 'Cluster_Lables', kmeans.labels_)

tor_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
tor_merged = tor_merged.join(clust_neigh.set_index('Neighborhood'), on='Neighborhood')

tor_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster_Lables,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Park,Construction & Landscaping,Food & Drink Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,2.0,Coffee Shop,Pizza Place,Intersection
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,1.0,Coffee Shop,Bakery,Pub
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1.0,Coffee Shop,Bakery,Pub
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,1.0,Clothing Store,Accessories Store,Furniture / Home Store


In [59]:
tm2 = tor_merged
tm2 = tm2.dropna(subset = ['Cluster_Lables'])
tor_merged = tm2

In [60]:
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Neighborhood'], tor_merged['Cluster_Lables']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(float(cluster)) - 1],
        fill=False,
        fill_color=rainbow[int(float(cluster)) - 1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters