# Explore and Cluster Neighbourhoods. 

Use similar code to the New York exercise to examine and cluster neighbourhoods in Toronto

In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



## Stage 1
Prep the Canada data file

In [4]:
pcode = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

print (pcode.head())    

  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront


Filter on Toronto to reduce dataset we're working with


In [5]:
pcodes_filtered = pcode[(pcode["Borough"].str.contains("East Toronto"))]

Combine into one Boroughs into one row with the neighborhoods separated with a comma


In [6]:
pcodes_final = pcodes_filtered.groupby("Postcode").agg(lambda y:','.join(set(y)))

print(pcodes_final.head())

               Borough                                      Neighbourhood
Postcode                                                                 
M4E       East Toronto                                        The Beaches
M4K       East Toronto                        The Danforth West,Riverdale
M4L       East Toronto                      India Bazaar,The Beaches West
M4M       East Toronto                                    Studio District
M7Y       East Toronto  Business Reply Mail Processing Centre 969 Eastern


## Stage 2
Import CSV File with Geo coordinates


In [7]:
geodata = pd.read_csv('/Users/Paul/Coding/IBM Courses Info/Capstone/Geospatial_Coordinates.csv')
print(geodata.head())

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


Ensure the Postal Code columns are named the same


In [8]:
geodata = geodata.rename(columns={'Postal Code': 'Postcode'})

Then merge the dataframes


In [9]:
canada = pd.merge(pcodes_final,geodata, on='Postcode')
print(canada.head())

  Postcode       Borough                                      Neighbourhood  \
0      M4E  East Toronto                                        The Beaches   
1      M4K  East Toronto                        The Danforth West,Riverdale   
2      M4L  East Toronto                      India Bazaar,The Beaches West   
3      M4M  East Toronto                                    Studio District   
4      M7Y  East Toronto  Business Reply Mail Processing Centre 969 Eastern   

    Latitude  Longitude  
0  43.676357 -79.293031  
1  43.679557 -79.352188  
2  43.668999 -79.315572  
3  43.659526 -79.340923  
4  43.662744 -79.321558  


## Stage 3
Analyse each neighbourhood. Adapted from course logic on New York

Define FourSquare details

In [10]:
CLIENT_ID = 'T1V0AEHTGEZOXW5PJHOLFCTJ5RFVSL5LIHP0TROHQR0ASM2F' # your Foursquare ID
CLIENT_SECRET = 'CRREFB0Z0KQOBME5XGHDXCRKPVGPOMQYFLMV3TH0AJ3RVHVB' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [11]:
neighbourhood_latitude = canada.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = canada.loc[0, 'Longitude'] # neighbourhood longitude value

neighbourhood_name = canada.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


Create the GET request URL

In [12]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)

url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=T1V0AEHTGEZOXW5PJHOLFCTJ5RFVSL5LIHP0TROHQR0ASM2F&client_secret=CRREFB0Z0KQOBME5XGHDXCRKPVGPOMQYFLMV3TH0AJ3RVHVB&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [16]:
canada_venues = getNearbyVenues(names=canada['Neighbourhood'],
                                   latitudes=canada['Latitude'],
                                   longitudes=canada['Longitude']
                                  )


The Beaches
The Danforth West,Riverdale
India Bazaar,The Beaches West
Studio District
Business Reply Mail Processing Centre 969 Eastern


## 3. Analyze Each Neighborhood
using code from course - can't say I understand every single step here!


In [17]:
# one hot encoding
canada_onehot = pd.get_dummies(canada_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
canada_onehot['Neighbourhood'] = canada_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [canada_onehot.columns[-1]] + list(canada_onehot.columns[:-1])
canada_onehot = canada_onehot[fixed_columns]

canada_onehot.head()

Unnamed: 0,Neighbourhood,American Restaurant,Arts & Crafts Store,Auto Workshop,Bakery,Bank,Bar,Bookstore,Brewery,Bubble Tea Shop,Burrito Place,Butcher,Café,Caribbean Restaurant,Cheese Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Food & Drink Shop,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Garden,Garden Center,Gastropub,Gay Bar,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Health Food Store,Ice Cream Shop,Italian Restaurant,Juice Bar,Latin American Restaurant,Light Rail Station,Liquor Store,Lounge,Middle Eastern Restaurant,Movie Theater,Neighborhood,Park,Pet Store,Pizza Place,Pub,Restaurant,Sandwich Place,Seafood Restaurant,Skate Park,Smoke Shop,Spa,Stationery Store,Steakhouse,Sushi Restaurant,Thai Restaurant,Thrift / Vintage Store,Trail,Wine Bar,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
canada_grouped = canada_onehot.groupby('Neighbourhood').mean().reset_index()

In [19]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [20]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = canada_grouped['Neighbourhood']

for ind in np.arange(canada_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(canada_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Brewery,Light Rail Station,Gym / Fitness Center,Park,Butcher,Pizza Place,Burrito Place,Restaurant,Comic Shop
1,"India Bazaar,The Beaches West",Sandwich Place,Park,Burrito Place,Liquor Store,Gym,Food & Drink Shop,Movie Theater,Pet Store,Pizza Place,Pub
2,Studio District,Café,Coffee Shop,Gastropub,Italian Restaurant,Bakery,Brewery,American Restaurant,Stationery Store,Ice Cream Shop,Gay Bar
3,The Beaches,Neighborhood,Pub,Pizza Place,Health Food Store,Trail,Food & Drink Shop,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
4,"The Danforth West,Riverdale",Greek Restaurant,Italian Restaurant,Coffee Shop,Bookstore,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Frozen Yogurt Shop,Juice Bar,Grocery Store


In [21]:
# set number of clusters
kclusters = 5

canada_grouped_clustering = canada_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(canada_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 2, 4, 1, 0], dtype=int32)

In [22]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

canada_merged = canada

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
canada_merged = canada_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

canada_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Neighborhood,Pub,Pizza Place,Health Food Store,Trail,Food & Drink Shop,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0,Greek Restaurant,Italian Restaurant,Coffee Shop,Bookstore,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Frozen Yogurt Shop,Juice Bar,Grocery Store
2,M4L,East Toronto,"India Bazaar,The Beaches West",43.668999,-79.315572,2,Sandwich Place,Park,Burrito Place,Liquor Store,Gym,Food & Drink Shop,Movie Theater,Pet Store,Pizza Place,Pub
3,M4M,East Toronto,Studio District,43.659526,-79.340923,4,Café,Coffee Shop,Gastropub,Italian Restaurant,Bakery,Brewery,American Restaurant,Stationery Store,Ice Cream Shop,Gay Bar
4,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,3,Yoga Studio,Brewery,Light Rail Station,Gym / Fitness Center,Park,Butcher,Pizza Place,Burrito Place,Restaurant,Comic Shop


In [23]:
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(canada_merged['Latitude'], canada_merged['Longitude'], canada_merged['Neighbourhood'], canada_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters