PART ONE - FORMING THE DATAFRAME

Installing necessary libraries

In [None]:
!pip install geopy
!pip install beautifulsoup4
!pip install lxml
!pip install geocoder

Importing necessary libraries

In [1]:
import json
import os
import urllib.request

import folium
import geocoder
import lxml
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

The following code scrapes the Wikipedia table and converts it into a Pandas Dataframe. I used Beautiful Soup to scrape data into 3 separate lists and then generated a dataframe that used those lists as its column data.

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The page is parsed inside the `with` block so the HTTP response is closed afterwards.
with urllib.request.urlopen(url) as post:
    scrape = BeautifulSoup(post, "lxml")
table = scrape.find('table', class_='wikitable sortable')
if table is None:
    raise ValueError('Postal code table not found at {}'.format(url))

# One list per dataframe column; the three lists are filled in lockstep.
code = []
boro = []
hood = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three data cells (code, borough, neighborhood) are used.
    if len(cells) != 3:
        continue
    # Strip surrounding whitespace/newlines before comparing or storing anything.
    c, b, h = (cell.text.strip() for cell in cells)
    if b == 'Not assigned':
        continue
    code.append(c)
    boro.append(b)
    # A postal code without a neighborhood name takes the borough name instead.
    hood.append(b if h == 'Not assigned' else h)

df = pd.DataFrame({'PostalCode': code, 'Borough': boro, 'Neighborhood': hood})
df

        

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


As instructed, the last line uses the `shape` attribute to show the dataframe's dimensions as (rows, columns).

In [3]:
# Dimensions of the scraped dataframe as a (rows, columns) tuple
df.shape

(103, 3)

PART TWO: GETTING LATITUDE AND LONGITUDE COORDINATES

I'm using the ArcGIS geocoder instead of Google's, but the results should be comparable.

In [4]:
# Upper bound on lookups per postal code, so a geocoder that never answers
# cannot hang the notebook in an endless retry loop.
MAX_GEOCODE_ATTEMPTS = 10

lat = []
long = []
for co in code:
    lat_lng_coords = None
    # A lookup may come back without coordinates; retry a limited number of times.
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(co))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break
    else:
        raise RuntimeError(
            'No coordinates returned for postal code {} after {} attempts'.format(
                co, MAX_GEOCODE_ATTEMPTS))

    lat.append(lat_lng_coords[0])
    long.append(lat_lng_coords[1])
df['Latitude']=lat
df['Longitude']=long
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65319,-79.51113
99,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38133
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.64869,-79.38544
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.63278,-79.48945


PART THREE: SORTING NEIGHBORHOODS

First, I'm simplifying the dataframe, as suggested, to just include Boroughs that contain 'Toronto'

In [5]:
# Boolean mask of the rows whose borough name contains the literal text 'Toronto'.
# Selecting by mask does not depend on df having a 0..n-1 index, unlike looking
# rows up by a hand-maintained counter.
is_toronto = df['Borough'].str.contains('Toronto', regex=False, na=False)

toronto = df[is_toronto].reset_index(drop=True)
toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
3,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
4,M4E,East Toronto,The Beaches,43.67709,-79.29547
5,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306
6,M5G,Downtown Toronto,Central Bay Street,43.65609,-79.38493
7,M6G,Downtown Toronto,Christie,43.66869,-79.42071
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6497,-79.38258
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.66505,-79.43891


Making a map of the Toronto neighborhoods being clustered

In [6]:
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode('Toronto')
# geocode() yields None when nothing matches; fail with a clear message instead
# of an AttributeError on the next line.
if location is None:
    raise RuntimeError("Nominatim returned no result for 'Toronto'")
latitude = location.latitude
longitude = location.longitude

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
# (loop names are chosen so they do not overwrite the `lat` list built while geocoding)
for hood_lat, hood_lng, hood_name in zip(toronto['Latitude'], toronto['Longitude'], toronto['Neighborhood']):
    folium.CircleMarker(
        [hood_lat, hood_lng],
        radius=5,
        popup=folium.Popup(hood_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

Loading in Foursquare credentials in a way that won't be shown

In [7]:
# Location of the JSON file holding the Foursquare keys. Set the FOURSQUARE_CRED_PATH
# environment variable to point somewhere else; without it the original path is used.
cred_path = os.environ.get('FOURSQUARE_CRED_PATH', r'C:\Users\p_ten\OneDrive\Documents\cred.json')
with open(cred_path) as c:
    data = json.load(c)
    # Keep only the two keys the notebook needs.
    credentials = {'CLIENT_ID': data['CLIENT_ID'], 'CLIENT_SECRET': data['CLIENT_SECRET']}

In [8]:
# Foursquare keys taken from the credentials loaded in the previous cell
CLIENT_ID, CLIENT_SECRET = credentials['CLIENT_ID'], credentials['CLIENT_SECRET']
# Value sent as the `v` (API version) query parameter
VERSION = '20180605'
# Values sent as the `limit` and `radius` query parameters of each venue search
LIMIT, RADIUS = 100, 500

Defining the function to get venues for neighborhoods. I'll be recreating the same searches and tables as the New York lab from here on out.

In [9]:
def getNearbyVenues(names, latitudes, longitudes):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION``,
    ``RADIUS`` and ``LIMIT`` when building each request.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Iterated in parallel with ``zip``; one HTTP request is sent per element.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. When no venue
        is returned at all, the frame is empty but still has these columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': RADIUS,
            'limit': LIMIT,
        }
        # A timeout keeps one stalled request from blocking the whole notebook.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json()["response"]['groups']
        results = groups[0]['items'] if groups else []

        for v in results:
            venue = v['venue']
            # A venue may come without any category; record None rather than crash.
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names here also works when venue_rows is empty.
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

Getting venues for each neighborhood and putting them into a dataframe

In [10]:
# Fetch the venues around every Toronto neighborhood, then preview the first rows
toronto_venues = getNearbyVenues(
    toronto['Neighborhood'],
    toronto['Latitude'],
    toronto['Longitude'],
)
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65512,-79.36264,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65512,-79.36264,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65512,-79.36264,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
3,"Regent Park, Harbourfront",43.65512,-79.36264,Berkeley Church,43.655123,-79.365873,Event Space
4,"Regent Park, Harbourfront",43.65512,-79.36264,The Yoga Lounge,43.655515,-79.364955,Yoga Studio


In [11]:
# Number of venue records returned for each neighborhood
toronto_venues.groupby(by='Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,60,60,60,60,60,60
"Brockton, Parkdale Village, Exhibition Place",85,85,85,85,85,85
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",100,100,100,100,100,100
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",76,76,76,76,76,76
Central Bay Street,76,76,76,76,76,76
Christie,11,11,11,11,11,11
Church and Wellesley,79,79,79,79,79,79
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,26,26,26,26,26,26
Davisville North,8,8,8,8,8,8


Onehot conversion and grouping

In [12]:
# One indicator column per venue category, named after the category itself
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# If a category is itself called 'Neighborhood', its indicator column gives way to the label column
toronto_onehot = toronto_onehot.drop(columns=['Neighborhood'], errors='ignore')
# Put the neighborhood label in the first column
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])
toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Mean of every indicator column per neighborhood, i.e. the share of each venue category
venues_by_hood = toronto_onehot.groupby('Neighborhood')
toronto_grouped = venues_by_hood.mean().reset_index()
toronto_grouped.shape

(38, 228)

In [14]:
num_top_venues = 5

# The loop variable is deliberately not called `hood`, so the list of neighborhood
# names built while scraping is not overwritten by a single string.
for neighborhood_name in toronto_grouped['Neighborhood']:
    print("----"+neighborhood_name+"----")
    is_current = toronto_grouped['Neighborhood'] == neighborhood_name
    # Transpose the matching row so each venue category becomes a line of the table
    freq_table = toronto_grouped[is_current].T.reset_index()
    freq_table.columns = ['venue','freq']
    # Drop the first line (the neighborhood name) and work on an explicit copy so
    # the assignment below does not write into a slice of another frame.
    freq_table = freq_table.iloc[1:].copy()
    freq_table['freq'] = freq_table['freq'].astype(float)
    freq_table = freq_table.round({'freq': 2})
    print(freq_table.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
            venue  freq
0     Coffee Shop  0.08
1    Cocktail Bar  0.03
2  Breakfast Spot  0.03
3     Cheese Shop  0.03
4        Beer Bar  0.03


----Brockton, Parkdale Village, Exhibition Place----
         venue  freq
0  Coffee Shop  0.06
1         Café  0.06
2          Bar  0.06
3   Restaurant  0.05
4    Nightclub  0.04


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
              venue  freq
0       Coffee Shop  0.09
1             Hotel  0.05
2        Restaurant  0.04
3              Café  0.04
4  Asian Restaurant  0.03


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                venue  freq
0  Italian Restaurant  0.08
1                Café  0.07
2         Coffee Shop  0.07
3                 Bar  0.04
4                Park  0.04


----Central Bay Street----
                       venue  freq
0                Coffee Shop  0.12
1             Clothi

Getting the top 10 venue types per neighborhood and storing them in a dataframe

In [15]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, ignoring its first entry.

    Parameters
    ----------
    row : pandas.Series
        The first entry is skipped; the remaining entries are ranked by value.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the highest values, in descending order of value.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [16]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # The first three ranks take 'st' / 'nd' / 'rd'; every later rank takes 'th'.
    # An explicit bounds check replaces the former bare `except`, which would
    # also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Empty table with one row per neighborhood, filled in row by row below
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Farmers Market,Beer Bar,Breakfast Spot,Cocktail Bar,Restaurant,Cheese Shop,Bakery,Seafood Restaurant,Liquor Store
1,"Brockton, Parkdale Village, Exhibition Place",Coffee Shop,Bar,Café,Restaurant,Gift Shop,Nightclub,Sandwich Place,Burrito Place,French Restaurant,Supermarket
2,"Business reply mail Processing Centre, South C...",Coffee Shop,Hotel,Restaurant,Café,Italian Restaurant,Bar,Asian Restaurant,Taco Place,Salon / Barbershop,Pub
3,"CN Tower, King and Spadina, Railway Lands, Har...",Italian Restaurant,Café,Coffee Shop,French Restaurant,Park,Bar,Sandwich Place,Restaurant,Speakeasy,Bakery
4,Central Bay Street,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Plaza,Sushi Restaurant,Hotel,Restaurant,Bookstore


Clustering neighborhoods

In [17]:
# Keep only the numeric category frequencies for clustering.
# `columns=` is used because newer pandas no longer accepts the axis as a positional argument.
toronto_clustering = toronto_grouped.drop(columns='Neighborhood')
toronto_clustering.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,0.0,0.0,0.0,0.016667,0.0,0.016667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.016667,0.0,0.0,0.0,0.0,0.0,0.0,0.016667
1,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.023529,0.011765,0.0,...,0.0,0.0,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.011765
2,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.01,0.03,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013158,0.0,...,0.0,0.013158,0.0,0.013158,0.0,0.0,0.0,0.0,0.0,0.013158
4,0.0,0.0,0.0,0.0,0.0,0.013158,0.013158,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.013158,0.013158,0.013158,0.0,0.0,0.0


In [18]:
kcluster = 8
# n_init is set explicitly because its default differs between scikit-learn
# versions; together with random_state this keeps the labels reproducible.
kmeans = KMeans(n_clusters=kcluster, random_state=0, n_init=10).fit(toronto_clustering)
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 7, 1, 1, 2, 1, 4, 6,
       1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])

In [19]:
# DataFrame.insert raises if the column already exists, so remove any label column
# left by a previous run of this cell before inserting the current labels.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [20]:
# Attach the cluster label and top-venue columns to each Toronto row, matching on the neighborhood name
venues_by_name = neighborhoods_venues_sorted.set_index('Neighborhood')
toronto_merged = toronto.join(venues_by_name, on='Neighborhood')

In [21]:
# Discard rows with any missing value, then store the cluster label as an integer
toronto_merged = (
    toronto_merged
    .dropna(axis=0)
    .astype({'Cluster Labels': int})
)
toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264,1,Coffee Shop,Breakfast Spot,Distribution Center,Spa,Event Space,Food Truck,Electronics Store,Restaurant,Bakery,Italian Restaurant
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188,1,Coffee Shop,Sandwich Place,Theater,Burrito Place,Café,Falafel Restaurant,Fried Chicken Joint,Bank,Gastropub,Italian Restaurant
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804,1,Coffee Shop,Clothing Store,Café,Japanese Restaurant,Cosmetics Shop,Diner,Bubble Tea Shop,Bookstore,Hotel,Ramen Restaurant
3,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587,1,Coffee Shop,Cocktail Bar,Café,Restaurant,Cosmetics Shop,Clothing Store,Hotel,Gastropub,Japanese Restaurant,Italian Restaurant
4,M4E,East Toronto,The Beaches,43.67709,-79.29547,1,Health Food Store,Pub,Trail,Dumpling Restaurant,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space
5,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306,1,Coffee Shop,Farmers Market,Beer Bar,Breakfast Spot,Cocktail Bar,Restaurant,Cheese Shop,Bakery,Seafood Restaurant,Liquor Store
6,M5G,Downtown Toronto,Central Bay Street,43.65609,-79.38493,1,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Plaza,Sushi Restaurant,Hotel,Restaurant,Bookstore
7,M6G,Downtown Toronto,Christie,43.66869,-79.42071,1,Café,Grocery Store,Playground,Italian Restaurant,Baby Store,Athletics & Sports,Coffee Shop,Candy Store,Farmers Market,Farm
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6497,-79.38258,1,Coffee Shop,Hotel,Café,Restaurant,Gym,Japanese Restaurant,American Restaurant,Salad Place,Asian Restaurant,Steakhouse
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.66505,-79.43891,1,Park,Grocery Store,Bakery,Pharmacy,Middle Eastern Restaurant,Smoke Shop,Café,Brazilian Restaurant,Furniture / Home Store,Bar


Mapping the clusters

In [22]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kcluster))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Loop names are chosen so they do not overwrite the `lat` list built while geocoding.
for marker_lat, marker_lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so the label indexes the palette directly
    # (no `- 1`, which made cluster 0 wrap around to the last colour).
    folium.CircleMarker(
        [marker_lat, marker_lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

I initially used 5 clusters, then changed it to 8 to see whether that would affect the result much. It did not: almost every neighborhood still falls into a single cluster, with a few others scattered about.