In [79]:

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

#  options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Webscrape the Toronto Neighborhoods

In [80]:

#  The Wikipedia list of Toronto-area postal codes contains all the information
#  we need to assign:
#
#      PostalCode, Borough, Neighborhood
#
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#  bound the wait, and fail loudly on an HTTP error instead of parsing an error page
html_data = requests.get(url, timeout=30)
html_data.raise_for_status()
soup = BeautifulSoup(html_data.text, 'html.parser')

In [81]:

#  how many tables does the webpage contain?
print('There are {} tables defined in the webpage'.format(len(soup.find_all('table'))))

There are 3 tables defined in the webpage


In [82]:
#  pick the table with first cell entry: M1A
#  (keep every <table> whose first <td> of the first <tr> inside <tbody> contains 'M1A')
table = [tbl for tbl in soup.find_all('table') if 'M1A' in tbl.find('tbody').find('tr').find('td').text]
#  from here on `table` is the <tbody> of the first matching table
table = table[0].find('tbody')

In [83]:
#  Each table cell is expected to read: postal code (M + digit + letter), then the
#  borough text, then an opening parenthesis followed by the neighborhoods.
cell_pattern = re.compile(r'(M\d[A-Z])([^(]+)\((.*)$')

#  Collect the rows first and build the DataFrame once; growing a DataFrame one
#  .loc assignment at a time re-allocates it on every iteration.
records = []
for cell in table.find_all('td'):
    #  skip postal codes that are not assigned to any borough
    if cell.span is not None and cell.span.text == 'Not assigned':
        continue

    #  capture postal code, borough and all neighborhoods
    cell_text = cell.text.strip()
    match = cell_pattern.search(cell_text)
    if match is None:
        #  fail with the offending text rather than an AttributeError on None
        raise ValueError(f'Unexpected postal code cell layout: {cell_text!r}')
    postalcode, borough, neighborhoods = match.groups()

    #  remove all extra parenthesized neighborhoods
    #  replace any leftover closed parenthesis with space
    #  replace any / with comma
    neighborhoods = neighborhoods.split('(')[0].strip(')').replace(')', ' ').replace(' /', ',').strip(' ')
    records.append([postalcode, borough, neighborhoods])

toronto = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighborhood'])

In [84]:
print(f'Total number of neighborhoods found = {len(toronto)}')


Total number of neighborhoods found = 103


Identify Boroughs with unusually long names. We might have to manually fix them.

In [85]:
toronto.loc[ toronto.Borough.str.len() > 20 ]

Unnamed: 0,PostalCode,Borough,Neighborhood
35,M4J,East YorkEast Toronto,The Danforth East
76,M7R,MississaugaCanada Post Gateway Processing Centre,Enclave of L4W
92,M5W,Downtown TorontoStn A PO Boxes25 The Esplanade,Enclave of M5E
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L


In [86]:

#  ok, let's manually fix them
#  The corrections are keyed by postal code instead of relying on row order, so the
#  cell gives the same result if the scraped order changes and can be re-run safely.
borough_fixes = {
    'M4J': 'East York',
    'M7R': 'Mississauga',
    'M5W': 'Downtown Toronto',
    'M7Y': 'East Toronto',
}
needs_fix = toronto.PostalCode.isin(list(borough_fixes))
tofix = toronto.loc[needs_fix, 'PostalCode']
toronto.loc[needs_fix, 'Borough'] = tofix.map(borough_fixes)
toronto[ toronto.PostalCode.isin(tofix) ]

Unnamed: 0,PostalCode,Borough,Neighborhood
35,M4J,East York,The Danforth East
76,M7R,Mississauga,Enclave of L4W
92,M5W,Downtown Toronto,Enclave of M5E
100,M7Y,East Toronto,Enclave of M4L


In [87]:
#  are there any Boroughs with 'Not assigned' neighborhoods?
toronto.query('Neighborhood.str.contains("[Nn]ot").values')

Unnamed: 0,PostalCode,Borough,Neighborhood


In [88]:
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [89]:
toronto.shape

(103, 3)

## Geocode the Toronto Neighborhoods

In [90]:
!pip install folium



In [91]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [92]:
#  get the Toronto latitude and longitude coordinates
gl = Nominatim(user_agent='toronto_neighborhoods')
toronto_gl = gl.geocode('Toronto, Ontario')
#  geocode() gives back None when nothing matches; stop here with a clear message
#  instead of an AttributeError on the next line
if toronto_gl is None:
    raise RuntimeError('Nominatim returned no result for "Toronto, Ontario"')
toronto_lat, toronto_lng = toronto_gl.latitude, toronto_gl.longitude
print(f'Toronto latitude = {toronto_lat} and longitude = {toronto_lng}')

Toronto latitude = 43.6534817 and longitude = -79.3839347


We will try to get the Toronto Neighborhood coordinates using the Google Maps API. We define the following function which tries 10 times before stopping.

In [93]:
#  Now let's get the rest of the Toronto Neighborhood coordinates if we can from Google Maps
def assign_latlng(postalcode, max_attempts=10):
    """Look up the coordinates of a Toronto postal code, retrying on empty answers.

    Parameters
    ----------
    postalcode : value interpolated into the query '<postalcode>, Toronto, Ontario'
    max_attempts : int, default 10
        Maximum number of lookups before giving up.

    Returns
    -------
    The ``latlng`` attribute of the first lookup for which it is not None, or
    None (after printing a short notice) when every attempt came back empty.

    Notes
    -----
    Relies on a module-level name ``geocoder`` exposing ``google(query)``.
    NOTE(review): this notebook never imports ``geocoder``, so calling this
    function raises NameError until that import is added.
    """
    query = f'{postalcode}, Toronto, Ontario'
    for _ in range(max_attempts):
        coords = geocoder.google(query).latlng
        if coords is not None:
            #  success: return right away, without the failure notice
            return coords

    #  only reached when every attempt returned no coordinates
    print('It does not work..')
    return None


Let's try for the first postalcode to see if it works.

In [94]:
assign_latlng(toronto.PostalCode[0])

NameError: name 'geocoder' is not defined

In [None]:
#  load the coordinates file and align its key column name with ours
toronto_latlng = (
    pd.read_csv('Geospatial_data', header=0)
      .rename(columns={'Postal Code': 'PostalCode'})
)

#  the web-scraped postal codes and the ones in this file must match exactly
assert set(toronto_latlng.PostalCode) == set(toronto.PostalCode)
toronto_latlng.head()

In [95]:
#  inner join on PostalCode
#  Coordinate columns left over from a previous run of this cell are dropped first, so
#  re-running it does not create suffixed duplicates (Latitude_x / Latitude_y).
#  validate= makes pandas raise if PostalCode is duplicated on either side, which
#  would otherwise silently multiply rows.
toronto = (
    toronto.drop(columns=['Latitude', 'Longitude'], errors='ignore')
           .merge(toronto_latlng, on='PostalCode', validate='one_to_one')
)

NameError: name 'toronto_latlng' is not defined

In [62]:
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [63]:
toronto.shape

(103, 3)

## Explore the Toronto Neighborhoods

Let's create first a map of all the neighborhoods.

In [64]:
import folium

ModuleNotFoundError: No module named 'folium'

In [65]:

#  nudge the map centre a little so all neighborhoods fit at zoom level 11
map_centre = [toronto_lat+0.06, toronto_lng+0.05]
toronto_map = folium.Map(location=map_centre, zoom_start=11)

#  marker appearance shared by every neighborhood
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

neighborhood_points = zip(toronto.Latitude, toronto.Longitude, toronto.Neighborhood)
for lat, lng, label in neighborhood_points:
    #  the popup shows the neighborhood name(s)
    folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(label, parse_html=True),
        **marker_style
    ).add_to(toronto_map)
toronto_map

NameError: name 'folium' is not defined

In [None]:

#  prepare to talk to the Foursquare API
#  Credentials are read from the environment so they are never stored in the notebook.
#  NOTE: the key pair that used to be hardcoded in this cell has been exposed and
#  should be revoked and regenerated.
import os
import warnings

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn('FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
                  'Foursquare requests cannot be authenticated')
VERSION = '20180605' # Foursquare API version
LIMIT = 100

In [66]:
# Adapted from the wrapper function in the exercise lab.
# Modified so that a neighborhood whose request fails, or whose response does not have
# the expected layout, does not abort the run: it gets one placeholder row instead.
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in lockstep with ``zip``; one API request is made per element.
    radius : optional, default 500
        Sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue, with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        A neighborhood whose request or response parsing failed contributes a
        single row whose four venue fields are None.
        Empty input yields an empty frame that still has these columns.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and prints
    each neighborhood name as it is processed.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        try:
            # make the GET request; the timeout keeps one stalled call from hanging the loop
            response = requests.get(endpoint, params=query, timeout=30)
            items = response.json()["response"]['groups'][0]['items']
            venue_rows = [(
                name,
                lat,
                lng,
                v['venue']['name'],
                v['venue']['location']['lat'],
                v['venue']['location']['lng'],
                v['venue']['categories'][0]['name']) for v in items]
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
            # network failure, non-JSON body or unexpected layout: keep going with a
            # placeholder row (a bare except would also swallow KeyboardInterrupt)
            venue_rows = [(name, lat, lng, None, None, None, None)]

        rows.extend(venue_rows)

    # passing the column names to the constructor keeps them in place even with no rows
    return pd.DataFrame(rows, columns=columns)

In [67]:

#  query Foursquare for top 100 venues in each of the Toronto Neighborhoods
toronto_venues = getNearbyVenues(toronto.Neighborhood, toronto.Latitude, toronto.Longitude, radius=500)

AttributeError: 'DataFrame' object has no attribute 'Latitude'

In [None]:
#  count missing values per column; isna() also catches the NaN that pandas stores in
#  numeric columns when a None was supplied, which an `x is None` test would miss
toronto_venues.isna().sum()


In [None]:
pick = toronto_venues.query('`Venue Category`.isnull().values | Venue.isnull().values')
pick

Ok, let's query the API again.

In [None]:
pick_venues = getNearbyVenues(pick['Neighborhood'], pick['Neighborhood Latitude'], pick['Neighborhood Longitude'],
                              radius=500)
pick_venues.head()

Good! Now we have values for all the neighborhoods! Let's drop the empty row(s) and append the new rows.

In [68]:
#  DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
toronto_venues_fixed = pd.concat([toronto_venues.drop(pick.index), pick_venues], ignore_index=True)
#  every placeholder row in `pick` is dropped and replaced by the re-queried rows
assert toronto_venues.shape[0] - pick.shape[0] + pick_venues.shape[0] == toronto_venues_fixed.shape[0]

NameError: name 'toronto_venues' is not defined

Let's order neighborhoods by the number of venues returned (up to the maximum limit we chose).

In [None]:
toronto_venues_fixed.groupby('Neighborhood')[['Venue']].count().sort_values('Venue', ascending=False)

Let's one-hot encode the venue categories to start preparing for neighborhood clustering. First, let's check which venue categories are present.



In [69]:
toronto_venues_fixed['Venue Category'].value_counts()

NameError: name 'toronto_venues_fixed' is not defined

In [None]:
toronto_venues_fixed.query('`Venue Category` == "Neighborhood"')

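A venue category that is literally named Neighborhood is going to mess up the downstream analysis: once the categories are one-hot encoded it would collide with the Neighborhood column we already rely on for grouping. So let's replace these values with Neighborhood Area.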

In [70]:

#  assign the result back: replace(..., inplace=True) on a column selection may act on
#  a temporary copy and leave the frame itself unchanged
toronto_venues_fixed['Venue Category'] = toronto_venues_fixed['Venue Category'].replace(
    {'Neighborhood': 'Neighborhood Area'})
#  na=False: rows with a missing category must not break the boolean mask
toronto_venues_fixed[ toronto_venues_fixed['Venue Category'].str.contains('Neigh', na=False) ]

NameError: name 'toronto_venues_fixed' is not defined

In [71]:
features = pd.get_dummies(toronto_venues_fixed[['Venue Category']], prefix='', prefix_sep='')
features = pd.concat([toronto_venues_fixed[['Neighborhood']], features], axis=1)
features.head()

NameError: name 'toronto_venues_fixed' is not defined

Great! Let's compute the venue frequencies for each neighborhood and then identify the top 10 venues per neighborhood.

In [72]:
features_freqs = features.groupby('Neighborhood').mean().reset_index()

NameError: name 'features' is not defined

In [73]:

def getTop10(row):
    """Return the (up to) ten most frequent venue categories of one neighborhood.

    Parameters
    ----------
    row : pandas.DataFrame
        Frame of venue-category frequencies for one neighborhood; only its first
        row is read. It may or may not still contain the 'Neighborhood' column.

    Returns
    -------
    pandas.Series
        Named 'top venue', indexed 'Top1', 'Top2', ... (index name 'name'), holding
        the category names by descending frequency. At most ten entries; fewer
        when there are fewer categories. Categories with zero frequency are
        reported as None.
    """
    #  get the neighborhood venue frequencies as a single row
    #  the text label is dropped when present: it cannot be ordered against the floats
    freqs = row.iloc[0].drop(labels='Neighborhood', errors='ignore').astype(float)

    #  order them in descending order and take the top10
    top = freqs.sort_values(ascending=False).iloc[:10]

    #  make sure zero frequency venues do not count in the top10 list
    venues = [venue if freq != 0.0 else None for venue, freq in top.items()]

    #  name the index Top1, Top2, ... sized to what is actually available
    labels = pd.Index(['Top' + str(i) for i in range(1, len(venues) + 1)], name='name')
    return pd.Series(venues, index=labels, name='top venue', dtype=object)

#  ok, let's run this function now neighborhood by neighborhood on our DataFrame
top10 = features_freqs.groupby('Neighborhood').apply(getTop10)
top10.head()

NameError: name 'features_freqs' is not defined

## Cluster the Toronto Neighborhoods

In [74]:
from sklearn.cluster import KMeans

In [75]:
#  number of clusters to fit
n_clusters = 3
#  one row per neighborhood: the one-hot columns summed, i.e. venue counts per category
features_sparse = features.groupby('Neighborhood').sum().reset_index()
#  fit on the category columns only (the label column is dropped); fixed random_state
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(features_sparse.drop('Neighborhood', axis=1))
#  store each neighborhood's cluster label as the second column
features_sparse.insert(1, column='cluster', value=kmeans.labels_)

NameError: name 'features' is not defined

In [76]:
features_sparse.head()

NameError: name 'features_sparse' is not defined

In [None]:
features_sparse.cluster.value_counts()

In [None]:
#  keep only the neighborhoods present in features_sparse, then attach the columns of
#  features_sparse (cluster label and category counts) by joining on Neighborhood
toronto_clustered = toronto.loc[ toronto.Neighborhood.isin(features_sparse.Neighborhood) ]\
.set_index('Neighborhood').join(features_sparse.set_index('Neighborhood'), on='Neighborhood').reset_index()
toronto_clustered.head()

In [77]:

toronto_map = folium.Map(location=[toronto_lat, toronto_lng], zoom_start=11)

#  one marker colour per cluster label
cluster_color = {0: 'red', 1: 'steelblue', 2: 'green'}

clustered_points = zip(toronto_clustered.Latitude,
                       toronto_clustered.Longitude,
                       toronto_clustered.Neighborhood,
                       toronto_clustered.cluster)
for lat, lng, neighborhood, cluster in clustered_points:
    #  outline and fill share the cluster colour
    colour = cluster_color[cluster]
    popup_text = str(neighborhood) + ' Cluster ' + str(cluster)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color=colour,
        fill=True,
        fill_color=colour,
        fill_opacity=1.0,
        parse_html=False)
    marker.add_to(toronto_map)
toronto_map

NameError: name 'folium' is not defined