# Toronto Neighbourhood Notebook

## Part 1

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

### That's a long way, using a Beautiful Soup.

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

In [3]:
class Code:
    def parse_url(self, url):
        return [(self.parse_html_table(table)) for table in soup.find_all('table', class_="wikitable sortable")]

    def parse_html_table(self, table):
        cols = 0
        rows = 0
        col_names = []
        for row in table.find_all('tr'):
            tds = row.find_all('td')
            if len(tds) > 0:
                rows += 1
                if cols == 0 :
                    cols = len(tds)
                
            ths = row.find_all('th')
            if len(ths) > 0 and len(col_names) == 0:
                for th in ths:
                    col_names.append(th.get_text())
                    
        if len(col_names) > 0 and len(col_names) != cols:
            raise Exception('Number of columns and columns names mismatch')
    
        columns = col_names if len(col_names) > 0 else range(0, cols)
        df = pd.DataFrame(columns = columns, index = range(0, rows))
    
        r_mark = 0
        for row in table.find_all('tr'):
            c_mark = 0
            columns = row.find_all('td')
            for column in columns:
                df.iat[r_mark, c_mark] = column.get_text()
                c_mark += 1
            if len(columns) > 0:
                r_mark += 1
                
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass
        return df

In [4]:
t = Code()
table = t.parse_url(url)[0]
print(table.shape)
table.head()

(288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


### That's an easy way, using only pandas (btw it uses bs4+html5lib as well), result is the same.

***
*Cleaning data*
***

In [5]:
df = pd.read_html(url, header = 0)[0]
print('Dataset shape before cleaning', df.shape)
df = df[df.Borough != 'Not assigned']
print('Dataset shape after cleaning', df.shape)
df.head()

Dataset shape before cleaning (288, 3)
Dataset shape after cleaning (211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


***
*Grouping Neighbourhood Postcodes*
***

In [6]:
df = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


***
*Changing 'Not assigned' Neighbourhood name to Borough name*
***

In [7]:
df.Neighbourhood[df.Neighbourhood == 'Not assigned'] = df.Borough[df.Neighbourhood == 'Not assigned']

In [8]:
print(df.shape)

(103, 3)


## Part 2

***
*Loading Geo Dataset*
***

In [9]:
url_g="http://cocl.us/Geospatial_data"
geo_data=pd.read_csv(url_g)
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


***
*Joining Geo Dataset to Neghbourhood Dataset using Postcodes as indexes*
***

In [10]:
df = df.set_index('Postcode').join(geo_data.set_index('Postal Code'))
df = df.reset_index(drop = True)
df.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3

In [11]:
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

In [12]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [13]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

***
*Using only 'Toronto'-containing Boroughs to explore*
***

In [14]:
tor = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
tor_c = df[df['Borough'].isin(tor)].reset_index(drop=True)
print(tor_c.shape)
tor_c.head()

(38, 4)


Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,Central Toronto,Lawrence Park,43.72802,-79.38879


In [15]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(tor_c['Latitude'], tor_c['Longitude'], tor_c['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

***
*Using FourSquare API to explore the Boroughs*
***

In [16]:
CLIENT_ID = 'CLIENT_ID' # your Foursquare ID
CLIENT_SECRET = 'CLIENT_SECRET' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [20]:
radius = 500
LIMIT = 100

venues = []

for lat, long, borough, neighbourhood in zip(tor_c['Latitude'], tor_c['Longitude'], tor_c['Borough'], tor_c['Neighbourhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append(( 
            borough,
            neighbourhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [21]:
venues_df = pd.DataFrame(venues)
venues_df.columns = ['Borough', 'Neighbourhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
print(venues_df.shape)
venues_df.head()

(1685, 8)


Unnamed: 0,Borough,Neighbourhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,East Toronto,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


***
*Checking numbers of kinds of venues categories*
***

In [22]:
len(venues_df['VenueCategory'].unique())

229

***
*Getting the venue analyze of each area and frequency of occurance*
***

In [23]:
tor_c_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
tor_c_onehot['Borough'] = venues_df['Borough'] 
tor_c_onehot['Neighbourhood'] = venues_df['Neighbourhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(tor_c_onehot.columns[-3:]) + list(tor_c_onehot.columns[:-3])
tor_c_onehot = tor_c_onehot[fixed_columns]

print(tor_c_onehot.shape)
tor_c_onehot.head()

(1685, 231)


Unnamed: 0,Yoga Studio,Borough,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint
0,0,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,East Toronto,"The Danforth West, Riverdale",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
tor_c_venues_freq = tor_c_onehot.groupby(['Borough', 'Neighbourhood']).mean().reset_index()
print(tor_c_venues_freq.shape)
tor_c_venues_freq.head()

(38, 231)


Unnamed: 0,Borough,Neighbourhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint
0,Central Toronto,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Central Toronto,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0
3,Central Toronto,"Forest Hill North, Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


***
*Getting 10 most occurance venue types*
***

In [25]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['Borough', 'Neighbourhood']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns
# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Borough'] = tor_c_venues_freq['Borough']
neighbourhoods_venues_sorted['Neighbourhood'] = tor_c_venues_freq['Neighbourhood']

for ind in np.arange(tor_c_venues_freq.shape[0]):
    row_categories = tor_c_venues_freq.iloc[ind, :].iloc[2:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighbourhoods_venues_sorted.iloc[ind, 2:] = row_categories_sorted.index.values[0:num_top_venues]

neighbourhoods_venues_sorted.sort_values(freqColumns, inplace=True)
neighbourhoods_venues_sorted

Unnamed: 0,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Bar,Plane,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry
33,West Toronto,"Dovercourt Village, Dufferin",Bakery,Supermarket,Pharmacy,Athletics & Sports,Brewery,Gym / Fitness Center,Park,Middle Eastern Restaurant,Bar,Music Venue
35,West Toronto,"Little Portugal, Trinity",Bar,Coffee Shop,Men's Store,Asian Restaurant,Restaurant,Café,Vietnamese Restaurant,New American Restaurant,Pizza Place,Record Shop
34,West Toronto,"High Park, The Junction South",Bar,Mexican Restaurant,Café,Thai Restaurant,Grocery Store,Arts & Crafts Store,Bakery,Discount Store,Diner,Speakeasy
28,East Toronto,Studio District,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Bar,Fish Market,Coworking Space,Seafood Restaurant,Latin American Restaurant
32,West Toronto,"Brockton, Exhibition Place, Parkdale Village",Café,Coffee Shop,Breakfast Spot,Climbing Gym,Stadium,Burrito Place,Restaurant,Caribbean Restaurant,Pet Store,Bakery
12,Downtown Toronto,"Cabbagetown, St. James Town",Café,Coffee Shop,Italian Restaurant,Bakery,Pizza Place,Market,Restaurant,Pub,Park,Japanese Restaurant
37,West Toronto,"Runnymede, Swansea",Café,Coffee Shop,Sushi Restaurant,Pizza Place,Italian Restaurant,Diner,Food,Fish & Chips Shop,Bar,Indie Movie Theater
15,Downtown Toronto,Christie,Café,Grocery Store,Park,Restaurant,Italian Restaurant,Candy Store,Diner,Nightclub,Coffee Shop,Convenience Store
20,Downtown Toronto,"Harbord, University of Toronto",Café,Sandwich Place,Restaurant,Bookstore,Japanese Restaurant,Italian Restaurant,Bar,Bakery,French Restaurant,Beer Store


***
*Clustering areas*
***

In [27]:
kclusters = 3

tor_c_venues_freq_clustering = tor_c_venues_freq.drop(['Borough', 'Neighbourhood'], 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_c_venues_freq_clustering)

tor_c_clustered = tor_c
tor_c_clustered['Cluster'] = kmeans.labels_

tor_c_clustered = tor_c_clustered.join(neighbourhoods_venues_sorted.drop(['Borough'], 1).set_index('Neighbourhood'), on='Neighbourhood')
tor_c_clustered.sort_values(['Cluster'] + freqColumns, inplace=True)
tor_c_clustered

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
27,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,0,Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Bar,Plane,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry
31,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259,0,Bakery,Supermarket,Pharmacy,Athletics & Sports,Brewery,Gym / Fitness Center,Park,Middle Eastern Restaurant,Bar,Music Venue
32,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975,0,Bar,Coffee Shop,Men's Store,Asian Restaurant,Restaurant,Café,Vietnamese Restaurant,New American Restaurant,Pizza Place,Record Shop
34,West Toronto,"High Park, The Junction South",43.661608,-79.464763,0,Bar,Mexican Restaurant,Café,Thai Restaurant,Grocery Store,Arts & Crafts Store,Bakery,Discount Store,Diner,Speakeasy
3,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Bar,Fish Market,Coworking Space,Seafood Restaurant,Latin American Restaurant
33,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,0,Café,Coffee Shop,Breakfast Spot,Climbing Gym,Stadium,Burrito Place,Restaurant,Caribbean Restaurant,Pet Store,Bakery
11,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,0,Café,Coffee Shop,Italian Restaurant,Bakery,Pizza Place,Market,Restaurant,Pub,Park,Japanese Restaurant
36,West Toronto,"Runnymede, Swansea",43.651571,-79.48445,0,Café,Coffee Shop,Sushi Restaurant,Pizza Place,Italian Restaurant,Diner,Food,Fish & Chips Shop,Bar,Indie Movie Theater
30,Downtown Toronto,Christie,43.669542,-79.422564,0,Café,Grocery Store,Park,Restaurant,Italian Restaurant,Candy Store,Diner,Nightclub,Coffee Shop,Convenience Store
25,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049,0,Café,Sandwich Place,Restaurant,Bookstore,Japanese Restaurant,Italian Restaurant,Bar,Bakery,French Restaurant,Beer Store


***
*Showing clusters on the map*
***

In [28]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, bor, poi, cluster in zip(tor_c_clustered['Latitude'], tor_c_clustered['Longitude'], tor_c_clustered['Borough'], tor_c_clustered['Neighbourhood'], tor_c_clustered['Cluster']):
    label = folium.Popup('{} : {} - Cluster {}'.format(bor, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

***
*
Thanks for watching.*
***