In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import time
import matplotlib.pyplot as plt
import geocoder 
import folium
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

# From here I start to obtain the table from Wikipedia

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [5]:
# The Session object allows you to persist certain parameters across requests. it also persists cookies across all requests made from the Session instance, and will use urllib3’s connection pooling. So if you’re making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase (see HTTP persistent connection).
# By default, Request will keep waiting for a response indefinitely. Therefore, it is advised to set the timeout parameter.
# If the request was successful, you should see the reponse output as '200'.
s = requests.Session()
response = s.get(url, timeout = 10)
response

<Response [200]>

In [108]:
soup = BeautifulSoup(response.text, 'html.parser')
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [141]:
table=soup.find('table', {"class":'wikitable sortable'})

In [142]:
# Number of rows and columns in the table

rows = table.findAll('tr')
columns = rows[1].findAll('td')
header = [th.text.rstrip() for th in rows[0].select('th')]
print('\t\t\tTable\n\nThe header is:', header)
print('There are {} rows and {} columns.'.format(len(rows),len(columns)))


			Table

The header is: ['Postal Code', 'Borough', 'Neighbourhood']
There are 181 rows and 3 columns.


In [144]:
lst_data = []
for row in rows[1:]:
    data = [d.text.rstrip() for d in row.select('td')]
    lst_data.append(data)
# sample records            
lst_data[0:3]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods']]

## Table obtained!

In [145]:
df = pd.DataFrame(lst_data, columns = header)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### I delete repeated Neighbourhoods and not assigned Boroughs

In [160]:
df = df[ df['Borough'] != 'Not assigned' ]
df3.drop_duplicates(subset = 'Neighbourhood',keep='first', inplace=True)
df = df.reset_index(drop=True)

df.shape

(99, 3)

### I create a list in which I join Postal Code with Neighbourhood

In [161]:
postal_code_neighbourhood = []
for postal_code, Neighbourhood in zip(df['Postal Code'], df['Neighbourhood']):
    postal_code_neighbourhood.append([postal_code, Neighbourhood])

    

In [162]:
df.head(2)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village


In [163]:
#csv with neighbours coordinates
coordinates = pd.read_csv("C:/Users/chopi/Downloads/Geospatial_Coordinates.csv")

In [164]:
coordinates

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


### I add lat/long to Neighbourhoods

In [190]:
canada_data = pd.merge(df,coordinates, on = 'Postal Code')
print(canada_data.shape, df.shape)
len(canada_data['Neighbourhood'].unique())
canada_data
canada_useful = canada_data[['Neighbourhood', 'Latitude', 'Longitude']]
canada_useful

(99, 5) (99, 3)


Unnamed: 0,Neighbourhood,Latitude,Longitude
0,Parkwoods,43.753259,-79.329656
1,Victoria Village,43.725882,-79.315572
2,"Regent Park, Harbourfront",43.654260,-79.360636
3,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...
94,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
95,Church and Wellesley,43.665860,-79.383160
96,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
97,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [167]:
# I initializate latitude and longitude in a random neighbourhood to create the map
latitude = canada_data['Latitude'][15]
longitude = canada_data['Longitude'][15]

## Map of canada with the neighbours

In [168]:
map_canada = folium.Map(width=500,height=500,location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(canada_data['Latitude'], canada_data['Longitude'], canada_data['Borough'], canada_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], radius=5, popup=label, color='blue', fill=True, fill_color='#3186cc', fill_opacity=0.7, parse_html=False).add_to(map_canada)

map_canada

In [169]:
CLIENT_ID = 'J0TCWUO1VO2ZBJKJ2MKOKWXD2JUWDT4GTYPOHV55PJTIBEHK' # your Foursquare ID
CLIENT_SECRET = 'NV5KOX2T2MCDBK2Q2ACS5ELZ0O3YWFZDY5XGISBIRLCK11KR' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: J0TCWUO1VO2ZBJKJ2MKOKWXD2JUWDT4GTYPOHV55PJTIBEHK
CLIENT_SECRET:NV5KOX2T2MCDBK2Q2ACS5ELZ0O3YWFZDY5XGISBIRLCK11KR


In [170]:
neighborhood_latitude = canada_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = canada_data.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = canada_data.loc[0, 'Neighbourhood'] # neighborhood name
print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.7532586, -79.3296565.


#### Now, let's get the top 100 venues within a radius of 200 meters from Parkwoods.

In [205]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 200 # define radius in meters
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url 

'https://api.foursquare.com/v2/venues/explore?&client_id=J0TCWUO1VO2ZBJKJ2MKOKWXD2JUWDT4GTYPOHV55PJTIBEHK&client_secret=NV5KOX2T2MCDBK2Q2ACS5ELZ0O3YWFZDY5XGISBIRLCK11KR&v=20180605&ll=43.7532586,-79.3296565&radius=200&limit=100'

In [None]:
# Send the GET request to obtain results
results = requests.get(url).json()
results

In [174]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [None]:
#Now we are ready to clean the json and structure it into a _pandas_dataframe.
venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON
display(nearby_venues)

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
display(nearby_venues.head())
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

#### Let's create a function to repeat the same process to all the neighborhoods in Manhattan

In [206]:
def getNearbyVenues(names, latitudes, longitudes, radius=200):
    i = 1
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name, i)
        i=i+1
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']        
        # return only relevant information for each nearby venue
        venues_list.append([(name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

#### Run the above function on each neighborhood and create a new dataframe called canada_venues.

In [207]:
canada_venues = getNearbyVenues(names=canada_data['Neighbourhood'], latitudes=canada_data['Latitude'], longitudes=canada_data['Longitude'])

Parkwoods 1
Victoria Village 2
Regent Park, Harbourfront 3
Lawrence Manor, Lawrence Heights 4
Queen's Park, Ontario Provincial Government 5
Islington Avenue, Humber Valley Village 6
Malvern, Rouge 7
Don Mills 8
Parkview Hill, Woodbine Gardens 9
Garden District, Ryerson 10
Glencairn 11
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale 12
Rouge Hill, Port Union, Highland Creek 13
Woodbine Heights 14
St. James Town 15
Humewood-Cedarvale 16
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood 17
Guildwood, Morningside, West Hill 18
The Beaches 19
Berczy Park 20
Caledonia-Fairbanks 21
Woburn 22
Leaside 23
Central Bay Street 24
Christie 25
Cedarbrae 26
Hillcrest Village 27
Bathurst Manor, Wilson Heights, Downsview North 28
Thorncliffe Park 29
Richmond, Adelaide, King 30
Dufferin, Dovercourt Village 31
Scarborough Village 32
Fairview, Henry Farm, Oriole 33
Northwood Park, York University 34
East Toronto, Broadview North (Old East York) 35
Harbourfront East, Union

In [77]:
canada_venues['Neighbourhood']

0                                               Parkwoods
1                                               Parkwoods
2                                               Parkwoods
3                                        Victoria Village
4                                        Victoria Village
                              ...                        
1125    Mimico NW, The Queensway West, South of Bloor,...
1126    Mimico NW, The Queensway West, South of Bloor,...
1127    Mimico NW, The Queensway West, South of Bloor,...
1128    Mimico NW, The Queensway West, South of Bloor,...
1129    Mimico NW, The Queensway West, South of Bloor,...
Name: Neighbourhood, Length: 1130, dtype: object

In [80]:
canada_venues

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
1,Parkwoods,43.753259,-79.329656,Careful & Reliable Painting,43.752622,-79.331957,Construction & Landscaping
2,Parkwoods,43.753259,-79.329656,TTC stop #8380,43.752672,-79.326351,Bus Stop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
...,...,...,...,...,...,...,...
1125,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Dollarama,43.629883,-79.518627,Discount Store
1126,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Healthy Planet,43.630214,-79.518495,Supplement Shop
1127,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,RONA,43.629393,-79.518320,Hardware Store
1128,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,7-Eleven,43.629107,-79.517431,Convenience Store


In [83]:
print('There are {} uniques categories.'.format(len(canada_venues['Venue Category'].unique())))

There are 208 uniques categories.


## 3. Analyze Each Neighborhood

In [208]:
# one hot encoding
canada_onehot = pd.get_dummies(canada_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
canada_onehot['Neighbourhood'] = canada_venues['Neighbourhood'] 
# move neighborhood column to the first column
fixed_columns = [canada_onehot.columns[-1]] + list(canada_onehot.columns[:-1])
canada_onehot = canada_onehot[fixed_columns]
canada_onehot.shape

(519, 148)

In [209]:
canada_grouped = canada_onehot.groupby('Neighbourhood').mean().reset_index()
canada_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,American Restaurant,Arepa Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Auto Workshop,BBQ Joint,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,Victoria Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57,Westmount,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


First, let's write a function to sort the venues in descending order.

In [210]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [211]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = canada_grouped['Neighbourhood']

for ind in np.arange(canada_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(canada_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head(1)

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Alderwood, Long Branch",Pharmacy,Coffee Shop,Dance Studio,Pizza Place,Park,Other Great Outdoors,Performing Arts Venue,Organic Grocery,Pet Store,Martial Arts School


## 4. Cluster Neighborhoods

In [228]:
# set number of clusters
kclusters = 3
canada_grouped_clustering = canada_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(canada_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 0, 2, 1, 0, 2, 0, 2, 2, 2, 1, 1, 1, 1, 0, 2, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2,
       0, 0, 2, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0])

### I add the labels clusters to the neighbourhoods

In [230]:
canada_grouped = canada_onehot.groupby('Neighbourhood').mean().reset_index()
canada_grouped['Labels'] = kmeans.labels_
canada_grouped = pd.merge(canada_grouped, canada_useful, on = 'Neighbourhood')
canada_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,American Restaurant,Arepa Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Auto Workshop,BBQ Joint,...,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio,Labels,Latitude,Longitude
0,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,43.602414,-79.543484
1,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,43.754328,-79.442259
2,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,43.786947,-79.385975
3,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,43.733283,-79.419750
4,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,43.692657,-79.264848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,Victoria Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,43.725882,-79.315572
57,Westmount,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,43.696319,-79.532242
58,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,43.770992,-79.216917
59,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,43.695344,-79.318389


In [234]:
# create map
map_clusters = folium.Map(width=500, height=500, location = [latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(canada_grouped['Latitude'], canada_grouped['Longitude'], canada_grouped['Neighbourhood'], canada_grouped['Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker([lat, lon],radius=5,popup=label,color=rainbow[cluster-1],fill=True,fill_color=rainbow[cluster-1],fill_opacity=0.9).add_to(map_clusters)
       
map_clusters