# 1. Web Scraping from Wikipedia

#### URL: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [1]:
import requests
import pandas as pd
#!conda install -c anaconda beautifulsoup4 --yes
from bs4 import BeautifulSoup
import numpy as np
import folium

In [343]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url)

In [344]:
html_soup=BeautifulSoup(response.text,'html.parser')

In [345]:
table = html_soup.find('table', class_="wikitable sortable")
table_body = table.find('tbody')
rows = table_body.find_all('tr')
data=[]
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append(cols)

In [346]:
df = pd.DataFrame(data[1:],columns=['Postal Code','Borough','Neighborhood'])

In [347]:
#Remove rows with Borughs not assigned and resetting index
df = df[df["Borough"]!='Not assigned']
df = df.reset_index().drop('index',axis=1)

#Concatenate neighbourhoods with the same Postal Code. No duplicate postal codes on wiki.
if len(df["Postal Code"].unique()) == len(df):
    print("There are no duplicate Postal Codes!")

There are no duplicate Postal Codes!


In [348]:
#Check if there are neighbourhoods that are not assigned.
if ((df["Neighborhood"]=='Not assigned') | (df["Neighborhood"]=='')).sum() == 0:
    print("All neighborhoods are assigned!")

All neighborhoods are assigned!


In [349]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [350]:
print(df.shape)

(103, 3)


# 2. Obtaining Coordinates through GeoPy

In [351]:
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

In [352]:
#Obtain the latitude and longitude for different neighborhoods.
latitude=[]
longitude=[]
neighborhood_name_change=[]
name = ''
for neigh, post in zip(df["Neighborhood"],df["Postal Code"]):
    address = "{}, Toronto, Canada".format(neigh)
    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(address)
    name = neigh
    if location is None:
        address = "{}, Toronto, Canada".format(post)
        geolocator = Nominatim(user_agent="foursquare_agent")
        location = geolocator.geocode(address)
        name = neigh
        if location is None:
            address = "{}, Toronto, Canada".format(neigh.split(",")[0])
            geolocator = Nominatim(user_agent="foursquare_agent")
            location = geolocator.geocode(address)
            name = neigh.split(",")[0]
            if location is None:
                address = "{}, Toronto, Canada".format(neigh.split("-")[0])
                geolocator = Nominatim(user_agent="foursquare_agent")
                location = geolocator.geocode(address)
                name = neigh.split("-")[0]
                if location is None:
                    latitude.append(np.nan)
                    longitude.append(np.nan)
                    neighborhood_name_change.append('')
                    continue
    latitude.append(location.latitude)
    longitude.append(location.longitude)
    neighborhood_name_change.append(neigh)

In [353]:
df['Latitude'] = latitude
df['Longitude'] = longitude
df['Neighborhood'] = neighborhood_name_change

In [354]:
#Drop all neighborhoods which did not receive coordinates! --> Assumption.
df = df.dropna(how='any')

In [355]:
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7588,-79.320197
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.640769,-79.379892
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.715283,-79.443914
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.653482,-79.383935
5,M9A,Etobicoke,Islington Avenue,43.622575,-79.514215
6,M1B,Scarborough,"Malvern, Rouge",43.809196,-79.221701
7,M3B,North York,Don Mills,43.775347,-79.345944
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.653482,-79.383935
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6565,-79.377114


In [356]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto is 43.6534817, -79.3839347.


# 3. Clustering Section and use of Foursquare and Folium

### We will be using all the neighborhoods irrespective of borough.
### Edit: When calling foursquare, some neighborhoods had very few or no venues. Therefore, I subsetted all neighborhood that have more than 15 venues, which came to 56

In [357]:
CLIENT_ID = 'E3D0QOUTQHHB5IQV0U5UK5U1RSYLGKURHHL4ER5YAOD0J0ND' # your Foursquare ID
CLIENT_SECRET = 'QY2SK34DH5S3EWEWV1XCLXPHWW5ZTIY3LAH2FXWWPJQ0D0H1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: E3D0QOUTQHHB5IQV0U5UK5U1RSYLGKURHHL4ER5YAOD0J0ND
CLIENT_SECRET:QY2SK34DH5S3EWEWV1XCLXPHWW5ZTIY3LAH2FXWWPJQ0D0H1


In [358]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [359]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT = 100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [360]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                 latitudes=df['Latitude'],
                                 longitudes=df['Longitude']
                                )

In [361]:
(toronto_venues["Neighborhood"].value_counts()>15).sum()
# There are many places in which there are very few venues and some with none. Therefore I'm 
# subsetting it to only neighbourhoods which have more than 15 venues (i.e., 56)

56

In [362]:
#Removed all neighbourhoods with 15 or less venues!
index = toronto_venues["Neighborhood"].value_counts()[toronto_venues["Neighborhood"].value_counts()>15].index
toronto_venues = toronto_venues[toronto_venues["Neighborhood"].isin(index)].reset_index().drop('index',axis=1)

In [363]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 279 uniques categories.


In [364]:
toronto_venues.shape

(2958, 7)

In [365]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot = toronto_onehot.drop("Neighborhood",axis=1)

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Aquarium,Arcade,Art Gallery,Art Museum,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [366]:
toronto_onehot.shape

(2958, 279)

In [367]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [368]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 2, 1, 2, 0, 2, 4, 2, 2, 2, 2, 2, 3, 1, 2, 0, 2, 1, 2, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 1, 0, 1,
       1, 2, 0, 2, 2, 2, 0, 1, 2, 4, 0, 2], dtype=int32)

In [369]:
toronto_grouped["Clusters"]=kmeans.labels_
toronto_venues_edited = toronto_venues.drop_duplicates(subset=["Neighborhood"]).sort_values(by='Neighborhood').reset_index().drop('index',axis=1)
toronto_grouped["Latitude"]=toronto_venues_edited["Neighborhood Latitude"]
toronto_grouped["Longitude"]=toronto_venues_edited["Neighborhood Longitude"]

In [370]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [371]:
# create map
map_clusters = folium.Map(location=[latitude+0.05, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_grouped['Latitude'], toronto_grouped['Longitude'], toronto_grouped['Neighborhood'], toronto_grouped["Clusters"]):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters