Let's import necessary packages first

In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from sklearn.cluster import KMeans
import matplotlib
!pip install folium
import folium
import json



You are using pip version 18.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


Next, I use the requests package to fetch the webpage listing the Canadian Postal Code table, and store it in a variable called r_html

Then, using the Beautiful Soup package, I search for all row elements on the page and store results in a list (called rows)
I eliminate the first element of the list (since it contains the table headers, and not the data).

Finally, I keep only the first 289 elements, which identify the main table on the page.


In [6]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
r_html = r.text

# Collect every table row (<tr>) found anywhere on the page.
soup = BeautifulSoup(r_html,'html.parser')
rows=soup.find_all("tr")

# Skip the first row (the table headers) and keep the following data rows.
# NOTE(review): the row count is tied to the page layout at the time of writing;
# confirm it still matches the main table before relying on it.
n_data_rows = 289
rows=rows[1:1 + n_data_rows]

Out of each row, I split its elements (Postcode,Borough,Neighborhood) into separate columns, which I store as columns 1,2,3

In [7]:
# Turn each row's text into a list of cell values: newlines become commas, the
# text is split on commas, and the first and last pieces are discarded.
row_cells = [row.text.replace("\n", ",").split(",")[1:-1] for row in rows]

# Positions 0, 1 and 2 of every row feed the three columns used to build the table.
column1 = [cells[0] for cells in row_cells]
column2 = [cells[1] for cells in row_cells]
column3 = [cells[2] for cells in row_cells]

I create a dictionary mapping the table headers to the columns, and then I convert it into a pandas DataFrame.

Then I call shape and head to verify the results.

In [8]:
# Map each table header to the list of values parsed for that column.
df_dict={"Postcode":column1,
         "Borough":column2,
         "Neighborhood":column3
         }

df=pd.DataFrame(df_dict)

# Only the last expression of a cell is rendered, so the shape is printed explicitly.
print(df.shape)
df.head()



Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


I filter the dataframe to exclude not assigned Boroughs and reset index; calling shape to verify.

In [9]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0.
df=df[df["Borough"]!="Not assigned"].reset_index(drop=True)

# Only the last expression of a cell is rendered, so the shape is printed explicitly.
print(df.shape)
df.head()


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Consolidating neighborhoods that share the same postcode into a single row, so that each postcode appears only once

In [10]:
# Collapse all rows sharing a Postcode into one row:
#   - Borough keeps the value of the first row seen for that postcode,
#   - Neighborhood becomes the ", "-joined list of every neighborhood for that postcode.
# Grouping handles postcodes with any number of neighborhoods, does not require
# duplicate postcodes to sit on adjacent rows, and avoids writing through
# `df.loc[i].Neighborhood`, which may assign to a temporary copy.
# sort=False keeps postcodes in order of first appearance; as_index=False keeps
# Postcode as a regular column on a fresh 0..n-1 index.
df = (
    df.groupby("Postcode", sort=False, as_index=False)
      .agg({"Borough": "first", "Neighborhood": ", ".join})
)

# Only the last expression of a cell is rendered, so the shape is printed explicitly.
print(df.shape)
df.head()


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


Where the Neighborhood is "Not assigned", I fill it in with the Borough name; shape and head to verify results

In [11]:
# Rows whose Neighborhood is still the "Not assigned" placeholder.
unassigned = df["Neighborhood"] == "Not assigned"

# Copy the Borough name into those rows. A single .loc[mask, column] assignment
# covers every row (including the last one) and writes to `df` itself rather
# than to a temporary row copy.
df.loc[unassigned, "Neighborhood"] = df.loc[unassigned, "Borough"]

# Only the last expression of a cell is rendered, so the shape is printed explicitly.
print(df.shape)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


Loading the provided CSV file of geospatial coordinates into a pandas DataFrame and displaying it

In [12]:
# Location of the coordinates CSV.
# NOTE(review): this is an absolute path on one specific machine; the cell will
# not run elsewhere unless the file is placed at the same location.
geocord_csv_path = r"C:\Users\romani edoardo\Desktop\Geospatial_Coordinates.csv"

# Read the file into a DataFrame and preview its first rows.
geocord_df = pd.read_csv(geocord_csv_path)
geocord_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging lat lon coordinates with original dataset, dropping right key column and displaying complete dataset

In [13]:
# Left-join the coordinates onto the postcode table: every row of `df` is kept
# and matched on df["Postcode"] == geocord_df["Postal Code"].
merged = pd.merge(
    df,
    geocord_df,
    how="left",
    left_on="Postcode",
    right_on="Postal Code",
)

# The right-hand key duplicates "Postcode" after the join, so it is removed.
total = merged.drop(columns=["Postal Code"])

total.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


Filter boroughs for only the ones in Toronto

In [14]:
# Boolean mask: True where the Borough name contains "Toronto".
is_toronto_borough = total["Borough"].str.contains("Toronto")

# Keep the matching rows and renumber the index from 0.
Toronto = total[is_toronto_borough].reset_index(drop=True)
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


Then I plot the location of each neighborhood on a map of Toronto

In [245]:
# Coordinates used as the map's initial location.
latitude, longitude = 43.761539, -79.411079

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every neighborhood marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per neighborhood, with the neighborhood name as its popup.
neighborhood_points = zip(Toronto['Latitude'], Toronto['Longitude'], Toronto['Neighborhood'])
for lat, lng, name in neighborhood_points:
    marker = folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(name, parse_html=True),
        **marker_style
    )
    marker.add_to(map_toronto)

map_toronto

Setup of Foursquare API credentials, venue limit and metre radius

In [247]:
# Foursquare application credentials ("XX" stands in for the real values).
Client_ID, Client_Secret = "XX", "XX"

# Value sent as the API's `v` parameter.
Version = "20190315"

# Maximum venues requested per neighborhood, and the search radius in metres.
limit, radius = 10, 500

For each neighborhood, fetch up to 10 venues within a 500-metre radius using the Foursquare API;
the rest of the code organizes the observations into a pandas DataFrame

In [195]:
# Parallel lists: position k of every list describes the same venue.
# The first four repeat the neighborhood the venue was found around;
# the last four hold the venue's own attributes.
boroughs=[]
neighborhoods=[]
neigh_lat=[]
neigh_lon=[]
venues=[]
venues_categories=[]
venues_lats=[]
venues_longs=[]

explore_url = 'https://api.foursquare.com/v2/venues/explore'

# Iterating over the rows directly avoids depending on a 0..n-1 integer index.
for place in Toronto.itertuples(index=False):

    # Let requests build and encode the query string instead of formatting it by hand.
    query = {
        'client_id': Client_ID,
        'client_secret': Client_Secret,
        'v': Version,
        'll': '{},{}'.format(place.Latitude, place.Longitude),
        'radius': radius,
        'limit': limit,
    }

    response = requests.get(explore_url, params=query, timeout=30)
    # Surface HTTP errors (bad credentials, exceeded quota, ...) as an explicit
    # error rather than as a KeyError while digging into the JSON payload.
    response.raise_for_status()
    results = response.json()

    for item in results["response"]["groups"][0]["items"]:
        venue = item["venue"]

        # Guard against a venue with an empty category list, which would
        # otherwise raise IndexError and abort the whole collection run.
        categories = venue["categories"]
        venue_category = categories[0]["name"] if categories else "Uncategorized"

        boroughs.append(place.Borough)
        neighborhoods.append(place.Neighborhood)
        neigh_lat.append(place.Latitude)
        neigh_lon.append(place.Longitude)

        venues.append(venue["name"])
        venues_categories.append(venue_category)
        venues_lats.append(venue["location"]["lat"])
        venues_longs.append(venue["location"]["lng"])
    


In [248]:
# Column name -> collected values. The first four columns describe the
# neighborhood a venue was found around, the last four the venue itself.
venue_dict_ = dict(
    Borough=boroughs,
    Neighborhood=neighborhoods,
    Latitude=neigh_lat,
    Longitude=neigh_lon,
    Venue=venues,
    Category=venues_categories,
    Venue_Latitude=venues_lats,
    Venue_Longitude=venues_longs,
)

In [251]:
# One row per venue, one column per key of venue_dict_.
venue_data = pd.DataFrame(data=venue_dict_)

# Preview the first 20 venues.
venue_data.head(n=20)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Venue,Category,Venue_Latitude,Venue_Longitude
0,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Roselle Desserts,Bakery,43.653447,-79.362017
1,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,Gym / Fitness Center,43.653191,-79.357947
3,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149
5,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Impact Kitchen,Restaurant,43.656369,-79.35698
6,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Dominion Pub and Kitchen,Pub,43.656919,-79.358967
7,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Figs Breakfast & Lunch,Breakfast Spot,43.655675,-79.364503
8,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Corktown Common,Park,43.655618,-79.356211
9,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,The Distillery Historic District,Historic Site,43.650244,-79.359323


Counting and displaying the number of venues per category to understand the data; I notice that most observations are Cafés (including Coffee Shops), Restaurants or Parks

In [200]:
# Number of venues per category, most frequent first.
category_counts = venue_data["Category"].value_counts()
category_counts

Café                     23
Coffee Shop              21
Italian Restaurant       13
Park                     13
Restaurant               12
Bakery                    9
Bar                       8
Gym                       7
Pub                       7
Gastropub                 6
Greek Restaurant          6
Sushi Restaurant          6
Steakhouse                6
Pizza Place               6
Dessert Shop              5
Brewery                   5
Breakfast Spot            5
Japanese Restaurant       5
Gym / Fitness Center      5
Diner                     4
Cocktail Bar              4
Tea Room                  4
Hotel                     4
Ice Cream Shop            4
Bookstore                 4
Supermarket               3
Burger Joint              3
Neighborhood              3
Ramen Restaurant          3
Beer Bar                  3
                         ..
Lake                      1
Cheese Shop               1
Organic Grocery           1
Gift Shop                 1
Bus Line            

I therefore decide to filter Venue data only for venues pertaining to either of these categories

In [204]:
# Build one mask that is True when the category mentions any of the key words.
category = venue_data["Category"]
keep = category.str.contains("Café")
for keyword in ("Coffee", "Restaurant", "Park"):
    keep = keep | category.str.contains(keyword)

# Keep the matching venues and renumber the index from 0.
filtered = venue_data[keep].reset_index(drop=True)
filtered.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Venue,Category,Venue_Latitude,Venue_Longitude
0,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,Coffee Shop,43.653559,-79.361809
1,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Impact Kitchen,Restaurant,43.656369,-79.35698
2,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Corktown Common,Park,43.655618,-79.356211
3,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,Hokkaido Ramen Santouka らーめん山頭火,Ramen Restaurant,43.656435,-79.377586
4,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,Page One Cafe,Café,43.657772,-79.376073


I group labels to only these 3 key words ("Cafe","Restaurant","Park")

In [252]:
# Normalise the Category labels down to the three key words.
# Work on the Category column only, so that a value in another column that
# happens to equal "Coffee Shop" (e.g. a venue's name) is left untouched.
category = filtered["Category"].replace("Coffee Shop", "Café")

# Every "<something> Restaurant" label collapses to plain "Restaurant".
# Vectorised, so it does not depend on the frame having a 0..n-1 index and the
# cell can be re-run safely.
is_restaurant = category.str.contains("Restaurant", regex=False)
category = category.mask(is_restaurant, "Restaurant")

filtered = filtered.assign(Category=category)

# "Skate Park" matched the "Park" key word earlier but is not wanted here.
filtered = filtered[filtered["Category"] != "Skate Park"]
filtered.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Venue,Category,Venue_Latitude,Venue_Longitude
0,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,Café,43.653559,-79.361809
1,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Impact Kitchen,Restaurant,43.656369,-79.35698
2,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Corktown Common,Park,43.655618,-79.356211
3,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,Hokkaido Ramen Santouka らーめん山頭火,Restaurant,43.656435,-79.377586
4,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,Page One Cafe,Café,43.657772,-79.376073


I create dummy (one-hot) variables for the Boroughs (Downtown, Central, East and West Toronto) and the categories,
as I try to group the data into 12 different clusters, according to the 12 different Borough-Category combinations

In [253]:
# One-hot encode the two categorical columns that the clustering is based on.
features_to_encode = ["Borough", "Category"]
dummy = pd.get_dummies(filtered[features_to_encode])
dummy.head()

Unnamed: 0,Borough_Central Toronto,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto,Category_Café,Category_Park,Category_Restaurant
0,0,1,0,0,1,0,0
1,0,1,0,0,0,0,1
2,0,1,0,0,0,1,0
3,0,1,0,0,0,0,1
4,0,1,0,0,1,0,0


Run k-means algorithm and insert cluster labels back into filtered dataset

In [254]:
# Number of clusters requested from k-means.
kclusters = 12

# Fit k-means on the one-hot features; random_state fixes the initialisation so
# the labels are reproducible between runs.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dummy)

# Attach the labels to a copy rather than inserting into `dummy` in place:
# `dummy` stays a pure feature matrix, and re-running this cell neither raises
# "cannot insert Cluster, already exists" nor feeds the label back in as a feature.
clustered = dummy.copy()
clustered.insert(0, 'Cluster', kmeans.labels_)

# Side-by-side join (aligned on the index) of the venue data with its cluster
# label and one-hot columns.
final=pd.concat([filtered,clustered],axis=1)

final.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Venue,Category,Venue_Latitude,Venue_Longitude,Cluster,Borough_Central Toronto,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto,Category_Café,Category_Park,Category_Restaurant
0,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,Café,43.653559,-79.361809,2,0,1,0,0,1,0,0
1,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Impact Kitchen,Restaurant,43.656369,-79.35698,0,0,1,0,0,0,0,1
2,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,Corktown Common,Park,43.655618,-79.356211,4,0,1,0,0,0,1,0
3,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,Hokkaido Ramen Santouka らーめん山頭火,Restaurant,43.656435,-79.377586,0,0,1,0,0,0,0,1
4,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,Page One Cafe,Café,43.657772,-79.376073,2,0,1,0,0,1,0,0


Plot Labelled Borough-Category combinations for the relevant venues

In [256]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# One hex colour per cluster label, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Each row of `final` is a venue, so the marker is drawn at the venue's own
# coordinates; the neighborhood coordinates would stack every venue of a
# neighborhood on a single point.
# NOTE: the markers are added to the existing `map_toronto` object, so running
# this cell more than once adds them again.
for lat, lon, poi, cluster in zip(final['Venue_Latitude'], final['Venue_Longitude'], final['Venue'], final['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # Cluster labels start at 0, so they index the colour list directly.
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_toronto)

map_toronto