# Welcome to the first part of Clustering Toronto neighborhoods

In [1]:
#step 1 : importing libraries

import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
#step 2 : Getting a dataframe

web_text = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(web_text,'xml')

toronto = soup.find('table',{'class':'wikitable sortable'})
toronto_rows = toronto.find_all('tr')

data = []
for row in toronto_rows:
    data.append([t.text.strip() for t in row.find_all('td')])

toronto_df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Now, we will clean our data through several steps.

In [3]:
#Step 1 : deleting not assigned data

toronto_cleaned1 = toronto_df[toronto_df.Borough != 'Not assigned'].reset_index(drop=True)
toronto_cleaned1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,,,
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Regent Park, Harbourfront"
4,M6A,North York,"Lawrence Manor, Lawrence Heights"


In [4]:
#Step 2 : merging neighborhoods with same postal code, using a coma

toronto_cleaned2 = toronto_cleaned1.groupby(['PostalCode','Borough'], as_index=False).agg(lambda x: ','.join(x))
toronto_cleaned2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
#Step 3 : not assigned neighborhood : then the neighborhood = borough

absentneigh = toronto_cleaned2['Neighbourhood'] == "Not assigned"
toronto_cleaned2.loc[absentneigh, 'Neighbourhood'] = toronto_cleaned2.loc[absentneigh, 'Borough']
toronto_cleaned2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Let us now look at the shape of our data 

In [6]:
toronto_cleaned2.shape[0]

103

### This is the end of the first part of the task. 

# Welcome to the second part of Clustering Toronto

Now we want to add the latitude and longitude of each postal code to our data.

In [7]:
#first let's import the postalcode and long / lat

from pandas import read_csv
pc="http://cocl.us/Geospatial_data"
postalcode=read_csv(pc)
postalcode.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
#now let's merge !

postalcode.rename(index=str, columns={"Postal Code": "PostalCode"}, inplace = True)
neightoronto = pd.merge(toronto_cleaned2, postalcode, on='PostalCode', how='inner')

In [10]:
# the data must look like this 

neightoronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### This is the end of the second task.

# Welcome to the third part of Clustering Toronto 

### Now, we will explore the different neighborhoods of Toronto and cluster them.

In [11]:
# First, let's visualize the location of each neighbourhood. For that, we need to import some more libraries.
# Nominatim geocodes the city, cm/colors build the cluster colours, KMeans does
# the clustering and folium draws the maps in the cells below.

from geopy.geocoders import Nominatim 
# requests is already imported in the first cell; importing it again is harmless.
import requests 
# NOTE(review): json_normalize is not used in the cells shown here, and newer
# pandas exposes it as pandas.json_normalize — confirm before changing or removing.
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

# Installs the pinned folium release from inside the notebook before importing it.
! pip install folium==0.5.0
import folium

print('Libraries imported.')

Collecting folium==0.5.0
  Downloading folium-0.5.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 6.9 MB/s  eta 0:00:01
[?25hCollecting branca
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Created wheel for folium: filename=folium-0.5.0-py3-none-any.whl size=76240 sha256=5ebc603efaed37f4347aa6bb374570d8d90ca8db94fcbb5f8c58489500b3e18b
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/b2/2f/2c/109e446b990d663ea5ce9b078b5e7c1a9c45cca91f377080f8
Successfully built folium
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.5.0
Libraries imported.


In [12]:
# Obtain coordinates of Toronto, Canada

index = neightoronto['Borough'].str.contains('Toronto')

df_subset = neightoronto[index]

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))


The geographical coordinates of Toronto are 43.6534817, -79.3839347.


In [13]:
# Create map of Toronto, ON 
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 12)

# Add neighbourhood markers to map
for lat, lng, borough, neighb in zip(df_subset['Latitude'], df_subset['Longitude'], df_subset['Borough'], df_subset['Neighbourhood']):
    label = '{}, {}'.format(neighb, borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat,lng],
        radius = 5,
        popup = label,
        color = 'green',
        fill = True, 
        fill_color = 'blue',
        fill_opacity = 0.8,
        parse_html = False).add_to(map_toronto)

map_toronto

In [14]:
#we need foursquare credentials

CLIENT_ID = 'XV03ZS242TKYNPF22COZXISGUNOEMN15G55NQBYXUYQYJQCY' 
CLIENT_SECRET = 'QGYTQ5GYJ4SLNYTMOJETVPXKEOZQ0ZPYXONZ4KW5BOM4DI1M' 
VERSION = '20180604'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [15]:
#now let's look at the venues close by

def getNearbyVenues(names, latitudes, longitudes, radius=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [16]:
toronto_venues = getNearbyVenues(names = df_subset['Neighbourhood'], 
                                 latitudes = df_subset['Latitude'],
                                 longitudes = df_subset['Longitude']
                                )

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West,  Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High

In [17]:
#let's get a vue of the venues and neigh's coordinates
print(toronto_venues.shape)
toronto_venues.head()

(100, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,Davisville,43.704324,-79.38879,Jules Cafe Patisserie,43.704138,-79.388413,Dessert Shop
2,Davisville,43.704324,-79.38879,Thobors Boulangerie Patisserie Café,43.704514,-79.388616,Café
3,Davisville,43.704324,-79.38879,XO Gelato,43.705177,-79.388793,Dessert Shop
4,Davisville,43.704324,-79.38879,Positano,43.704558,-79.388639,Italian Restaurant


In [18]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",1,1,1,1,1,1
Central Bay Street,4,4,4,4,4,4
Christie,1,1,1,1,1,1
"Commerce Court, Victoria Hotel",17,17,17,17,17,17
Davisville,9,9,9,9,9,9
"First Canadian Place, Underground city",12,12,12,12,12,12
"Garden District, Ryerson",2,2,2,2,2,2
"Harbourfront East, Union Station, Toronto Islands",2,2,2,2,2,2
"Kensington Market, Chinatown, Grange Park",4,4,4,4,4,4
"Little Portugal, Trinity",7,7,7,7,7,7


In [19]:
# One-hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix = "", prefix_sep = "")

# Add neighborhood column back to dataframe and move to first column
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

fixed_columns = [['Neighbourhood'] + list(toronto_onehot.columns[toronto_onehot.columns != 'Neighbourhood'])]
toronto_onehot = toronto_onehot[fixed_columns[0]]

print(toronto_onehot.shape)
toronto_onehot.head()

(100, 61)


Unnamed: 0,Neighbourhood,American Restaurant,Art Gallery,Asian Restaurant,Bakery,Bank,Bar,Beer Store,Bookstore,Breakfast Spot,...,Supermarket,Sushi Restaurant,Taco Place,Tea Room,Thai Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Davisville,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Davisville,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Davisville,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Davisville,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()

print(toronto_grouped.shape)
toronto_grouped.head()

(20, 61)


Unnamed: 0,Neighbourhood,American Restaurant,Art Gallery,Asian Restaurant,Bakery,Bank,Bar,Beer Store,Bookstore,Breakfast Spot,...,Supermarket,Sushi Restaurant,Taco Place,Tea Room,Thai Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Yoga Studio
0,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Commerce Court, Victoria Hotel",0.058824,0.058824,0.0,0.058824,0.058824,0.0,0.0,0.058824,0.0,...,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0


In [21]:
#now we can classify the top 5 venues for each neighborhoods

num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['VENUE','FREQ']
    temp = temp.iloc[1:]
    temp['FREQ'] = temp['FREQ'].astype(float)
    temp = temp.round({'FREQ': 2})
    print(temp.sort_values('FREQ', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                   VENUE  FREQ
0  Performing Arts Venue   1.0
1    American Restaurant   0.0
2            Art Gallery   0.0
3   Gym / Fitness Center   0.0
4                 Hostel   0.0


----Central Bay Street----
            VENUE  FREQ
0     Coffee Shop  0.50
1        Pharmacy  0.25
2  Sandwich Place  0.25
3     Salad Place  0.00
4          Hostel  0.00


----Christie----
                  VENUE  FREQ
0             Nightclub   1.0
1   American Restaurant   0.0
2           Art Gallery   0.0
3  Gym / Fitness Center   0.0
4                Hostel   0.0


----Commerce Court, Victoria Hotel----
                 VENUE  FREQ
0  American Restaurant  0.06
1                 Café  0.06
2          Salad Place  0.06
3       Sandwich Place  0.06
4   Seafood Restaurant  0.06


----Davisville----
                VENUE  FREQ
0        Dessert Shop  0.22
1                Café  0.22
2      

In [22]:
## Function to sort the venues 
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

## Set up the dataframe parameters 
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"CN Tower, King and Spadina, Railway Lands, Har...",Performing Arts Venue,Yoga Studio,Gift Shop,Gastropub,Garden,Food Court,Fast Food Restaurant,Farmers Market,Diner,Dessert Shop
1,Central Bay Street,Coffee Shop,Sandwich Place,Pharmacy,Yoga Studio,Cocktail Bar,Gastropub,Garden,Food Court,Fast Food Restaurant,Farmers Market
2,Christie,Nightclub,Yoga Studio,Gift Shop,Gastropub,Garden,Food Court,Fast Food Restaurant,Farmers Market,Diner,Dessert Shop
3,"Commerce Court, Victoria Hotel",American Restaurant,Gym,Café,Burrito Place,Pub,Deli / Bodega,Salad Place,Sandwich Place,Seafood Restaurant,Soup Place
4,Davisville,Dessert Shop,Café,Toy / Game Store,Sushi Restaurant,Italian Restaurant,Seafood Restaurant,Coffee Shop,Garden,Food Court,Fast Food Restaurant


In [30]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [31]:
# add clustering labels
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,"CN Tower, King and Spadina, Railway Lands, Har...",Performing Arts Venue,Yoga Studio,Gift Shop,Gastropub,Garden,Food Court,Fast Food Restaurant,Farmers Market,Diner,Dessert Shop
1,0,Central Bay Street,Coffee Shop,Sandwich Place,Pharmacy,Yoga Studio,Cocktail Bar,Gastropub,Garden,Food Court,Fast Food Restaurant,Farmers Market
2,1,Christie,Nightclub,Yoga Studio,Gift Shop,Gastropub,Garden,Food Court,Fast Food Restaurant,Farmers Market,Diner,Dessert Shop
3,0,"Commerce Court, Victoria Hotel",American Restaurant,Gym,Café,Burrito Place,Pub,Deli / Bodega,Salad Place,Sandwich Place,Seafood Restaurant,Soup Place
4,0,Davisville,Dessert Shop,Café,Toy / Game Store,Sushi Restaurant,Italian Restaurant,Seafood Restaurant,Coffee Shop,Garden,Food Court,Fast Food Restaurant


In [42]:
toronto_merged= df_subset

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')


toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,2.0,Trail,Yoga Studio,Cocktail Bar,Gastropub,Garden,Food Court,Fast Food Restaurant,Farmers Market,Diner,Dessert Shop
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,,,,,,,,,,,
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,,,,,,,,,,,
43,M4M,East Toronto,Studio District,43.659526,-79.340923,,,,,,,,,,,
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,,,,,,,,,,,


In [43]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)


# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon, poi],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

TypeError: must be real number, not str

# I was unable to plot the final map, I'm sorry. I tried my best to solve the issue, but I did not find an answer.