# Cluster of neighborhoods in Toronto
This work clusters the neighborhoods in Toronto

## 1. First part of assignment
Import libraries.

In [275]:
#!pip3 install wikipedia
#!pip3 install lxml
#!pip3 install folium
import pandas as pd
import numpy as np
import folium 
import requests # library to handle requests
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# import k-means from clustering stage
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### Read the table from the  wekipedia link.

In [276]:
link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
neighborhoods = pd.read_html(link,header=0)[0]
neighborhoods.head(10)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


 ###  Data cleaning
 
 Drop the rows with a borough that is **Not assigned**

In [277]:
nrow = neighborhoods.shape[0] # initial number of rows
neighborhoods=neighborhoods[neighborhoods.Borough != 'Not assigned']
print('{} rows are dropped with Borough = \'Not assigned\''.format(nrow-neighborhoods.shape[0]))

77 rows are dropped with Borough = 'Not assigned'


Regroup Postal code duplicates into a single cell

In [278]:
freq_PC = neighborhoods['Postal code'].value_counts() # frequency counts
freq_PC = freq_PC[freq_PC>1].to_frame()
freq_PC.shape

(0, 1)

**No duplicates are found in the postal codes. Checked on the wikipedia page that the neighborhoods with the same postal code has already grouped together with a '/' seperated. Instead, I changed the seperator into a comma as instructed in the assignement.**

In [279]:
neighborhoods['Neighborhood'] = neighborhoods['Neighborhood'].str.replace('/', ', ')
neighborhoods['Neighborhood'] = neighborhoods['Neighborhood'].str.replace(' ,', ',')
neighborhoods.head(10)

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


If a cell has a borough but a Not assigned neighborhood, change the corresponding neighborhood to be the same as the borough.

In [280]:
print('{} number of NaN '.format(neighborhoods['Neighborhood'].isna().sum()))
print('{} number of \'Not assigned\''.format((neighborhoods['Neighborhood']=='Not assigned').sum()))

0 number of NaN 
0 number of 'Not assigned'


**No 'Not assigned' neighborhood.**

Shape of cleaned data:

In [281]:
neighborhoods.shape

(103, 3)

## 2. Second part of the assignment

Read latitude and longitude data. Checked locally that the geocoder package is not stable. 

In [282]:
location =  pd.read_csv('http://cocl.us/Geospatial_data')
location.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge data to include latitudes and longitudes in neighborhood data.

In [283]:
location.rename(columns={'Postal Code':'Postal code'}, inplace=True)
neighborhoods = pd.merge(neighborhoods,location,how='left', on='Postal code')

neighborhoods.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## 3. Third part of the assignment: clustering

Define a function to get latitude and longitude from given address

In [284]:
def getLocation(address):
    # function to get location from address
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    return latitude,longitude

Select only boroughs contain 'Toronto'

In [285]:
Toronto = neighborhoods[neighborhoods['Borough'].str.contains('Toronto')].reset_index(drop=True)
print(Toronto.shape)
Toronto.head()
# Toronto = neighborhoods

(39, 5)


Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


Create a map of Toronto with marked Boroughs.

In [286]:
(latitude, longitude) = getLocation('Toronto')
map_t = folium.Map(location=[latitude, longitude], zoom_start=11)
for lat, lng, label in zip(Toronto['Latitude'], Toronto['Longitude'], Toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_t)  
map_t 

Give Foursquare crenditials. 

In [287]:
CLIENT_ID = 'MVTOPPH0TGRY3NECS3K53CTS0V1IAFJRG4DPF1JGAIEBCO4W' # Foursquare ID
CLIENT_SECRET = 'MN2AKKTPXWIKX13IUC1F0XHDXOVB1XHXMV5Q2CVTUW5RE1AB' # Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100
radius = 500

Get the top 100 venues in Toronto within a radius of 500 meters

In [288]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)
Toronto_venues = getNearbyVenues(names=Toronto['Neighborhood'],
                                   latitudes=Toronto['Latitude'],
                                   longitudes=Toronto['Longitude']
                                  )

print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 231 uniques categories.


One hot encoding and grouped by neighborhoods

In [289]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronton_onehot = Toronto_onehot[fixed_columns]

Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.shape

(39, 231)

In [290]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Café,Bakery
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Nightclub,Burrito Place
2,Business reply mail Processing CentrE,Yoga Studio,Auto Workshop,Light Rail Station,Skate Park,Smoke Shop
3,"CN Tower, King and Spadina, Railway Lands, ...",Airport Service,Airport Lounge,Airport Terminal,Airport,Harbor / Marina
4,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Salad Place


### K-means clustering

In [291]:
# set number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, init='k-means++', random_state=0).fit(Toronto_grouped_clustering)

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged = Toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
Toronto_merged

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Breakfast Spot
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Sushi Restaurant,Diner,Mexican Restaurant,Burger Joint
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Café,Restaurant,Bubble Tea Shop
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Café,Gastropub,American Restaurant,Cocktail Bar
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Trail,Pub,Yoga Studio,Deli / Bodega
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Seafood Restaurant,Café,Bakery
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Salad Place
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564,0,Grocery Store,Café,Park,Candy Store,Restaurant
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,0,Coffee Shop,Café,Restaurant,Deli / Bodega,Clothing Store
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,0,Bakery,Pharmacy,Grocery Store,Pool,Liquor Store


### Display clusters

In [292]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine clusters

In [293]:
First cluster:

SyntaxError: invalid syntax (<ipython-input-293-1531b67781eb>, line 1)

In [None]:
cLabel=0
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == cLabel, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Second cluster: 

In [None]:
cLabel=1
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == cLabel, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Third cluster:

In [None]:
cLabel=2
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == cLabel, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Fourth cluster:

In [None]:
cLabel=3
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == cLabel, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]