# Segmenting and Clustering Neighborhoods in Toronto

In [175]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#conda install -c conda-forge folium=0.5.0 --yes 
import folium 
from pandas.io.json import json_normalize 

In [176]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of letting
# a later cell try to parse an error page.
obj = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
obj.raise_for_status()
page = obj.text

In [177]:
# Parse the downloaded HTML and locate the sortable wiki table.
soup = BeautifulSoup(page, "lxml")
Table = soup.find('table', class_='wikitable sortable')

# read_html returns a list of frames; the first one is the table itself, and
# its first three positional columns are mapped to named columns below.
ls = pd.read_html(str(Table))
dt = {
    "PostalCode": ls[0][0],
    "Borough": ls[0][1],
    "Neighbourhood": ls[0][2],
}
rawData = pd.DataFrame.from_dict(dt)

# Remove every row whose borough is 'Not assigned' ...
is_unassigned = rawData['Borough'] == 'Not assigned'
indexNames = rawData.index[is_unassigned]
rawData.drop(index=indexNames, inplace=True)
# ... and then the row labelled 0.
rawData.drop(index=0, inplace=True)

# Order by borough, cache the cleaned table on disk, and preview it.
PData = rawData.sort_values(by='Borough', ascending=True)
PData.to_csv("TorontoData.csv", index=False)
PData.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
213,M4V,Central Toronto,Forest Hill SE
198,M4T,Central Toronto,Summerhill East
197,M4T,Central Toronto,Moore Park
212,M4V,Central Toronto,Deer Park
184,M4S,Central Toronto,Davisville


In [178]:
# Reload the cached table, using the postal code as the row index.
Todata = pd.read_csv("TorontoData.csv", index_col="PostalCode")
Todata.head()

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M4V,Central Toronto,Forest Hill SE
M4T,Central Toronto,Summerhill East
M4T,Central Toronto,Moore Park
M4V,Central Toronto,Deer Park
M4S,Central Toronto,Davisville


In [179]:
# Load the latitude/longitude table, indexed by its "Postal Code" column.
LLData = pd.read_csv("http://cocl.us/Geospatial_data", index_col="Postal Code")
LLData.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [180]:
# Attach coordinates to every neighbourhood row by joining on the shared
# postal-code index.
Todata = Todata.join(LLData)
# Keep the postal-code index in the cached file: writing with index=False
# dropped it, so the table read back from disk had no PostalCode column.
Todata.to_csv("TorontLocation.csv", index=True, index_label="PostalCode")
Todata.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
M1B,Scarborough,Malvern,43.806686,-79.194353
M1B,Scarborough,Rouge,43.806686,-79.194353
M1C,Scarborough,Rouge Hill,43.784535,-79.160497
M1C,Scarborough,Port Union,43.784535,-79.160497
M1C,Scarborough,Highland Creek,43.784535,-79.160497


In [181]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are consumed together with ``zip``.
    radius : int, optional
        Value sent as the request's ``radius`` parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when no venue
        is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT from the notebook's
    global namespace.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # Let requests build and escape the query string instead of
        # formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        reply = requests.get(endpoint, params=params, timeout=30)
        # Fail loudly on an HTTP error rather than with a KeyError below.
        reply.raise_for_status()

        groups = reply.json()["response"].get('groups') or []
        items = groups[0]['items'] if groups else []

        # Keep only the relevant fields of each nearby venue.
        for v in items:
            venue = v['venue']
            # A venue may come back without any category; record None
            # instead of raising IndexError.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works when `rows`
    # is empty, unlike assigning `.columns` afterwards.
    nearby_venues = pd.DataFrame(rows, columns=column_names)
    return nearby_venues

In [182]:
def CreateMap(location, latitude, longitude, data=None):
    """Build a folium map with one circle marker per neighbourhood row.

    Parameters
    ----------
    location : object
        Not used by this function; kept so existing calls keep working.
    latitude, longitude : float
        Centre of the map.
    data : pandas.DataFrame, optional
        Frame with 'Latitude', 'Longitude', 'Borough' and 'Neighbourhood'
        columns. Defaults to the notebook-level ``Todata``.

    Returns
    -------
    folium.Map
        The map with all markers added.
    """
    if data is None:
        data = Todata
    mapOfToronto = folium.Map(location=[latitude, longitude], zoom_start=11)

    # Rows without coordinates cannot be placed on the map, so skip them.
    placeable = data.dropna(subset=['Latitude', 'Longitude'])

    # Add one marker per row; the popup reads "<neighbourhood>, <borough>".
    for lat, lng, borough, neighbourhood in zip(placeable['Latitude'],
                                                placeable['Longitude'],
                                                placeable['Borough'],
                                                placeable['Neighbourhood']):
        label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
        # parse_html belongs to Popup only, so it is no longer passed to
        # CircleMarker.
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7).add_to(mapOfToronto)
    return mapOfToronto

In [183]:

def getTorontoCoords(query=None, user_agent="toronto_neighbourhood_explorer"):
    """Geocode an address with Nominatim.

    Parameters
    ----------
    query : str, optional
        Address to look up. Defaults to the notebook-level ``address``.
    user_agent : str, optional
        Identifier sent to the Nominatim service. geopy warns about (and in
        newer releases rejects) its built-in default, so one is always given.

    Returns
    -------
    tuple
        ``(location, latitude, longitude)`` in that order.

    Raises
    ------
    ValueError
        If the geocoder finds no match for the address.
    """
    if query is None:
        query = address
    geolocator = Nominatim(user_agent=user_agent)
    location = geolocator.geocode(query)
    # geocode() returns None when nothing matches; report that clearly
    # instead of failing with an AttributeError on the next line.
    if location is None:
        raise ValueError("Could not geocode address: {!r}".format(query))
    latitude = location.latitude
    longitude = location.longitude
    return location, latitude, longitude

In [184]:
# Foursquare API settings read by getNearbyVenues().
# NOTE(review): the two credential strings look like placeholders; real values
# should not be committed with the notebook.
CLIENT_ID = 'xxxxxxxxxxxxxxx'
CLIENT_SECRET = 'yyyyyyyyyyyyyyyyyyyyyyyyy'
VERSION = '20180604'
LIMIT = 30
radius = 500

address = 'Toronto'
# getTorontoCoords() returns (location, latitude, longitude). The names were
# previously unpacked as (location, longitude, latitude), which swapped the two
# coordinates and centred every map on the wrong point.
location, latitude, longitude = getTorontoCoords()
mapOfToronto = CreateMap(location, latitude, longitude)
mapOfToronto

  This is separate from the ipykernel package so we can avoid doing imports until


<img src="fig1.jpg" width=1227 /> 

In [None]:
# Collect the nearby venues of every neighbourhood.
# The hand-built `url` string that used to sit here was never used
# (getNearbyVenues builds its own request) and has been removed; the configured
# `radius` is now actually forwarded to the call.
torontoVenues = getNearbyVenues(
    names=Todata['Neighbourhood'],
    latitudes=Todata['Latitude'],
    longitudes=Todata['Longitude'],
    radius=radius,
)

In [253]:
Todata = Todata.reset_index()

# One indicator column per venue category, plus the neighbourhood name
# appended as the last column.
torontoVenuesOnehot = pd.get_dummies(
    torontoVenues[['Venue Category']], prefix="", prefix_sep=""
).assign(Neighbourhood=torontoVenues['Neighborhood'])

# Rotate the column order so that the last column becomes the first one.
fixed_columns = list(torontoVenuesOnehot.columns)
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
torontoVenuesOnehot = torontoVenuesOnehot[fixed_columns]

# Mean of each indicator per neighbourhood.
torontoGrouped = torontoVenuesOnehot.groupby('Neighbourhood').mean().reset_index()


In [254]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first element of ``row`` is skipped, the remaining values are sorted
    in descending order, and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    return (
        row.iloc[1:]
        .sort_values(ascending=False)
        .index.values[:num_top_venues]
    )

In [273]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# Build the column headers: "1st", "2nd", "3rd", then "<n>th" for the rest.
# The suffix is chosen with an explicit bounds check instead of a bare
# `except:` that would also have hidden unrelated errors.
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Create the result frame: one row per neighbourhood, filled with its most
# common venue categories in descending order of frequency.
Neighhbourhoods_venues_sorted = pd.DataFrame(columns=columns)
Neighhbourhoods_venues_sorted['Neighbourhood'] = torontoGrouped['Neighbourhood']
for ind in np.arange(torontoGrouped.shape[0]):
    Neighhbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(torontoGrouped.iloc[ind, :], num_top_venues)


In [None]:
# Reload the cached neighbourhood/coordinate table.
Todata = pd.read_csv("TorontLocation.csv")

# CLUSTERING
# set number of clusters
kclusters = 5

# Feature matrix: every column except the neighbourhood name.
# - `columns=` replaces the positional axis argument, which newer pandas
#   releases no longer accept.
# - fillna() returns a new frame, so its result has to be kept; previously
#   it was computed and thrown away.
torontoClustering = torontoGrouped.drop(columns='Neighbourhood').fillna(0)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0, max_iter=1000).fit(torontoClustering)

# Put the cluster label in the first column. Any label column left over from
# an earlier run of this cell is dropped first, because insert() raises when
# the column already exists.
if 'Cluster Labels' in Neighhbourhoods_venues_sorted.columns:
    Neighhbourhoods_venues_sorted = Neighhbourhoods_venues_sorted.drop(columns='Cluster Labels')
Neighhbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the cluster label and top venues to each neighbourhood's location row.
torontoAllData = Todata.join(Neighhbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

# Cell output: the rows that have complete data. This does not modify
# torontoAllData itself.
torontoAllData.dropna()


In [276]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Rows with missing values (e.g. no cluster label) cannot be drawn.
torontoAllData = torontoAllData.dropna()

# One evenly spaced rainbow colour per cluster.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(torontoAllData['Latitude'], torontoAllData['Longitude'], torontoAllData['Neighbourhood'], torontoAllData['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to kclusters-1, so they index the palette
    # directly; the former `cluster-1` only worked for label 0 because a
    # negative index wraps around to the end of the list.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

<img src="fig2.jpg" width=1227 /> 