# Segmenting and Clustering Neighborhoods in Toronto





### Step 1 - Pull the list of postal codes from the Wikipedia page and load it into a pandas dataframe.

In [1]:
import pandas as pd

# Wikipedia page that lists the postal codes beginning with "M".
postal_codes_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns one DataFrame per HTML table on the page; the first one
# is the postal-code table used throughout this notebook.
page_tables = pd.read_html(postal_codes_url)
df = page_tables[0]
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


#### Remove the rows whose "Borough" column is "Not assigned".

In [3]:
# Keep only the rows whose borough has actually been assigned.
has_borough = df["Borough"].ne("Not assigned")
df = df.loc[has_borough]
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


#### Rows with the same postal code will be combined into one row, with the neighborhoods separated by a comma.

In [5]:
def _join_neighborhoods(names):
    """Concatenate the neighbourhood names of one group into a comma-separated string."""
    return ','.join(names.astype(str))


# One output row per (postal code, borough) pair.
grouped_neighborhoods = df.groupby(['Postal code', 'Borough'])['Neighborhood']
df2 = grouped_neighborhoods.apply(_join_neighborhoods).reset_index()
df2.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### If a cell has a valid value in the column "Borough" but a Not assigned value in the column "Neighborhood", then the "Neighborhood" will be the same as the "Borough".

In [6]:
# A neighbourhood without a real name shows up either as the literal
# "Not assigned" or as the string "nan" (a missing cell stringified by the
# astype(str) join above). In both cases fall back to the borough name.
unnamed = df2['Neighborhood'].isin(['Not assigned', 'nan'])
df2.loc[unnamed, 'Neighborhood'] = df2.loc[unnamed, 'Borough']

#### Print the number of rows in the final dataframe using the .shape attribute.

In [7]:
# len() of a DataFrame is its row count, i.e. the same value as shape[0].
print("Number of rows in this dataframe:", len(df2))

Number of rows in this dataframe: 103


### Step 2 - Add latitude and longitude coordinates  (Geospatial data) 

#### Read the Geospatial data CSV file with pandas read_csv function.

In [8]:
# CSV that maps each postal code to a latitude/longitude pair.
geospatial_csv_url = 'http://cocl.us/Geospatial_data'
da = pd.read_csv(geospatial_csv_url)
da.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge both dataframes on common Postal Code.

In [10]:
# Join on the postal code. The key columns differ only in capitalisation
# ('Postal code' vs 'Postal Code'), so both are kept in the merged frame.
df_da = df2.merge(da, left_on='Postal code', right_on='Postal Code')
df_da.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,M1B,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,M1C,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


 ### Step 3 - Explore and cluster neighborhoods in Toronto.

#### Check out the North York borough of Toronto.

In [11]:
# Restrict the merged table to the North York borough and renumber the rows from 0.
is_north_york = df_da['Borough'].eq('North York')
df_northyork = df_da.loc[is_north_york].reset_index(drop=True)
df_northyork

Unnamed: 0,Postal code,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M2H,North York,Hillcrest Village,M2H,43.803762,-79.363452
1,M2J,North York,Fairview / Henry Farm / Oriole,M2J,43.778517,-79.346556
2,M2K,North York,Bayview Village,M2K,43.786947,-79.385975
3,M2L,North York,York Mills / Silver Hills,M2L,43.75749,-79.374714
4,M2M,North York,Willowdale / Newtonbrook,M2M,43.789053,-79.408493
5,M2N,North York,Willowdale,M2N,43.77012,-79.408493
6,M2P,North York,York Mills West,M2P,43.752758,-79.400049
7,M2R,North York,Willowdale,M2R,43.782736,-79.442259
8,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
9,M3B,North York,Don Mills,M3B,43.745906,-79.352188


#### Import libraries.

In [14]:
import numpy as np
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!pip install folium
import folium
print("Done")

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 13.5MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1
Done


In [15]:
# Reference coordinates for North York (taken from latlong.net).
ny_lat = 43.761539
ny_long = -79.411079

# Base map centred on those coordinates.
map_northyork = folium.Map(location=[ny_lat, ny_long], zoom_start=11)

# Draw one blue circle per row of df_northyork; its popup shows the neighbourhood name.
marker_rows = zip(df_northyork['Latitude'], df_northyork['Longitude'], df_northyork['Neighborhood'])
for lat, lng, neighborhood_name in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(neighborhood_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_northyork)

map_northyork

In [16]:
import os
import warnings

# Foursquare API credentials are read from the environment so that no secret
# is stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# exposed and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests made later in this notebook will be rejected.'
    )

# Value sent as the `v` (API version) parameter of every Foursquare request.
VERSION = '20200312'

#### Function to make pulling nearby venues easier:

In [17]:
def getNearbyVenues(names, latitudes, longitudes, radius=800, limit=300):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of every location to explore; they are consumed
        in lockstep with ``zip``.
    radius : int, default 800
        Search radius sent to the API. The previous implementation ignored
        this argument and always sent 800, so 800 is kept as the default to
        leave the results of existing calls unchanged.
    limit : int, default 300
        Maximum number of venues requested per location (the value that used
        to be hard-coded).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but still has these columns, when no venue is found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status (e.g. bad credentials).
    """
    venue_columns = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        # Fail with a clear HTTP error instead of a KeyError on the payload.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            # A venue without any category no longer raises IndexError.
            categories = venue.get('categories') or []
            category_name = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category_name,
            ))

    # Passing the columns to the constructor also works when venue_rows is empty.
    return pd.DataFrame(venue_rows, columns=venue_columns)

#### Call Foursquare to get the nearby venues for each Neighbourhood:

In [19]:
# Collect the nearby venues for every North York row; the three columns are
# passed in the order (names, latitudes, longitudes).
northyork_venues = getNearbyVenues(
    df_northyork['Neighborhood'],
    df_northyork['Latitude'],
    df_northyork['Longitude'],
)

Hillcrest Village
Fairview / Henry Farm / Oriole
Bayview Village
York Mills / Silver Hills
Willowdale / Newtonbrook
Willowdale
York Mills West
Willowdale
Parkwoods
Don Mills
Don Mills
Bathurst Manor / Wilson Heights / Downsview North
Northwood Park / York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Bedford Park / Lawrence Manor East
Lawrence Manor / Lawrence Heights
Glencairn
North Park / Maple Leaf Park / Upwood Park
Humber Summit
Humberlea / Emery


#### Convert the venue data to one-hot encoding, then build a data frame with it and the corresponding neighbourhoods.

In [21]:
# One indicator column per venue category.
northyork_onehot = pd.get_dummies(northyork_venues[['Venue Category']], prefix="", prefix_sep="")
northyork_onehot['Neighborhood'] = northyork_venues['Neighborhood']

# Move 'Neighborhood' to the front. The previous expression added a string to
# an Index, which produced prefixed names rather than a column order, and the
# result was never applied. Selecting by name also stays correct if a venue
# category happens to be called 'Neighborhood'.
fixed_columns = ['Neighborhood'] + [col for col in northyork_onehot.columns if col != 'Neighborhood']
northyork_onehot = northyork_onehot[fixed_columns]

# Mean of each indicator per neighbourhood = share of its venues in that category.
northyork_venuelist = northyork_onehot.groupby('Neighborhood').mean().reset_index()

#### Find the 5 most common venues for each Neighbourhood.

In [22]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are ordered
    by value, largest first, and the first ``num_top_venues`` index labels
    are returned as an array.
    """
    ranked_labels = row.iloc[1:].sort_values(ascending=False).index
    return ranked_labels.values[:num_top_venues]
num_top_venues = 5

# Ordinal suffixes for ranks 1-3; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# Column headers: 'Neighborhood', '1st Most Common Venue', '2nd Most Common Venue', ...
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of catching every exception with a bare except.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Empty frame with the final columns, then one row per neighbourhood.
ny_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
ny_neighborhoods_venues_sorted['Neighborhood'] = northyork_venuelist['Neighborhood']

# Fill the venue columns of each row with that neighbourhood's top categories.
for ind in np.arange(northyork_venuelist.shape[0]):
    ny_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(
        northyork_venuelist.iloc[ind, :], num_top_venues
    )

ny_neighborhoods_venues_sorted.head()
            

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Bathurst Manor / Wilson Heights / Downsview North,Pizza Place,Coffee Shop,Bank,Gas Station,Park
1,Bayview Village,Japanese Restaurant,Bank,Shopping Mall,Chinese Restaurant,Skating Rink
2,Bedford Park / Lawrence Manor East,Coffee Shop,Italian Restaurant,Restaurant,Sandwich Place,Wings Joint
3,Don Mills,Japanese Restaurant,Coffee Shop,Gym,Bus Line,Restaurant
4,Downsview,Vietnamese Restaurant,Coffee Shop,Pizza Place,Gas Station,Grocery Store


#### Now, let's run K-Means to group the neighbourhoods into 5 clusters.

In [23]:
# Number of clusters requested from K-Means.
kclusters = 5

# Feature matrix: every column except the neighbourhood label.
# `columns=` is used because pandas 2.x no longer accepts the axis as a
# positional argument (the old `drop('Neighborhood', 1)` form).
northyork_venuelist_clustering = northyork_venuelist.drop(columns='Neighborhood')

# A fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(northyork_venuelist_clustering)

# Cluster labels of the first ten neighbourhoods.
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int32)

In [25]:
# Attach the K-Means label of each neighbourhood as the first column.
# DataFrame.insert raises if the column already exists, so on a re-run of this
# cell the existing column is overwritten instead.
if 'Clusters' in ny_neighborhoods_venues_sorted.columns:
    ny_neighborhoods_venues_sorted['Clusters'] = kmeans.labels_
else:
    ny_neighborhoods_venues_sorted.insert(0, 'Clusters', kmeans.labels_)

#### Merge the information back into the dataframe for easy analysis and future use:

In [26]:
# Look up cluster label and top venues by neighbourhood name, drop the rows
# that have any missing value, and store the cluster label as an integer.
venues_by_neighborhood = ny_neighborhoods_venues_sorted.set_index('Neighborhood')
df_northyork_merged = (
    df_northyork
    .join(venues_by_neighborhood, on='Neighborhood')
    .dropna(axis=0)
    .astype({'Clusters': 'int'})
)

df_northyork_merged

Unnamed: 0,Postal code,Borough,Neighborhood,Postal Code,Latitude,Longitude,Clusters,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M2H,North York,Hillcrest Village,M2H,43.803762,-79.363452,0,Park,Pharmacy,Pizza Place,Shopping Mall,Chinese Restaurant
1,M2J,North York,Fairview / Henry Farm / Oriole,M2J,43.778517,-79.346556,0,Clothing Store,Coffee Shop,Restaurant,Bank,Japanese Restaurant
2,M2K,North York,Bayview Village,M2K,43.786947,-79.385975,0,Japanese Restaurant,Bank,Shopping Mall,Chinese Restaurant,Skating Rink
3,M2L,North York,York Mills / Silver Hills,M2L,43.75749,-79.374714,4,Pool,Deli / Bodega,Tennis Court,Cafeteria,Falafel Restaurant
4,M2M,North York,Willowdale / Newtonbrook,M2M,43.789053,-79.408493,0,Korean Restaurant,Park,Coffee Shop,Bank,Indian Restaurant
5,M2N,North York,Willowdale,M2N,43.77012,-79.408493,0,Pizza Place,Coffee Shop,Korean Restaurant,Ramen Restaurant,Sushi Restaurant
6,M2P,North York,York Mills West,M2P,43.752758,-79.400049,3,Park,Restaurant,Convenience Store,Gym,Pet Store
7,M2R,North York,Willowdale,M2R,43.782736,-79.442259,0,Pizza Place,Coffee Shop,Korean Restaurant,Ramen Restaurant,Sushi Restaurant
8,M3A,North York,Parkwoods,M3A,43.753259,-79.329656,2,Bus Stop,Park,Road,Food & Drink Shop,Wings Joint
9,M3B,North York,Don Mills,M3B,43.745906,-79.352188,0,Japanese Restaurant,Coffee Shop,Gym,Bus Line,Restaurant


#### Let's visualize the results by creating a Map:

In [28]:
map_clusters = folium.Map(location=[ny_lat, ny_long], zoom_start=12)

# One evenly spaced rainbow colour per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# One marker per row, coloured by its cluster label.
for lat, lon, poi, cluster in zip(df_northyork_merged['Latitude'], df_northyork_merged['Longitude'], df_northyork_merged['Neighborhood'], df_northyork_merged['Clusters']):
    # Spaces around "Cluster" keep the popup readable ("Parkwoods Cluster 2").
    label = folium.Popup('{} Cluster {}'.format(poi, cluster), parse_html=True)
    # Labels start at 0, so they index the palette directly; the former
    # `cluster-1` made cluster 0 wrap around to the last colour.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters