<a href="https://colab.research.google.com/github/Bob-Gohardani/nlp-ml/blob/main/neighborhood_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests
# from pandas.io.json import normalize
from geopy.geocoders import Nominatim

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Getting the Postal code
We use pandas' `read_html()` function to download the tables from the webpage, then we pick the element that contains the postal-code table and name it `postal`.

In [None]:
# read_html parses the HTML tables found at the URL into a list of DataFrames.
down = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
# The first parsed table is taken as the postal-code table.
postal = down[0]

In [None]:
# Preview the first rows of the downloaded table.
postal.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Here we keep only the rows that have a Borough assigned to them.

In [None]:
# Keep only the rows whose Borough is something other than the placeholder text.
has_borough = postal['Borough'].ne('Not assigned')
postal = postal.loc[has_borough]
postal.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Since we want one row per unique postal code with all of its neighbourhoods combined, we first use `groupby` to aggregate the Neighbourhood values per postal code and save the result in a variable, and then drop all rows that duplicate a postal code.

In [None]:
# Collapse the table to one row per postal code. Neighbourhood names that share a
# code are joined with ", "; summing the strings would glue them together with no
# separator at all.
l = (
    postal
    .groupby('Postal Code')['Neighbourhood']
    .agg(lambda names: ', '.join(names.dropna().astype(str)))
    .reset_index()
)
l

Unnamed: 0,Postal Code,Neighbourhood
0,M1B,"Malvern, Rouge"
1,M1C,"Rouge Hill, Port Union, Highland Creek"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae
...,...,...
98,M9N,Weston
99,M9P,Westmount
100,M9R,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,"South Steeles, Silverstone, Humbergate, Jamest..."


We sort the postal dataframe so that it has the same order as the grouped series.

In [None]:
# One row per postal code, ordered by code. Written as a single chain that returns
# a new frame, so nothing is modified in place on a filtered slice of the table.
# reset_index() keeps the previous row labels in an 'index' column.
postal = (
    postal
    .drop_duplicates(subset='Postal Code', keep='first')
    .sort_values('Postal Code')
    .reset_index()
)
postal.head()

Unnamed: 0,index,Postal Code,Borough,Neighbourhood
0,9,M1B,Scarborough,"Malvern, Rouge"
1,18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,27,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,36,M1G,Scarborough,Woburn
4,45,M1H,Scarborough,Cedarbrae


In [None]:
# Replace each row's Neighbourhood with the aggregated value for its postal code.
# The lookup is keyed on 'Postal Code', so it does not depend on `postal` and `l`
# happening to share the same row order and index.
postal['Neighbourhood'] = postal['Postal Code'].map(l.set_index('Postal Code')['Neighbourhood'])

In [None]:
# Finally, rows whose Neighbourhood is still "Not assigned" take the name of their Borough.
unassigned = postal['Neighbourhood'] == "Not assigned"
postal.loc[unassigned, "Neighbourhood"] = postal.loc[unassigned, "Borough"]

In [None]:
# Discard the 'index' column and report the resulting (rows, columns) shape.
postal = postal.drop(columns='index')
postal.shape

(103, 3)

## Getting longitude and Latitude
Here we first try to download the latitude and longitude from geocoder and add them as new columns to the dataframe.

In [None]:
pip install geocoder

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 1.1 MB/s 
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Note: you may need to restart the kernel to use updated packages.


In [None]:
import geocoder

def findLatLong(code, mode, max_retries=5):
    """Geocode a Toronto postal code and return one of its coordinates.

    Parameters
    ----------
    code : str
        Postal code; it is looked up as "<code>, Toronto, Ontario".
    mode : str
        "lat" to get the latitude, "long" to get the longitude.
    max_retries : int, optional
        How many times to query the geocoder before giving up.

    Returns
    -------
    The first (for "lat") or second (for "long") element of the geocoder's
    ``latlng`` result.

    Raises
    ------
    ValueError
        If `mode` is neither "lat" nor "long".
    RuntimeError
        If no coordinates were obtained within `max_retries` attempts.
    """
    # Validate before touching the network so a typo fails immediately
    # instead of silently returning None after the requests were made.
    if mode not in ("lat", "long"):
        raise ValueError('mode must be "lat" or "long", got {!r}'.format(mode))

    lat_lng_coords = None
    # Bounded retry: if the service keeps answering without a result, an
    # unconditional `while` loop would never terminate.
    for _ in range(max_retries):
        g = geocoder.google('{}, Toronto, Ontario'.format(code))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break

    if not lat_lng_coords:
        raise RuntimeError(
            'no coordinates returned for {!r} after {} attempts'.format(code, max_retries))

    lat = lat_lng_coords[0]
    long = lat_lng_coords[1]
    return lat if mode == "lat" else long

# print(findLatLong("M9V", "lat"))

In [None]:
# Create the coordinate columns with empty-string placeholders.
postal['Latitude'] = ""
postal['Longitude'] = ""

# Geocoder-based lookup, left disabled:
# postal['Latitude'] = postal['Postal Code'].apply(lambda code: findLatLong(code, "lat"))
# postal['Longitude'] = postal['Postal Code'].apply(lambda code: findLatLong(code, "long"))

But since I didn't get any response from the geocoder server, I will use the CSV file that I downloaded instead.

In [None]:
# Load the postal-code coordinates from the CSV file; the path is relative to
# the notebook's working directory.
ll_df = pd.read_csv("../input/geospatial-coordinates/Geospatial_Coordinates.csv")
ll_df.head(3)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


In [None]:
# Sanity check: look up the longitude of a single postal code.
print(ll_df.loc[ll_df['Postal Code'] == 'M1G', "Longitude"].values[0])

-79.2169174


In [None]:
def get_latlong(row, mode):
    """Look up one coordinate for a row's postal code in `ll_df`.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'Postal Code' (e.g. a DataFrame row).
    mode : str
        "lat" to read the 'Latitude' column, "long" to read 'Longitude'.

    Returns
    -------
    The first matching value from the selected column of `ll_df`.

    Raises
    ------
    ValueError
        If `mode` is neither "lat" nor "long".
    IndexError
        If the postal code has no match in `ll_df`.
    """
    column_for_mode = {"lat": "Latitude", "long": "Longitude"}
    # Reject unknown modes up front instead of silently returning None.
    if mode not in column_for_mode:
        raise ValueError('mode must be "lat" or "long", got {!r}'.format(mode))

    code = row['Postal Code']
    matches = ll_df.loc[ll_df['Postal Code'] == code, column_for_mode[mode]]
    # Same exception type as indexing an empty array, but naming the missing code.
    if matches.empty:
        raise IndexError('postal code {!r} not found in ll_df'.format(code))
    return matches.values[0]

In [None]:
# Fill each coordinate column by calling get_latlong once per row; the extra
# `mode` keyword is forwarded to get_latlong by DataFrame.apply.
for column, mode in (('Latitude', "lat"), ('Longitude', "long")):
    postal[column] = postal.apply(get_latlong, axis=1, mode=mode)
postal.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Exploring the data and clustering
We want to keep only the neighbourhoods whose Borough contains the word Toronto.

In [None]:
# Preview the table once more before filtering it by borough.
postal.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [None]:
# Keep only the boroughs whose name contains "Toronto" (missing names count as
# no match), drop the postal code and renumber the rows from 0. The chain
# returns a new frame, so nothing is modified in place on a filtered slice.
postal = (
    postal[postal['Borough'].str.contains("Toronto", na=False)]
    .drop(columns="Postal Code")
    .reset_index(drop=True)
)

postal.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,Central Toronto,Lawrence Park,43.72802,-79.38879


There are 4 boroughs and 39 neighbourhoods in our dataset.

In [None]:
# Print the number of distinct values in each column (missing values included).
for column in ('Borough', 'Neighbourhood'):
    print(postal[column].nunique(dropna=False))

4
39


In [None]:
# Get the coordinates that geopy's Nominatim geocoder reports for the city of Toronto.

address = "Toronto, Ontario"
geolocator = Nominatim(user_agent='t_explorer')
location = geolocator.geocode(address)
# geocode() gives back None when nothing is found; stop with a clear message
# instead of failing on the attribute access below.
if location is None:
    raise RuntimeError("Could not geocode {!r}".format(address))
lat = location.latitude
long = location.longitude

print("Coordinates of Toronto", lat, long)

Coordinates of Toronto 43.6534817 -79.3839347


We use the folium library to map all the neighbourhoods with blue dots.

In [None]:
# Base map centred on the city coordinates held in `lat` / `long`.
map_toronto = folium.Map(location=[lat, long], zoom_start=11)

# The loop uses its own names (hood_lat / hood_long) so that the city-level
# `lat` / `long` keep their values after this cell has run.
for hood_lat, hood_long, borough, hood in zip(postal['Latitude'], postal['Longitude'], postal['Borough'], postal['Neighbourhood']):
    label = '{}, {}'.format(hood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [hood_lat, hood_long],
        radius=5,
        popup=label,  # attach the popup so clicking a dot shows its label
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

In [None]:
# Foursquare credentials are read from the environment so that they are never
# stored in the notebook itself. Credentials that were previously committed in
# plain text should be treated as compromised and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print("Warning: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before calling the Foursquare API")
VERSION = '20180604'  # sent as the `v` query parameter
LIMIT = 30  # sent as the `limit` query parameter

To this function we give lists of neighbourhood names, latitudes and longitudes, and it returns a dataframe containing all of their venues combined, along with the category of each venue.

In [None]:
def get_venues(hoods, lats, longs, radius=500, timeout=10):
    """Query the Foursquare "explore" endpoint once per neighbourhood.

    Parameters
    ----------
    hoods, lats, longs : iterables
        Neighbourhood names with their latitudes and longitudes; they are
        walked in parallel.
    radius : int, optional
        Value sent as the `radius` query parameter.
    timeout : float, optional
        Seconds to wait for each HTTP response.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Latitude', 'Longitude', 'Venue', 'v_Latitude', 'v_Longitude' and
        'v_category'. The frame has these columns even when no venue was
        returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood', 'Latitude', 'Longitude', 'Venue', 'v_Latitude', 'v_Longitude', 'v_category']
    rows = []
    for hood, lat, long in zip(hoods, lats, longs):
        # Let requests build and encode the query string.
        response = requests.get(
            "https://api.foursquare.com/v2/venues/explore",
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, long),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # Surface HTTP errors here rather than as a KeyError on the JSON below.
        response.raise_for_status()

        groups = response.json()['response']['groups']
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((hood, lat, long, venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         # a venue without categories gets None instead of raising IndexError
                         categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Fetch the venues around every neighbourhood that is left in `postal`.
toronto_venues = get_venues(hoods=postal["Neighbourhood"], 
                            lats=postal["Latitude"], 
                            longs=postal["Longitude"])


# the first number of the printed shape is the total number of venues returned
print(toronto_venues.shape)
toronto_venues.head()

(851, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,Venue,v_Latitude,v_Longitude,v_category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Beaches,43.676357,-79.293031,Seaspray Restaurant,43.678888,-79.298167,Asian Restaurant


In [None]:
# Number of venue rows per neighbourhood (first few groups only).
toronto_venues.groupby("Neighborhood").count().head()

Unnamed: 0_level_0,Latitude,Longitude,Venue,v_Latitude,v_Longitude,v_category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,30,30,30,30,30,30
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",14,14,14,14,14,14
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",17,17,17,17,17,17
Central Bay Street,30,30,30,30,30,30


In [None]:
# Number of distinct venue categories among the returned venues
len(toronto_venues.v_category.unique())

191

We need to use each of these venue types as a feature in our clustering. One way to do this is **one-hot encoding: every venue row gets a 1 in the column of its own category and a 0 in all other columns**

In [None]:
# One column per venue category. dtype=int keeps the indicators as 0/1 numbers;
# pandas 2.x would otherwise produce True/False columns.
toronto_oneHot = pd.get_dummies(toronto_venues[['v_category']], prefix="", prefix_sep="", dtype=int)
toronto_oneHot.head(3)

Unnamed: 0,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,Art Museum,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Lift pandas' display limits on both columns and rows.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [None]:
# "Neighborhood" also occurs as a venue category, so get_dummies may have produced
# an indicator column with that name. Remove it if it exists (errors="ignore"
# avoids a KeyError when no venue has that category) so it cannot collide with
# the neighbourhood-name column added next.
toronto_oneHot = toronto_oneHot.drop(columns="Neighborhood", errors="ignore")
# add Neighborhood column to end of toronto_oneHot dataframe
toronto_oneHot['Neighborhood'] = toronto_venues['Neighborhood']
toronto_oneHot.head(3)

Unnamed: 0,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Basketball Stadium,Beer Bar,Beer Store,Belgian Restaurant,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Butcher,Café,Cajun / Creole Restaurant,Candy Store,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chocolate Shop,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,College Arts Building,College Auditorium,College Cafeteria,College Gym,College Rec Center,Colombian Restaurant,Comfort Food Restaurant,Comic Shop,Concert Hall,Convenience Store,Cosmetics Shop,Coworking Space,Creperie,Cuban Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop,Eastern European Restaurant,Electronics Store,Escape Room,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Food & Drink Shop,Food Truck,Fountain,French Restaurant,Fried Chicken Joint,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Garden Center,Gas Station,Gastropub,Gay Bar,General Entertainment,Gift Shop,Gluten-free Restaurant,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Harbor / Marina,Health Food Store,Historic Site,History Museum,Home Service,Hotel,IT Services,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Intersection,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Korean Restaurant,Lake,Latin American Restaurant,Light Rail Station,Liquor Store,Lounge,Market,Martial Arts School,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Movie Theater,Museum,Music Venue,New American Restaurant,Nightclub,Noodle House,Office,Organic 
Grocery,Park,Performing Arts Venue,Pet Store,Pharmacy,Pizza Place,Plane,Playground,Plaza,Poke Place,Pool,Poutine Place,Pub,Ramen Restaurant,Record Shop,Rental Car Location,Restaurant,Roof Deck,Salad Place,Salon / Barbershop,Sandwich Place,Scenic Lookout,Sculpture Garden,Seafood Restaurant,Shoe Store,Shopping Mall,Skate Park,Skating Rink,Smoke Shop,Smoothie Shop,Spa,Speakeasy,Sporting Goods Shop,Stadium,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Swim School,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio,Neighborhood
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,The Beaches
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches


In [None]:
#  get name of the last column (Neighborhood) as a list + add up to the list of name of all columns except last one
fixed_columns = [toronto_oneHot.columns[-1]] + list(toronto_oneHot.columns[:-1])
toronto_oneHot = toronto_oneHot[fixed_columns]
toronto_oneHot.head(3)

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Basketball Stadium,Beer Bar,Beer Store,Belgian Restaurant,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Butcher,Café,Cajun / Creole Restaurant,Candy Store,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chocolate Shop,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,College Arts Building,College Auditorium,College Cafeteria,College Gym,College Rec Center,Colombian Restaurant,Comfort Food Restaurant,Comic Shop,Concert Hall,Convenience Store,Cosmetics Shop,Coworking Space,Creperie,Cuban Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop,Eastern European Restaurant,Electronics Store,Escape Room,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Food & Drink Shop,Food Truck,Fountain,French Restaurant,Fried Chicken Joint,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Garden Center,Gas Station,Gastropub,Gay Bar,General Entertainment,Gift Shop,Gluten-free Restaurant,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Harbor / Marina,Health Food Store,Historic Site,History Museum,Home Service,Hotel,IT Services,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Intersection,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Korean Restaurant,Lake,Latin American Restaurant,Light Rail Station,Liquor Store,Lounge,Market,Martial Arts School,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Movie Theater,Museum,Music Venue,New American Restaurant,Nightclub,Noodle House,Office,Organic 
Grocery,Park,Performing Arts Venue,Pet Store,Pharmacy,Pizza Place,Plane,Playground,Plaza,Poke Place,Pool,Poutine Place,Pub,Ramen Restaurant,Record Shop,Rental Car Location,Restaurant,Roof Deck,Salad Place,Salon / Barbershop,Sandwich Place,Scenic Lookout,Sculpture Garden,Seafood Restaurant,Shoe Store,Shopping Mall,Skate Park,Skating Rink,Smoke Shop,Smoothie Shop,Spa,Speakeasy,Sporting Goods Shop,Stadium,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Swim School,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Shape before grouping: one row per venue.
print(toronto_oneHot.shape)
# Average every indicator column within each neighbourhood; as_index=False keeps
# 'Neighborhood' as an ordinary column.
toronto_grouped = toronto_oneHot.groupby("Neighborhood", as_index=False).mean()
# Shape after grouping: one row per neighbourhood.
print(toronto_grouped.shape)

(851, 191)
(39, 191)


In [None]:
def returns_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first entry of `row` is skipped; the remaining entries are ranked from
    the largest value to the smallest, and the index labels of at most
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
# Create the new dataframe and display the top 10 venues for each neighborhood.

num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# Column headers: "1st", "2nd", "3rd", then "4th", "5th", ...
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # An explicit bounds check picks the suffix; a bare `except:` here would
    # also hide unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

neighborhood_venues_sorted = pd.DataFrame(columns=columns)
neighborhood_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    # fill every column after 'Neighborhood' with that row's top categories
    neighborhood_venues_sorted.iloc[ind, 1:] = returns_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)


neighborhood_venues_sorted.head(2)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Cocktail Bar,Coffee Shop,Seafood Restaurant,Beer Bar,Farmers Market,French Restaurant,Bistro,Liquor Store,Breakfast Spot,Basketball Stadium
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Furniture / Home Store,Restaurant,Nightclub,Bar,Climbing Gym,Bakery,Burrito Place


## Clustering Data

In [None]:
kclusters = 4
# Features only: drop the neighbourhood name. The keyword form is required
# because pandas 2.0 no longer accepts `axis` as a positional argument of drop().
toronto_grouped_clustered = toronto_grouped.drop(columns="Neighborhood")
# n_init is given explicitly so the number of restarts does not depend on the
# default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustered)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

We want to visualize our clusters on the map, so we join the postal dataset with neighborhood_venues_sorted to get rows that hold the Borough, Neighborhood, cluster label, latitude, longitude and top 10 most common venues all together.



In [None]:
# Attach each neighbourhood's cluster label. A label column left over from an
# earlier run of this cell is removed first, because insert() refuses to add a
# column that already exists.
if 'Cluster Labels' in neighborhood_venues_sorted.columns:
    neighborhood_venues_sorted = neighborhood_venues_sorted.drop(columns='Cluster Labels')
neighborhood_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# rename() returns a new frame. Binding `postal` to a second name and overwriting
# `.columns` on it would rename the columns of `postal` as well.
toronto_merged = postal.rename(columns={"Neighbourhood": "Neighborhood"})
toronto_merged = toronto_merged.join(neighborhood_venues_sorted.set_index("Neighborhood"), on="Neighborhood")

In [None]:
# Preview the joined table: location columns, cluster label and top venues.
toronto_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Pub,Trail,Asian Restaurant,Airport,New American Restaurant,Martial Arts School,Men's Store,Mexican Restaurant,Middle Eastern Restaurant
1,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Ice Cream Shop,Coffee Shop,Italian Restaurant,Spa,Restaurant,Pub,Pizza Place,Juice Bar,Bookstore
2,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,0,Fast Food Restaurant,Liquor Store,Sushi Restaurant,Pet Store,Gym,Park,Pub,Restaurant,Movie Theater,Sandwich Place
3,East Toronto,Studio District,43.659526,-79.340923,0,Coffee Shop,Bakery,American Restaurant,Café,Yoga Studio,Comfort Food Restaurant,Pet Store,Park,Seafood Restaurant,Cheese Shop
4,Central Toronto,Lawrence Park,43.72802,-79.38879,2,Park,Swim School,Bus Line,Airport,Museum,Martial Arts School,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop


In [None]:
# Centre the map on the geocoded `location` itself. The `lat` / `long` names are
# reused as loop variables in this notebook, so they may no longer hold the
# city coordinates at this point.
map_clusters = folium.Map(location=[location.latitude, location.longitude], zoom_start=11)

# One colour per cluster, evenly spaced over the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for hood_lat, hood_lon, poi, cluster in zip(toronto_merged['Latitude'],
                                            toronto_merged['Longitude'],
                                            toronto_merged['Neighborhood'],
                                            toronto_merged['Cluster Labels']):
    # A neighbourhood without a match in the join has no cluster label; skip it.
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [hood_lat, hood_lon],
        radius=5,
        popup=label,
        # index by the label itself: cluster k gets colour k
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters