# Project on Segmenting and Clustering Neighborhoods in Toronto
## We scrape the Wikipedia page using Beautifulsoup and convert the data to a dataframe. We then include latitude and longitude for each of the boroughs.

In [103]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

html_response = requests.get(url).text
soup = BeautifulSoup(html_response)
html_table = soup.find('table', {'class': 'wikitable'})

table_rows = html_table.findAll('tr')

postcodes = []
boroughs = []
neighbourhoods = []

for table_row in table_rows:
    cells = table_row.findAll('td')

    # Ignore rows that don't have td - The first row
    if len(cells) == 0:
        continue

    # Strip removes trailing \n character
    postcode = cells[0].text.strip()
    borough = cells[1].text.strip()
    neighbourhood = cells[2].text.strip()

    # Append into list
    postcodes.append(postcode)
    boroughs.append(borough)
    neighbourhoods.append(neighbourhood)

# Create dataframe
df = pd.DataFrame({
    'borough': boroughs,
    'postcode': postcodes,
    'neighbourhood': neighbourhoods
})


# Keep only boroughs whose value is not 'Not assigned'
df = df[df['borough'] != 'Not assigned']

# Wherever neighbourhood is 'Not assigned', use the borough instead.
# Otherwise, use the neighbourhood as it is
# If the condition matches, borough is used, otherwise neighbourhood is used
df['neighbourhood'] = np.where(df['neighbourhood'] == 'Not assigned', df['borough'], df['neighbourhood'])

# Group by borough and postcode
# Join the neighbourhood by comma
# Name the combined column as neighbourhood (it would have a different name after group-by)
neighbourhoods_joined = df.groupby(['borough', 'postcode']).apply(lambda x: ', '.join(x['neighbourhood'])).reset_index(name='neighbourhood')

# Read CSV File of geospatial coordinates
geospatial_coords_file = 'Geospatial_Coordinates.csv'
coords = pd.read_csv(geospatial_coords_file)

# Rename column 'Postcal Code' with 'postcode'
# Renaming allows us to merge the two dataframes
coords.rename(columns={ 'Postal Code': 'postcode' }, inplace=True)

# Merge dataframes
neighbourhoods_with_coords = pd.merge(neighbourhoods_joined, coords, how='inner', left_on='postcode', right_on = 'postcode')

neighbourhoods_with_coords

Unnamed: 0,borough,postcode,neighbourhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,North Toronto West,43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316
5,Central Toronto,M4V,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
6,Central Toronto,M5N,Roselawn,43.711695,-79.416936
7,Central Toronto,M5P,"Forest Hill North, Forest Hill West",43.696948,-79.411307
8,Central Toronto,M5R,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
9,Downtown Toronto,M4W,Rosedale,43.679563,-79.377529


In [104]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [105]:
neighbourhoods_with_coords.head(20)

Unnamed: 0,borough,postcode,neighbourhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,North Toronto West,43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316
5,Central Toronto,M4V,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
6,Central Toronto,M5N,Roselawn,43.711695,-79.416936
7,Central Toronto,M5P,"Forest Hill North, Forest Hill West",43.696948,-79.411307
8,Central Toronto,M5R,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
9,Downtown Toronto,M4W,Rosedale,43.679563,-79.377529


In [106]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighbourhoods_with_coords['borough'].unique()),
        neighbourhoods_with_coords.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


## We now create a map of the Toronto boroughs with neighborhoods superimposed on top

In [107]:
# create map of New York using latitude and longitude values

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighbourhoods_with_coords['Latitude'], neighbourhoods_with_coords['Longitude'], neighbourhoods_with_coords['borough'], neighbourhoods_with_coords['neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### We consider one of the boroughs to analyse the data, here Central Toronto

In [108]:
Central_Toronto = neighbourhoods_with_coords[neighbourhoods_with_coords['borough'] == 'Central Toronto'].reset_index(drop=True)
Central_Toronto.head()

Unnamed: 0,borough,postcode,neighbourhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,North Toronto West,43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316


In [109]:
#Let's get the geographical coordinates of Central Toronto.
address = 'Central Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Central Toronto are 43.653963, -79.387207.


In [110]:
map_centraltoronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, label in zip(Central_Toronto['Latitude'], Central_Toronto['Longitude'], Central_Toronto['neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_centraltoronto)  
    
map_centraltoronto

In [111]:
CLIENT_ID = '51GCULZPIZUMT0KXZDXTOVLBMZGIOQTPPL1IHPDJFDPTTDS4' # your Foursquare ID
CLIENT_SECRET = 'N2CZRRICTNAWZ1UTT4EAA3UJPQD3QQNTCSOOISUCCREQ2GSF' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

My credentails:
CLIENT_ID: 51GCULZPIZUMT0KXZDXTOVLBMZGIOQTPPL1IHPDJFDPTTDS4
CLIENT_SECRET:N2CZRRICTNAWZ1UTT4EAA3UJPQD3QQNTCSOOISUCCREQ2GSF


In [112]:
Central_Toronto.loc[0, 'neighbourhood']

'Lawrence Park'

In [113]:
neighborhood_latitude = Central_Toronto.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Central_Toronto.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = Central_Toronto.loc[0, 'neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


In [114]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=51GCULZPIZUMT0KXZDXTOVLBMZGIOQTPPL1IHPDJFDPTTDS4&client_secret=N2CZRRICTNAWZ1UTT4EAA3UJPQD3QQNTCSOOISUCCREQ2GSF&v=20180605&ll=43.7280205,-79.3887901&radius=500&limit=100'

In [115]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d1c54650719020025f8b23a'},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.7325205045, 'lng': -79.3825744605273},
   'sw': {'lat': 43.7235204955, 'lng': -79.3950057394727}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '50e6da19e4b0d8a78a0e9794',
       'name': 'Lawrence Park Ravine',
       'location': {'address': '3055 Yonge Street',
        'crossStreet': 'Lawrence Avenue East',
        'lat': 43.72696303913755,
        'lng': -79.39438246708775,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.72696303913755,
          'lng': -79.39438246708775}],
        'distance': 465,
        'cc': 'CA',
  

In [116]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### We use Foursquare to get the relevant data for Central Toronto

In [117]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Zodiac Swim School,Swim School,43.728532,-79.38286
2,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805


In [118]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

3 venues were returned by Foursquare.


In [119]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### We now create  dataframe with all venues in the neighbourhoods related to Central Toronto

In [120]:
central_toronto_venues = getNearbyVenues(names=Central_Toronto['neighbourhood'],
                                   latitudes=Central_Toronto['Latitude'],
                                   longitudes=Central_Toronto['Longitude']
                                  )

Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville


### We now find the size of the resulting dataframe using the shape method

In [121]:
print(central_toronto_venues.shape)
central_toronto_venues.head()

(111, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Davisville North,43.712751,-79.390197,Summerhill Market North,43.715499,-79.392881,Food & Drink Shop
4,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park


### We find how many venues were returned for each neighborhood

In [122]:
central_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,38,38,38,38,38,38
Davisville North,9,9,9,9,9,9
"Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West",14,14,14,14,14,14
"Forest Hill North, Forest Hill West",4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
"Moore Park, Summerhill East",2,2,2,2,2,2
North Toronto West,16,16,16,16,16,16
Roselawn,2,2,2,2,2,2
"The Annex, North Midtown, Yorkville",23,23,23,23,23,23


In [123]:
print('There are {} uniques categories.'.format(len(central_toronto_venues['Venue Category'].unique())))

There are 61 uniques categories.


### One Hot Encoding: One hot encoding is the technique to convert categorical values into a 1-dimensional numerical vector. The resulting vector will have only one element equal to 1 and the rest will be 0. The 1 is called Hot and the 0’s are Cold. This is where its name of one hot encoding comes from.

In [124]:
# one hot encoding
central_toronto_onehot = pd.get_dummies(central_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
central_toronto_onehot['Neighborhood'] = central_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [central_toronto_onehot.columns[-1]] + list(central_toronto_onehot.columns[:-1])
central_toronto_onehot = central_toronto_onehot[fixed_columns]

central_toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Cosmetics Shop,Costume Shop,Deli / Bodega,Dessert Shop,Diner,Discount Store,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Garden,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,History Museum,Hotel,Ice Cream Shop,Indian Restaurant,Italian Restaurant,Jewelry Store,Jewish Restaurant,Light Rail Station,Liquor Store,Mexican Restaurant,Park,Pharmacy,Pizza Place,Playground,Pub,Rental Car Location,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Davisville North,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [125]:
central_toronto_onehot.shape

(111, 62)

### We now group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [126]:
central_toronto_grouped = central_toronto_onehot.groupby('Neighborhood').mean().reset_index()
central_toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Cosmetics Shop,Costume Shop,Deli / Bodega,Dessert Shop,Diner,Discount Store,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Garden,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,History Museum,Hotel,Ice Cream Shop,Indian Restaurant,Italian Restaurant,Jewelry Store,Jewish Restaurant,Light Rail Station,Liquor Store,Mexican Restaurant,Park,Pharmacy,Pizza Place,Playground,Pub,Rental Car Location,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.052632,0.0,0.0,0.052632,0.0,0.026316,0.026316,0.078947,0.026316,0.026316,0.026316,0.0,0.026316,0.0,0.026316,0.0,0.026316,0.026316,0.0,0.026316,0.0,0.0,0.0,0.0,0.026316,0.052632,0.0,0.0,0.0,0.0,0.0,0.026316,0.026316,0.105263,0.0,0.0,0.0,0.052632,0.0,0.078947,0.026316,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.052632,0.026316,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",0.071429,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.071429,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
3,"Forest Hill North, Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
4,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
6,North Toronto West,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.125,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0,0.0625,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625
7,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"The Annex, North Midtown, Yorkville",0.043478,0.043478,0.0,0.0,0.0,0.043478,0.0,0.130435,0.0,0.0,0.130435,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.043478,0.0,0.0,0.043478,0.0,0.043478,0.0,0.043478,0.043478,0.086957,0.0,0.043478,0.0,0.0,0.0,0.130435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0


In [127]:
central_toronto_grouped.shape

(9, 62)

In [128]:
num_top_venues = 5

for hood in central_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = central_toronto_grouped[central_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
             venue  freq
0      Pizza Place  0.11
1   Sandwich Place  0.08
2     Dessert Shop  0.08
3  Thai Restaurant  0.05
4       Restaurant  0.05


----Davisville North----
               venue  freq
0     Clothing Store  0.11
1               Park  0.11
2  Food & Drink Shop  0.11
3         Playground  0.11
4      Grocery Store  0.11


----Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West----
                 venue  freq
0                  Pub  0.14
1          Coffee Shop  0.14
2  American Restaurant  0.07
3           Sports Bar  0.07
4           Bagel Shop  0.07


----Forest Hill North, Forest Hill West----
                 venue  freq
0        Jewelry Store  0.25
1                Trail  0.25
2                 Park  0.25
3     Sushi Restaurant  0.25
4  American Restaurant  0.00


----Lawrence Park----
                 venue  freq
0             Bus Line  0.33
1                 Park  0.33
2          Swim School  0.33
3  American Restaurant  0.00
4  

### We have listed each neighborhood along with the top 5 most common venues and now put that into a pandas dataframe

In [129]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### We now create the new dataframe and display the top 10 venues for each neighborhood.

In [130]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = central_toronto_grouped['Neighborhood']

for ind in np.arange(central_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(central_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Pizza Place,Sandwich Place,Dessert Shop,Restaurant,Thai Restaurant,Sushi Restaurant,Café,Italian Restaurant,Coffee Shop,Discount Store
1,Davisville North,Hotel,Food & Drink Shop,Sandwich Place,Park,Clothing Store,Playground,Breakfast Spot,Grocery Store,Gym,Gourmet Shop
2,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Liquor Store,Light Rail Station,Pizza Place,Supermarket,Sports Bar,Fried Chicken Joint
3,"Forest Hill North, Forest Hill West",Sushi Restaurant,Trail,Jewelry Store,Park,Food & Drink Shop,Discount Store,Farmers Market,Fast Food Restaurant,Flower Shop,Yoga Studio
4,Lawrence Park,Swim School,Bus Line,Park,Yoga Studio,Diner,Gym,Grocery Store,Greek Restaurant,Gourmet Shop,Garden


### We now run k-means to cluster the neighborhood into 3 clusters.

In [131]:
# set number of clusters
kclusters = 3

central_toronto_grouped_clustering = central_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(central_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 2, 0, 2, 1, 2], dtype=int32)

In [132]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
central_toronto_merged = Central_Toronto
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
central_toronto_merged = central_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='neighbourhood')
central_toronto_merged.head(10)



Unnamed: 0,borough,postcode,neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879,2,Swim School,Bus Line,Park,Yoga Studio,Diner,Gym,Grocery Store,Greek Restaurant,Gourmet Shop,Garden
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197,2,Hotel,Food & Drink Shop,Sandwich Place,Park,Clothing Store,Playground,Breakfast Spot,Grocery Store,Gym,Gourmet Shop
2,Central Toronto,M4R,North Toronto West,43.715383,-79.405678,2,Coffee Shop,Yoga Studio,Sporting Goods Shop,Fast Food Restaurant,Mexican Restaurant,Park,Diner,Dessert Shop,Salon / Barbershop,Spa
3,Central Toronto,M4S,Davisville,43.704324,-79.38879,2,Pizza Place,Sandwich Place,Dessert Shop,Restaurant,Thai Restaurant,Sushi Restaurant,Café,Italian Restaurant,Coffee Shop,Discount Store
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316,0,Tennis Court,Playground,Yoga Studio,Dessert Shop,Gym,Grocery Store,Greek Restaurant,Gourmet Shop,Garden,Fried Chicken Joint
5,Central Toronto,M4V,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,2,Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Liquor Store,Light Rail Station,Pizza Place,Supermarket,Sports Bar,Fried Chicken Joint
6,Central Toronto,M5N,Roselawn,43.711695,-79.416936,1,Ice Cream Shop,Garden,Hotel,Gym / Fitness Center,Gym,Grocery Store,Greek Restaurant,Gourmet Shop,Fried Chicken Joint,Food & Drink Shop
7,Central Toronto,M5P,"Forest Hill North, Forest Hill West",43.696948,-79.411307,2,Sushi Restaurant,Trail,Jewelry Store,Park,Food & Drink Shop,Discount Store,Farmers Market,Fast Food Restaurant,Flower Shop,Yoga Studio
8,Central Toronto,M5R,"The Annex, North Midtown, Yorkville",43.67271,-79.405678,2,Coffee Shop,Sandwich Place,Café,Pizza Place,American Restaurant,Pharmacy,Liquor Store,Pub,Jewish Restaurant,Cosmetics Shop


### Visualizing the clusters

In [133]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(central_toronto_merged['Latitude'], central_toronto_merged['Longitude'], central_toronto_merged['neighbourhood'], central_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### The different  clusters

In [134]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 0, central_toronto_merged.columns[[1] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M4T,0,Tennis Court,Playground,Yoga Studio,Dessert Shop,Gym,Grocery Store,Greek Restaurant,Gourmet Shop,Garden,Fried Chicken Joint


In [135]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 1, central_toronto_merged.columns[[1] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,M5N,1,Ice Cream Shop,Garden,Hotel,Gym / Fitness Center,Gym,Grocery Store,Greek Restaurant,Gourmet Shop,Fried Chicken Joint,Food & Drink Shop


In [136]:
central_toronto_merged.loc[central_toronto_merged['Cluster Labels'] == 2, central_toronto_merged.columns[[1] + list(range(5, central_toronto_merged.shape[1]))]]

Unnamed: 0,postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,2,Swim School,Bus Line,Park,Yoga Studio,Diner,Gym,Grocery Store,Greek Restaurant,Gourmet Shop,Garden
1,M4P,2,Hotel,Food & Drink Shop,Sandwich Place,Park,Clothing Store,Playground,Breakfast Spot,Grocery Store,Gym,Gourmet Shop
2,M4R,2,Coffee Shop,Yoga Studio,Sporting Goods Shop,Fast Food Restaurant,Mexican Restaurant,Park,Diner,Dessert Shop,Salon / Barbershop,Spa
3,M4S,2,Pizza Place,Sandwich Place,Dessert Shop,Restaurant,Thai Restaurant,Sushi Restaurant,Café,Italian Restaurant,Coffee Shop,Discount Store
5,M4V,2,Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Liquor Store,Light Rail Station,Pizza Place,Supermarket,Sports Bar,Fried Chicken Joint
7,M5P,2,Sushi Restaurant,Trail,Jewelry Store,Park,Food & Drink Shop,Discount Store,Farmers Market,Fast Food Restaurant,Flower Shop,Yoga Studio
8,M5R,2,Coffee Shop,Sandwich Place,Café,Pizza Place,American Restaurant,Pharmacy,Liquor Store,Pub,Jewish Restaurant,Cosmetics Shop
