# Segmenting and Clustering Neighborhoods in the city of Toronto

## Problem 1
##### create a specific dataframe from a Wikipedia page

In [1]:
# loading data from wiki
import pandas as pd
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

In [2]:
# checking the data
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
# drop the rows that boroughs are not assigned
criteria= df['Borough'] != 'Not assigned'
df = df[criteria]
df.Borough.unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [5]:
# checking if any Postal Codes are listed more than once
import numpy as np
np.sum(df['Postal Code'].value_counts() > 1)

0

These neighborhoods with the same postal codes are already combined into one row with the separated with a commain the table.\
Therefore, no need to process again.

In [6]:
# checking if there are any neighborhoods are not assigned
np.sum(df['Neighborhood']=='Not assigned')

0

Obviously, after deleting cells with a borough that is not assigned, there are no cells with a neighborhood is not assigned.

In [7]:
# looking the shape of data
df.shape

(103, 3)

## Problem 2
#####  get the latitude and the longitude coordinates of each neighborhood

**Because there are something wrong about geocoder in my computer, I use the csv file instead.**

In [15]:
# loading csv file
geo = pd.read_csv('Geospatial_Coordinates.csv')
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [51]:
# merge the geo with data
df_loc = pd.merge(left=df, right=geo, how='left' , on='Postal Code')
df_loc.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [19]:
df_loc.shape

(103, 5)

## Problem 3
#####  explore and cluster the neighborhoods in Toronto

### 1. fristly visualize the data

In [52]:
# selecting boroughs that contains the word Toronto
df_loc = df_loc[df_loc['Borough'].str.contains('Toronto')]
df_loc.reset_index(drop=True,inplace=True)
df_loc.shape

(39, 5)

After filtering the data, we have 39 rows with 5 columns.

In [35]:
# attaining the location of Toronto
from geopy.geocoders import Nominatim

address = 'Toronto'
geolocator = Nominatim(user_agent="yto_explorer")
location = geolocator.geocode(address)
latitude = location[-1][0]
longitude = location[-1][1]
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [36]:
#visualize the neighborhoods
import folium

map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_loc['Latitude'], df_loc['Longitude'], df_loc['Borough'], df_loc['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

### 2. utilizing the Foursquare API to explore the neighborhoods

In [37]:
# Define Foursquare Credentials and Version
CLIENT_ID = 'EZG40Z10UTCLPZPEYR1QPUMXCW2TXHFFU1OIEBAPUDQE23ZP' 
CLIENT_SECRET = 'GKGDMVZT4PMZXHQOPHQX2RUDLWZI02T0WVQB0DLT5CHUMA5D' 
VERSION = '20180605' # Foursquare API version

print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

CLIENT_ID: EZG40Z10UTCLPZPEYR1QPUMXCW2TXHFFU1OIEBAPUDQE23ZP
CLIENT_SECRET:GKGDMVZT4PMZXHQOPHQX2RUDLWZI02T0WVQB0DLT5CHUMA5D


In [38]:
# define categories function
def get_category_type(df):
    try:
        categories_list = df['categories']
    except:
        categories_list = df['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [69]:
# define function to get nearby venues of each neighborhood
def getNearbyVenues(names, latitudes, longitudes):
    radius=500 # limit 500 meters
    LIMIT=100 # limit 100 nearby venues
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        results = pd.json_normalize(results)
        filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
        results =results.loc[:, filtered_columns]
        results['venue.categories'] = results.apply(get_category_type, axis=1)
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            results.loc[i,'venue.name'], 
            results.loc[i,'venue.location.lat'], 
            results.loc[i,'venue.location.lng'],
            results.loc[i,'venue.categories'])for i in range(results.shape[0])])
   
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Postal Code Latitude', 
                  'Postal Code Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [70]:
# get the nearby venues
Toronto_venues = getNearbyVenues(names=df_loc['Postal Code'],
                                   latitudes=df_loc['Latitude'],
                                   longitudes=df_loc['Longitude']
                                  )
Toronto_venues.head()

M5A
M7A
M5B
M5C
M4E
M5E
M5G
M6G
M5H
M6H
M5J
M6J
M4K
M5K
M6K
M4L
M5L
M4M
M4N
M5N
M4P
M5P
M6P
M4R
M5R
M6R
M4S
M5S
M6S
M4T
M5T
M4V
M5V
M4W
M5W
M4X
M5X
M4Y
M7Y


Unnamed: 0,Postal Code,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub


In [71]:
# checking the size of the resulting dataframe
print(Toronto_venues.shape)

(1615, 7)


In [72]:
# checking how many venues we have collected
Toronto_venues.groupby('Postal Code').count()['Venue'].sort_values(ascending = False)

Postal Code
M5J    100
M5B    100
M5K    100
M5X    100
M5L    100
M5W     97
M5H     94
M5C     78
M4Y     74
M5G     68
M5T     59
M5E     59
M4X     46
M5A     45
M6J     44
M4K     43
M4M     40
M5S     35
M6S     34
M7A     33
M4S     32
M6P     24
M6K     23
M4L     21
M5R     21
M4R     19
M6H     17
M4V     16
M7Y     16
M5V     16
M6G     16
M6R     15
M4P     10
M4W      4
M5P      4
M4N      4
M4E      4
M5N      3
M4T      1
Name: Venue, dtype: int64

In [73]:
# checking how many categories we have collected
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 231 uniques categories.


### 3. analyze each neighborhood

In [74]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add postal code column back to dataframe
Toronto_onehot['Postal Code'] = Toronto_venues['Postal Code'] 

# move postal code column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Postal Code,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# getting the shape of onehot dataset
Toronto_onehot.shape

(1615, 232)

In [77]:
# creating a grouped dataset 
Toronto_grouped = Toronto_onehot.groupby('Postal Code').mean().reset_index()
print(Toronto_grouped.shape)
Toronto_grouped.head()

(39, 232)


Unnamed: 0,Postal Code,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,...,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.025
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
# creating a toplist dataset 
num_top_venues = 5
list = ['M4E','M4K','M4L'] # just looking some examples

for code in list:
    print("----"+code+"----")
    temp = Toronto_grouped[Toronto_grouped['Postal Code'] == code].T.reset_index().iloc[1:]
    temp.columns = ['venue','freq']
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M4E----
               venue  freq
0              Trail  0.25
1  Health Food Store  0.25
2       Neighborhood  0.25
3                Pub  0.25
4  Afghan Restaurant  0.00


----M4K----
                    venue  freq
0        Greek Restaurant  0.19
1             Coffee Shop  0.07
2      Italian Restaurant  0.07
3  Furniture / Home Store  0.05
4               Bookstore  0.05


----M4L----
                  venue  freq
0  Fast Food Restaurant  0.10
1                  Park  0.10
2        Sandwich Place  0.10
3           Pizza Place  0.05
4    Italian Restaurant  0.05




In [93]:
# defining a function to find the most common venues
def return_most_common_venues(df, num_top_venues):
    df_categories = df.iloc[1:]
    df_categories_sorted = df_categories.sort_values(ascending=False)  
    return df_categories_sorted.index.values[0:num_top_venues]

In [137]:
# creating the sorted dataset
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# creating columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# creating a new dataframe
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Postal Code'] = Toronto_grouped['Postal Code']

for ind in np.arange(Toronto_grouped.shape[0]):
    venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Trail,Health Food Store,Pub,Neighborhood,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Yoga Studio
1,M4K,Greek Restaurant,Italian Restaurant,Coffee Shop,Ice Cream Shop,Restaurant,Bookstore,Furniture / Home Store,Yoga Studio,Indian Restaurant,Caribbean Restaurant
2,M4L,Sandwich Place,Fast Food Restaurant,Park,Food & Drink Shop,Liquor Store,Burrito Place,Restaurant,Italian Restaurant,Fish & Chips Shop,Steakhouse
3,M4M,Café,Coffee Shop,American Restaurant,Bakery,Brewery,Gastropub,Yoga Studio,Fish Market,Pet Store,Park
4,M4N,Park,Bus Line,Swim School,Dim Sum Restaurant,Falafel Restaurant,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant


### 4. clustering the neighborhood

In [106]:
# import kmeans
from sklearn.cluster import KMeans

In [136]:
# building the model

# set number of clusters
kclusters = 3

Toronto_grouped_clustering = Toronto_grouped.drop('Postal Code', axis = 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, init="k-means++", n_init=2, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [138]:
# add clustering labels
venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

df_merged = df_loc

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_merged = df_merged.join(venues_sorted.set_index('Postal Code'), on='Postal Code')

df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Neighborhood_first,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Regent Park,1,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Café,Theater,Yoga Studio,Event Space,Restaurant
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,Queen's Park,1,Coffee Shop,Sushi Restaurant,Diner,Yoga Studio,College Auditorium,Beer Bar,Smoothie Shop,Sandwich Place,Burrito Place,Restaurant
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,Garden District,1,Clothing Store,Coffee Shop,Cosmetics Shop,Bubble Tea Shop,Café,Middle Eastern Restaurant,Japanese Restaurant,Lingerie Store,Pizza Place,Movie Theater
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,St. James Town,1,Coffee Shop,Café,Gastropub,Restaurant,American Restaurant,Cocktail Bar,Gym,Hotel,Italian Restaurant,Japanese Restaurant
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Beaches,1,Trail,Health Food Store,Pub,Neighborhood,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Yoga Studio


### 5. visualize the clustering

In [116]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [139]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### conclusion
* Unfortunately, the clustering effect is not significant as we suppose.
* That is to say, with very few exceptions, almost neighborhoods have the similar venue categories nearby. If only considering this point, as long as it's not the isolated neighborhoods, there are lots of choice to move to similar neighborhoods in Toronto.
* However, in order to further cluster neighborhoods, we should add more features that a tenant may consider about, like population, transportation.

## THE END