**Import libraries that will be used.**

In [1]:
import numpy as np
import pandas as pd
import pickle
from geopy.geocoders import Nominatim
import folium
import FourSquareKeys
import json
import requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

**Load the pickle file from the previous section into a dataframe.**

In [2]:
# NOTE(review): unpickling can execute arbitrary code, so this file should only
# ever be a pickle that was produced by a trusted source.
pkl_df = pd.read_pickle('Toronto Neighborhood Merged Dataframe.pkl')
# Preview the first five rows.
pkl_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


**Keep only the rows whose borough name contains "Toronto", and drop the "Postal Code" column.**

In [3]:
# Distinct borough names that mention 'Toronto'.
toronto_boroughs = {borough for borough in pkl_df['Borough'] if 'Toronto' in borough}

# Keep only those boroughs, discard the postal code, and renumber the rows from 0.
df = (
    pkl_df[pkl_df['Borough'].isin(toronto_boroughs)]
    .drop(columns = 'Postal Code')
    .reset_index(drop = True)
)
df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,Downtown Toronto,St. James Town,43.651494,-79.375418
4,East Toronto,The Beaches,43.676357,-79.293031


**Get the coordinates for Toronto.**

In [4]:
address = 'Toronto'

# The geocoder is created with an explicit user agent identifying this notebook.
geolocator = Nominatim(user_agent = 'toronto-explorer')
location = geolocator.geocode(address)

# geocode() returns None when nothing matches the query; stop here with a clear
# message instead of failing with an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude, longitude = location.latitude, location.longitude

print('The geographical coordinate of Toronto is {}, {}'.format(latitude, longitude))

The geographical coordinate of Toronto is 43.6534817, -79.3839347


**Create a map of Toronto using the latitude and longitude values.**


_**Note: Please refer to [nbviewer](https://nbviewer.jupyter.org/github/Ericjung008/Coursera_Capstone/blob/master/Toronto%20Neighborhood%20Clustering.ipynb) to see the visualization.**_

In [5]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 10)

# One circle marker per neighborhood; its popup shows "<neighborhood>,<borough>".
for lat, lng, borough, neighborhood in zip(df.Latitude, df.Longitude, df.Borough, df.Neighborhood):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html = True)
    # parse_html is a Popup option, so it is no longer forwarded to CircleMarker,
    # whose extra keyword arguments are treated as marker styling options.
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7).add_to(map_toronto)

map_toronto

**Assign Foursquare client ID, client secret, and version to variables.**
* _**Note: The client ID and client secret keys were imported from another notebook.**_

In [6]:
# The credentials are read from the separately imported FourSquareKeys module
# rather than being written out in this notebook.
CLIENT_ID = FourSquareKeys.CLIENT_ID
CLIENT_SECRET = FourSquareKeys.CLIENT_SECRET
# Sent as the `v` query parameter of every Foursquare request.
VERSION = '20180605'

**Create a function to return a dataframe that outputs the neighborhood, neighborhood's coordinates, venue names, venue's coordinates, and their category. The function will return venues within 500 meters of each unique neighborhood's coordinates with a limit of 100 venues.**

In [7]:
def getNearbyVenues(names, latitudes, longitudes, radius, LIMIT, timeout=10):
    """Fetch venues around each neighborhood from the Foursquare explore endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences with each neighborhood's name and coordinates.
    radius : int
        Value sent as the `radius` query parameter of the request.
    LIMIT : int
        Value sent as the `limit` query parameter of the request.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhoods',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. When no
        venue is returned the frame is empty but still has these columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhoods',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    url = 'https://api.foursquare.com/v2/venues/explore'
    rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(url, params=params, timeout=timeout)
        # Surface HTTP failures explicitly instead of as a KeyError while parsing the body.
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            # A venue with an empty category list is kept, with a missing category.
            categories = venue.get('categories') or []
            rows.append((name,
                         lat,
                         lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema even when
    # there are no rows; assigning them afterwards fails on an empty frame.
    return pd.DataFrame(rows, columns=columns)
            

In [8]:
# Query every neighborhood, passing 500 as the search radius and 100 as the venue limit.
venues = getNearbyVenues(
    names = df.Neighborhood,
    latitudes = df.Latitude,
    longitudes = df.Longitude,
    radius = 500,
    LIMIT = 100,
)

**The dataframe has 1621 rows and 7 columns.**

In [9]:
# (number of rows, number of columns) of the venues dataframe.
venues.shape

(1621, 7)

**The dataframe is grouped by neighborhoods, and the number of venues returned for each neighborhood is shown below.**

In [10]:
# count() tallies the non-null entries of every column within each neighborhood.
venues.groupby('Neighborhoods').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhoods,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,58,58,58,58,58,58
"Brockton, Parkdale Village, Exhibition Place",22,22,22,22,22,22
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",17,17,17,17,17,17
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,64,64,64,64,64,64
Christie,16,16,16,16,16,16
Church and Wellesley,75,75,75,75,75,75
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,32,32,32,32,32,32
Davisville North,8,8,8,8,8,8


**Counting the number of unique categories.**

In [11]:
# Number of distinct values in the 'Venue Category' column.
unique_category_count = venues['Venue Category'].nunique()
print('There are {} unique categories.'.format(unique_category_count))

There are 232 unique categories.


**The "Venue Category" column is separated into many columns, one for each unique value. The values are also encoded to 0's and 1's.**  
**The "Neighborhoods" column is added to the encoded dataframe.**  
**The "Neighborhoods" column is moved as the first column of the dataframe.**  
**Finally, the first five rows are outputted, ensuring the desired result.**

In [12]:
# One indicator column per venue category, named exactly after the category.
category_indicators = pd.get_dummies(venues['Venue Category'], prefix = '', prefix_sep = '')

# Attach the neighborhood of every venue as an additional column ...
onehot_venues = category_indicators.assign(Neighborhoods = venues['Neighborhoods'])

# ... then move the last column to the front.
trailing_column = onehot_venues.columns[-1]
onehot_venues = onehot_venues[[trailing_column, *onehot_venues.columns[:-1]]]

onehot_venues.head()

Unnamed: 0,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Confirming the number of rows and columns for the new dataframe.**

In [13]:
# (number of rows, number of columns) of the one-hot encoded dataframe.
onehot_venues.shape

(1621, 233)

**The encoded dataframe is then grouped by neighborhoods and the mean for the remaining columns is returned.**

In [14]:
# Average every indicator column within each neighborhood.
category_means = onehot_venues.groupby('Neighborhoods').mean()
# Turn the 'Neighborhoods' group labels back into an ordinary column.
venues_group = category_means.reset_index()
venues_group.head()

Unnamed: 0,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.015625,0.0,0.015625


**Confirming the number of rows and columns for the new dataframe.**

In [15]:
# (number of rows, number of columns) of the grouped dataframe.
venues_group.shape

(39, 233)

**Printing the five most frequent venue categories for each neighborhood, together with their mean frequency.**

In [16]:
# Walk the grouped frame one neighborhood (row) at a time.
for _, group_row in venues_group.iterrows():
    print('----' + group_row['Neighborhoods'] + '----')

    # Everything after the leading 'Neighborhoods' entry is a category frequency.
    frequencies = group_row.iloc[1:].astype('float').round(decimals = 2)
    frequency_table = pd.DataFrame({'venue': frequencies.index, 'freq': frequencies.to_numpy()})

    # Highest rounded frequencies first, renumbered from 0; head() keeps the first five.
    ranked = frequency_table.sort_values(ascending = False, by = 'freq').reset_index(drop = True)
    print(ranked.head())
    print('\n')

----Berczy Park----
            venue  freq
0     Coffee Shop  0.10
1  Farmers Market  0.03
2        Beer Bar  0.03
3      Restaurant  0.03
4            Café  0.03


----Brockton, Parkdale Village, Exhibition Place----
                venue  freq
0                Café  0.14
1      Breakfast Spot  0.09
2         Coffee Shop  0.09
3          Restaurant  0.05
4  Italian Restaurant  0.05


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                  venue  freq
0    Light Rail Station  0.12
1           Yoga Studio  0.06
2               Brewery  0.06
3  Gym / Fitness Center  0.06
4         Garden Center  0.06


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3             Plane  0.06
4  Sculpture Garden  0.06


----Central Bay Street----
                 venue  freq
0      

**Creating a dataframe outputting the top 10 categories for each neighborhood.**

In [17]:
def get_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of a row, skipping its first entry.

    The first element of ``row`` is ignored; the remaining values are sorted in
    descending order and the index labels of the first ``num_top_venues`` of
    them are returned.
    """
    ranked = row.iloc[1:].sort_values(ascending = False)
    return ranked.index[:num_top_venues]

In [18]:
num_top_venues = 10


def _ordinal_suffix(rank):
    """Return the English ordinal suffix of a positive integer (1 -> 'st', 11 -> 'th', 22 -> 'nd')."""
    # 11th, 12th and 13th are the exceptions to the last-digit rule.
    if 11 <= rank % 100 <= 13:
        return 'th'
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(rank % 10, 'th')


# Column headers: the neighborhood name followed by '1st Most Common Venue', '2nd ...', and so on.
columns = ['Neighborhoods'] + [
    '{}{} Most Common Venue'.format(rank, _ordinal_suffix(rank))
    for rank in range(1, num_top_venues + 1)
]

# Build every output row first and create the dataframe in a single step,
# instead of filling an empty frame one row at a time.
top_venue_rows = [
    [group_row['Neighborhoods'], *get_most_common_venues(group_row, num_top_venues)]
    for _, group_row in venues_group.iterrows()
]
venues_sorted = pd.DataFrame(top_venue_rows, columns = columns, index = venues_group.index)

venues_sorted

Unnamed: 0,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Seafood Restaurant,Cheese Shop,Café,Bakery,Cocktail Bar,Beer Bar,Restaurant,Farmers Market,Clothing Store
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Performing Arts Venue,Stadium,Burrito Place,Restaurant,Climbing Gym,Pet Store,Bakery
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Auto Workshop,Park,Comic Shop,Pizza Place,Recording Studio,Restaurant,Burrito Place,Brewery,Skate Park
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Bar,Coffee Shop,Plane,Rental Car Location,Sculpture Garden,Boat or Ferry
4,Central Bay Street,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Japanese Restaurant,Department Store,Salad Place,Bubble Tea Shop,Burger Joint,Indian Restaurant
5,Christie,Grocery Store,Café,Park,Candy Store,Coffee Shop,Nightclub,Diner,Baby Store,Restaurant,Italian Restaurant
6,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Café,Pub,Men's Store,Mediterranean Restaurant,Hotel
7,"Commerce Court, Victoria Hotel",Coffee Shop,Restaurant,Café,Hotel,Gym,American Restaurant,Seafood Restaurant,Italian Restaurant,Japanese Restaurant,Asian Restaurant
8,Davisville,Sandwich Place,Dessert Shop,Pizza Place,Coffee Shop,Italian Restaurant,Café,Sushi Restaurant,Gym,Park,Gourmet Shop
9,Davisville North,Gym / Fitness Center,Hotel,Department Store,Dog Run,Sandwich Place,Breakfast Spot,Food & Drink Shop,Park,General Travel,General Entertainment


**Using KMeans to create 5 clusters and assign each neighborhood to a cluster.**

In [19]:
kclusters = 5

# Cluster on the category frequencies only; the neighborhood name is not a feature.
venues_clustering = venues_group.drop(columns = 'Neighborhoods')

# n_init is pinned because newer scikit-learn releases changed its default,
# which would otherwise change the clustering between library versions.
model = KMeans(n_clusters = kclusters, n_init = 10, random_state = 0)
model.fit(venues_clustering)

# insert() raises if the column already exists, so remove a label column left
# by a previous run of this cell before adding the fresh labels.
if 'Cluster Labels' in venues_sorted.columns:
    venues_sorted = venues_sorted.drop(columns = 'Cluster Labels')
venues_sorted.insert(0, 'Cluster Labels', model.labels_)

**Merging the neighborhood dataframe with the clustered venue rankings so that the result has the following columns:**
1. _**Borough**_
2. _**Neighborhood**_
3. _**Latitude**_
4. _**Longitude**_
5. _**Cluster Labels**_
6. _**Top 10 Most Common Venues**_

In [20]:
# Join the neighborhood coordinates with the ranked venues on the neighborhood
# name, then discard the right-hand key column, which duplicates 'Neighborhood'.
venues_merged = (
    df.merge(venues_sorted, left_on = 'Neighborhood', right_on = 'Neighborhoods')
      .drop(columns = 'Neighborhoods')
)

venues_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2,Coffee Shop,Park,Pub,Bakery,Breakfast Spot,Restaurant,Café,Theater,Dessert Shop,Shoe Store
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2,Coffee Shop,Diner,Yoga Studio,Fried Chicken Joint,Burrito Place,Café,Smoothie Shop,Japanese Restaurant,Distribution Center,Beer Bar
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,2,Clothing Store,Coffee Shop,Café,Japanese Restaurant,Italian Restaurant,Bubble Tea Shop,Cosmetics Shop,Electronics Store,Pizza Place,Plaza
3,Downtown Toronto,St. James Town,43.651494,-79.375418,2,Coffee Shop,Café,Clothing Store,Cosmetics Shop,Restaurant,American Restaurant,Cocktail Bar,Gym,Beer Bar,Seafood Restaurant
4,East Toronto,The Beaches,43.676357,-79.293031,0,Neighborhood,Health Food Store,Pub,Trail,Yoga Studio,Doner Restaurant,Discount Store,Distribution Center,Dog Run,Dumpling Restaurant


**Visualize the clusters.**

_**Note: Please refer to [nbviewer](https://nbviewer.jupyter.org/github/Ericjung008/Coursera_Capstone/blob/master/Toronto%20Neighborhood%20Clustering.ipynb) to see the visualization.**_

In [21]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# One marker per neighborhood, coloured by its cluster label.
for lat, lon, poi, cluster in zip(venues_merged['Latitude'], venues_merged['Longitude'], venues_merged['Neighborhood'], venues_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The labels run from 0 to kclusters - 1, so they index the colour list
    # directly; the previous `cluster-1` sent cluster 0 to the last colour.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

**Return the rows that are in cluster 0.**

In [22]:
# Rows labelled cluster 0, showing only column 1 (the neighborhood name)
# and the columns from position 5 onward (the ranked venue columns).
cluster_rows = venues_merged[venues_merged['Cluster Labels'] == 0]
cluster_rows.iloc[:, [1, *range(5, venues_merged.shape[1])]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,The Beaches,Neighborhood,Health Food Store,Pub,Trail,Yoga Studio,Doner Restaurant,Discount Store,Distribution Center,Dog Run,Dumpling Restaurant
18,Lawrence Park,Park,Bus Line,Swim School,Fast Food Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
21,"Forest Hill North & West, Forest Hill Road Park",Trail,Jewelry Store,Bus Line,Sushi Restaurant,Yoga Studio,Diner,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


**Return the rows that are in cluster 1.**

In [23]:
# Rows labelled cluster 1, showing only column 1 (the neighborhood name)
# and the columns from position 5 onward (the ranked venue columns).
cluster_rows = venues_merged[venues_merged['Cluster Labels'] == 1]
cluster_rows.iloc[:, [1, *range(5, venues_merged.shape[1])]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,Roselawn,Garden,Health & Beauty Service,Home Service,Yoga Studio,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


**Return the rows that are in cluster 2.**

In [24]:
# Rows labelled cluster 2, showing only column 1 (the neighborhood name)
# and the columns from position 5 onward (the ranked venue columns).
cluster_rows = venues_merged[venues_merged['Cluster Labels'] == 2]
cluster_rows.iloc[:, [1, *range(5, venues_merged.shape[1])]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Regent Park, Harbourfront",Coffee Shop,Park,Pub,Bakery,Breakfast Spot,Restaurant,Café,Theater,Dessert Shop,Shoe Store
1,"Queen's Park, Ontario Provincial Government",Coffee Shop,Diner,Yoga Studio,Fried Chicken Joint,Burrito Place,Café,Smoothie Shop,Japanese Restaurant,Distribution Center,Beer Bar
2,"Garden District, Ryerson",Clothing Store,Coffee Shop,Café,Japanese Restaurant,Italian Restaurant,Bubble Tea Shop,Cosmetics Shop,Electronics Store,Pizza Place,Plaza
3,St. James Town,Coffee Shop,Café,Clothing Store,Cosmetics Shop,Restaurant,American Restaurant,Cocktail Bar,Gym,Beer Bar,Seafood Restaurant
5,Berczy Park,Coffee Shop,Seafood Restaurant,Cheese Shop,Café,Bakery,Cocktail Bar,Beer Bar,Restaurant,Farmers Market,Clothing Store
6,Central Bay Street,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Japanese Restaurant,Department Store,Salad Place,Bubble Tea Shop,Burger Joint,Indian Restaurant
7,Christie,Grocery Store,Café,Park,Candy Store,Coffee Shop,Nightclub,Diner,Baby Store,Restaurant,Italian Restaurant
8,"Richmond, Adelaide, King",Coffee Shop,Café,Bar,Restaurant,Gym,Clothing Store,Hotel,Steakhouse,Thai Restaurant,Pizza Place
9,"Dufferin, Dovercourt Village",Pharmacy,Bakery,Grocery Store,Music Venue,Café,Middle Eastern Restaurant,Bar,Supermarket,Bank,Brewery
10,"Harbourfront East, Union Station, Toronto Islands",Coffee Shop,Aquarium,Hotel,Café,Italian Restaurant,Fried Chicken Joint,Brewery,Scenic Lookout,Restaurant,History Museum


**Return the rows that are in cluster 3.**

In [25]:
# Rows labelled cluster 3, showing only column 1 (the neighborhood name)
# and the columns from position 5 onward (the ranked venue columns).
cluster_rows = venues_merged[venues_merged['Cluster Labels'] == 3]
cluster_rows.iloc[:, [1, *range(5, venues_merged.shape[1])]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
33,Rosedale,Park,Playground,Trail,Department Store,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


**Return the rows that are in cluster 4.**

In [26]:
# Rows labelled cluster 4, showing only column 1 (the neighborhood name)
# and the columns from position 5 onward (the ranked venue columns).
cluster_rows = venues_merged[venues_merged['Cluster Labels'] == 4]
cluster_rows.iloc[:, [1, *range(5, venues_merged.shape[1])]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
29,"Moore Park, Summerhill East",Gym,Tennis Court,Restaurant,Comic Shop,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
