# Explore and cluster the neighborhoods in Toronto
## Scrape the Wiki page to get the list of PostalCode, Borough, and Neighborhood
### Only processing the cells that have an assigned borough. Ignoring the cells with a borough that is Not assigned
### cancating the neighborhoods based on postal code, seperated by ','
### print heads of dataframes
### Analyze neighborhoods details like we did in lab (Newyork, NY)

Let's start with importing the packages. we will use wiki pakage to scrape details from wiki and will use geopy to get latitude and longitude.. 


In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

!conda install -c conda-forge wikipedia
import wikipedia as wp

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.18.1                     py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
wikipedia                 1.4.0                    py35_0    conda-forge
Libraries imported.


# Scraping Wiki to prepare Toronto dataframe

#### Tranform the data into a *pandas* dataframe

#### Borough and Neighbourhoods inforamtion extracted from Wiki, but there is no latitude and longitude information.  To get the latitude and longitude values of Borough and Neighbourhood of Toronto use geopy library and update each row of dataset

In [2]:
html = wp.page("List of postal codes of Canada: M").html().encode("UTF-8")
df = pd.read_html(html)[0]

#df.to_csv('beautifulsoup_pandas.csv',header=0,index=False)
df=df.rename(columns={0: 'PostalCode', 1:"Borough", 2: "Neighborhood"})
df.drop(df[df.Borough =="Not assigned"].index, inplace=True)

df.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge


In [3]:
grouped= df.groupby(['PostalCode', 'Borough'], as_index=False, sort=True).apply(lambda group: ','.join(group['Neighborhood'])).reset_index()
grouped.rename(columns={0 : 'Neighborhood'}, inplace=True)
for key, data in grouped.iterrows():
   # print(data['Neighbourhoods'])
   
    if data['Neighborhood'] =='Not assigned':
        grouped.loc[key,'Neighborhood'] = data['Borough']
        
grouped.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [4]:
grouped.shape


(104, 3)

If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park

Update the Latitude and Longitude columns 

In [5]:


grouped.add('Latitude', fill_value=None)
grouped.add('Longitude', fill_value=None)
# Exclude the first row..
neighborhoods=grouped
#neighborhoods.dropna(thresh=2, inplace=True)

for key, data in neighborhoods.iterrows():
   # print(data['Neighbourhoods'])
   try: 
    
    address = data['Borough'] + ',' + data['PostalCode']
       #print(address)
    geolocator = Nominatim()
    location = geolocator.geocode(address)
    neighborhoods.loc[key,'Latitude'] = location.latitude
    neighborhoods.loc[key, 'Longitude'] = location.longitude
       #print( location.latitude, location.longitude)
   except:
        #print(address)
        pass
neighborhoods.head(100)



Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",54.28476,-0.409034
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",54.28476,-0.409034
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",,
3,M1G,Scarborough,Woburn,43.762669,-79.230861
4,M1H,Scarborough,Cedarbrae,,
5,M1J,Scarborough,Scarborough Village,,
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",,
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",,
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",,
9,M1N,Scarborough,"Birch Cliff,Cliffside West",,


In [38]:
#Remove all rows where no latitude and longitude... there may be a reason,geopy doesn't retrive latitude and longitude..
neighborhoods.dropna(inplace=True)
neighborhoods.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",54.28476,-0.409034
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",54.28476,-0.409034
3,M1G,Scarborough,Woburn,43.762669,-79.230861
15,M1W,Scarborough,"L'Amoreaux West,Steeles West",43.773077,-79.257774
18,M2J,North York,"Fairview,Henry Farm,Oriole",43.754326,-79.449117
21,M2M,North York,"Newtonbrook,Willowdale",43.763531,-79.411147
22,M2N,North York,Willowdale South,43.754326,-79.449117
27,M3C,North York,"Flemingdon Park,Don Mills South",43.732822,-79.346961
28,M3H,North York,"Bathurst Manor,Downsview North,Wilson Heights",43.756199,-79.439802
29,M3J,North York,"Northwood Park,York University",43.754326,-79.449117


In [39]:



print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 7 boroughs and 18 neighborhoods.


In [40]:
address = 'Toronto, CA'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto City are 43.653963, -79.387207.


#### Create a map of New York with neighborhoods superimposed on top.

In [41]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Folium is a great visualization library. Feel free to zoom into the above map, and click on each circle mark to reveal the name of the neighborhood and its respective borough.

However, for illustration purposes, let's simplify the above map and segment and cluster only the neighborhoods in North York. So let's slice the original dataframe and create a new dataframe of the North York data.

In [42]:
NorthYork_data = neighborhoods[neighborhoods['Borough'] == 'North York'].reset_index(drop=True)
NorthYork_data.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M2J,North York,"Fairview,Henry Farm,Oriole",43.754326,-79.449117
1,M2M,North York,"Newtonbrook,Willowdale",43.763531,-79.411147
2,M2N,North York,Willowdale South,43.754326,-79.449117
3,M3C,North York,"Flemingdon Park,Don Mills South",43.732822,-79.346961
4,M3H,North York,"Bathurst Manor,Downsview North,Wilson Heights",43.756199,-79.439802
5,M3J,North York,"Northwood Park,York University",43.754326,-79.449117


In [43]:
address = 'North York, Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of North York are 43.7709163, -79.4124102.


As we did with all of Toronto City, let's visualizat North York the neighborhoods in it.

In [44]:
# create map of Manhattan using latitude and longitude values
map_NorthYork = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(NorthYork_data['Latitude'], NorthYork_data['Longitude'], NorthYork_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NorthYork)  
    
map_NorthYork

 Foursquare API to explore the neighborhoods and segment them

In [45]:
#
CLIENT_ID = '3QMDP42DYXT0M5LCCXYJNY5B2WNPFZIKQEL1NAQA1FUFNYKC' # your Foursquare ID
CLIENT_SECRET = 'AD3MSB2U4MLVBBTK5YMNYBF2ULA5J4JSN3LO5NVWMQY4BEBD' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3QMDP42DYXT0M5LCCXYJNY5B2WNPFZIKQEL1NAQA1FUFNYKC
CLIENT_SECRET:AD3MSB2U4MLVBBTK5YMNYBF2ULA5J4JSN3LO5NVWMQY4BEBD


In [46]:
NorthYork_data.loc[0, 'Neighborhood']
neighborhood_latitude = NorthYork_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = NorthYork_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = NorthYork_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Fairview,Henry Farm,Oriole are 43.7543263, -79.4491169663959.


Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.
First, let's create the GET request URL. Name your URL url.

In [47]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c377df34c1f67402fd3c3fc'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-5011603fe4b07c3cf1967fba-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/coffeeshop_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d1e0931735',
         'name': 'Coffee Shop',
         'pluralName': 'Coffee Shops',
         'primary': True,
         'shortName': 'Coffee Shop'}],
       'id': '5011603fe4b07c3cf1967fba',
       'location': {'address': '680 Sheppard Ave West',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'Bryant',
        'distance': 474,
        'formattedAddress': ['680 Sheppard Ave West (Bryant)',
         'Toronto ON M3H 2S5',
         'Canada'],
        'labeledLatLngs

In [48]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [49]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Tim Hortons,Coffee Shop,43.754767,-79.44325
1,Domino's Pizza,Pizza Place,43.753049,-79.450965
2,Orly Restaurant & Grill,Middle Eastern Restaurant,43.754493,-79.443507
3,lori@itsyourtreasure.com,Women's Store,43.750265,-79.447868


In [50]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


2. Explore Neighborhoods in North York

#### Let's create a function to repeat the same process to all the neighborhoods in North York


In [51]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [52]:
northyork_venues = getNearbyVenues(names=NorthYork_data['Neighborhood'],
                                   latitudes=NorthYork_data['Latitude'],
                                   longitudes=NorthYork_data['Longitude']
                                  )


Fairview,Henry Farm,Oriole
Newtonbrook,Willowdale
Willowdale South
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University


In [53]:
print(northyork_venues.shape)
northyork_venues.head()

(118, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Fairview,Henry Farm,Oriole",43.754326,-79.449117,Tim Hortons,43.754767,-79.44325,Coffee Shop
1,"Fairview,Henry Farm,Oriole",43.754326,-79.449117,Domino's Pizza,43.753049,-79.450965,Pizza Place
2,"Fairview,Henry Farm,Oriole",43.754326,-79.449117,Orly Restaurant & Grill,43.754493,-79.443507,Middle Eastern Restaurant
3,"Fairview,Henry Farm,Oriole",43.754326,-79.449117,lori@itsyourtreasure.com,43.750265,-79.447868,Women's Store
4,"Newtonbrook,Willowdale",43.763531,-79.411147,Sushi Moto Sake & Wine Bar,43.763902,-79.411559,Sushi Restaurant


In [54]:
northyork_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor,Downsview North,Wilson Heights",18,18,18,18,18,18
"Fairview,Henry Farm,Oriole",4,4,4,4,4,4
"Flemingdon Park,Don Mills South",36,36,36,36,36,36
"Newtonbrook,Willowdale",52,52,52,52,52,52
"Northwood Park,York University",4,4,4,4,4,4
Willowdale South,4,4,4,4,4,4


In [55]:
print('There are {} uniques categories.'.format(len(northyork_venues['Venue Category'].unique())))

There are 61 uniques categories.


## 3. Analyze Each Neighborhood

In [56]:
# one hot encoding
northyork_onehot = pd.get_dummies(northyork_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
northyork_onehot['Neighborhood'] = northyork_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [northyork_onehot.columns[-1]] + list(northyork_onehot.columns[:-1])
northyork_onehot = northyork_onehot[fixed_columns]

northyork_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Arts & Crafts Store,Bakery,Bank,Bar,Bridal Shop,Bubble Tea Shop,Burger Joint,Café,Cantonese Restaurant,Chinese Restaurant,Chocolate Shop,Clothing Store,Coffee Shop,Community Center,Cosmetics Shop,Deli / Bodega,Dessert Shop,Diner,Electronics Store,Fast Food Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,Gourmet Shop,Grocery Store,Gym,Ice Cream Shop,Indonesian Restaurant,Italian Restaurant,Japanese Restaurant,Juice Bar,Karaoke Bar,Kids Store,Korean Restaurant,Liquor Store,Lounge,Mexican Restaurant,Middle Eastern Restaurant,Movie Theater,Park,Pet Store,Pharmacy,Pizza Place,Plaza,Pub,Ramen Restaurant,Restaurant,Salon / Barbershop,Sandwich Place,Shoe Store,Shopping Mall,Spa,Sporting Goods Shop,Steakhouse,Supermarket,Sushi Restaurant,Thai Restaurant,Theater,Video Store,Women's Store
0,"Fairview,Henry Farm,Oriole",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Fairview,Henry Farm,Oriole",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Fairview,Henry Farm,Oriole",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Fairview,Henry Farm,Oriole",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,"Newtonbrook,Willowdale",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [57]:
northyork_onehot.shape

(118, 62)

In [58]:
northyork_grouped = northyork_onehot.groupby('Neighborhood').mean().reset_index()
northyork_grouped

Unnamed: 0,Neighborhood,American Restaurant,Arts & Crafts Store,Bakery,Bank,Bar,Bridal Shop,Bubble Tea Shop,Burger Joint,Café,Cantonese Restaurant,Chinese Restaurant,Chocolate Shop,Clothing Store,Coffee Shop,Community Center,Cosmetics Shop,Deli / Bodega,Dessert Shop,Diner,Electronics Store,Fast Food Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,Gourmet Shop,Grocery Store,Gym,Ice Cream Shop,Indonesian Restaurant,Italian Restaurant,Japanese Restaurant,Juice Bar,Karaoke Bar,Kids Store,Korean Restaurant,Liquor Store,Lounge,Mexican Restaurant,Middle Eastern Restaurant,Movie Theater,Park,Pet Store,Pharmacy,Pizza Place,Plaza,Pub,Ramen Restaurant,Restaurant,Salon / Barbershop,Sandwich Place,Shoe Store,Shopping Mall,Spa,Sporting Goods Shop,Steakhouse,Supermarket,Sushi Restaurant,Thai Restaurant,Theater,Video Store,Women's Store
0,"Bathurst Manor,Downsview North,Wilson Heights",0.0,0.0,0.0,0.055556,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.055556,0.0,0.055556,0.0,0.055556,0.0,0.055556,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.055556,0.055556,0.0,0.0,0.0,0.055556,0.0,0.055556,0.0,0.055556,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.055556,0.0
1,"Fairview,Henry Farm,Oriole",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25
2,"Flemingdon Park,Don Mills South",0.055556,0.0,0.027778,0.027778,0.027778,0.0,0.027778,0.027778,0.027778,0.027778,0.0,0.027778,0.0,0.027778,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.027778,0.0,0.027778,0.0,0.027778,0.0,0.027778,0.0,0.027778,0.0,0.027778,0.027778,0.027778,0.0,0.027778,0.0,0.027778,0.0,0.0,0.0,0.055556,0.027778,0.027778,0.027778,0.027778,0.027778,0.027778,0.0,0.027778,0.027778,0.0,0.0,0.0,0.055556
3,"Newtonbrook,Willowdale",0.019231,0.019231,0.019231,0.019231,0.019231,0.0,0.0,0.0,0.057692,0.0,0.019231,0.0,0.019231,0.057692,0.0,0.0,0.0,0.019231,0.0,0.019231,0.057692,0.038462,0.019231,0.019231,0.0,0.019231,0.019231,0.0,0.019231,0.0,0.096154,0.0,0.019231,0.0,0.038462,0.0,0.019231,0.0,0.019231,0.0,0.019231,0.0,0.038462,0.038462,0.019231,0.019231,0.038462,0.038462,0.0,0.038462,0.0,0.0,0.0,0.0,0.019231,0.0,0.019231,0.019231,0.019231,0.0,0.0
4,"Northwood Park,York University",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25
5,Willowdale South,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25


In [59]:
northyork_grouped.shape

(6, 62)

#### Let's print each neighborhood along with the top 5 most common venues

In [60]:
num_top_venues = 5

for hood in northyork_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = northyork_grouped[northyork_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor,Downsview North,Wilson Heights----
                 venue  freq
0          Coffee Shop  0.11
1       Sandwich Place  0.06
2             Pharmacy  0.06
3   Frozen Yogurt Shop  0.06
4  Fried Chicken Joint  0.06


----Fairview,Henry Farm,Oriole----
                       venue  freq
0              Women's Store  0.25
1  Middle Eastern Restaurant  0.25
2                Pizza Place  0.25
3                Coffee Shop  0.25
4           Ramen Restaurant  0.00


----Flemingdon Park,Don Mills South----
                 venue  freq
0  American Restaurant  0.06
1       Cosmetics Shop  0.06
2           Restaurant  0.06
3        Women's Store  0.06
4                  Bar  0.03


----Newtonbrook,Willowdale----
                  venue  freq
0   Japanese Restaurant  0.10
1                  Café  0.06
2  Fast Food Restaurant  0.06
3           Coffee Shop  0.06
4   Fried Chicken Joint  0.04


----Northwood Park,York University----
                       venue  freq
0              Women

#### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [61]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [62]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = northyork_grouped['Neighborhood']

for ind in np.arange(northyork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(northyork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Pizza Place,Shopping Mall,Community Center,Pharmacy,Deli / Bodega,Diner,Restaurant,Sandwich Place,Fast Food Restaurant
1,"Fairview,Henry Farm,Oriole",Women's Store,Middle Eastern Restaurant,Coffee Shop,Pizza Place,Cosmetics Shop,Gym,Grocery Store,Gourmet Shop,Gas Station,Frozen Yogurt Shop
2,"Flemingdon Park,Don Mills South",Women's Store,Restaurant,Cosmetics Shop,American Restaurant,Sushi Restaurant,Burger Joint,Gourmet Shop,Coffee Shop,Ice Cream Shop,Supermarket
3,"Newtonbrook,Willowdale",Japanese Restaurant,Café,Coffee Shop,Fast Food Restaurant,Fried Chicken Joint,Pharmacy,Pizza Place,Sandwich Place,Korean Restaurant,Restaurant
4,"Northwood Park,York University",Women's Store,Middle Eastern Restaurant,Coffee Shop,Pizza Place,Cosmetics Shop,Gym,Grocery Store,Gourmet Shop,Gas Station,Frozen Yogurt Shop
5,Willowdale South,Women's Store,Middle Eastern Restaurant,Coffee Shop,Pizza Place,Cosmetics Shop,Gym,Grocery Store,Gourmet Shop,Gas Station,Frozen Yogurt Shop


## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [63]:
# set number of clusters
kclusters = 5

northyork_grouped_clustering = northyork_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(northyork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ 

array([2, 0, 3, 1, 0, 0], dtype=int32)

In [64]:
#NorthYork_data.apply(lambda col: col.drop_duplicates().reset_index(drop=True))

northyork_merged = NorthYork_data
northyork_merged.shape
# add clustering labels
northyork_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
northyork_merged = northyork_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

northyork_merged.head(50) # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2J,North York,"Fairview,Henry Farm,Oriole",43.754326,-79.449117,2,Women's Store,Middle Eastern Restaurant,Coffee Shop,Pizza Place,Cosmetics Shop,Gym,Grocery Store,Gourmet Shop,Gas Station,Frozen Yogurt Shop
1,M2M,North York,"Newtonbrook,Willowdale",43.763531,-79.411147,0,Japanese Restaurant,Café,Coffee Shop,Fast Food Restaurant,Fried Chicken Joint,Pharmacy,Pizza Place,Sandwich Place,Korean Restaurant,Restaurant
2,M2N,North York,Willowdale South,43.754326,-79.449117,3,Women's Store,Middle Eastern Restaurant,Coffee Shop,Pizza Place,Cosmetics Shop,Gym,Grocery Store,Gourmet Shop,Gas Station,Frozen Yogurt Shop
3,M3C,North York,"Flemingdon Park,Don Mills South",43.732822,-79.346961,1,Women's Store,Restaurant,Cosmetics Shop,American Restaurant,Sushi Restaurant,Burger Joint,Gourmet Shop,Coffee Shop,Ice Cream Shop,Supermarket
4,M3H,North York,"Bathurst Manor,Downsview North,Wilson Heights",43.756199,-79.439802,0,Coffee Shop,Pizza Place,Shopping Mall,Community Center,Pharmacy,Deli / Bodega,Diner,Restaurant,Sandwich Place,Fast Food Restaurant
5,M3J,North York,"Northwood Park,York University",43.754326,-79.449117,0,Women's Store,Middle Eastern Restaurant,Coffee Shop,Pizza Place,Cosmetics Shop,Gym,Grocery Store,Gourmet Shop,Gas Station,Frozen Yogurt Shop


In [65]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(northyork_merged['Latitude'], northyork_merged['Longitude'], northyork_merged['Neighborhood'], northyork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters


Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

In [66]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 0, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,0,Japanese Restaurant,Café,Coffee Shop,Fast Food Restaurant,Fried Chicken Joint,Pharmacy,Pizza Place,Sandwich Place,Korean Restaurant,Restaurant
4,North York,0,Coffee Shop,Pizza Place,Shopping Mall,Community Center,Pharmacy,Deli / Bodega,Diner,Restaurant,Sandwich Place,Fast Food Restaurant
5,North York,0,Women's Store,Middle Eastern Restaurant,Coffee Shop,Pizza Place,Cosmetics Shop,Gym,Grocery Store,Gourmet Shop,Gas Station,Frozen Yogurt Shop


In [67]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 1, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,North York,1,Women's Store,Restaurant,Cosmetics Shop,American Restaurant,Sushi Restaurant,Burger Joint,Gourmet Shop,Coffee Shop,Ice Cream Shop,Supermarket


In [68]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 2, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,2,Women's Store,Middle Eastern Restaurant,Coffee Shop,Pizza Place,Cosmetics Shop,Gym,Grocery Store,Gourmet Shop,Gas Station,Frozen Yogurt Shop


In [69]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 3, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,North York,3,Women's Store,Middle Eastern Restaurant,Coffee Shop,Pizza Place,Cosmetics Shop,Gym,Grocery Store,Gourmet Shop,Gas Station,Frozen Yogurt Shop


In [70]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 4, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
