### Week 3 Applied Data Science Capstone Assignment
 #### Task 1 Transform the data on Wiki page into pandas dataframe

Import required libraries

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
#Grep the wiki page
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_html = requests.get(url).text
soup = BeautifulSoup(wiki_html, 'html.parser')

In [3]:
data = []
for tr in soup.tbody.find_all('tr'):
    data.append([ td.get_text().strip() for td in tr.find_all('td')])

In [4]:
#create dataframe
df = pd.DataFrame(data, columns=['PostalCode','Borough','Neighborhood'])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [5]:
df.shape

(290, 3)

Remove 'Not assigned' and 'None' rows

In [6]:
not_assigned = 'Not assigned'
not_assigned_row = df[ (df.Borough == not_assigned) & (df.Neighborhood == not_assigned) ]
not_assigned_row.head(), not_assigned_row.shape
df.drop(not_assigned_row.index, inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [7]:
df.shape

(212, 3)

In [8]:
#Add neighborhood column
def neighborhood_list(grouped):    
    if( len(grouped) == 1 ):
        # only one line under the postal code assign the Borough as Neighborhood
        borough = grouped['Borough'].tolist()[0] 
        neighborhood = grouped['Neighborhood'].tolist()[0] 
        if( neighborhood) == not_assigned:
            return borough
        else:
            return neighborhood
    else:
        # transform grouped Neighborhood as single value separated with commas
        return ', '.join(sorted(grouped['Neighborhood'].tolist())) 
                    
grp = df.groupby(['PostalCode', 'Borough'])
df1 = grp.apply(neighborhood_list).reset_index(name='Neighborhood')

In [9]:
df1

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [10]:
df1.shape

(103, 3)

#### TAsk2 Add latitude and longitudes coordinates

In [11]:
!pip install geocoder
import geocoder

Collecting geocoder
  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K    100% |████████████████████████████████| 102kB 5.4MB/s ta 0:00:01
[?25hRequirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: click in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: future in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Requirement 

In [12]:
def get_latlng(postal_code):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    return lat_lng_coords
    
get_latlng('M2H')

[43.80284500000005, -79.35623615099996]

In [13]:
postal_codes = df1['PostalCode']    
coords = [ get_latlng(postal_code) for postal_code in postal_codes.tolist() ]

Add columns for coordinates

In [14]:
df1_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df1['Latitude'] = df1_coords['Latitude']
df1['Longitude'] = df1_coords['Longitude']

In [15]:
df1

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.811650,-79.195561
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.785605,-79.158701
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765690,-79.175299
3,M1G,Scarborough,Woburn,43.768216,-79.217610
4,M1H,Scarborough,Cedarbrae,43.769608,-79.239440
5,M1J,Scarborough,Scarborough Village,43.743085,-79.232172
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.726260,-79.263670
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713213,-79.284910
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.723575,-79.234976
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.696690,-79.260069


### Task3 Explore and cluster Toronto neighborhoods

In [16]:
#import libraries
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00   4.87 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  36.02 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  30.93 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  46.27 MB/s


In [17]:
toronto_coords = get_latlng('')
toronto_coords

[43.648690000000045, -79.38543999999996]

worked with boroughs that contain the word Toronto

In [18]:
df2 = df1[ df1.Borough.str.contains('Toronto') ]

In [19]:
df2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676531,-79.29541
41,M4K,East Toronto,"Riverdale, The Danforth West",43.683262,-79.35512
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.667985,-79.314642
43,M4M,East Toronto,Studio District,43.662766,-79.33483
44,M4N,Central Toronto,Lawrence Park,43.728135,-79.38709
45,M4P,Central Toronto,Davisville North,43.712755,-79.388514
46,M4R,Central Toronto,North Toronto West,43.714523,-79.40696
47,M4S,Central Toronto,Davisville,43.702765,-79.385769
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.690505,-79.382973
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686003,-79.402335


In [20]:
df2.shape

(38, 5)

Plot data on map

In [21]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[toronto_coords[0], toronto_coords[1]], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood']):
    label = '{} - {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [22]:
CLIENT_ID = 'GETKZZNQH00HHAKHSHJV1RCJZWPXHAKO11NH4SEYPVCT4HTK'
 # your Foursquare ID
CLIENT_SECRET = 'PN0FMYA3WOKFQZ12KEE1OFJENU5XJZYIQBAD0SPOTXQMUFVQ' # your Foursquare Secret
VERSION = '20190307' # Foursquare API version

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius

In [23]:
toronto_data = pd.DataFrame(df2)

In [24]:
toronto_data = toronto_data.reset_index().drop('index', axis=1)

In [25]:
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676531,-79.29541
1,M4K,East Toronto,"Riverdale, The Danforth West",43.683262,-79.35512
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.667985,-79.314642
3,M4M,East Toronto,Studio District,43.662766,-79.33483
4,M4N,Central Toronto,Lawrence Park,43.728135,-79.38709


In [26]:
toronto_data.loc[0, 'Neighborhood']

'The Beaches'

In [27]:
neighborhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67653131600008, -79.29540999999995.


Top 100 venues that are in East Toronto within a radius of 500 meters

In [29]:
VERSION = '20190307'
CLIENT_ID = 'GETKZZNQH00HHAKHSHJV1RCJZWPXHAKO11NH4SEYPVCT4HTK'
CLIENT_SECRET = 'PN0FMYA3WOKFQZ12KEE1OFJENU5XJZYIQBAD0SPOTXQMUFVQ'
latitude = neighborhood_latitude
longitude = neighborhood_longitude
radius = 500
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION,radius, LIMIT)

In [30]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c81d72d9fb6b740fef3bea6'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4ad4c062f964a52011f820e3-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/food_grocery_',
          'suffix': '.png'},
         'id': '50aa9e744b90af0d42d5de0e',
         'name': 'Health Food Store',
         'pluralName': 'Health Food Stores',
         'primary': True,
         'shortName': 'Health Food Store'}],
       'id': '4ad4c062f964a52011f820e3',
       'location': {'address': '125 Southwood Dr',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'distance': 321,
        'formattedAddress': ['125 Southwood Dr',
         'Toronto ON M4E 0B8',
         'Canada'],
        'labeledLatLngs': [{'label': 'display',
      

In [31]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [32]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
1,Grover Pub and Grub,Pub,43.679181,-79.297215
2,Starbucks,Coffee Shop,43.678798,-79.298045
3,Upper Beaches,Neighborhood,43.680563,-79.292869


In [33]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


All Neighborhoods in Toronto

In [34]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Now write the code to run the above function on each neighborhood and create a new dataframe called toronto_venues.

In [35]:
toronto_venues = getNearbyVenues(toronto_data['Neighborhood'], toronto_data['Latitude'], toronto_data['Longitude'])

The Beaches
Riverdale, The Danforth West
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
North Midtown, The Annex, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
Bathurst Quay, CN Tower, Harbourfront West, Island airport, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

In [36]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676531,-79.29541,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676531,-79.29541,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676531,-79.29541,Starbucks,43.678798,-79.298045,Coffee Shop
3,The Beaches,43.676531,-79.29541,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"Riverdale, The Danforth West",43.683262,-79.35512,Dairy Queen,43.684223,-79.357062,Fast Food Restaurant


In [37]:
#size of resulting dataframe
print(toronto_venues.shape)
toronto_venues.head()

(1727, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676531,-79.29541,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676531,-79.29541,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676531,-79.29541,Starbucks,43.678798,-79.298045,Coffee Shop
3,The Beaches,43.676531,-79.29541,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"Riverdale, The Danforth West",43.683262,-79.35512,Dairy Queen,43.684223,-79.357062,Fast Food Restaurant


In [39]:
#Venues per neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
"Bathurst Quay, CN Tower, Harbourfront West, Island airport, King and Spadina, Railway Lands, South Niagara",69,69,69,69,69,69
Berczy Park,61,61,61,61,61,61
"Brockton, Exhibition Place, Parkdale Village",52,52,52,52,52,52
Business Reply Mail Processing Centre 969 Eastern,100,100,100,100,100,100
"Cabbagetown, St. James Town",43,43,43,43,43,43
Central Bay Street,98,98,98,98,98,98
"Chinatown, Grange Park, Kensington Market",92,92,92,92,92,92
Christie,9,9,9,9,9,9
Church and Wellesley,82,82,82,82,82,82


Let's find out how many unique categories can be curated from all the returned venues

In [40]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 214 uniques categories.


#### Analyze Each Neighborhood

In [41]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
#new dataframe size
toronto_onehot.shape

(1727, 214)

group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [43]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.03,...,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.01
1,"Bathurst Quay, CN Tower, Harbourfront West, Is...",0.014493,0.0,0.0,0.014493,0.0,0.0,0.0,0.0,0.014493,...,0.0,0.014493,0.014493,0.014493,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019231,0.0,...,0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.02,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.010204,0.0,0.010204,0.010204,0.0,0.0,...,0.010204,0.0,0.010204,0.0,0.010204,0.010204,0.010204,0.0,0.0,0.0
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.021739,0.0,0.01087,0.0,...,0.01087,0.0,0.043478,0.0,0.0,0.032609,0.01087,0.0,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.0,0.012195,0.012195,0.012195,0.0,0.0,0.0,0.012195,0.0,...,0.0,0.0,0.0,0.0,0.012195,0.012195,0.0,0.012195,0.012195,0.0


In [44]:
#confirm new size
toronto_grouped.shape

(38, 214)

 print each neighborhood along with the top 5 most common venues

In [45]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0                Hotel  0.06
1                 Café  0.06
2          Coffee Shop  0.05
3           Steakhouse  0.03
4  Japanese Restaurant  0.03


----Bathurst Quay, CN Tower, Harbourfront West, Island airport, King and Spadina, Railway Lands, South Niagara----
                  venue  freq
0           Coffee Shop  0.09
1    Italian Restaurant  0.07
2                  Café  0.06
3            Restaurant  0.04
4  Gym / Fitness Center  0.04


----Berczy Park----
          venue  freq
0   Coffee Shop  0.08
1  Cocktail Bar  0.05
2    Restaurant  0.05
3        Bakery  0.03
4    Steakhouse  0.03


----Brockton, Exhibition Place, Parkdale Village----
                           venue  freq
0                    Coffee Shop  0.12
1         Furniture / Home Store  0.06
2                           Café  0.06
3  Vegetarian / Vegan Restaurant  0.04
4                      Gift Shop  0.04


----Business Reply Mail Processing Centre 969 Easte

#### put the data in dataframes

In [46]:
#function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [47]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Hotel,Café,Coffee Shop,Bar,Steakhouse,Asian Restaurant,Japanese Restaurant,Gastropub,Restaurant,American Restaurant
1,"Bathurst Quay, CN Tower, Harbourfront West, Is...",Coffee Shop,Italian Restaurant,Café,Gym / Fitness Center,Restaurant,Speakeasy,Pub,Sandwich Place,Diner,Park
2,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Pub,Steakhouse,Café,Seafood Restaurant,Cheese Shop,Bakery,Hotel
3,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Furniture / Home Store,Café,Gym,Gift Shop,Hotel,Italian Restaurant,Vegetarian / Vegan Restaurant,Sandwich Place,Bar
4,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Bar,Café,Hotel,Steakhouse,Japanese Restaurant,Pizza Place,Sushi Restaurant,American Restaurant,Pub
5,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Park,Italian Restaurant,Pizza Place,Café,Bakery,Sandwich Place,Flower Shop,Deli / Bodega
6,Central Bay Street,Coffee Shop,Clothing Store,Tea Room,Cosmetics Shop,Plaza,Sushi Restaurant,Burger Joint,Spa,Café,Bar
7,"Chinatown, Grange Park, Kensington Market",Café,Chinese Restaurant,Bar,Vegetarian / Vegan Restaurant,Ice Cream Shop,Mexican Restaurant,Caribbean Restaurant,Bakery,Vietnamese Restaurant,Ramen Restaurant
8,Christie,Café,Grocery Store,Playground,Italian Restaurant,Baby Store,Coffee Shop,Women's Store,Ethiopian Restaurant,Flea Market,Fish Market
9,Church and Wellesley,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Fast Food Restaurant,Dance Studio,Men's Store,Pub,Bubble Tea Shop


#### Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [48]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

#### create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [49]:
toronto_merged = toronto_data

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676531,-79.29541,1,Pub,Health Food Store,Coffee Shop,Women's Store,Electronics Store,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
1,M4K,East Toronto,"Riverdale, The Danforth West",43.683262,-79.35512,1,Bus Line,Discount Store,Park,Fast Food Restaurant,Grocery Store,Women's Store,Ethiopian Restaurant,Flower Shop,Flea Market,Fish Market
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.667985,-79.314642,1,Park,Sandwich Place,Pizza Place,Sushi Restaurant,Pub,Movie Theater,Fast Food Restaurant,Fish & Chips Shop,Burrito Place,Burger Joint
3,M4M,East Toronto,Studio District,43.662766,-79.33483,1,Diner,Bakery,Italian Restaurant,Coffee Shop,Café,Brewery,Sushi Restaurant,Gastropub,Bar,Pizza Place
4,M4N,Central Toronto,Lawrence Park,43.728135,-79.38709,1,Bus Line,Swim School,Women's Store,Event Space,Food & Drink Shop,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant


In [50]:
#visualize the resulting cluster
# create map
map_clusters = folium.Map(location=[toronto_coords[0], toronto_coords[1]], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Examine Clusters

#### Cluster 1

In [53]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
32,West Toronto,0,Bar,Restaurant,Asian Restaurant,Coffee Shop,Cocktail Bar,Pizza Place,Wine Bar,Vietnamese Restaurant,American Restaurant,Bakery


#### Cluster 2

In [54]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,1,Pub,Health Food Store,Coffee Shop,Women's Store,Electronics Store,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
1,East Toronto,1,Bus Line,Discount Store,Park,Fast Food Restaurant,Grocery Store,Women's Store,Ethiopian Restaurant,Flower Shop,Flea Market,Fish Market
2,East Toronto,1,Park,Sandwich Place,Pizza Place,Sushi Restaurant,Pub,Movie Theater,Fast Food Restaurant,Fish & Chips Shop,Burrito Place,Burger Joint
3,East Toronto,1,Diner,Bakery,Italian Restaurant,Coffee Shop,Café,Brewery,Sushi Restaurant,Gastropub,Bar,Pizza Place
4,Central Toronto,1,Bus Line,Swim School,Women's Store,Event Space,Food & Drink Shop,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
5,Central Toronto,1,Burger Joint,Breakfast Spot,Dog Run,Gym,Park,Gym / Fitness Center,Food & Drink Shop,Bus Line,Hotel,Clothing Store
6,Central Toronto,1,Playground,Gym Pool,Park,Garden,Electronics Store,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
7,Central Toronto,1,Coffee Shop,Italian Restaurant,Café,Pizza Place,Dessert Shop,Sandwich Place,Park,Flower Shop,Farmers Market,Seafood Restaurant
8,Central Toronto,1,Playground,Convenience Store,Summer Camp,Gym,Electronics Store,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
9,Central Toronto,1,Light Rail Station,Coffee Shop,Spa,Convenience Store,Supermarket,Women's Store,Event Space,Flower Shop,Flea Market,Fish Market


#### Cluster 3

In [55]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
24,Central Toronto,2,Sandwich Place,Coffee Shop,Pizza Place,Café,Indian Restaurant,Mexican Restaurant,Burger Joint,Liquor Store,French Restaurant,Restaurant


#### Cluster 4

In [56]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Downtown Toronto,3,Coffee Shop,Clothing Store,Tea Room,Cosmetics Shop,Plaza,Sushi Restaurant,Burger Joint,Spa,Café,Bar
20,Downtown Toronto,3,Coffee Shop,Café,Hotel,American Restaurant,Steakhouse,Deli / Bodega,Japanese Restaurant,Bar,Sushi Restaurant,Gym
22,Central Toronto,3,Home Service,Food Truck,Food & Drink Shop,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
28,Downtown Toronto,3,Coffee Shop,Bar,Café,Hotel,Steakhouse,Japanese Restaurant,Pizza Place,Sushi Restaurant,American Restaurant,Pub
30,Downtown Toronto,3,Café,Grocery Store,Playground,Italian Restaurant,Baby Store,Coffee Shop,Women's Store,Ethiopian Restaurant,Flea Market,Fish Market
31,West Toronto,3,Park,Discount Store,Bus Line,Furniture / Home Store,Supermarket,Liquor Store,Middle Eastern Restaurant,Pool,Bar,Bank


#### Cluster 5

In [57]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
26,Downtown Toronto,4,Café,Chinese Restaurant,Bar,Vegetarian / Vegan Restaurant,Ice Cream Shop,Mexican Restaurant,Caribbean Restaurant,Bakery,Vietnamese Restaurant,Ramen Restaurant
