# Coursera 
## Applied DataScience Capstone

In [1]:
import pandas as pd

#### Read wikipedia page to get the Toronto postal details

In [2]:
tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
df_postal = tables[0]

In [3]:
df_postal.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
df_postal1 = df_postal[df_postal['Borough'] != 'Not assigned']
df_postal1.shape

(103, 3)

In [5]:
df_postal1[df_postal1['Neighbourhood'] == 'Not assigned']
# There is no Neighbourhood with 'Not assigned'

Unnamed: 0,Postal Code,Borough,Neighbourhood


#### Gathering latitude and longitude of Toronto

In [6]:
#!pip3 install geocoder

In [7]:
#import geocoder # import geocoder
#
## initialize your variable to None
#lat_lng_coords = None
#
## loop until you get the coordinates
#while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
#  lat_lng_coords = g.latlng
#
#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

# Not able to find geographical coordinate using the google geoorder package
# Using the csv file instead

#### Reading Geo data

In [8]:
geodata = pd.read_csv('https://cocl.us/Geospatial_data')
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Joining both postal and geodata

In [9]:
postal_toronto = pd.merge(left=df_postal1, right=geodata, left_on='Postal Code', right_on='Postal Code')

In [10]:
postal_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [11]:
postal_toronto.shape

(103, 5)

# Create a map of Toronto Neighbourhood 

In [12]:
# Installing necessary packages
#!conda install -c conda-forge folium=0.5.0 --yes

In [13]:
#!conda install -c conda-forge geopy --yes 

In [14]:
# importing necessary packages
import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [15]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City, Ontario are 43.6534817, -79.3839347.


In [16]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(postal_toronto['Latitude'], postal_toronto['Longitude'], postal_toronto['Borough'], postal_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

In [17]:
map_toronto

In [18]:
CLIENT_ID = 'TRMPAH34WTNP00GP10DR05TFNGPHI0XO3UASH0KFFP5OOHV3' # your Foursquare ID
CLIENT_SECRET = 'ILIZGGKKESCADSSIIUZTFR5J4I3PXGM2GRH21IPM5R1KYQF3' # your Foursquare Secret
VERSION = '20201111'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TRMPAH34WTNP00GP10DR05TFNGPHI0XO3UASH0KFFP5OOHV3
CLIENT_SECRET:ILIZGGKKESCADSSIIUZTFR5J4I3PXGM2GRH21IPM5R1KYQF3


In [19]:
radius=500
lat=latitude
lng=longitude
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
out=requests.get(url).json()
pd.json_normalize(out)

Unnamed: 0,meta.code,meta.requestId,response.suggestedFilters.header,response.suggestedFilters.filters,response.headerLocation,response.headerFullLocation,response.headerLocationGranularity,response.totalResults,response.suggestedBounds.ne.lat,response.suggestedBounds.ne.lng,response.suggestedBounds.sw.lat,response.suggestedBounds.sw.lng,response.groups
0,200,5fbabfe33a1e82333e337824,Tap to show:,"[{'name': 'Open now', 'key': 'openNow'}]",Bay Street Corridor,"Bay Street Corridor, Toronto",neighborhood,90,43.657982,-79.377727,43.648982,-79.390143,"[{'type': 'Recommended Places', 'name': 'recom..."


In [20]:
## Converting the json data into pandas dataframe for the item
toronto_venue=pd.json_normalize(out['response']['groups'][0]['items'])

In [21]:
# Check the fields available in the dataframe
toronto_venue.columns

Index(['referralId', 'reasons.count', 'reasons.items', 'venue.id',
       'venue.name', 'venue.location.lat', 'venue.location.lng',
       'venue.location.labeledLatLngs', 'venue.location.distance',
       'venue.location.cc', 'venue.location.city', 'venue.location.state',
       'venue.location.country', 'venue.location.formattedAddress',
       'venue.categories', 'venue.photos.count', 'venue.photos.groups',
       'venue.location.address', 'venue.location.crossStreet',
       'venue.location.postalCode', 'venue.location.neighborhood',
       'venue.venuePage.id'],
      dtype='object')

In [22]:
type(toronto_venue['venue.categories'][0])

list

In [23]:
toronto_venue['venue.categories'][0]

[{'id': '4f2a25ac4b909258e854f55f',
  'name': 'Neighborhood',
  'pluralName': 'Neighborhoods',
  'shortName': 'Neighborhood',
  'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/neighborhood_',
   'suffix': '.png'},
  'primary': True}]

## Cleaning up the dataframe
cleaning up the dataframe and keeping only the required fields

In [24]:
# function that extracts the category name of the venue
def get_category_type(row):
    categories_list = row['categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [25]:
## selecting only venue details
toronto_venue = toronto_venue[['venue.name','venue.categories','venue.location.lat','venue.location.lng']]
## renaming column by removing prefix
toronto_venue.columns = [col.split(".")[-1] for col in toronto_venue.columns]

In [26]:
## extracting the category name from the dict and assigning in the dataframe column
toronto_venue['categories'] = toronto_venue.apply(get_category_type, axis=1)

In [27]:
toronto_venue.shape

(30, 4)

### Below is the function for this repeatative cleaning process
This function can be called for all the neighbourhoods to get the venue details for each neighbourhood

In [47]:
def getNearbyVenues(zipcode, names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for zipcode, name, lat, lng in zip(zipcode, names, latitudes, longitudes):
        print("Zip : " + zipcode + "  |  Neighbouhood : " + name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        # make the GET request
        try :
            results = requests.get(url).json()["response"]['groups'][0]['items']
        except :
            requests.get(url).json()
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            zipcode,
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['zipcode',
                  'Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [29]:
# There are same neighbourhood in multiple zip code
postal_toronto[postal_toronto['Neighbourhood'].duplicated()]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
13,M3C,North York,Don Mills,43.7259,-79.340923
46,M3L,North York,Downsview,43.739015,-79.506944
53,M3M,North York,Downsview,43.728496,-79.495697
60,M3N,North York,Downsview,43.761631,-79.520999


In [39]:
# There are same neighbourhood in multiple zip code, so including zip code also in the result
toronto_venues = getNearbyVenues(zipcode=postal_toronto['Postal Code'],
                                   names=postal_toronto['Neighbourhood'],
                                   latitudes=postal_toronto['Latitude'],
                                   longitudes=postal_toronto['Longitude']
                                  )

Zip : M3A  |  Neighbouhood : Parkwoods
Zip : M4A  |  Neighbouhood : Victoria Village
Zip : M5A  |  Neighbouhood : Regent Park, Harbourfront
Zip : M6A  |  Neighbouhood : Lawrence Manor, Lawrence Heights
Zip : M7A  |  Neighbouhood : Queen's Park, Ontario Provincial Government
Zip : M9A  |  Neighbouhood : Islington Avenue, Humber Valley Village
Zip : M1B  |  Neighbouhood : Malvern, Rouge
Zip : M3B  |  Neighbouhood : Don Mills
Zip : M4B  |  Neighbouhood : Parkview Hill, Woodbine Gardens
Zip : M5B  |  Neighbouhood : Garden District, Ryerson
Zip : M6B  |  Neighbouhood : Glencairn
Zip : M9B  |  Neighbouhood : West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Zip : M1C  |  Neighbouhood : Rouge Hill, Port Union, Highland Creek
Zip : M3C  |  Neighbouhood : Don Mills
Zip : M4C  |  Neighbouhood : Woodbine Heights
Zip : M5C  |  Neighbouhood : St. James Town
Zip : M6C  |  Neighbouhood : Humewood-Cedarvale
Zip : M9C  |  Neighbouhood : Eringate, Bloordale Gardens, Old Burnhamthorp

In [40]:
print('There are total {} Neibourhoods in Toronto, and {} venue returned by Foursqaure.'.format(postal_toronto.shape[0],toronto_venues.shape[0]))

There are total 103 Neibourhoods in Toronto, and 1338 venue returned by Foursqaure.


Lets check total venue in each neighbourhood

In [52]:
print(toronto_venues.columns)
print(toronto_venues.shape)
toronto_venues[['zipcode','Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue']].head()

Index(['zipcode', 'Neighborhood', 'Neighborhood Latitude',
       'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude',
       'Venue Category'],
      dtype='object')
(1338, 8)


Unnamed: 0,zipcode,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue
0,M3A,Parkwoods,43.753259,-79.329656,Brookbanks Park
1,M3A,Parkwoods,43.753259,-79.329656,Variety Store
2,M4A,Victoria Village,43.725882,-79.315572,Victoria Village Arena
3,M4A,Victoria Village,43.725882,-79.315572,Tim Hortons
4,M4A,Victoria Village,43.725882,-79.315572,Portugril


ok. Now lests see how many venues we have nearby in each neighbourhood as per foursqaure. 
Lets group by each neighbourhood and get the count of the venue.

In [50]:
toronto_tot_venue=toronto_venues.groupby(['zipcode','Neighborhood','Neighborhood Latitude','Neighborhood Longitude']).count()
toronto_tot_venue.reset_index(inplace=True)
toronto_tot_venue.head()

Unnamed: 0,zipcode,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,"Malvern, Rouge",43.806686,-79.194353,1,1,1,1
1,M1C,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,2,2,2,2
2,M1E,"Guildwood, Morningside, West Hill",43.763573,-79.188711,8,8,8,8
3,M1G,Woburn,43.770992,-79.216917,4,4,4,4
4,M1H,Cedarbrae,43.773136,-79.239476,8,8,8,8


Lets visualize it in map by circle. More venue in neighbourhood the bigger the circle is.

In [51]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood, tot_venue in zip(toronto_tot_venue['Neighborhood Latitude'], toronto_tot_venue['Neighborhood Longitude'], toronto_tot_venue['Neighborhood'], toronto_tot_venue['Venue']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=tot_venue,
        popup=label,
        color='green',
        fill=True,
        fill_color='#318622',
        fill_opacity=0.4,
        parse_html=False).add_to(map_toronto)  

map_toronto

Now lets try to categorize (more higher level then forsquare data) the venue like school/cafe/park/store etc.

In [None]:
toronto_venues.loc[(toronto_venues['Venue Category'].str.match(r'(.*Food.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*caf[eé].*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*Restaurant.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*coffee.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*pizza.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*bakery.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*pub.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*breakfast.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*sandwich.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*burger.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*Burrito.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*bar.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*diner.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*chicken.*)', case=False) == True), 'category'] = 'food & drink' 

toronto_venues.loc[(toronto_venues['Venue Category'].str.match(r'(.*park.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*play.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*hockey.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*baseball.*)', case=False) == True), 'category'] = 'play' 

toronto_venues.loc[(toronto_venues['Venue Category'].str.match(r'(.*distribution.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*store.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*shop.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*market.*)', case=False) == True), 'category'] = 'store' 

toronto_venues.loc[(toronto_venues['Venue Category'].str.match(r'(.*spa.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*gym.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*fitness.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*yoga.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*pool.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*swim.*)', case=False) == True), 'category'] = 'fitness & welness' 

toronto_venues.loc[(toronto_venues['Venue Category'].str.match(r'(.*salon.*)', case=False) == True), 'category'] = 'salon'

toronto_venues.loc[(toronto_venues['Venue Category'].str.match(r'(.*bank.*)', case=False) == True), 'category'] = 'finance'

toronto_venues.loc[(toronto_venues['Venue Category'].str.match(r'(.*art.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*theater.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*club.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*site.*)', case=False) == True) | 
                   (toronto_venues['Venue Category'].str.match(r'(.*auditorium.*)', case=False) == True), 'category'] = 'entertainment'

In [None]:
toronto_venues[toronto_venues['Venue Category'].str.match(r'(.*school.*)', case=False) == True]
#toronto_venues[toronto_venues['category'].isna() == True]

In [55]:
#one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['zipcode'] = toronto_venues['zipcode'] 

# move neighborhood column to the first column
toronto_onehot = toronto_onehot[[toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])]

###### Lets get average of each venue category

In [56]:
toronto_grouped = toronto_onehot.groupby('zipcode').mean().reset_index()

#Lets check the shape
print(toronto_grouped.shape)

toronto_grouped.head()

(100, 241)


Unnamed: 0,zipcode,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now creating dataframe to hold 10 most common venue category for each zipcode

In [57]:
import numpy as np

tot_top_ven = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['zipcode']
for ind in np.arange(tot_top_ven):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['zipcode'] = toronto_grouped['zipcode']

neighborhoods_venues_sorted

Unnamed: 0,zipcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,,,,,,,,,,
1,M1C,,,,,,,,,,
2,M1E,,,,,,,,,,
3,M1G,,,,,,,,,,
4,M1H,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
95,M9N,,,,,,,,,,
96,M9P,,,,,,,,,,
97,M9R,,,,,,,,,,
98,M9V,,,,,,,,,,


###### Below code block to find most common venues in each zip code
Note : Not all zipcode has 10 or more venue category. So we will put whatever is available in right order

In [59]:
# Select top venues for each zipcode
for cnt in np.arange(neighborhoods_venues_sorted.shape[0]):
    zipcode = toronto_grouped.iloc[cnt][0]
    print("selecting top venues for " + zipcode)
    
    # getting the top venues
    t = toronto_grouped.iloc[cnt][1:].astype(float).where(lambda x: x > 0).dropna().sort_values(ascending=False).head(tot_top_ven)

    if zipcode == neighborhoods_venues_sorted.iloc[cnt][0] :
        neighborhoods_venues_sorted.iloc[cnt, 1:t.shape[0] + 1] = t.index.values
    else :
        print("    Error: Index not matching")
        raise

neighborhoods_venues_sorted.head()

selecting top venues for M1B
selecting top venues for M1C
selecting top venues for M1E
selecting top venues for M1G
selecting top venues for M1H
selecting top venues for M1J
selecting top venues for M1K
selecting top venues for M1L
selecting top venues for M1M
selecting top venues for M1N
selecting top venues for M1P
selecting top venues for M1R
selecting top venues for M1S
selecting top venues for M1T
selecting top venues for M1V
selecting top venues for M1W
selecting top venues for M2H
selecting top venues for M2J
selecting top venues for M2K
selecting top venues for M2L
selecting top venues for M2N
selecting top venues for M2P
selecting top venues for M2R
selecting top venues for M3A
selecting top venues for M3B
selecting top venues for M3C
selecting top venues for M3H
selecting top venues for M3J
selecting top venues for M3K
selecting top venues for M3L
selecting top venues for M3M
selecting top venues for M3N
selecting top venues for M4A
selecting top venues for M4B
selecting top 

Unnamed: 0,zipcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,,,,,,,,,
1,M1C,Construction & Landscaping,Bar,,,,,,,,
2,M1E,Restaurant,Rental Car Location,Mexican Restaurant,Medical Center,Intersection,Electronics Store,Breakfast Spot,Bank,,
3,M1G,Coffee Shop,Mexican Restaurant,Korean BBQ Restaurant,,,,,,,
4,M1H,Thai Restaurant,Hakka Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Bank,Bakery,Athletics & Sports,,


## Clustering Neighbourhood

In [60]:
from sklearn.cluster import KMeans 

num_clusters = 5

toronto_grouped_cluster = toronto_grouped.drop('zipcode', 1)

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(toronto_grouped_cluster)
labels = k_means.labels_

print(labels)

[1 2 2 2 2 4 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 0 2 0 2 2 2 2 0 2 3 2 2 2 2 2 2
 2 0 2 2 2 0 2 2 2 4 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 0 2 2 2 2]


In [61]:
neighborhoods_venues_sorted.insert(0, 'Cluster Number', labels)
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Number,zipcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,M1B,Fast Food Restaurant,,,,,,,,,
1,2,M1C,Construction & Landscaping,Bar,,,,,,,,
2,2,M1E,Restaurant,Rental Car Location,Mexican Restaurant,Medical Center,Intersection,Electronics Store,Breakfast Spot,Bank,,
3,2,M1G,Coffee Shop,Mexican Restaurant,Korean BBQ Restaurant,,,,,,,
4,2,M1H,Thai Restaurant,Hakka Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Bank,Bakery,Athletics & Sports,,


In [66]:
toronto_tot_venue.head()

Unnamed: 0,zipcode,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,"Malvern, Rouge",43.806686,-79.194353,1,1,1,1
1,M1C,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,2,2,2,2
2,M1E,"Guildwood, Morningside, West Hill",43.763573,-79.188711,8,8,8,8
3,M1G,Woburn,43.770992,-79.216917,4,4,4,4
4,M1H,Cedarbrae,43.773136,-79.239476,8,8,8,8


In [68]:
toronto_venues_merged = pd.merge(left=neighborhoods_venues_sorted, right=toronto_tot_venue[['zipcode','Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue']], left_on='zipcode', right_on='zipcode')
toronto_venues_merged.head()

Unnamed: 0,Cluster Number,zipcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue
0,1,M1B,Fast Food Restaurant,,,,,,,,,,"Malvern, Rouge",43.806686,-79.194353,1
1,2,M1C,Construction & Landscaping,Bar,,,,,,,,,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,2
2,2,M1E,Restaurant,Rental Car Location,Mexican Restaurant,Medical Center,Intersection,Electronics Store,Breakfast Spot,Bank,,,"Guildwood, Morningside, West Hill",43.763573,-79.188711,8
3,2,M1G,Coffee Shop,Mexican Restaurant,Korean BBQ Restaurant,,,,,,,,Woburn,43.770992,-79.216917,4
4,2,M1H,Thai Restaurant,Hakka Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Bank,Bakery,Athletics & Sports,,,Cedarbrae,43.773136,-79.239476,8


In [80]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# set color scheme for the clusters
x = np.arange(num_clusters)
ys = [i + x + (i*x)**2 for i in range(num_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


In [90]:
# create map of Toronto using latitude and longitude values
map_toronto_cluster = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood, cluster, tot_venue in zip(
                                                    toronto_venues_merged['Neighborhood Latitude'], 
                                                    toronto_venues_merged['Neighborhood Longitude'], 
                                                    toronto_venues_merged['Neighborhood'], 
                                                    toronto_venues_merged['Cluster Number'], 
                                                    toronto_venues_merged['Venue']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=round(tot_venue/5)+2,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.4,
        parse_html=False).add_to(map_toronto_cluster)  

map_toronto_cluster

### Analyzing each cluster

In [97]:
# Cluster 1
cluster_no = 1
print("There are total " + str(toronto_venues_merged[toronto_venues_merged['Cluster Number'] == cluster_no].shape[0]) + " zipcode under this cluster")
toronto_venues_merged[toronto_venues_merged['Cluster Number'] == cluster_no]

There are total 1 zipcode under this cluster


Unnamed: 0,Cluster Number,zipcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue
0,1,M1B,Fast Food Restaurant,,,,,,,,,,"Malvern, Rouge",43.806686,-79.194353,1


In [98]:
# Cluster 2
cluster_no = 2
print("There are total " + str(toronto_venues_merged[toronto_venues_merged['Cluster Number'] == cluster_no].shape[0]) + " zipcode under this cluster")
toronto_venues_merged[toronto_venues_merged['Cluster Number'] == cluster_no]

There are total 84 zipcode under this cluster


Unnamed: 0,Cluster Number,zipcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue
1,2,M1C,Construction & Landscaping,Bar,,,,,,,,,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,2
2,2,M1E,Restaurant,Rental Car Location,Mexican Restaurant,Medical Center,Intersection,Electronics Store,Breakfast Spot,Bank,,,"Guildwood, Morningside, West Hill",43.763573,-79.188711,8
3,2,M1G,Coffee Shop,Mexican Restaurant,Korean BBQ Restaurant,,,,,,,,Woburn,43.770992,-79.216917,4
4,2,M1H,Thai Restaurant,Hakka Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Bank,Bakery,Athletics & Sports,,,Cedarbrae,43.773136,-79.239476,8
6,2,M1K,Train Station,Hobby Shop,Department Store,Coffee Shop,,,,,,,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,2,M9L,Pizza Place,Furniture / Home Store,,,,,,,,,Humber Summit,43.756303,-79.565963,2
96,2,M9P,Sandwich Place,Pizza Place,Intersection,Discount Store,Coffee Shop,Chinese Restaurant,,,,,Westmount,43.696319,-79.532242,6
97,2,M9R,Sandwich Place,Pizza Place,Park,Bus Line,,,,,,,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724,4
98,2,M9V,Pizza Place,Grocery Store,Sandwich Place,Pharmacy,Liquor Store,Fried Chicken Joint,Fast Food Restaurant,Beer Store,,,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437,10


In [99]:
# Cluster 3
cluster_no = 3
print("There are total " + str(toronto_venues_merged[toronto_venues_merged['Cluster Number'] == cluster_no].shape[0]) + " zipcode under this cluster")
toronto_venues_merged[toronto_venues_merged['Cluster Number'] == cluster_no]

There are total 3 zipcode under this cluster


Unnamed: 0,Cluster Number,zipcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue
30,3,M3M,Speakeasy,Home Service,Business Service,Baseball Field,,,,,,,Downsview,43.728496,-79.495697,4
89,3,M8Y,Construction & Landscaping,Baseball Field,,,,,,,,,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,2
94,3,M9M,Baseball Field,,,,,,,,,,"Humberlea, Emery",43.724766,-79.532242,1


In [100]:
# Cluster 4
cluster_no = 4
print("There are total " + str(toronto_venues_merged[toronto_venues_merged['Cluster Number'] == cluster_no].shape[0]) + " zipcode under this cluster")
toronto_venues_merged[toronto_venues_merged['Cluster Number'] == cluster_no]

There are total 2 zipcode under this cluster


Unnamed: 0,Cluster Number,zipcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue
5,4,M1J,Smoke Shop,Playground,Jewelry Store,,,,,,,,Scarborough Village,43.744734,-79.239476,3
46,4,M4T,Trail,Playground,,,,,,,,,"Moore Park, Summerhill East",43.689574,-79.38316,2


In [101]:
# Cluster 5
cluster_no = 5
print("There are total " + str(toronto_venues_merged[toronto_venues_merged['Cluster Number'] == cluster_no].shape[0]) + " zipcode under this cluster")
toronto_venues_merged[toronto_venues_merged['Cluster Number'] == cluster_no]

There are total 0 zipcode under this cluster


Unnamed: 0,Cluster Number,zipcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue
