# <span style='color:DarkBlue'>Week 4 Submission for IBM Data Science Course 9</span>

## <span style='color:Teal'>Segmenting and Clustering Neighborhoods in Toronto</span>

#### Installing the requisite libraries

In [3]:
! pip install beautifulsoup4



In [4]:
! pip install lxml



In [5]:
! pip install html5lib



In [6]:
! pip install requests



#### Scraping the wiki page using Beautiful Soup

In [38]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import numpy as np

# Getting the data on the Wikipedia page as text
source= requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

# Getting the soup of the data by parsing as XML
soup=BeautifulSoup(source,'lxml')

# Getting the tabular data
table=soup.find('table', class_='wikitable sortable')

print(table.prettify())



<table class="wikitable sortable">
 <tbody>
  <tr>
   <th>
    Postcode
   </th>
   <th>
    Borough
   </th>
   <th>
    Neighbourhood
   </th>
  </tr>
  <tr>
   <td>
    M1A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M2A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M3A
   </td>
   <td>
    <a href="/wiki/North_York" title="North York">
     North York
    </a>
   </td>
   <td>
    <a href="/wiki/Parkwoods" title="Parkwoods">
     Parkwoods
    </a>
   </td>
  </tr>
  <tr>
   <td>
    M4A
   </td>
   <td>
    <a href="/wiki/North_York" title="North York">
     North York
    </a>
   </td>
   <td>
    <a href="/wiki/Victoria_Village" title="Victoria Village">
     Victoria Village
    </a>
   </td>
  </tr>
  <tr>
   <td>
    M5A
   </td>
   <td>
    <a href="/wiki/Downtown_Toronto" title="Downtown Toronto">
     Downtown Toronto
    </a>
   </td>
   <td>
    <a href="

In [4]:
# Getting rows in the table
table_rows = table.find_all('tr')

#Initializing the result list
result = []

# Looping through each row in the table row
for tr in table_rows:
    #print("Row is",tr)
    td = tr.find_all('td')# get the table data elements in each row
    #print("td is:",td)
    
    # for each table data element, strip the space and add to row list
    row = [data.text.strip() for data in td if data.text.strip()]
    if row:
        result.append(row)
    #print("")

df = pd.DataFrame(result, columns=["PostalCode", "Borough", "Neighborhood"])
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Cleaning the table data 

In [5]:
# Dropping rows where Borough is not assigned
df_1=df
df_1.shape

(288, 3)

In [6]:
# Counting the rows that have 'Not Assigned' as Borough 
df_1['Borough'].value_counts() #77

# Deleting the rows that have 'Not Assigned' as Borough
df_3=df_1[df_1.Borough!='Not assigned']
df_3.shape

(211, 3)

In [7]:
#Resetting the index and sorting by PostalCodes
df_3.reset_index(drop=True)
df_3.sort_values(by=['PostalCode'])


# checking the number of 'Not Assigned Counts'
df_3['Neighborhood'].value_counts()

#df_3[df_3.PostalCode=="M7A"]
df_3.reset_index(drop=True)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [8]:
# Counting the unique values for Postal Codes
len(df_3['PostalCode']. unique().tolist()) #103

103

In [9]:
# Creating a new empty dataframe with the same set of columns
df_4 = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"])
df_4

Unnamed: 0,PostalCode,Borough,Neighborhood


#### Grouping the rows for each Postal Code

In [10]:
# Iterating through the dataframe to identify the Boroughs with multiple neighborhoods

for postalcode in df_3['PostalCode'].unique().tolist():
    
    neighborhoodlist=[]  # To build string with combined neighborhoods
    rows_list=[] # To build each row for unique postalcode, borough and combined neighborhoods

    #print("PostalCode is",postalcode)
    
    # Looping through the dataframe for each unique postal code
    for index in df_3.index:
    
        if df_3['PostalCode'][index]==postalcode:
            borough=df_3['Borough'][index]
            neighborhoodlist.append(df_3['Neighborhood'][index])
           # print(neighborhoodlist)
    
    #print("Borough is:",borough)
    if 'Not assigned' in neighborhoodlist:
        print(borough)
        neighborhoodlist=[]
        neighborhoodlist.append(borough)
        
    rows_list=[postalcode,borough,str(neighborhoodlist).strip('[]')]
    #print(rows_list)
    
    # adding a new row to the dataframe
    df_4.loc[len(df_4)]=rows_list
df_4

Queen's Park


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,'Parkwoods'
1,M4A,North York,'Victoria Village'
2,M5A,Downtown Toronto,"'Harbourfront', 'Regent Park'"
3,M6A,North York,"'Lawrence Heights', 'Lawrence Manor'"
4,M7A,Queen's Park,"""Queen's Park"""
5,M9A,Etobicoke,'Islington Avenue'
6,M1B,Scarborough,"'Rouge', 'Malvern'"
7,M3B,North York,'Don Mills North'
8,M4B,East York,"'Woodbine Gardens', 'Parkview Hill'"
9,M5B,Downtown Toronto,"'Ryerson', 'Garden District'"


In [164]:
! pip install geocoder

Collecting geocoder
  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


#### Getting the Geo coordinates using GeoCoder
#### Tried multiple times, but the coordinates are not received consistently
#### __*So marking this code as markdown*__

import geocoder # import geocoder

for index in df_4.index:
    postal_code=df_4['PostalCode'][index]
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng

    df_4['Latitude'][index] = lat_lng_coords[0]
    df_4['Longitude'][index] = lat_lng_coords[1]
df_4.head()

In [11]:
path='https://cocl.us/Geospatial_data'
df_latlong=pd.read_csv(path)
df_latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merging the two dataframes

In [12]:
finalresult=pd.merge(df_4,df_latlong, left_on='PostalCode', right_on='Postal Code')
finalresult

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,'Parkwoods',M3A,43.753259,-79.329656
1,M4A,North York,'Victoria Village',M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"'Harbourfront', 'Regent Park'",M5A,43.654260,-79.360636
3,M6A,North York,"'Lawrence Heights', 'Lawrence Manor'",M6A,43.718518,-79.464763
4,M7A,Queen's Park,"""Queen's Park""",M7A,43.662301,-79.389494
5,M9A,Etobicoke,'Islington Avenue',M9A,43.667856,-79.532242
6,M1B,Scarborough,"'Rouge', 'Malvern'",M1B,43.806686,-79.194353
7,M3B,North York,'Don Mills North',M3B,43.745906,-79.352188
8,M4B,East York,"'Woodbine Gardens', 'Parkview Hill'",M4B,43.706397,-79.309937
9,M5B,Downtown Toronto,"'Ryerson', 'Garden District'",M5B,43.657162,-79.378937


In [15]:
df_dtwntoronto=finalresult[finalresult.Borough=='Downtown Toronto']
df_dtwntoronto.reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M5A,Downtown Toronto,"'Harbourfront', 'Regent Park'",M5A,43.65426,-79.360636
1,M5B,Downtown Toronto,"'Ryerson', 'Garden District'",M5B,43.657162,-79.378937
2,M5C,Downtown Toronto,'St. James Town',M5C,43.651494,-79.375418
3,M5E,Downtown Toronto,'Berczy Park',M5E,43.644771,-79.373306
4,M5G,Downtown Toronto,'Central Bay Street',M5G,43.657952,-79.387383
5,M6G,Downtown Toronto,'Christie',M6G,43.669542,-79.422564
6,M5H,Downtown Toronto,"'Adelaide', 'King', 'Richmond'",M5H,43.650571,-79.384568
7,M5J,Downtown Toronto,"'Harbourfront East', 'Toronto Islands', 'Union...",M5J,43.640816,-79.381752
8,M5K,Downtown Toronto,"'Design Exchange', 'Toronto Dominion Centre'",M5K,43.647177,-79.381576
9,M5L,Downtown Toronto,"'Commerce Court', 'Victoria Hotel'",M5L,43.648198,-79.379817


#### Define Foursquare Credentials and Version

In [27]:
CLIENT_ID = 'P205TY3TP3W42QQDLYUFQ2ZUAVEKFHAI0O3VFNCFOV1EN1CR' # your Foursquare ID
CLIENT_SECRET = 'HNVKLNZOBKGHDDQJSEY11POTS0KXODDIMTHKIDFBL3ROVGI5' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT=100

#### Getting the venues near Downtown Toronto

In [28]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [29]:
toronto_venues = getNearbyVenues(names=df_dtwntoronto['Neighborhood'],
                                   latitudes=df_dtwntoronto['Latitude'],
                                   longitudes=df_dtwntoronto['Longitude']
                                  )

'Harbourfront', 'Regent Park'
'Ryerson', 'Garden District'
'St. James Town'
'Berczy Park'
'Central Bay Street'
'Christie'
'Adelaide', 'King', 'Richmond'
'Harbourfront East', 'Toronto Islands', 'Union Station'
'Design Exchange', 'Toronto Dominion Centre'
'Commerce Court', 'Victoria Hotel'
'Harbord', 'University of Toronto'
'Chinatown', 'Grange Park', 'Kensington Market'
'CN Tower', 'Bathurst Quay', 'Island airport', 'Harbourfront West', 'King and Spadina', 'Railway Lands', 'South Niagara'
'Rosedale'
'Stn A PO Boxes 25 The Esplanade'
'Cabbagetown', 'St. James Town'
'First Canadian Place', 'Underground city'
'Church and Wellesley'


In [30]:
print(toronto_venues.shape)

(1288, 7)


In [31]:
# Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 209 uniques categories.


#### Preprocessing the dataframe

In [32]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,"'Adelaide', 'King', 'Richmond'",0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.01
1,'Berczy Park',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0
2,"'CN Tower', 'Bathurst Quay', 'Island airport',...",0.0,0.0,0.0,0.0,0.0625,0.0625,0.0625,0.125,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"'Cabbagetown', 'St. James Town'",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,'Central Bay Street',0.011364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.011364,0.0,0.011364,0.0,0.011364,0.0,0.0
5,"'Chinatown', 'Grange Park', 'Kensington Market'",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.06,0.0,0.0,0.03,0.01,0.0,0.0
6,'Christie',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,'Church and Wellesley',0.011494,0.0,0.011494,0.011494,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.011494,0.0,0.011494,0.0,0.011494,0.0
8,"'Commerce Court', 'Victoria Hotel'",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0
9,"'Design Exchange', 'Toronto Dominion Centre'",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0


#### Let's print each neighborhood along with the top 5 most common venues

In [34]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----'Adelaide', 'King', 'Richmond'----
             venue  freq
0      Coffee Shop  0.06
1             Café  0.05
2              Bar  0.04
3  Thai Restaurant  0.04
4       Steakhouse  0.04


----'Berczy Park'----
                venue  freq
0         Coffee Shop  0.09
1        Cocktail Bar  0.05
2  Italian Restaurant  0.04
3                Café  0.04
4            Beer Bar  0.04


----'CN Tower', 'Bathurst Quay', 'Island airport', 'Harbourfront West', 'King and Spadina', 'Railway Lands', 'South Niagara'----
              venue  freq
0    Airport Lounge  0.12
1  Airport Terminal  0.12
2   Airport Service  0.12
3             Plane  0.06
4       Coffee Shop  0.06


----'Cabbagetown', 'St. James Town'----
                venue  freq
0         Coffee Shop  0.09
1          Restaurant  0.07
2              Bakery  0.04
3  Italian Restaurant  0.04
4         Pizza Place  0.04


----'Central Bay Street'----
                venue  freq
0         Coffee Shop  0.16
1                Café  0.05
2  Ital

In [35]:
#Let's put that into a pandas dataframe

#First, let's write a function to sort the venues in descending order.

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


In [68]:
# Display top 10
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
toronto_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
toronto_neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    toronto_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

toronto_neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"'Adelaide', 'King', 'Richmond'",Coffee Shop,Café,Steakhouse,Bar,American Restaurant,Thai Restaurant,Gym,Restaurant,Burger Joint,Hotel
1,'Berczy Park',Coffee Shop,Cocktail Bar,Seafood Restaurant,Cheese Shop,Steakhouse,Beer Bar,Italian Restaurant,Farmers Market,Café,Bakery
2,"'CN Tower', 'Bathurst Quay', 'Island airport',...",Airport Lounge,Airport Terminal,Airport Service,Harbor / Marina,Boutique,Sculpture Garden,Bar,Plane,Boat or Ferry,Airport Gate
3,"'Cabbagetown', 'St. James Town'",Coffee Shop,Restaurant,Park,Italian Restaurant,Café,Pizza Place,Bakery,Pub,Bar,Bank
4,'Central Bay Street',Coffee Shop,Café,Italian Restaurant,Middle Eastern Restaurant,Sandwich Place,Burger Joint,Japanese Restaurant,Bubble Tea Shop,Bar,Bakery


####  Cluster Neighborhoods

In [69]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 2, 0, 0, 0, 3, 0, 0, 0])

In [70]:
#Let's create a new dataframe that includes the cluster as well as the top 10 venues
#for each neighborhood

# add clustering labels
toronto_neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_dtwntoronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(toronto_neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"'Harbourfront', 'Regent Park'",M5A,43.65426,-79.360636,0,Coffee Shop,Bakery,Pub,Park,Mexican Restaurant,Breakfast Spot,Restaurant,Café,Theater,Cosmetics Shop
9,M5B,Downtown Toronto,"'Ryerson', 'Garden District'",M5B,43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Tea Room,Italian Restaurant,Lingerie Store,Bubble Tea Shop,Pizza Place
15,M5C,Downtown Toronto,'St. James Town',M5C,43.651494,-79.375418,0,Coffee Shop,Café,Restaurant,Hotel,Bakery,Cosmetics Shop,Breakfast Spot,Gastropub,Italian Restaurant,Cocktail Bar
20,M5E,Downtown Toronto,'Berczy Park',M5E,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Seafood Restaurant,Cheese Shop,Steakhouse,Beer Bar,Italian Restaurant,Farmers Market,Café,Bakery
24,M5G,Downtown Toronto,'Central Bay Street',M5G,43.657952,-79.387383,0,Coffee Shop,Café,Italian Restaurant,Middle Eastern Restaurant,Sandwich Place,Burger Joint,Japanese Restaurant,Bubble Tea Shop,Bar,Bakery
25,M6G,Downtown Toronto,'Christie',M6G,43.669542,-79.422564,3,Grocery Store,Café,Park,Baby Store,Italian Restaurant,Diner,Nightclub,Convenience Store,Restaurant,Coffee Shop
30,M5H,Downtown Toronto,"'Adelaide', 'King', 'Richmond'",M5H,43.650571,-79.384568,0,Coffee Shop,Café,Steakhouse,Bar,American Restaurant,Thai Restaurant,Gym,Restaurant,Burger Joint,Hotel
36,M5J,Downtown Toronto,"'Harbourfront East', 'Toronto Islands', 'Union...",M5J,43.640816,-79.381752,0,Coffee Shop,Aquarium,Hotel,Italian Restaurant,Café,Brewery,Scenic Lookout,Fried Chicken Joint,Bakery,Pizza Place
42,M5K,Downtown Toronto,"'Design Exchange', 'Toronto Dominion Centre'",M5K,43.647177,-79.381576,0,Coffee Shop,Café,Hotel,Restaurant,Bakery,Italian Restaurant,Bar,Deli / Bodega,Gastropub,Steakhouse
48,M5L,Downtown Toronto,"'Commerce Court', 'Victoria Hotel'",M5L,43.648198,-79.379817,0,Coffee Shop,Café,Hotel,American Restaurant,Restaurant,Bakery,Deli / Bodega,Gastropub,Steakhouse,Seafood Restaurant


In [71]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# LatLong for Toronto

address = 'Toronto, ON'

# geolocator = Nominatim(user_agent="toronto_explorer")

# location = geolocator.geocode(address)

# Service timed out after multiple tries, so inserting lat-long values directly
latitude =  43.6532 #location.latitude
longitude =  -79.384 #location.longitude


# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster Analysis

I tried clustering using number of clusters as 4,5,6 upto 10

I see number of clusters as 4 as the optimum number since it distinctly splits the postal codes
into 4 areas as

1) __Cluster 0__
   This is the area with high concentration of eateries , cafes, pubs and is predominantly in the heart of the downtown

2) __Cluster 1__
   This is the area near to amenities centers like parks, playgrounds, trail and farther from the downtown

3) __Cluster 2__
   This is the area near the airport terminal and harbor. It is far from the downtown area
   
4) __Cluster 3__
   This is the area slightly away from downtown with the shopping centres - groceries, convenience and baby items.
