## Python for Prata Science Part I
### Creating the Singapore Neighborhood Map

In [1]:
import pandas as pd

I rely on the airbnb dataset (available online) to create the different neighborhood of Singapore. It is not perfect but still enable an extensive view and is very easy to handle.

In [2]:
df=pd.read_csv('http://data.insideairbnb.com/singapore/sg/singapore/2020-03-21/visualisations/listings.csv')
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,49091,COZICOMFORT LONG TERM STAY ROOM 2,266763,Francesca,North Region,Woodlands,1.44255,103.7958,Private room,87,180,1,2013-10-21,0.01,2,365
1,50646,Pleasant Room along Bukit Timah,227796,Sujatha,Central Region,Bukit Timah,1.33235,103.78521,Private room,80,90,18,2014-12-26,0.25,1,365
2,56334,COZICOMFORT,266763,Francesca,North Region,Woodlands,1.44246,103.79667,Private room,72,6,20,2015-10-01,0.19,2,365
3,71609,Ensuite Room (Room 1 & 2) near EXPO,367042,Belinda,East Region,Tampines,1.34541,103.95712,Private room,214,1,20,2020-01-17,0.2,8,365
4,71896,B&B Room 1 near Airport & EXPO,367042,Belinda,East Region,Tampines,1.34567,103.95963,Private room,99,1,24,2019-10-13,0.23,8,365


In [3]:
#I'm not posh enough to use the English spelling
df.rename(columns={'neighbourhood':'Neighborhood'}, 
                 inplace=True)

In [4]:
#let's have a view by neighborhood
df=df.groupby('Neighborhood',axis=0).mean()
df.reset_index(inplace=True)
df.head()

Unnamed: 0,Neighborhood,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Ang Mo Kio,22627680.0,83115740.0,1.375497,103.84411,81.56,19.4,3.64,0.7616,2.12,149.3
1,Bedok,19869030.0,82924470.0,1.319705,103.920172,150.459375,22.009375,17.2375,0.846484,19.375,187.31875
2,Bishan,23064390.0,73043660.0,1.354061,103.841018,169.344262,27.52459,12.409836,0.826176,48.114754,163.540984
3,Bukit Batok,20121020.0,74791430.0,1.355327,103.75576,110.724138,41.413793,5.103448,0.379412,1.62069,172.603448
4,Bukit Merah,28064860.0,99475210.0,1.279965,103.829755,146.401709,14.683761,11.589744,0.943288,28.339031,205.068376


In [5]:
#let's select the features that interest us
df=df[['Neighborhood','latitude','longitude']]
df.head(2)

Unnamed: 0,Neighborhood,latitude,longitude
0,Ang Mo Kio,1.375497,103.84411
1,Bedok,1.319705,103.920172


In [6]:
df.tail(2)

Unnamed: 0,Neighborhood,latitude,longitude
41,Woodlands,1.437421,103.790081
42,Yishun,1.418865,103.838932


In [7]:
#a quick google search gives us the location of Singapore
latitude = 1.3521
longitude = 103.8198

In [8]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.8.2
  latest version: 4.8.3

Please update conda by running

    $ conda update -n base conda



# All requested packages already installed.



In [9]:
#let's visualize the different neighborhood of Singapore
sg_map = folium.Map(location=[latitude, longitude], zoom_start=12)

for lat, lng, label in zip(df.latitude, df.longitude, df.Neighborhood):
    folium.features.CircleMarker(
        [lat, lng],
        radius=7,
        color='blue',
        fill=True,
        popup=label,
        fill_color='blue',
        fill_opacity=0.4
    ).add_to(sg_map)

sg_map

## Python for Prata Science Part II
### Calling the Foursquare API and defining the clusters

In [10]:
CLIENT_ID = 'VIBRIWQCQR2JUZZBQT5V31TGFTRGWTS132QAZ0JF00B02BGW' # your Foursquare ID
CLIENT_SECRET = '44UWUQLBYFXZ2XP11UHNWKKHDMRDKZ2UYTGCP0YQSW00KX3Z' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [11]:
#let's explore the first neighborhood
df.loc[0, 'Neighborhood']

'Ang Mo Kio'

In [12]:
neighborhood_latitude = df.loc[0, 'latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Ang Mo Kio are 1.3754973999999998, 103.84410999999999.


In [13]:
#let's call the API to provide info about the neighborhood venues
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 400 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=VIBRIWQCQR2JUZZBQT5V31TGFTRGWTS132QAZ0JF00B02BGW&client_secret=44UWUQLBYFXZ2XP11UHNWKKHDMRDKZ2UYTGCP0YQSW00KX3Z&v=20180605&ll=1.3754973999999998,103.84410999999999&radius=400&limit=100'

In [14]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [15]:
results = requests.get(url).json()

In [16]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [17]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,Ang Mo Kio Town Garden West,Park,1.376039,103.844528
1,Shanghai Ren Jia 上海人家,Shanghai Restaurant,1.377068,103.841329
2,McDonald's / McCafé,Fast Food Restaurant,1.372462,103.84498
3,Courts,Electronics Store,1.373486,103.84607
4,Broadway Coffeeshop,Food Court,1.373072,103.846393
5,Amk Ave 5 Junction Of Amk Ave 6,Intersection,1.37723,103.844685
6,7-Eleven,Convenience Store,1.377028,103.841034
7,School of Art NYP,Arts & Crafts Store,1.376568,103.847536


In [18]:
#quick peak at the neighborhood venues categories
nearby_venues['categories'].unique().tolist()

['Park',
 'Shanghai Restaurant',
 'Fast Food Restaurant',
 'Electronics Store',
 'Food Court',
 'Intersection',
 'Convenience Store',
 'Arts & Crafts Store']

In [19]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

8 venues were returned by Foursquare.


In [20]:
#### Let's create a function to repeat the same process to all the neighborhoods in Singapore

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
sg_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['latitude'],
                                   longitudes=df['longitude']
                                  )

Ang Mo Kio
Bedok
Bishan
Bukit Batok
Bukit Merah
Bukit Panjang
Bukit Timah
Central Water Catchment
Choa Chu Kang
Clementi
Downtown Core
Geylang
Hougang
Jurong East
Jurong West
Kallang
Mandai
Marina South
Marine Parade
Museum
Newton
Novena
Orchard
Outram
Pasir Ris
Pioneer
Punggol
Queenstown
River Valley
Rochor
Sembawang
Sengkang
Serangoon
Singapore River
Southern Islands
Sungei Kadut
Tampines
Tanglin
Toa Payoh
Tuas
Western Water Catchment
Woodlands
Yishun


In [23]:
#let's see what it looks like
print(sg_venues.shape)
sg_venues.head()

(1277, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Ang Mo Kio,1.375497,103.84411,Ang Mo Kio Town Garden West,1.376039,103.844528,Park
1,Ang Mo Kio,1.375497,103.84411,Xi Xiang Feng Yong Tau Foo 喜相逢酿豆腐,1.371975,103.846408,Chinese Restaurant
2,Ang Mo Kio,1.375497,103.84411,Shanghai Ren Jia 上海人家,1.377068,103.841329,Shanghai Restaurant
3,Ang Mo Kio,1.375497,103.84411,Mr Teh Tarik,1.372168,103.845602,Halal Restaurant
4,Ang Mo Kio,1.375497,103.84411,True Fitness,1.372891,103.847661,Gym


In [24]:
#We are interest in food only. Let's select the feature of interest only
sg_venues['Venue Category'].unique().tolist()

['Park',
 'Chinese Restaurant',
 'Shanghai Restaurant',
 'Halal Restaurant',
 'Gym',
 'Thai Restaurant',
 'Food Court',
 'Fast Food Restaurant',
 'Electronics Store',
 'Malay Restaurant',
 'Intersection',
 'Breakfast Spot',
 'Dim Sum Restaurant',
 'Convenience Store',
 'Arts & Crafts Store',
 'Sporting Goods Shop',
 'Bakery',
 'Pizza Place',
 'Furniture / Home Store',
 'Indian Restaurant',
 'Asian Restaurant',
 'Café',
 'Supermarket',
 'Gastropub',
 'Coffee Shop',
 'Shopping Mall',
 'Health & Beauty Service',
 'Campground',
 'Gym / Fitness Center',
 'French Restaurant',
 'Vegetarian / Vegan Restaurant',
 'Soup Place',
 'Grocery Store',
 'Bus Station',
 'Boutique',
 'Lake',
 'Stadium',
 'Trail',
 'Bookstore',
 'Cupcake Shop',
 'Yoga Studio',
 'Ice Cream Shop',
 'Japanese Restaurant',
 'Restaurant',
 'Betting Shop',
 'Hostel',
 'Flea Market',
 'Casino',
 'Bus Stop',
 'Playground',
 'Pool',
 'Flower Shop',
 'Clothing Store',
 'Home Service',
 'Boat or Ferry',
 'Train Station',
 'Rest Area

### Let's keep only the categories that contains food related string:  
 'Restaurant',  
 'Food',  
 'Pizza',  
 'Gastropub',  
 'Noodle',  
 'Steakhouse',  
 'Diner',  
 'BBQ'  

In [25]:
searchfor = [  'Restaurant', 'Food', 'Pizza', 'Gastropub', 'Noodle', 'Steakhouse', 'Diner', 'BBQ' ]
sg_venues = sg_venues[sg_venues['Venue Category'].str.contains('|'.join(searchfor))]
sg_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
1,Ang Mo Kio,1.375497,103.84411,Xi Xiang Feng Yong Tau Foo 喜相逢酿豆腐,1.371975,103.846408,Chinese Restaurant
2,Ang Mo Kio,1.375497,103.84411,Shanghai Ren Jia 上海人家,1.377068,103.841329,Shanghai Restaurant
3,Ang Mo Kio,1.375497,103.84411,Mr Teh Tarik,1.372168,103.845602,Halal Restaurant
5,Ang Mo Kio,1.375497,103.84411,Soi 19 十九街 Thai Wanton Mee,1.377004,103.840706,Thai Restaurant
6,Ang Mo Kio,1.375497,103.84411,Ang Mo Kio Central Market & Food Centre,1.372037,103.846366,Food Court


In [26]:
sg_venues.shape

(472, 7)

In [27]:
sg_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ang Mo Kio,10,10,10,10,10,10
Bedok,6,6,6,6,6,6
Bishan,8,8,8,8,8,8
Bukit Merah,9,9,9,9,9,9
Bukit Panjang,1,1,1,1,1,1
Clementi,15,15,15,15,15,15
Downtown Core,43,43,43,43,43,43
Geylang,19,19,19,19,19,19
Hougang,19,19,19,19,19,19
Jurong East,20,20,20,20,20,20


In [28]:
print('There are {} uniques categories.'.format(len(sg_venues['Venue Category'].unique())))

There are 48 uniques categories.


In [29]:
# one hot encoding in order to trigger the most popular place per neighborhood
sg_onehot = pd.get_dummies(sg_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sg_onehot['Neighborhood'] = sg_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [sg_onehot.columns[-1]] + list(sg_onehot.columns[:-1])
sg_onehot = sg_onehot[fixed_columns]

sg_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Australian Restaurant,BBQ Joint,Cantonese Restaurant,Chinese Restaurant,Dim Sum Restaurant,Diner,Dumpling Restaurant,...,Southern / Soul Food Restaurant,Spanish Restaurant,Steakhouse,Street Food Gathering,Sushi Restaurant,Swiss Restaurant,Tapas Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
1,Ang Mo Kio,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Ang Mo Kio,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Ang Mo Kio,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Ang Mo Kio,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,Ang Mo Kio,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
sg_onehot.shape

(472, 49)

In [31]:
sg_grouped = sg_onehot.groupby('Neighborhood').mean().reset_index()
sg_grouped.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Australian Restaurant,BBQ Joint,Cantonese Restaurant,Chinese Restaurant,Dim Sum Restaurant,Diner,Dumpling Restaurant,...,Southern / Soul Food Restaurant,Spanish Restaurant,Steakhouse,Street Food Gathering,Sushi Restaurant,Swiss Restaurant,Tapas Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,Ang Mo Kio,0.0,0.0,0.0,0.0,0.0,0.2,0.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0
1,Bedok,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bishan,0.0,0.125,0.0,0.0,0.0,0.375,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.125,0.0
3,Bukit Merah,0.0,0.555556,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Bukit Panjang,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
#let's take the 10 most popular venues per neighborhood
num_top_venues = 5

for hood in sg_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = sg_grouped[sg_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Ang Mo Kio----
                venue  freq
0  Chinese Restaurant   0.2
1          Food Court   0.2
2    Malay Restaurant   0.1
3     Thai Restaurant   0.1
4  Dim Sum Restaurant   0.1


----Bedok----
                  venue  freq
0             Gastropub  0.17
1      Asian Restaurant  0.17
2     Indian Restaurant  0.17
3           Pizza Place  0.17
4  Fast Food Restaurant  0.17


----Bishan----
                           venue  freq
0             Chinese Restaurant  0.38
1               Asian Restaurant  0.12
2  Vegetarian / Vegan Restaurant  0.12
3                Thai Restaurant  0.12
4              Indian Restaurant  0.12


----Bukit Merah----
                 venue  freq
0     Asian Restaurant  0.56
1  Japanese Restaurant  0.22
2   Chinese Restaurant  0.11
3           Restaurant  0.11
4   Seafood Restaurant  0.00


----Bukit Panjang----
                        venue  freq
0         Japanese Restaurant   1.0
1         American Restaurant   0.0
2    Mediterranean Restaurant   0.0
3 

In [33]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [34]:
import numpy as np

In [35]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = sg_grouped['Neighborhood']

for ind in np.arange(sg_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(sg_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(15)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Ang Mo Kio,Chinese Restaurant,Food Court,Fast Food Restaurant,Thai Restaurant,Malay Restaurant
1,Bedok,French Restaurant,Asian Restaurant,Indian Restaurant,Fast Food Restaurant,Gastropub
2,Bishan,Chinese Restaurant,Vegetarian / Vegan Restaurant,Pizza Place,Thai Restaurant,Asian Restaurant
3,Bukit Merah,Asian Restaurant,Japanese Restaurant,Restaurant,Chinese Restaurant,French Restaurant
4,Bukit Panjang,Japanese Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,Indonesian Restaurant,Indian Restaurant
5,Clementi,Indian Restaurant,Halal Restaurant,Food Court,Japanese Restaurant,Thai Restaurant
6,Downtown Core,Korean Restaurant,Food Court,Japanese Restaurant,Restaurant,Italian Restaurant
7,Geylang,Vegetarian / Vegan Restaurant,Chinese Restaurant,Seafood Restaurant,Noodle House,Korean Restaurant
8,Hougang,Chinese Restaurant,Food Court,Asian Restaurant,Noodle House,Korean Restaurant
9,Jurong East,Food Court,Sushi Restaurant,Steakhouse,Italian Restaurant,Korean Restaurant


In [36]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [82]:
# let's set the number of clusters we want to create
kclusters = 8

sg_grouped_clustering = sg_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters,init='k-means++',n_init=10, random_state=None).fit(sg_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:42] 

array([6, 5, 6, 1, 2, 0, 0, 6, 6, 0, 5, 6, 6, 0, 0, 0, 6, 0, 6, 7, 1, 5,
       4, 6, 6, 6, 5, 5, 0, 5, 7, 5, 0, 3, 0, 6], dtype=int32)

In [83]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Unnamed: 0,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Ang Mo Kio,1.375497,103.84411,6.0,Chinese Restaurant,Food Court,Fast Food Restaurant,Thai Restaurant,Malay Restaurant
1,Bedok,1.319705,103.920172,5.0,French Restaurant,Asian Restaurant,Indian Restaurant,Fast Food Restaurant,Gastropub
2,Bishan,1.354061,103.841018,6.0,Chinese Restaurant,Vegetarian / Vegan Restaurant,Pizza Place,Thai Restaurant,Asian Restaurant
3,Bukit Batok,1.355327,103.75576,,,,,,
4,Bukit Merah,1.279965,103.829755,1.0,Asian Restaurant,Japanese Restaurant,Restaurant,Chinese Restaurant,French Restaurant


In [102]:
sg_merged = df

# merge sg_grouped with df to add latitude/longitude for each neighborhood
sg_merged = sg_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

sg_merged.head()

Unnamed: 0,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Ang Mo Kio,1.375497,103.84411,6.0,Chinese Restaurant,Food Court,Fast Food Restaurant,Thai Restaurant,Malay Restaurant
1,Bedok,1.319705,103.920172,5.0,French Restaurant,Asian Restaurant,Indian Restaurant,Fast Food Restaurant,Gastropub
2,Bishan,1.354061,103.841018,6.0,Chinese Restaurant,Vegetarian / Vegan Restaurant,Pizza Place,Thai Restaurant,Asian Restaurant
3,Bukit Batok,1.355327,103.75576,,,,,,
4,Bukit Merah,1.279965,103.829755,1.0,Asian Restaurant,Japanese Restaurant,Restaurant,Chinese Restaurant,French Restaurant


In [103]:
#let's drop the neighborhoods that dont have any food related location identified
sg_merged=sg_merged.dropna()
sg_merged.head()

Unnamed: 0,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Ang Mo Kio,1.375497,103.84411,6.0,Chinese Restaurant,Food Court,Fast Food Restaurant,Thai Restaurant,Malay Restaurant
1,Bedok,1.319705,103.920172,5.0,French Restaurant,Asian Restaurant,Indian Restaurant,Fast Food Restaurant,Gastropub
2,Bishan,1.354061,103.841018,6.0,Chinese Restaurant,Vegetarian / Vegan Restaurant,Pizza Place,Thai Restaurant,Asian Restaurant
4,Bukit Merah,1.279965,103.829755,1.0,Asian Restaurant,Japanese Restaurant,Restaurant,Chinese Restaurant,French Restaurant
5,Bukit Panjang,1.368675,103.769223,2.0,Japanese Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,Indonesian Restaurant,Indian Restaurant


In [105]:
sg_merged['Cluster Labels Int']=sg_merged['Cluster Labels'].astype(int)
sg_merged.head()

Unnamed: 0,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels Int
0,Ang Mo Kio,1.375497,103.84411,6.0,Chinese Restaurant,Food Court,Fast Food Restaurant,Thai Restaurant,Malay Restaurant,6
1,Bedok,1.319705,103.920172,5.0,French Restaurant,Asian Restaurant,Indian Restaurant,Fast Food Restaurant,Gastropub,5
2,Bishan,1.354061,103.841018,6.0,Chinese Restaurant,Vegetarian / Vegan Restaurant,Pizza Place,Thai Restaurant,Asian Restaurant,6
4,Bukit Merah,1.279965,103.829755,1.0,Asian Restaurant,Japanese Restaurant,Restaurant,Chinese Restaurant,French Restaurant,1
5,Bukit Panjang,1.368675,103.769223,2.0,Japanese Restaurant,Vegetarian / Vegan Restaurant,Italian Restaurant,Indonesian Restaurant,Indian Restaurant,2


In [86]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [117]:
# Now, let's create the map with the clusters
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sg_merged['latitude'], sg_merged['longitude'], sg_merged['Neighborhood'], sg_merged['Cluster Labels Int']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=7,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters