## IBM Applied Data Science Capstone "The Battle of Neighborhoods"

### The Battle of Neighborhoods Report

#### Opening a New Shopping Mall in Kuala Lumpur, Malaysia
* Build a dataframe of neighborhoods in Kuala Lumpur, Malaysia by web scraping the data from Wikipedia page
* Get the geographical coordinates of the neighborhoods
* Obtain the venue data for the neighborhoods from Foursquare API
* Explore and cluster the neighborhoods
* Select the best cluster to open a new shopping mall

### 1.Import Libraries

In [33]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

!conda install -c conda-forge geocoder --yes
import geocoder # to get coordinates

import requests # library to handle requests

!conda install -c conda-forge bs4 --yes
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print("Libraries imported.")

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


###  2.Scrap data from Wikipedia page into a DataFrame

In [34]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Suburbs_in_Kuala_Lumpur").text

In [36]:
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

In [37]:
# create a list to store neighborhood data
neighborhoodList = []

In [38]:
# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

In [39]:
# create a new DataFrame from the list
kl_df = pd.DataFrame({"Neighborhood": neighborhoodList})

kl_df.head()

Unnamed: 0,Neighborhood
0,Alam Damai
1,"Ampang, Kuala Lumpur"
2,Bandar Menjalara
3,Bandar Sri Permaisuri
4,Bandar Tasik Selatan


In [40]:

# print the number of rows of the dataframe
kl_df.shape

(71, 1)

### 3.Get the geographical coordinates

In [44]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Kuala Lumpur, Malaysia'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [45]:

# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in kl_df["Neighborhood"].tolist() ]

In [47]:
print(coords)

[[3.0576900000000364, 101.74388000000005], [3.148494115588384, 101.69672876508707], [3.1903500000000236, 101.62545000000006], [3.1039100000000417, 101.71226000000007], [3.072750000000042, 101.71461000000005], [3.08280000000002, 101.72281000000004], [3.1292000000000257, 101.67844000000008], [3.1292000000000257, 101.67844000000008], [3.111020000000053, 101.66283000000004], [3.061870000000056, 101.74675000000008], [3.147890000000075, 101.69405000000006], [3.12916000000007, 101.68406000000004], [3.147770000000037, 101.70855000000006], [3.0578100000000745, 101.68965000000009], [3.1434800000000678, 101.64433000000008], [3.1511418740671178, 101.69937468268016], [3.129290000000026, 101.69896000000006], [3.17381000000006, 101.68276000000009], [3.061870000000056, 101.74675000000008], [3.163590000000056, 101.69811000000004], [3.1479700000000435, 101.66795000000008], [3.136442497863743, 101.69029641203808], [3.1419057131529953, 101.67967833034047], [3.1566851613075664, 101.69807685476978], [3.0833

In [48]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

In [49]:
# merge the coordinates into the original dataframe
kl_df['Latitude'] = df_coords['Latitude']
kl_df['Longitude'] = df_coords['Longitude']

In [50]:
# check the neighborhoods and the coordinates
print(kl_df.shape)
kl_df

(71, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Alam Damai,3.05769,101.74388
1,"Ampang, Kuala Lumpur",3.148494,101.696729
2,Bandar Menjalara,3.19035,101.62545
3,Bandar Sri Permaisuri,3.10391,101.71226
4,Bandar Tasik Selatan,3.07275,101.71461
5,Bandar Tun Razak,3.0828,101.72281
6,Bangsar,3.1292,101.67844
7,Bangsar Park,3.1292,101.67844
8,Bangsar South,3.11102,101.66283
9,Batu 11 Cheras,3.06187,101.74675


In [51]:
# save the DataFrame as CSV file
kl_df.to_csv("kl_df.csv", index=False)

### 4.Create a map of Kuala Lumpur with neighborhoods superimposed on top

In [52]:
# get the coordinates of Kuala Lumpur
address = 'Kuala Lumpur, Malaysia'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Kuala Lumpur, Malaysiae {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Kuala Lumpur, Malaysiae 3.1516964, 101.6942371.


In [53]:
# create map of Toronto using latitude and longitude values
map_kl = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(kl_df['Latitude'], kl_df['Longitude'], kl_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_kl)  
    
map_kl

In [54]:
# save the map as HTML file
map_kl.save('map_kl.html')

### 5.Use the Foursquare API to explore the neighborhoods

In [86]:
# define Foursquare Credentials and Version
CLIENT_ID = 'your Foursquare ID' # your Foursquare ID
CLIENT_SECRET = 'your Foursquare Secret' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: your Foursquare ID
CLIENT_SECRET:your Foursquare Secret


#### Now, let's get the top 100 venues that are within a radius of 2000 meters.

In [57]:
radius = 2000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(kl_df['Latitude'], kl_df['Longitude'], kl_df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id=MSJPRISQSWZYGK3GCMT53RYSCNKZB3IEPBMAC1NBAXJSX4XL&client_secret=5UQTFM4LIXSDXGKBQJXQOMEZZRKESNI4QQFMNBOCAOVLI0AD&v=20180605&ll=3.1516964,101.6942371&radius=2000&limit=100".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [58]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(7100, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Alam Damai,3.05769,101.74388,Adya Hotel Kuala Lumpur,3.151703,101.695623,Hotel
1,Alam Damai,3.05769,101.74388,BackHome Kuala Lumpur,3.148732,101.697887,Hostel
2,Alam Damai,3.05769,101.74388,Restoran Santa,3.149083,101.698156,South Indian Restaurant
3,Alam Damai,3.05769,101.74388,4Fingers Crispy Chicken,3.15613,101.69516,Fried Chicken Joint
4,Alam Damai,3.05769,101.74388,Urbanscapes House,3.146803,101.696028,Exhibit


#### Let's check how many venues were returned for each neighorhood

In [59]:
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alam Damai,100,100,100,100,100,100
"Ampang, Kuala Lumpur",100,100,100,100,100,100
Bandar Menjalara,100,100,100,100,100,100
Bandar Sri Permaisuri,100,100,100,100,100,100
Bandar Tasik Selatan,100,100,100,100,100,100
Bandar Tun Razak,100,100,100,100,100,100
Bangsar,100,100,100,100,100,100
Bangsar Park,100,100,100,100,100,100
Bangsar South,100,100,100,100,100,100
Batu 11 Cheras,100,100,100,100,100,100


#### Let's find out how many unique categories can be curated from all the returned venues

In [60]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 59 uniques categories.


In [61]:
# print out the list of categories
venues_df['VenueCategory'].unique()[:50]

array(['Hotel', 'Hostel', 'South Indian Restaurant',
       'Fried Chicken Joint', 'Exhibit', 'Shoe Store', 'Boutique',
       "Men's Store", 'Chettinad Restaurant', 'IT Services', 'Food Truck',
       'Gym', 'Indian Restaurant', 'Kebab Restaurant', 'Asian Restaurant',
       'Park', 'Convenience Store', 'Noodle House', 'Coffee Shop',
       'Speakeasy', 'Juice Bar', 'Malay Restaurant', 'Fabric Shop',
       'Museum', 'Latin American Restaurant', 'Dessert Shop',
       'Monument / Landmark', 'Hotel Pool', 'Dance Studio', 'Café',
       'Department Store', 'Soup Place', 'Thai Restaurant', 'Bar',
       'Grocery Store', 'Multiplex', 'Pool', 'Lounge', 'Tapas Restaurant',
       'Halal Restaurant', 'Club House', 'Spa',
       'Middle Eastern Restaurant', 'Dim Sum Restaurant',
       'Italian Restaurant', 'Gift Shop', 'Restaurant', 'Beer Bar',
       'Sushi Restaurant', 'Breakfast Spot'], dtype=object)

In [89]:
# check if the results contain "Hotel"
"Hotel" in venues_df['VenueCategory'].unique()

True

### 6.Analyze Each Neighborhood

In [63]:

# one hot encoding
kl_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
kl_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [kl_onehot.columns[-1]] + list(kl_onehot.columns[:-1])
kl_onehot = kl_onehot[fixed_columns]

print(kl_onehot.shape)
kl_onehot.head()

(7100, 60)


Unnamed: 0,Neighborhoods,Asian Restaurant,BBQ Joint,Bar,Beer Bar,Boutique,Breakfast Spot,Café,Chettinad Restaurant,Club House,Cocktail Bar,Coffee Shop,Convenience Store,Dance Studio,Department Store,Dessert Shop,Dim Sum Restaurant,Exhibit,Fabric Shop,Food Truck,Fried Chicken Joint,Gift Shop,Grocery Store,Gym,Halal Restaurant,Hostel,Hotel,Hotel Bar,Hotel Pool,IT Services,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Juice Bar,Kebab Restaurant,Latin American Restaurant,Lounge,Malay Restaurant,Men's Store,Middle Eastern Restaurant,Monument / Landmark,Multiplex,Museum,Noodle House,Park,Pool,Pub,Resort,Restaurant,Sandwich Place,Seafood Restaurant,Shoe Store,Soup Place,South Indian Restaurant,Spa,Speakeasy,Sushi Restaurant,Tapas Restaurant,Tennis Court,Thai Restaurant
0,Alam Damai,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Alam Damai,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Alam Damai,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,Alam Damai,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Alam Damai,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [64]:
kl_grouped = kl_onehot.groupby(["Neighborhoods"]).mean().reset_index()

print(kl_grouped.shape)
kl_grouped

(71, 60)


Unnamed: 0,Neighborhoods,Asian Restaurant,BBQ Joint,Bar,Beer Bar,Boutique,Breakfast Spot,Café,Chettinad Restaurant,Club House,Cocktail Bar,Coffee Shop,Convenience Store,Dance Studio,Department Store,Dessert Shop,Dim Sum Restaurant,Exhibit,Fabric Shop,Food Truck,Fried Chicken Joint,Gift Shop,Grocery Store,Gym,Halal Restaurant,Hostel,Hotel,Hotel Bar,Hotel Pool,IT Services,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Juice Bar,Kebab Restaurant,Latin American Restaurant,Lounge,Malay Restaurant,Men's Store,Middle Eastern Restaurant,Monument / Landmark,Multiplex,Museum,Noodle House,Park,Pool,Pub,Resort,Restaurant,Sandwich Place,Seafood Restaurant,Shoe Store,Soup Place,South Indian Restaurant,Spa,Speakeasy,Sushi Restaurant,Tapas Restaurant,Tennis Court,Thai Restaurant
0,Alam Damai,0.01,0.01,0.02,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.02,0.16,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.02,0.01,0.04,0.01,0.02,0.02,0.01,0.02,0.03,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.01
1,"Ampang, Kuala Lumpur",0.01,0.01,0.02,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.02,0.16,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.02,0.01,0.04,0.01,0.02,0.02,0.01,0.02,0.03,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.01
2,Bandar Menjalara,0.01,0.01,0.02,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.02,0.16,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.02,0.01,0.04,0.01,0.02,0.02,0.01,0.02,0.03,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.01
3,Bandar Sri Permaisuri,0.01,0.01,0.02,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.02,0.16,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.02,0.01,0.04,0.01,0.02,0.02,0.01,0.02,0.03,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.01
4,Bandar Tasik Selatan,0.01,0.01,0.02,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.02,0.16,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.02,0.01,0.04,0.01,0.02,0.02,0.01,0.02,0.03,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.01
5,Bandar Tun Razak,0.01,0.01,0.02,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.02,0.16,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.02,0.01,0.04,0.01,0.02,0.02,0.01,0.02,0.03,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.01
6,Bangsar,0.01,0.01,0.02,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.02,0.16,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.02,0.01,0.04,0.01,0.02,0.02,0.01,0.02,0.03,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.01
7,Bangsar Park,0.01,0.01,0.02,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.02,0.16,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.02,0.01,0.04,0.01,0.02,0.02,0.01,0.02,0.03,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.01
8,Bangsar South,0.01,0.01,0.02,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.02,0.16,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.02,0.01,0.04,0.01,0.02,0.02,0.01,0.02,0.03,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.01
9,Batu 11 Cheras,0.01,0.01,0.02,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.02,0.16,0.01,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.02,0.01,0.04,0.01,0.02,0.02,0.01,0.02,0.03,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.01


In [67]:
len(kl_grouped[kl_grouped["Hotel"] > 0])

71

### Create a new DataFrame for Hotel data only

In [90]:
kl_hotel = kl_grouped[["Neighborhoods","Hotel"]]

In [91]:
kl_hotel.head()

Unnamed: 0,Neighborhoods,Hotel
0,Alam Damai,0.16
1,"Ampang, Kuala Lumpur",0.16
2,Bandar Menjalara,0.16
3,Bandar Sri Permaisuri,0.16
4,Bandar Tasik Selatan,0.16


### 7. Cluster Neighborhoods

Run k-means to cluster the neighborhoods in Kuala Lumpur into 3 clusters.

In [74]:
# set number of clusters
kclusters = 3

kl_clustering = kl_mall.drop(["Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(kl_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

  return_n_iter=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [76]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
kl_merged = kl_mall.copy()

# add clustering labels
kl_merged["Cluster Labels"] = kmeans.labels_

In [77]:
kl_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
kl_merged.head()

Unnamed: 0,Neighborhood,Hotel,Cluster Labels
0,Alam Damai,0.16,0
1,"Ampang, Kuala Lumpur",0.16,0
2,Bandar Menjalara,0.16,0
3,Bandar Sri Permaisuri,0.16,0
4,Bandar Tasik Selatan,0.16,0


In [78]:

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
kl_merged = kl_merged.join(kl_df.set_index("Neighborhood"), on="Neighborhood")

print(kl_merged.shape)
kl_merged.head() # check the last columns!

(71, 5)


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Alam Damai,0.16,0,3.05769,101.74388
1,"Ampang, Kuala Lumpur",0.16,0,3.148494,101.696729
2,Bandar Menjalara,0.16,0,3.19035,101.62545
3,Bandar Sri Permaisuri,0.16,0,3.10391,101.71226
4,Bandar Tasik Selatan,0.16,0,3.07275,101.71461


In [79]:
# sort the results by Cluster Labels
print(kl_merged.shape)
kl_merged.sort_values(["Cluster Labels"], inplace=True)
kl_merged

(71, 5)


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Alam Damai,0.16,0,3.05769,101.74388
50,Shamelin,0.16,0,3.12457,101.73597
49,Setiawangsa,0.16,0,3.191803,101.74007
48,Setapak,0.16,0,3.18816,101.70415
47,"Sentul, Kuala Lumpur",0.16,0,3.17508,101.69305
46,Semarak,0.16,0,3.179916,101.721437
45,Segambut,0.16,0,3.18639,101.6681
51,Sri Hartamas,0.16,0,3.1622,101.65036
44,Salak South,0.16,0,3.08102,101.69724
42,"Pudu, Kuala Lumpur",0.16,0,3.13354,101.71307


### Finally, let's visualize the resulting clusters

In [80]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(kl_merged['Latitude'], kl_merged['Longitude'], kl_merged['Neighborhood'], kl_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [81]:
# save the map as HTML file
map_clusters.save('map_clusters.html')

### 8. Examine Clusters

### Cluster 0

In [82]:
kl_merged.loc[kl_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Alam Damai,0.16,0,3.05769,101.74388
50,Shamelin,0.16,0,3.12457,101.73597
49,Setiawangsa,0.16,0,3.191803,101.74007
48,Setapak,0.16,0,3.18816,101.70415
47,"Sentul, Kuala Lumpur",0.16,0,3.17508,101.69305
46,Semarak,0.16,0,3.179916,101.721437
45,Segambut,0.16,0,3.18639,101.6681
51,Sri Hartamas,0.16,0,3.1622,101.65036
44,Salak South,0.16,0,3.08102,101.69724
42,"Pudu, Kuala Lumpur",0.16,0,3.13354,101.71307


#### Cluster 1

In [84]:
kl_merged.loc[kl_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude


#### Cluster 2

In [85]:
kl_merged.loc[kl_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
