# **Import libraries and packages**

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


# **Scrap data of Hanoi neighbourhoods page into a DataFrame**

In [0]:
# Wikipedia url
wiki_url = 'https://vi.wikipedia.org/wiki/B%E1%BA%A3n_m%E1%BA%ABu:%C4%90%C6%A1n_v%E1%BB%8B_h%C3%A0nh_ch%C3%ADnh_thu%E1%BB%99c_th%C3%A0nh_ph%E1%BB%91_H%C3%A0_N%E1%BB%99i'

# Send GET request
data = requests.get(wiki_url).text

# Parse data from the html using a BeautifulSoup object
soup = BeautifulSoup(data, 'html.parser')

In [0]:
# Create two lists for storing the data
district = []
ward = []
ward_per_district = []

In [4]:
# Append district names into the list
for row in soup.find_all("th", class_="navbox-group"):
    if ("Quận" in row.text): 
      district.append(row.text.replace("Quận", "").strip()) # remove Quận(Vietnamese word for district) from the district name
    elif ("Huyện" in row.text):
      district.append(row.text.replace("Huyện", "").strip()) # remove Huyện(another Vietnamese word for district) from the district name

district

['Ba Đình',
 'Bắc Từ Liêm',
 'Cầu Giấy',
 'Đống Đa',
 'Hà Đông',
 'Hai Bà Trưng',
 'Hoàn Kiếm',
 'Hoàng Mai',
 'Long Biên',
 'Nam Từ Liêm',
 'Tây Hồ',
 'Thanh Xuân',
 'Ba Vì',
 'Chương Mỹ',
 'Đan Phượng',
 'Đông Anh',
 'Gia Lâm',
 'Hoài Đức',
 'Mê Linh',
 'Mỹ Đức',
 'Phú Xuyên',
 'Phúc Thọ',
 'Quốc Oai',
 'Sóc Sơn',
 'Thạch Thất',
 'Thanh Oai',
 'Thanh Trì',
 'Thường Tín',
 'Ứng Hòa']

In [5]:
'''
rows = soup.select('table tr')[2:] # skip first row as it shows other information

# For each row of the table, find all the table data then append to the corresponding list
for row in rows:
    cells = row.find_all('td')
    if(len(cells) > 0):
      tmp = []
      counter = 0
      for str in cells[0].text.replace("\n", "").replace("\xa0", "").replace(" ", "").split(":"):
        while counter < len(str):
          char = str[counter]
          if (char.isdigit() and str[str.index(char)+1].isdigit()):
            tmp.append(int(char+str[str.index(char)+1]))
            counter += 2
          elif (char.isdigit() and not str[str.index(char)+1].isdigit()):
            tmp.append(int(char))
            counter += 1
          else:
            counter += 1

      ward_per_district.append(sum(tmp))
      #ward_per_district.append([int(s+b[b.index(s)+1]) for b in cells[0].text.replace("\n", "").replace("\xa0", "").replace(" ", "").split(":") for s in b if (s.isdigit() and b[b.index(s)+1].isdigit())])
      ward.append(cells[0].text.replace("\n", "").replace("\xa0", ""))


ward_per_district
'''

'\nrows = soup.select(\'table tr\')[2:] # skip first row as it shows other information\n\n# For each row of the table, find all the table data then append to the corresponding list\nfor row in rows:\n    cells = row.find_all(\'td\')\n    if(len(cells) > 0):\n      tmp = []\n      counter = 0\n      for str in cells[0].text.replace("\n", "").replace("\xa0", "").replace(" ", "").split(":"):\n        while counter < len(str):\n          char = str[counter]\n          if (char.isdigit() and str[str.index(char)+1].isdigit()):\n            tmp.append(int(char+str[str.index(char)+1]))\n            counter += 2\n          elif (char.isdigit() and not str[str.index(char)+1].isdigit()):\n            tmp.append(int(char))\n            counter += 1\n          else:\n            counter += 1\n\n      ward_per_district.append(sum(tmp))\n      #ward_per_district.append([int(s+b[b.index(s)+1]) for b in cells[0].text.replace("\n", "").replace("\xa0", "").replace(" ", "").split(":") for s in b if (s.isd

In [6]:
# create a new DataFrame from the list
hn_df = pd.DataFrame({"District": district})

hn_df.head()

Unnamed: 0,District
0,Ba Đình
1,Bắc Từ Liêm
2,Cầu Giấy
3,Đống Đa
4,Hà Đông


# **Get geographical coordinates**

In [0]:
geolocator = Nominatim(user_agent="my-application")

In [8]:
# Get latitudes
district_lat = [geolocator.geocode(district).latitude for district in district]

district_lat

[21.0363054,
 21.0698605,
 21.0363077,
 21.0128913,
 20.9524428,
 21.0059701,
 21.0289343,
 19.26758545,
 21.0393411,
 21.0128458,
 30.2616958,
 20.9938862,
 21.157241650000003,
 20.8784745,
 33.7042605,
 21.1361069,
 21.0237482,
 21.0115012,
 11.9605987,
 20.697382599999997,
 20.72904585,
 21.1372972,
 20.978094300000002,
 21.2808747,
 21.0235566,
 20.8602693,
 20.941461599999997,
 20.8319978,
 20.7110772]

In [10]:
# Get longitudes
district_lon = [geolocator.geocode(district).longitude for district in district]

district_lon

[105.8289861,
 105.7573392,
 105.7860752,
 105.8277098,
 105.7609551,
 105.8574845,
 105.8522605,
 105.70557053396413,
 105.8922453,
 105.7608745,
 120.1256628,
 105.8146705,
 105.37676448554157,
 105.64924976305664,
 110.4272323,
 105.8426874,
 105.97041496102682,
 105.7076862,
 108.4698883,
 105.71577535499549,
 105.91023981311886,
 105.5519049,
 105.62945748084047,
 105.82924029963563,
 105.55383629633194,
 105.7800155750608,
 105.83603265035416,
 105.87006434087073,
 105.81433035379561]

In [0]:
# Merge the coordinates into the original dataframe
hn_df['Latitude'] = district_lat
hn_df['Longitude'] = district_lon

In [12]:
hn_df.head()

Unnamed: 0,District,Latitude,Longitude
0,Ba Đình,21.036305,105.828986
1,Bắc Từ Liêm,21.069861,105.757339
2,Cầu Giấy,21.036308,105.786075
3,Đống Đa,21.012891,105.82771
4,Hà Đông,20.952443,105.760955


# **Create a map of Hanoi with districts on top**

In [0]:
# Get the coordinates of Hanoi
address = 'Hanoi, Vietnam'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Hanoi, Vietnam {}, {}.'.format(latitude, longitude))

In [14]:
# create map of Hanoi using latitude and longitude values
map_hanoi = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(hn_df['Latitude'], hn_df['Longitude'], hn_df['District']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_hanoi)  
    
map_hanoi

# **Use the Foursquare API to explore the districts**

In [15]:
# define Foursquare Credentials and Version
CLIENT_ID = 'IZ2BYX5E4VKW5PQG4IU5YUPIFIDYSWHUYAPVD1DX01U5C50S' # your Foursquare ID
CLIENT_SECRET = '5T43EVKG5RTVJZPPBVKNRHYGSG3GB31WGLYYLRQOYOWRJYNV' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: IZ2BYX5E4VKW5PQG4IU5YUPIFIDYSWHUYAPVD1DX01U5C50S
CLIENT_SECRET:5T43EVKG5RTVJZPPBVKNRHYGSG3GB31WGLYYLRQOYOWRJYNV



**Now, let's get the top 100 venues that are within a radius of 2000 meters**

In [0]:
radius = 2000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(hn_df['Latitude'], hn_df['Longitude'], hn_df['District']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [17]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['District', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(650, 7)


Unnamed: 0,District,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Ba Đình,21.036305,105.828986,Bình Minh,21.031457,105.832087,Fried Chicken Joint
1,Ba Đình,21.036305,105.828986,Lý Văn Phúc (Chicken Street),21.031233,105.832014,Wings Joint
2,Ba Đình,21.036305,105.828986,Bia Hơi 19C Ngọc Hà,21.03728,105.831327,Beer Garden
3,Ba Đình,21.036305,105.828986,Văn Miếu Quốc Tử Giám (Temple of Literature) (...,21.028707,105.836005,Confucian Temple
4,Ba Đình,21.036305,105.828986,Zennova Massage,21.029979,105.825851,Massage Studio


**Let's check how many venues were returned for each neighorhood**

In [18]:
venues_df.groupby(["District"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
District,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ba Vì,1,1,1,1,1,1
Ba Đình,100,100,100,100,100,100
Bắc Từ Liêm,7,7,7,7,7,7
Cầu Giấy,61,61,61,61,61,61
Gia Lâm,3,3,3,3,3,3
Hai Bà Trưng,100,100,100,100,100,100
Hoài Đức,2,2,2,2,2,2
Hoàn Kiếm,100,100,100,100,100,100
Hoàng Mai,5,5,5,5,5,5
Hà Đông,5,5,5,5,5,5


**Let's find out how many unique categories can be curated from all the returned venues**

In [19]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 133 uniques categories.


In [20]:
# print out the list of categories
venues_df['VenueCategory'].unique()

array(['Fried Chicken Joint', 'Wings Joint', 'Beer Garden',
       'Confucian Temple', 'Massage Studio', 'Temple', 'BBQ Joint',
       'Coffee Shop', 'Monument / Landmark', 'Hotel',
       'Vietnamese Restaurant', 'Rock Club', 'Beer Bar',
       'Hotpot Restaurant', 'Pizza Place', 'Steakhouse', 'Café',
       'Bookstore', 'Noodle House', 'Spa', 'Buddhist Temple',
       'Vegetarian / Vegan Restaurant', 'Bar', 'Cocktail Bar',
       'Scenic Lookout', 'Asian Restaurant', 'Lounge', 'Church', 'Lake',
       'Sushi Restaurant', 'Wine Bar', 'Ice Cream Shop', 'Dessert Shop',
       'Supermarket', 'Wedding Hall', 'Hotel Bar', 'Art Museum',
       'Sandwich Place', 'Food Court', 'Restaurant',
       'Japanese Restaurant', 'Tea Room', 'Chinese Restaurant',
       'Russian Restaurant', 'Shopping Mall', 'Himalayan Restaurant',
       'Roof Deck', 'Korean Restaurant', 'Paintball Field',
       'Historic Site', 'Flower Shop', 'Park', 'Multiplex', 'Museum',
       'Fish & Chips Shop', 'Food Truck', '

# **Analyze Each Neighborhood**

In [21]:
# one hot encoding
hn_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
hn_onehot['District'] = venues_df['District'] 

# move neighborhood column to the first column
fixed_columns = [hn_onehot.columns[-1]] + list(hn_onehot.columns[:-1])
hn_onehot = hn_onehot[fixed_columns]

print(hn_onehot.shape)
hn_onehot.head()

(650, 134)


Unnamed: 0,District,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Australian Restaurant,Auto Garage,BBQ Joint,Bakery,Bar,Beer Bar,Beer Garden,Big Box Store,Bistro,Boat or Ferry,Bookstore,Botanical Garden,Bowling Alley,Brewery,Bubble Tea Shop,Buddhist Temple,Buffet,Bulgarian Restaurant,Burger Joint,Bus Station,Cafeteria,Café,Campground,Chinese Restaurant,Chocolate Shop,Church,Cocktail Bar,Coffee Shop,College Cafeteria,Comic Shop,Confucian Temple,Cultural Center,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run,Electronics Store,English Restaurant,Farm,Fast Food Restaurant,Fish & Chips Shop,Flower Shop,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Garden,Gift Shop,Golf Course,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hainan Restaurant,Himalayan Restaurant,Historic Site,History Museum,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Italian Restaurant,Japanese Restaurant,Juice Bar,Karaoke Bar,Korean Restaurant,Lake,Light Rail Station,Lounge,Market,Massage Studio,Men's Store,Mobile Phone Shop,Modern European Restaurant,Monument / Landmark,Motel,Mountain,Movie Theater,Multiplex,Museum,Music Venue,Nightclub,Noodle House,Outdoors & Recreation,Outlet Store,Paintball Field,Park,Pedestrian Plaza,Performing Arts Venue,Pizza Place,Pub,Ramen Restaurant,Restaurant,River,Rock Club,Roof Deck,Russian Restaurant,Salad Place,Sandwich Place,Scenic Lookout,Seafood Restaurant,Shopping Mall,Snack Place,Soccer Field,Spa,Spiritual Center,Stadium,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Tea Room,Temple,Thai Restaurant,Theater,Tiki Bar,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wedding Hall,Wine Bar,Wings Joint,Zhejiang Restaurant
0,Ba Đình,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Ba Đình,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,Ba Đình,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Ba Đình,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Ba Đình,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## **Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category**

In [22]:
hn_grouped = hn_onehot.groupby(["District"]).mean().reset_index()

print(hn_grouped.shape)
hn_grouped

(25, 134)


Unnamed: 0,District,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Australian Restaurant,Auto Garage,BBQ Joint,Bakery,Bar,Beer Bar,Beer Garden,Big Box Store,Bistro,Boat or Ferry,Bookstore,Botanical Garden,Bowling Alley,Brewery,Bubble Tea Shop,Buddhist Temple,Buffet,Bulgarian Restaurant,Burger Joint,Bus Station,Cafeteria,Café,Campground,Chinese Restaurant,Chocolate Shop,Church,Cocktail Bar,Coffee Shop,College Cafeteria,Comic Shop,Confucian Temple,Cultural Center,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run,Electronics Store,English Restaurant,Farm,Fast Food Restaurant,Fish & Chips Shop,Flower Shop,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Garden,Gift Shop,Golf Course,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hainan Restaurant,Himalayan Restaurant,Historic Site,History Museum,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Italian Restaurant,Japanese Restaurant,Juice Bar,Karaoke Bar,Korean Restaurant,Lake,Light Rail Station,Lounge,Market,Massage Studio,Men's Store,Mobile Phone Shop,Modern European Restaurant,Monument / Landmark,Motel,Mountain,Movie Theater,Multiplex,Museum,Music Venue,Nightclub,Noodle House,Outdoors & Recreation,Outlet Store,Paintball Field,Park,Pedestrian Plaza,Performing Arts Venue,Pizza Place,Pub,Ramen Restaurant,Restaurant,River,Rock Club,Roof Deck,Russian Restaurant,Salad Place,Sandwich Place,Scenic Lookout,Seafood Restaurant,Shopping Mall,Snack Place,Soccer Field,Spa,Spiritual Center,Stadium,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Tea Room,Temple,Thai Restaurant,Theater,Tiki Bar,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wedding Hall,Wine Bar,Wings Joint,Zhejiang Restaurant
0,Ba Vì,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ba Đình,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.02,0.0,0.02,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.01,0.0,0.01,0.01,0.06,0.0,0.0,0.01,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.11,0.01,0.02,0.02,0.0,0.03,0.0,0.0,0.02,0.01,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.02,0.01,0.01,0.0,0.01,0.01,0.0,0.01,0.0,0.0,0.02,0.0,0.0,0.01,0.02,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.02,0.13,0.01,0.01,0.01,0.0
2,Bắc Từ Liêm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0
3,Cầu Giấy,0.016393,0.0,0.016393,0.0,0.0,0.0,0.016393,0.0,0.016393,0.016393,0.0,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.131148,0.0,0.0,0.0,0.0,0.016393,0.098361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.032787,0.016393,0.0,0.0,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016393,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.04918,0.0,0.0,0.0,0.0,0.065574,0.0,0.0,0.032787,0.0,0.0,0.0,0.016393,0.0,0.0,0.016393,0.0,0.0,0.0,0.0,0.0,0.032787,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.032787,0.0,0.0,0.04918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016393,0.0,0.016393,0.032787,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.016393,0.0,0.016393,0.0,0.016393,0.0,0.0,0.0,0.0,0.0,0.098361,0.0,0.0,0.0,0.0
4,Gia Lâm,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Hai Bà Trưng,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.03,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.01,0.0,0.0,0.12,0.0,0.01,0.0,0.01,0.03,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.06,0.01,0.02,0.0,0.01,0.06,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.04,0.0,0.0,0.0,0.01,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.02,0.01,0.0,0.0,0.0,0.01,0.19,0.0,0.0,0.0,0.0
6,Hoài Đức,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hoàn Kiếm,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.07,0.0,0.0,0.01,0.0,0.02,0.14,0.0,0.0,0.0,0.01,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.17,0.01,0.0,0.02,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.02,0.17,0.0,0.0,0.0,0.0
8,Hoàng Mai,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
9,Hà Đông,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## **Create a new DataFrame for Hotel data only**

In [0]:
hn_hotel = hn_grouped[["District","Hotel", "Hostel", "Motel"]]

In [24]:
hn_hotel.head()

Unnamed: 0,District,Hotel,Hostel,Motel
0,Ba Vì,0.0,0.0,0.0
1,Ba Đình,0.11,0.0,0.0
2,Bắc Từ Liêm,0.0,0.0,0.0
3,Cầu Giấy,0.04918,0.0,0.0
4,Gia Lâm,0.0,0.0,0.0


In [0]:
# Drop all rural districts
hn_hotel = hn_hotel.drop(hn_hotel[(hn_hotel['Hotel'] == 0.0) & (hn_hotel['Hostel'] == 0.0) & (hn_hotel['Motel'] == 0.0)].index)

In [30]:
hn_hotel

Unnamed: 0,District,Hotel,Hostel,Motel
1,Ba Đình,0.11,0.0,0.0
3,Cầu Giấy,0.04918,0.0,0.0
5,Hai Bà Trưng,0.06,0.0,0.0
7,Hoàn Kiếm,0.17,0.02,0.0
8,Hoàng Mai,0.2,0.0,0.2
11,Mê Linh,0.142857,0.0,0.0
13,Nam Từ Liêm,0.045455,0.0,0.0
21,Tây Hồ,0.079365,0.0,0.0
23,Đống Đa,0.013158,0.0,0.0


## **Cluster Neighborhoods**

Run k-means to cluster the districts in Hanoi into 3 clusters.

In [26]:
# set number of clusters
kclusters = 3

hn_clustering = hn_hotel.drop(["District"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(hn_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 1, 1, 0, 2, 0, 1, 1, 1], dtype=int32)

In [0]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each district
hn_merged = hn_hotel.copy()

# add clustering labels
hn_merged["Cluster Labels"] = kmeans.labels_

In [28]:
hn_merged.rename(columns={"District": "District"}, inplace=True)
hn_merged.head()

Unnamed: 0,District,Hotel,Hostel,Motel,Cluster Labels
1,Ba Đình,0.11,0.0,0.0,0
3,Cầu Giấy,0.04918,0.0,0.0,1
5,Hai Bà Trưng,0.06,0.0,0.0,1
7,Hoàn Kiếm,0.17,0.02,0.0,0
8,Hoàng Mai,0.2,0.0,0.2,2


In [29]:
# merge hn_merged with hn_df to add latitude/longitude for each neighborhood
hn_merged = hn_merged.join(hn_df.set_index("District"), on="District")

print(hn_merged.shape)
hn_merged.head() # check the last columns!

(9, 7)


Unnamed: 0,District,Hotel,Hostel,Motel,Cluster Labels,Latitude,Longitude
1,Ba Đình,0.11,0.0,0.0,0,21.036305,105.828986
3,Cầu Giấy,0.04918,0.0,0.0,1,21.036308,105.786075
5,Hai Bà Trưng,0.06,0.0,0.0,1,21.00597,105.857484
7,Hoàn Kiếm,0.17,0.02,0.0,0,21.028934,105.85226
8,Hoàng Mai,0.2,0.0,0.2,2,19.267585,105.705571


In [31]:
# sort the results by Cluster Labels
print(hn_merged.shape)
hn_merged.sort_values(["Cluster Labels"], inplace=True)
hn_merged

(9, 7)


Unnamed: 0,District,Hotel,Hostel,Motel,Cluster Labels,Latitude,Longitude
1,Ba Đình,0.11,0.0,0.0,0,21.036305,105.828986
7,Hoàn Kiếm,0.17,0.02,0.0,0,21.028934,105.85226
11,Mê Linh,0.142857,0.0,0.0,0,11.960599,108.469888
3,Cầu Giấy,0.04918,0.0,0.0,1,21.036308,105.786075
5,Hai Bà Trưng,0.06,0.0,0.0,1,21.00597,105.857484
13,Nam Từ Liêm,0.045455,0.0,0.0,1,21.012846,105.760874
21,Tây Hồ,0.079365,0.0,0.0,1,30.261696,120.125663
23,Đống Đa,0.013158,0.0,0.0,1,21.012891,105.82771
8,Hoàng Mai,0.2,0.0,0.2,2,19.267585,105.705571


## **Finally, let's visualize the resulting clusters**

In [44]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
rainbow = ["red", "blue", "orange"]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(hn_merged['Latitude'], hn_merged['Longitude'], hn_merged['District'], hn_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# **Examine Clusters**

**Cluster 0**

In [45]:
hn_merged.loc[hn_merged['Cluster Labels'] == 0]

Unnamed: 0,District,Hotel,Hostel,Motel,Cluster Labels,Latitude,Longitude
1,Ba Đình,0.11,0.0,0.0,0,21.036305,105.828986
7,Hoàn Kiếm,0.17,0.02,0.0,0,21.028934,105.85226
11,Mê Linh,0.142857,0.0,0.0,0,11.960599,108.469888


**Cluster 1**

In [46]:
hn_merged.loc[hn_merged['Cluster Labels'] == 1]

Unnamed: 0,District,Hotel,Hostel,Motel,Cluster Labels,Latitude,Longitude
3,Cầu Giấy,0.04918,0.0,0.0,1,21.036308,105.786075
5,Hai Bà Trưng,0.06,0.0,0.0,1,21.00597,105.857484
13,Nam Từ Liêm,0.045455,0.0,0.0,1,21.012846,105.760874
21,Tây Hồ,0.079365,0.0,0.0,1,30.261696,120.125663
23,Đống Đa,0.013158,0.0,0.0,1,21.012891,105.82771


**Cluster 2**

In [47]:
hn_merged.loc[hn_merged['Cluster Labels'] == 2]

Unnamed: 0,District,Hotel,Hostel,Motel,Cluster Labels,Latitude,Longitude
8,Hoàng Mai,0.2,0.0,0.2,2,19.267585,105.705571


**Observations:**  

Most of the hotels, hostels and motels are concentrated in the central area of Hanoi, with the highest number in cluster 1 and moderate number in cluster 0. On the other hand, cluster 2 has very low number to totally no hotel in the neighborhoods. This represents a great opportunity and high potential areas to open new hotels as there is very little to no competition from existing ones. Meanwhile, hotels in cluster 1 are likely to suffer from intense competition due to oversupply and high concentration. From another perspective, this also shows that the oversupply of hotels mostly happened in the central area of the city, with the suburb area still have very few hotels. Therefore, this project recommends property developers to capitalize on these findings to open new hotels in the district in cluster 2 with little to no competition. Property developers with unique selling propositions to stand out from the competition can also open new hotels in districts in cluster 0 with moderate competition. Lastly, property developers are advised to avoid districts in cluster 1 which already have high concentration of hotels and suffering from intense competition.