# Applied Data Science Capstone Course
## Week 5 Final Report
### Opening a New Shopping Mall in Beijing, China
• Build a dataframe of neighborhoods in Beijing, China by web scraping the data from Wikipedia page

• Get the geographical coordinates of the neighborhoods

• Obtain the venue data for the neighborhoods from Foursquare API

• Explore and cluster the neighborhoods

• Select the best cluster to open a new shopping mall

In [2]:
import requests
import urllib.request
import numpy as np 
import pandas as pd 
import json
from geopy.geocoders import Nominatim 
from bs4 import BeautifulSoup 
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from sklearn.cluster import KMeans


print("Libraries imported.")

Libraries imported.


### Scrap data from Wikipedia page into a DataFrame

In [3]:
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighbourhoods_of_Beijing").text

soup = BeautifulSoup(data, 'html.parser')

In [4]:
neighborhoodList = []


for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

bj_df = pd.DataFrame({"Neighborhood": neighborhoodList})

bj_df.head()

Unnamed: 0,Neighborhood
0,798 Art Zone
1,Andingmen
2,Beixinqiao Subdistrict
3,Brown Stone
4,Caishikou


In [5]:
bj_df.shape

(50, 1)

### Get the Geographical Coordinates

In [6]:
import geocoder


In [7]:
def get_latlng(neighborhood):
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Beijing, China'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [8]:
coords = [ get_latlng(neighborhood) for neighborhood in bj_df["Neighborhood"].tolist() ]

coords

[[39.90750000000003, 116.39723000000004],
 [39.94382000000007, 116.39952000000005],
 [39.733600000000024, 116.7316800000001],
 [39.90750000000003, 116.39723000000004],
 [39.64283000000006, 115.97881000000007],
 [39.90750000000003, 116.39723000000004],
 [39.914600000000064, 116.41671000000008],
 [39.89972000000006, 116.41222000000005],
 [39.888780000000054, 116.46472000000006],
 [39.98635000000007, 116.48395000000005],
 [39.90750000000003, 116.39723000000004],
 [39.90750000000003, 116.39723000000004],
 [39.94126000000006, 116.37929000000008],
 [39.93943000000007, 116.39301000000012],
 [39.90750000000003, 116.39723000000004],
 [39.90750000000003, 116.39723000000004],
 [39.93596000000008, 116.43027000000006],
 [40.47831000000008, 116.04172000000005],
 [39.92295000000007, 116.3478],
 [39.90556000000004, 116.35111000000006],
 [40.170570000000055, 116.49630000000002],
 [39.90750000000003, 116.39723000000004],
 [39.88806000000005, 116.34194000000002],
 [39.90750000000003, 116.39723000000004],

In [9]:
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])


bj_df['Latitude'] = df_coords['Latitude']
bj_df['Longitude'] = df_coords['Longitude']

print(bj_df.shape)

(50, 3)


In [10]:
bj_df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,798 Art Zone,39.9075,116.39723
1,Andingmen,39.94382,116.39952
2,Beixinqiao Subdistrict,39.7336,116.73168
3,Brown Stone,39.9075,116.39723
4,Caishikou,39.64283,115.97881
5,Beijing central business district,39.9075,116.39723
6,Chaoyangmen,39.9146,116.41671
7,Chongwenmen,39.89972,116.41222
8,Chuiyangliu,39.88878,116.46472
9,Dashanzi,39.98635,116.48395


### Create a map of Beijing with neighborhoods superimposed on top

In [11]:
address = 'Beijing, China'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Beijing, China {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Beijing, China 40.190632, 116.412144.


In [12]:
map_bj = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, neighborhood in zip(bj_df['Latitude'], bj_df['Longitude'], bj_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_bj)  
    
map_bj

# save the map as HTML file
map_bj.save('map_bj.html')

In [13]:
map_bj

### Use the Foursquare API to explore the neighborhoods

In [14]:
# define Foursquare Credentials and Version
CLIENT_ID = 'ZMVWHCS5SPIAZPYLYPFNEUKQXIR2NNNHS3BYWPEYPM3FSCVI' # your Foursquare ID
CLIENT_SECRET = 'X4SOTQRPZXPO0P3PMNUY0FZ2P2GM3NA1TYLOOFT2ZBP0DZAM' # your Foursquare Secret
VERSION = '20181130' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ZMVWHCS5SPIAZPYLYPFNEUKQXIR2NNNHS3BYWPEYPM3FSCVI
CLIENT_SECRET:X4SOTQRPZXPO0P3PMNUY0FZ2P2GM3NA1TYLOOFT2ZBP0DZAM


In [15]:
radius = 2000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(bj_df['Latitude'], bj_df['Longitude'], bj_df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(3559, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,798 Art Zone,39.9075,116.39723,Tian'anmen Tower (天安门城楼),39.906562,116.391582,Historic Site
1,798 Art Zone,39.9075,116.39723,端门,39.908694,116.391192,Historic Site
2,798 Art Zone,39.9075,116.39723,Din Tai Fung (鼎泰丰),39.91363,116.405766,Dumpling Restaurant
3,798 Art Zone,39.9075,116.39723,Gate of Supreme Harmony (太和门),39.913719,116.39088,Historic Site
4,798 Art Zone,39.9075,116.39723,Lost Heaven 花马天堂,39.900272,116.395612,Yunnan Restaurant


In [16]:
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
798 Art Zone,100,100,100,100,100,100
Andingmen,100,100,100,100,100,100
Beijing central business district,100,100,100,100,100,100
Brown Stone,100,100,100,100,100,100
Chaoyangmen,100,100,100,100,100,100
Chongwenmen,100,100,100,100,100,100
Chuiyangliu,62,62,62,62,62,62
Dashanzi,100,100,100,100,100,100
Dashilan Subdistrict,100,100,100,100,100,100
Dengshikou,100,100,100,100,100,100


In [17]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))
# print out the list of categories
venues_df['VenueCategory'].unique()[:50]

There are 182 uniques categories.


array(['Historic Site', 'Dumpling Restaurant', 'Yunnan Restaurant',
       'Lounge', 'Sculpture Garden', 'Shopping Mall', 'Park',
       'Chinese Restaurant', 'Tea Room', 'Zhejiang Restaurant',
       'Peking Duck Restaurant', 'Plaza', 'Electronics Store', 'Hotel',
       'Concert Hall', 'American Restaurant', 'Performing Arts Venue',
       'Church', 'Toy / Game Store', 'Steakhouse', 'Coffee Shop',
       'Bookstore', 'Garden', 'Beijing Restaurant', 'History Museum',
       'French Restaurant', 'Fast Food Restaurant', 'Restaurant',
       'Hostel', 'Clothing Store', 'Monument / Landmark',
       'Cantonese Restaurant', 'Pizza Place', 'Shopping Plaza',
       'Japanese Restaurant', 'Dessert Shop', 'Cocktail Bar', 'Café',
       'Bar', 'BBQ Joint', 'Vegetarian / Vegan Restaurant', 'Art Museum',
       'Scenic Lookout', 'Tiki Bar', 'Buddhist Temple',
       'Szechuan Restaurant', 'Brewery', 'Italian Restaurant', 'Butcher',
       'Furniture / Home Store'], dtype=object)

In [18]:
"Neighborhood" in venues_df['VenueCategory'].unique()

False

### Analyze Each Neighborhood

In [19]:
bj_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")


bj_onehot['Neighborhoods'] = venues_df['Neighborhood'] 


fixed_columns = [bj_onehot.columns[-1]] + list(bj_onehot.columns[:-1])
bj_onehot = bj_onehot[fixed_columns]

print(bj_onehot.shape)
bj_onehot.head()

(3559, 183)


Unnamed: 0,Neighborhoods,American Restaurant,Antique Shop,Aquarium,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,...,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wings Joint,Xinjiang Restaurant,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant,Zoo,Zoo Exhibit
0,798 Art Zone,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,798 Art Zone,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,798 Art Zone,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,798 Art Zone,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,798 Art Zone,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [20]:
bj_grouped = bj_onehot.groupby(["Neighborhoods"]).mean().reset_index()

print(bj_grouped.shape)
bj_grouped

(44, 183)


Unnamed: 0,Neighborhoods,American Restaurant,Antique Shop,Aquarium,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,...,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wings Joint,Xinjiang Restaurant,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant,Zoo,Zoo Exhibit
0,798 Art Zone,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,...,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0
1,Andingmen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.03,...,0.03,0.01,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0
2,Beijing central business district,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,...,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0
3,Brown Stone,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,...,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0
4,Chaoyangmen,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,...,0.01,0.02,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0
5,Chongwenmen,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0
6,Chuiyangliu,0.0,0.0,0.0,0.0,0.032258,0.0,0.016129,0.016129,0.016129,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Dashanzi,0.03,0.0,0.0,0.0,0.03,0.01,0.01,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0
8,Dashilan Subdistrict,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,...,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0
9,Dengshikou,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,...,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0


In [21]:
len(bj_grouped[bj_grouped["Shopping Mall"] > 0])

32

### Create a new DataFrame for Shopping Mall data only

In [22]:
bj_mall = bj_grouped[["Neighborhoods","Shopping Mall"]]

bj_mall.head()

Unnamed: 0,Neighborhoods,Shopping Mall
0,798 Art Zone,0.02
1,Andingmen,0.0
2,Beijing central business district,0.02
3,Brown Stone,0.02
4,Chaoyangmen,0.04


### Cluster Neighborhoods
#### Run k-means to cluster the neighborhoods in Beijinginto 3 clusters.

In [34]:
# set number of clusters
kclusters = 4

bj_clustering = bj_mall.drop(["Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(bj_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 2, 0, 0, 3, 0, 3, 3, 0, 0])

In [35]:
bj_merged = bj_mall.copy()

# add clustering labels
bj_merged["Cluster Labels"] = kmeans.labels_

bj_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
bj_merged.head()

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels
0,798 Art Zone,0.02,0
1,Andingmen,0.0,2
2,Beijing central business district,0.02,0
3,Brown Stone,0.02,0
4,Chaoyangmen,0.04,3


In [36]:

bj_merged = bj_merged.join(bj_df.set_index("Neighborhood"), on="Neighborhood")

print(bj_merged.shape)
bj_merged.head() # check the last columns!

(44, 5)


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,798 Art Zone,0.02,0,39.9075,116.39723
1,Andingmen,0.0,2,39.94382,116.39952
2,Beijing central business district,0.02,0,39.9075,116.39723
3,Brown Stone,0.02,0,39.9075,116.39723
4,Chaoyangmen,0.04,3,39.9146,116.41671


In [37]:
# sort the results by Cluster Labels
print(bj_merged.shape)
bj_merged.sort_values(["Cluster Labels"], inplace=True)
bj_merged

(44, 5)


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,798 Art Zone,0.02,0,39.9075,116.39723
40,Yabaolu,0.02,0,39.9075,116.39723
39,Xuanwumen (Beijing),0.02,0,39.9075,116.39723
38,Xizhimen,0.018519,0,39.93889,116.35028
36,Xidan,0.02,0,39.9075,116.39723
33,Wangjing Subdistrict,0.03,0,39.9933,116.47284
31,Shifoying,0.02,0,39.9075,116.39723
30,Sanlitun,0.02,0,39.93609,116.44375
29,Ping'anli,0.02,0,39.9075,116.39723
28,Niujie,0.027027,0,39.88301,116.35703


### Visualize the resulting clusters

In [38]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(bj_merged['Latitude'], bj_merged['Longitude'], bj_merged['Neighborhood'], bj_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [39]:
map_clusters.save('map_clusters.html')

### Examine the Clusters

## Cluster 0

In [41]:
bj_merged.loc[bj_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,798 Art Zone,0.02,0,39.9075,116.39723
40,Yabaolu,0.02,0,39.9075,116.39723
39,Xuanwumen (Beijing),0.02,0,39.9075,116.39723
38,Xizhimen,0.018519,0,39.93889,116.35028
36,Xidan,0.02,0,39.9075,116.39723
33,Wangjing Subdistrict,0.03,0,39.9933,116.47284
31,Shifoying,0.02,0,39.9075,116.39723
30,Sanlitun,0.02,0,39.93609,116.44375
29,Ping'anli,0.02,0,39.9075,116.39723
28,Niujie,0.027027,0,39.88301,116.35703


## Cluster 1

In [42]:
bj_merged.loc[bj_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
24,Huilongguan,0.103448,1,40.07718,116.33527


## Cluster 2

In [43]:
bj_merged.loc[bj_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
41,Yayuncun Subdistrict,0.0,2,40.01388,116.39644
1,Andingmen,0.0,2,39.94382,116.39952
37,Xinjiangcun,0.0,2,23.20555,113.53199
35,Wudaokou,0.0,2,39.99257,116.33208
34,"Weigongcun, Beijing",0.0,2,39.95736,116.31273
32,Tiantongyuan,0.0,2,26.78194,112.13472
21,"Hepingli Subdistrict, Beijing",0.0,2,39.95276,116.41093
10,Deshengmen,0.0,2,39.94126,116.37929
11,Di'anmen,0.0,2,39.93943,116.39301
42,Yongdingmen,0.0,2,39.87028,116.39306


## Cluster 3

In [44]:
bj_merged.loc[bj_merged['Cluster Labels'] == 3]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
6,Chuiyangliu,0.048387,3,39.88878,116.46472
15,Fuchengmen,0.045455,3,39.92295,116.3478
25,Jianguomen,0.04,3,39.9146,116.41671
4,Chaoyangmen,0.04,3,39.9146,116.41671
22,Hepingmen,0.033708,3,39.89996,116.37435
14,Dongzhimen,0.04,3,39.93596,116.43027
7,Dashanzi,0.04,3,39.98635,116.48395
16,Fuxingmen,0.056338,3,39.90556,116.35111
