# Importing libraries

In [1]:
import getpass
import os
import time
from io import StringIO

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
import geocoder


Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



# Scraping the neighborhoods data

In [2]:
# Download the Wikipedia page listing Dubai's communities.
# A timeout prevents the cell from hanging forever, and raise_for_status()
# surfaces HTTP errors here instead of as a confusing parse failure below.
res = requests.get("https://en.wikipedia.org/wiki/List_of_communities_in_Dubai", timeout=30)
res.raise_for_status()

# The first <table> on the page holds the community list.
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]

# read_html returns a list of DataFrames; wrapping the markup in StringIO
# avoids the deprecated "literal HTML string" code path of pandas.
df = pd.read_html(StringIO(str(table)))
df[0].to_json(orient='records')

'[{"0":"Community Number","1":"Community (English)","2":"Community (Arabic)","3":"Area(km2)","4":"Population(2000)","5":"Population density(\\/km2)"},{"0":"126","1":"Abu Hail","2":"\\u0623\\u0628\\u0648 \\u0647\\u064a\\u0644","3":"1.27 km\\u00b2","4":"21414","5":"16,861.4\\/km\\u00b2"},{"0":"711","1":"Al Awir First","2":"\\u0627\\u0644\\u0639\\u0648\\u064a\\u0631 \\u0627\\u0644\\u0623\\u0648\\u0644\\u0649","3":null,"4":null,"5":null},{"0":"721","1":"Al Awir Second","2":"\\u0627\\u0644\\u0639\\u0648\\u064a\\u0631 \\u0627\\u0644\\u062b\\u0627\\u0646\\u064a\\u0629","3":null,"4":null,"5":null},{"0":"333","1":"Al Bada","2":"\\u0627\\u0644\\u0628\\u062f\\u0639","3":"0.82 km\\u00b2","4":"18816","5":"22946\\/km\\u00b2"},{"0":"122","1":"Al Baraha","2":"\\u0627\\u0644\\u0628\\u0631\\u0627\\u062d\\u0629","3":"1.104 km\\u00b2","4":"7823","5":"7,086\\/km\\u00b2"},{"0":"373","1":"Al Barsha First","2":"\\u0627\\u0644\\u0628\\u0631\\u0634\\u0627\\u0621 \\u0627\\u0644\\u0623\\u0648\\u0644\\u0649","3":n

In [3]:
# Use the first table parsed by read_html as the working frame.
data = pd.DataFrame(df[0])

In [4]:
# Keep only column 1 (the English community name) as a one-column frame.
data = data[1].to_frame()

In [5]:
# Name the single column and discard row 0, which holds the table's header text.
data = data.set_axis(['Neighborhood'], axis='columns').drop(index=[0])

In [6]:
# Renumber rows from 0 after the header row was removed.
data = data.reset_index(drop=True)

# Getting the geographical coordinates

In [7]:
def get_latlng(neighborhood, max_attempts=10, retry_delay=1.0):
    """Geocode a Dubai neighborhood with the ArcGIS geocoder.

    The lookup is retried because a request may come back without
    coordinates. Retries are bounded so that a name that can never be
    resolved (or a network outage) fails loudly instead of looping forever.

    Parameters
    ----------
    neighborhood : str
        Neighborhood name; ", Dubai, UAE" is appended to build the query.
    max_attempts : int, optional
        Maximum number of geocoding requests to make.
    retry_delay : float, optional
        Seconds to wait between failed attempts.

    Returns
    -------
    list
        The ``latlng`` value reported by the geocoder ([latitude, longitude]).

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` requests.
    """
    query = '{}, Dubai, UAE'.format(neighborhood)
    for attempt in range(1, max_attempts + 1):
        g = geocoder.arcgis(query)
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # Pause before retrying, but not after the final failed attempt.
        if attempt < max_attempts:
            time.sleep(retry_delay)
    raise RuntimeError(
        'Could not geocode {!r} after {} attempts'.format(query, max_attempts)
    )

In [8]:
# Geocode every neighborhood, in table order, into a list of [lat, lng] pairs.
coords = list(map(get_latlng, data["Neighborhood"].tolist()))

In [9]:
# Display the raw list of [latitude, longitude] pairs returned by get_latlng.
coords

[[25.28308000000004, 55.33435000000003],
 [25.18593000000004, 55.54126000000008],
 [25.167920000000038, 55.543310000000076],
 [25.21861000000007, 55.26406000000003],
 [25.282800000000066, 55.31678000000005],
 [25.11483000000004, 55.19136000000003],
 [25.107230000000072, 55.20485000000008],
 [25.08958000000007, 55.23424000000006],
 [25.077390000000037, 55.24267000000003],
 [25.062290000000075, 55.23995000000008],
 [25.093420000000037, 55.19044000000008],
 [25.269250000000056, 55.29944000000006],
 [25.272170000000074, 55.30157000000003],
 [25.243370000000027, 55.352670000000046],
 [25.269510000000025, 55.30884000000003],
 [25.25696000000005, 55.30246000000005],
 [25.29871000000003, 55.33546000000007],
 [25.237130000000036, 55.27707000000004],
 [25.220540000000028, 55.34166000000005],
 [25.233420000000024, 55.29001000000005],
 [25.245290000000068, 55.30364000000003],
 [25.27177000000006, 55.33762000000007],
 [25.24282000000005, 55.48440000000005],
 [25.22784000000007, 55.522320000000036],

In [10]:
# Turn the list of [lat, lng] pairs into a two-column frame.
coords = pd.DataFrame(
    data=coords,
    columns=['Latitude', 'Longitude'],
)

In [11]:
# Preview the first rows of the coordinate frame.
coords.head()

Unnamed: 0,Latitude,Longitude
0,25.28308,55.33435
1,25.18593,55.54126
2,25.16792,55.54331
3,25.21861,55.26406
4,25.2828,55.31678


In [12]:
# Attach the geocoded coordinates to the neighborhood table (aligned on the row index).
data = data.assign(
    Latitude=coords['Latitude'],
    Longitude=coords['Longitude'],
)

In [13]:
# Preview the neighborhoods together with their coordinates.
data.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Abu Hail,25.28308,55.33435
1,Al Awir First,25.18593,55.54126
2,Al Awir Second,25.16792,55.54331
3,Al Bada,25.21861,55.26406
4,Al Baraha,25.2828,55.31678


In [14]:
# (rows, columns) of the neighborhood table.
data.shape

(131, 3)

# Create a map of Dubai with neighborhoods superimposed on top

In [15]:
# get the coordinates of Dubai
address = 'Dubai, United Arab Emirates'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Dubai, United Arab Emirates {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Dubai, United Arab Emirates 25.0659637, 55.1713403.


In [16]:
# Base map centred on the city coordinates found above.
map_dubai = folium.Map(location=[latitude, longitude], zoom_start=11)

# Shared appearance for every neighborhood marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
)

# One circle per neighborhood, with the neighborhood name as the popup text.
for lat, lng, neighborhood in zip(data['Latitude'], data['Longitude'], data['Neighborhood']):
    label = folium.Popup(str(neighborhood), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **marker_style).add_to(map_dubai)

map_dubai

# Using the Foursquare API to explore the neighborhoods

In [17]:
# define Foursquare Credentials and Version
# Credentials are read from the environment (or prompted for without echo)
# so they are never stored in the notebook source or in its saved output.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass.getpass('Foursquare client ID: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass.getpass('Foursquare client secret: ')
VERSION = '20180605' # Foursquare API version

# Confirm the credentials are present without printing their values.
print('Foursquare credentials loaded (values are not displayed).')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


# Getting the top 100 venues within a radius of 1 kilometer of each neighborhood

In [18]:
radius = 1000  # search radius around each neighborhood's coordinates
LIMIT = 100    # maximum number of venues requested per neighborhood

venues = []

for lat, long, neighborhood in zip(data['Latitude'], data['Longitude'], data['Neighborhood']):

    # Let requests build and encode the query string instead of formatting
    # the credentials into the URL by hand.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, long),
        'radius': radius,
        'limit': LIMIT,
    }

    # make the GET request; time out rather than hang, and stop on HTTP
    # errors (e.g. bad credentials or rate limiting) with a clear exception
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params=params,
        timeout=30,
    )
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    # return only relevant information for each nearby venue
    for item in results:
        venue = item['venue']
        # A venue may come back with no categories; record None instead of
        # raising IndexError on categories[0].
        categories = venue.get('categories') or []
        venues.append((
            neighborhood,
            lat,
            long,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name'] if categories else None))

In [19]:
# Build the venue table with its column names set at construction time.
# Passing `columns=` (rather than assigning `.columns` afterwards) also works
# when `venues` is empty, where the assignment would raise a length mismatch.
venues_df = pd.DataFrame(
    venues,
    columns=['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'],
)

print(venues_df.shape)
venues_df.head()

(5530, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Abu Hail,25.28308,55.33435,Habib Bakery,25.281124,55.332774,Bakery
1,Abu Hail,25.28308,55.33435,Gold's Gym,25.282698,55.341019,Gym
2,Abu Hail,25.28308,55.33435,Al Douri Roastery,25.277057,55.328223,Bakery
3,Abu Hail,25.28308,55.33435,Union Co-Operative Society,25.282769,55.340896,Department Store
4,Abu Hail,25.28308,55.33435,McDonald's,25.282839,55.34078,Fast Food Restaurant


# Checking how many venues were returned for each neighborhood

In [20]:
# Number of non-null entries per column for each neighborhood (i.e. venues returned).
venues_df.groupby("Neighborhood").count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abu Hail,21,21,21,21,21,21
Al Awir First,4,4,4,4,4,4
Al Awir Second,5,5,5,5,5,5
Al Bada,58,58,58,58,58,58
Al Baraha,21,21,21,21,21,21
Al Barsha First,86,86,86,86,86,86
Al Barsha Second,99,99,99,99,99,99
Al Barsha South First,7,7,7,7,7,7
Al Barsha South Second,12,12,12,12,12,12
Al Barsha South Third,14,14,14,14,14,14


# How many unique categories can be curated from all the returned venues

In [21]:
# Report how many distinct venue categories appear across all neighborhoods.
print('There are {} uniques categories.'.format(venues_df['VenueCategory'].unique().size))

There are 312 uniques categories.


In [22]:
# show the first 50 distinct venue categories, in order of first appearance
venues_df['VenueCategory'].unique()[:50]

array(['Bakery', 'Gym', 'Department Store', 'Fast Food Restaurant',
       'Dessert Shop', 'Indian Restaurant', 'Scenic Lookout', 'Park',
       'Market', 'Shopping Mall', 'History Museum', 'Beach', 'Track',
       'Spa', 'Café', 'Cafeteria', 'Burger Joint', 'Coffee Shop',
       'Clothing Store', 'Restaurant', 'Gym / Fitness Center',
       'Shopping Plaza', 'Mediterranean Restaurant', 'Hotel',
       'Middle Eastern Restaurant', 'Gluten-free Restaurant',
       'Chinese Restaurant', 'Supermarket', 'Seafood Restaurant',
       'Pharmacy', 'Comfort Food Restaurant', 'Pool', 'Boutique',
       'Pizza Place', 'Ice Cream Shop', 'Cosmetics Shop', 'Flower Shop',
       'Furniture / Home Store', 'Bridal Shop', 'Resort',
       'Asian Restaurant', 'Salon / Barbershop', 'Costume Shop',
       'American Restaurant', "Women's Store", 'Smoke Shop',
       'Turkish Restaurant', 'Lounge', 'Bar', 'Convenience Store'],
      dtype=object)

In [23]:
# confirm that at least one returned venue is categorised as "Coffee Shop"
bool(venues_df['VenueCategory'].eq("Coffee Shop").any())

True

# Analyze Each Neighborhood

In [24]:
# One indicator column per venue category (no prefix on the column names).
dubai_onehot = pd.get_dummies(
    venues_df[['VenueCategory']],
    prefix="",
    prefix_sep="",
)

# Re-attach the neighborhood of each venue as the last column...
dubai_onehot['Neighborhoods'] = venues_df['Neighborhood']

# ...then rotate that last column to the front.
fixed_columns = dubai_onehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
dubai_onehot = dubai_onehot[fixed_columns]

print(dubai_onehot.shape)
dubai_onehot.head()

(5530, 313)


Unnamed: 0,Neighborhoods,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vietnamese Restaurant,Water Park,Waterfront,Wine Bar,Wings Joint,Women's Store,Yemeni Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Abu Hail,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Abu Hail,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Abu Hail,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Abu Hail,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Abu Hail,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Average the indicator columns per neighborhood: each value becomes the
# share of that neighborhood's venues falling in the category.
dubai_grouped = dubai_onehot.groupby("Neighborhoods", as_index=False).mean()

print(dubai_grouped.shape)
dubai_grouped

(131, 313)


Unnamed: 0,Neighborhoods,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vietnamese Restaurant,Water Park,Waterfront,Wine Bar,Wings Joint,Women's Store,Yemeni Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Abu Hail,0.0,0.000000,0.00,0.000000,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.00000,0.000000,0.000000
1,Al Awir First,0.0,0.000000,0.00,0.000000,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.00000,0.000000,0.000000
2,Al Awir Second,0.0,0.000000,0.00,0.000000,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.00000,0.000000,0.000000
3,Al Bada,0.0,0.000000,0.00,0.000000,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.00000,0.000000,0.000000
4,Al Baraha,0.0,0.000000,0.00,0.000000,0.00,0.00,0.00,0.00,0.047619,...,0.00,0.000000,0.000000,0.00,0.000000,0.047619,0.0,0.00000,0.000000,0.000000
5,Al Barsha First,0.0,0.011628,0.00,0.000000,0.00,0.00,0.00,0.00,0.011628,...,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.00000,0.000000,0.000000
6,Al Barsha Second,0.0,0.000000,0.00,0.000000,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.000000,0.000000,0.00,0.010101,0.000000,0.0,0.00000,0.000000,0.000000
7,Al Barsha South First,0.0,0.000000,0.00,0.000000,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.00000,0.000000,0.000000
8,Al Barsha South Second,0.0,0.000000,0.00,0.000000,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.00000,0.000000,0.000000
9,Al Barsha South Third,0.0,0.000000,0.00,0.000000,0.00,0.00,0.00,0.00,0.000000,...,0.00,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.00000,0.000000,0.000000


In [26]:
# Number of neighborhoods with at least one coffee shop among their venues.
int((dubai_grouped["Coffee Shop"] > 0).sum())

83

# Create a new DataFrame for coffee shops data only

In [27]:
# Keep only the neighborhood name and its coffee-shop share.
dubai_coffee = dubai_grouped.loc[:, ["Neighborhoods", "Coffee Shop"]]
dubai_coffee.head()

Unnamed: 0,Neighborhoods,Coffee Shop
0,Abu Hail,0.0
1,Al Awir First,0.0
2,Al Awir Second,0.2
3,Al Bada,0.103448
4,Al Baraha,0.047619


# Clustering the Neighborhoods

In [28]:
# set number of clusters
kclusters = 3

# Cluster on the coffee-shop share only. `columns=` replaces the positional
# axis argument (`drop([...], 1)`), which newer pandas versions reject.
dubai_clustering = dubai_coffee.drop(columns=["Neighborhoods"])

# run k-means clustering; n_init is set explicitly so results do not depend
# on the library's default, and random_state keeps the run reproducible
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(dubai_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 2, 0, 0, 0, 2, 0, 2, 0, 0])

In [29]:
# create a new dataframe holding each neighborhood's coffee-shop share plus its cluster label
# (copy so that adding a column does not touch dubai_coffee)
dubai_merged = dubai_coffee.copy()

# add clustering labels; the labels follow the row order of dubai_coffee, which the model was fitted on
dubai_merged["Cluster Labels"] = kmeans.labels_

In [30]:
# Use the same key name as `data` ("Neighborhood") so the two frames can be joined.
dubai_merged = dubai_merged.rename(columns={"Neighborhoods": "Neighborhood"})
dubai_merged.head()

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels
0,Abu Hail,0.0,2
1,Al Awir First,0.0,2
2,Al Awir Second,0.2,0
3,Al Bada,0.103448,0
4,Al Baraha,0.047619,0


In [31]:
# join the coordinates from `data` onto the clustered frame, matching rows on "Neighborhood"
dubai_merged = dubai_merged.join(data.set_index("Neighborhood"), on="Neighborhood")

print(dubai_merged.shape)
dubai_merged.head() # check the last columns!

(131, 5)


Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
0,Abu Hail,0.0,2,25.28308,55.33435
1,Al Awir First,0.0,2,25.18593,55.54126
2,Al Awir Second,0.2,0,25.16792,55.54331
3,Al Bada,0.103448,0,25.21861,55.26406
4,Al Baraha,0.047619,0,25.2828,55.31678


In [32]:
# Order the rows by cluster so members of a cluster are listed together.
print(dubai_merged.shape)
dubai_merged = dubai_merged.sort_values(by="Cluster Labels")
dubai_merged

(131, 5)


Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
130,Za'abeel Second,0.080000,0,25.208310,55.279360
34,Al Mushrif,0.055556,0,25.272570,55.378020
35,Al Muteena,0.072165,0,25.272280,55.322910
110,Palm Jumeira,0.068493,0,25.111575,55.140083
37,Al Nahda Second,0.072289,0,25.292550,55.374810
39,Al Quoz First,0.081081,0,25.168520,55.250650
40,Al Quoz Industrial First,0.133333,0,25.145250,55.230150
81,Downtown Dubai,0.060000,0,25.187586,55.271463
42,Al Quoz Industrial Second,0.076923,0,25.135920,55.243130
43,Al Quoz Industrial Third,0.090909,0,25.130680,55.231320


# Visualize the resulting clusters

In [33]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(dubai_merged['Latitude'], dubai_merged['Longitude'], dubai_merged['Neighborhood'], dubai_merged['Cluster Labels']):
    # Labels run from 0 to kclusters - 1, so index the palette directly
    # instead of relying on `cluster - 1` wrapping around for cluster 0.
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# Examine each Cluster

In [34]:
# Neighborhoods assigned to cluster 0.
dubai_merged[dubai_merged['Cluster Labels'].eq(0)]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
130,Za'abeel Second,0.08,0,25.20831,55.27936
34,Al Mushrif,0.055556,0,25.27257,55.37802
35,Al Muteena,0.072165,0,25.27228,55.32291
110,Palm Jumeira,0.068493,0,25.111575,55.140083
37,Al Nahda Second,0.072289,0,25.29255,55.37481
39,Al Quoz First,0.081081,0,25.16852,55.25065
40,Al Quoz Industrial First,0.133333,0,25.14525,55.23015
81,Downtown Dubai,0.06,0,25.187586,55.271463
42,Al Quoz Industrial Second,0.076923,0,25.13592,55.24313
43,Al Quoz Industrial Third,0.090909,0,25.13068,55.23132


In [35]:
# Neighborhoods assigned to cluster 1.
dubai_merged[dubai_merged['Cluster Labels'].eq(1)]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
91,Jebel Ali 2,0.5,1,25.02777,55.12673
90,Jebel Ali 1,0.5,1,25.02777,55.12673


In [36]:
# Neighborhoods assigned to cluster 2.
dubai_merged[dubai_merged['Cluster Labels'].eq(2)]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Latitude,Longitude
84,Emirates Hill First,0.000000,2,25.05435,55.15188
87,Hatta,0.000000,2,24.80051,56.11640
92,Jebel Ali Industrial,0.000000,2,24.95078,55.09272
88,Hor Al Anz,0.000000,2,25.27743,55.33746
86,Emirates Hill Third,0.000000,2,25.06834,55.17540
93,Jebel Ali Palm,0.000000,2,24.95078,55.09272
112,Ras Al Khor,0.000000,2,25.18519,55.33039
101,Muhaisanah Third,0.000000,2,25.27131,55.40101
125,Umm Suqeim Third,0.000000,2,25.13508,55.19788
99,Muhaisanah Fourth,0.000000,2,25.27640,55.40964
