In [1]:
import numpy as np
from sklearn.cluster import KMeans
import folium
from folium import plugins
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Make the graphs a bit prettier, and bigger
# pd.set_option('display.mpl_style', 'default')
plt.style.use('ggplot')

# This is necessary to show lots of columns in pandas 0.12. 
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

plt.rcParams['figure.figsize'] = (15, 5)

In [None]:
df = pd.read_csv("coffeeshops.csv")
df = df.drop_duplicates()

In [None]:
df.head(5)

Unnamed: 0,id,name,lat,lng,checkins,address
0,4b78d398f964a520cee32ee3,Starbucks Coffee,14.73471,121.057599,10428,"Ground Flr, SM City Fairview (Annex 1)"
1,53055beb498e87041e7204f9,Starbucks Coffee,14.735735,121.060213,3793,"Fairview Terraces, Quirino Hwy"
2,4bc1f1a6f8219c74a0f7b310,The Coffee Bean & Tea Leaf,14.736308,121.058135,17441,"G/F Gateway Mall, Gen Roxas Ave"
3,4e5b077352b1b78b81b213b9,Starbucks Coffee,14.734639,121.059474,4116,"2nd Flr, SM City Fairview (Main Bldg)"
4,4ceb3522e888f04d2161516b,Starbucks Coffee,14.553593,121.050662,40149,"Crossroads, Bonifacio Global City"


# 1. What are the top 3 most common coffee places? ( 1pt )

In [None]:
pd.DataFrame(data = df.groupby(by = 'name').sum()['checkins']).sort_values(by = 'checkins', ascending = False).iloc[:3,:]

Unnamed: 0_level_0,checkins
name,Unnamed: 1_level_1
Starbucks Coffee,1460679
The Coffee Bean & Tea Leaf,238829
Seattle's Best Coffee,68194


From this, we see that the top 3 most common coffee places are Starbucks Coffee, The Coffee Bean & Tea Leaf, and Seattle's Best Coffee.

# 2. What is the latitude variance and longitude variance, and explain why these variances are different. ( 3pts )

In [None]:
pd.DataFrame(data = df.var(), columns = ['Variance']).iloc[:2,:]

Unnamed: 0,Variance
lat,0.007107
lng,0.00155


From this, we see that the variance for latitude is 0.007107 while the one for longitude is 0.001550. Since variance is a measure of dispersion/spread, we can say that the latitude attribute is more dispersed as compared to the longitude attribute. This makes sense in the context of the Philippines, since the length of the country is greater than its width when viewed from a map. Hence, we would expect these coffeeshops to be more dispersed "vertically" than "horizontally".

# 3. Explain what KMeans intertia is and how different it is from DBSCAN epsilon. ( 4pts )

**KMeans Inertia**

From what I understand, inertia is a measure of how well KMeans is able to cluster the dataset. It is a function of the distances between the points and their respective cluster centroids. Ideally, the choice for k should minimize the inertia, but in most cases, increasing k will most likely lower the inertia anyways. Hence, the use of the so-called elbow plot allows us to see which value of k is optimal, since we want a low inertia but we don't want to use too many clusters as this is harder to explain and may not be ideal in practice. 

**DBSCAN Epsilon**

Whereas inertia is a performance metric used AFTER clustering, the epsilon value for DBSCAN on the other hand is a hyperparameter that needs to be declared PRIOR to clustering. The epsilon value dictates how "far away" from a data point should another data point be for them to be considered neighbors. This information is then used, alongside the parameter min_samples, to form the clusters using DBSCAN.

# 4. Cluster the locations data with DBSCAN. ( 5pts )

In [None]:
X = df[['lat', 'lng']]

In [None]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.005, min_samples=5).fit(X) # we use eps = 0.005 as it is on the same order of magnitude as with the lat and lng variance
y_dbscan = dbscan.labels_

In [None]:
# We want to visualize the clusters, so we first do some processing for colors.

df["dbscan_cluster"] = y_dbscan
palette = sns.color_palette("pastel", len(y_dbscan)).as_hex()
color_getter = lambda x: palette[int(x)]
df["db_color"] = df["dbscan_cluster"].map(color_getter)
df.head()

Unnamed: 0,id,name,lat,lng,checkins,address,dbscan_cluster,db_color
0,4b78d398f964a520cee32ee3,Starbucks Coffee,14.73471,121.057599,10428,"Ground Flr, SM City Fairview (Annex 1)",0,#a1c9f4
1,53055beb498e87041e7204f9,Starbucks Coffee,14.735735,121.060213,3793,"Fairview Terraces, Quirino Hwy",0,#a1c9f4
2,4bc1f1a6f8219c74a0f7b310,The Coffee Bean & Tea Leaf,14.736308,121.058135,17441,"G/F Gateway Mall, Gen Roxas Ave",0,#a1c9f4
3,4e5b077352b1b78b81b213b9,Starbucks Coffee,14.734639,121.059474,4116,"2nd Flr, SM City Fairview (Main Bldg)",0,#a1c9f4
4,4ceb3522e888f04d2161516b,Starbucks Coffee,14.553593,121.050662,40149,"Crossroads, Bonifacio Global City",1,#ffb482


In [None]:
df = df[df["dbscan_cluster"] > -1]  # removes all noisy instances

In [None]:
# We then use folium to visualize the clusters.

basemap = folium.Map(location=[14.5562297,121.0204097], tiles="cartodbdark_matter", zoom_start=15)

for index, row in df.iterrows():
    folium.CircleMarker([row["lat"], row["lng"]], radius=5, fill=True,
    color =row["db_color"], fill_color=row["db_color"], popup = folium.Popup(str(row["lat"]) + " " + str(row["lng"]), parse_html=True)
    ).add_to(basemap)

basemap

In [None]:
np.max(y_dbscan) # since the max value is 10, this means we have a total of 11 clusters (excluding the noisy instances)

10

# 5. Present an analysis of the data following the DBSCAN results. ( 7pts )
*   explain what the clusters mean
*   what does the # of clusters tell us
*   what is this information useful for

**Explain what the clusters mean.**
- Since we used the latitude and longitude attributes for clustering, the resulting clusters can be thought of as geographic clusters of the coffeeshops, which means they are solely based on location. However, unlike KMeans which is distance-based, our clustering is density-based. This means that the clusters that we formed can be thought of as regions with a high concentration of coffeeshops (coffeeshop density is high).

**What does the number of clusters tell us?**
- From the code above, we see that the number of clusters is 11. Hence, one possible interpretation is that, in the dataset that we are given, we are able to detect 11 regions with a high concentration of coffeeshops. Further research can be made in order to know why these areas tend to have more coffeeshops than usual. They can be business centers, areas around malls, etc.

**What is this information useful for?**
- One possible use case is the identification of a possible location when opening up a coffeeshop. Knowing where coffeeshops tend to cluster allows us to identify possible competition in a given area, untapped regions with no coffeeshops yet, etc. This use case can extend to other products/establishments that go well with coffee. For example, if market research suggests that libraries and bookstores tend to become more successful in regions with a lot of coffeeshops, then this information might be useful for those engaging in that kind of business as well.

# Bonus 1 : What is your favorite coffee shop? (1pt)

I don't really go to coffee shops that much since I'm not a fan of coffee in the first place. Despite this, the bread and pastry products of Starbucks are quite good so I would say it is my favorite coffee shop.

# Bonus 2 : Run everything on your own geospatial dataset, and answer the same questions listed above. (5pts) Use this: https://developer.foursquare.com/docs/resources/categories

We first specify our places and topics before using foursquare.

In [14]:
places = ["14.73371, 121.06813", "14.73554, 121.01268", "14.72026, 120.96342", \
          "14.6922, 121.06212", "14.69037, 121.01732", "14.68954, 120.97063", \
          "14.65517, 121.05182", "14.65866, 121.01028", "14.64703, 120.9708", \
          "14.6364, 121.07414", "14.64819, 121.1174", "14.62361, 121.03998", \
          "14.61049, 120.9914", "14.60949, 121.10658", "14.60152, 121.05165", \
          "14.56946, 121.09491", "14.5776, 121.05972", "14.5771, 121.02865", \
          "14.5565, 121.02092", "14.55583, 121.02607", "14.55583, 121.02607", \
          "14.55002, 121.06126", "14.55002, 121.04581", "14.5241, 121.02127", \
          "14.40209, 121.02779", "14.60974, 121.08002", "14.64969, 121.0747"]
topics = ["fast food", "fastfood", "mcdo", "burger", "pizza", "pasta"]

We then use foursquare to get our geospatial data.

In [15]:
import foursquare

# Construct the client object
client = foursquare.Foursquare(client_id='LK0U2PMB3ZFV3DH03TKBF2H04KUBRHFOZOZNHP5NWOESAALM', 
                               client_secret='PKNTSEUE5L4T533TSRI4JTNND3KQTOLGJAZVTTFT5EU3HCU4', 
                               redirect_uri='http://fondu.com/oauth/authorize')

# Build the authorization url for your app
auth_uri = client.oauth.auth_url()

In [16]:
response = [client.venues.search(params={'query': topic, 
                                         'll': place,
                                        'radius': 2000,
                                        'limit': 30, 
                                        'intent': "browse", 
                                        'categoryId': '4d4b7105d754a06374d81259'})
            for place in places for topic in topics]

In [17]:
len(response)  # we see that we have 162 responses

162

We then process our data and create the DataFrame df2.

In [18]:
data_list = []
for x in range(len(response)):
    try:
        for i in range(len(response[x]["venues"])):
            poi_id =  response[x]["venues"][i]["id"]
            poi_name =  response[x]["venues"][i]["name"]
            poi_lat =  response[x]["venues"][i]["location"]["lat"]
            poi_lng =  response[x]["venues"][i]["location"]["lng"]
            data_list.append([poi_id, poi_name, poi_lat, poi_lng])
    except KeyError:
        pass

df2 = pd.DataFrame(data_list, columns=["id", "name", "lat", "lng"])
df2 = df2.drop_duplicates()
df2["name"] = df2['name'].str.lower()

In [19]:
df2.head()

Unnamed: 0,id,name,lat,lng
0,53a29eff498e6bc264a2f746,zeny's fast food,14.737028,121.051134
1,4d294f6c888af04da15eceaf,bodhi vegetarian health food house,14.733963,121.060023
2,5b231c1dfe63bd002c2ea6a6,limbooz food hub,14.735246,121.06787
3,4d29305d068e8cfaabc7cd4c,sm foodcourt,14.734557,121.05925
4,58dfc03b076be155f2ac96e4,rooftop food park,14.730262,121.060988


We now proceed with answering all the questions above in the context of this dataset.

1. What are the top 3 most common fast food chains? ( 1pt )

In [23]:
pd.DataFrame(df2['name'].value_counts()).iloc[:3,:]

Unnamed: 0,name
mcdonald's,248
burger machine,95
pizza hut,78


From this, we see that the top 3 most common fast food chains are mcdonald's, burger machine, and pizza hut

2. What is the latitude variance and longitude variance, and explain why these variances are different. ( 3pts )

In [25]:
pd.DataFrame(data = df2.var(), columns = ['Variance'])

Unnamed: 0,Variance
lat,0.00364
lng,0.001476


The latitude variance is 0.003640 while the longitude variance is 0.001476. The explanation behind the fact that the latitude variance is higher than the longitude variance is similar to the explanation above.

3. Explain what KMeans intertia is and how different it is from DBSCAN epsilon. ( 4pts )

This question is already answered above.

4. Cluster the locations data with DBSCAN. ( 5pts )

In [26]:
X2 = df2[['lat', 'lng']]

In [27]:
from sklearn.cluster import DBSCAN
dbscan2 = DBSCAN(eps=0.005, min_samples=5).fit(X2)  # we use eps=0.002 since it is closer to our variances
y_dbscan2 = dbscan2.labels_

In [28]:
# We want to visualize the clusters, so we first do some processing for colors.

df2["dbscan_cluster"] = y_dbscan2
palette = sns.color_palette("pastel", len(y_dbscan2)).as_hex()
color_getter = lambda x: palette[int(x)]
df2["db_color"] = df2["dbscan_cluster"].map(color_getter)
df2.head()

Unnamed: 0,id,name,lat,lng,dbscan_cluster,db_color
0,53a29eff498e6bc264a2f746,zeny's fast food,14.737028,121.051134,0,#a1c9f4
1,4d294f6c888af04da15eceaf,bodhi vegetarian health food house,14.733963,121.060023,0,#a1c9f4
2,5b231c1dfe63bd002c2ea6a6,limbooz food hub,14.735246,121.06787,0,#a1c9f4
3,4d29305d068e8cfaabc7cd4c,sm foodcourt,14.734557,121.05925,0,#a1c9f4
4,58dfc03b076be155f2ac96e4,rooftop food park,14.730262,121.060988,0,#a1c9f4


In [29]:
df2 = df2[df2["dbscan_cluster"] > -1]  # removes all noisy instances

In [31]:
# We then use folium to visualize the clusters.

basemap2 = folium.Map(location=[14.5562297,121.0204097], tiles="cartodbdark_matter", zoom_start=15)

for index, row in df2.iterrows():
    folium.CircleMarker([row["lat"], row["lng"]], radius=5, fill=True,
    color =row["db_color"], fill_color=row["db_color"], popup = folium.Popup(str(row["lat"]) + " " + str(row["lng"]), parse_html=True)
    ).add_to(basemap2)

basemap2

Output hidden; open in https://colab.research.google.com to view.

In [32]:
np.max(y_dbscan2) # this means we have a total of 30 clusters

29

5. Present an analysis of the data following the DBSCAN results. ( 7pts )
*   explain what the clusters mean
*   what does the # of clusters tell us
*   what is this information useful for

The explanation for this is exactly the same as the previous explanation but taken in a different context. In particular, we are dealing with fast food chains instead of coffee shops.