## Café in Copenhagen?

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import folium # map rendering library

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

from bs4 import BeautifulSoup

In [2]:
with open('copenhagen_districts.geojson') as json_data:
    cph_data = json.load(json_data)
    
neighborhoods_data = cph_data['features']

# define the dataframe columns
column_names = ['Neighbourhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [3]:
for data in neighborhoods_data:
    neighborhood_name = data['properties']['name']
    
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_latlon = np.mean(neighborhood_latlon[0][0],axis=0)
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]

    neighbourhood_price = data['properties']['k_price']
    neighbourhood_pop = data['properties']['total_pop']

    neighborhoods = neighborhoods.append({'Neighbourhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon,
                                          'Population': neighbourhood_pop,
                                          'Sqm_Price': neighbourhood_price},
                                          ignore_index=True)

In [4]:
cph_data = neighborhoods

### Display of Neighbourhoods

In [5]:
address = 'Copenhagen, CPH'

geolocator = Nominatim(user_agent="cph_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Copenhagen are {}, {}.'.format(latitude, longitude))

# create map of Copenhagen using latitude and longitude values
map_cph = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Neighbourhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_cph)  
    
map_cph

The geograpical coordinate of Copenhagen are 55.6445495, 12.5451878.


Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

## Define Foursquare Credentials and Version


In [6]:
CLIENT_ID = 'R01LINGO2WC45KLRLKT3ZHU2QENAO2IPRK2N2ELOHRNK4P3K' # your Foursquare ID
CLIENT_SECRET = '4JT1TWRMXMPLX5IOKNBAFU3L3ARXK4D5JJDPFK1CLRZM2ZVW' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [7]:
# defining radius and limit of venues to get
radius=2000
LIMIT=100

In [8]:
#import requests
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude',
                  'Venue Category']
    
    return(nearby_venues)

In [9]:
cph_venues = getNearbyVenues(names=cph_data['Neighbourhood'],
                                   latitudes=cph_data['Latitude'],
                                   longitudes=cph_data['Longitude']
                                  )

Osterbro
Valby
Kongens Enghave
Amager Ost
Bronshoj-Husum
Norrebro
Vesterbro
Indre By
Bispebjerg
Christianshavn
Fredriksberg
Amager Vest
Vanlose


### One hot encoding

In [10]:
# one hot encoding
cph_onehot = pd.get_dummies(cph_venues[['Venue Category']], prefix = '', prefix_sep = "")
# add neighborhood column back to dataframe
cph_onehot['Neighbourhood'] = cph_venues['Neighbourhood'] 
# add neighbourhood column to the first column
fixed_columns = [cph_onehot.columns[-1]] + list(cph_onehot.columns[:-1])
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
cph_grouped = cph_onehot.groupby('Neighbourhood').mean().reset_index()

In [11]:
"""
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]
"""

'\ndef return_most_common_venues(row, num_top_venues):\n    row_categories = row.iloc[1:]\n    row_categories_sorted = row_categories.sort_values(ascending=False)\n    \n    return row_categories_sorted.index.values[0:num_top_venues]\n'

In [12]:
"""# dataframe with top 10 venues for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = cph_grouped['Neighbourhood']

for ind in np.arange(cph_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(cph_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head(12)
"""

"# dataframe with top 10 venues for each neighborhood\nnum_top_venues = 10\n\nindicators = ['st', 'nd', 'rd']\n\n# create columns according to number of top venues\ncolumns = ['Neighbourhood']\nfor ind in np.arange(num_top_venues):\n    try:\n        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))\n    except:\n        columns.append('{}th Most Common Venue'.format(ind+1))\n\n# create a new dataframe\nneighbourhoods_venues_sorted = pd.DataFrame(columns=columns)\nneighbourhoods_venues_sorted['Neighbourhood'] = cph_grouped['Neighbourhood']\n\nfor ind in np.arange(cph_grouped.shape[0]):\n    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(cph_grouped.iloc[ind, :], num_top_venues)\n\nneighbourhoods_venues_sorted.head(12)\n"

In [13]:
# One hot cafe : "Café",'Lounge','Dessert Shop','Coffee Shop' Merged as they are pretty similar
cph_cafe = cph_grouped[["Neighbourhood","Café",'Lounge','Dessert Shop','Coffee Shop']]
cph_cafe_sum = cph_cafe
cph_cafe_sum['total'] = cph_cafe.sum(axis=1)
cph_cafe_merged = cph_cafe_sum[['Neighbourhood','total']]
cph_cafe_merged.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Neighbourhood,total
0,Amager Ost,0.0
1,Amager Vest,0.0
2,Bispebjerg,0.090909
3,Bronshoj-Husum,0.0
4,Christianshavn,0.0


## K-means : 3 Clusters [High , Moderate, No Concentration]

In [14]:
# set number of clusters
kclusters = 3

neigh_grouped_clustering = cph_cafe_merged.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(neigh_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 2, 1, 1, 0, 2, 1, 2, 1, 0, 1, 0], dtype=int32)

In [15]:
cph_cafe_merged["Cluster_Labels"] = kmeans.labels_
cph_merged = cph_cafe_merged.join(cph_venues.set_index("Neighbourhood"), on="Neighbourhood")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
#no data available for some neighbourhood : drop row
cph_merged=cph_merged.dropna()
cph_merged['Cluster_Labels'] = cph_merged.Cluster_Labels.astype(int)

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cph_merged['Neighbourhood Latitude'], cph_merged['Neighbourhood Longitude'], cph_merged['Neighbourhood'], cph_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [17]:
cph_gen_data = cph_cafe_merged.join(cph_data.set_index("Neighbourhood"), on="Neighbourhood")
cph_gen_data.head(13)

Unnamed: 0,Neighbourhood,total,Cluster_Labels,Latitude,Longitude,Population,Sqm_Price
0,Amager Ost,0.0,1,55.671191,12.632028,63794.0,0.384185
1,Amager Vest,0.0,1,55.637642,12.567667,42152.0,0.51515
2,Bispebjerg,0.090909,2,55.716863,12.536427,46496.0,0.234804
3,Bronshoj-Husum,0.0,1,55.710054,12.486824,44404.0,0.0
4,Christianshavn,0.0,1,55.685673,12.609458,11760.0,1.0
5,Fredriksberg,0.173913,0,55.68029,12.524036,62859.0,0.74255
6,Indre By,0.106383,2,55.685952,12.587683,55766.0,0.859703
7,Kongens Enghave,0.0,1,55.645375,12.54039,11084.0,0.654452
8,Norrebro,0.069767,2,55.696433,12.548778,48240.0,0.443041
9,Osterbro,0.0,1,55.711846,12.589657,58235.0,0.68811


## Cluster Selection Process
Cluster 1 (purple) has no cafés, cluster 2 (green) has the most, and cluster 0 (red) has a moderate amount.

It is possible that creating a café where there is none, is not optimal.

My guess is that it should be in one of the areas with a moderate amount of cafés, so that it's neither saturated, nor nobody will want to go because it will be a bit isolated. 

Frederiksberg, Valby and Vesterbro have a moderate amount of cafés.

Let's See which one should we choose.

### Calculate Distance from Center

In [18]:
from math import sin, cos, sqrt, atan2, radians

def dist_from_center(center_lat,center_lon,lat_other,lon_other):
    # approximate radius of earth in km
    R = 6373.0
    
    lat1 = radians(center_lat)
    lon1 = radians(center_lon)
    lat2 = radians(lat_other)
    lon2 = radians(lon_other)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c

    return distance

In [19]:
# Copenhagen center coordinates (studenterhuset)
center_lat = 55.681175
center_lon = 12.576300

distance_from_center=[]
for i in range(len(cph_data['Latitude'])):
    temp_lat=cph_data['Latitude'][i]
    temp_lon=cph_data['Longitude'][i]
    temp_dist=dist_from_center(center_lat,center_lon,temp_lat,temp_lon)
    distance_from_center.append(temp_dist)
# fix for husum
cph_gen_data.loc[cph_gen_data['Sqm_Price']==0, 'Sqm_Price'] = 0.2


cph_gen_data['Dist_from_Center']=distance_from_center

### Create Index for How Good Each Neighbourhood Based on Population, Distance from the Center and Price per Square Meter

In [22]:
# Index = Population_Ratio / (Price_per_SqM * Distance_from_Center)
good_idx_df = cph_gen_data
good_idx_df['Population']=good_idx_df['Population']/good_idx_df['Population'].sum()
good_idx_df['INDEX']=good_idx_df['Population']/(good_idx_df['Sqm_Price']*good_idx_df['Dist_from_Center'])

Neighbourhood_Index=cph_gen_data[['Neighbourhood']]
Neighbourhood_Index['Index']=good_idx_df['INDEX']
Neighbourhood_Index=Neighbourhood_Index.sort_values(by=['Index'],ascending=False)
Neighbourhood_Index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,Neighbourhood,Index
10,Valby,0.127199
3,Bronshoj-Husum,0.108556
0,Amager Ost,0.084765
2,Bispebjerg,0.07761
9,Osterbro,0.070959
11,Vanlose,0.063498
5,Fredriksberg,0.062716
6,Indre By,0.049733
8,Norrebro,0.041623
7,Kongens Enghave,0.03413


## Conclusion : Best opportunity to open a café - Valby area

In [23]:
kk=cph_gen_data.join(Neighbourhood_Index.set_index('Neighbourhood'),on='Neighbourhood')
m = folium.Map(location=[cph_data['Latitude'][6], cph_data['Longitude'][6]], zoom_start=11.5)

folium.Choropleth(geo_data='copenhagen_districts.geojson',
                      name='choropleth',
                      data=kk,
                      columns=['Neighbourhood','Index'],
                      key_on='feature.properties.name', 
                      fill_color='YlGn',
                      fill_opacity=0.4,
                      line_opacity=0.2,
                      legend_name='Index').add_to(m)

folium.LayerControl().add_to(m)


#no data available for some neighbourhood : drop row
cph_merged=cph_merged.dropna()
cph_merged['Cluster_Labels'] = cph_merged.Cluster_Labels.astype(int)


# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cph_merged['Venue Latitude'], cph_merged['Venue Longitude'], cph_merged['Neighbourhood'], cph_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(m)

folium.LayerControl().add_to(m)     
m