In [5]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import random # library for random number generation

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.18.1               |             py_0          51 KB  conda-forge
    openssl-1.0.2p             |       h470a237_2         3.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0         conda-forge
    geopy:         1.18.1-py_0       conda-forge

The following packages will be UPDATED:

    openssl:       1.0.2p-h470a237_1 conda-forge --> 1.0.2p-h470a237_2 conda-forge


Downloading and Extracting Packages
geopy-1.18.1         | 51 KB     | #############

In [12]:
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


In [13]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [14]:
neighborhoods_data = newyork_data['features']

In [15]:
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
   

In [30]:
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688
5,Manhattan,Manhattanville,40.816934,-73.957385
6,Manhattan,Central Harlem,40.815976,-73.943211
7,Manhattan,East Harlem,40.792249,-73.944182
8,Manhattan,Upper East Side,40.775639,-73.960508
9,Manhattan,Yorkville,40.77593,-73.947118


In [27]:
address = 'Manhattan, NY'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Manhattan are 40.7900869, -73.9598295.


In [88]:
# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan

In [8]:
CLIENT_ID = 'ZOZG5W1IURBBIFYFQFVXES5ZSZUJFPEZJCB51HOO2NWLBSOF' # your Foursquare ID
CLIENT_SECRET = 'SZYTP5IYCE0LRYSB3P3BJJS0N5JUXSO11R4TSFHO3EIVVEDN' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 50
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ZOZG5W1IURBBIFYFQFVXES5ZSZUJFPEZJCB51HOO2NWLBSOF
CLIENT_SECRET:SZYTP5IYCE0LRYSB3P3BJJS0N5JUXSO11R4TSFHO3EIVVEDN


In [9]:
search_query = 'Cafe'
radius = 5000
print(search_query + ' .... OK!')

Cafe .... OK!


In [19]:
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/search?client_id=ZOZG5W1IURBBIFYFQFVXES5ZSZUJFPEZJCB51HOO2NWLBSOF&client_secret=SZYTP5IYCE0LRYSB3P3BJJS0N5JUXSO11R4TSFHO3EIVVEDN&ll=40.7900869,-73.9598295&v=20180604&query=Cafe&radius=5000&limit=50'

In [20]:
results = requests.get(url).json()


In [21]:
# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
dataframe = json_normalize(venues)


In [44]:
# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
dataframe_filtered = dataframe.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]

dataframe_filtered.head()

Unnamed: 0,name,categories,address,cc,city,country,crossStreet,distance,formattedAddress,labeledLatLngs,lat,lng,neighborhood,postalCode,state,id
0,Cafe 3,Café,Guggenheim Museum,US,New York,United States,"Tower, Level 3",775,"[Guggenheim Museum (Tower, Level 3), New York,...","[{'label': 'display', 'lat': 40.78315732645777...",40.783157,-73.958845,,10128,NY,513a4e2fe4b06cdafa77f245
1,Cafe D'Alsace,French Restaurant,1695 2nd Ave,US,New York,United States,at E 88th St,1423,"[1695 2nd Ave (at E 88th St), New York, NY 101...","[{'label': 'display', 'lat': 40.77915220371584...",40.779152,-73.951069,,10128,NY,467ffefcf964a52019481fe3
2,Hard Rock Cafe New York,Theme Restaurant,1501 Broadway,US,New York,United States,at W 43rd St,4306,"[1501 Broadway (at W 43rd St), New York, NY 10...","[{'label': 'display', 'lat': 40.75718879075926...",40.757189,-73.986697,,10036,NY,428a8580f964a52083231fe3
3,American Wing Cafe,Café,1000 5th Ave,US,New York,United States,,1138,"[1000 5th Ave, New York, NY 10028, United States]","[{'label': 'display', 'lat': 40.78024603338391...",40.780246,-73.963507,,10028,NY,4a8450e1f964a52041fc1fe3
4,Café Sabarsky,Austrian Restaurant,1048 5th Ave,US,New York,United States,at E 86th St,963,"[1048 5th Ave (at E 86th St), New York, NY 100...","[{'label': 'display', 'lat': 40.78144505277665...",40.781445,-73.960385,,10028,NY,49fa2837f964a520cf6d1fe3


In [72]:
dataframe_filtered1=dataframe_filtered.drop(['categories','address','cc','city','country','crossStreet','distance','formattedAddress','labeledLatLngs','neighborhood','state'], axis=1)
dataframe_filtered1.head()

Unnamed: 0,name,lat,lng,postalCode,id
0,Cafe 3,40.783157,-73.958845,10128,513a4e2fe4b06cdafa77f245
1,Cafe D'Alsace,40.779152,-73.951069,10128,467ffefcf964a52019481fe3
2,Hard Rock Cafe New York,40.757189,-73.986697,10036,428a8580f964a52083231fe3
3,American Wing Cafe,40.780246,-73.963507,10028,4a8450e1f964a52041fc1fe3
4,Café Sabarsky,40.781445,-73.960385,10028,49fa2837f964a520cf6d1fe3


In [68]:
address = 'Manhattan, NY'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Manhattan are 40.7900869, -73.9598295.


In [85]:
venues_map = folium.Map(location=[latitude, longitude], zoom_start=15) # generate map

# add Ecco as a red circle mark
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    popup='Ecco',
    fill=True,
    color='red',
    fill_color='red',
    fill_opacity=0.6
    ).add_to(venues_map)

# add popular spots to the map as blue circle markers
for lat, lng, label in zip(dataframe_filtered1.lat, dataframe_filtered1.lng, dataframe_filtered1.name):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        color='blue',
        fill_color='blue',
        fill_opacity=0.6
        ).add_to(venues_map)

# display map
venues_map

In [73]:
dataframe_filtered1.groupby('postalCode').count()

Unnamed: 0_level_0,name,lat,lng,id
postalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10001,1,1,1,1
10010,1,1,1,1
10016,1,1,1,1
10017,1,1,1,1
10019,3,3,3,3
10020,2,2,2,2
10021,4,4,4,4
10022,2,2,2,2
10023,5,5,5,5
10024,6,6,6,6


In [75]:

dataframe_filtered1_onehot = pd.get_dummies(dataframe_filtered1[['name']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
dataframe_filtered1_onehot['postalCode'] = dataframe_filtered1['postalCode'] 

# move neighborhood column to the first column
fixed_columns = [dataframe_filtered1_onehot.columns[-1]] + list(dataframe_filtered1_onehot.columns[:-1])
dataframe_filtered1_onehot = dataframe_filtered1_onehot[fixed_columns]
dataframe_filtered1_onehot.head()

Unnamed: 0,postalCode,94 Corner Cafe,American Wing Cafe,Artopolis Cafe,Banc Cafe,Bouchon Bakery & Cafe,Cafe 3,Cafe 61 & Rooftop Grill,Cafe 71,Cafe Amrita,Cafe D'Alsace,Cafe East - Columbia University,Cafe Eighty Two,Cafe Fiorello,Cafe Henri - LIC,Cafe Lalo,Cafe Luka,Cafe Luxembourg,Cafe On 4,Cafe On One,Cafe R,Cafe Roma,Cafe Victoria Event Venue,Cafe Viva Gourmet Pizza,Cafe du Soleil,Café Boulud,Café Carlyle,Café Duke,Café Oliviero,Café Sabarsky,Café Zaiya,Candle Cafe,Corner Cafe and Bakery,Europa Cafe,Europan Bakery Cafe,Food Emporium Illy Cafe,Hard Rock Cafe New York,Highlands Cafe,Juan Valdez Café,Le Viet Café,Lox at Cafe Weissman,MY NY Bakery Cafe,Park West Cafe & Deli,Peacefood Cafe,Piccolo Café,Rock Center Cafe,Tacombi Café El Presidente,The Muffins Café,The Petrie Court Cafe & Wine Bar,Victor's Café
0,10128,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,10128,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,10036,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,10028,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,10028,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [76]:
#group rows by postal code and by taking the mean of the frequency of occurrence of each category

dataframe_filtered1_grouped = dataframe_filtered1_onehot.groupby('postalCode').mean().reset_index()
dataframe_filtered1_grouped

Unnamed: 0,postalCode,94 Corner Cafe,American Wing Cafe,Artopolis Cafe,Banc Cafe,Bouchon Bakery & Cafe,Cafe 3,Cafe 61 & Rooftop Grill,Cafe 71,Cafe Amrita,Cafe D'Alsace,Cafe East - Columbia University,Cafe Eighty Two,Cafe Fiorello,Cafe Henri - LIC,Cafe Lalo,Cafe Luka,Cafe Luxembourg,Cafe On 4,Cafe On One,Cafe R,Cafe Roma,Cafe Victoria Event Venue,Cafe Viva Gourmet Pizza,Cafe du Soleil,Café Boulud,Café Carlyle,Café Duke,Café Oliviero,Café Sabarsky,Café Zaiya,Candle Cafe,Corner Cafe and Bakery,Europa Cafe,Europan Bakery Cafe,Food Emporium Illy Cafe,Hard Rock Cafe New York,Highlands Cafe,Juan Valdez Café,Le Viet Café,Lox at Cafe Weissman,MY NY Bakery Cafe,Park West Cafe & Deli,Peacefood Cafe,Piccolo Café,Rock Center Cafe,Tacombi Café El Presidente,The Muffins Café,The Petrie Court Cafe & Wine Bar,Victor's Café
0,10001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,10016,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10019,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
5,10020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
6,10021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,10022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,10023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.2,0.0,0.0
9,10024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# set number of clusters
kclusters = 5


dataframe_grouped_clustering = dataframe_filtered1_grouped.drop('postalCode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dataframe_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]



array([2, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [86]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lng, postalCode, cluster  in zip(dataframe_filtered1.lat, dataframe_filtered1.lng,dataframe_filtered1.postalCode,kmeans.labels_):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_clusters

From the analysis it is advisable to fetch a place around the postcode area covering Upper East Side and Upper West Side considering the popularity of cafes and the population density.