In [1]:
# import the packages we will need for this project
import pandas as pd
import numpy as np
import json
import os
import requests
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
stations_df = pd.read_csv('stations_df.csv')

In [11]:
FOURSQUARE_KEY = 'fsq3y9duIMFoJDJd885YeSLrAmu6PJXvWaMIgKd+i6Y88Vo='

# Foursquare

## Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

In [14]:
def get_venues_fs(latitude, longitude, radius, FOURSQUARE_KEY, categories):
    """
    Get venues from foursquare with a specified place type and coordinates.
    Args:
        latitude (float): latitude for query (must be combined with longitude)
        longitude (float): longitude for query (must be combined with latitude)
        api_key (str): foursquare API to use for query
        categories (str) : Foursquare-recognized place type. If not passed no place_type will be specified. Separate ids with commas
    
    Returns:
        response: response object from the requests library.
    """
    url = "https://api.foursquare.com/v3/places/search"
    
    params = {"categories": categories, # below, we will search for three different category types
              "radius":radius,
              "ll": f"{latitude},{longitude}",
              "limit":50, # this is the upper limit
              "fields": "rating,popularity"   # rating: A numerical rating (from 0.0 to 10.0) of the FSQ Place, based on user votes, likes/dislikes, tips sentiment, and visit data. Not all FSQ Places will have a rating.
                                              # popularity: A measure of the FSQ Place's popularity, by foot traffic. This score is on a 0 to 1 scale and uses a 6-month span of POI visits for a given geographic area.
             }
    # Create a dictionary for headers
    headers = {"Accept": "application/json"}
    # Add key with our API KEY
    headers['Authorization'] = FOURSQUARE_KEY 
    res = requests.get(url, params=params, headers=headers)
    foursquare_data = res.json()
    return foursquare_data

In [None]:
# test the function
results = get_venues_fs(stations_df['latitude'][0], stations_df['longitude'][0], 1000, FOURSQUARE_KEY, 13003)

## Combine the above flow into a for loop to obtain a count of different venues for the dataset and their category ID's
1. Bars - 13003 (cocktail bars, pubs, sports bars etc.)
2. Lodging - 19009 (Hotels, Motels, Hostels etc.)
3. Transportation Hubs - 19030 (Train stations, bus stations, rental car locations etc.)

In [None]:
# create a function to count the results for each category returned by the API call
def category_count(category):
    """
    Loop through the stations_df dataframe and return a list of counts of the number of items returned in the get_venues_fs function that corresponds with the input category
    
    Args:
        category (integer): the category integer of what POI we are searching for

    Returns:
        response: count_of_category: a list of the number of POI's which match the category integer from the get_venues_fs function
    """
    count_of_category = []

    for row in stations_df.itertuples(index=False): 
        
        latitude = row[8] # store the latitude of each station in a variable
        longitude = row[9] # store the longitude of each station in a variable
        
        venue_results = get_venues_fs(latitude, longitude, 1000, FOURSQUARE_KEY, category) # call the get_venues_fs function for each station 
        normalized_results = pd.json_normalize(venue_results['results']) # store and normalize the results
        
        count_of_results = len(normalized_results) # count the number of rows in the normalized results
        count_of_category.append(count_of_results) # append the count to the empty list
    return count_of_category

In [None]:
# create a function to calculate the average popularity for the response from each category returned by the get_venues_fs function
def popularity_average(category):
    """
    Loop through the stations_df dataframe and return the calculated average popularity from the items returned in the get_venues_fs function that corresponds with the input category and append that value to a list
    
    Args:
        category (integer): the category integer of what POI we are searching for

    Returns:
        response: average_category_popularity: a list of the average popularity of POI's which match the category integer from the get_venues_fs function
    """
    average_category_popularity = []

    for row in stations_df.itertuples(index=False): 
        latitude = row[8]
        longitude = row[9]
        
        venue_results = get_venues_fs(latitude, longitude, 1000, FOURSQUARE_KEY, category)
        normalized_results = pd.json_normalize(venue_results['results'])
        
        # Check if the 'popularity' column exists in the normalized results
        if 'popularity' in normalized_results.columns:
            average_popularity = normalized_results['popularity'].mean()
            average_category_popularity.append(average_popularity)
        else:
            # Case when the 'popularity' column is missing
            average_category_popularity.append(0)
        
    return average_category_popularity

In [None]:
# create a function to calculate the average rating for the response from each category returned by the get_venues_fs function
def rating_average(category):
    """
    Loop through the stations_df dataframe and return the calculated average ratings from the items returned in the get_venues_fs function that corresponds with the input category and append that value to a list
    
    Args:
        category (integer): the category integer of what POI we are searching for

    Returns:
        response: average_category_rating: a list of the average ranking of POI's which match the category integer from the get_venues_fs function
    """
    average_category_rating = []

    for row in stations_df.itertuples(index=False): 
        latitude = row[8]
        longitude = row[9]
        
        venue_results = get_venues_fs(latitude, longitude, 1000, FOURSQUARE_KEY, category)
        normalized_results = pd.json_normalize(venue_results['results'])
        
        # Check if the 'rating' column exists in the normalized results
        if 'rating' in normalized_results.columns:
            average_rating = normalized_results['rating'].mean()
            average_category_rating.append(average_rating)
        else:
            # Case when the 'rating' column is missing
            average_category_rating.append(0)
        
    return average_category_rating

### Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

#### I will be appending the count of each POI type, as well as the average popularity and average rating if the POI has either of these metrics

### Put your parsed results into a DataFrame

In [None]:
# run the functions for bars
stations_df['num_nearby_bars'] = category_count(13003)
stations_df['avg_pop_bars'] = popularity_average(13003)
stations_df['avg_rat_bars'] = rating_average(13003)

In [None]:
# run the functions for lodging
stations_df['num_nearby_lodging'] = category_count(19009)
stations_df['avg_pop_lodging'] = popularity_average(19009)
stations_df['avg_rat_lodging'] = rating_average(19009)

In [None]:
# run the functions for transportation hubs
stations_df['num_nearby_trans_hubs'] = category_count(19030)
stations_df['avg_pop_trans_hubs'] = popularity_average(19030)
stations_df['avg_rating_trans_hubs'] = rating_average(19030)

In [None]:
stations_df

# Yelp

In [8]:
stations_df_fs = pd.read_csv('stations_df_fs.csv')
stations_df_fs

Unnamed: 0,station_name,station_id,station_address,empty_slots,renting,returning,uid,free_bikes,latitude,longitude,timestamp,num_nearby_bars,avg_pop_bars,avg_rat_bars,num_nearby_lodging,avg_pop_lodging,avg_rat_lodging,num_nearby_trans_hubs,avg_pop_trans_hubs,avg_rating_trans_hubs
0,Imperial & 7th,b6faafb631c1e75fe42dcf0183d5cdd9,699 S Imperial St,13,1,1,bcycle_lametro_3037,7,34.03480,-118.23128,2023-08-30T00:08:35.679000Z,18,0.849992,8.416667,6,0.943272,9.100000,16,0.813149,0.000000
1,Willow St & Mateo St,6ae1d85179bc7085c9d8855ac003f6fb,1301 Willow Street,7,1,1,bcycle_lametro_3036,11,34.03919,-118.23253,2023-08-30T00:08:35.684000Z,46,0.849432,8.266667,15,0.505910,7.850000,11,0.923558,0.000000
2,Hope & 6th,8ef537aeddba68ac15b0c47ff03b0a2c,557 S Hope Street,20,1,1,bcycle_lametro_3032,11,34.04989,-118.25588,2023-08-30T00:08:35.690000Z,50,0.972846,7.689796,50,0.834985,7.583333,50,0.709939,7.083333
3,Hope & 1st,c14da060d71e97130920dde4509d7323,111 N Hope St,10,1,1,bcycle_lametro_3024,7,34.05772,-118.24897,2023-08-30T00:08:35.715000Z,33,0.905051,7.600000,30,0.775464,6.450000,32,0.687911,7.220000
4,Grand & Washington,3eaeb9f13ce28cf2e56d21e7dd2b3b5c,1014 S Grand Ave,9,1,1,bcycle_lametro_3025,6,34.03286,-118.26808,2023-08-30T00:08:35.716000Z,6,0.952832,7.400000,12,0.833727,0.000000,25,0.735822,6.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,Sunset & Virgil,a6ace1fa31df6d7d551cb5626c3a73ab,4459 Sunset Blvd,4,1,1,bcycle_lametro_4303,6,34.09801,-118.28707,2023-08-30T00:08:35.877000Z,31,0.859253,7.725000,17,0.917085,6.650000,16,0.728487,5.800000
220,Hoover & Fountain,d463dde2137b159cee2678c6e9525741,1254 N Hoover St,9,1,1,bcycle_lametro_4438,5,34.09493,-118.28456,2023-08-30T00:08:35.921000Z,27,0.863153,7.684615,16,0.918970,6.650000,16,0.706834,5.800000
221,Pershing Square,2c62b4e6116c60d6d04820adb67152da,532 W Olive Street,12,1,1,bcycle_lametro_3063,9,34.04804,-118.25374,2023-08-30T00:08:35.893000Z,50,0.970269,7.700000,50,0.763539,7.530000,50,0.718122,7.083333
222,Venice & Inglewood,976e229ed7e475f2fb1f6885723fe3fe,12006 Venice Blvd,5,1,1,bcycle_lametro_4555,3,34.00587,-118.42916,2023-08-30T00:08:35.983000Z,8,0.915316,7.475000,4,0.000000,0.000000,4,0.855221,0.000000


## Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [3]:
YELP_KEY = 'hspGJYwFXqh4cvg4SyzjtoyshMv8QxHwAIoCA2ek0DkDPVi2ygMK4Z-Zl_DTxnMnVrmVgCe3RUO_aZtBdN3Uzm8_MryW7qE7y_eWuxdXfVfzRAIrph5gjeNXtGTtZHYx'

In [10]:
def get_venues_yelp(latitude, longitude, radius, categories):
    """
    Get venues from YELP with a specified place type and coordinates.
    Args:
        latitude (float): latitude for query (must be combined with longitude)
        longitude (float): longitude for query (must be combined with latitude)
        api_key (str): foursquare API to use for query
        categories (str) : Foursquare-recognized place type. If not passed no place_type will be specified. Separate ids with commas
    
    Returns:
        response: response object from the requests library.
    """
    url = "https://api.yelp.com/v3/businesses/search"
    
    params = {"latitude": latitude, # below, we will search for three different category types
              "longitude": longitude,
              "limit": 50,
              "categories": categories 
             }
    # Create a dictionary for headers
    headers = {"Accept": "application/json"}
    # Add key with our API KEY
    headers['Authorization'] = "Bearer hspGJYwFXqh4cvg4SyzjtoyshMv8QxHwAIoCA2ek0DkDPVi2ygMK4Z-Zl_DTxnMnVrmVgCe3RUO_aZtBdN3Uzm8_MryW7qE7y_eWuxdXfVfzRAIrph5gjeNXtGTtZHYx" 
    res = requests.get(url, params=params, headers=headers)
    yelp_data = res.json()
    return yelp_data

### Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

### Yelp Equivalents for Foursquare POI searches
1. Bars - 'bars' (Foursquare equivalent: 13003 (cocktail bars, pubs, sports bars etc.))
2. Hotels - 'hotel,bedbreakfast,hostels' (Foursquare equivalent: Lodging - 19009 (Hotels, Motels, Hostels etc.))
3. Transportation - 'transport' (Foursquare equivalent: Transportation Hubs - 19030 (Train stations, bus stations, rental car locations etc.))

In [None]:
 # test with restaurant
restaurants = get_venues_yelp(stations_df_fs['latitude'][0], stations_df_fs['longitude'][0], 1000, "restaurant")
# restaurants

In [None]:
results_yelp_restaurants = pd.json_normalize(restaurants['businesses'])
# results_yelp_restaurants

In [26]:
# test with bars
bars = get_venues_yelp(stations_df_fs['latitude'][0], stations_df_fs['longitude'][0], 1000, "bars")
bars

{'error': {'code': 'ACCESS_LIMIT_REACHED',
  'description': "You've reached the access limit for this client. See instructions for requesting a higher access limit at https://docs.developer.yelp.com/docs/fusion-rate-limiting"}}

In [25]:
results_yelp_bars = pd.json_normalize(bars['businesses'])
results_yelp_bars

NameError: name 'results_yelp_bars' is not defined

In [8]:
# test with hotels
# hotels = get_venues_yelp(stations_df_fs['latitude'][0], stations_df_fs['longitude'][0], 1000,
                         'hotel,bedbreakfast,hostels')
# hotels

{'error': {'code': 'ACCESS_LIMIT_REACHED',
  'description': "You've reached the access limit for this client. See instructions for requesting a higher access limit at https://docs.developer.yelp.com/docs/fusion-rate-limiting"}}

In [None]:
# results_yelp_hotels = pd.json_normalize(hotels['businesses'])
# results_yelp_hotels

### Repeat the functions created to loop through the Foursquare results for YELP

In [5]:
def category_count_yelp(category):
    """
    Loop through the stations_df_fs dataframe and return a list of counts of the number of items returned in the get_venues_fs function that corresponds with the input category
    
    Args:
        category (integer): the category integer of what POI we are searching for

    Returns:
        response: count_of_category: a list of the number of POI's which match the category integer from the get_venues_fs function
    """
    count_of_category = []

    for row in stations_df_fs.itertuples(index=False): 
        
        latitude = row[8] # store the latitude of each station in a variable
        longitude = row[9] # store the longitude of each station in a variable
        
        venue_results = get_venues_yelp(latitude, longitude, 1000, category) # call the get_venues_fs function for each station 
        normalized_results = pd.json_normalize(venue_results['businesses']) # store and normalize the results
        
        count_of_results = len(normalized_results) # count the number of rows in the normalized results
        count_of_category.append(count_of_results) # append the count to the empty list
    
    return count_of_category

In [None]:
# test for bars
# category_count_yelp("bars")

In [12]:
def average_rating_yelp(category):
    """
    Loop through the stations_df_fs dataframe and return the calculated average ratings from the items returned in the get_venues_yelp function that corresponds with the input category and append that value to a list
    
    Args:
        category (string): the category string of businesses we are searching for. For multiple categories, input a comma-separated string list "hotel,bars"

    Returns:
        response: average_category_rating: a list of the average ranking of businesses which match the category integer from the get_venues_yelp function
    """
    average_category_rating = []

    for row in stations_df_fs.itertuples(index=False): 
        
        latitude = row[8] # store the latitude of each station in a variable
        longitude = row[9] # store the longitude of each station in a variable
        
        venue_results = get_venues_yelp(latitude, longitude, 1000, category) # call the get_venues_fs function for each station 
        normalized_results = pd.json_normalize(venue_results['businesses']) # store and normalize the results
        
        # Check if the 'rating' column exists in the normalized results
        if 'rating' in normalized_results.columns:
            average_rating = normalized_results['rating'].mean()
            average_category_rating.append(average_rating)
        else:
            # Case when the 'rating' column is missing
            average_category_rating.append(0)
        
    return average_category_rating

In [None]:
# test for the hotels grouping
# average_rating_yelp('hotel,bedbreakfast,hostels')

In [20]:
def average_reviews_yelp(category):
    """
    Loop through the stations_df_fs dataframe and return the calculated average number of reviews from the items returned in the get_venues_yelp function that corresponds with the input category and append that value to a list
    
    Args:
        category (string): the category string of businesses we are searching for ('transport'). For multiple categories, input a comma-separated string list ("hotel,bars")

    Returns:
        response: average_count_reviews: a list of the average number of reviews for the businesses which match the category integer from the get_venues_yelp function
    """
    average_count_reviews = []

    for row in stations_df_fs.itertuples(index=False): 
        
        latitude = row[8] # store the latitude of each station in a variable
        longitude = row[9] # store the longitude of each station in a variable
        
        venue_results = get_venues_yelp(latitude, longitude, 1000, category) # call the get_venues_fs function for each station 
        normalized_results = pd.json_normalize(venue_results['businesses']) # store and normalize the results
        
        # Check if the 'review_count' column exists in the normalized results
        if 'review_count' in normalized_results.columns:
            average_reviews = normalized_results['review_count'].mean()
            average_count_reviews.append(average_reviews)
        else:
            # Case when the 'review_count' column is missing
            average_count_reviews.append(0)
        
    return average_count_reviews

In [21]:
average_num_yelp_reviews_bar = average_reviews_yelp('bars')

KeyError: 'businesses'

Put your parsed results into a DataFrame

In [13]:
average_yelp_rating_bar = average_rating_yelp('bars')

In [24]:
average_yelp_rating_hotels = average_rating_yelp('hotels')

KeyError: 'businesses'

In [None]:
stations_df_fs['average_yelp_rating_bar'] = average_yelp_rating_bar
stations_df_fs['average_num_yelp_reviews_bar'] = average_num_yelp_reviews_bar

In [29]:
stations_df_fs_yelp = stations_df_fs.to_csv('stations_df_fs_yelp.csv')

# Comparing Results

## Which API provided you with more complete data? Provide an explanation. 

### Foursquare

#### Pros:
In the context of this analysis, I believe that the popularity metric, which only Foursquare provides, is very powerful. When factoring in bike rentals, this mode of transportation is closer to walking than it is to vehicular travel, and therefore this shows a correlation. I also believe that because the metric is only for the last six months, this allows the metric to be more fluid and accurate. For example, imagine a bar opened 5 years ago and as of today, the bar has a 9/10 rating on Foursquare. However, under closer scrutiny, most good reviews came more than three years ago, and the average rating in the last two years is lower than before. This allows us to compare the normalized rating and popularity values later to see how well they compare.
#### Cons:
Not having a 'reviews' column to show the number of reviews that went into calculating the 'rating' column makes the 'rating' a black box; there could be only one review for a POI. 

### YELP

#### Pros:
The reviews column allows us to filter out results that don't meet a specified threshold if required. Additionally, it appears that there are more businesses on YELP vs. Foursquare which provides a better representation of the real world.

#### Cons:
The limitations of the YELP API are much more restrictive and prevent tuning of the request with the 500 API calls per day limit. I was unaware that I would reach this limit so quickly each day; being able to essentially call the Foursquare API a limitless number of times per day was very helpful in order to ensure my functions were correct.

Considering the above, I believe the Foursquare API provided me with more complete data.

## Get the top 10 restaurants according to their rating

In [55]:
def get_restaurants_fs(latitude, longitude, radius, FOURSQUARE_KEY, categories):
    """
    Get restaurants from foursquare with a specified place type and coordinates.
    Args:
        latitude (float): latitude for query (must be combined with longitude)
        longitude (float): longitude for query (must be combined with latitude)
        api_key (str): foursquare API to use for query
        categories (str) : Foursquare-recognized place type. If not passed no place_type will be specified. Separate ids with commas
    
    Returns:
        response: response object from the requests library.
    """
    url = "https://api.foursquare.com/v3/places/search"
    
    params = {"categories": categories, # below, we will search for three different category types
              "radius":radius,
              "ll": f"{latitude},{longitude}",
              "limit":50, # this is the upper limit
              "fields": "fsq_id,name,rating"  
            }
    # Create a dictionary for headers
    headers = {"Accept": "application/json"}
    # Add key with our API KEY
    headers['Authorization'] = FOURSQUARE_KEY 
    res = requests.get(url, params=params, headers=headers)
    foursquare_data = res.json()
    return foursquare_data

In [57]:
# using the above function, loop through each station, searching for restaurants, append the results to an empty list, then concatenate to the new dataframe
restaurants = pd.DataFrame() # empty dataframe that will have values concatenated to it
list_of_restaurants = [] # empty list for appending results

for row in stations_df.itertuples(index=False):
    latitude = row[8]
    longitude = row[9]
            
    venue_results = get_restaurants_fs(latitude, longitude, 1000, FOURSQUARE_KEY, 13065)
    normalized_results = pd.json_normalize(venue_results['results'])
    list_of_restaurants.append(normalized_results)

restaurants = pd.concat(list_of_restaurants)    

In [96]:
# Fill the NaN values for restaurants with zeros
restaurants = restaurants.fillna(0)
restaurants

Unnamed: 0,fsq_id,name,rating
0,60c184c9528f822158a147e8,Afuri Ramen,8.9
1,55fccaa5498ec525f32b644d,Everson Rocye Bar,9.1
2,4a15b070f964a520b9781fe3,Tony's Saloon,8.6
3,4e0e69c445ddc2c6d1780ab2,Pizzanista,8.4
4,5b549f2ec97f28002cb31a5f,Guerrilla Tacos,8.5
...,...,...,...
37,63d4b6beea0fa64ea1f18ef1,Mun Korean Steakhouse,0.0
38,4c9e3729d3c2b60cb8f7c8bc,La Poblanita,0.0
39,62bf97634432bf416e57b6ba,Kunnai Thai Restaurant,0.0
40,7c75265c14f0448ae6e60e30,Mariscos Michoacan,0.0


In [93]:
# check for duplicate values
restaurants.duplicated().sum()

6950

In [94]:
# the majority of the restaurants are duplicates, drop from the analysis
clean_restaurants = restaurants.drop_duplicates(subset=['fsq_id'], keep='first')
# remove the fsq_id column
clean_restaurants = clean_restaurants.drop(['fsq_id'],axis=1)
clean_restaurants

Unnamed: 0,name,rating
0,Afuri Ramen,8.9
1,Everson Rocye Bar,9.1
2,Tony's Saloon,8.6
3,Pizzanista,8.4
4,Guerrilla Tacos,8.5
...,...,...
33,Citizen Sprout,0.0
34,Citrin,0.0
35,Tiato Kitchen & Venue,0.0
36,Cafe Lucy,0.0


In [95]:
# sort the values by 'rating' and return the top 10
top_10_restaurants = clean_restaurants.sort_values('rating', ascending=False).head(10)

#### The top 10 restaurants from Foursquare results within 1000m of all city bik.es stations are:

In [88]:
top_10_restaurants

Unnamed: 0,name,rating
7,Salt & Straw,9.4
8,Saffron & Rose Ice Cream,9.3
11,Guisados,9.3
2,Firestone Walker - The Propagator,9.3
11,Superba Food + Bread,9.3
0,Porto's Bakery & Cafe,9.3
9,Bavel,9.3
35,Cobi's,9.3
24,Walt Disney Concert Hall,9.3
7,Hatchet Hall,9.2
