In [1]:
# imports using python 3.9.20
import pandas as pd
import os # for our API keys
import requests


In [5]:
# load in our bike dataframe
bike_df = pd.read_pickle('../data/bike_dataframe.pkl')

# load in our foursquare and Yelp API keys
FOURSQUARE_KEY = os.getenv('FOURSQUARE_API_KEY')
YELP_KEY = os.getenv('YELP_API_KEY')


# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

In [6]:
#initial code built with the help of AI however it was heavily butchered to the point the only thing remaining is the description
def get_venues_fs(latitude, longitude, radius, api_key, categories): 
    """
    Get venues from foursquare with a specified place type and coordinates.
    Args:
        latitude (float): latitude for query (must be combined with longitude)
        longitude (float): longitude for query (must be combined with latitude)
        api_key (str): foursquare API to use for query
        categories (str) : Foursquare-recognized place type. If not passed no place_type will be specified. Separate ids with commas
    
    Returns:
        response: response object from the requests library.
    """  
    url = 'https://api.foursquare.com/v3/places/search?ll=' + str(latitude) + "," + str(longitude) + "&radius=" + str(radius) + "&limit=50"#+ "&categories=" + categories

    headers = {
        "accept": "application/json",
        "Authorization" : api_key,
    }
    response = requests.get(url=url,headers=headers)
    return response

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [7]:
#Set our base variables and create the list we will be populating with all of the information from the API's
categories="None-Set"

#dropping to radius of 500 as I found maxed out my call for too many results
radius = 500
responses = []

# Run through the bike data frame and get the longitude and latitude for every row, put it into foursquare to get our responses
for index, row in bike_df.iterrows():
    latitude = row['latitude']
    longitude = row['longitude']
    res = get_venues_fs(latitude, longitude, radius, FOURSQUARE_KEY, categories)
    
    # make sure we got a valid response
    if res.status_code == 200:
        responses.append(res.json())
    else:
        print("fail at: ", index, res.status_code)


Put your parsed results into a DataFrame

In [8]:
# create an empty dataframe for foursquare (fsq)
fsq_df = pd.DataFrame()
i = 0

#loop through all of the responses, convert them 1 by 1 into a dataframe and concatenate them all together
while i < len(responses):
    try:
        temp_df = pd.DataFrame(responses[i]['results'])
        temp_df['bs_id'] = bike_df.at[i, 'bs_id']
        i += 1
        fsq_df = pd.concat([fsq_df, temp_df], ignore_index = True)
    except:
        print("fail at: ", i)
        i+= 1

Clean up the Dataframe

In [None]:
#pull out all the different columns that have multiple layers to them
categories_series = fsq_df['categories']
geocodes_series = fsq_df['geocodes'] 
location_series = fsq_df['location']

# related places isn't relevant to our research so we will be leaving that out.
related_places_series = fsq_df['related_places']

# drop the above columns for cleaning
fsq_df.drop(['related_places', 'location', 'geocodes','categories'], axis = 1, inplace = True)

#add the geocode information back in, once we have the full df we will compare the three columns and see if they are required to be kept
fsq_df['main_latitude'] = [d['main']['latitude'] if 'main' in d else None for d in geocodes_series]
fsq_df['main_longitude'] = [d['main']['longitude'] if 'main' in d else None for d in geocodes_series]
fsq_df['drop_off_latitude'] = [d['drop_off']['latitude'] if 'drop_off' in d else None for d in geocodes_series]
fsq_df['drop_off_longitude'] = [d['drop_off']['longitude'] if 'drop_off' in d else None for d in geocodes_series]
fsq_df['roof_latitude'] = [d['roof']['latitude'] if 'roof' in d else None for d in geocodes_series]
fsq_df['roof_longitude'] = [d['roof']['longitude'] if 'roof' in d else None for d in geocodes_series]

# break the location down and add it back in
location_df = location_series.apply(pd.Series)
fsq_df = pd.merge(fsq_df, location_df, left_index=True, right_index=True)
categories_list = []

#rename the distance column to fsq_distance
fsq_df.rename(columns={'distance': 'fsq_distance'})

#add sets of all the categories into the dataframe
def extract_ids(data): #quick function to go through the json and return me all of the sets of the id's (AI assisted for building this function)
  ids = [item['id'] for item in data]
  return set(ids)

for item in categories_series:
  id_set = extract_ids(item)
  categories_list.append(id_set)
fsq_df['categories_ids'] = categories_list

# seperate the categories into a seperate dataframe we can save as its own table
fsq_categories_df = fsq_df[['fsq_id', 'categories_ids']].explode('categories_ids')

# drop unnecessary columns that are just going to slow us down for our current project.
fsq_df.drop(['chains',
              'link',
                'census_block',
                  'drop_off_latitude',
                    'drop_off_longitude',
                      'roof_latitude',
                        'roof_longitude',
                          'formatted_address',
                            'cross_street',
                              'closed_bucket',
                                'timezone',
                                  'country',
                                    'dma',
                                      'locality',
                                        'region',
                                          'address_extended',
                                            'po_box',
                                              'categories_ids'],
                                                axis = 1, inplace = True)


Save for later use

In [10]:
#save the dataframe to a pickle
fsq_df.to_pickle('../data/fsq_dataframe.pkl')
fsq_categories_df.to_pickle('../data/fsq_categories_df.pkl')

# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [11]:
#initial code built with the help of AI however it was heavily butchered to the point the only thing remaining is the description
def get_venues_yelp(latitude, longitude, radius, api_key, categories): 
    """
    Get venues from YELP with a specified place type and coordinates.
    Args:
        latitude (float): latitude for query (must be combined with longitude)
        longitude (float): longitude for query (must be combined with latitude)
        api_key (str): YELP API to use for query
        categories (str) : Foursquare-recognized place type. If not passed no place_type will be specified. Separate ids with commas
    
    Returns:
        response: response object from the requests library.
    """  
    url = f'https://api.yelp.com/v3/businesses/search?latitude={str(latitude)}&longitude={str(longitude)}&radius={str(radius)}&categories=&sort_by=best_match&limit=50'
    headers = {
        "accept": "application/json",
        "Authorization": f'Bearer {api_key}'
    }
    response = requests.get(url=url,headers=headers)
    return response

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [None]:
#set my base variables for the search
categories="None-Set"
#dropping to radius of 500 as I found maxed out my call for too many results
radius = 500
responses = []
#run through the dataframe and search for all POI's in the radius of the longitude and latitude
for index, row in bike_df.iterrows():
    latitude = row['latitude']
    longitude = row['longitude']
    res = get_venues_yelp(latitude, longitude, radius, YELP_KEY, categories)
    # make sure we got a valid response
    if res.status_code == 200:
        responses.append(res.json())
    else:
        print("fail at: ", index, res.status_code)

Create the DataFrame

In [13]:
#create the yelp dataframe and populate it filling in the bike station id for every call
yelp_df = pd.DataFrame()
i = 0
while i < len(responses):
    try:
        temp_df = pd.DataFrame(responses[i]['businesses'])
        temp_df['bs_id'] = bike_df.at[i, 'bs_id']
        i += 1
        yelp_df = pd.concat([yelp_df, temp_df], ignore_index = True)
    except:
        print("fail at: ", i)
        i+= 1

Clean up the Dataframe

In [14]:
#split out the columns that still have a series stored in them
categories_series = yelp_df['categories']
coordinates_series = yelp_df['coordinates']
location_series = yelp_df['location']
#don't currently need business hours, will come back to this if we change our mind
business_hours_series = yelp_df['business_hours']

# drop the columns we pulled out above
yelp_df.drop(['location', 'business_hours', 'coordinates', 'categories'], axis = 1, inplace=True)

#rename id's and distance column
yelp_df.rename(columns={'id':'yelp_id', 'distance':'yelp_distance'}, inplace=True)


#categories do not have ID's like with foursquare so the category column will be sets of the titles
categories_list = []
def extract_ids(data): #quick function to go through the json and return me all of the sets of the titles's (AI assisted for building this function)
  ids = [item['title'] for item in data]
  return set(ids)

for item in categories_series:
  id_set = extract_ids(item)
  categories_list.append(id_set)
yelp_df['categories'] = categories_list

#lets seperate out the categories for a more normalised database when we get to it(in and out and in and out)
yelp_catagories_df = yelp_df[['yelp_id','categories']].explode('categories')

#lets put the coordinates back in
coord_df = coordinates_series.apply(pd.Series)
yelp_df = pd.merge(yelp_df, coord_df, left_index=True, right_index=True)

#put the address back in:
local_df = location_series.apply(pd.Series)
local_df.drop('display_address',axis = 1, inplace = True)
yelp_df = pd.merge(yelp_df, local_df, left_index=True, right_index=True)

#get rid of all these columns that are unnesscary to the problems at hand and will just waste space
yelp_df.drop(['transactions',
               'phone',
                 'display_phone',
                   'address2',
                     'address3',
                       'city',
                         'country',
                           'state',
                            'categories',
                              'image_url',
                                'url',
                                  'is_closed',
                                    'attributes',
                                      'alias'], axis = 1, inplace = True)



Save the Dataframe

In [15]:
#save the dataframes to pickles
yelp_df.to_pickle('../data/yelp_dataframe.pkl')
yelp_catagories_df.to_pickle('../data/yelp_categories.pkl')

# Comparing Results

Which API provided you with more complete data? Provide an explanation. 

 - Looking through the two API's I found more information, and more pertinant information in the yelp API.
 - The Yelp API includes ratings, if foursquare once did it does not seem to anymore.
 - Yelp API also includes a price rating so I can get an idea of which bike stations have more expensive things around them (categorical analysis coming your way?)
 - The one element I did prefer about Foursquare is they seperate "Categories" out with ID's which would allow me to normalize the database a little more, not so much with Yelp
 


Get the top 10 restaurants according to their rating

In [None]:
yelp_df[['name','rating']].sort_values('rating',ascending=False).head(10)

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: left;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>name</th>
      <th>rating</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>6522</th>
      <td>Karnitas 71st</td>
      <td>5.0</td>
    </tr>
    <tr>
      <th>596</th>
      <td>Paradise Cups</td>
      <td>5.0</td>
    </tr>
    <tr>
      <th>566</th>
      <td>Karnitas 71st</td>
      <td>5.0</td>
    </tr>
    <tr>
      <th>586</th>
      <td>Meat The Veggies</td>
      <td>5.0</td>
    </tr>
    <tr>
      <th>589</th>
      <td>North Shore Historic District Marker</td>
      <td>5.0</td>
    </tr>
    <tr>
      <th>592</th>
      <td>Vatito Taqueria</td>
      <td>5.0</td>
    </tr>
    <tr>
      <th>594</th>
      <td>Makla Halal</td>
      <td>5.0</td>
    </tr>
    <tr>
      <th>595</th>
      <td>The Scoop Coop</td>
      <td>5.0</td>
    </tr>
    <tr>
      <th>597</th>
      <td>Schonberger Park</td>
      <td>5.0</td>
    </tr>
    <tr>
      <th>545</th>
      <td>Berry Sweet Pavlovas</td>
      <td>5.0</td>
    </tr>
  </tbody>
</table>
</div>