In [1]:
# library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

In [2]:
# Load API credentials.
# The location of the credentials file can be overridden with the
# YELP_API_CREDENTIALS environment variable so the notebook is not tied to one
# user's home directory; when the variable is unset the default path below is used.
credentials_path = os.environ.get('YELP_API_CREDENTIALS',
                                  '/Users/x471074/.secret/yelp_api.json')
with open(credentials_path) as f:
    # the file is parsed as JSON and must provide an 'api-key' entry (read below)
    login = json.load(f)

# instantiate the YelpAPI client used by every search request in this notebook
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

In [3]:
# search parameters: what to look for (TERM) and where to look (LOCATION)
TERM = 'Starbucks'
LOCATION = 'Los Angeles, California'

In [4]:
# Path of the JSON file that accumulates results between runs; the name is
# meant to identify the search it belongs to.
# NOTE(review): 'startbucks' looks like a typo for 'starbucks'. Renaming it
# would make any existing progress file invisible to this notebook, so confirm
# before changing.
JSON_FILE = (
    "Data/"
    "results_in_progress_LA_startbucks.json"
)
JSON_FILE

'Data/results_in_progress_LA_startbucks.json'

In [5]:
# Make sure the results-in-progress file exists before any later cell reads it.
# If it is missing, create whatever folder it lives in and seed the file with
# an empty JSON list.

file_exists = os.path.isfile(JSON_FILE)

if file_exists:
    print(f"[i] {JSON_FILE} already exists")

else:
    # directory part of the path; an empty string when JSON_FILE is a bare filename
    folder = os.path.dirname(JSON_FILE)

    # only create a folder when the path actually names one
    if folder:
        os.makedirs(folder, exist_ok=True)

    # tell the user, then write the empty list
    print(f'[i] {JSON_FILE} not found. Saving empty list to file.')

    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

[i] Data/results_in_progress_NY_pizza.json already exists


In [6]:
# Read back whatever has been saved so far; the number of saved entries is
# used as the offset for the next request.
with open(JSON_FILE) as f:  # text read mode is the default
    previous_results = json.load(f)

n_results = len(previous_results)

print(f'- {n_results} previous results found')


- 1000 previous results found


In [7]:
# Run the first search request, starting after the results already on disk.
# NOTE(review): the recorded run of this cell ended in an HTTP 400 when the
# offset was 1000 — presumably the API rejects offsets that high; confirm
# against the Yelp documentation before re-running with a full results file.
search_params = {
    'location': LOCATION,
    'term': TERM,
    'offset': n_results,
}
results = yelp_api.search_query(**search_params)

results.keys()

HTTPError: 400 Client Error: Bad Request for url: https://api.yelp.com/v3/businesses/search?location=NY%2C+NY&term=Pizza&offset=1000

In [None]:
# total results available for this search: the 'total' entry of the response
# returned by yelp_api.search_query
total_results = results['total']
total_results

In [14]:
# size of the page we just received (number of entries under 'businesses')
first_page = results['businesses']
results_per_page = len(first_page)
results_per_page

20

In [15]:
# Work out how many pages of results are left to request.
# (time and math come from the import cell at the top of the notebook.)
remaining_results = results['total'] - n_results

if results_per_page > 0:
    # use math.ceil to round up so a final partial page is still counted;
    # clamp at 0 in case more results are stored than the API now reports
    n_pages = max(0, math.ceil(remaining_results / results_per_page))
else:
    # the first page came back empty, so there is nothing to page through;
    # dividing by results_per_page here would raise ZeroDivisionError
    n_pages = 0
n_pages

625

In [16]:
# Merge the page just fetched into the accumulated results, then persist them.
# Gotcha: running this cell again appends the same page a second time.
new_businesses = results['businesses']
previous_results.extend(new_businesses)

with open(JSON_FILE, 'w') as out_file:
    json.dump(previous_results, out_file)

In [22]:
# Page through the remaining results, saving progress to JSON_FILE after every
# request so an interrupted run can resume from what is already on disk.
for i in tqdm_notebook(range(1, n_pages+1)):

    # read in results in progress file and check length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)

    # save number of results to use as offset
    n_results = len(previous_results)

    # Stop before requesting a page that would take the stored results past
    # 1000. This is a cap on results, not on the number of API calls.
    # NOTE(review): presumably this mirrors an API-side limit — confirm
    # against the Yelp documentation.
    if (n_results + results_per_page) > 1000:
        print('Reached the 1000 result limit. Stopping loop.')
        break

    # use n_results as offset
    results = yelp_api.search_query(location=LOCATION,
                                   term=TERM,
                                   offset=n_results)

    new_businesses = results['businesses']

    # An empty page means the search is exhausted. The offset is taken from
    # the saved results, so without this break every remaining iteration would
    # repeat the exact same request.
    if not new_businesses:
        print('No more results returned. Stopping loop.')
        break

    # append new results and save to file
    previous_results.extend(new_businesses)

    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results,f)

    # brief pause between requests
    time.sleep(0.2)

  0%|          | 0/625 [00:00<?, ?it/s]

Exceeded 1000 api calls. Stopping loop.


In [23]:
# Read the accumulated results back in as a DataFrame and preview both ends.
final_df = pd.read_json(JSON_FILE)
display(final_df.head())
display(final_df.tail())

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,zj8Lq1T8KIC5zwFief15jg,prince-street-pizza-new-york-2,Prince Street Pizza,https://s3-media4.fl.yelpcdn.com/bphoto/PfI8oV...,False,https://www.yelp.com/biz/prince-street-pizza-n...,5081,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",4.5,"{'latitude': 40.72308755605564, 'longitude': -...","[delivery, pickup]",$,"{'address1': '27 Prince St', 'address2': None,...",12129664100,(212) 966-4100,1961.877142
1,ysqgdbSrezXgVwER2kQWKA,julianas-brooklyn-3,Juliana's,https://s3-media2.fl.yelpcdn.com/bphoto/od36nF...,False,https://www.yelp.com/biz/julianas-brooklyn-3?a...,2702,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.5,"{'latitude': 40.70274718768062, 'longitude': -...",[delivery],$$,"{'address1': '19 Old Fulton St', 'address2': '...",17185966700,(718) 596-6700,308.569844
2,WG639VkTjmK5dzydd1BBJA,rubirosa-new-york-2,Rubirosa,https://s3-media3.fl.yelpcdn.com/bphoto/l0Phrn...,False,https://www.yelp.com/biz/rubirosa-new-york-2?a...,3193,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.5,"{'latitude': 40.722766, 'longitude': -73.996233}",[pickup],$$,"{'address1': '235 Mulberry St', 'address2': ''...",12129650500,(212) 965-0500,1932.94677
3,v1DHGRNCH9247WLYoaoA9A,l-industrie-pizzeria-brooklyn,L'industrie Pizzeria,https://s3-media1.fl.yelpcdn.com/bphoto/1-VBwA...,False,https://www.yelp.com/biz/l-industrie-pizzeria-...,1138,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.5,"{'latitude': 40.71162, 'longitude': -73.95783}",[delivery],$,"{'address1': '254 S 2nd St', 'address2': '', '...",17185990002,(718) 599-0002,3145.016041
4,WIhm0W9197f_rRtDziq5qQ,lombardis-pizza-new-york-4,Lombardi's Pizza,https://s3-media1.fl.yelpcdn.com/bphoto/lBq1IB...,False,https://www.yelp.com/biz/lombardis-pizza-new-y...,6619,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.7215934960083, 'longitude': -7...","[delivery, pickup]",$$,"{'address1': '32 Spring St', 'address2': '', '...",12129417994,(212) 941-7994,1798.995978


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
995,L1pgoxWt8I5cp1c9van_4Q,pizzeria-hoboken-hoboken,Pizzeria Hoboken,https://s3-media4.fl.yelpcdn.com/bphoto/INcnxm...,False,https://www.yelp.com/biz/pizzeria-hoboken-hobo...,68,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",3.0,"{'latitude': 40.7472306936165, 'longitude': -7...","[pickup, delivery]",$,"{'address1': '732 Jefferson St', 'address2': '...",12016836158,(201) 683-6158,5811.341336
996,rtivWWzG0XjUuhSrwusTgQ,fausto-brooklyn,Fausto,https://s3-media4.fl.yelpcdn.com/bphoto/If-kbS...,False,https://www.yelp.com/biz/fausto-brooklyn?adjus...,214,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,"{'latitude': 40.67597, 'longitude': -73.97181}",[delivery],$$$,"{'address1': '348 Flatbush Ave', 'address2': '...",19179091427,(917) 909-1427,3778.959234
997,FR_E9_Z3WG5HjeIUyTKSWw,antonios-kitchen-east-elmhurst,Antonio's Kitchen,https://s3-media2.fl.yelpcdn.com/bphoto/BpWTwB...,False,https://www.yelp.com/biz/antonios-kitchen-east...,144,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",3.5,"{'latitude': 40.7704153034747, 'longitude': -7...","[pickup, delivery]",$$,"{'address1': '76-08 21st Ave', 'address2': '',...",17187289200,(718) 728-9200,11152.520898
998,G7jCcluHVgOP6n-vvZ797Q,corato-i-pizza-restaurant-ridgewood,Corato I Pizza Restaurant,https://s3-media3.fl.yelpcdn.com/bphoto/q2emK6...,False,https://www.yelp.com/biz/corato-i-pizza-restau...,160,"[{'alias': 'pizza', 'title': 'Pizza'}]",2.5,"{'latitude': 40.7058105, 'longitude': -73.8969...","[pickup, delivery]",$,"{'address1': '66-94 Fresh Pond Rd', 'address2'...",17184976177,(718) 497-6177,8211.971924
999,a9wmbku2blsP8QVDEslm4w,di-fara-pizza-by-wonder-new-york-4,Di Fara Pizza by Wonder,https://s3-media3.fl.yelpcdn.com/bphoto/5yToSE...,False,https://www.yelp.com/biz/di-fara-pizza-by-wond...,1,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.77671, 'longitude': -73.98184}","[pickup, delivery]",,"{'address1': '2030 Broadway', 'address2': None...",18558185755,(855) 818-5755,7996.859866


In [25]:
# count the rows whose 'id' repeats the 'id' of an earlier row
final_df['id'].duplicated().sum()

64

In [27]:
# Keep only the first row seen for each 'id', then confirm no repeats remain.
final_df = final_df.drop_duplicates(subset=['id'], keep='first')
final_df.duplicated(subset=['id']).sum()

0

In [29]:
# save df to csv
# Derive the output path from JSON_FILE (same folder, same search suffix) so the
# saved file is always labelled with the search that produced it, rather than a
# hard-coded name that can drift out of sync with LOCATION/TERM.
final_stem = os.path.splitext(JSON_FILE)[0].replace('results_in_progress', 'final_results')
FINAL_CSV_FILE = final_stem + '.csv.gz'

# the '.gz' extension lets pandas pick gzip compression; the DataFrame index is
# written as the first (unnamed) column
final_df.to_csv(FINAL_CSV_FILE)
FINAL_CSV_FILE