In [1]:
!pip install YelpAPI



In [2]:
!pip install tqdm



In [3]:
# Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#additional imports
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

In [5]:
def create_json_file(JSON_FILE, delete_if_exists=False):
    """Make sure JSON_FILE exists, creating it with an empty JSON list if needed.

    Args:
        JSON_FILE (str): Path of the JSON file to check or create. Any missing
            parent folders in the path are created.
        delete_if_exists (bool, optional): When truthy, an existing file is
            deleted and replaced by a new file holding an empty list. When
            falsy (the default), an existing file is left untouched.

    Returns:
        None. Progress is reported with print().
    """
    if os.path.isfile(JSON_FILE):
        if not delete_if_exists:
            ## Keep whatever is already saved and stop here
            print(f"[i] {JSON_FILE} already exists.")
            return

        print(f"[!] {JSON_FILE} already exists. Deleting previous file...")
        os.remove(JSON_FILE)

    ## From here on the file is missing: it never existed or was just deleted
    print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")

    ## CREATE ANY NEEDED FOLDERS (dirname is '' when the path has no folder part)
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## Save empty list to start the json file
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

# Load and Create Yelp API 

In [6]:
# import json to read yelp api credentials
import json

# Resolve the credentials file relative to the current user's home folder
# instead of hard-coding one machine's absolute path.
credentials_path = os.path.join(os.path.expanduser('~'), '.secret', 'yelp_api.json')

# with open: yelp api credentials (save as variable)
with open(credentials_path) as f:
    login = json.load(f)

# Display only the key names, not the credential values
login.keys()

dict_keys(['client-id', 'api-key'])

In [7]:

# Build the Yelp client from the stored API key, passing a 5 second timeout
yelp_api = YelpAPI(
    login['api-key'],
    timeout_s=5.0,
)
yelp_api

<yelpapi.yelpapi.YelpAPI at 0x1b1fa85db50>

# Define Search and File Path

In [8]:
# Search parameters reused by every Yelp query in this notebook
location = 'Atlanta, GA'
term = 'Soul Food'

In [9]:
# Folder where this notebook saves its data files (created if missing)
FOLDER = 'Data/'
os.makedirs(FOLDER, exist_ok=True)

# Build the JSON file name programmatically from the city
# (the text before the first comma of `location`) and the search term
JSON_FILE = f"{FOLDER}results_in_progress_{location.partition(',')[0]}_{term}.json"
JSON_FILE

'Data/results_in_progress_Atlanta_Soul Food.json'

# Check JSON File

In [10]:
## Look for an existing results file at JSON_FILE
file_exists = os.path.isfile(JSON_FILE)

if file_exists:
    ## Already there: just tell the user
    print(f"[i] {JSON_FILE} already exists.")
else:
    ## Missing: make sure its parent folder (if the path has one) exists first
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## INFORM USER AND SAVE EMPTY LIST
    print(f'[i] {JSON_FILE} not found. Saving empty list to file.')
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

[i] Data/results_in_progress_Atlanta_Soul Food.json not found. Saving empty list to file.


# Results in File

In [11]:
## Start from a fresh, empty json file (a previous file is deleted first)
create_json_file(JSON_FILE, delete_if_exists=True)

## Read back whatever is currently saved in the file
with open(JSON_FILE, 'r') as f:
    previous_results = json.load(f)

## The number of saved results is used as the offset for the API call
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

[!] Data/results_in_progress_Atlanta_Soul Food.json already exists. Deleting previous file...
[i] Data/results_in_progress_Atlanta_Soul Food.json not found. Saving empty list to new file.
- 0 previous results found.


# Number of Result Pages

In [12]:
# use our yelp_api variable's search_query method to perform our API call,
# skipping the results that are already saved (offset=n_results)
results = yelp_api.search_query(location=location,
                                term=term,
                                offset=n_results)
## How many results total?
total_results = results['total']
## How many did we get the details for?
results_per_page = len(results['businesses'])

# Use math.ceil to round up for the total number of pages of results.
# A page with no businesses would make the division raise ZeroDivisionError,
# and more saved results than the reported total would give a negative count,
# so both cases fall back to 0 pages.
if results_per_page > 0:
    n_pages = max(math.ceil((total_results - n_results) / results_per_page), 0)
else:
    n_pages = 0
n_pages

22

# For Loop

In [13]:
# Download one page of results per iteration, saving to JSON_FILE after every
# page so that results already fetched are kept if a later request fails.
for i in tqdm_notebook(range(1, n_pages + 1)):

    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## save number of results to use as offset
    n_results = len(previous_results)

    ## Stop before the offset plus one page would pass 1000 results
    ## NOTE(review): presumably the Yelp search cap on offset + limit -- confirm
    if (n_results + results_per_page) > 1000:
        print('Next request would exceed the 1000-result limit. Stopping loop.')
        break

    ## use n_results as the OFFSET
    results = yelp_api.search_query(location=location,
                                    term=term,
                                    offset=n_results)

    ## An empty page means the offset would never advance, so every remaining
    ## iteration would repeat the same request: stop instead.
    new_businesses = results['businesses']
    if not new_businesses:
        print('No more businesses returned. Stopping loop.')
        break

    ## append new results and save to file
    previous_results.extend(new_businesses)
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

    ## short pause between requests
    time.sleep(.2)

  0%|          | 0/22 [00:00<?, ?it/s]

# JSON file with Pandas

In [14]:
# Load the saved results into a DataFrame
df = pd.read_json(JSON_FILE)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439 entries, 0 to 438
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             439 non-null    object 
 1   alias          439 non-null    object 
 2   name           439 non-null    object 
 3   image_url      439 non-null    object 
 4   is_closed      439 non-null    bool   
 5   url            439 non-null    object 
 6   review_count   439 non-null    int64  
 7   categories     439 non-null    object 
 8   rating         439 non-null    float64
 9   coordinates    439 non-null    object 
 10  transactions   439 non-null    object 
 11  price          295 non-null    object 
 12  location       439 non-null    object 
 13  phone          439 non-null    object 
 14  display_phone  439 non-null    object 
 15  distance       439 non-null    float64
dtypes: bool(1), float64(2), int64(1), object(12)
memory usage: 52.0+ KB
None


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,AvXqLbcGCxdIEF_qZTY0Kw,the-busy-bee-cafe-atlanta,The Busy Bee Cafe,https://s3-media4.fl.yelpcdn.com/bphoto/KhI0mc...,False,https://www.yelp.com/biz/the-busy-bee-cafe-atl...,1116,"[{'alias': 'soulfood', 'title': 'Soul Food'}, ...",4.0,"{'latitude': 33.7544325583427, 'longitude': -8...","[delivery, pickup]",$$,{'address1': '810 Martin Luther King Jr Dr SW'...,14045259212,(404) 525-9212,1571.578941
1,lGDHzx8L-ZjTUh6ch2ar9g,southern-kitchen-and-grill-atlanta,Southern Kitchen & Grill,https://s3-media2.fl.yelpcdn.com/bphoto/yg3jSZ...,False,https://www.yelp.com/biz/southern-kitchen-and-...,298,"[{'alias': 'southern', 'title': 'Southern'}, {...",4.5,"{'latitude': 33.88625820727653, 'longitude': -...","[delivery, pickup]",$$,"{'address1': '3781 Presidential Pkwy', 'addres...",16786941098,(678) 694-1098,19454.617103
2,B5KTQH01_SoaRxhniVGA1g,delilahs-everyday-soul-atlanta,Delilah's Everyday Soul,https://s3-media1.fl.yelpcdn.com/bphoto/x22gKI...,False,https://www.yelp.com/biz/delilahs-everyday-sou...,53,"[{'alias': 'soulfood', 'title': 'Soul Food'}, ...",4.5,"{'latitude': 33.80284, 'longitude': -84.4285}",[pickup],$$,"{'address1': '1235 Chattahoochee Ave NW', 'add...",14708274567,(470) 827-4567,3932.239448
3,qDL_cyl2JbbHDoiFPrn4yQ,southern-fire-kitchen-no-title,Southern Fire Kitchen,https://s3-media3.fl.yelpcdn.com/bphoto/Puud-d...,False,https://www.yelp.com/biz/southern-fire-kitchen...,130,"[{'alias': 'southern', 'title': 'Southern'}, {...",4.0,"{'latitude': 33.84314, 'longitude': -84.32838}",[],,"{'address1': '3375 Buford Hwy NE', 'address2':...",14049637594,(404) 963-7594,12023.739216
4,8-Z8KokLH--CXZu3HhIiTw,soulbox-atlanta-2,SoulBox,https://s3-media1.fl.yelpcdn.com/bphoto/hK9imD...,False,https://www.yelp.com/biz/soulbox-atlanta-2?adj...,64,"[{'alias': 'soulfood', 'title': 'Soul Food'}]",4.0,"{'latitude': 33.74929, 'longitude': -84.42117}","[delivery, pickup]",,"{'address1': '1017 Fair St', 'address2': None,...",14044945805,(404) 494-5805,2038.595219


In [15]:
# Count the rows whose business id already appeared in an earlier row
df['id'].duplicated().sum()

0

In [16]:
## Derive the compressed-csv file name by swapping '.json' for '.csv.gz'
csv_file = JSON_FILE.replace('.json', '.csv.gz')
csv_file

'Data/results_in_progress_Atlanta_Soul Food.csv.gz'

In [17]:
## Write the DataFrame as a gzip-compressed csv (to save space), without the index column
df.to_csv(
    csv_file,
    index=False,
    compression='gzip',
)

# Compare File Size

In [18]:
# Compare the on-disk size of the JSON file with the compressed csv.
# The csv path reuses `csv_file` (the path the DataFrame was saved to)
# instead of being derived from JSON_FILE a second time.
size_json = os.path.getsize(JSON_FILE)
size_csv_gz = os.path.getsize(csv_file)

print(f'JSON FILE: {size_json:,} Bytes')
print(f'CSV.GZ FILE: {size_csv_gz:,} Bytes')

print(f'the csv.gz is {size_json/size_csv_gz} times smaller!')

JSON FILE: 437,370 Bytes
CSV.GZ FILE: 68,031 Bytes
the csv.gz is 6.428980905763549 times smaller!
