<a href="https://colab.research.google.com/github/Frances824/ACMDIGW/blob/main/extracting_and_saving_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# most common shortcut keys
# ctrl + /     = comment out the selected line(s)
# ctrl + enter = run the current code cell
# ctrl + m, b  = insert a code cell below
# ctrl + m, a  = insert a code cell above

# Loading Imports and Libraries

In [None]:
# Connect Google Drive to this runtime. After mounting, Drive files are
# reachable through the '/content/drive' mount point passed below, which is
# the prefix used by the credential and data paths later in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# install yelpapi
!pip install yelpapi --quiet

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as pit
import seaborn as sns

# additional libraries
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

# Loading Credentials and Creating the YelpAPI object

In [None]:
# Read the Yelp credentials (a JSON document stored on Google Drive) ...
credentials_path = '/content/drive/MyDrive/Colab Notebooks/credentials/yelp_api.json'
with open(credentials_path) as file:
  yelp_credentials = json.load(file)

# ... and build the API client from the stored key.
# timeout_s is passed straight through to YelpAPI.
api_key = yelp_credentials['api-key']
yelp_api = YelpAPI(api_key, timeout_s = 5.0)

# Defining the Search Terms and File Path

In [None]:
# Search parameters sent to Yelp with every API call
LOCATION = 'Greenville, SC'
TERM = 'Sushi'

# Where the raw API results are stored.
# This must be the full path of the Drive "Data" folder: a bare
# '/Data/...' path points at the root of the runtime's filesystem, not at the
# Drive folder that the final save cells of this notebook write to.
# To look the folder path up in Colab: Files > drive > MyDrive >
# Colab Notebooks > Data, click the three dots, choose "Copy path", and
# append /results_SC_sushi.json to it.
JSON_FILE = '/content/drive/MyDrive/Colab Notebooks/Data/results_SC_sushi.json'

# Show the destination so it can be checked before any data is written.
# An f-string (f'...') substitutes the value of a variable wherever the
# variable name appears inside {curly braces}.
print(f'Data will be saved to: {JSON_FILE}')

Data will be saved to: /content/drive/MyDrive/Colab Notebooks/Data/results_SC_sushi.json


# Check if JSON file exists else create

In [None]:
# Make sure JSON_FILE is present before any later cell tries to read it
if os.path.isfile(JSON_FILE):
  # nothing to do: keep whatever has already been collected
  print(f'[i] {JSON_FILE} already exists.')

else:
  # the parent folder has to exist before the file can be written
  os.makedirs(os.path.dirname(JSON_FILE), exist_ok=True)

  # start the file off as an empty JSON list
  print(f'[i] {JSON_FILE} not found. Saving empty list to file')
  with open(JSON_FILE, 'w') as file:
    json.dump([], file)


[i] /content/drive/MyDrive/Colab Notebooks/Data/results_SC_sushi.json already exists.


In [None]:
# load previous results and set offset based on the number of results
with open(JSON_FILE, 'r') as file:
  previous_results = json.load(file)

n_results = len(previous_results)

print(f'{n_results} previous results found.')

0 previous results found.


# Making the first API call to get the first page of data

In [None]:
# First API call: run the search starting right after the results that are
# already saved (offset = n_results).
search_params = {
    'location': LOCATION,
    'term': TERM,
    'offset': n_results,
}
results = yelp_api.search_query(**search_params)

# show which top-level keys the response contains
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [None]:
# number of matches found for the search, as reported under 'total'
total_results = results["total"]
total_results

111

In [None]:
# The list of businesses returned by the first API call.
# Note: the key is a string, so the quotes go inside the square brackets.
business_results = results['businesses']

# specify the filename where you want to save the data
json_file_path = JSON_FILE

# Keep what is already on disk. The first API call started at
# offset = n_results, so business_results holds the page that comes AFTER
# previous_results. Writing business_results alone would throw away every
# previously saved business (and would replace the file with an empty list
# once everything has been collected and the API returns no more businesses).
combined_results = previous_results + business_results

# save the business data as a JSON file
with open(json_file_path, 'w') as file:
  json.dump(combined_results, file, indent = 4)

# indent = 4 pretty-prints the file: nested levels are indented by 4 spaces
# and every item goes on its own line, e.g.
# key: value,
# key: value,
# key: value,
# key: value
#
# otherwise, without indent = 4, everything is written on a single line:
# key: value,key: value,key: value, key: value

In [None]:
# size of the page returned by the first API call
results_per_page = len(business_results)
print('number of results retrieved per page', results_per_page)

number of results retrieved per page 20


In [None]:
# Number of pages needed to cover every result. math.ceil rounds up so that a
# final, partially filled page still counts as a page.
# When the first API call returned no businesses, results_per_page is 0 and
# the division would raise ZeroDivisionError, so report zero pages instead.
n_pages = math.ceil(total_results / results_per_page) if results_per_page else 0
print(f'total number of pages: {n_pages}')

total number of pages: 6


In [None]:
# Loop over the remaining pages and append each one to JSON_FILE.
# Iterate once per PAGE (n_pages), not once per business (total_results):
# every API call returns a whole page of businesses, so looping
# total_results times keeps sending requests long after everything has been
# collected.
for i in tqdm_notebook(range(1, n_pages + 1)):
  try:
    # reload what is on disk so the offset always matches the saved data,
    # even after an earlier iteration failed
    with open(JSON_FILE, 'r') as file:
      previous_results = json.load(file)

    # stop as soon as everything Yelp reported has been saved
    if len(previous_results) >= total_results:
      break

    # short pause between calls so the requests are spread out instead of
    # being sent in a burst
    time.sleep(0.2)

    # fetch the next page, starting right after the saved results
    new_results = yelp_api.search_query(location = LOCATION,
                                        term = TERM,
                                        offset = len(previous_results))
    new_businesses = new_results['businesses']

    # an empty page means there is nothing left to fetch
    if not new_businesses:
      break

    # append and save the updated results
    updated_results = previous_results + new_businesses
    with open(JSON_FILE, 'w') as file: # mode must be lowercase 'w'
      json.dump(updated_results, file)

  except Exception as e:
    if 'Too Many Requests for url' in str(e):
      # the error text signals that the rate limit was hit: stop collecting
      print('Rate limit exceeded. Stopping data-collection')
      break # exit the loop if the rate limit is exceeded

    else: # any other error: report it and move on to the next iteration
      print(f'An error occured: {e}')
      continue

  0%|          | 0/111 [00:00<?, ?it/s]

# Open the final JSON file with Pandas

In [None]:
# read the collected results from JSON_FILE into a DataFrame
df = pd.read_json(JSON_FILE)

# preview: first 5 rows
df.head()

# Save the file in the directory

In [None]:
# target folder on Google Drive; create it if it is missing
directory = '/content/drive/MyDrive/Colab Notebooks/Data'
os.makedirs(directory, exist_ok=True)

# full path of the output file (the name must end in .csv.gz)
filename = 'final_results_SC_sushi.csv.gz'
path = os.path.join(directory, filename)

# write df (loaded from JSON_FILE with pd.read_json) as a gzip-compressed
# CSV to save space; index=False leaves the row index out of the file
df.to_csv(path, index=False, compression='gzip')

In [None]:
# destination of the JSON export: the Drive "Data" folder path followed by
# /final_results_SC_sushi.json
json_file = '/content/drive/MyDrive/Colab Notebooks/Data/final_results_SC_sushi.json'

# save the dataframe as JSON: one record per row (orient='records'),
# one record per line (lines=True)
df.to_json(json_file, lines=True, orient='records')

In [None]:
# derive the compressed-CSV path from the JSON path by swapping the extension
# (this resolves to the same final_results_SC_sushi.csv.gz file written by
# the earlier CSV save cell)
csv_gz_file = json_file.replace('.json','.csv.gz')

# write the dataframe as a gzip-compressed CSV, leaving out the row index
df.to_csv(csv_gz_file, index=False, compression='gzip')