# 01 DATA INGESTION

## Yelp Review Data

#### Yelp Fusion API Documentation

https://docs.developer.yelp.com/reference/v3_business_search

#### Use the 'business search' request to get review data for Starbucks in zip codes of interest

In [1]:
import os
from dotenv import load_dotenv
import requests
import time
import json
from box import ConfigBox
import pandas as pd

In [2]:
# Show the kernel's current working directory before it is changed.
%pwd

'/Users/chrissunderland/Desktop/starbucks_store_predictions/research'

In [3]:
# Move from the notebook folder (`research/`) up to the project root so that
# the `src.` package imports and relative config paths resolve.
# The guard makes this cell safe to re-run: an unconditional os.chdir("../")
# climbs one more directory on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the os.chdir call.
%pwd

'/Users/chrissunderland/Desktop/starbucks_store_predictions'

In [5]:
# Turn off pandas' display truncation so DataFrames render with every
# column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [6]:
# Load environment variables via python-dotenv (presumably from a local .env
# file); the Yelp key is later read with os.getenv('yelp_api_key').
load_dotenv()

True

Parameters
- location: zip code
- term: 'starbucks'
- limit: number of results to return (50 max)
- offset: 0-based index position of the first item to return (increased by `limit` to page through results)

#### Test 'business search'

In [10]:
from src.StarbucksProject import logger
from src.StarbucksProject.constants import *
from src.StarbucksProject.utils.common import read_yaml

In [12]:
# Load the project configuration. CONFIG_FILE_PATH presumably comes from the
# constants star-import; `config.data_ingestion.yelp_api_endpoint` is read
# from this object when the Yelp requests are built.
config = read_yaml(CONFIG_FILE_PATH)

[2024-07-09 18:43:50,622: INFO: common: yaml file: config/config.yaml loaded successfully]


In [13]:
# Zip codes of interest, followed by the per-zip accumulators that the search
# loop appends to (one entry per zip code, in the same order as `zip_codes`).
zip_codes = [
    80016, 80211, 80212, 80214, 80215,
    80216, 80217, 80218, 80219, 80220,
]
stores_per_zip, reviews_per_zip, weighted_avg_per_zip = [], [], []

In [14]:
# Query the Yelp business search endpoint for every zip code of interest,
# keep the Starbucks listings that are actually located in that zip code, and
# record three per-zip figures: store count, total review count and the
# review-weighted average rating.

# Request settings are the same for every call, so build them once.
API_KEY = os.getenv('yelp_api_key')
ENDPOINT = config.data_ingestion.yelp_api_endpoint
HEADERS = {"accept": "application/json",
           "Authorization": f"Bearer {API_KEY}"}

term = "starbucks"
limit = 50                     # results per request (50 is the API maximum)
TAIL_START = 25                # stop paging when results[25:] hold no match
REQUEST_TIMEOUT_SECONDS = 30   # never let a stalled connection hang the cell
PAUSE_SECONDS = 2              # pause between consecutive page requests

# Start from empty accumulators so that re-running this cell does not append
# a second set of rows for the same zip codes.
stores_per_zip, reviews_per_zip, weighted_avg_per_zip = [], [], []


def _is_store_in_zip(record, term, zip_code):
    """Return True if a (name, rating, reviews, zip, address) record is a `term` store in `zip_code`."""
    return term in record[0].lower() and record[3] == str(zip_code)


for zip_code in zip_codes:

    off = 0
    biz_list = []  # add stores to this list

    while True:

        PARAMETERS = {'location': zip_code,
                      'term': term,
                      'offset': off,
                      'limit': limit}

        response = requests.get(url=ENDPOINT,
                                params=PARAMETERS,
                                headers=HEADERS,
                                timeout=REQUEST_TIMEOUT_SECONDS)
        logger.info("API request made")
        # Surface a failed request (bad key, rate limit, ...) as an HTTPError
        # instead of a confusing KeyError on 'businesses' further down.
        response.raise_for_status()

        json_response = response.json()
        biz_data = [
            (biz['name'],
             biz['rating'],
             biz['review_count'],
             biz['location']['zip_code'],
             ",".join(biz['location']['display_address']))
            for biz in json_response['businesses']
        ]
        # filter out the Starbucks stores that aren't in the zip code of interest
        filtered_data = [rec for rec in biz_data if _is_store_in_zip(rec, term, zip_code)]

        if len(filtered_data) == 0:
            logger.info("Broke out of while loop - last request didn't return any stores in current zip code")
            break

        for shop in filtered_data:
            print("store in zip = ", shop)
            biz_list.append(shop)

        # If nothing from index TAIL_START onward is a store in this zip code,
        # do not request another page.
        if not any(_is_store_in_zip(rec, term, zip_code) for rec in biz_data[TAIL_START:]):
            print("Broke out of while loop - last 25 results didn't include a store in current zip code")
            break

        off += limit  # make another HTTP request for the same zip code

        time.sleep(PAUSE_SECONDS)

    if not biz_list:
        # Explicit empty check instead of relying on an exception raised while
        # summarising an empty frame; genuine errors are no longer hidden.
        logger.info("Current zip code didn't contain any Starbucks locations")
        stores_per_zip.append(0)
        reviews_per_zip.append(0)
        weighted_avg_per_zip.append(0)
    else:
        # summarize Yelp review history to assess how Starbucks has performed in the zip code
        df = pd.DataFrame(biz_list, columns=['store_name', 'avg_rating', 'reviews', 'zip', 'address'])
        # Running totals per address; the last row of each address then holds
        # the combined figures for all listings that share that address.
        df['store_total_reviews'] = df.groupby('address')['reviews'].cumsum()
        df['store_weight'] = df['avg_rating'] * df['reviews']
        df['store_total_weight'] = df.groupby('address')['store_weight'].cumsum()
        df['avg_rating_weighted'] = round(df['store_total_weight'] / df['store_total_reviews'], 2)
        df = df.drop_duplicates(subset=['address'], keep='last')
        df = df.drop(columns=['avg_rating', 'reviews', 'address', 'store_weight', 'store_total_weight'])
        df['zip_average_contribution'] = round(df['store_total_reviews'] * df['avg_rating_weighted'], 2)

        total_reviews = df['store_total_reviews'].sum()
        stores_per_zip.append(df.shape[0])
        reviews_per_zip.append(total_reviews)
        weighted_avg_per_zip.append(round(df['zip_average_contribution'].sum() / total_reviews, 3))

    print("\n")

[2024-07-09 18:45:04,951: INFO: 4177544235: API request made]
store in zip =  ('Starbucks', 3.3, 13, '80016', '6100 S Gun Club Rd,Safeway,Aurora, CO 80016')
store in zip =  ('Starbucks', 2.9, 49, '80016', '7500 S Gartrell Rd,Ste 101,Aurora, CO 80016')
store in zip =  ('Starbucks', 2.8, 10, '80016', '7400 S Gartrell Rd,Target,Aurora, CO 80016')
store in zip =  ('Starbucks', 2.0, 1, '80016', '25455 E Smoky Hill Rd,Aurora, CO 80016')
store in zip =  ('STARBUCKS COFFEE', 2.4, 16, '80016', '25701 E Smoky Hill Rd,Aurora, CO 80016')
store in zip =  ('Starbucks', 3.3, 40, '80016', '15795 E Arapahoe Rd,Centennial, CO 80016')
Broke out of while loop - last 25 results didn't include a store in current zip code


[2024-07-09 18:45:05,776: INFO: 4177544235: API request made]
store in zip =  ('Starbucks', 5.0, 1, '80211', '2660 Federal Blvd,Safeway,Denver, CO 80211')
store in zip =  ('Starbucks', 2.5, 44, '80211', '2990 W 44th Ave,Denver, CO 80211')
store in zip =  ('Starbucks', 1.7, 6, '80211', '38

In [15]:
# One (zip, store count, review count, weighted average rating) tuple per
# zip code, built by pairing the parallel per-zip lists position by position.
starbucks_data = [
    row
    for row in zip(zip_codes, stores_per_zip, reviews_per_zip, weighted_avg_per_zip)
]

In [16]:
# Tabulate the per-zip tuples, one row per zip code.
starbucks_data_df = pd.DataFrame(
    data=starbucks_data,
    columns=['zip', 'total_stores', 'total_reviews', 'review_weighted_avg'],
)

In [17]:
# Display the per-zip summary table.
starbucks_data_df 

Unnamed: 0,zip,total_stores,total_reviews,review_weighted_avg
0,80016,6,129,2.988
1,80211,4,63,2.463
2,80212,2,44,3.489
3,80214,3,80,3.746
4,80215,0,0,0.0
5,80216,2,119,2.392
6,80217,0,0,0.0
7,80218,2,14,3.279
8,80219,1,30,1.7
9,80220,4,39,2.572
