# Extract, Transform, Load (ETL)
---
The purpose of this Jupyter Notebook is to extract data and storing it in a SQLite database within this package. The codes below show the steps in this process:
- Data pulling <i>['Starbucks', 'Dunkin' Donuts', 'Think Coffee', 'Joe Coffee', 'Gregorys Coffee', 'Birch Coffee']</i> located around NYC
    - <b>NYC location -</b> longitude: -73.99429321289062, latitude: 40.70544486444615
    - Includes these areas: Brooklyn, the Bronx, Manhattan, Queens and Staten Island
    - Using search radius of 40000 (meters) or ~25 miles
- ETL - for only needed data that will be used to store in the database
    - Remove duplicates based on ID (due to overlapping results from multiple search)
    - Remove duplicates based on yelp returned info where coordinates are the same while having unique IDs) by checking for coordinates and name of the shop
- Configure database
    - Delete (if exists) and create a new database coffee_chains.sqlite and the table
    - Load data into the database

#### Note that this is not subjected to only coffee finding but can be any other search terms

In [1]:
# Import dependencies
from api_key import api_key
from jsonschema import validate
import json
import requests
import pandas as pd

In [2]:
# Set up Yelp API constants
API_HOST = 'https://api.yelp.com/v3/businesses/search'
HEADERS = {
    'Authorization': 'bearer %s' % api_key
}

# Schema for comparing in checking before extraction
schema = {
    'alias': '',
    'categories': [],
    'coordinates': {},
    'display_phone': '',
    'distance': 0.00,
    'id': 'string',
    'image_url': '',
    'is_closed': True,
    'location': {},
    'name': '',
    'phone': '',
    'price': '',
    'rating': 0.0,
    'review_count': 0,
    'transaction': [],
    'url': ''
}

### Custom functions for the ETL steps

In [3]:
# Simple request function for bussiness search endpoint from Yelp API
def request(term = '', loc = '', offsets = 200, rad = 10000):
    data = []
    
    for offset in range(0, offsets, 50):
        params = {
            'term': term.replace(' ', '+'),
            'location': loc.replace(' ', '+'),
            'limit': 50,
            'offset': offset,
            'radius': rad
        }

        # Send the request
        response = requests.get(API_HOST, headers = HEADERS, params = params)

        # Verify the response and return None if error returned else return the json data
        if response.status_code == 200:            
            data += response.json()['businesses']
        else:
            return None

    return data

# Function to verify the predefined schema on what we should be expecting before extracting
def verify_schema(data = None):
    
    # Verify the object entered before extraction
    try:
        validate(instance=data, schema=schema)
        return True
    except:
        return False

# Return the json to DF for cleanining
def json_to_dataframe(data = None):
    try:
        if not verify_schema(data[0]):
            return pd.DataFrame({'error': ["{'error': 'SCHEMA VALIDATION ERROR'}"]})
        else:
            return pd.DataFrame(data)
    except TypeError:
        return pd.DataFrame({'error': ["{'error': 'OBJECT INPUT ERROR'}"]})

# Function to extract the id, name, price, rating, review_count, location (address 1, address 2, address 3, city, 
# state, zip_code), coordinates (latitude and longtitude), and phone into a DataFrame
def cleaned_yelp_dataframe(df, name, filter_by_arr = ['']):
    # Create a copy of the df to work with
    clean_df = df.copy()
    
    # Sometimes random result return not matching the criteria, filter those out by the name
    key_search = [name.lower()]
    clean_df['name'] = clean_df.name.apply(lambda x: x.lower())
    clean_df = clean_df.loc[clean_df['name'].isin(key_search)]
    
    # Normalizing the coordinates and location columns with nested dictionary
    clean_df[['latitude', 'longitude']] = pd.json_normalize(clean_df['coordinates'])
    clean_df[[
        'address1', 
        'address2', 
        'address3', 
        'city', 
        'zip', 
        'country', 
        'state', 
        'display_address'
    ]] = pd.json_normalize(clean_df['location'])
    
    # Drop off the columns no longer needed
    clean_df = clean_df.drop(columns = [
        'alias', 'is_closed', 'categories', 'coordinates', 'transactions', 'location',
        'phone', 'distance', 'display_address'        
    ])
    
    # Add price point column, fill na, and convert to int
    clean_df['price'] = clean_df['price'].fillna('')
    clean_df['price_point'] = clean_df['price'].str.len()
    clean_df['price_point'] = clean_df['price_point'].fillna(0)
    clean_df['price_point'] = clean_df['price_point'].astype('int')
    
    # Reorganize the df for easy viewing
    clean_df = clean_df[[
        'id', 'name', 'review_count', 'rating', 'price', 'price_point', 'display_phone', 'url', 'image_url', 'address1', 
        'address2', 'address3', 'city', 'state', 'zip', 'country', 'latitude', 'longitude'
    ]]
    
    # Filter out just stores found within NYC
    key_search = filter_by
    clean_df = clean_df.loc[clean_df['city'].isin(key_search)]
    
    # Replace any "None" values from address2 and address3 to ""
    clean_df['address2'] = clean_df['address2'].fillna('')
    clean_df['address3'] = clean_df['address3'].fillna('')
    
    # Proper casing for the name and city
    clean_df['name'] = clean_df['name'].str.title()

    # Take only rows that do not have nan for coordinates
    clean_df = clean_df[clean_df['latitude'].notna()]

    return clean_df

### Perform the ETL into the database

In [4]:
# Using for loop to pull the requests, clean, and then combine to export out into one single json file and add to database
search_locations = ['nyc', 'bronx, NY', 'queens, NY', 'staten island, NY', 'brooklyn, NY']
search_terms = ['Starbucks', 'Dunkin\' Donuts', 'Tim Hortons', 'Think Coffee', 'Joe Coffee', 'Gregorys Coffee', 'Birch Coffee']
filter_by = ['New York', 'Brooklyn', 'Bronx', 'Manhattan', 'Queens', 'Staten Island']
dfs = []

# Loop through to append the output cleaned df of each coffee chain for the merging using the defined functions from above
for loc in search_locations:
    for term in search_terms:
        response_data = request(term, loc, offsets = 200)
        response_data = json_to_dataframe(response_data)
        cleaned_response_data = cleaned_yelp_dataframe(response_data, term, filter_by)
        dfs.append(cleaned_response_data)

In [5]:
# Now perform the merge
df_merged = pd.concat(dfs)

In [6]:
# Count how many before dropping duplicates and count duplicates
df_merged.shape[0]

752

In [7]:
# Check for duplicates
df_merged.groupby(df_merged.columns.tolist(),as_index=False).size()

Unnamed: 0,id,name,review_count,rating,price,price_point,display_phone,url,image_url,address1,address2,address3,city,state,zip,country,latitude,longitude,size
0,-2kUyfQrEBxYXfNKfDwAJQ,Starbucks,109,3.0,$$,2,(718) 369-3098,https://www.yelp.com/biz/starbucks-brooklyn-3?...,https://s3-media1.fl.yelpcdn.com/bphoto/hlpn0z...,166 7th Ave,,,Brooklyn,NY,11215,US,40.672042,-73.977554,1
1,-2kUyfQrEBxYXfNKfDwAJQ,Starbucks,109,3.0,$$,2,(718) 369-3098,https://www.yelp.com/biz/starbucks-brooklyn-3?...,https://s3-media1.fl.yelpcdn.com/bphoto/hlpn0z...,66 Nassau Ave,,,Brooklyn,NY,11222,US,40.723415,-73.951434,1
2,-7I1o35D-9DCJz-skdlFUA,Starbucks,75,3.0,$$,2,(718) 459-6923,https://www.yelp.com/biz/starbucks-forest-hill...,https://s3-media4.fl.yelpcdn.com/bphoto/CAKpKm...,776 Avenue Of The Americas,The Capitol At Chelsea,,New York,NY,10001,US,40.745069,-73.990953,1
3,-OmMU5ZjOKlB9K6kSCFv5A,Starbucks,68,3.0,$$,2,(212) 688-9118,https://www.yelp.com/biz/starbucks-new-york-32...,https://s3-media3.fl.yelpcdn.com/bphoto/sqoPSN...,245 E 80th St,,,New York,NY,10075,US,40.774338,-73.954752,1
4,-UKGH0bcyLNMmTeFlx0qdA,Starbucks,20,3.5,$$,2,(212) 974-0902,https://www.yelp.com/biz/starbucks-new-york-62...,https://s3-media4.fl.yelpcdn.com/bphoto/3hTBMH...,240 Central Park S,,,New York,NY,10019,US,40.767073,-73.981052,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583,zfOaYbyx5RAa91Qw_ThsPQ,Starbucks,11,2.5,$$,2,(646) 647-5558,https://www.yelp.com/biz/starbucks-new-york-51...,https://s3-media2.fl.yelpcdn.com/bphoto/3jsaqc...,969 1st Ave,,,New York,NY,10022,US,40.756047,-73.964801,1
584,zkpRCCgdlTiMS0i10t5x6w,Starbucks,89,2.5,$$,2,(201) 222-2491,https://www.yelp.com/biz/starbucks-hoboken-3?a...,https://s3-media3.fl.yelpcdn.com/bphoto/LYpSkk...,462 7th Ave,,,New York,NY,10018,US,40.752105,-73.990306,1
585,zt4hC6ZDY5Kdy8AfGDi34Q,Tim Hortons,106,3.5,$,1,(718) 370-2555,https://www.yelp.com/biz/tim-hortons-staten-is...,https://s3-media1.fl.yelpcdn.com/bphoto/MeFLRd...,1700 Richmond Ave,,,Staten Island,NY,10314,US,40.606854,-74.162811,2
586,zwvFJHtRw5L0a_OVn7cd0Q,Starbucks,46,2.5,$$,2,(929) 214-8871,https://www.yelp.com/biz/starbucks-brooklyn-49...,https://s3-media2.fl.yelpcdn.com/bphoto/Wl8WeJ...,315 Seventh Ave,,,New York,NY,10001,US,40.746962,-73.993224,1


In [8]:
# Drop any duplicates based on id since we may have them overlapping based on the pull from multiple NYC areas
df_merged = df_merged.drop_duplicates(subset=['id'])

# Drop again for duplicated shops based on same coordinates even though they have unique IDs by using shop name, latitude, and
# longtitude
df_merged = df_merged.drop_duplicates(subset=['name', 'latitude', 'longitude'])

In [9]:
# Check for duplicates
df_merged.groupby(df_merged.columns.tolist(),as_index=False).size()

Unnamed: 0,id,name,review_count,rating,price,price_point,display_phone,url,image_url,address1,address2,address3,city,state,zip,country,latitude,longitude,size
0,-2kUyfQrEBxYXfNKfDwAJQ,Starbucks,109,3.0,$$,2,(718) 369-3098,https://www.yelp.com/biz/starbucks-brooklyn-3?...,https://s3-media1.fl.yelpcdn.com/bphoto/hlpn0z...,66 Nassau Ave,,,Brooklyn,NY,11222,US,40.723415,-73.951434,1
1,-UKGH0bcyLNMmTeFlx0qdA,Starbucks,20,3.5,$$,2,(212) 974-0902,https://www.yelp.com/biz/starbucks-new-york-62...,https://s3-media4.fl.yelpcdn.com/bphoto/3hTBMH...,240 Central Park S,,,New York,NY,10019,US,40.767073,-73.981052,1
2,01cAFjVfryumdxZCs-QG-A,Starbucks,52,3.5,$,1,(646) 613-0148,https://www.yelp.com/biz/starbucks-new-york-21...,https://s3-media2.fl.yelpcdn.com/bphoto/qG1CAL...,405 Broadway,,,New York,NY,10013,US,40.719076,-74.002624,1
3,02mNOAlJEyPG-3zgHBgWoQ,Gregorys Coffee,175,3.5,$$,2,(877) 208-1928,https://www.yelp.com/biz/gregorys-coffee-new-y...,https://s3-media4.fl.yelpcdn.com/bphoto/epeEip...,520 8th Ave,,,New York,NY,10018,US,40.753613,-73.992042,1
4,05aAwdHqE7Nd_TqzB-FZdQ,Starbucks,25,3.0,$$,2,(212) 249-2473,https://www.yelp.com/biz/starbucks-new-york-57...,https://s3-media2.fl.yelpcdn.com/bphoto/Eh7BQC...,1445 1st Ave,,,New York,NY,10021,US,40.769992,-73.954654,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,zONp4aFgz6qo2bCsrkOIlw,Starbucks,3,4.0,,0,(929) 558-1500,https://www.yelp.com/biz/starbucks-new-york-61...,https://s3-media4.fl.yelpcdn.com/bphoto/sxK_oM...,4 Columbus Cir,,,New York,NY,10019,US,40.767544,-73.983013,1
309,zfOaYbyx5RAa91Qw_ThsPQ,Starbucks,11,2.5,$$,2,(646) 647-5558,https://www.yelp.com/biz/starbucks-new-york-51...,https://s3-media2.fl.yelpcdn.com/bphoto/3jsaqc...,969 1st Ave,,,New York,NY,10022,US,40.756047,-73.964801,1
310,zkpRCCgdlTiMS0i10t5x6w,Starbucks,89,2.5,$$,2,(201) 222-2491,https://www.yelp.com/biz/starbucks-hoboken-3?a...,https://s3-media3.fl.yelpcdn.com/bphoto/LYpSkk...,462 7th Ave,,,New York,NY,10018,US,40.752105,-73.990306,1
311,zt4hC6ZDY5Kdy8AfGDi34Q,Tim Hortons,106,3.5,$,1,(718) 370-2555,https://www.yelp.com/biz/tim-hortons-staten-is...,https://s3-media1.fl.yelpcdn.com/bphoto/MeFLRd...,1700 Richmond Ave,,,Staten Island,NY,10314,US,40.606854,-74.162811,1


In [10]:
# Count how many after dropping duplicates
df_merged.shape[0]

313

In [11]:
# Print out the dtypes for the DF
df_merged.dtypes

id                object
name              object
review_count       int64
rating           float64
price             object
price_point        int32
display_phone     object
url               object
image_url         object
address1          object
address2          object
address3          object
city              object
state             object
zip               object
country           object
latitude         float64
longitude        float64
dtype: object

In [12]:
# Export out into csv and json in addition to adding it to a SQLite database
df_merged.to_csv('../static/dataset/merged.csv', index = False)
df_merged.to_json('../static/dataset/merged.json', orient = 'records', indent = 4)

In [13]:
# Import dependencies for handling the database
from os import path, remove
from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session

In [14]:
# Setup the db path
db_path = '../coffee_chains.sqlite'

# Delete the existing database if it exists
if path.exists(db_path):
    remove(db_path)

In [15]:
# Setup the engine and connect the database
engine = create_engine(f'sqlite:///{db_path}')
conn = engine.connect()

In [17]:
# Append the DF to the shop table created
df_merged.to_sql(name = 'shops', con = engine, if_exists = 'replace', index = False)

313

In [18]:
# Quick check to see if things got appended correctly
session = Session(bind = engine)

In [19]:
session.execute(text('SELECT * FROM shops')).fetchone()

('qcnoyytlFIuqlcjDXkXJiw', 'Starbucks', 80, 2.5, '$$', 2, '(718) 855-0856', 'https://www.yelp.com/biz/starbucks-brooklyn-21?adjust_creative=0wbs0iSeYd0wJkiEuTz28g&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=0wbs0iSeYd0wJkiEuTz28g', 'https://s3-media2.fl.yelpcdn.com/bphoto/rvbmfbKSWW4G4RmDu018hA/o.jpg', '67 Main St', '', '', 'Brooklyn', 'NY', '11201', 'US', 40.70275444, -73.99088374)

In [20]:
# Close out of the session and engine
session.close()
engine.dispose()