In [2]:
import pandas as pd
from pandas import json_normalize
import requests
import json
import csv
import os
import re
from quant_preprocess import query_and_preprocess_data
from quant_preprocess import query_and_reshape_long
from quant_preprocess import recode_long_data


# Unit tokens looked for inside product-title words. Order matters: the title
# parser stops at the first token in this list that matches a word.
measurements = "oz ml lb l ounces g".split()

# Characters that may make up a numeric amount in a product title.
numbers = list("1234567890.")

# Text file read line by line as JSON documents by read_txt_as_json().
IN_FILE_PATH = os.path.join("..//data", "items.txt")

# Output location for the ingredient price table.
OUTPUT_DIR = "data"
OUTPUT_PATH = os.path.join(OUTPUT_DIR, 'ingredient_prices.csv')

In [3]:
def load_ingredients():
    """Return the ingredient names present in the recoded long-format data.

    The data comes from ``query_and_reshape_long()`` passed through
    ``recode_long_data()``. The "amount" column is grouped by "ingredient"
    and aggregated (mean and sum), but only the group labels are returned;
    the aggregated numbers themselves are discarded.

    Returns
    -------
    list
        The group labels of the "ingredient" column, one entry per group.
    """
    long_frame = recode_long_data(query_and_reshape_long())
    per_ingredient = (
        long_frame[["ingredient", "amount"]]
        .groupby("ingredient")
        .agg(["mean", "sum"])
    )
    return per_ingredient.index.values.tolist()

In [4]:
def read_txt_as_json(path=None):
    """Parse a text file holding one JSON document per line.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. When omitted, the module-level ``IN_FILE_PATH`` is used,
        which is what the original zero-argument call did.

    Returns
    -------
    list
        The decoded object of every non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if path is None:
        path = IN_FILE_PATH
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(path, encoding="utf-8") as text:
        # Blank / whitespace-only lines (e.g. a stray empty line between
        # records) carry no document and would make json.loads raise.
        return [json.loads(line) for line in text if line.strip()]
       

In [5]:
def split(word):
    """Return the items of ``word`` (its characters, for a string) as a list."""
    return list(word)

In [6]:
 def get_products_best_seller():
     """Collect price, amount, unit and title lists for every search term.

     Each record returned by ``read_txt_as_json()`` contributes one key (its
     ``request_parameters.search_term``). A record without "search_results"
     leaves the key's value untouched (an empty list if the term is new).
     Otherwise the first five results are scanned and four lists are appended
     to the key's value, in this order: prices, amounts, units, titles. A
     search term that occurs in several records accumulates four more lists
     per record.

     How amount and unit are read from a title:

     * amount -- the title is scanned from its last character backwards; the
       first run of characters found in ``numbers`` is collected and stored
       as a string (possibly empty).
     * unit -- the whitespace-separated words are scanned from the last one
       backwards; the first entry of ``measurements`` that is a substring of
       a lower-cased word is stored (empty string if none matches).

     Whenever either scan reaches position 0 of the title without having
     stopped earlier, an additional "NA" entry is appended (for units: one
     "NA" per measurement tried at word 0 before a match). The affected list
     then has more entries than there are products; ``drop_bad_products``
     checks the list lengths.

     Returns
     -------
     dict
         Search term -> list as described above.
     """
     digit_chars = ''.join(numbers)
     product_dict = {}
     for record in read_txt_as_json():
         prices, amounts, units, titles = [], [], [], []
         term = record["request_parameters"]["search_term"]
         product_dict.setdefault(term, [])
         if "search_results" not in record:
             continue

         for hit in record["search_results"][0:5]:
             title = hit["product"]["title"]
             letters = split(title)
             words = title.split()
             prices.append(hit["offers"]["primary"]["price"])
             titles.append(title)

             # Amount: walk the title backwards collecting the last numeric run.
             digits = []
             for pos in range(len(letters) - 1, -1, -1):
                 symbol = letters[pos]
                 numeric = symbol in digit_chars
                 if digits and not numeric:
                     # The numeric run has ended.
                     break
                 if pos == 0:
                     # Reached the start of the title without stopping.
                     amounts.append("NA")
                 if numeric:
                     digits.extend(symbol for known in numbers if known == symbol)
             # Digits were gathered right-to-left, so flip them back.
             amounts.append(''.join(digits)[::-1])

             # Unit: walk the words backwards; the first matching token wins.
             matched = []
             for idx in range(len(words) - 1, -1, -1):
                 for unit in measurements:
                     if matched:
                         break
                     if idx == 0:
                         units.append("NA")
                     if unit in words[idx].lower():
                         matched.append(unit)
             units.append(''.join(matched))

         product_dict[term].extend([prices, amounts, units, titles])
     return product_dict


In [7]:
def drop_bad_products(product_dict, ingredients=None):
    """Remove ingredients whose product lists are missing or malformed.

    An ingredient is removed when its entry is empty, holds fewer than three
    lists, or any of its first three lists (price, amount, units) does not
    contain exactly five items -- five being the number of search results
    kept per term by ``get_products_best_seller`` (``results[0:5]``).

    Parameters
    ----------
    product_dict : dict
        Search term -> list of per-product lists. Modified in place.
    ingredients : iterable, optional
        Names to check. Defaults to ``load_ingredients()``, which is what the
        original one-argument call used. Keys of ``product_dict`` that are not
        among these names are left untouched.

    Returns
    -------
    dict
        The same ``product_dict`` object, after the deletions.
    """
    expected_len = 5
    if ingredients is None:
        ingredients = load_ingredients()

    for name in ingredients:
        if name not in product_dict:
            # No search was recorded for this ingredient (or it was already
            # removed); indexing would raise KeyError.
            continue
        columns = product_dict[name]
        malformed = len(columns) < 3 or any(
            len(column) != expected_len for column in columns[:3]
        )
        if malformed:
            del product_dict[name]
    return product_dict
        


In [8]:
def set_index(dictionary):
    """Turn the product dictionary into a DataFrame with labelled rows.

    Each key of ``dictionary`` becomes a column. The four rows are labelled
    'price', 'measurement', 'units' and 'description' (in that order), and
    the row index is named 'index'.
    """
    row_labels = ['price', 'measurement', 'units', 'description']
    labelled = pd.DataFrame(dictionary).assign(index=row_labels)
    return labelled.set_index('index')

In [9]:
def load_best_match_batch():
    """List the ingredients that did not survive the best-seller pipeline.

    Runs the pipeline (``get_products_best_seller`` -> ``drop_bad_products``
    -> ``set_index`` -> ``drop_wrong_item``) and returns every name from
    ``load_ingredients()`` that is not a column of the resulting frame.

    Returns
    -------
    list
        The leftover ingredient names, without duplicates, in the order
        ``load_ingredients()`` produced them. The order is kept deterministic
        on purpose: converting a set difference to a list yields an order that
        can change between interpreter runs (string hash randomisation), which
        makes notebook output non-reproducible.
    """
    all_ingredients = load_ingredients()
    rough_dict = get_products_best_seller()
    best_seller_dict = drop_bad_products(rough_dict)
    priced = drop_wrong_item(set_index(best_seller_dict))
    already_priced = set(priced.columns.values.tolist())
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [
        name for name in dict.fromkeys(all_ingredients)
        if name not in already_priced
    ]

In [10]:
# Columns (search terms) removed by drop_wrong_item().
# NOTE(review): presumably these are terms whose top search hits did not match
# the intended ingredient -- confirm with the author.
_WRONG_ITEMS = (
    "tonic water",
    "butter",
    "cornstarch",
    "cognac",
    "flavored rum",
    "flavored vodka",
    "fruit",
    "ice cream",
    "fruit juice",
    "grain alcohol",
    "hot sauce",
    "milk",
    "whiskey",
    "sugar",
    "spice",
    "soda",
    "sherry",
    "prosecco",
    "olive brine",
    "nut",
    "mix",
    "herb",
    "sarsaparilla",
    "ice",
    "dry vermouth",
    "water",
    "erin cream",
)


def drop_wrong_item(df):
    """Remove the hand-picked "wrong item" columns from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one column per search term. Modified in place, as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without the listed columns.

    Notes
    -----
    Listed columns that are not present are skipped instead of raising
    ``KeyError``. This matters when an upstream step has already removed one
    of them, and it makes re-running the cell on the same frame safe.
    """
    df.drop(columns=list(_WRONG_ITEMS), inplace=True, errors="ignore")
    return df

In [13]:
def convert_prices(data):
    """Compute the average price per fluid ounce for every ingredient column.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per ingredient. Rows are read by position: row 0 holds a
        list of prices, row 1 a list of amounts and row 2 a list of unit
        strings, aligned product by product.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with one column per ingredient holding the
        mean price per ounce over the products that could be converted. The
        value is NaN when no product of that ingredient could be converted.

    Raises
    ------
    ValueError
        If a price or amount cannot be converted with ``float()``.

    Notes
    -----
    Only "oz"/"ounces", "ml" and "l" are converted; products with any other
    unit are left out of the average. A litre amount above 2 is also left out
    -- NOTE(review): presumably because such an "l" is a false unit match
    (e.g. 750 is really millilitres); confirm. Products with an amount of 0
    are skipped as well, since they cannot be priced per ounce.
    """
    oz_per_ml = 0.033814
    oz_per_l = 33.814
    max_litres = 2

    averages = {}
    for ingredient in data.columns.values.tolist():
        raw_prices = data.iloc[0][ingredient]
        raw_amounts = data.iloc[1][ingredient]
        raw_units = data.iloc[2][ingredient]

        per_ounce = []
        for n in range(len(raw_prices)):
            price = float(raw_prices[n])
            amount = float(raw_amounts[n])
            unit = raw_units[n]

            if unit in ("ounces", "oz"):
                ounces = amount
            elif unit == "ml":
                ounces = amount * oz_per_ml
            elif unit == "l" and amount <= max_litres:
                ounces = amount * oz_per_l
            else:
                # Unit we do not convert (or an implausible litre amount).
                continue

            if ounces == 0:
                # Would divide by zero; the product carries no usable size.
                continue
            per_ounce.append(price / ounces)

        # An ingredient with nothing convertible used to raise
        # ZeroDivisionError here; report it as missing instead.
        if per_ounce:
            averages[ingredient] = sum(per_ounce) / len(per_ounce)
        else:
            averages[ingredient] = float("nan")

    return pd.DataFrame(averages, index=[0])
        
        


Unnamed: 0,beer,bitters,carbonated water,champagne,cherry heering,club soda,egg,food coloring,red wine,soy sauce,sweet and sour,wine
0,1.706667,1.7475,0.082061,0.326377,0.244617,0.020355,0.795601,1.426459,0.303293,0.15645,0.807044,0.287948


In [12]:
# Build the per-ingredient product table step by step; the bare `data`
# expression at the end displays the resulting frame.
d = get_products_best_seller()
d1 = drop_bad_products(d)  # deletes entries from `d` itself and returns that same dict
df = set_index(d1)  # rows: price, measurement, units, description
data = drop_wrong_item(df)  # drops columns from `df` in place and returns it
data

Unnamed: 0_level_0,beer,bitters,carbonated water,champagne,cherry heering,club soda,egg,food coloring,red wine,soy sauce,sweet and sour,wine
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
price,"[19.98, 18.98, 19.98, 24.48, 18.98]","[7.88, 7.88, 7.88, 17.28, 22.95]","[0.7, 0.7, 0.7, 3.88, 0.84]","[12.99, 6.48, 13.48, 6.48, 7.98]","[0.7, 0.7, 3.88, 6.42, 3.88]","[0.7, 0.7, 0.7, 0.7, 0.64]","[3.88, 6.68, 21.98, 21.98, 20.36]","[3.88, 3.88, 11.82, 1.5, 8.98]","[2.96, 11.73, 13.98, 12.27, 9.48]","[1.58, 2.98, 1.58, 2.98, 6.97]","[2.82, 2.48, 3.38, 19.63, 19.18]","[2.96, 12.27, 13.98, 18.73, 12.27]"
measurement,"[12, 12, 12, 12, 12]","[4, 4, 4, 16, 32]","[33.8, 33.8, 33.8, 12, 1]","[2, 750, 750, 750, 750]","[33.8, 33.8, 12, 12, 12]","[33.8, 33.8, 33.8, 33.8, 33.8]","[16, 7, 32, 32, 13]","[2.7, 2.7, 4, 8.3, 8]","[750, 750, 750, 1.5, 750]","[15, 15, 15, 15, 40]","[14.8, 12, 1, 6, 72]","[750, 1.5, 750, 5, 1.5]"
units,"[oz, oz, oz, oz, oz]","[oz, oz, oz, oz, g]","[oz, oz, oz, oz, l]","[l, ml, ml, l, l]","[oz, oz, oz, oz, oz]","[oz, oz, oz, oz, oz]","[oz, g, oz, oz, oz]","[ounces, ounces, oz, oz, oz]","[ml, l, ml, l, l]","[oz, oz, oz, oz, oz]","[oz, oz, l, oz, oz]","[ml, l, ml, l, l]"
description,"[Miller Lite Light Lager Beer 4.2% ABV, 24-pac...","[ANGOSTURA Aromatic Bitters, 4 FL OZ, ANGOSTUR...","[Clear American Peach Sparkling Water, 33.8 fl...",[Gift Box - Romanee (x2) Music Collection Reu...,"[Clear American Black Cherry Sparkling Water, ...","[Clear American Peach Sparkling Water, 33.8 fl...","[Old South Egg Pickled 16 Oz, Cajun Classics 7...","[Great Value Gel Food Colors, Classic Colors, ...",[Oak Leaf Vineyards Cabernet Sauvignon Red Wi...,"[Great Value Less Sodium Soy Sauce, 15 fl oz, ...",[La Choy Sweet and Sour Stir Fry Sauce & Marin...,[Oak Leaf Vineyards Cabernet Sauvignon Red Wi...
