In [None]:
# Change the working directory to the parent folder (presumably the project
# root, so that `core` and the ./data/... paths used below resolve — confirm).
# NOTE(review): the path is relative, so re-running this cell moves up one
# more level each time; run it only once per kernel session.
%cd ../

In [None]:
import json
import time
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
from core import OfferDBSession

## Gold Queries

In [None]:
# Pull the offer rows through the project's OfferDBSession and wrap them in a
# DataFrame. The cells below read its RETAILER, BRAND, CATEGORIES and
# SUPER_CATEGORIES columns.
# NOTE(review): get_rows() is project code; presumably it returns one record
# per offer — confirm.
db = OfferDBSession()
offers = db.get_rows()
df = pd.DataFrame(offers)
# Preview the first rows as the cell output.
df.head()

In [None]:
# Gold retailer queries: every distinct, non-empty RETAILER value.
retailers = set(df.RETAILER.unique())
# discard() rather than remove(): remove("") raises KeyError when no offer has
# an empty retailer, which would make this cell fail on such data.
retailers.discard("")

# One 0/1 relevance vector per retailer, aligned with the rows of df.
# The set is not modified after this point, so iterating it again later yields
# the retailers in the same order as these vectors.
r_scores = [(df.RETAILER == retailer).astype(int).to_list() for retailer in retailers]

In [None]:
# Gold brand queries: every distinct BRAND value, each paired with a 0/1
# relevance vector aligned with the rows of df (1 where the offer has that brand).
brands = set(df["BRAND"].unique())
b_scores = [df["BRAND"].eq(brand).astype(int).tolist() for brand in brands]

In [None]:
# Decode each row's CATEGORIES JSON exactly once, outside the per-category
# loop, and keep it as a set so the membership tests below are cheap.
row_categories = [set(json.loads(cats)) for cats in df.CATEGORIES]

# Gold category queries: every category that appears on at least one offer.
categories = set().union(*row_categories)

# One 0/1 relevance vector per category, aligned with the rows of df
# (1 where the offer lists that category).
c_scores = [
    [1 if category in row else 0 for row in row_categories]
    for category in categories
]

In [None]:
# Decode each row's SUPER_CATEGORIES JSON exactly once, outside the
# per-category loop, and keep it as a set so the membership tests are cheap.
row_super_categories = [set(json.loads(cats)) for cats in df.SUPER_CATEGORIES]

# Gold super-category queries: every super category that appears on at least
# one offer.
super_categories = set().union(*row_super_categories)

# One 0/1 relevance vector per super category, aligned with the rows of df
# (1 where the offer lists that super category).
sc_scores = [
    [1 if category in row else 0 for row in row_super_categories]
    for category in super_categories
]

In [None]:
# Sanity check: every group needs exactly one score vector per query term, so
# the two numbers printed on each line should be equal.
for group_terms, group_scores in (
    (retailers, r_scores),
    (brands, b_scores),
    (categories, c_scores),
    (super_categories, sc_scores),
):
    print(len(group_terms), len(group_scores))

In [None]:
# Stack the four query groups into three parallel lists: the query text, its
# type label, and its 0/1 relevance vector over the offers.
query_groups = (
    ("retailer", retailers, r_scores),
    ("brand", brands, b_scores),
    ("category", categories, c_scores),
    ("super category", super_categories, sc_scores),
)
queries, types, scores = [], [], []
for type_label, group_terms, group_scores in query_groups:
    queries.extend(group_terms)
    types.extend([type_label] * len(group_terms))
    scores.extend(group_scores)

# Order the rows by type and then by query text, and renumber the index from 0.
gold_scores = (
    pd.DataFrame({'QUERY': queries, 'TYPE': types, 'SCORES': scores})
    .sort_values(['TYPE', 'QUERY'])
    .reset_index(drop=True)
)

In [None]:
# Persist the gold queries without the index column; later cells read this
# file back with pd.read_csv.
# NOTE: each SCORES value is a Python list, so it is stored as text in the CSV
# and comes back from read_csv as a string, not a list.
gold_scores.to_csv('./data/processed/true_scores_gold.csv', index=False)

## Synthetic Queries

In [None]:
# Module-level OpenAI client shared by complete() below.
# NOTE(review): no credentials are passed here, so they are presumably picked
# up from the environment — confirm before running.
client = OpenAI()

def complete(model_name, sys_prompt, user_prompt, pbar, temperature = 0, top_p = 1):
    """Request one chat completion and return its text, retrying on errors.

    Args:
        model_name: Model identifier passed to the chat completions API.
        sys_prompt: Content of the system message.
        user_prompt: Content of the user message.
        pbar: Object exposing ``write`` (the tqdm progress bar); every failed
            attempt is reported through it.
        temperature: Sampling temperature forwarded to the API.
        top_p: ``top_p`` value forwarded to the API.

    Returns:
        The message content of the first choice in the response.

    Raises:
        RuntimeError: If all attempts failed. The error from the last attempt
            is attached as ``__cause__``.
    """
    MAX_API_RETRY = 5
    last_error = None
    for attempt in range(MAX_API_RETRY):
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": sys_prompt},
                    {
                        "role": "user",
                        "content": user_prompt,
                    },
                ],
                temperature=temperature,
                top_p=top_p,
            )
            return response.choices[0].message.content
        except Exception as e:
            last_error = e
            pbar.write(f'Error: {e}')
            # Linear back-off between attempts: 5s, 10s, 15s, ...
            # Skip the wait after the final attempt, since nothing follows it
            # except the RuntimeError below.
            if attempt < MAX_API_RETRY - 1:
                time.sleep(5 * (attempt + 1))

    raise RuntimeError('Maximum retries reached\n') from last_error

def get_queries(input_type, input_, pbar):
    model_name = 'gpt-3.5-turbo'
    temperature = 0.2

    sys_prompt = 'You are a synthetic search query generator, write 10 natural language queries that a user '
    sys_prompt += f'might type or say to find relevant offers for the given {input_type.title()} name.\n'
    sys_prompt += f'4 out of 10 queries should contain possible typos of the {input_type.title()} name.\n'
    sys_prompt += 'Generate queries in json format like - {"queries": ["query1", ...]}\n'

    match input_type:
        case 'retailer':
            sys_prompt += 'For example, if the retailer is Amazon, the queries could be:\n'
            sys_prompt += '"Show me the best deals from Amazon"\n'
            sys_prompt += '"amazonn"'
        case 'brand':
            sys_prompt += 'For example, if the brand is Huggies, the queries could be:\n'
            sys_prompt += '"Show me the best deals on Huggies diapers"\n'
            sys_prompt += '"hugies"'
        case 'category', 'super category':
            sys_prompt += 'For example, if the category is Hair Care, the queries could be:\n'
            sys_prompt += '"Show me the best deals for Hair Care products"\n'
            sys_prompt += '"Har Care"'

    user_prompt = f'{input_type.title()}: {input_.lower()}\nQueries:'

    try:
        response = complete(model_name, sys_prompt, user_prompt, pbar, temperature)
        response = json.loads(response)
        return response
    except:
        print(f"Failed:\t {input_type} = {input_}")
        return {}

In [None]:
# Container for the generated queries, filled by the generation loop.
syn_queries = list()

# Read the gold terms and their type labels back from the saved CSV.
score_df = pd.read_csv('./data/processed/true_scores_gold.csv')
queries = score_df['QUERY'].tolist()
types = score_df['TYPE'].tolist()

In [None]:
# Number of synthetic queries wanted per gold term (the prompt asks for 10).
QUERIES_PER_TERM = 10

# Start from an empty list on every run so that re-executing this cell does
# not append a second copy of every term.
syn_queries = []

# Take the terms straight from score_df rather than from the `queries` /
# `types` globals, which a later cell reuses for the synthetic rows.
gold_terms = list(zip(score_df.TYPE.to_list(), score_df.QUERY.to_list()))

# The context manager closes the bar even if a request raises.
with tqdm(desc="Generating...", total=len(gold_terms), unit='query') as pbar:
    # First pass: one generation request per gold term.
    for idx, (type_, query) in enumerate(gold_terms):
        data = {'index': idx, 'term': query, 'type': type_}
        # `or []` also covers a reply such as {"queries": null}.
        data['queries'] = get_queries(type_, query, pbar).get('queries') or []
        syn_queries.append(data)
        pbar.update(1)

    # Second pass: bring every term to QUERIES_PER_TERM queries where possible.
    for query in syn_queries:
        if len(query['queries']) == QUERIES_PER_TERM:
            continue
        if len(query['queries']) < QUERIES_PER_TERM:
            # Retry once. A failed retry yields [] (never None), and the retry
            # only replaces the first answer when it actually has more queries.
            retry = get_queries(query['type'], query['term'], pbar).get('queries') or []
            if len(retry) > len(query['queries']):
                query['queries'] = retry
        if len(query['queries']) > QUERIES_PER_TERM:
            query['queries'] = query['queries'][0:QUERIES_PER_TERM]
        # Report only the terms that needed correction, with their final count.
        pbar.write(f"Count Correction: {query['index']}, {query['term']}, {query['type']}, {len(query['queries'])}")

In [None]:
# Write the generated queries (a list of dicts with 'index', 'term', 'type'
# and 'queries') to disk as JSON; the next cell reads this file back.
with open('./data/processed/syn_queries.json','w') as file:
    json.dump(syn_queries, file)

In [None]:
# Reload the generated queries from the JSON file, replacing the in-memory
# syn_queries list with the stored copy.
with open('./data/processed/syn_queries.json','r') as file:
    syn_queries = json.load(file)

In [None]:
# Gold relevance vectors keyed by (term, type). Keying on the term alone is
# ambiguous: the same text can appear under more than one type, each with its
# own vector. setdefault keeps the first row if a pair is ever duplicated.
gold_lookup = {}
for gold_term, gold_type, gold_score in zip(score_df.QUERY, score_df.TYPE, score_df.SCORES):
    gold_lookup.setdefault((gold_term, gold_type), gold_score)

# Parallel lists with one entry per generated query. Every generated query
# inherits the relevance vector of the gold term it was generated from.
terms = []
queries = []
types = []
scores = []
for query in syn_queries:
    key = (query['term'], query['type'])
    if key not in gold_lookup:
        raise KeyError(f"No gold scores found for term={query['term']!r}, type={query['type']!r}")
    score = gold_lookup[key]
    # 'queries' can be missing or null when generation failed for this term;
    # such a term simply contributes no rows.
    generated = query.get('queries') or []
    terms.extend([query['term']] * len(generated))
    queries.extend(generated)
    types.extend([query['type']] * len(generated))
    scores.extend([score] * len(generated))

In [None]:
# Assemble the synthetic rows: one row per generated query, with the gold term
# it came from (TERM), the query text, the type label and the score value
# collected for that term. Save them without the index column.
syn_scores = pd.DataFrame({'TERM':terms, 'QUERY':queries, 'TYPE':types, 'SCORES':scores})
syn_scores.to_csv('./data/processed/true_scores_syn.csv', index=False)

## Merge Queries

In [None]:
# Combine the gold and synthetic query files into one CSV with the common
# QUERY / TYPE / SCORES layout.
gold_df = pd.read_csv('./data/processed/true_scores_gold.csv')
# TERM exists only in the synthetic file, so drop it before stacking.
syn_df = pd.read_csv('./data/processed/true_scores_syn.csv').drop(columns='TERM')
scores_df = pd.concat([gold_df, syn_df], ignore_index=True).loc[:, ['QUERY', 'TYPE', 'SCORES']]
scores_df.to_csv('./data/processed/true_scores.csv', index=False)