In [None]:
import os
import string
from collections import Counter

import openai
import pandas as pd
from tqdm import tqdm


# PARAMETERS
# Prefer the OPENAI_API_KEY environment variable so that a real key never has
# to be written into (and committed with) this notebook. The placeholder is
# kept only as a fallback for when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

# Chat models to evaluate.
MODELS = ['gpt-3.5-turbo', 'gpt-4']

# CSV file with the question-answering test set.
DATASET = 'test_sets/test_qa.csv'

# Question-only prompt templates. Every template has two positional
# placeholders that are filled as (context, question); for these two the
# context is expected to be an empty string, so only the question is visible.
PROMPT_EN = "Answer in AS FEW WORDS AS POSSIBLE. ---\n\nQuestion: {}{}\n\nAnswer: "
PROMPT_PT = "Responda com o MÍNIMO DE PALAVRAS POSSÍVEL. ---\n\nPergunta: {}{}\n\nResposta: "

# Context + question prompt templates, filled as (context, question).
# The backslash before '---' is written as an explicit '\\' escape: the
# original '\-' is an invalid escape sequence that newer Python versions warn
# about, and escaping it keeps the runtime string exactly as it was.
# NOTE(review): that backslash looks like a leftover Markdown escape - confirm
# whether it is really meant to be sent to the model.
PROMPT_EN_CONTEXT = "Answer the question in AS FEW WORDS AS POSSIBLE and based on the context below.\n\nContext: {}\n\n\\---\n\nQuestion: {}\nAnswer: "
PROMPT_PT_CONTEXT = "Responda à pergunta com o MÍNIMO DE PALAVRAS POSSÍVEL e com base no contexto abaixo.\n\nContexto: {}\n\n\\---\n\nPergunta: {}\nResposta: "

# Columns to load from DATASET.
USECOLS = [
    'id_qa', 'corpus',
    'abstract', 'abstract_translated_pt',
    'question_en_origin', 'answer_en_origin',
    'question_pt_origin', 'answer_pt_origin',
]

# Experiment registry: experiment name -> prompt template plus the dataset
# columns that provide the question, the context passage and the gold answer.
# An empty 'context' entry marks an experiment without a context passage.
EXPERIMENTS = {
    # 1) Question in English, no context passage
    '1_Qen': dict(
        prompt=PROMPT_EN,
        question='question_en_origin',
        context='',
        answer='answer_en_origin',
    ),
    # 2) Question in Portuguese, no context passage
    '2_Qpt': dict(
        prompt=PROMPT_PT,
        question='question_pt_origin',
        context='',
        answer='answer_pt_origin',
    ),
    # 3) Question in English, context passage in English
    '3_Qen_Cen': dict(
        prompt=PROMPT_EN_CONTEXT,
        question='question_en_origin',
        context='abstract',
        answer='answer_en_origin',
    ),
    # 4) Question in Portuguese, context passage in Portuguese
    '4_Qpt_Cpt': dict(
        prompt=PROMPT_PT_CONTEXT,
        question='question_pt_origin',
        context='abstract_translated_pt',
        answer='answer_pt_origin',
    ),
    # 5) Question in Portuguese, context passage in English
    '5_Qpt_Cen': dict(
        prompt=PROMPT_PT_CONTEXT,
        question='question_pt_origin',
        context='abstract',
        answer='answer_pt_origin',
    ),
}


def normalize_answer(s):
    """Return ``s`` lower-cased, stripped of ASCII punctuation and with
    every run of whitespace collapsed to a single space."""
    # Deleting via a translation table drops every string.punctuation
    # character in a single pass.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = s.lower().translate(punctuation_table)

    # split() with no arguments also trims leading/trailing whitespace.
    words = without_punctuation.split()
    return ' '.join(words)


def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted answer and the gold answer.

    Both strings are normalized with ``normalize_answer`` and split on
    whitespace. Returns 0 when the two answers share no token, otherwise
    the harmonic mean of token precision and recall.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()

    # Multiset intersection: a repeated token only counts as many times as
    # it occurs in both answers.
    overlap = Counter(predicted) & Counter(expected)
    shared = sum(overlap.values())
    if not shared:
        return 0

    precision = shared / len(predicted)
    recall = shared / len(expected)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when the prediction and the gold answer are identical
    after both have been passed through ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


# def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
#     scores_for_ground_truths = []
#     for ground_truth in ground_truths:
#         score = metric_fn(prediction, ground_truth)
#         scores_for_ground_truths.append(score)
#     return max(scores_for_ground_truths)


# def evaluate(gold_answers, predictions):
#     f1 = exact_match = total = 0

#     for ground_truths, prediction in zip(gold_answers, predictions):
#       total += 1
#       exact_match += metric_max_over_ground_truths(
#                     exact_match_score, prediction, ground_truths)
#       f1 += metric_max_over_ground_truths(
#           f1_score, prediction, ground_truths)
    
#     exact_match = 100.0 * exact_match / total
#     f1 = 100.0 * f1 / total

#     return {'exact_match': exact_match, 'f1': f1}


def chatgpt_answer(prompt, question, context='', model='gpt-3.5-turbo'):
    """Send a filled-in prompt to an OpenAI chat model and return its reply.

    Args:
        prompt: Template with two positional ``{}`` placeholders; the first
            receives ``context`` and the second receives ``question``.
        question: Question text inserted into the template.
        context: Context passage inserted into the template. ``''`` or
            ``None`` means "no context".
        model: Name of the chat model to query, e.g. 'gpt-3.5-turbo' or
            'gpt-4'.

    Returns:
        The text content of the first choice in the API response.
    """
    # Without this guard str.format would render None as the literal text
    # "None" inside the prompt.
    if context is None:
        context = ''

    # The whole filled-in prompt is sent as a single 'system' message.
    content = prompt.format(context, question)

    response = openai.ChatCompletion.create(
        model=model,
        temperature=0.0,
        max_tokens=100,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        messages=[
            {'role': 'system',
             'content': content}
        ]
    )

    return response.choices[0].message.content

In [None]:
# Load only the columns the experiments use, report the number of rows and
# preview the first ones.
df = pd.read_csv(DATASET, usecols=USECOLS)
print(df.shape[0])
df.head()

In [None]:
# Run every experiment with every model: query the model once per dataset row,
# score each reply against the gold answer and write one CSV per
# (model, experiment) pair.
for model in MODELS:
    print(f'\n\n>>> MODEL: {model}')

    for exp, config in EXPERIMENTS.items():
        print(f'>>>>>> Experiment: {exp}')

        prompt = config['prompt']
        question_col = config['question']
        context_col = config['context']
        answer_col = config['answer']

        gpt = []
        f1 = []
        em = []
        # total lets tqdm show a real progress bar; iterrows() has no length.
        for _, row in tqdm(df.iterrows(), total=len(df)):
            question = row[question_col]
            # An empty context column name marks a question-only experiment.
            # The context must come from the *context* column (the question
            # column was used here before, so the passage never reached the
            # model) and must be '' rather than None so that nothing extra is
            # rendered into the prompt.
            context = row[context_col] if context_col else ''

            ans = chatgpt_answer(prompt, question, context, model)

            gpt.append(ans)
            f1.append(f1_score(ans, row[answer_col]))
            em.append(exact_match_score(ans, row[answer_col]))

        # These columns are overwritten on every experiment, so each CSV only
        # contains the results of its own (model, experiment) pair.
        df['gpt_answers'] = gpt
        df['gpt_f1'] = f1
        df['gpt_em'] = em

        df.to_csv(f'results/experiments_chatgpt_qa/{model}_{exp}.csv', index=False)