In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
import sys

# Make the parent directory importable (the `helper` module imported below is
# presumably located there -- confirm). The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
if '..' not in sys.path:
  sys.path.append('..')

In [3]:
from helper import *
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel, OPTForCausalLM, GPT2Tokenizer
from sklearn.utils import shuffle
from torch import optim
from tqdm import tqdm
import time
import json
import csv
import random
import numpy as np
import pandas as pd

In [4]:
# Experiment configuration and the train/validation/test splits. `load_config`
# and `load_data` are not defined in this notebook (presumably they come from
# the star-import of `helper` -- confirm).
config = load_config()
train_splits, val_splits, test_splits = load_data(config)

# Aggregator name -> callable. `evaluate` runs once per entry and forwards the
# callable to `aspect_fewshot`, which hands it to `aggregate` together with
# the per-aspect score lists.
agg_fcns = {
  "min": min,
  "max": max,
  "amean": np.mean,
  "gmean": custom_gmean,
}

In [6]:
# for generating set prompts from ground-truth aspects
def few_shot_prompt(q, add_answer=False):
  """Render one query as a block of few-shot `input:` / `output:` lines.

  One "input: <aspect> \\n" + "output: " pair is produced for every key of
  `q['correctness_explanation']`, in iteration order.

  Args:
    q: mapping with a 'correctness_explanation' mapping; when `add_answer`
      is true it must also provide 'options' and 'answer', where
      `q['options'][q['answer']]` is the answer text.
    add_answer: if true, the answer text followed by " \\n" is written after
      every "output: " marker; otherwise the marker is left open.

  Returns:
    The concatenated prompt string (empty when there are no aspects).
  """
  pieces = []
  for aspect in q['correctness_explanation'].keys():
    pieces.append("input: " + str(aspect) + ' \n')
    pieces.append("output: ")
    if add_answer:
      # The same ground-truth answer is repeated for every aspect.
      pieces.append(q['options'][q['answer']] + " \n")

  return "".join(pieces)

In [5]:
def get_fewshot_NLL_score(model,tokenizer,condition,text,filler=' so I recommend ',normalize=False):
  """Score `text` as a continuation of `condition` under a causal LM.

  The string `condition + filler + text` is tokenized and fed to `model` with
  the input ids as labels. The first element of the model output is treated
  as the negative log-likelihood and negated, so a larger return value means
  the model finds the string more likely.

  NOTE(review): for Hugging Face causal LMs the loss returned with `labels`
  is, as far as I know, averaged over tokens and covers the condition tokens
  too -- confirm this is the intended scale for ranking options.

  Args:
    model: module callable as `model(input_ids, labels=...)` whose first
      output element is the loss. Inputs are moved to the device of its
      first parameter, so the model must have at least one parameter.
    tokenizer: callable as `tokenizer(str, return_tensors="pt")` returning an
      object with an `input_ids` tensor.
    condition: prompt text that precedes the candidate.
    text: candidate continuation to score.
    filler: string inserted between `condition` and `text`.
    normalize: if true, the negated loss of `condition` alone is subtracted
      from the negated loss of the full string.

  Returns:
    A scalar tensor with the (optionally normalized) negated loss.
  """
  # Follow the model instead of assuming CUDA so CPU / other devices work too.
  device = next(model.parameters()).device

  full_ids = tokenizer(condition + filler + text, return_tensors="pt").input_ids.to(device)

  with torch.no_grad():
    log_likelihood = -1 * model(full_ids, labels=full_ids)[0]
    if not normalize:
      return log_likelihood

    condition_ids = tokenizer(condition, return_tensors="pt").input_ids.to(device)
    condition_log_likelihood = -1 * model(condition_ids, labels=condition_ids)[0]

  return log_likelihood - condition_log_likelihood


In [12]:
def aspect_fewshot(dataset, model, tokenizer, normalize, prompt, agg_fcn):
  """Rank each sample's options by aggregated per-aspect few-shot scores.

  For every sample, each key of `sample["correctness_explanation"]` (an
  aspect) is appended to `prompt` as an "input: <aspect>\\n" line, and every
  option text is scored against that conditioned prompt with
  `get_fewshot_NLL_score`. The per-aspect score lists are combined with
  `aggregate(all_scores, agg_fcn)` and the highest aggregated score is the
  prediction.

  Args:
    dataset: iterable of samples; each sample is indexed with the keys
      "query_type" (mapping of type name -> flag, counted when flag == 1),
      "correctness_explanation", "options" (mapping of id -> option text)
      and "answer" (id of the correct option).
    model, tokenizer: forwarded to `get_fewshot_NLL_score`.
    normalize: forwarded to `get_fewshot_NLL_score`.
    prompt: few-shot examples prepended to every aspect line.
    agg_fcn: aggregation callable forwarded to `aggregate`.

  Returns:
    `(correct, count, type_correct, type_count)`: number of correct
    predictions, number of samples, and two dicts keyed by query type with
    the per-type correct / total counts.

  Raises:
    AssertionError: if any score is NaN.
    KeyError: if a sample flags a query type outside the seven known names.
  """
  query_types = ("Specific", "Subjective", "Commonsense", "Compound",
                 "Negated", "Analogical", "Temporal")
  type_correct = dict.fromkeys(query_types, 0)
  type_count = dict.fromkeys(query_types, 0)
  # number of correct predictions
  correct = 0
  # number of queries
  count = 0

  # loop through each query
  for sample in dataset:
    count += 1
    if count % 50 == 0:
      print('--> ',count)

    # Query types flagged on this sample; reused below when it is answered correctly.
    active_types = [key for key, flag in sample['query_type'].items() if flag == 1]
    for key in active_types:
      type_count[key] += 1

    options = sample["options"]
    options_list = list(options.values())

    all_scores = []
    for aspect in sample["correctness_explanation"].keys():
      # NOTE(review): few_shot_prompt writes examples as
      # "input: <aspect> \noutput: <answer> \n", but the query line here has
      # no "output: " marker before the option (filler='') -- confirm this
      # mismatch is intended before changing it, since it affects all scores.
      conditioned_prompt = prompt + "input: " + aspect + "\n"

      scores = []
      for option_text in options_list:
        score = get_fewshot_NLL_score(model, tokenizer, conditioned_prompt, option_text, normalize=normalize, filler='')
        assert not torch.isnan(score), 'score is nan'
        scores.append(float(score))

      all_scores.append(scores)

    agg_scores = aggregate(all_scores, agg_fcn)
    # Scores and options are shuffled together with a fixed seed before
    # picking the maximum (presumably so ties are not resolved by the
    # original option order -- confirm).
    agg_scores, options_list = shuffle(agg_scores, options_list, random_state=0)
    predicted = options_list[np.argsort(agg_scores)[-1]]

    # The prediction is compared by option text, not by option id.
    if predicted == options[sample["answer"]]:
      correct += 1
      for key in active_types:
        type_correct[key] += 1

  return correct, count, type_correct, type_count

In [9]:
def evaluate(name, model, tokenizer, prompt_size=5):
  """Run the few-shot evaluation for every aggregator and append results to CSV.

  For each entry of the module-level `agg_fcns` and each split index `i`, a
  prompt is built from the first `prompt_size` samples of `train_splits[i]`
  (answers included), `aspect_fewshot` is run on `test_splits[i]` with
  `normalize=True`, and one row of accuracies (in percent) is appended to
  "FewShot_<name>_<agg_name>_<prompt_size>.csv":

    [trial, All, Analogical, Commonsense, Compound, Negated, Specific,
     Subjective, Temporal]

  After all trials a "Total" row with the sample counts of the LAST trial is
  appended. Files are opened in append mode, so repeated runs accumulate rows.

  Args:
    name: label used in the results file name.
    model, tokenizer: forwarded to `aspect_fewshot`.
    prompt_size: number of training samples used to build each prompt.

  Returns:
    None.
  """
  type_columns = ("Analogical", "Commonsense", "Compound", "Negated",
                  "Specific", "Subjective", "Temporal")
  nan = float("nan")

  for agg_name, agg_fcn in agg_fcns.items():
    results_file = "FewShot_" + name + "_" + agg_name + "_" + str(prompt_size) + ".csv"
    # Stay None when there are no splits so the "Total" row can be skipped.
    count = None
    type_count = None

    for i in range(len(train_splits)):
      print("Trial: ", i)

      # generate prompt sample
      prompt = "".join(few_shot_prompt(train_splits[i][index], True) for index in range(prompt_size))

      print("Prompt:")
      print(prompt)
      correct, count, type_correct, type_count = aspect_fewshot(test_splits[i], model, tokenizer, normalize=True, prompt=prompt, agg_fcn=agg_fcn)

      # Percent accuracy per query type; NaN when a type has no samples in
      # this split (avoids dividing by zero).
      accuracy = {
          key: (val*100/type_count[key] if type_count[key] else nan)
          for key, val in type_correct.items()
      }
      accuracy["All"] = correct*100/count if count else nan

      # newline="" lets the csv module control line endings.
      with open(results_file, "a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([i, accuracy["All"]] + [accuracy[col] for col in type_columns])

      print("Total correct: {} out of {}".format(correct, count))

    if type_count is not None:
      with open(results_file, "a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Total", count] + [type_count[col] for col in type_columns])


In [None]:
prompt_size = config['prompt_size']

model_name = 'facebook/opt-1.3b'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = OPTForCausalLM.from_pretrained(model_name).cuda()

# Pass the configured prompt size explicitly; otherwise evaluate() falls back
# to its default of 5 and the config value is ignored.
evaluate("OPT-1.3b", model, tokenizer, prompt_size=prompt_size)

In [None]:
# The Hugging Face Hub id of the base GPT-2 checkpoint is 'gpt2' (no hyphen).
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).cuda()

# Use the configured prompt size rather than evaluate()'s default of 5.
evaluate("GPT-2", model, tokenizer, prompt_size=config['prompt_size'])