In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel, OPTForCausalLM, GPT2Tokenizer
from sklearn.utils import shuffle
from torch import optim
from tqdm import tqdm
import torch
import time
import json

In [None]:
# Alternative checkpoints that were tried; swap one in (with its matching classes) as needed.
# tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
# model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt")

# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2").cuda()

# tokenizer = GPT2Tokenizer.from_pretrained("microsoft/DialoGPT-large")
# model = GPT2LMHeadModel.from_pretrained("microsoft/DialoGPT-large").cuda()

# Single place to change the checkpoint used for both the tokenizer and the model.
MODEL_NAME = "facebook/opt-1.3b"
# Use the GPU when one is visible, otherwise load on CPU instead of failing in .cuda().
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = OPTForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)


In [None]:
# Location of the QA dataset, relative to the notebook's working directory.
data_path = "../data/500QA.json"
# Pin the encoding so the load does not depend on the platform's default locale.
with open(data_path, encoding="utf-8") as f:
  # Later cells take len() of `data` and slice it, so it is presumably a list of
  # query dicts -- confirm against the file.
  data = json.load(f)

In [None]:
def few_shot_prompt(q, add_answer=False):
  """Render one query as an "input: ... / output: ..." prompt fragment.

  Args:
    q: mapping with a "query" string; when `add_answer` is true it must also
      provide "options" (indexable by key) and "answer" (the key of the
      correct option).
    add_answer: when true, the text of the correct option followed by " \n"
      is appended after "output: "; otherwise the fragment ends right after
      "output: " so a model can complete it.

  Returns:
    The prompt fragment as a single string.
  """
  pieces = ["input: ", q["query"], " \n", "output: "]
  if add_answer:
    gold_key = q['answer']
    pieces.extend([q['options'][gold_key], " \n"])
  return "".join(pieces)

In [None]:
def get_fewshot_NLL_score(model,tokenizer,condition,text,filler=' so I recommend ',normalize=False):
  """Score `text` as a continuation of `condition` with the model's LM loss.

  The strings are joined as ``condition + filler + text``, tokenized, and fed to
  ``model`` with the input ids as labels. The negated loss is returned, so a
  larger score means the model finds the sequence more likely.

  NOTE(review): the labels cover the whole sequence, condition tokens included,
  so the loss is not restricted to `text` (for Hugging Face causal LMs the
  returned loss is presumably the mean over all predicted tokens -- confirm).
  When `normalize` is true the subtracted term depends only on `condition`, so
  for a fixed condition it shifts every candidate's score by the same amount.

  Args:
    model: module called as ``model(input_ids, labels=...)`` whose first output
      is the loss; inputs are moved to the device of its first parameter.
    tokenizer: callable as ``tokenizer(string, return_tensors="pt")`` returning
      an object with an ``input_ids`` tensor.
    condition: context / prompt string.
    text: candidate continuation string.
    filler: string inserted between `condition` and `text`.
    normalize: when true, subtract the negated loss of `condition` alone.

  Returns:
    ``-loss(full)`` or, with `normalize`, ``-loss(full) - (-loss(condition))``,
    as returned by the model (a tensor).
  """
  # Follow the model's device instead of assuming CUDA, so the function also
  # works on CPU-only machines or when the model lives on a non-default GPU.
  device = next(model.parameters()).device

  full_ids = tokenizer(condition + filler + text, return_tensors="pt").input_ids.to(device)
  with torch.no_grad():
    full_nll = model(full_ids, labels=full_ids.clone())[0]

  if not normalize:
    return -1 * full_nll

  condition_ids = tokenizer(condition, return_tensors="pt").input_ids.to(device)
  with torch.no_grad():
    condition_nll = model(condition_ids, labels=condition_ids)[0]
  return (-1 * full_nll) - (-1 * condition_nll)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer,T5ForConditionalGeneration
import torch
import random
import numpy as np

def eval(dataset,normalize,prompt):
  """Run few-shot multiple-choice evaluation and print per-type accuracy.

  For each query, every entry of ``query["options"]`` is scored with
  ``get_fewshot_NLL_score`` against the context
  ``prompt + few_shot_prompt(query, False)`` (with an empty filler). The key of
  the highest-scoring option is the prediction; on ties the first key wins.
  Uses the notebook-level ``model`` and ``tokenizer``.

  NOTE: the name shadows the built-in ``eval``; it is kept so existing calls
  keep working.

  Args:
    dataset: iterable of query dicts with keys "query", "options", "answer"
      and "query_type" (a mapping holding a flag for each reported type).
    normalize: forwarded to ``get_fewshot_NLL_score``.
    prompt: few-shot prefix string prepended to every query.

  Returns:
    Number of queries whose predicted option key equals ``query["answer"]``.

  Raises:
    AssertionError: if any option score is NaN.
    ValueError: if a query has no options to score.
  """
  query_types = ("Specific", "Subjective", "Indirect", "Compound",
                 "Negated", "Analogical", "Temporal")
  hits_by_type = dict.fromkeys(query_types, 0)   # correct predictions per type
  seen_by_type = dict.fromkeys(query_types, 0)   # queries tallied per type
  correct = 0

  for count, query in enumerate(dataset, start=1):
    if count % 50 == 0:
      print('--> ',count)

    # The same context is shared by all options of this query.
    context = prompt + few_shot_prompt(query, False)
    options = query["options"]
    scores = []
    for key in options:
      score = get_fewshot_NLL_score(model,tokenizer,context,options[key],normalize=normalize,filler='')
      assert not torch.isnan(score), 'score is nan'
      scores.append((key, score))
    if not scores:
      raise ValueError('query {} has no options to score'.format(count))

    # max() keeps the first of equally scored options, matching a stable
    # descending sort followed by taking element 0.
    predicted_id = max(scores, key=lambda pair: pair[1])[0]
    is_correct = predicted_id == query["answer"]
    if is_correct:
      correct += 1

    flags = query["query_type"]
    for query_type in query_types:
      # NOTE(review): "Indirect" is tallied when its flag is 0, every other
      # type when its flag is 1 -- presumably that flag is stored inverted in
      # the dataset; confirm.
      wanted_flag = 0 if query_type == "Indirect" else 1
      if flags[query_type] == wanted_flag:
        seen_by_type[query_type] += 1
        if is_correct:
          hits_by_type[query_type] += 1

  for query_type in query_types:
    N = seen_by_type[query_type]
    # A type with no matching queries has no defined accuracy; report nan
    # instead of dividing zero by zero.
    acc = hits_by_type[query_type] / N if N else float('nan')
    print(query_type,', Acc:',acc,', N:',N)
  return correct

In [None]:
# adjustable parameters
train_split = 0.2  # fraction of the data set aside as the pool of few-shot examples
prompt_size = 5    # samples per prompt
seed = 0           # drives both the data shuffle and the prompt sampling order

# The product is a float; slicing, torch.randperm and range all require an int.
num_holdout = int(train_split * len(data))
shuffled_data = shuffle(data, random_state=seed)

train_data = shuffled_data[:num_holdout]
# Local seeded generator: reproducible prompt order without touching the global RNG.
# .tolist() yields plain ints for indexing the Python list below.
sample_perms = torch.randperm(num_holdout, generator=torch.Generator().manual_seed(seed)).tolist()

# NOTE(review): every trial is evaluated on all of `data`, which still contains
# the examples shown in the prompt -- confirm whether they should be excluded.
results = []
# "+ 1" so the last full chunk of prompt_size examples is used as well.
for i in range(0, num_holdout - prompt_size + 1, prompt_size):
  print("Trial: ", i)
  prompt = ''.join(
    few_shot_prompt(train_data[index], True)
    for index in sample_perms[i:i + prompt_size]
  )

  print("Prompt:")
  print(prompt)
  correct = eval(data, normalize=True, prompt=prompt)
  results.append(correct)
  print("Total correct: ", correct)

# Per-trial counts of correct predictions.
print(results)
if results:
  # `results` holds counts, so divide by the number of evaluated queries to get accuracy.
  mean_accuracy = sum(results) / (len(results) * len(data))
  print("Average accuracy across {} trials: {}".format(len(results), mean_accuracy))
else:
  print("No trials were run: fewer than prompt_size examples in the training split.")