In [64]:
#Upload instructions:
#Make a data directory and upload all content from ../data in repo
#Make a utils directory and upload all content from utils in repo

In [65]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [66]:
import json
from utils import make_fold_inds
from utils import soft_embedding
from google.colab import files
import torch
import torch.nn as nn

In [None]:
# Call google.colab's files.upload() and keep its return value in `uploaded`.
# NOTE(review): presumably this is where the data/ and utils/ content mentioned
# in the upload instructions is provided - confirm, since `utils` is already
# imported in an earlier cell.
uploaded = files.upload()

In [None]:
# Load the experiment settings from the JSON config file.

# location of the experiment config file
experiment_config_path = "utils/experiment_settings.json"

with open(experiment_config_path) as f:
    exp_conf = json.load(f)

# Settings used by the rest of the notebook:
#   K          - number of folds
#   folds_path - path to the file with the fold indices
#   data_path  - path to the data
K, folds_path, data_path = (
    exp_conf[setting] for setting in ("K", "folds_path", "data_path")
)

In [None]:
# Write the fold indices to folds_path, using the K, folds_path and data_path
# values read from the experiment config.
# NOTE(review): presumably the folds are computed from the data at data_path -
# confirm against utils/make_fold_inds.
make_fold_inds.make_fold_inds(K = K,write_path=folds_path, data_path=data_path)

In [None]:
#set up prompt tuning
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [None]:
#ANTON: To do - set up prompt tuning from config not hardcoded
#read prompt tuning settings

#prompt tuning config directory
#prompt_tuning_config_path = "utils/prompt_tuning_settings.json"

#load config
#with open(prompt_tuning_config_path) as f:
#       prompt_conf = json.load(f)

#name of LM (e.g. "gpt2")
#LM_name = prompt_conf["LM_name"]
#values for n_tokens (i.e. number of prompt tokens) to sweep during tuning
#n_tokens = prompt_conf["n_tokens"]


In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [None]:
# Soft-prompt setup, adapted from
# https://github.com/kipgparker/soft-prompt-tuning/blob/main/example.ipynb
n_prompts = 3
initialize_from_vocab = True

# pretrained GPT-2 language model and its matching fast tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Wrap the LM's original input embeddings in a SoftEmbedding that adds
# n_prompts learnable embeddings (the learnable embeddings can be accessed
# by: model.transformer.wte.learned_embedding)
s_wte = soft_embedding.SoftEmbedding(
    model.get_input_embeddings(),
    n_prompts=n_prompts,
    initialize_from_vocab=initialize_from_vocab,
)

# make the wrapped embedding the model's input embedding layer
model.set_input_embeddings(s_wte)

In [1]:
#TO DO: 
#Make inputs:
#Input ids: tensor of 500 x max_seq_length, made up of the concatenation of n_prompts prompt ids (set to any valid input id) + tokenized(query) + tokenized(correct option) + pad tokens
#Attention mask: ones for all tokens other than padding tokens (zeros for padding tokens)
#Target_ids: mask all but outputs (don't think I need to mask padded tokens, but should check by trying masking (via -100) with and without padded tokens)

#put these three inputs together in a dictionary: {key:value} = {"labels" : torch.Size([500,max_length]), 'input_ids': torch.Size([500,max_length]), 'attention_mask': torch.Size([500,max_length])}

#Initialize optimizer: e.g. 
#from torch.optim import Adam
#optimizer = Adam([model.transformer.wte.learned_embedding])

#put model on cuda

#train:

#for epoch in range(num_epochs):
#        outputs = model(**batch)
#        loss = outputs.loss
#        loss.backward()
#        optimizer.step()
#        lr_scheduler.step()
#        optimizer.zero_grad()
#        progress_bar


#make predictions and get accuracy

In [None]:

#Try with a single sequence - might not need attention mask

#outputs = model(**batch)
#print(out)


In [67]:
#EXAMPLE:
inputs = tokenizer("May the force be", return_tensors="pt")

# The soft prompt occupies the first n_prompts positions, so input_ids and
# attention_mask both have to be extended to seq_len + n_prompts.
# It does not matter what the extra input_ids are, it's just to make HF happy.

# Filler id: the tokenizer's own end-of-text id (50256 for GPT-2, whose
# vocabulary size is 50257) instead of a hard-coded number, so the cell keeps
# working if the tokenizer is swapped.
batch_size = inputs['input_ids'].size(0)
prompt_shape = (batch_size, n_prompts)

# Match dtype/device of the tokenized tensors so torch.cat never has to mix them.
filler_ids = torch.full(
    prompt_shape,
    tokenizer.eos_token_id,
    dtype=inputs['input_ids'].dtype,
    device=inputs['input_ids'].device,
)
# the prompt positions must be attended to, hence ones
prompt_mask = torch.ones(
    prompt_shape,
    dtype=inputs['attention_mask'].dtype,
    device=inputs['attention_mask'].device,
)

inputs['input_ids'] = torch.cat([filler_ids, inputs['input_ids']], 1)
inputs['attention_mask'] = torch.cat([prompt_mask, inputs['attention_mask']], 1)

outputs = model(**inputs)

In [68]:
# Display the extended inputs: n_prompts filler ids (and attention-mask ones)
# precede the tokenized text.
inputs

{'input_ids': tensor([[50256, 50256, 50256,  6747,   262,  2700,   307]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [None]:
#try:
#input: tokenized (soft prompt plus query plus option)
#o_input: option (tokenized option)


In [None]:
#Zero Shot code:
  def __init__(self, model_name="facebook/opt-1.3b", device='cuda'):
    """Load the causal language model and its tokenizer.

    model_name: checkpoint name handed to from_pretrained for both the
      model and the tokenizer.
    device: device the model is moved to; also stored on the instance.
    """
    self.device = device
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.model = OPTForCausalLM.from_pretrained(model_name).to(device)
    # commented-out GPT-2 alternative to the OPT model/tokenizer above:
    #self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    #self.model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

  def get_answer(self, q, options):
    """Return the option the language model scores highest as a continuation of q.

    For every option the text q + ' ' + option is tokenized and passed to the
    model with labels in which the query positions are masked with -100, so
    the returned loss only covers the option tokens. The negated loss is the
    option's score.

    q: query string.
    options: sequence of candidate answer strings.
    Returns: the element of options with the highest score.
    Raises: ValueError if options is empty.
    """
    if len(options) == 0:
      raise ValueError("options must contain at least one candidate answer")

    # Number of leading tokens that belong to the query (including any special
    # token the tokenizer prepends). Measuring the query prefix, rather than
    # tokenizing the option on its own, keeps the mask correct when the
    # tokenizer prepends a BOS token or tokenizes " option" differently from
    # "option".
    q_len = self.tokenizer(q, return_tensors="pt").input_ids.size(1)

    scores = []
    for o in options:
      #tokenized query plus option
      input_ids = self.tokenizer(q+' '+o, return_tensors="pt").input_ids.to(self.device)
      #set input ids as target ids
      target_ids = input_ids.clone()
      #-100 is a mask, to ignore these tokens in the loss: see https://huggingface.co/docs/transformers/model_doc/opt under OPTForCausalLM
      target_ids[:, :q_len] = -100

      with torch.no_grad():
          outputs = self.model(input_ids, labels=target_ids)
          # first output is the loss because labels were passed
          neg_log_likelihood = outputs[0]

      #Will try outputs.loss.backward()

      scores.append((-1*neg_log_likelihood.cpu()))
    # NOTE(review): scores and options are shuffled together with a fixed seed
    # before the argsort - presumably so ties are not always resolved in
    # favour of the original option order; confirm.
    scores, options = shuffle(scores, options, random_state=0)
    args = np.argsort(scores)
    return options[args[-1]]


#will need to pad inputs to maximum length for a batch
#The padding token ID can be found in tokenizer.pad_token_id.
#when padding attention mask, use 1 for tokens to attend to, 0 otherwise
#see https://huggingface.co/course/chapter3/4?fw=tf

In [None]:
#Mahdi's fine tuning code
# Single optimisation step on one hand-written example.
# NOTE(review): this cell uses `optimizer`, `time` and `t0`, none of which are
# defined or imported in the visible notebook - confirm they exist before running.
# NOTE(review): input_ids and labels are tokenized from different texts, so
# their lengths will generally differ; a decoder-only LM such as the
# GPT2LMHeadModel loaded above expects labels shaped like input_ids - confirm
# which model this snippet was written for.

# tokenize the (repeated) prompt text and move the ids to the GPU
input_ids = tokenizer(["question: where is capital of France? answer: question: where is capital of France? answer: question: where is capital of France? answer: "*10], return_tensors="pt").input_ids.cuda()
# tokenize the (repeated) target text and move the ids to the GPU
labels = tokenizer(["the capital of France is Paris the capital of France is Paris the capital of France is Paris the capital of France is Paris"*10], return_tensors="pt").input_ids.cuda()
# clear gradients left over from a previous step
optimizer.zero_grad(set_to_none=True)
# forward pass with labels, so the outputs carry a loss
outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.loss
print(loss)
# backpropagate and apply the parameter update
loss.backward()
optimizer.step()
# seconds elapsed since t0
print(time.time()-t0)
# release cached GPU memory held by PyTorch's allocator
torch.cuda.empty_cache()