In [1]:
# Reload edited local modules automatically, so changes to imported project
# code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Set HF_HOME for this kernel before the Hugging Face imports in the next cell.
# NOTE(review): this is an absolute, user-specific path — adjust it for your
# own environment before running.
%env HF_HOME=/home/mila/m/moksh.jain/scratch/hf

UsageError: Line magic function `%export` not found.


In [2]:
import random
import json
import numpy as np

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

from utils import score_fast, append_sol_and_remove_eos

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Experiment configuration -------------------------------------------------
# Seed fed to every random number generator in the seeding cell.
rng_seed = 2

# Number of training examples; also selects the training file
# `data/subj/train.{train_samples}.jsonl` and the number of few-shot examples.
train_samples = 50

# NOTE(review): not referenced in the visible cells — presumably a logging
# frequency used further down; confirm or remove.
log_interval = 10

In [4]:
# Seed NumPy, Python's `random` and PyTorch from the single configured seed so
# that shuffling and any sampling are repeatable across runs.
for _seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    _seed_fn(rng_seed)

# cuDNN: disable the benchmark autotuner and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
model_to_use = 'instruct-gpt-j-fp16' # 'gpt2'

# Supported models: notebook-level name -> (Hugging Face hub id, extra kwargs
# for `AutoModelForCausalLM.from_pretrained`).
# Note: the GPT-J checkpoint name says "fp16" but the weights are loaded as
# bfloat16 here.
_MODEL_SPECS = {
    'instruct-gpt-j-fp16': ('nlpcloud/instruct-gpt-j-fp16', {'torch_dtype': torch.bfloat16}),
    'gpt2': ('gpt2', {}),
}

# Fail immediately on an unknown name instead of falling through and hitting a
# confusing NameError on `model` further down.
if model_to_use not in _MODEL_SPECS:
    raise ValueError(
        f'Unknown model_to_use={model_to_use!r}; expected one of {sorted(_MODEL_SPECS)}'
    )

_hub_id, _model_kwargs = _MODEL_SPECS[model_to_use]
tokenizer = AutoTokenizer.from_pretrained(_hub_id)
model = AutoModelForCausalLM.from_pretrained(_hub_id, **_model_kwargs)

# Move the weights to the GPU; as the last expression of the cell this also
# displays the model.
model.to('cuda')

Downloading (…)okenizer_config.json: 100%|██████████| 619/619 [00:00<00:00, 154kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 56.9MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 44.3MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.37M/1.37M [00:00<00:00, 38.1MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 4.04k/4.04k [00:00<00:00, 8.55MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading (…)lve/main/config.json: 100%|██████████| 836/836 [00:00<00:00, 252kB/s]
Downloading pytorch_model.bin:   1%|          | 94.4M/12.1G [00:01<02:20, 85.5MB/s]

KeyboardInterrupt: 

In [None]:
answers = [ 'objective', 'subjective' ]

# Vocabulary ids of the two label tokens. 'Ġ' is how the byte-level BPE
# vocabulary spells a leading space, so these are ' objective' / ' subjective'.
obj_id = tokenizer.vocab['Ġobjective']
subj_id = tokenizer.vocab['Ġsubjective']


def _read_jsonl(path):
    """Read a JSON-lines file and return its records as a list.

    The file is opened in a `with` block so the handle is always closed, and
    blank lines (e.g. a trailing newline at end of file) are skipped instead of
    crashing `json.loads`.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


data_train = _read_jsonl(f'data/subj/train.{train_samples}.jsonl')
data_test = _read_jsonl('data/subj/test.jsonl')

# Each query is: intro_prompt + review text + cot_prompt; the expected
# completion is the label text preceded by a space.
intro_prompt = 'Classify this movie review as objective or subjective: "'
cot_prompt = '" It is'

train_queries = [intro_prompt + sample['text'] + cot_prompt for sample in data_train]
train_sols = [' ' + sample['label_text'] for sample in data_train]

# Few-shot block: up to `train_samples` solved training examples in shuffled
# order. Slicing + zip (rather than indexing with range(train_samples)) avoids
# an IndexError if the file holds fewer rows than `train_samples`.
few_show_examples = [
    query + sol + '.\n'
    for query, sol in zip(train_queries[:train_samples], train_sols)
]
random.shuffle(few_show_examples)
few_shot_prompt = ''.join(few_show_examples)

# Every test query is prefixed with the same few-shot block.
test_queries = [few_shot_prompt + intro_prompt + sample['text'] + cot_prompt for sample in data_test]
test_sols = [' ' + sample['label_text'] for sample in data_test]

In [None]:
def _encode_on_gpu(text):
    """Tokenize `text` and return its `input_ids` tensor moved to the GPU."""
    return tokenizer(text, return_tensors='pt')['input_ids'].cuda()


# Tokenize every prompt and answer once, up front.
encoded_train_queries = [_encode_on_gpu(prompt) for prompt in train_queries]
encoded_train_sols = [_encode_on_gpu(sol) for sol in train_sols]
# Both possible answers, each including the trailing period.
encoded_train_all_sols = [_encode_on_gpu(label) for label in (' objective.', ' subjective.')]
encoded_test_queries = [_encode_on_gpu(prompt) for prompt in test_queries]

# The EOS id doubles as the padding id.
eos_token_id = tokenizer.eos_token_id
pad_token_id = tokenizer.eos_token_id

In [None]:
def get_preds(model, encoded_queries, top_n = 999999, bsz = 1):
    """Predict, for each query, whether ' objective' outscores ' subjective'.

    Every query is paired with both candidate completions, the pairs are scored
    with `score_fast`, and the query is labelled objective when the score of
    ' objective' is strictly greater than that of ' subjective' (a tie counts
    as subjective).

    Relies on notebook globals: `tokenizer`, `eos_token_id`, `pad_token_id`,
    `score_fast` and `append_sol_and_remove_eos`.

    Args:
        model: Passed through unchanged to `score_fast`.
        encoded_queries: Sequence of token-id tensors; each element is indexed
            with `[0]` to obtain a 1-D sequence (so presumably shaped
            `(1, seq_len)` — confirm against the caller).
        top_n: Only the first `top_n` queries are scored.
        bsz: Number of queries scored per batch; must be >= 1.

    Returns:
        list[bool]: One entry per scored query, `True` meaning "objective".
        The final batch may be smaller than `bsz`; it is still scored, so the
        result always has `len(encoded_queries[:top_n])` entries.

    Raises:
        ValueError: If `bsz` is smaller than 1.
    """
    if bsz < 1:
        raise ValueError(f'bsz must be a positive integer, got {bsz}')

    # Token ids of the two candidate completions, right-padded with EOS to a
    # common length and stacked as rows [objective, subjective].
    encoded_obj = tokenizer(' objective',
                            return_tensors='pt').to('cuda')['input_ids'][0]
    encoded_sub = tokenizer(' subjective',
                            return_tensors='pt').to('cuda')['input_ids'][0]
    encoded_results = torch.nn.utils.rnn.pad_sequence([encoded_obj, encoded_sub],
                                                      batch_first=True,
                                                      padding_value=eos_token_id)

    encoded_queries_to_use = encoded_queries[:top_n]
    preds = []
    # Step through the queries in strides of `bsz`. Unlike iterating over
    # `len(...) // bsz` full batches, this also covers a trailing partial batch.
    for start in range(0, len(encoded_queries_to_use), bsz):
        batch = encoded_queries_to_use[start:start + bsz]
        cur_bsz = len(batch)  # may be < bsz for the last batch
        batch_input = torch.nn.utils.rnn.pad_sequence([x[0] for x in batch],
                                                      batch_first=True,
                                                      padding_value=eos_token_id)
        with torch.no_grad():
            # Rows are laid out as q0,q0,q1,q1,... against obj,sub,obj,sub,...
            # so consecutive row pairs belong to the same query.
            mean_reward = score_fast(model,
                                     append_sol_and_remove_eos(batch_input.repeat_interleave(2, dim=0),
                                                               encoded_results.repeat(cur_bsz, 1),
                                                               eos_token_id, pad_token_id),
                                     eos_token_id=eos_token_id)
        # Column 0 is the ' objective' score, column 1 the ' subjective' score.
        pred = mean_reward.reshape(cur_bsz, 2)
        preds += (pred[:, 0] > pred[:, 1]).tolist()
    return preds

In [None]:
# Ground-truth labels as boolean tensors: True means the solution is "objective".
true_preds_train = torch.tensor(['objective' in sol for sol in train_sols])
true_preds = torch.tensor(['objective' in sol for sol in test_sols])

# Switch the model to evaluation mode before scoring.
model.eval()

# Accuracy on the training queries.
train_preds = get_preds(model, encoded_train_queries, bsz = 10)
train_hits = (torch.tensor(train_preds) == true_preds_train).sum()
print(f'Train Acc : {train_hits / len(true_preds_train)}')

# Accuracy on the few-shot-prefixed test queries.
test_preds = get_preds(model, encoded_test_queries, bsz = 10)
test_hits = (torch.tensor(test_preds) == true_preds).sum()
print(f'Test Acc : {test_hits / len(true_preds)}')