<a href="https://colab.research.google.com/github/Anderson-Lee-Git/cse447-nlp/blob/main/src/sample_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch transformers datasets tqdm gdown==v4.6.3

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting gdown==v4.6.3
  Downloading gdown-4.6.3-py3-none-any.whl (14 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, gdown, datasets
  Attempting uninstall: gdown
    Found existing installation: gdown 4.7.3
    Uninstalling gdown-4.7.3:
      Successfully uninstalled gdown-4.7.3
Successfully installed datasets-2.18.0 dill-0.3.8 gdown-4.6.3 mul

# Dataset

In [2]:
from datasets import load_dataset

dataset = load_dataset("openbookqa")
dataset_train, dataset_valid, dataset_test = dataset["train"], dataset["validation"], dataset["test"]
print(dataset_train)
print(dataset_valid)
print(f"first question sample: {dataset_train['question_stem'][0]}")
# Note that choices contains 'text' and 'label' keys
print(f"first choice sample: {dataset_train['choices'][0]}")
print(f"first answer key sample: {dataset_train['answerKey'][0]}")
# make sure every label ordering in choices is in order ['A', 'B', 'C', 'D']
for choice in dataset_train["choices"]:
    assert choice["label"] == ['A', 'B', 'C', 'D']

Downloading readme:   0%|          | 0.00/9.06k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/496k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4957 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'question_stem', 'choices', 'answerKey'],
    num_rows: 4957
})
Dataset({
    features: ['id', 'question_stem', 'choices', 'answerKey'],
    num_rows: 500
})
first question sample: The sun is responsible for
first choice sample: {'text': ['puppies learning new tricks', 'children growing up and getting old', 'flowers wilting in a vase', 'plants sprouting, blooming and wilting'], 'label': ['A', 'B', 'C', 'D']}
first answer key sample: D


In [3]:
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset
from dataclasses import dataclass

@dataclass
class OpenQASample:
    id: str
    question_stem: str
    choices: list[str]
    labels: list[str]
    answer_key: str

    @staticmethod
    def from_dict(data: dict):
        return OpenQASample(
            id=data["id"],
            question_stem=data["question_stem"],
            choices=data["choices"],
            labels=data["labels"],
            answer_key=data["answer_key"]
        )

class OpenQADataset(Dataset):
    tokenizer: PreTrainedTokenizerFast = None

    def __init__(self, split):
        self.data = [
            OpenQASample(**{
                "id": raw_sample["id"],
                "question_stem": raw_sample["question_stem"],
                "choices": raw_sample["choices"]["text"],
                "labels": raw_sample["choices"]["label"],
                "answer_key": raw_sample["answerKey"]
            }) for raw_sample in OpenQADataset.get_openqa(split)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    @staticmethod
    def get_openqa(split):
        dataset = load_dataset("openbookqa")
        return dataset[split]

    @staticmethod
    def format_question(question):
        return question

    @staticmethod
    def format_choices(choices, labels):
        for i in range(len(choices)):
            choices[i] = f"{labels[i]} - {choices[i]}"
        return choices

    @staticmethod
    def format_answer_keys(answer_keys):
        """
        Format answer keys from A, B, C, D to 0, 1, 2, 3
        :param: list of answer keys in integer
        """
        return [ord(a) - ord("A") for a in answer_keys]

    @staticmethod
    def collate_fn(batched_samples):
        B = len(batched_samples)
        batched_question = [[OpenQADataset.format_question(sample.question_stem)] * 4 for sample in batched_samples]  # B, 4
        batched_choices = [OpenQADataset.format_choices(sample.choices, sample.labels) for sample in batched_samples]  # B, 4
        batched_answer_key = [sample.answer_key for sample in batched_samples]  # B, 1
        # flatten batched_questions for tokenization
        batched_question = sum(batched_question, [])
        batched_choices = sum(batched_choices, [])
        # Tokenize the input texts.
        text_encoding = OpenQADataset.tokenizer(batched_question,
                                                batched_choices,
                                                padding=True,
                                                max_length=128,
                                                truncation=True,
                                                return_tensors="pt")
        # unflatten
        label_encoding = torch.LongTensor(OpenQADataset.format_answer_keys(batched_answer_key))  # B, 1

        return {
            "text_encoding": {k: v.view(B, 4, -1) for (k, v) in text_encoding.items()},
            "label_encoding": label_encoding,
        }

# Evaluation

In [4]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

def compute_accuracy(predictions: torch.Tensor, labels: torch.Tensor) -> float:
    accuracy = torch.sum(predictions == labels) / len(predictions)
    return accuracy

@torch.no_grad
def evaluate(model, dataloader):
    model.eval()
    all_predictions = []
    all_labels = []
    for batch in tqdm(dataloader):
        text_encoding = batch["text_encoding"]
        for k, v in text_encoding.items():
            text_encoding[k] = v.to(model.device)
        label_encoding = batch["label_encoding"].to(model.device)
        out = model(**text_encoding, labels=label_encoding)
        logits = out.logits
        predictions = torch.argmax(logits, dim=1)
        all_predictions += predictions
        all_labels += label_encoding
    all_predictions = torch.Tensor(all_predictions)
    all_labels = torch.Tensor(all_labels)
    accuracy = compute_accuracy(all_predictions, all_labels)
    print(accuracy)

# device = "cuda"
# model = AutoModelForMultipleChoice.from_pretrained("nghuyong/ernie-1.0-base-zh").to(device)
# tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
# OpenQADataset.tokenizer = tokenizer
# dataset_train = OpenQADataset("train")
# dataloader_train = DataLoader(dataset=dataset_train,
#                             batch_size=128,
#                             collate_fn=OpenQADataset.collate_fn)
# print(model.num_parameters())
# evaluate(model, dataloader_train)

In [None]:

# prompt = Question: the question, Options: concatenated list of answers
# ### do input stuff
# answer = model(**inputs)

# if answer is in options:
#   final_answre = answer
# else:
#   final_answer = sbert.multiple_choice(answer, choices)

prompt = Question: the question, Option 1:


for each question:
  option1, option2, option3, option4 = options broken up
  prompt1,2,3 = Question + option1,option2...

  probs = [len 4]
  ouput = model(promptx)
  softmax(output.logits())

  get prob for sequence being generated (sequence = optionx)
  prob[x] = prob

chosen = argmax(prob)
final = options[chosen]
loss += beta*cossim(final, correct)

loss.backward()
optimizer.step()
optimizer.zero_grad()


In [55]:
! pip install datasets
import torch
import random
from tqdm import tqdm
import torch.nn.functional as F
from datasets import load_dataset
from matplotlib import pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from transformers import AdamW, set_seed
from typing import Dict, Union, List, Tuple



In [56]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice, AutoModelForCausalLM, RobertaTokenizer, RobertaModel, LlamaForCausalLM, LlamaTokenizer

# model1 = AutoModelForCausalLM.from_pretrained("WizardLM/WizardLM-13B-V1.2").to(device)
# tokenizer1 = AutoTokenizer.from_pretrained("WizardLM/WizardLM-13B-V1.2")

# model2 = LlamaForCausalLM.from_pretrained("/output/path")
# tokenizer2 = LlamaTokenizer.from_pretrained("/output/path")

# tokenizer3 = RobertaTokenizer.from_pretrained('roberta-base')
model3 = AutoModelForCausalLM.from_pretrained('roberta-base').cuda()

tokenizer3 = AutoTokenizer.from_pretrained('roberta-base', padding_side = 'left')
tokenizer3.pad_token_id = tokenizer3.eos_token_id
model3.config.pad_token_id = tokenizer3.eos_token_id

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [57]:
print(dataset_train)
print(dataset_valid)
print(f"first question sample: {dataset_train['question_stem'][0]}")
# Note that choices contains 'text' and 'label' keys
print(f"first choice sample: {dataset_train['choices'][0]}")
print(f"first answer key sample: {dataset_train['answerKey'][0]}")

Dataset({
    features: ['id', 'question_stem', 'choices', 'answerKey'],
    num_rows: 4957
})
Dataset({
    features: ['id', 'question_stem', 'choices', 'answerKey'],
    num_rows: 500
})
first question sample: The sun is responsible for
first choice sample: {'text': ['puppies learning new tricks', 'children growing up and getting old', 'flowers wilting in a vase', 'plants sprouting, blooming and wilting'], 'label': ['A', 'B', 'C', 'D']}
first answer key sample: D


PROCESS ROWS OF DATASET TRIAN

In [85]:
concat_prompts_train = []
concat_prompts_valid = []

for row in dataset_train:
  char_count = ord(row['answerKey']) - ord('A')
  # print(char_count)
  answer = row['choices']['text'][char_count]
  question = row['question_stem']
  concat = [row['id'], question]
  for option in row['choices']['text']:
    concat.append(question + ' ' + option)
  concat.append(char_count)
  # break
  concat_prompts_train.append(concat)
  # print(row)
  # break

for row in dataset_valid:
  char_count = ord(row['answerKey']) - ord('A')
  # print(char_count)
  char_count = ord(row['answerKey']) - ord('A')
  # print(char_count)
  answer = row['choices']['text'][char_count]
  question = row['question_stem']
  concat = [row['id'], question]
  for option in row['choices']['text']:
    concat.append(question + ' ' + option)
  concat.append(char_count)
  # break
  concat_prompts_valid.append(concat)

print([row[-1] for row in concat_prompts_train[:10]])
# print(concat_prompts_valid[:10])

[3, 3, 2, 2, 3, 0, 3, 3, 0, 3]


In [43]:
test_train = concat_prompts_train[:5]

In [63]:
prompts_train = []
prompts_valid = []

for row in concat_prompts_train:
  prompt = row[2:6]
  prompts_train.append(prompt)

for row in concat_prompts_valid:
  prompt = row[2:6]
  prompts_valid.append(prompt)

# print(prompts_train)

In [88]:
batch = prompts_train[0]
print(batch)

['The sun is responsible for puppies learning new tricks', 'The sun is responsible for children growing up and getting old', 'The sun is responsible for flowers wilting in a vase', 'The sun is responsible for plants sprouting, blooming and wilting']


In [89]:
input_tokens = tokenizer3(batch, truncation=True, padding=True, return_tensors='pt').to('cuda')
# print(input_tokens)
output = model3(**input_tokens)
print(output.logits.shape)

torch.Size([4, 16, 50265])


In [79]:

for prompt in prompts_train:
  batch = prompt
  input_tokens = tokenizer3(batch, truncation=True, padding=True, return_tensors='pt').to('cuda')

  # print(input_tokens)
  output = model3(**input_tokens)
  log_probs = F.log_softmax(output.logits, dim=-1)
  # print(input_tokens['input_ids'].shape)
  # print(input_tokens['input_ids'])
  output_tokens = input_tokens['input_ids'][:, 1:]
  # print(output_tokens.shape)
  # print(log_probs.shape)

  chosen_log_probs = log_probs[:, :-1, :]
  # print(output_tokens.shape)
  # print(chosen_log_probs.shape)
  chosen_log_probs = torch.gather(chosen_log_probs, 2, output_tokens.unsqueeze(-1))
  chosen_log_probs = chosen_log_probs.squeeze(-1)

  # print(chosen_log_probs)
  # print('size', chosen_log_probs.shape)
  # print(output.logits.shape)
  print(torch.argmax(F.softmax(torch.sum(chosen_log_probs, dim=1).detach())))
# [3, 3, 2, 2, 3, 0, 3, 3, 0, 3]

CROSS ENTROPY LOSS TO GET LOSS HEHE
TRAIN

  print(torch.argmax(F.softmax(torch.sum(chosen_log_probs, dim=1).detach())))


tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(2, device='cuda:0')
tensor(1, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(2, device='cuda:0')
tensor(0, device='cuda:0')
tensor(3, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(2, device='cuda:0')
tensor(2, device='cuda:0')
tensor(2, device='cuda:0')
tensor(3, device='cuda:0')
tensor(2, device='cuda:0')
tensor(0, device='cuda:0')
tensor(3, device='cuda:0')
tensor(0, device='cuda:0')
tensor(2, device='cuda:0')
tensor(1, device='cuda:0')
tensor(1, device='cuda:0')
tensor(1, device='cuda:0')
tensor(2, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(3, device='cuda:0')
tensor(1, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(0, device='cuda:0')
tensor(1, device='cuda:0')
tensor(2, device='cuda:0')
tensor(1, device='cuda:0')
tensor(3, device='cuda:0')
t

KeyboardInterrupt: 