<a href="https://colab.research.google.com/github/Anderson-Lee-Git/cse447-nlp/blob/main/src/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages for this notebook; gdown is pinned to v4.6.3.
!pip install torch transformers datasets tqdm gdown==v4.6.3



# Dataset

In [1]:
from datasets import load_dataset

# Fetch OpenBookQA once, then bind each split to its own name.
dataset = load_dataset("openbookqa")
dataset_train = dataset["train"]
dataset_valid = dataset["validation"]
dataset_test = dataset["test"]

# Show the training split summary followed by its first example.
print(dataset_train)
print(f"first question sample: {dataset_train['question_stem'][0]}")
# Each 'choices' entry is a dict with a 'text' list and a 'label' list.
print(f"first choice sample: {dataset_train['choices'][0]}")
print(f"first answer key sample: {dataset_train['answerKey'][0]}")

# Sanity check: every training question lists its labels as A, B, C, D in
# exactly that order.
for choice in dataset_train["choices"]:
    assert choice["label"] == ['A', 'B', 'C', 'D']

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'question_stem', 'choices', 'answerKey'],
    num_rows: 4957
})
first question sample: The sun is responsible for
first choice sample: {'text': ['puppies learning new tricks', 'children growing up and getting old', 'flowers wilting in a vase', 'plants sprouting, blooming and wilting'], 'label': ['A', 'B', 'C', 'D']}
first answer key sample: D


In [10]:
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset
from dataclasses import dataclass

@dataclass
class OpenQASample:
    """A single multiple-choice question with its options and answer key.

    ``choices`` holds the option texts and ``labels`` the option letters;
    ``answer_key`` is the letter of the correct option.
    """

    id: str
    question_stem: str
    choices: list[str]
    labels: list[str]
    answer_key: str

    @staticmethod
    def from_dict(data: dict):
        """Build a sample from a mapping keyed by the dataclass field names.

        :param data: mapping with the keys ``id``, ``question_stem``,
            ``choices``, ``labels`` and ``answer_key``; other keys are ignored
        :return: the corresponding ``OpenQASample``
        :raises KeyError: if one of the required keys is missing
        """
        field_names = ("id", "question_stem", "choices", "labels", "answer_key")
        return OpenQASample(**{name: data[name] for name in field_names})

class OpenQADataset(Dataset):
    """One OpenBookQA split exposed as a torch ``Dataset`` of ``OpenQASample``.

    ``collate_fn`` is static and reads the tokenizer from the class attribute
    ``tokenizer``, so assign ``OpenQADataset.tokenizer`` before batching.
    """

    # Tokenizer shared by every ``collate_fn`` call; unset until the caller
    # assigns one. The annotation is a string so it is never evaluated.
    tokenizer: "PreTrainedTokenizerFast | None" = None
    # Number of answer options per question assumed when shaping a batch.
    NUM_CHOICES = 4

    def __init__(self, split):
        """Load ``split`` and convert every raw record into an ``OpenQASample``.

        :param split: name of the dataset split to load (e.g. ``"train"``)
        """
        self.data = [
            OpenQASample(
                id=raw_sample["id"],
                question_stem=raw_sample["question_stem"],
                choices=raw_sample["choices"]["text"],
                labels=raw_sample["choices"]["label"],
                answer_key=raw_sample["answerKey"],
            )
            for raw_sample in OpenQADataset.get_openqa(split)
        ]

    def __len__(self):
        """Return the number of samples in the split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]

    @staticmethod
    def get_openqa(split):
        """Load the ``openbookqa`` dataset and return the requested split."""
        dataset = load_dataset("openbookqa")
        return dataset[split]

    @staticmethod
    def format_question(question):
        """Return the question text unchanged (hook for prompt formatting)."""
        return question

    @staticmethod
    def format_choices(choices, labels):
        """Prefix every choice with its label, e.g. ``"A - some text"``.

        A new list is returned and ``choices`` is left untouched. Mutating the
        argument would rewrite the stored sample, so the prefix would pile up
        ("A - A - ...") each time the same sample is collated again.

        :param choices: list of choice texts
        :param labels: list of labels, at least as long as ``choices``
        :return: new list of formatted choice strings
        :raises IndexError: if ``labels`` is shorter than ``choices``
        """
        return [f"{labels[i]} - {choice}" for i, choice in enumerate(choices)]

    @staticmethod
    def format_answer_keys(answer_keys):
        """
        Format answer keys from A, B, C, D to 0, 1, 2, 3

        :param answer_keys: list of answer keys as single uppercase letters
        :return: list of answer keys in integer
        """
        return [ord(a) - ord("A") for a in answer_keys]

    @staticmethod
    def collate_fn(batched_samples):
        """Tokenize a list of samples into one batch of question/choice pairs.

        Each question is repeated once per choice, the flat pair lists are
        tokenized together, and the resulting tensors are reshaped to
        ``(B, NUM_CHOICES, -1)``.

        :param batched_samples: list of ``OpenQASample``
        :return: dict with ``"text_encoding"`` (tokenizer outputs reshaped to
            ``(B, NUM_CHOICES, -1)``) and ``"label_encoding"`` (``LongTensor``
            of shape ``(B,)`` holding the integer answer index)
        """
        B = len(batched_samples)
        num_choices = OpenQADataset.NUM_CHOICES

        # Build the flat (B * num_choices) question / choice lists directly
        # rather than concatenating nested lists with sum(), which is quadratic.
        flat_questions = []
        flat_choices = []
        answer_keys = []
        for sample in batched_samples:
            question = OpenQADataset.format_question(sample.question_stem)
            flat_questions.extend([question] * num_choices)
            flat_choices.extend(
                OpenQADataset.format_choices(sample.choices, sample.labels)
            )
            answer_keys.append(sample.answer_key)

        # Tokenize the input texts as (question, choice) pairs.
        text_encoding = OpenQADataset.tokenizer(flat_questions,
                                                flat_choices,
                                                padding=True,
                                                max_length=128,
                                                truncation=True,
                                                return_tensors="pt")
        label_encoding = torch.LongTensor(
            OpenQADataset.format_answer_keys(answer_keys)
        )  # shape (B,)

        return {
            # unflatten: one row of num_choices encodings per question
            "text_encoding": {
                k: v.view(B, num_choices, -1) for (k, v) in text_encoding.items()
            },
            "label_encoding": label_encoding,
        }

# Evaluation

In [None]:
from dataset import OpenQADataset
from transformers import AutoTokenizer, AutoModelForMultipleChoice
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

def compute_accuracy(predictions: torch.Tensor, labels: torch.Tensor) -> float:
    """Return the fraction of positions where ``predictions`` equals ``labels``.

    :param predictions: tensor of predicted class indices
    :param labels: tensor of reference class indices, compared element-wise
        with ``predictions``
    :return: accuracy as a Python float (matching the annotation, not a
        0-dim tensor); ``nan`` when ``predictions`` is empty
    """
    total = len(predictions)
    if total == 0:
        # No samples: accuracy is undefined, so report NaN rather than
        # raising ZeroDivisionError.
        return float("nan")
    correct = torch.sum(predictions == labels).item()
    return correct / total

@torch.no_grad()
def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` and report prediction accuracy.

    :param model: model exposing ``eval()`` and ``device`` whose call returns
        an object with a ``logits`` attribute
    :param dataloader: iterable of batches, each a dict with a
        ``"text_encoding"`` dict of tensors and a ``"label_encoding"`` tensor
    :return: the accuracy computed by ``compute_accuracy`` (also printed)
    """
    model.eval()
    all_predictions = []
    all_labels = []
    for batch in tqdm(dataloader):
        # Build a fresh dict on the model's device instead of overwriting the
        # entries of the batch that the dataloader handed out.
        text_encoding = {
            k: v.to(model.device) for k, v in batch["text_encoding"].items()
        }
        label_encoding = batch["label_encoding"].to(model.device)
        out = model(**text_encoding, labels=label_encoding)
        predictions = torch.argmax(out.logits, dim=1)
        # Keep whole per-batch tensors on the CPU; extending a list with a
        # tensor would split it into one 0-dim tensor per element.
        all_predictions.append(predictions.cpu())
        all_labels.append(label_encoding.cpu())
    if all_predictions:
        all_predictions = torch.cat(all_predictions)
        all_labels = torch.cat(all_labels)
    else:
        # torch.cat rejects an empty list, so handle an empty dataloader here.
        all_predictions = torch.empty(0, dtype=torch.long)
        all_labels = torch.empty(0, dtype=torch.long)
    accuracy = compute_accuracy(all_predictions, all_labels)
    print(accuracy)
    return accuracy

# Use the GPU when one is available; fall back to the CPU so the cell still
# runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForMultipleChoice.from_pretrained("roberta-base").to(device)
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# collate_fn is static and reads the tokenizer from the class attribute.
OpenQADataset.tokenizer = tokenizer
dataset_train = OpenQADataset("train")
dataloader_train = DataLoader(dataset=dataset_train,
                            batch_size=128,
                            collate_fn=OpenQADataset.collate_fn)
evaluate(model, dataloader_train)