In [2]:
# Install dependency and download codebase
#%pip install torch transformers datasets matplotlib tqdm
!mkdir checkpoints

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [18]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForMultipleChoice, TrainingArguments, PreTrainedTokenizerFast
import torch.nn as nn
import numpy as np
from dataclasses import dataclass
from torch.utils.data import DataLoader

In [4]:
# Download and prepare the OpenBookQA dataset
dataset = load_dataset("openbookqa")

# Access the splits like:
train_data = dataset["train"]
dev_data = dataset["validation"]
test_data = dataset["test"]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.06k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/496k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/58.2k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4957 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [5]:
train_data[0]

{'id': '7-980',
 'question_stem': 'The sun is responsible for',
 'choices': {'text': ['puppies learning new tricks',
   'children growing up and getting old',
   'flowers wilting in a vase',
   'plants sprouting, blooming and wilting'],
  'label': ['A', 'B', 'C', 'D']},
 'answerKey': 'D'}

In [6]:
@dataclass
class OpenBookQAExample:
    question_stem: str
    choices: list  # list of possible answers (strings)
    correct_idx: int  # integer in [0..3]

    @staticmethod
    def from_dict(data: dict):
        label_to_idx = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
        question_stem = data['question_stem']
        answerKey = data['answerKey']
        correct_idx = label_to_idx[answerKey]
        choices = [ch for ch in data['choices']['text']]

        return OpenBookQAExample(
            question_stem=question_stem,
            choices=choices,
            correct_idx=correct_idx
        )

In [21]:
from torch.utils.data import Dataset

class OpenBookQADataset(torch.utils.data.Dataset):
    tokenizer: PreTrainedTokenizerFast = None

    def __init__(self, tokenizer, raw_data_list):
        OpenBookQADataset.tokenizer = tokenizer
        self.sample_list = [OpenBookQAExample.from_dict(d) for d in raw_data_list]

    def __len__(self):
        return len(self.sample_list)

    def __getitem__(self, idx):
        return self.sample_list[idx]

    @staticmethod
    def collate_fn(batch_samples):
        stems = [ex.question_stem for ex in batch_samples]
        list_of_choices = [ex.choices for ex in batch_samples]
        labels = [ex.correct_idx for ex in batch_samples]

        # Flatten out question+choice pairs
        flattened_inputs = []
        for stem, choices in zip(stems, list_of_choices):
            for c in choices:
                flattened_inputs.append(stem + " " + c)

        # Tokenize
        tokenizer = OpenBookQADataset.tokenizer
        tokenized = tokenizer(
            flattened_inputs,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        # Reshape
        batch_size = len(batch_samples)
        num_choices = len(list_of_choices[0])  # typically 4
        for k in tokenized:
            tokenized[k] = tokenized[k].view(batch_size, num_choices, -1)

        tokenized["labels"] = torch.LongTensor(labels)
        return tokenized


In [8]:
def initialize_openbookqa_datasets(tokenizer):
    raw_data = load_dataset("openbookqa", "main")
    split_datasets = {}
    for split_name in raw_data.keys():
        split_data = list(raw_data[split_name])
        split_datasets[split_name] = OpenBookQADataset(tokenizer, split_data)
    return split_datasets

In [27]:
def compute_accuracy(preds, labels):
    return (preds == labels).float().mean()

from tqdm import tqdm

def train_one_epoch(model, dataloader, optimizer, epoch):
    model.train()
    all_preds, all_labels = [], []

    # Wrap your dataloader with tqdm to get a progress bar
    progress_bar = tqdm(dataloader, desc=f"Train Epoch {epoch}", leave=True)

    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].cuda()
        attn_mask = batch["attention_mask"].cuda()
        labels = batch["labels"].cuda()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attn_mask,
            labels=labels
        )
        loss = outputs.loss
        logits = outputs.logits  # [batch_size, num_choices]

        loss.backward()
        optimizer.step()

        # Predictions
        preds = torch.argmax(logits, dim=1).detach().cpu()
        labels_cpu = labels.detach().cpu()
        all_preds.extend(preds.tolist())
        all_labels.extend(labels_cpu.tolist())

        # Optionally, update the progress bar text
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    # Compute overall accuracy for the epoch
    accuracy = compute_accuracy(torch.tensor(all_preds), torch.tensor(all_labels))
    print(f"Train Epoch {epoch} - Loss: {loss.item():.4f} - Accuracy: {accuracy:.4f}")
    return accuracy


@torch.no_grad()
def evaluate(model, dataloader, split="Val"):
    model.eval()
    all_preds, all_labels = [], []
    for batch in dataloader:
        input_ids = batch["input_ids"].cuda()
        attn_mask = batch["attention_mask"].cuda()
        labels = batch["labels"].cuda()

        outputs = model(input_ids=input_ids, attention_mask=attn_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu()
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.cpu().tolist())

    accuracy = compute_accuracy(torch.tensor(all_preds), torch.tensor(all_labels))
    print(f"{split} Accuracy: {accuracy:.4f}")
    return accuracy.item()

In [34]:
torch.manual_seed(64)

def main():
    model_name = "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMultipleChoice.from_pretrained(model_name)
    model.cuda()

    # Training hyperparameters
    num_epochs = 3
    batch_size = 4
    learning_rate = 1e-5

    from torch.optim import AdamW
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Prepare data
    datasets = initialize_openbookqa_datasets(tokenizer)
    train_loader = DataLoader(
        datasets["train"], batch_size=batch_size, shuffle=True,
        collate_fn=OpenBookQADataset.collate_fn
    )
    val_loader = DataLoader(
        datasets["validation"], batch_size=batch_size, shuffle=False,
        collate_fn=OpenBookQADataset.collate_fn
    )
    test_loader = DataLoader(
        datasets["test"], batch_size=batch_size, shuffle=False,
        collate_fn=OpenBookQADataset.collate_fn
    )

    best_val_acc = 0.0
    for epoch in range(1, num_epochs+1):
        train_acc = train_one_epoch(model, train_loader, optimizer, epoch)
        val_acc = evaluate(model, val_loader, split="Val")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            model.save_pretrained("./checkpoints")

    # Evaluate best model on test set
    best_model = AutoModelForMultipleChoice.from_pretrained("./checkpoints").cuda()
    test_acc = evaluate(best_model, test_loader, split="Test")
    print("Final Test Acc:", test_acc)


In [35]:
main()

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Train Epoch 1: 100%|██████████| 1240/1240 [02:17<00:00,  9.05it/s, loss=0.1317]


Train Epoch 1 - Loss: 0.1317 - Accuracy: 0.4420
Val Accuracy: 0.5100


Train Epoch 2: 100%|██████████| 1240/1240 [02:16<00:00,  9.10it/s, loss=2.0023]


Train Epoch 2 - Loss: 2.0023 - Accuracy: 0.6147
Val Accuracy: 0.5580


Train Epoch 3: 100%|██████████| 1240/1240 [02:16<00:00,  9.10it/s, loss=1.8629]


Train Epoch 3 - Loss: 1.8629 - Accuracy: 0.7178
Val Accuracy: 0.5720
Test Accuracy: 0.5420
Final Test Acc: 0.5419999957084656
