If you're opening this notebook on Colab, you will probably need to install 🤗 Transformers and 🤗 Datasets. Run the following cell to install them.

In [None]:
!pip install datasets transformers sentencepiece

# import os, sys
# from google.colab import drive
# drive.mount('/content/drive')
# nb_path = '/content/notebooks'
# os.symlink('/content/drive/My Drive/Colab Notebooks', nb_path)
# sys.path.insert(0,nb_path)

# !pip install --target=$nb_path datasets transformers huggingface_hub

In [None]:
import os
from getpass import getpass

from huggingface_hub import notebook_login, login

# SECURITY: never hardcode an access token in a notebook — it is saved with the
# file and leaks to anyone who can read it. The token is taken from the
# HUGGINGFACE_TOKEN environment variable when set, otherwise it is prompted for
# without echoing.
# NOTE(review): a real-looking token was previously committed in this cell;
# revoke it in the Hugging Face account settings.
api_token = os.environ.get('HUGGINGFACE_TOKEN') or getpass(
    "Hugging Face API token (leave blank to use the login widget): "
)

if api_token:
    # Keep the variable set for later cells/processes that read it.
    os.environ['HUGGINGFACE_TOKEN'] = api_token
    login(token=api_token)
else:
    # No token supplied: fall back to the interactive notebook widget.
    notebook_login()

In [None]:
# Install git-lfs with apt (shell command run through IPython's `!`).
# NOTE(review): presumably required for the Hub uploads requested later via
# push_to_hub — confirm.
!apt install git-lfs

In [None]:
import transformers

# Print the installed 🤗 Transformers version so the run environment is recorded.
print(transformers.__version__)

In [None]:
from transformers.utils import send_example_telemetry

# Call the library's example-telemetry helper, identifying this notebook by name
# and the framework in use.
send_example_telemetry("multiple_choice_notebook", framework="pytorch")

In [None]:
# Run configuration shared by the cells below.
model_checkpoint = "allenai/unifiedqa-t5-small"  # passed to the tokenizer/model from_pretrained calls
dataset_name = "derek-thomas/ScienceQA"  # dataset identifier passed to load_dataset
batch_size = 16  # per-device train and eval batch size in TrainingArguments
num_choices = 5  # number of `ending{i}` columns convert_example creates per example

In [None]:
# Install 🤗 Datasets. Redundant when the install cell at the top of the notebook
# has been run, since that cell already installs `datasets`.
!pip3 install datasets

In [None]:
from datasets import load_dataset

# `load_metric` is not used by the cells shown here and has been removed from
# recent `datasets` releases; a hard import would make this cell fail on a fresh
# (unpinned) install. Keep the name defined for any cell that still expects it.
try:
    from datasets import load_metric
except ImportError:
    load_metric = None

In [None]:
# Load the dataset named in the config cell, requesting its "regular" configuration.
downloaded = load_dataset(dataset_name, "regular")

In [None]:
# Bare expression: display the loaded dataset object as the cell output.
downloaded

In [None]:
def show_one(example):
    """Pretty-print one converted multiple-choice example.

    Prints the context (``sent1``), then one lettered line per ``ending{i}``
    key present in ``example`` (``ending0``, ``ending1``, ... until the first
    missing index), and finally the ground-truth option.

    Args:
        example: mapping with keys ``sent1``, ``sent2``, ``label`` and
            consecutive ``ending{i}`` keys.

    Notes:
        * The number of options is discovered from the example instead of
          being fixed at four, so examples carrying five endings (and labels
          up to 4) are displayed instead of raising ``IndexError``.
        * A missing/empty ``sent2`` header is omitted rather than printed as
          the literal text ``None``.
        * Empty endings (e.g. ``''`` padding slots) are skipped; letters stay
          tied to the slot index so the ground-truth letter remains correct.
    """
    print(f"Context: {example['sent1']}")

    header = example['sent2']
    slot = 0
    while f"ending{slot}" in example:
        ending = example[f"ending{slot}"]
        if ending:
            letter = chr(ord('A') + slot)
            text = f"{header} {ending}" if header else f"{ending}"
            print(f"  {letter} - {text}")
        slot += 1

    label = example['label']
    if isinstance(label, int) and 0 <= label < slot:
        shown = chr(ord('A') + label)
    else:
        # Out-of-range or non-integer label: show it verbatim instead of failing.
        shown = label
    print(f"\nGround truth: option {shown}")

In [None]:
import sentencepiece

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Sanity check: tokenize a pair of sentences and display the resulting encoding.
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")

{'input_ids': [8774, 6, 48, 80, 7142, 55, 1, 275, 48, 7142, 1550, 28, 34, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
def convert_example(example):
    """Re-shape one raw record into the flat multiple-choice layout.

    Reads ``question``, ``answer`` and ``choices`` from ``example`` and returns
    a new dict with:

    * ``startphrase`` and ``sent1``: the question text,
    * ``label``: the answer value,
    * ``ending0`` .. ``ending{num_choices - 1}``: the choices in order, with
      ``''`` filling slots beyond the available choices (choices past
      ``num_choices`` are not copied),
    * ``video-id``, ``fold-ind``, ``sent2``, ``gold-source``: placeholders set
      to ``None``.

    The slot count comes from the module-level ``num_choices``.
    """

    def _choice_or_blank(slot):
        # Return the choice stored at `slot`, or '' when the record has fewer choices.
        options = example['choices']
        return options[slot] if slot < len(options) else ''

    converted = {
        'startphrase': example['question'],
        'sent1': example['question'],
        'label': example['answer'],
    }
    converted.update(
        (f'ending{slot}', _choice_or_blank(slot)) for slot in range(num_choices)
    )

    # Placeholder fields that have no counterpart in the raw record.
    for placeholder in ('video-id', 'fold-ind', 'sent2', 'gold-source'):
        converted[placeholder] = None

    return converted

# Apply convert_example to every split and drop the original columns (taken from
# the train split's column names), so only the converted fields remain.
datasets = downloaded.map(convert_example, remove_columns = downloaded["train"].column_names) # convert_dataset(datasets)



In [None]:
# One column name per answer slot. Derived from `num_choices` so it always
# matches the `ending{i}` columns that convert_example creates; a fixed list of
# four names would drop `ending4` and leave label 4 without a candidate.
ending_names = [f"ending{i}" for i in range(num_choices)]

def preprocess_function(examples):
    """Tokenize a batch of multiple-choice examples.

    Args:
        examples: batch in column form; reads ``sent1``, ``sent2`` and every
            column listed in ``ending_names``.

    Returns:
        dict mapping each tokenizer output key to a list with one entry per
        example, each entry being a list of ``len(ending_names)`` encodings
        (one per candidate answer).
    """
    n_options = len(ending_names)

    # Repeat each context once per candidate so the two lists line up pairwise.
    first_sentences = [
        context for context in examples["sent1"] for _ in range(n_options)
    ]

    # Build the candidate texts, already flattened in (example, option) order.
    # A missing/empty header is omitted: formatting None into the f-string
    # would otherwise prefix every candidate with the literal text "None".
    question_headers = examples["sent2"]
    second_sentences = [
        f"{header} {examples[end][i]}" if header else examples[end][i]
        for i, header in enumerate(question_headers)
        for end in ending_names
    ]

    # Tokenize all (context, candidate) pairs in one call.
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Un-flatten: regroup the encodings into chunks of n_options per example.
    return {
        k: [v[i:i + n_options] for i in range(0, len(v), n_options)]
        for k, v in tokenized_examples.items()
    }

In [None]:
# Try the preprocessing on the first five training examples and print:
# number of examples, number of choices for the first example, and the token
# count of each of that example's choices.
examples = datasets["train"][:5]
features = preprocess_function(examples)
print(len(features["input_ids"]), len(features["input_ids"][0]), [len(x) for x in features["input_ids"][0]])

In [None]:
# Decode the first four encoded choices of one preprocessed example back to text
# to check them by eye.
idx = 3
[tokenizer.decode(features["input_ids"][idx][i]) for i in range(4)]

In [None]:
# Print training example 3 in readable form, for comparison with the decoded output.
show_one(datasets["train"][3])

In [None]:
# Tokenize every split; batched=True hands preprocess_function batches of examples
# (column form) rather than single rows.
encoded_datasets = datasets.map(preprocess_function, batched=True)

In [None]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

# Load the configured checkpoint with a multiple-choice head.
# NOTE(review): model_checkpoint is a T5 checkpoint; AutoModelForMultipleChoice
# may not provide a T5 implementation — confirm this call succeeds.
model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

In [None]:
# Use the last path component of the checkpoint id to name the output directory.
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-scienceqa",  # output directory
    evaluation_strategy = "epoch",  # evaluate at the end of every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=True,  # upload to the Hugging Face Hub; needs the login done earlier
)

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each incoming feature holds, per tokenizer key, one sequence per candidate
    answer, plus a ``label`` (or ``labels``) entry. The collator flattens all
    candidates of all examples, pads them together with ``tokenizer.pad``,
    reshapes every tensor to ``(batch_size, num_choices, -1)`` and attaches the
    labels as an int64 tensor under ``"labels"``.

    The input feature dicts are left unmodified, so the same list can be
    collated more than once.
    """

    # Tokenizer whose `pad` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch."""
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read (rather than pop) the labels so the caller's dicts are not mutated.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # Flatten to one dict per (example, choice), in example-major order,
        # leaving the label out of the dicts handed to the tokenizer.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Build a small batch by hand: keep only the keys the collator is given here,
# take the first ten encoded training examples, and collate them.
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in encoded_datasets["train"][i].items() if k in accepted_keys} for i in range(10)]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

In [None]:
# Decode the first four choices of batch element 8 to check the collated ids by eye.
[tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(4)]

In [None]:
# Print training example 8 in readable form, for comparison with the decoded batch entry.
show_one(datasets["train"][8])

In [None]:
import numpy as np

def compute_metrics(eval_predictions):
    """Compute accuracy from an evaluation ``(predictions, label_ids)`` pair.

    The predicted class of each row is the index of its largest score along
    axis 1. Returns ``{"accuracy": value}`` where ``value`` is the fraction of
    rows whose predicted class equals the label, as a Python float.
    """
    scores, gold = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [None]:
# Assemble the Trainer from the model, the training arguments, the encoded
# train/validation splits, the multiple-choice collator and the accuracy metric.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the settings configured in `args`.
trainer.train()

In [None]:
# Upload the result to the Hugging Face Hub (relies on the login performed earlier).
trainer.push_to_hub()