# Install packages

In [1]:
!pip install datasets==2.14.6
!pip install transformers
!pip install evaluate
!pip install --no-cache-dir transformers sentencepiece
!pip install numpy==1.26.4

Collecting datasets==2.14.6
  Downloading datasets-2.14.6-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=8.0.0 (from datasets==2.14.6)
  Downloading pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.6)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting requests>=2.19.0 (from datasets==2.14.6)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets==2.14.6)
  Using cached xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets==2.14.6)
  Downloading multiprocess-0.70.18-py310-none-any.whl.metadata (7.5 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.14.6)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Collecting aiohttp (from datasets==2.14.6)
  Using cached aiohttp-3.11.18-cp310-cp310-manylinux_2_17_x86_64.m

In [1]:
!pip install accelerate -U

[31mERROR: Operation cancelled by user[0m[31m
[0m^C


# Imports

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForMultipleChoice, get_scheduler, TrainingArguments, Trainer

# from string import Template
# from pathlib import Path

import os

import warnings
warnings.simplefilter("ignore")

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from datasets import Dataset, DatasetDict

from torch.utils.data import DataLoader


from IPython.display import Markdown, display

  from .autonotebook import tqdm as notebook_tqdm


# Prepare training data

To access certain Language Model Models (LLMs) through the Hugging Face library, you may need to obtain an access token. You can acquire a token by signing up on the Hugging Face website and gaining permission to use the specific model you're interested in. 

The following cell demonstrates how to pass your access token in order to download the model and tokenizer. Put your access token in the `YOUR_HUGGING_FACE_TOKEN` variable.

In [4]:
from huggingface_hub import login
from dotenv import load_dotenv
import random

# Load variables from .env file into the environment
load_dotenv()

# Access token
hf_token = os.environ["HUGGINGFACE_TOKEN"]

# Log in to Hugging Face
login(token=hf_token)

# Pseudo-randomness
from transformers import set_seed
seed = 42
set_seed(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Here we determine the model we are using and the sub-task we are solving (Sentence Puzzle or Word Puzzle).

In [5]:
DATA_DIR = "../data"

# Select WP or SP
task = "SP"
# (Maybe) Change to roberta-large for better performance
model_name = "FacebookAI/roberta-base"

### Load Data

Loading data locally (modified from original notebook)

In [6]:
train_data = np.load(f"{DATA_DIR}/{task}-train.npy", allow_pickle=True)
test_data  = np.load(f"{DATA_DIR}/{task}_test_labeled.npy", allow_pickle=True)

### Make directory for our output

In [7]:
date_of_run = pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")

if '/' in model_name:
    # Split the model_name by "/"
    parts = model_name.split("/")
    
    # Check if there are at least 4 parts
    if len(parts) >= 5:
        # Concatenate the 3rd and 4th parts with an underscore
        model_suffix = parts[3] + "_" + parts[5]
    else:
        # model_suffix = model_name
        model_suffix = model_name.replace('/', '_')


run_dir = "./small_Mlt_" + task + "_" + model_suffix + "_" + date_of_run
print(run_dir)

# Create the directory if it does not exist
if not os.path.exists(run_dir):
    os.makedirs(run_dir)

os.chdir(run_dir)

./small_Mlt_SP_FacebookAI_roberta-base_2025_05_07_14_10


# Basic preprocessing

* Here we preprocess the data by splitting the data in Original, Scemantic Reconstruction and Context Reconstruction. 

* We then split the data into train, validation and test sets for each of the three types of data. This is done before shuffling in order to retain the same ids in the training, validation and test sets regarding the three types of data.

After that we concatenate the data and shuffle it in each of the three sets (Original, Scemantic, Context).


We create a test split of the given training data to evaluate the model on unseen data. This is done because of the absence of a test set in the dataset in the beginning of the competition.

In [8]:
def convert_from_numpy_to_dataset_type (numpy_array, split):
    data_list = numpy_array.tolist()
    df = pd.DataFrame(data_list)

    # display(df.head(1))
    if split == "train":
      df['id'] = df['id'].astype(str)      
      df['distractor1'] = df['distractor1'].astype(str)
      df['distractor2'] = df['distractor2'].astype(str)
      df['distractor(unsure)'] = df['distractor(unsure)'].astype(str)
      df['label'] = df['label'].astype(int)

    dataset = Dataset.from_pandas(df,  split=split)

    display(dataset[0])

    display(dataset.features) # just to check the type of the features

    return dataset

Importing the tokenizer in order to tokenize the data.


In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

## `preprocess_function` Overview

The preprocessing function described below takes input rows of our multiple-choice dataset.

### Steps:

1. **Combine Sentences:**
   - Replicate each `question` four times to create sentence starts for pairing with each `choice`.

2. **Extract Sentence Endings (`second_sentences`)**:
   - Flattens the list of choice lists to extract all possible sentence endings.

2. **Flatten and Tokenize:**
   - Tokenize the flattened sentences to obtain `input_ids` and `attention_mask`.

3. **Unflatten and Assign Labels:**
   - Unflatten the tokenized sequences into pairs of `(input_ids, attention_mask)` corresponding to `questions` and `choices`.


In [10]:
def preprocess_function(examples):

    first_sentences = [[context] * 4 for context in examples["question"]]
    first_sentences = sum(first_sentences, [])
    # print(first_sentences)
    # print()
    
    second_sentences = [item for item in examples["choice_list"]]
    second_sentences = sum(second_sentences, [])
    # print(second_sentences)

    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    # display(tokenized_examples)

    return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}

In [11]:
display(train_data[-1])

{'id': 'SP-208_CR',
 'question': "You are running so fast but you're not getting closer. Where are you?",
 'answer': 'Treadmill.',
 'distractor1': 'Country road.',
 'distractor2': 'High way.',
 'distractor(unsure)': 'None of above.',
 'label': 1,
 'choice_list': ['Country road.', 'Treadmill.', 'High way.', 'None of above.'],
 'choice_order': [1, 0, 2, 3]}

### Train data

In [12]:
train_dataset = convert_from_numpy_to_dataset_type(train_data, "train")

{'id': 'SP-0',
 'question': 'Mr. and Mrs. Mustard have six daughters and each daughter has one brother. But there are only 9 people in the family, how is that possible?',
 'answer': 'Each daughter shares the same brother.',
 'distractor1': 'Some daughters get married and have their own family.',
 'distractor2': 'Some brothers were not loved by family and moved away.',
 'distractor(unsure)': 'None of above.',
 'label': 1,
 'choice_list': ['Some daughters get married and have their own family.',
  'Each daughter shares the same brother.',
  'Some brothers were not loved by family and moved away.',
  'None of above.'],
 'choice_order': [1, 0, 2, 3]}

{'id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answer': Value(dtype='string', id=None),
 'distractor1': Value(dtype='string', id=None),
 'distractor2': Value(dtype='string', id=None),
 'distractor(unsure)': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'choice_list': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'choice_order': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

Here we are using `.map()` to apply the `preprocess` function to the dataset.

In [13]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)
print(f"Training set size: {len(tokenized_train)}")

Map: 100%|██████████| 507/507 [00:00<00:00, 2072.08 examples/s]

Training set size: 507





### Test data

In [14]:
def convert_from_numpy_to_dataset_test_type (numpy_array):
    data_list = numpy_array.tolist()
    df = pd.DataFrame(data_list)

    df = pd.DataFrame(data_list)
    df['id'] = df['id'].astype(str)      
    df['label'] = df['label'].astype(int)

    dataset = Dataset.from_pandas(df)

    display(dataset[0])

    display(dataset.features) # just to check the type of the features

    return dataset

In [15]:
test_dataset = convert_from_numpy_to_dataset_test_type(test_data)

{'id': 'SP-122_CR',
 'question': 'In a small village, two farmers are working in their fields - a diligent farmer and a lazy farmer. The hardworking farmer is the son of the lazy farmer, but the lazy farmer is not the father of the hardworking farmer. Can you explain this unusual relationship?',
 'choice_list': ['The lazy farmer is his mother.',
  'The lazy farmer is not a responsible father as he is lazy.',
  'The diligent farmer devoted himself to the farm and gradually forgot his father.',
  'None of above.'],
 'label': 0,
 'answer': 'The lazy farmer is his mother.'}

{'id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'choice_list': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None),
 'answer': Value(dtype='string', id=None)}

In [16]:
tokenized_test = test_dataset.map(preprocess_function, batched=True)
print(f"Training set size: {len(tokenized_test)}")

Map: 100%|██████████| 120/120 [00:00<00:00, 2783.68 examples/s]

Training set size: 120





## Splitting the dataset

### Train data

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
ori_original_dataset = tokenized_train.filter(lambda data: "_SR" not in data["id"] and "_CR" not in data["id"])
ori_scemantic_dataset = tokenized_train.filter(lambda data: "_SR" in data["id"]) # SR => Semantic Reconstruction	
ori_context_dataset = tokenized_train.filter(lambda data: "_CR" in data["id"]) # CR => Context Reconstruction

print(f"Original dataset size: {len(ori_original_dataset)}")
print(f"Semantic dataset size: {len(ori_scemantic_dataset)}")
print(f"Context dataset size: {len(ori_context_dataset)}")

Filter: 100%|██████████| 507/507 [00:00<00:00, 3711.42 examples/s]
Filter: 100%|██████████| 507/507 [00:00<00:00, 3532.66 examples/s]
Filter: 100%|██████████| 507/507 [00:00<00:00, 3611.14 examples/s]

Original dataset size: 169
Semantic dataset size: 169
Context dataset size: 169





In [19]:
def splitting_dataset(dataset, split_size):
    
    #split_size% test + validation
    train_testvalid = dataset.train_test_split(test_size=split_size, shuffle=False)
    
    # Split the rest test + valid in half test, half valid
    test_valid = train_testvalid["test"].train_test_split(test_size=0.5, shuffle=False)
    
    # gather everyone if you want to have a single DatasetDict
    datasets = DatasetDict({
        "train": train_testvalid["train"],
        "test": test_valid["test"],
        "valid": test_valid["train"]})
    
    return datasets


Here we are splitting the dataset into train, validation and test sets. **A good rule of thumb is to use 70% of the data for training, 15% for validation and 15% for testing.**

<u>**WE DO NOT WANT TO SHUFFLE THE DATASET BEFORE SPLITTING IT TO KEEP THE ORDER OF THE SENTENCES!!!**</u>

In [20]:
original_dataset = splitting_dataset(ori_original_dataset, 0.3)
scemantic_dataset = splitting_dataset(ori_scemantic_dataset, 0.3)
context_dataset = splitting_dataset(ori_context_dataset, 0.3)


In [21]:
from datasets import concatenate_datasets

assert original_dataset["train"].features.type == scemantic_dataset["train"].features.type
assert original_dataset["train"].features.type == context_dataset["train"].features.type
train_dataset = concatenate_datasets([original_dataset["train"], scemantic_dataset["train"], context_dataset["train"]])
# print(f"Training set size: {len(temp_train_dataset)}")
# print(temp_train_dataset)

assert original_dataset["valid"].features.type == scemantic_dataset["valid"].features.type
assert original_dataset["valid"].features.type == context_dataset["valid"].features.type
valid_dataset = concatenate_datasets([original_dataset["valid"], scemantic_dataset["valid"], context_dataset["valid"]])
# print(f"Validation set size: {len(valid_dataset)}")
# print(valid_dataset)

In [22]:
train_dataset = train_dataset.shuffle(seed=42)
valid_dataset = valid_dataset.shuffle(seed=42)

my_dataset = DatasetDict({
    "train": train_dataset,
    "valid": valid_dataset})

print(my_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'distractor1', 'distractor2', 'distractor(unsure)', 'label', 'choice_list', 'choice_order', 'input_ids', 'attention_mask'],
        num_rows: 354
    })
    valid: Dataset({
        features: ['id', 'question', 'answer', 'distractor1', 'distractor2', 'distractor(unsure)', 'label', 'choice_list', 'choice_order', 'input_ids', 'attention_mask'],
        num_rows: 75
    })
})


## Fine-tuning model

Transformers doesn’t have a data collator for multiple choice, so you’ll need to adapt the `DataCollatorWithPadding` to create a batch of examples. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [23]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

When called on a list of examples, it will flatten all the inputs/attentions masks etc. in big lists that it will pass to the `tokenizer.pad` method. This will return a dictionary with big tensors (of shape `(batch_size * 4) x seq_length`) that we then unflatten.

We can check this data collator works on a list of features, we just have to make sure to remove all features that are not inputs accepted by our model:

In [24]:
accepted_keys = ["input_ids", "attention_mask", "label", "labels"]
features = [
    {k: v for k, v in my_dataset["train"][i].items() if k in accepted_keys}
    for i in range(10)
]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Evaluate

Including a metric during training is often helpful for evaluating your model’s performance. or this task, we load the accuracy metric.

In [25]:
import evaluate

accuracy = evaluate.load("accuracy")

In [26]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## Train

First we need to preprocess the data for the trainer.


The `get_final_dataset` function modifies the input `dataset` by renaming the column "label" to "labels". 

* Depending on the value of `i` (either 1 or not 1), it removes specific columns from the dataset. It then sets the format of the dataset to "torch" and returns the modified dataset.

In [27]:
def get_final_dataset(dataset):
    tokenized_dataset = dataset.rename_column("label", "labels")
    tokenized_dataset = tokenized_dataset.remove_columns(['id', 'question', 'answer', 'distractor1', 'distractor2', 'distractor(unsure)', 'choice_list', 'choice_order'])
    tokenized_dataset.set_format("torch")
    return tokenized_dataset

In [28]:
tokenized_datasets = get_final_dataset(my_dataset)

We disable Weights & Biases. You'll need to apply an API key when prompted if you use it for tracking the training metrics.

In [29]:
os.environ["WANDB_DISABLED"] = "true"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"


### Here we are loading the model we are using for the task.

In [30]:
model = AutoModelForMultipleChoice.from_pretrained(model_name, ignore_mismatched_sizes=True)

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


We check for the availability of a CUDA-enabled GPU and assign the appropriate device and then we move our model to that device for computation.

In [31]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# print(device)
model.to(device)

RobertaForMultipleChoice(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

Here we are passing basic arguments to the `Trainer` class.
- **`batch_size`**: This parameter determines the number of examples (data points) processed in each iteration (or batch) during training.

- **`lr` (learning rate)**: This is the rate at which the model weights are updated during training.

- **`num_epochs`**: Specifies the number of times the training dataset will be iterated over by the model during training.

- **`num_training_steps`**: This calculates the total number of training steps that will be performed over the specified number of epochs.

- **`batches_per_epoch`**: This represents the number of batches (or iterations) that will be processed in each epoch.

In [32]:
batch_size = 4

lr=3e-5 

num_epochs = 3
# max_steps = 100

num_training_steps = (len(my_dataset["train"]) // batch_size) * num_epochs 
batches_per_epoch = len(my_dataset["train"]) // batch_size
# print(batches_per_epoch)

We are initializing optimizer and scheduler here.

In [33]:
# Optimizer initialization
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Learning rate scheduler initialization
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

Several arguments that we difine are the following:

- `output_dir`: The directory where model checkpoints and outputs will be saved.
- `logging_steps`: Log metrics every specified number of training steps.
- `logging_strategy`: Specify whether logging is done by "steps" or "epoch".
- `save_strategy`: Strategy for saving model checkpoints, either by "epoch" or "steps".
- `save_steps`: Save a model checkpoint every specified number of steps.
- `save_total_limit`: Maximum number of checkpoints to keep.
- `evaluation_strategy`: Strategy for evaluating the model during training.
- `eval_steps`: Evaluate the model every specified number of training steps.
- `report_to`: Where to report evaluation results, set to "none" to disable reporting.


In [34]:
import accelerate

training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    eval_steps=20,
    
#     logging_strategy="epoch", #Extra: to log training data stats for loss
    logging_steps=20,
    logging_strategy="steps",
    
    learning_rate=lr,
    num_train_epochs=num_epochs,
    # max_steps=100,

    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # warmup_steps=0,
    # weight_decay=0.01,
#     logging_dir="./logs",
    report_to=None,  # Set report_to to None to disable integrations
    save_strategy="steps",  # Set save_strategy to "no" to prevent saving model checkpoints
    save_steps=100,               # Save every 10 checkpoints
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    optimizers=(optimizer, lr_scheduler),  # Pass both optimizer and scheduler
    compute_metrics=compute_metrics
)

In [36]:
import numpy as np
print(np.__version__)
# Need 1.26.4!

1.26.4


Now we are ready to train our model!

In [37]:
# Training loop using Trainer API
print('training model {}...'.format(model_name))

train_result = trainer.train()

training model FacebookAI/roberta-base...


Step,Training Loss
20,1.4096
40,1.2166
60,0.9042
80,0.4989
100,0.3984
120,0.427
140,0.279
160,0.3229
180,0.2572
200,0.184


In [38]:
metrics = train_result.metrics
trainer.save_metrics("train", metrics)
trainer.save_state()

## Predict with fine-tuned model

##### Accuracy on each dataset (original, scemanic, context) by itself


In [39]:
def evaluate_accuracy(dataset, pred_list=False):
    total_answers = 0
    correct_answers = 0
    predictions = []
    
    model.eval()
    
    for i in dataset:
        prompt = i['question'].strip()
        candidates = i['choice_list']
        true_label_original = i['label']
        candidate_1, candidate_2, candidate_3, candidate_4 = candidates[0].strip(), candidates[1].strip(), candidates[2].strip(), candidates[3].strip()
        
        inputs = tokenizer([[prompt, candidate_1], [prompt, candidate_2], [prompt, candidate_3], [prompt, candidate_4]],
                           return_tensors="pt", padding=True).to("cuda")
        
        labels = torch.tensor(true_label_original).unsqueeze(0).to("cuda")  # Batch size 1
        
        # Pass the input through the model to obtain predictions
        with torch.no_grad():
            outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
        
        logits = outputs.logits
        predicted_class = logits.argmax().item()
        
        predictions.append(predicted_class)
        
        if predicted_class == true_label_original:
            correct_answers += 1        
        total_answers += 1
    
    # Calculate accuracy
    accuracy = correct_answers / total_answers
    
    # Round accuracy to three decimal places
    rounded_accuracy = round(accuracy, 3)
    print("Accuracy is", rounded_accuracy)

    if pred_list:
        return rounded_accuracy, predictions
    return rounded_accuracy

In [40]:
original_acc = evaluate_accuracy(original_dataset["test"])
scemantic_acc = evaluate_accuracy(scemantic_dataset["test"])
context_acc = evaluate_accuracy(context_dataset["test"])

Accuracy is 0.692
Accuracy is 0.654
Accuracy is 0.538


Here we implement the logic to calculate group-based accuracy. 

First we check that we have the same ids in all the datasets.

In [41]:
# Initialize a dictionary to store the results
id_is_substring = {}

for id1 in original_dataset["test"]['id']:
    # print(id1)
    id_is_substring[id1] = []
    for id2 in scemantic_dataset["test"]['id']:
        if str(id1) in str(id2):
            id_is_substring[id1].append(id2)
            
    for id3 in context_dataset["test"]['id']:
        if str(id1) in str(id3):
            id_is_substring[id1].append(id3)
    
# print(id_is_substring)

assert len(id_is_substring) == len(original_dataset["test"]['id'])

for key, value in id_is_substring.items():
    assert len(value) == 2

The function below will take a row of the given dataset and model and return all the information needed to calculate the accuracy of the model on that row.

In [42]:
def dataset_compute (row, model):
    
    prompt = row['question'][0].strip()
    candidates = row['choice_list'][0]
    true_label_original = row['label'][0]
    candidate_1, candidate_2, candidate_3, candidate_4 = candidates[0].strip(), candidates[1].strip(), candidates[2].strip(), candidates[3].strip()
    
    inputs = tokenizer([[prompt, candidate_1], [prompt, candidate_2], [prompt, candidate_3], [prompt, candidate_4]],
                       return_tensors="pt", padding=True).to("cuda")
    
    labels = torch.tensor(true_label_original).unsqueeze(0).to("cuda")  # Batch size 1
    
    # Pass the input through the model to obtain predictions
    with torch.no_grad():
        outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
    
    logits = outputs.logits
    predicted_class = logits.argmax().item()
    
    return prompt, candidates, true_label_original, predicted_class

Here based on the `group` number we will calculate the accuracy of the model on that group (Ori&Scem / Ori&Scem&Cont).

In [43]:
def group_accuracy(original_dataset, scemantic_dataset, context_dataset, model, num_groups=2):

    correct_predictions = {}
    wrong_predictions = {}
    total_correct = 0
    model.eval()  # Set the model to evaluation mode

    # Iterate over keys
    for i, key in enumerate(original_dataset['id']):
        
        ############################ original dataset ############################
        original_data = original_dataset.filter(lambda example: example['id'] == key)
        original_results = dataset_compute(original_data, model)
        
        ############################ semantic dataset ############################
        semantic_data = scemantic_dataset.filter(lambda example: example['id'] == key + '_SR')
        semantic_results = dataset_compute(semantic_data, model)
        
        if num_groups == 3:
        ############################ context dataset ############################
            context_data = context_dataset.filter(lambda example: example['id'] == key + '_CR')
            context_results = dataset_compute(context_data, model)
            
        # dataset_compute returns a tuple of (prompt, candidates, true_label, predicted_class)
        
        # check if the predicted class is equal to the true label
        is_original_correct = original_results[2] == original_results[3]
        is_semantic_correct = semantic_results[2] == semantic_results[3]
        is_context_correct = num_groups == 3 and context_results[2] == context_results[3]

        # if the predicted class is equal to the true label, add the results to the correct_predictions dictionary
        if num_groups == 2 and is_original_correct and is_semantic_correct:
            correct_predictions[key] = (original_results, semantic_results)
            total_correct += 1
        elif num_groups == 3 and is_original_correct and is_semantic_correct and is_context_correct:
            correct_predictions[key] = (original_results, semantic_results, context_results)
            total_correct += 1
        # otherwise, add the results to the wrong_predictions dictionary
        else:
            wrong_predictions[key] = (original_results, semantic_results)
            if num_groups == 3:
                wrong_predictions[key] += (context_results,)

        total_instances = i + 1
    accuracy = round(total_correct / total_instances, 3)
    if num_groups ==2:
        print("Accuracy Ori & Sem: {} -> {}/{}".format(round(total_correct / total_instances, 3), total_correct, total_instances))
    else:
        print("Accuracy Ori & Sem & Con: {} -> {}/{}".format(round(total_correct / total_instances, 3), total_correct, total_instances))
    
    return correct_predictions, wrong_predictions, accuracy


Function to display detailed results of the model on a specific group

In [44]:
# wrong_preds has num_groups tuples of (prompt, candidates, true_label, predicted_class)
def display_group_predictions(predictions):
    print("Number of predictions: {} \n".format(len(predictions)))
    for key, results in predictions.items():
        print(f"Key: {key}")
        for i, result in enumerate(results):
            print("  Dataset {}:".format(i + 1))
            print("    Prompt: {}".format(result[0]))
            # print("    Candidates: {}".format(result[1]))
            print("    True Label: {} -> {}".format(result[2], result[1][result[2]].strip()))
            print("    Predicted Class: {} -> {}".format(result[3], result[1][result[3]].strip()))


### Ori & Sem Accuracy


In [45]:
correct_preds, wrong_preds, ori_sem_accuracy = group_accuracy(original_dataset["test"], scemantic_dataset["test"], context_dataset["test"], model, num_groups=2)
display_group_predictions(wrong_preds)

Filter: 100%|██████████| 26/26 [00:00<00:00, 1426.34 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2606.65 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 3009.99 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2701.58 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2791.91 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2507.81 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2492.79 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2882.15 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2528.04 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2218.71 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2355.69 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2471.99 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2468.91 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2462.61 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2514.40 examples/s]
Filter: 100%|██████████| 

Accuracy Ori & Sem: 0.654 -> 17/26
Number of predictions: 9 

Key: SP-174
  Dataset 1:
    Prompt: A man pushes his car until he reaches a hotel. When he arrives, he realizes he's bankrupt. What happened?
    True Label: 2 -> He's playing Monopoly and he lands on a space with a hotel and doesn't have the money to pay the fee.
    Predicted Class: 1 -> The man was pushing his car because it had run out of gas.
  Dataset 2:
    Prompt: Until he arrives at a hotel, a man pushes his vehicle. He realizes he is bankrupt when he gets there. What took place?
    True Label: 1 -> He's playing Monopoly and he lands on a space with a hotel and doesn't have the money to pay the fee.
    Predicted Class: 2 -> The man was pushing his car because it had run out of gas.
Key: SP-175
  Dataset 1:
    Prompt: A woman enters a room and presses a button. Within seconds she instantly loses 20 pounds. How did she lose the weight?
    True Label: 0 -> The room is actually an elevator. When she gets inside to 

### Ori & Sem & Con Accuracy

In [46]:
correct_preds, wrong_preds, ori_sem_con_accuracy = group_accuracy(original_dataset["test"], scemantic_dataset["test"], context_dataset["test"], model, num_groups=3)
display_group_predictions(wrong_preds)

Filter: 100%|██████████| 26/26 [00:00<00:00, 2156.63 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2450.27 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 916.24 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 1566.03 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2558.58 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 1319.12 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2412.01 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2405.47 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2126.02 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 1671.09 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2979.48 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2556.24 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2527.57 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2818.46 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 2579.28 examples/s]
Filter: 100%|██████████| 2

Accuracy Ori & Sem & Con: 0.462 -> 12/26
Number of predictions: 14 

Key: SP-174
  Dataset 1:
    Prompt: A man pushes his car until he reaches a hotel. When he arrives, he realizes he's bankrupt. What happened?
    True Label: 2 -> He's playing Monopoly and he lands on a space with a hotel and doesn't have the money to pay the fee.
    Predicted Class: 1 -> The man was pushing his car because it had run out of gas.
  Dataset 2:
    Prompt: Until he arrives at a hotel, a man pushes his vehicle. He realizes he is bankrupt when he gets there. What took place?
    True Label: 1 -> He's playing Monopoly and he lands on a space with a hotel and doesn't have the money to pay the fee.
    Predicted Class: 2 -> The man was pushing his car because it had run out of gas.
  Dataset 3:
    Prompt: A tower stands in front of a horse, and the horse jumps over. Both the horse and the tower are fine. Why?
    True Label: 2 -> It is on the chessboard.
    Predicted Class: 2 -> It is on the chessboard.


## For the competion Try the Trained Model!

Here we handle the test set that is provided by the competition. We are following the same logic as above.

### Prepare test dataset

In [47]:
original_dataset = tokenized_test.filter(lambda data: "_SR" not in data["id"] and "_CR" not in data["id"])
scemantic_dataset = tokenized_test.filter(lambda data: "_SR" in data["id"]) # SR => Semantic Reconstruction	
context_dataset = tokenized_test.filter(lambda data: "_CR" in data["id"]) # CR => Context Reconstruction

print(f"Original dataset size: {len(original_dataset)}")
print(f"Semantic dataset size: {len(scemantic_dataset)}")
print(f"Context dataset size: {len(context_dataset)}")

Filter:   0%|          | 0/120 [00:00<?, ? examples/s]

Filter: 100%|██████████| 120/120 [00:00<00:00, 1854.47 examples/s]
Filter: 100%|██████████| 120/120 [00:00<00:00, 2838.10 examples/s]
Filter: 100%|██████████| 120/120 [00:00<00:00, 3002.45 examples/s]

Original dataset size: 40
Semantic dataset size: 40
Context dataset size: 40





In [48]:
# check that every id in original_dataset is also in scemantic_dataset and context_dataset
original_ids = [data["id"] for data in original_dataset]
scemantic_ids = [data["id"].split("_")[0] for data in scemantic_dataset]
context_ids = [data["id"].split("_")[0] for data in context_dataset]

print("Difference between original and context datasets {}".format(len(set(original_ids) - set(context_ids))))
print("Difference between scemantic and context datasets {}".format(len(set(scemantic_ids) - set(context_ids))))
print("Difference between original and scemantic datasets {}".format(len(set(original_ids) - set(scemantic_ids))))

Difference between original and context datasets 0
Difference between scemantic and context datasets 0
Difference between original and scemantic datasets 0


### Predict with fine-tuned model

##### Accuracy on each dataset (original, scemanic, context) by itself


In [49]:
test_set_original_acc = evaluate_accuracy(original_dataset)
test_set_scemantic_acc = evaluate_accuracy(scemantic_dataset)
test_set_context_acc = evaluate_accuracy(context_dataset)

Accuracy is 0.775
Accuracy is 0.75
Accuracy is 0.75


In [50]:
id_is_substring = {}

for id1 in original_dataset['id']:
    # print(id1)
    id_is_substring[id1] = []
    for id2 in scemantic_dataset['id']:
        if str(id1 +"_") in str(id2):
            id_is_substring[id1].append(id2)
            
    for id3 in context_dataset['id']:
        if str(id1+"_") in str(id3):
            id_is_substring[id1].append(id3)
    
# print(id_is_substring)

assert len(id_is_substring) == len(original_dataset['id'])

for key, value in id_is_substring.items():
    assert len(value) == 2

Function to produce detailed results of the model on a specific group

In [51]:
def display_group_predictions(predictions):
    output = "Number of predictions: {} \n\n".format(len(predictions))
    for key, results in predictions.items():
        output += "#"*120 + "\n"
        output += "Key: {}\n".format(key)
        for i, result in enumerate(results):
            output += "  Dataset {}:\n".format(i + 1)
            output += "    Prompt: {}\n".format(result[0])
            output += "    True Label: {} -> {}\n".format(result[2], result[1][result[2]].strip())
            output += "    Predicted Class: {} -> {}\n".format(result[3], result[1][result[3]].strip())
    

    output += "#"*120 + "\n"

    return output

### Ori & Sem Accuracy


In [52]:
correct_preds, wrong_preds, test_set_ori_sem_accuracy = group_accuracy(original_dataset, scemantic_dataset, context_dataset, model, num_groups=2)

Filter: 100%|██████████| 40/40 [00:00<00:00, 1537.73 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2340.18 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 1979.82 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2797.74 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2232.11 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 1610.14 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2400.72 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2684.23 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 1990.32 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2591.28 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2944.61 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2617.19 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 1347.51 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2662.67 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2653.70 examples/s]
Filter: 100%|██████████| 

Accuracy Ori & Sem: 0.725 -> 29/40


In [53]:
test_set_ori_sem_wrong_answers = display_group_predictions(wrong_preds)

### Ori & Sem & Con Accuracy

In [54]:
correct_preds, wrong_preds, test_set_ori_sem_con_accuracy = group_accuracy(original_dataset, scemantic_dataset, context_dataset, model, num_groups=3)

Filter:   0%|          | 0/40 [00:00<?, ? examples/s]

Filter: 100%|██████████| 40/40 [00:00<00:00, 1734.56 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2137.61 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2833.56 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 1203.44 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 1496.12 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 1551.59 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2608.93 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2215.87 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 1526.13 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2402.24 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2486.51 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 3144.21 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2498.62 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2049.73 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 2412.22 examples/s]
Filter: 100%|██████████| 

Accuracy Ori & Sem & Con: 0.675 -> 27/40


In [55]:
test_set_ori_sem_con_wrong_answers = display_group_predictions(wrong_preds)

Save information of mispredictions regarding group-based metric

In [56]:
# Define the directory path
results_dir = './results/'

# Create the directory if it does not exist
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

In [57]:
def save_to_text_file(content, filename):
    with open(filename, 'w') as file:
        file.write(content)

In [58]:
save_to_text_file(test_set_ori_sem_wrong_answers, './results/test_set_ori_sem_con_wrong.txt')
save_to_text_file(test_set_ori_sem_con_wrong_answers, './results/test_set_ori_sem_wrong.txt')

Gathering results to a json

In [59]:

df_res = pd.DataFrame(columns=['checkpoint', 'task',  'lr', 'batch_size', 'num_epochs', 'original_acc', 'scemantic_acc', 'context_acc', 'ori_sem_acc', 'ori_sem_con_acc', 'date_of_run'])

# Create a dictionary for the new row
new_row_data = {
    'checkpoint': [model_name],
    'task': [task],
    'lr': [lr],
    'batch_size': [batch_size],
    'num_epochs': [num_epochs],
    'original_acc': [original_acc],
    'scemantic_acc': [scemantic_acc],
    'context_acc': [context_acc],
    'ori_sem_acc': [ori_sem_accuracy],
    'ori_sem_con_acc': [ori_sem_con_accuracy],
    'date_of_run': pd.to_datetime('today').strftime("%Y_%m_%d_%H:%M")
}

# Append the new row to the DataFrame
df_temp = pd.DataFrame(new_row_data)

new_row_data = {
    'checkpoint': [model_name],
    'task': [task+"_test_set"],
    'lr': [lr],
    'batch_size': [batch_size],
    'num_epochs': [num_epochs],
    'original_acc': [test_set_original_acc],
    'scemantic_acc': [test_set_scemantic_acc],
    'context_acc': [test_set_context_acc],
    'ori_sem_acc': [test_set_ori_sem_accuracy],
    'ori_sem_con_acc': [test_set_ori_sem_con_accuracy],
    'date_of_run': pd.to_datetime('today').strftime("%Y_%m_%d_%H:%M")
}

# Append the new row to the DataFrame
df_res = pd.DataFrame(new_row_data)

# display(df_temp)
# df_temp.to_csv('./results/results.csv', index=False)


df_res = df_res._append(df_temp, ignore_index=False)
display(df_res)


csv_path = os.path.join(results_dir, 'results.csv')
df_temp.to_csv(csv_path, index=False)


Unnamed: 0,checkpoint,task,lr,batch_size,num_epochs,original_acc,scemantic_acc,context_acc,ori_sem_acc,ori_sem_con_acc,date_of_run
0,FacebookAI/roberta-base,SP_test_set,3e-05,4,3,0.775,0.75,0.75,0.725,0.675,2025_05_07_14:22
0,FacebookAI/roberta-base,SP,3e-05,4,3,0.692,0.654,0.538,0.654,0.462,2025_05_07_14:22


### Create submission

Here we only keep ordered the predictions of the model in order to create the submission file.

In [60]:
acc, test_all_answers = evaluate_accuracy(tokenized_test, pred_list=True)
print(test_all_answers)

Accuracy is 0.758
[0, 2, 2, 1, 3, 0, 0, 3, 3, 2, 1, 0, 2, 0, 0, 1, 0, 2, 2, 0, 0, 1, 1, 0, 0, 3, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 3, 0, 2, 0, 1, 1, 0, 0, 3, 2, 2, 2, 0, 2, 0, 0, 3, 1, 2, 0, 2, 0, 1, 1, 3, 1, 1, 1, 3, 3, 2, 1, 2, 0, 3, 0, 2, 2, 1, 2, 3, 3, 1, 0, 3, 3, 3, 0, 3, 1, 3, 0, 3, 2, 3, 0, 3, 0, 2, 3, 1, 2, 0, 2, 1, 0, 0, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 3, 3, 3, 2, 3, 1, 2]


In [61]:
def save_answers_to_file(filename, predictions):
    with open(filename, 'w') as file:
        for predicted_class in predictions:
            file.write(f'{predicted_class}\n')


In [62]:
if task == "SP":
    directory = './submission'
    os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
    save_answers_to_file(os.path.join(directory, 'answer_sen_'+ pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")+ '.txt'), test_all_answers)

if task == "WP":
    directory = './submission'
    os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
    save_answers_to_file(os.path.join(directory, 'answer_word_'+ pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")+ '.txt'), test_all_answers)

##### Save model

In [63]:
check = model_name[:model_name.find('/')]

model.save_pretrained('./models/{}_{}_{}'.format(task, check, pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")))