In [1]:
from dataclasses import dataclass
from typing import List, Tuple

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizerFast, AutoTokenizer
from datasets import load_dataset
from transformers import RobertaTokenizer
import evaluate
from torch.optim import AdamW
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Global variables
TEST_DATASET_LENGTH = 100
VAL_DATASET_LENGTH = TEST_DATASET_LENGTH + 100

USE_SMALL_DATASET = True
BATCH_SIZE = 12

In [3]:
# load dataset from datasets/strategyqa_train_filtered.json
# dataset = load_dataset("json", data_files={"train": "../datasets/strategyqa_train_filtered.json", "test": "../datasets/strategyqa_test.json"})
dataset = load_dataset("json", data_files={"train": "../datasets/strategyqa_train_filtered.json"})
print(dataset)
# initialize training, validation, and testing dataset
train_dataset = dataset['train'].select(indices=range(len(dataset['train']) - VAL_DATASET_LENGTH))
val_dataset = dataset['train'].select(indices=range(len(dataset['train']) - VAL_DATASET_LENGTH, len(dataset['train']) - TEST_DATASET_LENGTH))
test_dataset = dataset['train'].select(indices=range(len(dataset['train']) - TEST_DATASET_LENGTH, len(dataset['train'])))
if USE_SMALL_DATASET:
    train_dataset = train_dataset.select(range(100)) # we use the first 100 entries to test the code
    val_dataset = val_dataset.select(range(100)) # we use the first 100 entries to test the code
    test_dataset = test_dataset.select(range(100)) # we use the first 100 entries to test the code
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))
print(train_dataset[0])
print(val_dataset[0])
print(test_dataset[0])



DatasetDict({
    train: Dataset({
        features: ['qid', 'term', 'description', 'question', 'answer'],
        num_rows: 2821
    })
})
100
100
100
{'qid': '872', 'term': 'Swastika', 'description': 'a geometrical figure and an ancient religious icon in the cultures of Eurasia and 20th-century symbol of Nazism', 'question': 'Did the Hopi Indians use a symbol that was similar to the swastika?', 'answer': True}
{'qid': '8857', 'term': 'Tonsillitis', 'description': 'Inflammation of the tonsils', 'question': 'Would someone with Tonsillitis be uncomfortable at a party?', 'answer': True}
{'qid': '9032', 'term': 'Cactus', 'description': 'Family of mostly succulent plants, adapted to dry environments', 'question': ' Is cactus fruit an important menu item for a restaurant themed around Cuauhtémoc?', 'answer': True}


In [4]:
# load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# tokenize the dataset
def tokenize_function(batch, tokenizer=tokenizer, field_name="question"):
    return tokenizer(batch[field_name], padding="max_length", truncation=True)

def add_labels(tokenized_dataset):
    tokenized_dataset["labels"] = 1 if tokenized_dataset["answer"] else 0 # Assuming "answer" exists
    print(tokenized_dataset["labels"], tokenized_dataset["answer"], tokenized_dataset["question"])
    return tokenized_dataset
# load training dataset


# tokenize the datasets
tokenized_datasets = {}
tokenized_datasets["train"] = train_dataset.map(tokenize_function, batched=True).map(add_labels)
tokenized_datasets["val"] = val_dataset.map(tokenize_function, batched=True).map(add_labels)
tokenized_datasets["test"] = test_dataset.map(tokenize_function, batched=True).map(add_labels)
print(tokenized_datasets["train"][0]["labels"])
print(tokenized_datasets["train"].column_names)
# print(tokenized_datasets["train"][98]["labels"], tokenized_datasets["train"][98]["question"])
for i in range(TEST_DATASET_LENGTH):
    print(i, tokenized_datasets["val"][i]["labels"], tokenized_datasets["val"][i]["question"])


1
['qid', 'term', 'description', 'question', 'answer', 'input_ids', 'attention_mask', 'labels']
0 1 Would someone with Tonsillitis be uncomfortable at a party?
1 1 Do popcorn and pretzels use different kinds of salt?
2 0 Is World of Warcraft typically played on the Nintendo Switch?
3 0 Were all of the performers at Aretha Franklin's funeral black?
4 1 Does a Jewish holiday include a tradition regarding Elijah?
5 0 Is DDR the most hyped game for Playstation 4?
6 0 Is DDR the most anticipated game for Playstation 4?
7 0 Could the Playstation 4 play Super Smash Brothers?
8 0 Is the New Testament taught in most kindergarten classes?
9 0 Is the New Testament taught in kindergarten classes?
10 1 Would intelligent design be inappropriate to teach in public school?
11 0 Is Bucharest located south of the Equator?
12 1 Is Mercedes-Benz a car company from the Eastern Hemisphere?
13 0 Was Gandalf involved in the defeat of the Witch King?
14 0 Was Gandalf present at the death of Sauron?
15 0 Do com

In [5]:
# %pip install evaluate

In [6]:
# check if GPU is available
# ! nvidia-smi
# ! nvcc --version
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
print(torch.cuda.current_device())

2.6.0+cu126
12.6
True
1
NVIDIA GeForce RTX 3060 Laptop GPU
0


In [7]:
# ! pip install torch torchvision torchaudio accelerate>=0.26.0


In [8]:
def train_one_epoch(model, dataloader, optimizer, epoch):
    model.train()

    with tqdm(dataloader, desc=f"Train Ep {epoch}", total=len(dataloader)) as tq:
        for batch in tq:
            # TODO: retrieve the data from your batch and send it to the same device as your model (i.e., model.device).
            # Hint: model.device should point to 'cuda' as you set it as such in the main function below.
            #       However, please use `model.device` and don't hard code it to 'cuda' as the auto-grader will put the model on CPU.
            # text_encoding = {key: val.to(model.device) for key, val in batch.items() if key != "labels"}
            input_ids = batch["text_encoding"]["input_ids"].to(model.device)
            attention_mask = batch["text_encoding"]["attention_mask"].to(model.device)
            label_encoding = batch["label_encoding"].to(model.device)

            # TODO: Compute loss by running model with text_encoding and label_encoding.
            output = model(input_ids=input_ids, attention_mask=attention_mask, labels=label_encoding)
            loss = output.loss

            # TODO: compute gradients and update parameters using optimizer.
            # Hint: you need three lines of code here!
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            tq.set_postfix({"loss": loss.detach().item()}) # for printing better-looking progress bar

def compute_metrics(eval_pred):
    
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Convert logits to class labels
    print("labels")
    print(labels)
    # print("logits")
    # for logit in logits:
    #     print([f"{value:.2f}" for value in logit])
    print("predictions")
    print(predictions)
    print()
    for i in range(len(predictions)):
        print(f"Prediction: {predictions[i]} | Label: {labels[i]} | Sentence: {tokenized_datasets["test"][i]["question"]}")
    return {"accuracy": (predictions == labels).mean()}

In [9]:

# training

learning_rate = 5e-5
num_train_epochs = 5


model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

model = model.cuda()


optimizer = AdamW(model.parameters(), lr=learning_rate)

training_args = TrainingArguments("results", 
                                num_train_epochs=num_train_epochs, 
                                per_device_train_batch_size=BATCH_SIZE, 
                                per_device_eval_batch_size=BATCH_SIZE, 
                                logging_dir= 'logs', 
                                logging_steps=10, 
                                evaluation_strategy= "epoch")

metric = evaluate.load("accuracy")

trainer = Trainer(model=model, 
                args=training_args, 
                train_dataset=tokenized_datasets["train"], 
                eval_dataset=tokenized_datasets["val"], 
                compute_metrics=compute_metrics)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# This cell clears GPU memory, do this when GPU out of memory

# from numba import cuda
import gc
gc.collect()
# torch.cuda.empty_cache()

87

Training part

In [11]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.304439,0.54
2,0.515600,1.410735,0.54
3,0.318000,1.85868,0.55
4,0.232100,2.184089,0.57
5,0.113100,2.305369,0.57


labels
[1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1
 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
predictions
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

Prediction: 1 | Label: 1 | Sentence:  Is cactus fruit an important menu item for a restaurant themed around Cuauhtémoc?
Prediction: 1 | Label: 1 | Sentence:  Is cactus fruit an important menu item for a restaurant inspired by Cuauhtémoc?
Prediction: 1 | Label: 0 | Sentence: Are Douglas firs native to the Mojave?
Prediction: 1 | Label: 0 | Sentence: Is Orange County near the border with Canada?
Prediction: 1 | Label: 1 | Sentence: Is June associated with a patriotic American symbol?
Prediction: 1 | Label: 0 | Sentence: Is June associated with a seasonal change?
Predi

TrainOutput(global_step=45, training_loss=0.27257751491334703, metrics={'train_runtime': 44.0467, 'train_samples_per_second': 11.352, 'train_steps_per_second': 1.022, 'total_flos': 131555527680000.0, 'train_loss': 0.27257751491334703, 'epoch': 5.0})

In [12]:
# if loading from a checkpoint, set load_model to True

load_model = False
if load_model:
    checkpoint_path = "./results/checkpoint-first"

    # Load model from a specific checkpoint
    model = RobertaForSequenceClassification.from_pretrained(checkpoint_path)

    trainer = Trainer(model=model, 
                    args=training_args, 
                    train_dataset=tokenized_datasets["train"], 
                    eval_dataset=tokenized_datasets["val"], 
                    compute_metrics=compute_metrics)

trainer.evaluate(tokenized_datasets["test"])



labels
[1 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
predictions
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

Prediction: 1 | Label: 1 | Sentence:  Is cactus fruit an important menu item for a restaurant themed around Cuauhtémoc?
Prediction: 1 | Label: 1 | Sentence:  Is cactus fruit an important menu item for a restaurant inspired by Cuauhtémoc?
Prediction: 1 | Label: 0 | Sentence: Are Douglas firs native to the Mojave?
Prediction: 1 | Label: 0 | Sentence: Is Orange County near the border with Canada?
Prediction: 1 | Label: 1 | Sentence: Is June associated with a patriotic American symbol?
Prediction: 1 | Label: 1 | Sentence: Is June associated with a seasonal change?
Predi

{'eval_loss': 1.7730556726455688,
 'eval_accuracy': 0.61,
 'eval_runtime': 1.9929,
 'eval_samples_per_second': 50.177,
 'eval_steps_per_second': 4.516,
 'epoch': 5.0}

In [13]:
def predict_factually_correct(input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move inputs to GPU if available
    inputs = {key: value.cuda() for key, value in inputs.items()} if torch.cuda.is_available() else inputs

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the logits and apply softmax to get probabilities
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Get the predicted class (index of the highest probability)
    predicted_class = torch.argmax(probabilities, dim=-1).item()

    # Map predicted class to "correct" or "incorrect"
    if predicted_class == 1:
        return "Factually Correct"
    else:
        return "Factually Incorrect"


In [None]:
# we can now use the model to predict the factuality of a given sentence, go play with it!
user_input = "Was the KGB responsible for Lincoln's assassination?"
prediction = predict_factually_correct(user_input)
print(f"The sentence is: {prediction}")

The sentence is: Factually Correct
