In [None]:
# Mount Google Drive at /content/drive; the project folder used by the
# following cells lives underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Project folder on the mounted Google Drive. Later cells read the cleaned
# CSVs and the fine-tuned checkpoint from here and copy results back into it.
drive_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project"
print("Files in Google Drive:")
# IPython shell escape: {drive_path} is interpolated before the command runs;
# the surrounding quotes keep the spaces in the path as a single argument.
!ls "{drive_path}"

Files in Google Drive:
boolq_finetuned_bert_best	   boolq_test_predictions_epoch3.csv  train_df_clean.csv
boolq_finetuned_bert_best_current  dev_df_clean.csv
boolq_test_predictions.csv	   test_df_clean.csv


In [None]:
# Import libraries
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

In [None]:
# Read the three cleaned BoolQ splits from the project folder on Google Drive.
drive_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project"
train_df_clean, dev_df_clean, test_df_clean = (
    pd.read_csv(os.path.join(drive_path, csv_name))
    for csv_name in ("train_df_clean.csv", "dev_df_clean.csv", "test_df_clean.csv")
)

# Report (rows, columns) of every split as a quick sanity check.
print("Loaded cleaned DataFrames:")
print(
    *(
        f"{heading} {frame.shape!s}"
        for heading, frame in (
            ("Train shape:", train_df_clean),
            ("Dev shape:", dev_df_clean),
            ("Test shape:", test_df_clean),
        )
    ),
    sep="\n",
)

# Tokenizer shared by the BERT datasets built further down in this cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Define the BoolQDataset class
class BoolQDataset(Dataset):
    """Torch ``Dataset`` that tokenizes BoolQ (question, passage) pairs on demand.

    Every item is a dict holding ``input_ids``, ``attention_mask`` and
    ``labels`` tensors.

    Args:
        dataframe: pandas DataFrame with ``question`` and ``passage`` columns
            and, unless ``is_test`` is true, the column named by ``label_column``.
        tokenizer: Callable invoked as ``tokenizer(question, passage, ...)``
            that returns a mapping containing ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Length handed to the tokenizer for padding and truncation.
        label_column: Name of the column holding the yes/no answer.
        is_test: When true the label column is never read and every item
            carries a dummy label of 0.
    """

    # Accepted spellings (case-insensitive, surrounding whitespace ignored) for
    # labels that arrive as text, e.g. a CSV column that was not parsed as bool.
    _TRUE_STRINGS = frozenset({'true', '1', 'yes'})
    _FALSE_STRINGS = frozenset({'false', '0', 'no'})

    def __init__(self, dataframe, tokenizer, max_length=512, label_column='answer', is_test=False):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_column = label_column
        self.is_test = is_test  # Flag to indicate if this is the test set

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    @classmethod
    def _to_label(cls, value):
        """Convert one raw label cell to 0 or 1.

        Booleans and numbers keep their truthiness. Text is matched against the
        accepted spellings instead of being treated as truthy, so the string
        ``"False"`` maps to 0 rather than 1.

        Raises:
            ValueError: If the label is missing (``None``/NaN) or is text that
                is not a recognized true/false spelling.
        """
        if isinstance(value, str):
            text = value.strip().lower()
            if text in cls._TRUE_STRINGS:
                return 1
            if text in cls._FALSE_STRINGS:
                return 0
            raise ValueError(f"Unrecognized label value: {value!r}")
        # NaN is the only float that differs from itself; it is truthy, so
        # without this guard a missing label would silently become class 1.
        if value is None or (isinstance(value, float) and value != value):
            raise ValueError("Missing label value")
        return 1 if value else 0

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (flattened to 1-D)
            and ``labels`` (scalar ``torch.long`` tensor).
        """
        row = self.data.iloc[idx]  # single positional lookup reused below
        question = str(row['question'])
        passage = str(row['passage'])

        # Handle labels (only for train/dev, not for test)
        if self.is_test:
            label = 0  # Dummy label for test set (not used)
        else:
            label = self._to_label(row[self.label_column])
        label_tensor = torch.tensor(label, dtype=torch.long)

        encoding = self.tokenizer(
            question,
            passage,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation='longest_first',
            return_tensors='pt'
        )

        return {
            # flatten() drops the leading batch dimension of the encoding so
            # the DataLoader can stack items into (batch, max_length) tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label_tensor
        }

# Create datasets. The tokenizer loaded earlier in this cell is reused as-is;
# calling `BertTokenizer.from_pretrained` a second time would only repeat the
# same load and rebind the same name.
label_column = 'answer'  # Confirmed as the correct column name
train_dataset = BoolQDataset(train_df_clean, tokenizer, label_column=label_column)
dev_dataset = BoolQDataset(dev_df_clean, tokenizer, label_column=label_column)
# The test split has no answer column, so its items carry a dummy label.
test_dataset = BoolQDataset(test_df_clean, tokenizer, label_column=label_column, is_test=True)

# Create DataLoaders: training batches are drawn in random order, while dev and
# test keep DataFrame row order so predictions can be matched back to rows.
batch_size = 8
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
dev_loader = DataLoader(dev_dataset, sampler=SequentialSampler(dev_dataset), batch_size=batch_size)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

print(f"Train batches: {len(train_loader)}")
print(f"Dev batches: {len(dev_loader)}")
print(f"Test batches: {len(test_loader)}")

Loaded cleaned DataFrames:
Train shape: (9427, 4)
Dev shape: (3270, 4)
Test shape: (3245, 3)
Train batches: 1179
Dev batches: 409
Test batches: 406


In [None]:
# Copy the fine-tuned checkpoint directory from Google Drive to the Colab
# runtime's local disk; the model is then loaded from the local copy.
drive_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project"
# Shell escape with {drive_path} interpolated; quotes protect the spaces in the path.
!cp -r "{drive_path}/boolq_finetuned_bert_best" /content/

# Verify the model files
print("Model files in /content/boolq_finetuned_bert_best:")
!ls /content/boolq_finetuned_bert_best

# Load the model
# NOTE(review): this re-imports a class that the notebook's import cell already
# brings in; harmless, but the import cell is the better home for it.
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('/content/boolq_finetuned_bert_best')
# Use the GPU when one is available, otherwise fall back to CPU. `device` is
# reused by the evaluation, prediction and RoBERTa cells further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("BERT model loaded successfully.")

Model files in /content/boolq_finetuned_bert_best:
config.json  model.safetensors	special_tokens_map.json  tokenizer_config.json	vocab.txt
BERT model loaded successfully.


In [None]:
# Show which columns each split provides.
print(
    *(
        f"{heading} {frame.columns!s}"
        for heading, frame in (
            ("Train DataFrame columns:", train_df_clean),
            ("Dev DataFrame columns:", dev_df_clean),
            ("Test DataFrame columns:", test_df_clean),
        )
    ),
    sep="\n",
)

# Show the first rows of each split, each preceded by a blank line and a heading.
print(
    *(
        f"{heading} {frame.head()!s}"
        for heading, frame in (
            ("\nTrain DataFrame sample:\n", train_df_clean),
            ("\nDev DataFrame sample:\n", dev_df_clean),
            ("\nTest DataFrame sample:\n", test_df_clean),
        )
    ),
    sep="\n",
)

Train DataFrame columns: Index(['question', 'title', 'answer', 'passage'], dtype='object')
Dev DataFrame columns: Index(['question', 'title', 'answer', 'passage'], dtype='object')
Test DataFrame columns: Index(['question', 'title', 'passage'], dtype='object')

Train DataFrame sample:
                                             question  \
0    do iran and afghanistan speak the same language   
1  do good samaritan laws protect those who help ...   
2  is windows movie maker part of windows essentials   
3  is confectionary sugar the same as powdered sugar   
4         is elder scrolls online the same as skyrim   

                      title  answer  \
0          Persian language    True   
1        Good Samaritan law    True   
2       Windows Movie Maker    True   
3            Powdered sugar    True   
4  The Elder Scrolls Online   False   

                                             passage  
0  Persian (/ˈpɜːrʒən, -ʃən/), also known by its ...  
1  Good Samaritan laws offer leg

In [None]:
class BoolQDataset(Dataset):
    """Torch ``Dataset`` that tokenizes BoolQ (question, passage) pairs on demand.

    NOTE: this cell redefines ``BoolQDataset``, replacing the class of the same
    name defined in the data-loading cell above; datasets created after this
    cell runs use this definition.

    Every item is a dict holding ``input_ids``, ``attention_mask`` and
    ``labels`` tensors.

    Args:
        dataframe: pandas DataFrame with ``question`` and ``passage`` columns
            and, unless ``is_test`` is true, the column named by ``label_column``.
        tokenizer: Callable invoked as ``tokenizer(question, passage, ...)``
            that returns a mapping containing ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Length handed to the tokenizer for padding and truncation.
        label_column: Name of the column holding the yes/no answer.
        is_test: When true the label column is never read and every item
            carries a dummy label of 0.
    """

    # Accepted spellings (case-insensitive, surrounding whitespace ignored) for
    # labels that arrive as text, e.g. a CSV column that was not parsed as bool.
    _TRUE_STRINGS = frozenset({'true', '1', 'yes'})
    _FALSE_STRINGS = frozenset({'false', '0', 'no'})

    def __init__(self, dataframe, tokenizer, max_length=512, label_column='answer', is_test=False):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_column = label_column
        self.is_test = is_test  # Flag to indicate if this is the test set

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    @classmethod
    def _to_label(cls, value):
        """Convert one raw label cell to 0 or 1.

        Booleans and numbers keep their truthiness. Text is matched against the
        accepted spellings instead of being treated as truthy, so the string
        ``"False"`` maps to 0 rather than 1.

        Raises:
            ValueError: If the label is missing (``None``/NaN) or is text that
                is not a recognized true/false spelling.
        """
        if isinstance(value, str):
            text = value.strip().lower()
            if text in cls._TRUE_STRINGS:
                return 1
            if text in cls._FALSE_STRINGS:
                return 0
            raise ValueError(f"Unrecognized label value: {value!r}")
        # NaN is the only float that differs from itself; it is truthy, so
        # without this guard a missing label would silently become class 1.
        if value is None or (isinstance(value, float) and value != value):
            raise ValueError("Missing label value")
        return 1 if value else 0

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (flattened to 1-D)
            and ``labels`` (scalar ``torch.long`` tensor).
        """
        row = self.data.iloc[idx]  # single positional lookup reused below
        question = str(row['question'])
        passage = str(row['passage'])

        # Handle labels (only for train/dev, not for test)
        if self.is_test:
            label = 0  # Dummy label for test set (not used)
        else:
            label = self._to_label(row[self.label_column])
        label_tensor = torch.tensor(label, dtype=torch.long)

        encoding = self.tokenizer(
            question,
            passage,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation='longest_first',
            return_tensors='pt'
        )

        return {
            # flatten() drops the leading batch dimension of the encoding so
            # the DataLoader can stack items into (batch, max_length) tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label_tensor
        }

# Wrap each split in a BoolQDataset. The labelled splits read their targets
# from the 'answer' column; the test split is built with is_test=True.
label_column = 'answer'
train_dataset, dev_dataset = (
    BoolQDataset(frame, tokenizer, label_column=label_column)
    for frame in (train_df_clean, dev_df_clean)
)
test_dataset = BoolQDataset(test_df_clean, tokenizer, is_test=True, label_column=label_column)

# Batch the datasets: random order for training, row order for dev and test.
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
dev_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, sampler=SequentialSampler(split))
    for split in (dev_dataset, test_dataset)
)

# One line per loader with its batch count.
print(
    f"Train batches: {len(train_loader)}",
    f"Dev batches: {len(dev_loader)}",
    f"Test batches: {len(test_loader)}",
    sep="\n",
)

Train batches: 1179
Dev batches: 409
Test batches: 406


In [None]:
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Score the loaded BERT checkpoint on the dev split: mean per-batch loss,
# accuracy and F1 over all dev examples.
model.eval()  # switch the model to evaluation mode
dev_preds, dev_labels = [], []
total_dev_loss = 0
with torch.no_grad():  # no gradients are needed for scoring
    for batch in tqdm(dev_loader, desc="Evaluating BERT"):
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )

        # Passing labels makes the model return a loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        total_dev_loss += outputs.loss.item()
        logits = outputs.logits
        preds = logits.argmax(dim=1)  # predicted class index per example
        dev_preds += list(preds.cpu().numpy())
        dev_labels += list(labels.cpu().numpy())

# The loss is averaged over batches; the metrics are computed over examples.
avg_dev_loss = total_dev_loss / len(dev_loader)
accuracy, f1 = (
    metric(dev_labels, dev_preds) for metric in (accuracy_score, f1_score)
)

print(
    f"BERT Validation Loss: {avg_dev_loss:.4f}",
    f"BERT Dev Accuracy: {accuracy:.4f}",
    f"BERT Dev F1: {f1:.4f}",
    sep="\n",
)

Evaluating BERT:   1%|▏         | 6/409 [00:02<02:02,  3.29it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating BERT:   8%|▊         | 34/409 [00:09<01:23,  4.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating BERT:  17%|█▋        | 70/409 [00:17<01:11,  4.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating BERT:  29%|██▊       | 117/409 [00:27<01:02,  4.70it/s]Be aware, overflowing tokens are not returned for the setting 

BERT Validation Loss: 0.7416
BERT Dev Accuracy: 0.7153
BERT Dev F1: 0.7893


In [None]:
# Predict on the test set with BERT
model.eval()
test_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting with BERT"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The batch's dummy 'labels' entry is deliberately not passed to the
        # model, so no loss is computed here.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)  # predicted class index per example
        test_preds.extend(preds.cpu().numpy())

# Add predictions to the test DataFrame
# test_loader uses a SequentialSampler, so test_preds is in DataFrame row order.
# NOTE(review): this adds a column to test_df_clean in place, so the frame no
# longer matches what the loading cell displayed; re-running the cell just
# overwrites the same column.
test_df_clean['predicted_answer_bert'] = [bool(pred) for pred in test_preds]
print("Sample BERT test predictions:\n", test_df_clean[['question', 'predicted_answer_bert']].head())
test_df_clean.to_csv('/content/boolq_test_predictions_bert.csv', index=False)

# Download the predictions
from google.colab import files
files.download('/content/boolq_test_predictions_bert.csv')

# Save to Google Drive
# Shell escape; {drive_path} is the module-level variable set in earlier cells.
!cp /content/boolq_test_predictions_bert.csv "{drive_path}/"
print("BERT test predictions saved and downloaded.")

Predicting with BERT:  18%|█▊        | 72/406 [00:17<01:16,  4.35it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Predicting with BERT:  24%|██▍       | 99/406 [00:23<01:09,  4.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Predicting with BERT:  35%|███▍      | 141/406 [00:33<01:00,  4.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Predicting with BERT:  39%|███▉      | 158/406 [00:37<00:57,  4.30it/s]Be aware, overflowing tokens are not ret

Sample BERT test predictions:
                                             question  predicted_answer_bert
0  is the first series 20 euro note still legal t...                   True
1  do the champions league winners get automatic ...                   True
2                  can a bull snake kill a small dog                   True
3                are all nba playoff games best of 7                  False
4  can i use my train ticket on the tram in manch...                   True


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

BERT test predictions saved and downloaded.


In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Pretrained RoBERTa tokenizer plus a two-class sequence-classification model,
# moved to the same device as the BERT model.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
roberta_model.to(device)
print("RoBERTa model loaded successfully.")

# Re-wrap the labelled splits so they are tokenized with the RoBERTa tokenizer.
train_dataset_roberta, dev_dataset_roberta = (
    BoolQDataset(frame, roberta_tokenizer, label_column='answer')
    for frame in (train_df_clean, dev_df_clean)
)
train_loader_roberta = DataLoader(
    train_dataset_roberta,
    batch_size=8,
    sampler=RandomSampler(train_dataset_roberta),
)
dev_loader_roberta = DataLoader(
    dev_dataset_roberta,
    batch_size=8,
    sampler=SequentialSampler(dev_dataset_roberta),
)

# AdamW with a linear schedule: 100 warmup steps, total length sized for
# three passes over the training loader.
optimizer = AdamW(roberta_model.parameters(), weight_decay=0.1, lr=1e-5)
num_training_steps = 3 * len(train_loader_roberta)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=100,
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa model loaded successfully.


In [None]:
# Training loop for RoBERTa
best_f1 = 0
best_epoch = 0
for epoch in range(3):
    print(f"\nRoBERTa Epoch {epoch + 1}/3")
    roberta_model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_loader_roberta, desc="Training RoBERTa")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = roberta_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(roberta_model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        if (step + 1) % 50 == 0:
            print(f"Step {step + 1}/{len(train_loader_roberta)}, Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_loader_roberta)
    print(f"RoBERTa Average Training Loss: {avg_train_loss:.4f}")

    # Evaluate RoBERTa
    roberta_model.eval()
    dev_preds, dev_labels = [], []
    total_dev_loss = 0
    with torch.no_grad():
        for batch in tqdm(dev_loader_roberta, desc="Evaluating RoBERTa"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = roberta_model(input_ids, attention_mask=attention_mask, labels=labels)
            total_dev_loss += outputs.loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            dev_preds.extend(preds.cpu().numpy())
            dev_labels.extend(labels.cpu().numpy())

    avg_dev_loss = total_dev_loss / len(dev_loader_roberta)
    accuracy = accuracy_score(dev_labels, dev_preds)
    f1 = f1_score(dev_labels, dev_preds)
    print(f"RoBERTa Validation Loss: {avg_dev_loss:.4f}")
    print(f"RoBERTa Dev Accuracy: {accuracy:.4f}")
    print(f"RoBERTa Dev F1: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        best_epoch = epoch + 1
        roberta_model.save_pretrained("/content/boolq_finetuned_roberta_best")
        roberta_tokenizer.save_pretrained("/content/boolq_finetuned_roberta_best")
        print(f"New best RoBERTa model saved with F1 {best_f1:.4f}")
        drive_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project"
        !cp -r /content/boolq_finetuned_roberta_best "{drive_path}/"
    elif epoch - best_epoch >= 1:
        print(f"Early stopping at epoch {epoch + 1}. Best F1: {best_f1:.4f}")
        break

print(f"RoBERTa Best F1: {best_f1:.4f} at epoch {best_epoch}")


RoBERTa Epoch 1/3


Training RoBERTa:   4%|▍         | 50/1179 [00:37<16:41,  1.13it/s]

Step 50/1179, Loss: 0.5936


Training RoBERTa:   5%|▍         | 56/1179 [00:41<13:39,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:   6%|▌         | 69/1179 [00:50<13:44,  1.35it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:   6%|▌         | 70/1179 [00:51<13:40,  1.35it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:   7%|▋         | 85/1179 [01:02<13:20,  1.37it/s]Be aware, overflowing tokens are not returned for the 

Step 100/1179, Loss: 0.7439


Training RoBERTa:  13%|█▎        | 150/1179 [01:50<15:02,  1.14it/s]

Step 150/1179, Loss: 0.7517


Training RoBERTa:  17%|█▋        | 200/1179 [02:26<14:16,  1.14it/s]

Step 200/1179, Loss: 0.6731


Training RoBERTa:  21%|██        | 250/1179 [03:03<13:28,  1.15it/s]

Step 250/1179, Loss: 0.6106


Training RoBERTa:  25%|██▌       | 300/1179 [03:39<12:48,  1.14it/s]

Step 300/1179, Loss: 0.5823


Training RoBERTa:  30%|██▉       | 350/1179 [04:15<12:03,  1.15it/s]

Step 350/1179, Loss: 0.7083


Training RoBERTa:  32%|███▏      | 378/1179 [04:35<09:42,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  34%|███▍      | 400/1179 [04:52<11:21,  1.14it/s]

Step 400/1179, Loss: 0.7797


Training RoBERTa:  36%|███▌      | 419/1179 [05:05<09:12,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  37%|███▋      | 437/1179 [05:18<09:01,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  38%|███▊      | 450/1179 [05:28<10:38,  1.14it/s]

Step 450/1179, Loss: 0.7668


Training RoBERTa:  40%|███▉      | 470/1179 [05:42<08:34,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  40%|████      | 472/1179 [05:44<08:32,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  41%|████      | 480/1179 [05:49<08:27,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  42%|████▏     | 500/1179 [06:04<09:53,  1.14it/s]

Step 500/1179, Loss: 0.4870


Training RoBERTa:  44%|████▎     | 515/1179 [06:15<08:03,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  47%|████▋     | 550/1179 [06:41<09:09,  1.14it/s]

Step 550/1179, Loss: 0.5934


Training RoBERTa:  51%|█████     | 600/1179 [07:17<08:25,  1.14it/s]

Step 600/1179, Loss: 0.5344


Training RoBERTa:  54%|█████▍    | 641/1179 [07:46<06:31,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  55%|█████▌    | 650/1179 [07:54<07:40,  1.15it/s]

Step 650/1179, Loss: 0.5232


Training RoBERTa:  59%|█████▉    | 700/1179 [08:30<06:57,  1.15it/s]

Step 700/1179, Loss: 0.7270


Training RoBERTa:  64%|██████▎   | 750/1179 [09:06<06:14,  1.14it/s]

Step 750/1179, Loss: 0.8535


Training RoBERTa:  64%|██████▍   | 754/1179 [09:09<05:03,  1.40it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  68%|██████▊   | 800/1179 [09:43<05:29,  1.15it/s]

Step 800/1179, Loss: 0.3272


Training RoBERTa:  70%|██████▉   | 822/1179 [09:58<04:20,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  72%|███████▏  | 850/1179 [10:19<04:46,  1.15it/s]

Step 850/1179, Loss: 0.8959


Training RoBERTa:  76%|███████▋  | 900/1179 [10:55<04:03,  1.15it/s]

Step 900/1179, Loss: 0.4230


Training RoBERTa:  81%|████████  | 950/1179 [11:32<03:19,  1.15it/s]

Step 950/1179, Loss: 0.3200


Training RoBERTa:  85%|████████▍ | 1000/1179 [12:08<02:35,  1.15it/s]

Step 1000/1179, Loss: 0.5862


Training RoBERTa:  88%|████████▊ | 1041/1179 [12:37<01:40,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  89%|████████▉ | 1050/1179 [12:44<01:52,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Step 1050/1179, Loss: 0.4536


Training RoBERTa:  93%|█████████▎| 1100/1179 [13:21<01:09,  1.14it/s]

Step 1100/1179, Loss: 0.6626


Training RoBERTa:  98%|█████████▊| 1150/1179 [13:57<00:25,  1.14it/s]

Step 1150/1179, Loss: 0.3743


Training RoBERTa:  98%|█████████▊| 1161/1179 [14:05<00:13,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa: 100%|██████████| 1179/1179 [14:17<00:00,  1.37it/s]


RoBERTa Average Training Loss: 0.6298


Evaluating RoBERTa:   1%|▏         | 6/409 [00:01<01:29,  4.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating RoBERTa:   8%|▊         | 34/409 [00:07<01:18,  4.77it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating RoBERTa:  17%|█▋        | 70/409 [00:14<01:11,  4.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating RoBERTa:  29%|██▊       | 117/409 [00:24<01:02,  4.65it/s]Be aware, overflowing tokens are not returned for 

RoBERTa Validation Loss: 0.5470
RoBERTa Dev Accuracy: 0.7275
RoBERTa Dev F1: 0.8089
New best RoBERTa model saved with F1 0.8089

RoBERTa Epoch 2/3


Training RoBERTa:   3%|▎         | 36/1179 [00:25<14:03,  1.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:   4%|▍         | 50/1179 [00:36<16:45,  1.12it/s]

Step 50/1179, Loss: 0.2057


Training RoBERTa:   6%|▌         | 73/1179 [00:52<13:26,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:   7%|▋         | 80/1179 [00:58<13:15,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:   7%|▋         | 87/1179 [01:03<13:11,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:   8%|▊         | 100/1179 [01:13<15:38,  1.15it/s]

Step 100/1179, Loss: 0.4031


Training RoBERTa:  10%|█         | 119/1179 [01:26<12:42,  1.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  13%|█▎        | 150/1179 [01:49<15:00,  1.14it/s]

Step 150/1179, Loss: 1.0039


Training RoBERTa:  17%|█▋        | 200/1179 [02:25<14:15,  1.14it/s]

Step 200/1179, Loss: 0.3725


Training RoBERTa:  21%|██        | 250/1179 [03:02<13:31,  1.15it/s]

Step 250/1179, Loss: 0.7081


Training RoBERTa:  25%|██▌       | 300/1179 [03:38<12:45,  1.15it/s]

Step 300/1179, Loss: 0.9580


Training RoBERTa:  30%|██▉       | 350/1179 [04:14<12:04,  1.14it/s]

Step 350/1179, Loss: 0.5797


Training RoBERTa:  34%|███▎      | 396/1179 [04:47<09:31,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  34%|███▍      | 400/1179 [04:51<11:20,  1.15it/s]

Step 400/1179, Loss: 0.4013


Training RoBERTa:  36%|███▌      | 421/1179 [05:05<09:10,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  38%|███▊      | 450/1179 [05:27<10:34,  1.15it/s]

Step 450/1179, Loss: 0.3488


Training RoBERTa:  42%|████▏     | 500/1179 [06:03<09:51,  1.15it/s]

Step 500/1179, Loss: 0.5247


Training RoBERTa:  45%|████▌     | 536/1179 [06:29<07:46,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  47%|████▋     | 550/1179 [06:40<09:09,  1.15it/s]

Step 550/1179, Loss: 0.4978


Training RoBERTa:  51%|█████     | 600/1179 [07:16<08:24,  1.15it/s]

Step 600/1179, Loss: 0.4377


Training RoBERTa:  52%|█████▏    | 617/1179 [07:28<06:47,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  55%|█████▍    | 647/1179 [07:50<06:28,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  55%|█████▌    | 650/1179 [07:52<07:42,  1.14it/s]

Step 650/1179, Loss: 0.6056


Training RoBERTa:  57%|█████▋    | 675/1179 [08:10<06:07,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  59%|█████▉    | 700/1179 [08:29<06:57,  1.15it/s]

Step 700/1179, Loss: 0.5615


Training RoBERTa:  63%|██████▎   | 748/1179 [09:03<05:13,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  64%|██████▎   | 750/1179 [09:05<06:15,  1.14it/s]

Step 750/1179, Loss: 0.4103


Training RoBERTa:  67%|██████▋   | 789/1179 [09:33<04:45,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  68%|██████▊   | 800/1179 [09:41<05:31,  1.14it/s]

Step 800/1179, Loss: 0.3658


Training RoBERTa:  72%|███████▏  | 850/1179 [10:18<04:46,  1.15it/s]

Step 850/1179, Loss: 0.6013


Training RoBERTa:  73%|███████▎  | 858/1179 [10:23<03:51,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  76%|███████▋  | 900/1179 [10:54<04:02,  1.15it/s]

Step 900/1179, Loss: 0.6371


Training RoBERTa:  80%|████████  | 944/1179 [11:26<02:50,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  81%|████████  | 950/1179 [11:30<03:19,  1.15it/s]

Step 950/1179, Loss: 0.9132


Training RoBERTa:  85%|████████▍ | 1000/1179 [12:07<02:36,  1.15it/s]

Step 1000/1179, Loss: 0.3002


Training RoBERTa:  87%|████████▋ | 1028/1179 [12:27<01:49,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  87%|████████▋ | 1031/1179 [12:29<01:47,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  89%|████████▉ | 1050/1179 [12:43<01:52,  1.15it/s]

Step 1050/1179, Loss: 0.1498


Training RoBERTa:  93%|█████████▎| 1100/1179 [13:19<01:08,  1.15it/s]

Step 1100/1179, Loss: 0.1561


Training RoBERTa:  98%|█████████▊| 1150/1179 [13:56<00:25,  1.15it/s]

Step 1150/1179, Loss: 0.4702


Training RoBERTa: 100%|██████████| 1179/1179 [14:16<00:00,  1.38it/s]


RoBERTa Average Training Loss: 0.4813


Evaluating RoBERTa:   1%|▏         | 6/409 [00:01<01:29,  4.52it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating RoBERTa:   8%|▊         | 34/409 [00:07<01:18,  4.78it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating RoBERTa:  17%|█▋        | 70/409 [00:14<01:12,  4.68it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating RoBERTa:  29%|██▊       | 117/409 [00:24<01:01,  4.76it/s]Be aware, overflowing tokens are not returned for 

RoBERTa Validation Loss: 0.5021
RoBERTa Dev Accuracy: 0.7697
RoBERTa Dev F1: 0.8216
New best RoBERTa model saved with F1 0.8216

RoBERTa Epoch 3/3


Training RoBERTa:   4%|▍         | 50/1179 [00:36<16:40,  1.13it/s]

Step 50/1179, Loss: 0.9649


Training RoBERTa:   5%|▍         | 55/1179 [00:39<13:38,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:   8%|▊         | 100/1179 [01:12<15:36,  1.15it/s]

Step 100/1179, Loss: 0.4145


Training RoBERTa:   9%|▉         | 109/1179 [01:18<12:55,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  12%|█▏        | 145/1179 [01:45<12:39,  1.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  13%|█▎        | 150/1179 [01:49<15:02,  1.14it/s]

Step 150/1179, Loss: 0.3412


Training RoBERTa:  16%|█▌        | 183/1179 [02:12<12:03,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  17%|█▋        | 200/1179 [02:25<14:11,  1.15it/s]

Step 200/1179, Loss: 0.8209


Training RoBERTa:  21%|██        | 247/1179 [02:59<11:19,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  21%|██        | 250/1179 [03:02<13:31,  1.14it/s]

Step 250/1179, Loss: 0.4482


Training RoBERTa:  24%|██▎       | 280/1179 [03:23<10:51,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  25%|██▌       | 300/1179 [03:38<12:47,  1.15it/s]

Step 300/1179, Loss: 0.5162


Training RoBERTa:  28%|██▊       | 330/1179 [03:59<10:17,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  30%|██▉       | 350/1179 [04:14<12:04,  1.14it/s]

Step 350/1179, Loss: 0.1344


Training RoBERTa:  34%|███▎      | 397/1179 [04:48<09:27,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  34%|███▍      | 400/1179 [04:51<11:22,  1.14it/s]

Step 400/1179, Loss: 0.0653


Training RoBERTa:  35%|███▌      | 415/1179 [05:01<09:14,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  38%|███▊      | 450/1179 [05:27<10:36,  1.15it/s]

Step 450/1179, Loss: 0.1232


Training RoBERTa:  39%|███▉      | 465/1179 [05:38<08:38,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  40%|███▉      | 470/1179 [05:41<08:33,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  42%|████▏     | 500/1179 [06:03<09:54,  1.14it/s]

Step 500/1179, Loss: 0.1796


Training RoBERTa:  47%|████▋     | 550/1179 [06:40<09:09,  1.15it/s]

Step 550/1179, Loss: 0.2298


Training RoBERTa:  51%|█████     | 600/1179 [07:16<08:25,  1.15it/s]

Step 600/1179, Loss: 0.5482


Training RoBERTa:  55%|█████▌    | 650/1179 [07:52<07:40,  1.15it/s]

Step 650/1179, Loss: 0.0832


Training RoBERTa:  59%|█████▉    | 700/1179 [08:29<06:58,  1.14it/s]

Step 700/1179, Loss: 0.4413


Training RoBERTa:  64%|██████▎   | 750/1179 [09:05<06:15,  1.14it/s]

Step 750/1179, Loss: 0.0698


Training RoBERTa:  68%|██████▊   | 800/1179 [09:42<05:30,  1.15it/s]

Step 800/1179, Loss: 1.0637


Training RoBERTa:  72%|███████▏  | 850/1179 [10:18<04:46,  1.15it/s]

Step 850/1179, Loss: 0.2304


Training RoBERTa:  76%|███████▋  | 900/1179 [10:54<04:03,  1.15it/s]

Step 900/1179, Loss: 0.6582


Training RoBERTa:  79%|███████▉  | 936/1179 [11:20<02:56,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  81%|████████  | 950/1179 [11:31<03:19,  1.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Step 950/1179, Loss: 0.1374


Training RoBERTa:  85%|████████▍ | 1000/1179 [12:07<02:36,  1.14it/s]

Step 1000/1179, Loss: 0.0917


Training RoBERTa:  87%|████████▋ | 1026/1179 [12:25<01:50,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  89%|████████▉ | 1050/1179 [12:43<01:52,  1.15it/s]

Step 1050/1179, Loss: 0.2394


Training RoBERTa:  93%|█████████▎| 1100/1179 [13:20<01:09,  1.14it/s]

Step 1100/1179, Loss: 0.0955


Training RoBERTa:  95%|█████████▍| 1119/1179 [13:33<00:43,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa:  98%|█████████▊| 1150/1179 [13:56<00:25,  1.15it/s]

Step 1150/1179, Loss: 0.2994


Training RoBERTa:  99%|█████████▉| 1171/1179 [14:11<00:05,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa: 100%|█████████▉| 1176/1179 [14:15<00:02,  1.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training RoBERTa: 100%|██████████| 1179/1179 [14:17<00:00,  1.38it/s]


RoBERTa Average Training Loss: 0.3822


Evaluating RoBERTa:   1%|▏         | 6/409 [00:01<01:28,  4.55it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating RoBERTa:   8%|▊         | 34/409 [00:07<01:18,  4.79it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating RoBERTa:  17%|█▋        | 70/409 [00:14<01:10,  4.79it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating RoBERTa:  29%|██▊       | 117/409 [00:24<01:02,  4.69it/s]Be aware, overflowing tokens are not returned for 

RoBERTa Validation Loss: 0.5660
RoBERTa Dev Accuracy: 0.7835
RoBERTa Dev F1: 0.8261
New best RoBERTa model saved with F1 0.8261
RoBERTa Best F1: 0.8261 at epoch 3


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# DistilBERT fine-tuning setup.
# Every tunable value is named once here so the tokenizer/model checkpoint and
# the train/dev loaders cannot silently drift apart when one of them is edited.
DISTILBERT_CHECKPOINT = 'distilbert-base-uncased'
DISTILBERT_BATCH_SIZE = 8
DISTILBERT_NUM_EPOCHS = 3  # must match the number of epochs the training loop runs
DISTILBERT_LEARNING_RATE = 1e-5
DISTILBERT_WEIGHT_DECAY = 0.1
DISTILBERT_WARMUP_STEPS = 100

# Load DistilBERT tokenizer and model (two-class sequence-classification head).
# `device` is not defined in this cell; it must already exist in the kernel.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
distilbert_model = DistilBertForSequenceClassification.from_pretrained(DISTILBERT_CHECKPOINT, num_labels=2)
distilbert_model.to(device)
print("DistilBERT model loaded successfully.")

# Create datasets with DistilBERT tokenizer.
# Training batches are drawn in random order; dev batches keep the DataFrame
# order so that evaluation is deterministic.
train_dataset_distilbert = BoolQDataset(train_df_clean, distilbert_tokenizer, label_column='answer')
dev_dataset_distilbert = BoolQDataset(dev_df_clean, distilbert_tokenizer, label_column='answer')
train_loader_distilbert = DataLoader(
    train_dataset_distilbert,
    sampler=RandomSampler(train_dataset_distilbert),
    batch_size=DISTILBERT_BATCH_SIZE,
)
dev_loader_distilbert = DataLoader(
    dev_dataset_distilbert,
    sampler=SequentialSampler(dev_dataset_distilbert),
    batch_size=DISTILBERT_BATCH_SIZE,
)

# Optimizer and scheduler.
# The training loop steps the scheduler once per batch, so the schedule length
# is (batches per epoch) * (epochs).
# NOTE(review): `optimizer` / `scheduler` are generic global names that the
# training cell reads; presumably other model sections rebind them too, so this
# cell must be (re)run immediately before the DistilBERT training cell.
optimizer = AdamW(
    distilbert_model.parameters(),
    lr=DISTILBERT_LEARNING_RATE,
    weight_decay=DISTILBERT_WEIGHT_DECAY,
)
num_training_steps = len(train_loader_distilbert) * DISTILBERT_NUM_EPOCHS
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=DISTILBERT_WARMUP_STEPS,
    num_training_steps=num_training_steps,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBERT model loaded successfully.


In [None]:
# Training loop for DistilBERT
#
# Fine-tunes `distilbert_model` on `train_loader_distilbert`, evaluates on
# `dev_loader_distilbert` after every epoch, and keeps the checkpoint with the
# highest dev F1 (saved locally, then backed up to Google Drive).
# Reads `distilbert_model`, `distilbert_tokenizer`, the two loaders, `optimizer`,
# `scheduler` and `device` from earlier cells.
import os
import shutil

num_epochs = 3  # must equal the epoch count used to size the LR scheduler
patience = 1    # non-improving epochs tolerated after the best epoch before stopping
log_every = 50  # print the current batch loss every this many steps

local_best_dir = "/content/boolq_finetuned_distilbert_best"
drive_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project"

best_f1 = 0
best_epoch = 0  # 1-based epoch number of the best checkpoint; 0 means "none saved yet"
for epoch in range(num_epochs):
    print(f"\nDistilBERT Epoch {epoch + 1}/{num_epochs}")
    distilbert_model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_loader_distilbert, desc="Training DistilBERT")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = distilbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(distilbert_model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch, not per epoch
        if (step + 1) % log_every == 0:
            print(f"Step {step + 1}/{len(train_loader_distilbert)}, Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_loader_distilbert)
    print(f"DistilBERT Average Training Loss: {avg_train_loss:.4f}")

    # Evaluate DistilBERT
    distilbert_model.eval()
    dev_preds, dev_labels = [], []
    total_dev_loss = 0
    with torch.no_grad():
        for batch in tqdm(dev_loader_distilbert, desc="Evaluating DistilBERT"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = distilbert_model(input_ids, attention_mask=attention_mask, labels=labels)
            total_dev_loss += outputs.loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)  # predicted class = highest logit
            dev_preds.extend(preds.cpu().numpy())
            dev_labels.extend(labels.cpu().numpy())

    avg_dev_loss = total_dev_loss / len(dev_loader_distilbert)
    accuracy = accuracy_score(dev_labels, dev_preds)
    f1 = f1_score(dev_labels, dev_preds)
    print(f"DistilBERT Validation Loss: {avg_dev_loss:.4f}")
    print(f"DistilBERT Dev Accuracy: {accuracy:.4f}")
    print(f"DistilBERT Dev F1: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        best_epoch = epoch + 1
        distilbert_model.save_pretrained(local_best_dir)
        distilbert_tokenizer.save_pretrained(local_best_dir)
        print(f"New best DistilBERT model saved with F1 {best_f1:.4f}")
        # Back the checkpoint up to Drive. A failed copy must not abort a long
        # training run, but it must be visible, so report it instead of raising.
        # The mount check also stops copytree from creating a local directory
        # that merely looks like the Drive folder when Drive is not mounted.
        if os.path.isdir(drive_path):
            try:
                shutil.copytree(
                    local_best_dir,
                    os.path.join(drive_path, os.path.basename(local_best_dir)),
                    dirs_exist_ok=True,  # overwrite the previous best in place, like `cp -r`
                )
            except OSError as copy_error:
                print(f"WARNING: could not copy checkpoint to Google Drive: {copy_error}")
        else:
            print(f"WARNING: Google Drive folder not found, checkpoint kept only at {local_best_dir}")
    elif (epoch + 1) - best_epoch >= patience:
        # `epoch` is 0-based while `best_epoch` is 1-based; compare both as
        # 1-based epoch numbers so `patience` counts non-improving epochs.
        print(f"Early stopping at epoch {epoch + 1}. Best F1: {best_f1:.4f}")
        break

print(f"DistilBERT Best F1: {best_f1:.4f} at epoch {best_epoch}")


DistilBERT Epoch 1/3


Training DistilBERT:   4%|▍         | 51/1179 [00:17<06:16,  3.00it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Step 50/1179, Loss: 0.6614


Training DistilBERT:   7%|▋         | 77/1179 [00:27<06:32,  2.81it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:   9%|▊         | 101/1179 [00:35<06:06,  2.94it/s]

Step 100/1179, Loss: 0.5987


Training DistilBERT:  13%|█▎        | 151/1179 [00:53<06:01,  2.84it/s]

Step 150/1179, Loss: 0.6358


Training DistilBERT:  17%|█▋        | 201/1179 [01:12<05:44,  2.84it/s]

Step 200/1179, Loss: 0.6119


Training DistilBERT:  21%|██▏       | 251/1179 [01:30<05:22,  2.88it/s]

Step 250/1179, Loss: 0.4710


Training DistilBERT:  24%|██▍       | 284/1179 [01:42<05:23,  2.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  25%|██▍       | 290/1179 [01:44<05:21,  2.77it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  26%|██▌       | 301/1179 [01:48<05:01,  2.91it/s]

Step 300/1179, Loss: 0.6605


Training DistilBERT:  30%|██▉       | 351/1179 [02:06<04:45,  2.90it/s]

Step 350/1179, Loss: 0.6686


Training DistilBERT:  33%|███▎      | 388/1179 [02:20<04:48,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  34%|███▍      | 401/1179 [02:24<04:33,  2.85it/s]

Step 400/1179, Loss: 0.7649


Training DistilBERT:  36%|███▋      | 429/1179 [02:35<04:33,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  38%|███▊      | 451/1179 [02:43<04:12,  2.88it/s]

Step 450/1179, Loss: 0.5938


Training DistilBERT:  42%|████▏     | 501/1179 [03:01<03:55,  2.88it/s]

Step 500/1179, Loss: 0.8544


Training DistilBERT:  47%|████▋     | 551/1179 [03:19<03:36,  2.90it/s]

Step 550/1179, Loss: 0.6795


Training DistilBERT:  51%|█████     | 601/1179 [03:37<03:21,  2.87it/s]

Step 600/1179, Loss: 0.6842


Training DistilBERT:  51%|█████▏    | 607/1179 [03:40<03:27,  2.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  54%|█████▍    | 641/1179 [03:52<03:16,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  55%|█████▌    | 651/1179 [03:56<03:05,  2.85it/s]

Step 650/1179, Loss: 0.4664


Training DistilBERT:  59%|█████▉    | 701/1179 [04:14<02:45,  2.89it/s]

Step 700/1179, Loss: 0.6352


Training DistilBERT:  64%|██████▎   | 751/1179 [04:32<02:28,  2.88it/s]

Step 750/1179, Loss: 0.5602


Training DistilBERT:  64%|██████▍   | 753/1179 [04:33<02:31,  2.81it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  65%|██████▍   | 765/1179 [04:37<02:30,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  68%|██████▊   | 801/1179 [04:50<02:10,  2.89it/s]

Step 800/1179, Loss: 0.6391


Training DistilBERT:  68%|██████▊   | 807/1179 [04:52<02:15,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  69%|██████▉   | 812/1179 [04:54<02:13,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  72%|███████▏  | 851/1179 [05:08<01:54,  2.87it/s]

Step 850/1179, Loss: 0.6042


Training DistilBERT:  76%|███████▋  | 901/1179 [05:27<01:37,  2.86it/s]

Step 900/1179, Loss: 0.5132


Training DistilBERT:  78%|███████▊  | 920/1179 [05:34<01:34,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  81%|████████  | 951/1179 [05:45<01:19,  2.88it/s]

Step 950/1179, Loss: 0.5821


Training DistilBERT:  85%|████████▍ | 998/1179 [06:02<01:05,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  85%|████████▍ | 1001/1179 [06:03<01:02,  2.86it/s]

Step 1000/1179, Loss: 0.7982


Training DistilBERT:  88%|████████▊ | 1034/1179 [06:15<00:52,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  89%|████████▉ | 1051/1179 [06:21<00:44,  2.88it/s]

Step 1050/1179, Loss: 0.6528


Training DistilBERT:  93%|█████████▎| 1101/1179 [06:40<00:27,  2.88it/s]

Step 1100/1179, Loss: 0.5181


Training DistilBERT:  96%|█████████▌| 1132/1179 [06:51<00:17,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  98%|█████████▊| 1151/1179 [06:58<00:09,  2.86it/s]

Step 1150/1179, Loss: 0.8120


Training DistilBERT: 100%|██████████| 1179/1179 [07:08<00:00,  2.75it/s]


DistilBERT Average Training Loss: 0.6386


Evaluating DistilBERT:   1%|▏         | 6/409 [00:00<00:50,  7.98it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating DistilBERT:   8%|▊         | 34/409 [00:04<00:48,  7.72it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating DistilBERT:  17%|█▋        | 70/409 [00:08<00:40,  8.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating DistilBERT:  29%|██▊       | 117/409 [00:14<00:34,  8.35it/s]Be aware, overflowing tokens are not r

DistilBERT Validation Loss: 0.6073
DistilBERT Dev Accuracy: 0.6612
DistilBERT Dev F1: 0.7247
New best DistilBERT model saved with F1 0.7247

DistilBERT Epoch 2/3


Training DistilBERT:   4%|▍         | 51/1179 [00:18<06:33,  2.87it/s]

Step 50/1179, Loss: 0.7046


Training DistilBERT:   7%|▋         | 87/1179 [00:31<06:37,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:   9%|▊         | 101/1179 [00:36<06:15,  2.87it/s]

Step 100/1179, Loss: 0.7914


Training DistilBERT:  11%|█▏        | 133/1179 [00:48<06:21,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  13%|█▎        | 151/1179 [00:54<05:56,  2.88it/s]

Step 150/1179, Loss: 0.4932


Training DistilBERT:  16%|█▋        | 193/1179 [01:10<05:58,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  17%|█▋        | 201/1179 [01:13<05:39,  2.88it/s]

Step 200/1179, Loss: 0.6407


Training DistilBERT:  21%|██▏       | 251/1179 [01:31<05:29,  2.82it/s]

Step 250/1179, Loss: 0.4654


Training DistilBERT:  26%|██▌       | 301/1179 [01:49<05:11,  2.82it/s]

Step 300/1179, Loss: 0.7053


Training DistilBERT:  30%|██▉       | 351/1179 [02:07<04:48,  2.87it/s]

Step 350/1179, Loss: 0.6463


Training DistilBERT:  33%|███▎      | 394/1179 [02:23<04:46,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  34%|███▍      | 401/1179 [02:26<04:30,  2.87it/s]

Step 400/1179, Loss: 0.5189


Training DistilBERT:  36%|███▋      | 428/1179 [02:35<04:33,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  38%|███▊      | 451/1179 [02:44<04:11,  2.90it/s]

Step 450/1179, Loss: 0.4192


Training DistilBERT:  42%|████▏     | 501/1179 [03:02<03:57,  2.85it/s]

Step 500/1179, Loss: 0.2942


Training DistilBERT:  47%|████▋     | 551/1179 [03:20<03:38,  2.88it/s]

Step 550/1179, Loss: 0.2467


Training DistilBERT:  51%|█████     | 601/1179 [03:39<03:19,  2.90it/s]

Step 600/1179, Loss: 0.7051


Training DistilBERT:  55%|█████▌    | 651/1179 [03:57<03:01,  2.90it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Step 650/1179, Loss: 0.5364


Training DistilBERT:  57%|█████▋    | 676/1179 [04:06<03:03,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  59%|█████▉    | 701/1179 [04:15<02:45,  2.88it/s]

Step 700/1179, Loss: 0.5550


Training DistilBERT:  61%|██████▏   | 724/1179 [04:23<02:44,  2.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  64%|██████▎   | 751/1179 [04:33<02:28,  2.87it/s]

Step 750/1179, Loss: 0.6969


Training DistilBERT:  68%|██████▊   | 801/1179 [04:51<02:10,  2.89it/s]

Step 800/1179, Loss: 0.2875


Training DistilBERT:  72%|███████▏  | 851/1179 [05:10<01:53,  2.88it/s]

Step 850/1179, Loss: 0.5658


Training DistilBERT:  76%|███████▋  | 901/1179 [05:28<01:36,  2.88it/s]

Step 900/1179, Loss: 0.4977


Training DistilBERT:  77%|███████▋  | 908/1179 [05:30<01:38,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  81%|████████  | 951/1179 [05:46<01:18,  2.90it/s]

Step 950/1179, Loss: 0.6803


Training DistilBERT:  82%|████████▏ | 966/1179 [05:52<01:17,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  85%|████████▍ | 1001/1179 [06:04<01:02,  2.86it/s]

Step 1000/1179, Loss: 0.1843


Training DistilBERT:  89%|████████▉ | 1051/1179 [06:22<00:44,  2.90it/s]

Step 1050/1179, Loss: 0.6343


Training DistilBERT:  90%|█████████ | 1062/1179 [06:26<00:42,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  91%|█████████ | 1067/1179 [06:28<00:40,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  93%|█████████▎| 1098/1179 [06:40<00:29,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  93%|█████████▎| 1101/1179 [06:41<00:27,  2.88it/s]

Step 1100/1179, Loss: 0.4190


Training DistilBERT:  95%|█████████▍| 1120/1179 [06:48<00:21,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  95%|█████████▌| 1121/1179 [06:48<00:21,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  98%|█████████▊| 1151/1179 [06:59<00:09,  2.90it/s]

Step 1150/1179, Loss: 0.4444


Training DistilBERT:  99%|█████████▉| 1169/1179 [07:05<00:03,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT: 100%|██████████| 1179/1179 [07:09<00:00,  2.74it/s]


DistilBERT Average Training Loss: 0.5515


Evaluating DistilBERT:   1%|▏         | 6/409 [00:00<00:51,  7.78it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating DistilBERT:   8%|▊         | 34/409 [00:04<00:44,  8.35it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating DistilBERT:  17%|█▋        | 70/409 [00:08<00:40,  8.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating DistilBERT:  29%|██▊       | 117/409 [00:14<00:35,  8.18it/s]Be aware, overflowing tokens are not r

DistilBERT Validation Loss: 0.5804
DistilBERT Dev Accuracy: 0.6954
DistilBERT Dev F1: 0.7586
New best DistilBERT model saved with F1 0.7586

DistilBERT Epoch 3/3


Training DistilBERT:   4%|▍         | 51/1179 [00:18<06:34,  2.86it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Step 50/1179, Loss: 0.3968


Training DistilBERT:   9%|▊         | 101/1179 [00:36<06:17,  2.85it/s]

Step 100/1179, Loss: 0.5046


Training DistilBERT:  10%|█         | 122/1179 [00:44<06:25,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  13%|█▎        | 151/1179 [00:54<05:57,  2.87it/s]

Step 150/1179, Loss: 0.1937


Training DistilBERT:  15%|█▌        | 178/1179 [01:04<06:05,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  16%|█▋        | 192/1179 [01:09<06:00,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  17%|█▋        | 201/1179 [01:13<05:38,  2.89it/s]

Step 200/1179, Loss: 0.6753


Training DistilBERT:  21%|██▏       | 251/1179 [01:31<05:28,  2.82it/s]

Step 250/1179, Loss: 0.4934


Training DistilBERT:  26%|██▌       | 301/1179 [01:49<05:04,  2.88it/s]

Step 300/1179, Loss: 0.4900


Training DistilBERT:  30%|██▉       | 351/1179 [02:07<04:54,  2.81it/s]

Step 350/1179, Loss: 0.3864


Training DistilBERT:  34%|███▍      | 401/1179 [02:26<04:28,  2.90it/s]

Step 400/1179, Loss: 1.2715


Training DistilBERT:  38%|███▊      | 451/1179 [02:44<04:12,  2.89it/s]

Step 450/1179, Loss: 0.5658


Training DistilBERT:  40%|████      | 473/1179 [02:52<04:17,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  42%|████▏     | 501/1179 [03:02<03:54,  2.89it/s]

Step 500/1179, Loss: 0.1548


Training DistilBERT:  46%|████▌     | 539/1179 [03:16<03:51,  2.77it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  47%|████▋     | 551/1179 [03:20<03:37,  2.89it/s]

Step 550/1179, Loss: 0.4839


Training DistilBERT:  50%|█████     | 595/1179 [03:36<03:32,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  51%|█████     | 601/1179 [03:38<03:23,  2.84it/s]

Step 600/1179, Loss: 0.3685


Training DistilBERT:  52%|█████▏    | 611/1179 [03:42<03:25,  2.77it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  55%|█████▌    | 651/1179 [03:57<03:05,  2.85it/s]

Step 650/1179, Loss: 0.2113


Training DistilBERT:  59%|█████▉    | 701/1179 [04:15<02:46,  2.87it/s]

Step 700/1179, Loss: 0.1357


Training DistilBERT:  62%|██████▏   | 730/1179 [04:25<02:44,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  64%|██████▎   | 751/1179 [04:33<02:27,  2.89it/s]

Step 750/1179, Loss: 0.3441


Training DistilBERT:  66%|██████▋   | 783/1179 [04:45<02:24,  2.74it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  68%|██████▊   | 801/1179 [04:51<02:10,  2.89it/s]

Step 800/1179, Loss: 0.1915


Training DistilBERT:  69%|██████▊   | 808/1179 [04:54<02:14,  2.77it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  72%|███████▏  | 851/1179 [05:10<01:55,  2.85it/s]

Step 850/1179, Loss: 0.5033


Training DistilBERT:  76%|███████▋  | 901/1179 [05:28<01:36,  2.88it/s]

Step 900/1179, Loss: 0.3031


Training DistilBERT:  77%|███████▋  | 904/1179 [05:29<01:38,  2.78it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  80%|███████▉  | 939/1179 [05:42<01:27,  2.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  81%|████████  | 951/1179 [05:46<01:19,  2.88it/s]

Step 950/1179, Loss: 0.1609


Training DistilBERT:  85%|████████▍ | 1001/1179 [06:04<01:01,  2.88it/s]

Step 1000/1179, Loss: 0.6392


Training DistilBERT:  86%|████████▌ | 1014/1179 [06:09<01:00,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  87%|████████▋ | 1021/1179 [06:11<00:57,  2.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training DistilBERT:  89%|████████▉ | 1051/1179 [06:22<00:44,  2.87it/s]

Step 1050/1179, Loss: 0.4500


Training DistilBERT:  93%|█████████▎| 1101/1179 [06:41<00:27,  2.86it/s]

Step 1100/1179, Loss: 0.2638


Training DistilBERT:  98%|█████████▊| 1151/1179 [06:59<00:09,  2.89it/s]

Step 1150/1179, Loss: 0.6572


Training DistilBERT: 100%|██████████| 1179/1179 [07:09<00:00,  2.75it/s]


DistilBERT Average Training Loss: 0.4672


Evaluating DistilBERT:   1%|▏         | 6/409 [00:00<00:50,  8.03it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating DistilBERT:   8%|▊         | 34/409 [00:04<00:44,  8.35it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating DistilBERT:  17%|█▋        | 70/409 [00:08<00:40,  8.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating DistilBERT:  29%|██▊       | 117/409 [00:14<00:35,  8.24it/s]Be aware, overflowing tokens are not r

DistilBERT Validation Loss: 0.6071
DistilBERT Dev Accuracy: 0.7073
DistilBERT Dev F1: 0.7735
New best DistilBERT model saved with F1 0.7735
DistilBERT Best F1: 0.7735 at epoch 3


In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm
import torch

# Load the best RoBERTa model.
# Model and tokenizer are read from the same checkpoint directory so the
# vocabulary always matches the fine-tuned weights.
roberta_model_dir = '/content/boolq_finetuned_roberta_best'
roberta_model = RobertaForSequenceClassification.from_pretrained(roberta_model_dir)
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_model_dir)
roberta_model.to(device)
print("Best RoBERTa model loaded successfully.")

# Create test dataset with RoBERTa tokenizer.
# is_test=True: BoolQDataset only reads labels for train/dev, not for test.
test_dataset_roberta = BoolQDataset(test_df_clean, roberta_tokenizer, label_column='answer', is_test=True)
# SequentialSampler keeps batches in DataFrame row order, which the
# positional column assignment below relies on.
test_loader_roberta = DataLoader(test_dataset_roberta, sampler=SequentialSampler(test_dataset_roberta), batch_size=8)

# Predict on the test set with RoBERTa
roberta_model.eval()  # inference mode for layers such as dropout
test_preds = []
with torch.no_grad():  # no gradients are needed for prediction
    for batch in tqdm(test_loader_roberta, desc="Predicting with RoBERTa"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = roberta_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Highest-scoring class index per example
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())

# Add predictions to the test DataFrame (a non-zero class index maps to True)
test_df_clean['predicted_answer_roberta'] = [bool(pred) for pred in test_preds]
print("Sample RoBERTa test predictions:\n", test_df_clean[['question', 'predicted_answer_roberta']].head())
roberta_predictions_path = '/content/boolq_test_predictions_roberta.csv'
test_df_clean.to_csv(roberta_predictions_path, index=False)

# Download the predictions
from google.colab import files
files.download(roberta_predictions_path)

# Save to Google Drive.
# A checked Python copy is used instead of `!cp`: a shell-magic failure does
# not raise, so the success message below would be printed even when nothing
# was written to Drive (e.g. Drive not mounted).
import os
import shutil

drive_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project"
if not os.path.isdir(drive_path):
    raise FileNotFoundError(
        f"Google Drive folder not found (is Drive mounted?): {drive_path}"
    )
# The destination is an existing directory, so the file keeps its basename.
shutil.copy(roberta_predictions_path, drive_path)
print("RoBERTa test predictions saved and downloaded.")

Best RoBERTa model loaded successfully.


Predicting with RoBERTa:   5%|▍         | 19/406 [00:04<01:23,  4.64it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Predicting with RoBERTa:  18%|█▊        | 72/406 [00:15<01:08,  4.87it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Predicting with RoBERTa:  24%|██▍       | 99/406 [00:21<01:03,  4.84it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Predicting with RoBERTa:  35%|███▍      | 141/406 [00:30<00:55,  4.81it/s]Be aware, overflowing tokens 

Sample RoBERTa test predictions:
                                             question  predicted_answer_roberta
0  is the first series 20 euro note still legal t...                     False
1  do the champions league winners get automatic ...                      True
2                  can a bull snake kill a small dog                      True
3                are all nba playoff games best of 7                      True
4  can i use my train ticket on the tram in manch...                      True


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

RoBERTa test predictions saved and downloaded.
