In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.chdir("drive/My Drive/")

In [None]:
import json
import pandas as pd

def load_jsonl(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

# Load data
train_path = './Boolqa/train.jsonl'
dev_path = './Boolqa/dev.jsonl'
test_path = './Boolqa/test.jsonl'

train_data = load_jsonl(train_path)
dev_data = load_jsonl(dev_path)
test_data = load_jsonl(test_path)

# Convert to DataFrames
train_df = pd.DataFrame(train_data)
dev_df = pd.DataFrame(dev_data)
test_df = pd.DataFrame(test_data)

# Inspect
print("Train size:", len(train_df))
print("Dev size:", len(dev_df))
print("Test size:", len(test_df))
print("\nTrain columns:", train_df.columns.tolist())
print("Dev columns:", dev_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())
print("\nTrain sample:\n", train_df.iloc[0].to_dict())
print("Dev sample:\n", dev_df.iloc[0].to_dict())
print("Test sample:\n", test_df.iloc[0].to_dict())

Train size: 9427
Dev size: 3270
Test size: 3245

Train columns: ['question', 'title', 'answer', 'passage']
Dev columns: ['question', 'title', 'answer', 'passage']
Test columns: ['question', 'title', 'passage']

Train sample:
 {'question': 'do iran and afghanistan speak the same language', 'title': 'Persian language', 'answer': True, 'passage': 'Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.'}
Dev sample:
 {'question': 'does ethanol take more energy make th

In [None]:
def clean_data(df, is_labeled=True):
    df['question'] = df['question'].str.strip().str.replace(r'\s+', ' ', regex=True)
    df['passage'] = df['passage'].str.strip().str.replace(r'\s+', ' ', regex=True)
    if is_labeled:
        df = df.dropna(subset=['question', 'passage', 'answer'])
        df = df[df['answer'].isin([True, False])]
        df = df.drop_duplicates(subset=['question', 'passage'])
    else:
        df = df.dropna(subset=['question', 'passage'])
        df = df.drop_duplicates(subset=['question', 'passage'])
    return df

train_df_clean = clean_data(train_df, is_labeled=True)
dev_df_clean = clean_data(dev_df, is_labeled=True)
test_df_clean = clean_data(test_df, is_labeled=False)

# Check label distribution
print("Train label distribution:\n", train_df_clean['answer'].value_counts(normalize=True))
print("Dev label distribution:\n", dev_df_clean['answer'].value_counts(normalize=True))
print("Train size after cleaning:", len(train_df_clean))
print("Dev size after cleaning:", len(dev_df_clean))
print("Test size after cleaning:", len(test_df_clean))

Train label distribution:
 answer
True     0.623104
False    0.376896
Name: proportion, dtype: float64
Dev label distribution:
 answer
True     0.621713
False    0.378287
Name: proportion, dtype: float64
Train size after cleaning: 9427
Dev size after cleaning: 3270
Test size after cleaning: 3245


In [None]:
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing function
def preprocess_dataset(df, tokenizer, max_len=512, is_labeled=True):
    questions = df['question'].tolist()
    passages = df['passage'].tolist()
    labels = df['answer'].tolist() if is_labeled else None

    # Tokenize
    encoded = tokenizer(
        text=questions,
        text_pair=passages,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )

    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    labels = torch.tensor([1 if label else 0 for label in labels], dtype=torch.long) if labels is not None else None

    # Create dataset
    dataset = TensorDataset(input_ids, attention_masks, labels) if labels is not None else TensorDataset(input_ids, attention_masks)

    # Check truncation
    truncated_count = 0
    total_tokens_lost = 0
    for q, p in zip(questions, passages):
        full_encoding = tokenizer(q, p, add_special_tokens=True, truncation=False, return_tensors='pt')
        full_length = full_encoding['input_ids'].shape[1]
        if full_length > max_len:
            truncated_count += 1
            total_tokens_lost += full_length - max_len

    print(f"Truncated samples: {truncated_count}/{len(df)} ({truncated_count/len(df)*100:.2f}%)")
    if truncated_count > 0:
        print(f"Avg tokens lost per truncated sample: {total_tokens_lost/truncated_count:.2f}")

    return dataset

# Preprocess
train_dataset = preprocess_dataset(train_df_clean, tokenizer, is_labeled=True)
dev_dataset = preprocess_dataset(dev_df_clean, tokenizer, is_labeled=True)
test_dataset = preprocess_dataset(test_df_clean, tokenizer, is_labeled=False)

# Validate
print(f"Train dataset size: {len(train_dataset)}")
print(f"Dev dataset size: {len(dev_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print("Sample train entry:", next(iter(train_dataset)))  # Unpack to check

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Truncated samples: 16/9427 (0.17%)
Avg tokens lost per truncated sample: 129.94


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Truncated samples: 10/3270 (0.31%)
Avg tokens lost per truncated sample: 137.00


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Truncated samples: 6/3245 (0.18%)
Avg tokens lost per truncated sample: 82.17
Train dataset size: 9427
Dev dataset size: 3270
Test dataset size: 3245
Sample train entry: (tensor([  101,  2079,  4238,  1998,  7041,  3713,  1996,  2168,  2653,   102,
         4723,  1006,  1013,   100,  1010,  1011,  1130, 29681,  2078,  1013,
         1007,  1010,  2036,  2124,  2011,  2049,  2203, 16585,  2213,  2521,
         5332,  1006,  1291, 25573, 17149, 29824, 24830,  2521,  5332,  1006,
         1042, 29678, 23432, 29692, 29715,  5332, 23432,  1007,  1006,  4952,
         1007,  1007,  1010,  2003,  2028,  1997,  1996,  2530,  7726,  4155,
         2306,  1996, 11424,  1011,  7726,  3589,  1997,  1996, 11424,  1011,
         2647,  2653,  2155,  1012,  2009,  2003,  3952,  5287,  1999,  4238,
         1010,  7041,  1006,  3985,  2124,  2004, 18243,  2072,  2144,  3845,
         1007,  1010,  1998, 23538,  1006,  3985,  2124,  2004, 11937,  4478,
         3211,  2144,  1996,  3354,  3690,  1007,

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 8  # Safe for T4
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
dev_loader = DataLoader(dev_dataset, sampler=SequentialSampler(dev_dataset), batch_size=batch_size)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

print(f"Train batches: {len(train_loader)}")
print(f"Dev batches: {len(dev_loader)}")
print(f"Test batches: {len(test_loader)}")

Train batches: 1179
Dev batches: 409
Test batches: 406


In [None]:
from transformers import BertForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Check initial weight
initial_weight = model.bert.encoder.layer[0].attention.self.query.weight[0][0].item()
print(f"Initial weight: {initial_weight}")

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)  # Slightly lower LR
num_training_steps = len(train_loader) * 4
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=200, num_training_steps=num_training_steps)

# Training loop
best_f1 = 0.0
best_epoch = 0
for epoch in range(4):
    print(f"\nEpoch {epoch + 1}/4")
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_loader, desc="Training")):
        batch = [b.to(device) for b in batch]
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        if (step + 1) % 50 == 0:
            print(f"Step {step + 1}/{len(train_loader)}, Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_loader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # Evaluate
    model.eval()
    dev_preds, dev_labels = [], []
    total_dev_loss = 0
    with torch.no_grad():
        for batch in tqdm(dev_loader, desc="Evaluating"):
            batch = [b.to(device) for b in batch]
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_dev_loss += outputs.loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            dev_preds.extend(preds.cpu().numpy())
            dev_labels.extend(labels.cpu().numpy())

    avg_dev_loss = total_dev_loss / len(dev_loader)
    accuracy = accuracy_score(dev_labels, dev_preds)
    f1 = f1_score(dev_labels, dev_preds)
    print(f"Validation Loss: {avg_dev_loss:.4f}")
    print(f"Dev Accuracy: {accuracy:.4f}")
    print(f"Dev F1: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        best_epoch = epoch + 1
        model.save_pretrained("/content/boolq_finetuned_bert_best")
        tokenizer.save_pretrained("/content/boolq_finetuned_bert_best")
    elif epoch - best_epoch >= 1:
        print(f"Early stopping at epoch {epoch + 1}. Best F1: {best_f1:.4f}")
        break

# Check final weight
final_weight = model.bert.encoder.layer[0].attention.self.query.weight[0][0].item()
print(f"Final weight: {final_weight}")
print(f"Weight changed: {abs(final_weight - initial_weight):.6f}")

if best_f1 > 0:
    print(f"Best model saved from epoch {best_epoch} with F1 {best_f1:.4f}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initial weight: -0.01640571653842926

Epoch 1/4


Training:   4%|▍         | 50/1179 [00:34<12:51,  1.46it/s]

Step 50/1179, Loss: 0.6437


Training:   8%|▊         | 100/1179 [01:10<13:07,  1.37it/s]

Step 100/1179, Loss: 0.7725


Training:  13%|█▎        | 150/1179 [01:47<12:55,  1.33it/s]

Step 150/1179, Loss: 0.7526


Training:  17%|█▋        | 200/1179 [02:24<11:58,  1.36it/s]

Step 200/1179, Loss: 0.6058


Training:  21%|██        | 250/1179 [03:00<11:32,  1.34it/s]

Step 250/1179, Loss: 0.7434


Training:  25%|██▌       | 300/1179 [03:37<10:49,  1.35it/s]

Step 300/1179, Loss: 0.5477


Training:  30%|██▉       | 350/1179 [04:14<10:09,  1.36it/s]

Step 350/1179, Loss: 0.6156


Training:  34%|███▍      | 400/1179 [04:50<09:33,  1.36it/s]

Step 400/1179, Loss: 0.5395


Training:  38%|███▊      | 450/1179 [05:27<08:58,  1.35it/s]

Step 450/1179, Loss: 0.7012


Training:  42%|████▏     | 500/1179 [06:04<08:21,  1.35it/s]

Step 500/1179, Loss: 0.7383


Training:  47%|████▋     | 550/1179 [06:40<07:45,  1.35it/s]

Step 550/1179, Loss: 0.7326


Training:  51%|█████     | 600/1179 [07:17<07:07,  1.36it/s]

Step 600/1179, Loss: 0.5619


Training:  55%|█████▌    | 650/1179 [07:54<06:30,  1.36it/s]

Step 650/1179, Loss: 0.5831


Training:  59%|█████▉    | 700/1179 [08:30<05:54,  1.35it/s]

Step 700/1179, Loss: 0.6438


Training:  64%|██████▎   | 750/1179 [09:07<05:18,  1.35it/s]

Step 750/1179, Loss: 0.5249


Training:  68%|██████▊   | 800/1179 [09:44<04:40,  1.35it/s]

Step 800/1179, Loss: 0.7493


Training:  72%|███████▏  | 850/1179 [10:20<04:02,  1.36it/s]

Step 850/1179, Loss: 0.5103


Training:  76%|███████▋  | 900/1179 [10:57<03:25,  1.36it/s]

Step 900/1179, Loss: 0.6170


Training:  81%|████████  | 950/1179 [11:34<02:48,  1.36it/s]

Step 950/1179, Loss: 0.7311


Training:  85%|████████▍ | 1000/1179 [12:10<02:12,  1.35it/s]

Step 1000/1179, Loss: 0.6258


Training:  89%|████████▉ | 1050/1179 [12:47<01:35,  1.35it/s]

Step 1050/1179, Loss: 0.6581


Training:  93%|█████████▎| 1100/1179 [13:24<00:58,  1.35it/s]

Step 1100/1179, Loss: 0.6334


Training:  98%|█████████▊| 1150/1179 [14:00<00:21,  1.36it/s]

Step 1150/1179, Loss: 0.7371


Training: 100%|██████████| 1179/1179 [14:21<00:00,  1.37it/s]


Average Training Loss: 0.6499


Evaluating: 100%|██████████| 409/409 [01:34<00:00,  4.32it/s]


Validation Loss: 0.6072
Dev Accuracy: 0.6844
Dev F1: 0.7689

Epoch 2/4


Training:   4%|▍         | 50/1179 [00:36<13:59,  1.35it/s]

Step 50/1179, Loss: 0.5956


Training:   8%|▊         | 100/1179 [01:13<13:17,  1.35it/s]

Step 100/1179, Loss: 0.7461


Training:  13%|█▎        | 150/1179 [01:50<12:41,  1.35it/s]

Step 150/1179, Loss: 0.7095


Training:  17%|█▋        | 200/1179 [02:26<11:59,  1.36it/s]

Step 200/1179, Loss: 0.4389


Training:  21%|██        | 250/1179 [03:03<11:25,  1.36it/s]

Step 250/1179, Loss: 0.4548


Training:  25%|██▌       | 300/1179 [03:40<10:46,  1.36it/s]

Step 300/1179, Loss: 0.9234


Training:  30%|██▉       | 350/1179 [04:16<10:14,  1.35it/s]

Step 350/1179, Loss: 0.4123


Training:  34%|███▍      | 400/1179 [04:53<09:33,  1.36it/s]

Step 400/1179, Loss: 0.3735


Training:  38%|███▊      | 450/1179 [05:29<08:59,  1.35it/s]

Step 450/1179, Loss: 0.4067


Training:  42%|████▏     | 500/1179 [06:06<08:21,  1.35it/s]

Step 500/1179, Loss: 0.5356


Training:  47%|████▋     | 550/1179 [06:43<07:42,  1.36it/s]

Step 550/1179, Loss: 0.3456


Training:  51%|█████     | 600/1179 [07:19<07:06,  1.36it/s]

Step 600/1179, Loss: 0.4023


Training:  55%|█████▌    | 650/1179 [07:56<06:31,  1.35it/s]

Step 650/1179, Loss: 0.7325


Training:  59%|█████▉    | 700/1179 [08:33<05:54,  1.35it/s]

Step 700/1179, Loss: 0.4084


Training:  64%|██████▎   | 750/1179 [09:09<05:16,  1.36it/s]

Step 750/1179, Loss: 0.7512


Training:  68%|██████▊   | 800/1179 [09:46<04:40,  1.35it/s]

Step 800/1179, Loss: 0.2542


Training:  72%|███████▏  | 850/1179 [10:23<04:03,  1.35it/s]

Step 850/1179, Loss: 0.7538


Training:  76%|███████▋  | 900/1179 [11:00<03:26,  1.35it/s]

Step 900/1179, Loss: 0.6350


Training:  81%|████████  | 950/1179 [11:36<02:48,  1.36it/s]

Step 950/1179, Loss: 0.3252


Training:  85%|████████▍ | 1000/1179 [12:13<02:12,  1.35it/s]

Step 1000/1179, Loss: 0.7807


Training:  89%|████████▉ | 1050/1179 [12:50<01:35,  1.35it/s]

Step 1050/1179, Loss: 0.4634


Training:  93%|█████████▎| 1100/1179 [13:27<00:58,  1.35it/s]

Step 1100/1179, Loss: 0.5943


Training:  98%|█████████▊| 1150/1179 [14:03<00:21,  1.35it/s]

Step 1150/1179, Loss: 0.4695


Training: 100%|██████████| 1179/1179 [14:24<00:00,  1.36it/s]


Average Training Loss: 0.5408


Evaluating: 100%|██████████| 409/409 [01:34<00:00,  4.31it/s]


Validation Loss: 0.5870
Dev Accuracy: 0.6966
Dev F1: 0.7524

Epoch 3/4


Training:   4%|▍         | 50/1179 [00:36<13:56,  1.35it/s]

Step 50/1179, Loss: 0.1791


Training:   8%|▊         | 100/1179 [01:13<13:17,  1.35it/s]

Step 100/1179, Loss: 0.2806


Training:  13%|█▎        | 150/1179 [01:50<12:38,  1.36it/s]

Step 150/1179, Loss: 0.3529


Training:  17%|█▋        | 200/1179 [02:26<12:06,  1.35it/s]

Step 200/1179, Loss: 0.1454


Training:  21%|██        | 250/1179 [03:03<11:24,  1.36it/s]

Step 250/1179, Loss: 0.1034


Training:  25%|██▌       | 300/1179 [03:40<10:48,  1.36it/s]

Step 300/1179, Loss: 0.3382


Training:  30%|██▉       | 350/1179 [04:16<10:14,  1.35it/s]

Step 350/1179, Loss: 0.5836


Training:  34%|███▍      | 400/1179 [04:53<09:37,  1.35it/s]

Step 400/1179, Loss: 0.3144


Training:  38%|███▊      | 450/1179 [05:30<09:00,  1.35it/s]

Step 450/1179, Loss: 0.6901


Training:  42%|████▏     | 500/1179 [06:07<08:22,  1.35it/s]

Step 500/1179, Loss: 0.4178


Training:  47%|████▋     | 550/1179 [06:43<07:44,  1.35it/s]

Step 550/1179, Loss: 0.1787


Training:  51%|█████     | 600/1179 [07:20<07:07,  1.35it/s]

Step 600/1179, Loss: 0.6131


Training:  55%|█████▌    | 650/1179 [07:57<06:29,  1.36it/s]

Step 650/1179, Loss: 0.8581


Training:  59%|█████▉    | 700/1179 [08:33<05:51,  1.36it/s]

Step 700/1179, Loss: 0.5357


Training:  64%|██████▎   | 750/1179 [09:10<05:18,  1.35it/s]

Step 750/1179, Loss: 0.9697


Training:  68%|██████▊   | 800/1179 [09:47<04:41,  1.35it/s]

Step 800/1179, Loss: 0.7944


Training:  72%|███████▏  | 850/1179 [10:23<04:02,  1.35it/s]

Step 850/1179, Loss: 0.5138


Training:  76%|███████▋  | 900/1179 [11:00<03:25,  1.36it/s]

Step 900/1179, Loss: 0.2173


Training:  81%|████████  | 950/1179 [11:36<02:48,  1.36it/s]

Step 950/1179, Loss: 0.0666


Training:  85%|████████▍ | 1000/1179 [12:13<02:11,  1.36it/s]

Step 1000/1179, Loss: 1.2664


Training:  89%|████████▉ | 1050/1179 [12:50<01:35,  1.36it/s]

Step 1050/1179, Loss: 0.1109


Training:  93%|█████████▎| 1100/1179 [13:26<00:58,  1.36it/s]

Step 1100/1179, Loss: 0.0432


Training:  98%|█████████▊| 1150/1179 [14:03<00:21,  1.35it/s]

Step 1150/1179, Loss: 0.3690


Training: 100%|██████████| 1179/1179 [14:24<00:00,  1.36it/s]


Average Training Loss: 0.3805


Evaluating: 100%|██████████| 409/409 [01:34<00:00,  4.32it/s]


Validation Loss: 0.7416
Dev Accuracy: 0.7153
Dev F1: 0.7893

Epoch 4/4


Training:   4%|▍         | 50/1179 [00:36<13:57,  1.35it/s]

Step 50/1179, Loss: 0.1249


Training:   8%|▊         | 100/1179 [01:13<13:14,  1.36it/s]

Step 100/1179, Loss: 0.7406


Training:  13%|█▎        | 150/1179 [01:49<12:35,  1.36it/s]

Step 150/1179, Loss: 0.0220


Training:  17%|█▋        | 200/1179 [02:26<12:02,  1.35it/s]

Step 200/1179, Loss: 0.0725


Training:  21%|██        | 250/1179 [03:03<11:27,  1.35it/s]

Step 250/1179, Loss: 0.0523


Training:  25%|██▌       | 300/1179 [03:40<10:53,  1.35it/s]

Step 300/1179, Loss: 0.4677


Training:  30%|██▉       | 350/1179 [04:16<10:13,  1.35it/s]

Step 350/1179, Loss: 0.7435


Training:  34%|███▍      | 400/1179 [04:53<09:34,  1.36it/s]

Step 400/1179, Loss: 0.0839


Training:  38%|███▊      | 450/1179 [05:30<08:59,  1.35it/s]

Step 450/1179, Loss: 0.0143


Training:  42%|████▏     | 500/1179 [06:06<08:25,  1.34it/s]

Step 500/1179, Loss: 0.0797


Training:  47%|████▋     | 550/1179 [06:43<07:43,  1.36it/s]

Step 550/1179, Loss: 0.0354


Training:  51%|█████     | 600/1179 [07:19<07:07,  1.36it/s]

Step 600/1179, Loss: 0.0166


Training:  55%|█████▌    | 650/1179 [07:56<06:28,  1.36it/s]

Step 650/1179, Loss: 0.4744


Training:  59%|█████▉    | 700/1179 [08:33<05:52,  1.36it/s]

Step 700/1179, Loss: 0.0388


Training:  64%|██████▎   | 750/1179 [09:09<05:16,  1.35it/s]

Step 750/1179, Loss: 0.0990


Training:  68%|██████▊   | 800/1179 [09:46<04:38,  1.36it/s]

Step 800/1179, Loss: 0.3045


Training:  72%|███████▏  | 850/1179 [10:23<04:03,  1.35it/s]

Step 850/1179, Loss: 0.4526


Training:  76%|███████▋  | 900/1179 [10:59<03:25,  1.36it/s]

Step 900/1179, Loss: 0.6087


Training:  81%|████████  | 950/1179 [11:36<02:49,  1.35it/s]

Step 950/1179, Loss: 0.0996


Training:  85%|████████▍ | 1000/1179 [12:12<02:12,  1.35it/s]

Step 1000/1179, Loss: 0.5493


Training:  89%|████████▉ | 1050/1179 [12:49<01:35,  1.35it/s]

Step 1050/1179, Loss: 0.3210


Training:  93%|█████████▎| 1100/1179 [13:26<00:58,  1.35it/s]

Step 1100/1179, Loss: 0.2210


Training:  98%|█████████▊| 1150/1179 [14:02<00:21,  1.36it/s]

Step 1150/1179, Loss: 0.5262


Training: 100%|██████████| 1179/1179 [14:23<00:00,  1.37it/s]


Average Training Loss: 0.2681


Evaluating: 100%|██████████| 409/409 [01:34<00:00,  4.32it/s]


Validation Loss: 0.9786
Dev Accuracy: 0.7095
Dev F1: 0.7700
Final weight: -0.014194637537002563
Weight changed: 0.002211
Best model saved from epoch 3 with F1 0.7893


In [None]:
import shutil
import os

# Create a zip file of the model directory
model_dir = "/content/boolq_finetuned_bert_best"
zip_path = "/content/boolq_finetuned_bert_best.zip"
shutil.make_archive("/content/boolq_finetuned_bert_best", 'zip', model_dir)

# Download the zip file
from google.colab import files
files.download(zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os

# Check if the directory exists
model_dir = "/content/boolq_finetuned_bert_best"
if os.path.exists(model_dir):
    print(f"Directory {model_dir} exists!")
    print("Files in directory:", os.listdir(model_dir))
else:
    print(f"Directory {model_dir} does not exist.")

Directory /content/boolq_finetuned_bert_best exists!
Files in directory: ['model.safetensors', 'tokenizer_config.json', 'config.json', 'special_tokens_map.json', 'vocab.txt']


In [None]:
from transformers import BertForSequenceClassification

# Assuming `model` is still in memory (from your training cell)
# If not, we’ll reload from the last state
try:
    model.save_pretrained("./Data/boolq_finetuned_bert_best")
    tokenizer.save_pretrained("./Data/boolq_finetuned_bert_best")
    print("Model and tokenizer manually saved to /Data/boolq_finetuned_bert_best")
except NameError:
    print("Model variable not found. Let’s reload and save.")

# Verify again
if os.path.exists("/boolq_finetuned_bert_best"):
    print("Directory now exists!")
    print("Files in directory:", os.listdir("./boolq_finetuned_bert_best"))
else:
    print("Failed to save the model. Let’s troubleshoot further.")

Model and tokenizer manually saved to /Data/boolq_finetuned_bert_best
Failed to save the model. Let’s troubleshoot further.


In [None]:
from transformers import BertForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Load the best model (Epoch 3)
model = BertForSequenceClassification.from_pretrained('./Data/boolq_finetuned_bert_best')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer with stronger regularization
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.1)  # Lower LR, higher weight decay
num_training_steps = len(train_loader) * 3  # Train for 3 epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)

# Training loop
best_f1 = 0.7893  # From Epoch 3
best_epoch = 0
for epoch in range(3):
    print(f"\nEpoch {epoch + 1}/3")
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_loader, desc="Training")):
        batch = [b.to(device) for b in batch]
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        if (step + 1) % 50 == 0:
            print(f"Step {step + 1}/{len(train_loader)}, Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_loader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # Evaluate
    model.eval()
    dev_preds, dev_labels = [], []
    total_dev_loss = 0
    with torch.no_grad():
        for batch in tqdm(dev_loader, desc="Evaluating"):
            batch = [b.to(device) for b in batch]
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_dev_loss += outputs.loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            dev_preds.extend(preds.cpu().numpy())
            dev_labels.extend(labels.cpu().numpy())

    avg_dev_loss = total_dev_loss / len(dev_loader)
    accuracy = accuracy_score(dev_labels, dev_preds)
    f1 = f1_score(dev_labels, dev_preds)
    print(f"Validation Loss: {avg_dev_loss:.4f}")
    print(f"Dev Accuracy: {accuracy:.4f}")
    print(f"Dev F1: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        best_epoch = epoch + 1
        model.save_pretrained("./Data/boolq_finetuned_bert_best_v2")
        tokenizer.save_pretrained("./Data/boolq_finetuned_bert_best_v2")
        print(f"New best model saved with F1 {best_f1:.4f}")
    elif epoch - best_epoch >= 1:
        print(f"Early stopping at epoch {epoch + 1}. Best F1: {best_f1:.4f}")
        break

print(f"Best F1: {best_f1:.4f} at epoch {best_epoch}")




Epoch 1/3


Training:   4%|▍         | 50/1179 [00:37<14:01,  1.34it/s]

Step 50/1179, Loss: 0.0281


Training:   8%|▊         | 100/1179 [01:14<13:13,  1.36it/s]

Step 100/1179, Loss: 0.0229


Training:  13%|█▎        | 150/1179 [01:51<12:50,  1.34it/s]

Step 150/1179, Loss: 0.0136


Training:  17%|█▋        | 200/1179 [02:28<12:02,  1.35it/s]

Step 200/1179, Loss: 0.0124


Training:  21%|██        | 250/1179 [03:04<11:30,  1.34it/s]

Step 250/1179, Loss: 0.0098


Training:  25%|██▌       | 300/1179 [03:41<10:53,  1.34it/s]

Step 300/1179, Loss: 0.0543


Training:  30%|██▉       | 350/1179 [04:18<10:11,  1.36it/s]

Step 350/1179, Loss: 0.0125


Training:  34%|███▍      | 400/1179 [04:55<09:32,  1.36it/s]

Step 400/1179, Loss: 0.7332


Training:  38%|███▊      | 450/1179 [05:31<08:57,  1.36it/s]

Step 450/1179, Loss: 0.1191


Training:  42%|████▏     | 500/1179 [06:08<08:21,  1.35it/s]

Step 500/1179, Loss: 0.2272


Training:  47%|████▋     | 550/1179 [06:45<07:44,  1.36it/s]

Step 550/1179, Loss: 0.1435


Training:  48%|████▊     | 571/1179 [07:00<07:27,  1.36it/s]


KeyboardInterrupt: 

In [None]:
import os

# Check if the original model directory exists
print("Checking /content/boolq_finetuned_bert_best:")
if os.path.exists("./Data/boolq_finetuned_bert_best"):
    print("Found! Contents:", os.listdir("./Data/boolq_finetuned_bert_best"))
else:
    print("Not found in ./Data/")

# Try to save the current model (if still in memory)
try:
    from transformers import BertForSequenceClassification
    model.save_pretrained("./Data/boolq_finetuned_bert_best_current")
    tokenizer.save_pretrained("./Data/boolq_finetuned_bert_best_current")
    print("Current model saved to ./Data/boolq_finetuned_bert_best_current")
except NameError:
    print("Model variable not found. Unable to save current state.")

Checking /content/boolq_finetuned_bert_best:
Found! Contents: ['config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'vocab.txt']
Current model saved to ./Data/boolq_finetuned_bert_best_current


In [None]:
# Save and download cleaned DataFrames
train_df_clean.to_csv('./Data/train_df_clean.csv', index=False)
dev_df_clean.to_csv('./Data/dev_df_clean.csv', index=False)
test_df_clean.to_csv('./Data/test_df_clean.csv', index=False)
from google.colab import files
files.download('./Data/train_df_clean.csv')
files.download('./Data/dev_df_clean.csv')
files.download('./Data/test_df_clean.csv')
print("Downloaded cleaned DataFrames.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded cleaned DataFrames.


In [None]:
# Create a project directory and copy files
drive_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project"
!mkdir -p "{drive_path}"
if os.path.exists("./Data/boolq_finetuned_bert_best"):
    !cp -r /content/boolq_finetuned_bert_best "{drive_path}/"
if os.path.exists("./Data/boolq_finetuned_bert_best_current"):
    !cp -r ./Data/boolq_finetuned_bert_best_current "{drive_path}/"
!cp ./Data/train_df_clean.csv "{drive_path}/"
!cp ./Data/dev_df_clean.csv "{drive_path}/"
!cp ./Data/test_df_clean.csv "{drive_path}/"

print(f"Files backed up to {drive_path}")

Files backed up to /content/drive/My Drive/Colab Notebooks/BoolQ_Project


In [None]:
from transformers import BertForSequenceClassification
import torch
from tqdm import tqdm

if os.path.exists("/content/boolq_finetuned_bert_best"):
    model = BertForSequenceClassification.from_pretrained("/content/boolq_finetuned_bert_best")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    model.eval()
    test_preds = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Predicting"):
            batch = [b.to(device) for b in batch]
            input_ids, attention_mask = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            test_preds.extend(preds.cpu().numpy())

    test_df_clean['predicted_answer'] = [bool(pred) for pred in test_preds]
    print("Sample test predictions:\n", test_df_clean[['question', 'predicted_answer']].head())
    test_df_clean.to_csv('/content/boolq_test_predictions_epoch3.csv', index=False)
    files.download('/content/boolq_test_predictions_epoch3.csv')
    !cp /content/boolq_test_predictions_epoch3.csv "{drive_path}/"
    print("Test predictions saved and downloaded.")
else:
    print("Original model not found to generate predictions.")

Predicting: 100%|██████████| 406/406 [01:34<00:00,  4.30it/s]


Sample test predictions:
                                             question  predicted_answer
0  is the first series 20 euro note still legal t...              True
1  do the champions league winners get automatic ...              True
2                  can a bull snake kill a small dog              True
3                are all nba playoff games best of 7             False
4  can i use my train ticket on the tram in manch...              True


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Test predictions saved and downloaded.


In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

In [None]:
# Define paths
drive_path = "/content/drive/My Drive/Colab Notebooks/BoolQ_Project"

# Load cleaned DataFrames from Google Drive
train_df_clean = pd.read_csv(f"{drive_path}/train_df_clean.csv")
dev_df_clean = pd.read_csv(f"{drive_path}/dev_df_clean.csv")
test_df_clean = pd.read_csv(f"{drive_path}/test_df_clean.csv")
print("Loaded cleaned DataFrames:")
print("Train shape:", train_df_clean.shape)
print("Dev shape:", dev_df_clean.shape)
print("Test shape:", test_df_clean.shape)

Loaded cleaned DataFrames:
Train shape: (9427, 4)
Dev shape: (3270, 4)
Test shape: (3245, 3)


In [None]:
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define dataset class (same as before)
class BoolQDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = str(self.data.iloc[idx]['question'])
        passage = str(self.data.iloc[idx]['passage'])
        label = 1 if self.data.iloc[idx]['answer'] else 0

        encoding = self.tokenizer(
            question,
            passage,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation='longest_first',
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Create datasets
train_dataset = BoolQDataset(train_df_clean, tokenizer)
dev_dataset = BoolQDataset(dev_df_clean, tokenizer)
test_dataset = BoolQDataset(test_df_clean, tokenizer)

# Create DataLoaders
batch_size = 8
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
dev_loader = DataLoader(dev_dataset, sampler=SequentialSampler(dev_dataset), batch_size=batch_size)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

print(f"Train batches: {len(train_loader)}")
print(f"Dev batches: {len(dev_loader)}")
print(f"Test batches: {len(test_loader)}")

Train batches: 1179
Dev batches: 409
Test batches: 406


In [None]:
# Check if the model exists
print("Checking /content/boolq_finetuned_bert_best:")
if os.path.exists("/content/boolq_finetuned_bert_best"):
    print("Found! Contents:", os.listdir("/content/boolq_finetuned_bert_best"))
else:
    print("Not found in /content/")

# Check Google Drive
print("\nChecking Google Drive:")
if os.path.exists(f"{drive_path}/boolq_finetuned_bert_best"):
    print("Found in Drive! Copying to /content/...")
    !cp -r "{drive_path}/boolq_finetuned_bert_best" /content/
    print("Copied to /content/boolq_finetuned_bert_best")
else:
    print("Not found in Google Drive. You may need to upload manually.")

Checking /content/boolq_finetuned_bert_best:
Found! Contents: ['special_tokens_map.json', 'tokenizer_config.json', 'vocab.txt', 'model.safetensors', 'config.json']

Checking Google Drive:
Found in Drive! Copying to /content/...
Copied to /content/boolq_finetuned_bert_best
