In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Check for GPU: use CUDA when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Step 1: Load the question pairs, discard rows with any missing value, and
# keep only the two question columns plus the duplicate flag.
df = pd.read_csv("questions.csv").dropna()[['question1', 'question2', 'is_duplicate']]

# Draw a seeded, fixed-size subsample: 9000 duplicate and 6000 non-duplicate pairs.
duplicates = df.loc[df['is_duplicate'] == 1].sample(n=9000, random_state=42)
non_duplicates = df.loc[df['is_duplicate'] == 0].sample(n=6000, random_state=42)

# Merge both samples, then shuffle so the two classes are interleaved.
df = (
    pd.concat([duplicates, non_duplicates])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Step 2: Hold out 10% of the (question1, question2) pairs for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    list(df[['question1', 'question2']].itertuples(index=False, name=None)),
    df['is_duplicate'].tolist(),
    test_size=0.1,
    random_state=42,
)

# Step 3: Tokenization (uncased BERT vocabulary, matching the model loaded later)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_pairs(pairs):
    """Tokenize question pairs as two-segment inputs.

    Args:
        pairs: Iterable of ``(question1, question2)`` 2-tuples. It is consumed
            in a single pass, so one-shot iterables (``zip(...)``, generators)
            work as well as lists.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        ``padding=True``, ``truncation=True``, ``max_length=128`` and
        ``return_tensors='pt'``.
    """
    first_questions, second_questions = [], []
    # Split in one pass: iterating `pairs` once per side would exhaust a
    # one-shot iterable on the first side and leave the second side empty.
    for q1, q2 in pairs:
        first_questions.append(q1)
        second_questions.append(q2)
    return tokenizer(
        first_questions,
        second_questions,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128,
    )

# Tokenize both splits up front (training split first, then validation).
train_encodings, val_encodings = (
    tokenize_pairs(split) for split in (train_texts, val_texts)
)

# Step 4: Custom Dataset
class QuoraDataset(Dataset):
    """Map-style dataset pairing pre-computed encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable container; each
            container is indexed with the sample index.
        labels: Sequence of labels, converted once to a tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus the label under ``'labels'``."""
        sample = {name: values[idx] for name, values in self.encodings.items()}
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each tokenized split together with its labels.
train_dataset = QuoraDataset(encodings=train_encodings, labels=train_labels)
val_dataset = QuoraDataset(encodings=val_encodings, labels=val_labels)

# Step 5: Load pre-trained BERT configured for two labels and move it to `device`.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model = model.to(device)

# Step 6: Optimizer (AdamW over all model parameters)
optimizer = AdamW(params=model.parameters(), lr=2e-5)

# Step 7: Training Loop
def train(model, dataloader):
    """Run one training epoch and return the mean per-batch loss.

    Uses the module-level ``optimizer`` and ``device`` defined earlier in this
    cell.

    Args:
        model: Called as ``model(**batch)``; the result must expose ``.loss``.
        dataloader: Iterable of dict batches mapping names to tensors.

    Returns:
        The mean of the batch losses, or ``nan`` if the dataloader yielded no
        batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(dataloader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Clear gradients before the backward pass so anything left on the
        # parameters (e.g. by an interrupted earlier run in this session)
        # cannot leak into this update.
        optimizer.zero_grad()
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # Count batches here instead of calling len(dataloader), which is
        # unavailable for loaders without a length.
        num_batches += 1
    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches

# Step 8: Evaluation Function
def evaluate(model, dataloader):
    """Score the model on a dataloader, print a per-class report, return accuracy.

    Uses the module-level ``device``. Gradients are disabled while iterating.

    Args:
        model: Called as ``model(**batch)``; the result must expose ``.logits``.
        dataloader: Iterable of dict batches that include a ``'labels'`` entry.

    Returns:
        The accuracy computed by ``accuracy_score`` over all batches.
    """
    model.eval()
    predicted = []
    expected = []
    with torch.no_grad():
        for raw_batch in tqdm(dataloader, desc="Evaluating"):
            batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**batch).logits
            # Predicted class = index of the largest logit in each row.
            predicted.extend(logits.argmax(dim=1).cpu().tolist())
            expected.extend(batch['labels'].cpu().tolist())
    acc = accuracy_score(expected, predicted)
    print(classification_report(expected, predicted))
    return acc

# Step 9: Create Dataloaders
# Training batches (size 16) are reshuffled on every pass; validation batches
# (size 32) keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Step 10: Run training
# Each epoch trains on the full training loader, then evaluates on the
# validation loader; evaluate() also prints a classification report.
for epoch in range(3):  # Number of epochs; adjust as needed
    print(f"\nEpoch {epoch + 1}")
    train_loss = train(model, train_loader)
    print(f"Train Loss: {train_loss:.4f}")
    val_acc = evaluate(model, val_loader)
    print(f"Validation Accuracy: {val_acc:.4f}")


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1


Training: 100%|██████████| 844/844 [04:29<00:00,  3.13it/s]


Train Loss: 0.4507


Evaluating: 100%|██████████| 47/47 [00:08<00:00,  5.32it/s]


              precision    recall  f1-score   support

           0       0.81      0.76      0.78       615
           1       0.84      0.87      0.86       885

    accuracy                           0.83      1500
   macro avg       0.82      0.82      0.82      1500
weighted avg       0.83      0.83      0.83      1500

Validation Accuracy: 0.8273

Epoch 2


Training: 100%|██████████| 844/844 [04:33<00:00,  3.08it/s]


Train Loss: 0.2890


Evaluating: 100%|██████████| 47/47 [00:08<00:00,  5.30it/s]


              precision    recall  f1-score   support

           0       0.85      0.76      0.80       615
           1       0.85      0.91      0.87       885

    accuracy                           0.85      1500
   macro avg       0.85      0.83      0.84      1500
weighted avg       0.85      0.85      0.85      1500

Validation Accuracy: 0.8473

Epoch 3


Training:   7%|▋         | 55/844 [00:17<04:15,  3.09it/s]

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import os

# Check for GPU: train on CUDA if available, else on the CPU.
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")

# Step 1: Load and filter the dataset. Rows with any missing value are
# dropped before narrowing to the three columns used below.
df = pd.read_csv("/questions.csv")
df = df.dropna()[['question1', 'question2', 'is_duplicate']]

# Limit to 9000 duplicate and 6000 non-duplicate examples (seeded samples).
duplicates = df[df['is_duplicate'].eq(1)].sample(n=9000, random_state=42)
non_duplicates = df[df['is_duplicate'].eq(0)].sample(n=6000, random_state=42)

# Combine the two samples, then shuffle the rows.
df = pd.concat([duplicates, non_duplicates])
df = df.sample(frac=1, random_state=42)

# Step 2: Split the dataset, holding out 10% of the pairs for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    list(zip(df['question1'].tolist(), df['question2'].tolist())),
    df['is_duplicate'].tolist(),
    test_size=0.1,
    random_state=42,
)

# Step 3: Tokenization (uncased BERT vocabulary, matching the model loaded later)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_pairs(pairs):
    """Tokenize question pairs as two-segment inputs.

    Args:
        pairs: Iterable of ``(question1, question2)`` 2-tuples. Any iterable is
            accepted, including one-shot ones such as ``zip(...)``.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        ``padding=True``, ``truncation=True``, ``max_length=128`` and
        ``return_tensors='pt'``.
    """
    # Materialize once: the two comprehensions below each iterate the pairs,
    # which would leave the second list empty for a one-shot iterable.
    pairs = list(pairs)
    first_questions = [q1 for q1, _ in pairs]
    second_questions = [q2 for _, q2 in pairs]
    return tokenizer(
        first_questions,
        second_questions,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128,
    )

# Encode the training split, then the validation split, with the same settings.
train_encodings, val_encodings = map(tokenize_pairs, (train_texts, val_texts))

# Step 4: Custom Dataset Class
class QuoraDataset(Dataset):
    """Dataset that serves pre-tokenized samples together with their labels.

    Args:
        encodings: Mapping from field name to an indexable container; each
            container is indexed with the sample index.
        labels: Sequence of labels, stored as a tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __len__(self):
        """Return how many labelled samples the dataset holds."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the sample dict for ``idx``: all encoding fields, then ``'labels'``."""
        return {
            **{field: values[idx] for field, values in self.encodings.items()},
            'labels': self.labels[idx],
        }

# Pair each tokenized split with its labels.
train_dataset, val_dataset = (
    QuoraDataset(train_encodings, train_labels),
    QuoraDataset(val_encodings, val_labels),
)

# Step 5: Load pre-trained BERT configured for two labels, placed on `device`.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

# Step 6: Optimizer (AdamW over all model parameters)
optimizer = AdamW(params=model.parameters(), lr=2e-5)

# Save checkpoint
def save_checkpoint(model, optimizer, epoch, path="checkpoint.pth"):
    """Persist model and optimizer state so training can be resumed.

    When `path` is a filesystem path, the checkpoint is written to a temporary
    sibling file and then moved into place with ``os.replace``. An interruption
    mid-write therefore cannot leave a truncated checkpoint at `path`; any
    earlier checkpoint stays intact until the new one is complete.

    Args:
        model: Object providing ``state_dict()``.
        optimizer: Object providing ``state_dict()``.
        epoch: Index of the epoch that just finished; stored under ``'epoch'``.
        path: Destination path, or a file-like object accepted by
            ``torch.save`` (written directly, without the temporary file).
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    if isinstance(path, (str, os.PathLike)):
        target = os.fspath(path)
        tmp_path = f"{target}.tmp"
        try:
            torch.save(checkpoint, tmp_path)
            os.replace(tmp_path, target)
        finally:
            # Only present if saving or moving failed; don't leave it behind.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    else:
        # File-like target: there is no path to swap, so write straight through.
        torch.save(checkpoint, path)
    print(f"Checkpoint saved at epoch {epoch}!")

# Load checkpoint
def load_checkpoint(model, optimizer, path="checkpoint.pth"):
    """Restore training state from `path` if a checkpoint file exists there.

    Uses the module-level ``device`` as ``map_location`` when loading.

    Args:
        model: Receives the saved ``'model_state_dict'``.
        optimizer: Receives the saved ``'optimizer_state_dict'``.
        path: Checkpoint file location.

    Returns:
        The epoch index to resume from (saved ``'epoch'`` plus one), or 0 when
        no file exists at `path`.
    """
    if not os.path.isfile(path):
        print("⚡ No checkpoint found. Starting from scratch.")
        return 0

    state = torch.load(path, map_location=device)
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    # The stored epoch is the last one completed, so resume with the next.
    start_epoch = state['epoch'] + 1
    print(f" Checkpoint loaded! Resuming from epoch {start_epoch}")
    return start_epoch

# Step 7: Training Loop
def train(model, dataloader):
    """Train the model for one pass over `dataloader`; return the mean batch loss.

    Uses the module-level ``optimizer`` and ``device`` defined earlier in this
    cell.

    Args:
        model: Called as ``model(**batch)``; the result must expose ``.loss``.
        dataloader: Iterable of dict batches mapping names to tensors.

    Returns:
        The mean of the batch losses, or ``nan`` if the dataloader yielded no
        batches.
    """
    model.train()
    total_loss = 0.0
    batches_seen = 0
    for batches_seen, batch in enumerate(tqdm(dataloader, desc="Training"), start=1):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Reset gradients first: anything still on the parameters (for
        # instance after resuming in a session where a previous run was
        # interrupted) must not be added to this batch's gradients.
        optimizer.zero_grad()
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # `batches_seen` is counted during iteration, so loaders without a length
    # work, and an empty loader reports nan instead of dividing by zero.
    if batches_seen == 0:
        return float("nan")
    return total_loss / batches_seen

# Step 8: Evaluation Function
def evaluate(model, dataloader):
    """Evaluate on `dataloader`: print a classification report, return accuracy.

    Uses the module-level ``device``. No gradients are tracked during the pass.

    Args:
        model: Called as ``model(**batch)``; the result must expose ``.logits``.
        dataloader: Iterable of dict batches that include a ``'labels'`` entry.

    Returns:
        The accuracy computed by ``accuracy_score`` over every batch.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for cpu_batch in tqdm(dataloader, desc="Evaluating"):
            batch = {key: value.to(device) for key, value in cpu_batch.items()}
            outputs = model(**batch)
            # Highest-scoring class per row is the prediction.
            batch_preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(batch_preds.cpu().tolist())
            all_labels.extend(batch['labels'].cpu().tolist())
    acc = accuracy_score(all_labels, all_preds)
    print(classification_report(all_labels, all_preds))
    return acc

# Step 9: Create Dataloaders
# Training batches (size 16) are reshuffled on every pass; validation batches
# (size 32) keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Step 10: Load checkpoint if exists
# load_checkpoint returns 0 when there is no checkpoint file, otherwise the
# index of the first epoch that still has to run.
start_epoch = load_checkpoint(model, optimizer, path="checkpoint.pth")

# Step 11: Run Training
num_epochs = 3  # Total number of epochs; tune as needed
# If the checkpoint already covers every epoch, the range is empty and the
# loop body is skipped.
for epoch in range(start_epoch, num_epochs):
    print(f"\n🔵 Epoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_loader)
    print(f"Train Loss: {train_loss:.4f}")

    # Save checkpoint after each epoch, before evaluation, so the trained
    # weights are on disk even if evaluation is interrupted.
    save_checkpoint(model, optimizer, epoch, path="checkpoint.pth")

    val_acc = evaluate(model, val_loader)
    print(f"Validation Accuracy: {val_acc:.4f}")

# After all epochs, save final model (weights only, without optimizer state)
torch.save(model.state_dict(), "final_model.pth")
print("Final model saved as final_model.pth!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


⚡ No checkpoint found. Starting from scratch.

🔵 Epoch 1/3


Training: 100%|██████████| 844/844 [04:45<00:00,  2.95it/s]


Train Loss: 0.4316
Checkpoint saved at epoch 0!


Evaluating: 100%|██████████| 47/47 [00:09<00:00,  5.03it/s]


              precision    recall  f1-score   support

           0       0.81      0.80      0.81       615
           1       0.86      0.87      0.87       885

    accuracy                           0.84      1500
   macro avg       0.84      0.84      0.84      1500
weighted avg       0.84      0.84      0.84      1500

Validation Accuracy: 0.8420

🔵 Epoch 2/3


Training: 100%|██████████| 844/844 [04:49<00:00,  2.92it/s]


Train Loss: 0.2659
Checkpoint saved at epoch 1!


Evaluating: 100%|██████████| 47/47 [00:09<00:00,  5.01it/s]


              precision    recall  f1-score   support

           0       0.88      0.73      0.80       615
           1       0.83      0.93      0.88       885

    accuracy                           0.85      1500
   macro avg       0.86      0.83      0.84      1500
weighted avg       0.85      0.85      0.85      1500

Validation Accuracy: 0.8487

🔵 Epoch 3/3


Training: 100%|██████████| 844/844 [04:49<00:00,  2.92it/s]


Train Loss: 0.1560
Checkpoint saved at epoch 2!


Evaluating: 100%|██████████| 47/47 [00:09<00:00,  5.03it/s]


              precision    recall  f1-score   support

           0       0.85      0.76      0.80       615
           1       0.84      0.91      0.88       885

    accuracy                           0.85      1500
   macro avg       0.85      0.83      0.84      1500
weighted avg       0.85      0.85      0.85      1500

Validation Accuracy: 0.8480
Final model saved as final_model.pth!
