In [1]:
!pip install datasets transformers --quiet



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset
from transformers import BertTokenizerFast
from datasets import load_dataset
from sklearn.model_selection import KFold
from tqdm import tqdm
import numpy as np


In [18]:
# Load dataset
dataset = load_dataset("imdb")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

MAX_LEN = 256

def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=MAX_LEN)

# Tokenize train and test
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [19]:
print(tokenized_dataset.shape)

{'train': (25000, 5), 'test': (25000, 5), 'unsupervised': (50000, 5)}


In [20]:
class IMDBDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.encodings["label"][idx],
        }

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = IMDBDataset(tokenized_dataset["train"].select(range(5000)))
test_dataset = IMDBDataset(tokenized_dataset["test"].select(range(2000)))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


# Use subset for faster training in cross-validation
full_dataset = IMDBDataset(tokenized_dataset["train"].select(range(2000)))

print(train_dataset[:5])
print(test_dataset)
print(full_dataset)


{'input_ids': tensor([[  101,  1045, 12524,  ...,  7849, 24544,   102],
        [  101,  1000,  1045,  ...,  2156,  2931,   102],
        [  101,  2065,  2069,  ...,     0,     0,     0],
        [  101,  2023,  2143,  ...,     0,     0,     0],
        [  101,  2821,  1010,  ...,  4151, 24665,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([0, 0, 0, 0, 0])}
<__main__.IMDBDataset object at 0x708240063760>
<__main__.IMDBDataset object at 0x70825471c1f0>


In [21]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, num_classes, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)  # (B, L, E)
        x = x.permute(1, 0, 2)  # (L, B, E) for transformer
        x = self.encoder(x, src_key_padding_mask=~attention_mask.bool())
        x = x.mean(dim=0)  # global average pooling
        return self.fc(x)


In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VOCAB_SIZE = tokenizer.vocab_size
model = TransformerClassifier(vocab_size=VOCAB_SIZE, embed_dim=128, num_heads=4, num_layers=2, num_classes=2).to(device)

optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()


In [23]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss, correct = 0, 0
    for batch in tqdm(dataloader, desc="Training", leave=False):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()

    return total_loss / len(dataloader), correct / len(dataloader.dataset)

def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask)
            correct += (outputs.argmax(1) == labels).sum().item()
    return correct / len(dataloader.dataset)


In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VOCAB_SIZE = tokenizer.vocab_size

model = TransformerClassifier(vocab_size=VOCAB_SIZE, embed_dim=128, num_heads=4, num_layers=2, num_classes=2).to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

print("🚀 Training full model on IMDB train set...\n")
for epoch in range(3):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}")

test_acc = evaluate(model, test_loader, device)
print(f"\n📊 Test Accuracy on IMDB test set: {test_acc:.4f}")


🚀 Training full model on IMDB train set...



                                                           

Epoch 1 | Train Loss: 0.0032 | Train Accuracy: 0.9990


                                                           

Epoch 2 | Train Loss: 0.0001 | Train Accuracy: 1.0000


                                                           

Epoch 3 | Train Loss: 0.0001 | Train Accuracy: 1.0000

📊 Test Accuracy on IMDB test set: 1.0000


In [28]:
print("\n🔁 Running 5-Fold Cross-Validation on 2000-sample subset...\n")

# Subset for CV
cv_dataset = IMDBDataset(tokenized_dataset["train"].select(range(2000)))
kf = KFold(n_splits=5)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(cv_dataset)):
    print(f"\n🚨 Fold {fold+1}/5")
    train_subset = Subset(cv_dataset, train_idx)
    val_subset = Subset(cv_dataset, val_idx)

    train_loader_cv = DataLoader(train_subset, batch_size=16, shuffle=True)
    val_loader_cv = DataLoader(val_subset, batch_size=16)

    model = TransformerClassifier(vocab_size=VOCAB_SIZE, embed_dim=128, num_heads=4, num_layers=2, num_classes=2).to(device)
    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(2):
        print(f"Epoch {epoch+1}/2")
        loss, acc = train_epoch(model, train_loader_cv, optimizer, criterion, device)
        val_acc = evaluate(model, val_loader_cv, device)
        print(f"Train Loss: {loss:.4f}, Train Acc: {acc:.4f}, Val Acc: {val_acc:.4f}")

    fold_results.append(val_acc)

print("\n✅ K-Fold Validation Accuracies:", fold_results)
print(f"📈 Mean Accuracy: {np.mean(fold_results):.4f} ± {np.std(fold_results):.4f}")



🔁 Running 5-Fold Cross-Validation on 2000-sample subset...


🚨 Fold 1/5
Epoch 1/2


                                                           

Train Loss: 0.0122, Train Acc: 0.9956, Val Acc: 1.0000
Epoch 2/2


                                                           

Train Loss: 0.0004, Train Acc: 1.0000, Val Acc: 1.0000

🚨 Fold 2/5
Epoch 1/2


                                                           

Train Loss: 0.0123, Train Acc: 0.9906, Val Acc: 1.0000
Epoch 2/2


                                                           

Train Loss: 0.0003, Train Acc: 1.0000, Val Acc: 1.0000

🚨 Fold 3/5
Epoch 1/2


                                                           

Train Loss: 0.0081, Train Acc: 1.0000, Val Acc: 1.0000
Epoch 2/2


                                                           

Train Loss: 0.0002, Train Acc: 1.0000, Val Acc: 1.0000

🚨 Fold 4/5
Epoch 1/2


                                                           

Train Loss: 0.0135, Train Acc: 0.9912, Val Acc: 1.0000
Epoch 2/2


                                                           

Train Loss: 0.0005, Train Acc: 1.0000, Val Acc: 1.0000

🚨 Fold 5/5
Epoch 1/2


                                                           

Train Loss: 0.0075, Train Acc: 0.9994, Val Acc: 1.0000
Epoch 2/2


                                                           

Train Loss: 0.0001, Train Acc: 1.0000, Val Acc: 1.0000

✅ K-Fold Validation Accuracies: [1.0, 1.0, 1.0, 1.0, 1.0]
📈 Mean Accuracy: 1.0000 ± 0.0000
