In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from torch.optim import AdamW
from torch.amp import GradScaler, autocast
import os
import time
import sys

In [2]:
# ------------------------ Configurations ------------------------
MAX_LEN = 128
MODEL_PATH = './roberta_sentiment_model.pt'
TEST_SAMPLE_SIZE = 7500
BATCH_SIZE = 2  # Lower due to limited VRAM (RTX 2050 4GB)
LEARNING_RATE = 2e-5
EPOCHS = 3
MODEL_NAME = 'roberta-base'
GRADIENT_ACCUMULATION_STEPS = 8  # Simulates effective batch size of 16

# Device & mixed precision scaler
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
scaler = GradScaler()

# Worker config for Windows/Linux compatibility
NUM_WORKERS = 0 if os.name == 'nt' else 2

In [3]:
# ------------------------ Dataset Class ------------------------
class IMDBDataset(Dataset):
    """Map-style dataset that pairs review text with its sentiment label.

    Tokenisation happens lazily, one review per ``__getitem__`` call, using
    the tokenizer's ``encode_plus`` method.

    Args:
        reviews: Indexable collection of reviews (each is passed through ``str``).
        targets: Indexable collection of labels, aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus(text, **options)``.
        max_len: Length every encoded review is padded/truncated to.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews, self.targets = reviews, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of reviews held by the dataset."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode review ``item`` and return it with its label.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer's
            tensors, flattened to 1-D) and ``targets`` (a ``torch.long``
            tensor built from the label).
        """
        text = str(self.reviews[item])
        label = self.targets[item]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        # return_tensors='pt' yields a leading batch axis; flatten() drops it
        # so the DataLoader can stack samples itself.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample


In [4]:
# ------------------------ Data Loader Creator ------------------------
def create_data_loaders(df, tokenizer, max_len, batch_size, test_sample_size):
    """Split ``df`` into train/val/test and wrap each part in a DataLoader.

    The test split is carved out first (``test_sample_size`` is forwarded as
    ``test_size``), then 10% of what remains becomes the validation split.
    Both splits use ``random_state=42``.

    Args:
        df: DataFrame with ``review`` and ``sentiment`` columns.
        tokenizer: Tokenizer handed to every ``IMDBDataset``.
        max_len: Sequence length handed to every ``IMDBDataset``.
        batch_size: Batch size shared by the three loaders.
        test_sample_size: ``test_size`` argument of the first split.

    Returns:
        ``(train_loader, val_loader, test_loader)``; only the training loader
        shuffles and pins memory.
    """
    remaining_df, test_df = train_test_split(df, test_size=test_sample_size, random_state=42)
    train_df, val_df = train_test_split(remaining_df, test_size=0.1, random_state=42)

    def as_dataset(frame):
        # Hand plain NumPy arrays to the dataset so positional indexing works.
        return IMDBDataset(frame.review.to_numpy(), frame.sentiment.to_numpy(), tokenizer, max_len)

    def as_loader(frame, **loader_options):
        return DataLoader(as_dataset(frame), batch_size=batch_size,
                          num_workers=NUM_WORKERS, **loader_options)

    train_loader = as_loader(train_df, shuffle=True, pin_memory=True)
    val_loader = as_loader(val_df, shuffle=False)
    test_loader = as_loader(test_df, shuffle=False, pin_memory=False)

    return train_loader, val_loader, test_loader


In [5]:
# ------------------------ Training Loop ------------------------
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    model.train()
    losses = []
    correct_predictions = 0
    autocast_device = 'cuda' if device.type == 'cuda' else 'cpu'

    progress_bar = tqdm(enumerate(data_loader), total=len(data_loader), desc="Training")

    for batch_idx, batch in progress_bar:
        try:
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            targets = batch['targets'].to(device, non_blocking=True)

            with autocast(device_type=autocast_device):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
                loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS

            scaler.scale(loss).backward()

            if (batch_idx + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item() * GRADIENT_ACCUMULATION_STEPS)

            progress_bar.set_postfix({
                'loss': np.mean(losses[-10:]),
                'acc': f"{(correct_predictions.double() / ((batch_idx + 1) * BATCH_SIZE) * 100):.1f}%",
                'GPU': f"{torch.cuda.memory_allocated(device)/1024**2:.0f} MB"
            })

            if batch_idx % 50 == 0:
                sys.stdout.flush()
        except RuntimeError as e:
            print(f"⚠️ Skipping batch {batch_idx} due to error: {e}")
            optimizer.zero_grad(set_to_none=True)
            continue

    return correct_predictions.double() / n_examples, np.mean(losses)

In [6]:
# ------------------------ Evaluation Loop ------------------------
def eval_model(model, data_loader, device, n_examples):
    """Score ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` and returning an object with ``.loss`` and ``.logits``.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        device: Device every batch tensor is moved to.
        n_examples: Denominator for the returned accuracy.

    Returns:
        ``(accuracy, mean_loss)``: correct predictions / ``n_examples`` as a
        double tensor, and the mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    n_correct = 0
    tensor_keys = ('input_ids', 'attention_mask', 'targets')

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            ids, mask, labels = (batch[key].to(device) for key in tensor_keys)

            result = model(input_ids=ids, attention_mask=mask, labels=labels)

            # The predicted class is the index of the largest logit per row.
            predicted = result.logits.argmax(dim=1)
            n_correct += (predicted == labels).sum()
            batch_losses.append(result.loss.item())

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [7]:
# ------------------------ Main Training Routine ------------------------
def train_model():
    """Fine-tune RoBERTa on the IMDB CSV, then test the best checkpoint.

    Steps:
        1. Read the CSV and map ``sentiment`` to 1 ('positive') or 0 (anything else).
        2. Build tokenizer, model, data loaders, AdamW and a linear LR schedule.
        3. For each epoch: train, validate, append a row to ``training_log.txt``
           and save the weights to ``MODEL_PATH`` when validation accuracy improves.
        4. Reload the best weights saved during this run and report test
           loss/accuracy.

    A failing epoch is logged to ``error_log.txt`` and skipped rather than
    aborting the run. If no epoch saved a checkpoint, the final test uses the
    weights currently in memory instead of loading a file this run did not write.
    """
    # ✅ Load and preprocess the IMDB dataset
    df = pd.read_csv('../../data/raw/IMDB Dataset.csv')  # Make sure this CSV file exists!
    df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

    tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

    train_loader, val_loader, test_loader = create_data_loaders(df, tokenizer, MAX_LEN, BATCH_SIZE, TEST_SAMPLE_SIZE)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_loader) * EPOCHS // GRADIENT_ACCUMULATION_STEPS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    best_accuracy = 0.0
    checkpoint_saved = False  # True once THIS run has written MODEL_PATH
    with open("training_log.txt", "w") as f:
        f.write("Epoch,Train Loss,Train Acc,Val Loss,Val Acc\n")

    for epoch in range(EPOCHS):
        print(f"\n{'='*50}\nEpoch {epoch + 1}/{EPOCHS}\n{'='*50}")

        try:
            epoch_start = time.time()
            train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, scheduler, len(train_loader.dataset))
            val_acc, val_loss = eval_model(model, val_loader, device, len(val_loader.dataset))
            epoch_time = (time.time() - epoch_start) / 60

            # Accuracies come back as 0-dim tensors; keep plain floats so the
            # best-accuracy bookkeeping does not hold on to device tensors.
            train_acc, val_acc = float(train_acc), float(val_acc)

            print(f"\nEpoch {epoch + 1} Summary:")
            print(f"Time: {epoch_time:.1f} min")
            print(f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f}")
            print(f"Val Loss: {val_loss:.4f} | Acc: {val_acc:.4f}")

            with open("training_log.txt", "a") as f:
                f.write(f"{epoch+1},{train_loss:.4f},{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n")

            if val_acc > best_accuracy:
                torch.save(model.state_dict(), MODEL_PATH)
                best_accuracy = val_acc
                checkpoint_saved = True
                print("✅ Saved new best model!")

        except Exception as e:
            # Best-effort: record the failure and move on to the next epoch.
            print(f"\n❌ Error in epoch {epoch + 1}: {str(e)}")
            with open("error_log.txt", "a") as f:
                f.write(f"Epoch {epoch+1}: {str(e)}\n")
            continue

    # ------------------------ Final Test Evaluation ------------------------
    print(f"\n{'='*50}\nTesting Best Model...\n{'='*50}")
    if checkpoint_saved:
        # weights_only=True restricts unpickling to tensors/containers (and
        # silences the torch.load FutureWarning); map_location keeps the load
        # working when the current device differs from the one used to save.
        state_dict = torch.load(MODEL_PATH, map_location=device, weights_only=True)
        model.load_state_dict(state_dict)
    else:
        # Do not load a missing or stale file left over from an earlier run.
        print("⚠️ No checkpoint was saved during this run; evaluating the current in-memory weights.")
    test_acc, test_loss = eval_model(model, test_loader, device, len(test_loader.dataset))
    print(f"\nTest Results → Loss: {test_loss:.4f} | Accuracy: {test_acc:.4f}")


In [8]:

# ------------------------ Run ------------------------
# Entry point: start the full train/validate/test pipeline only when this code
# is executed directly (where __name__ is '__main__'), not when it is imported
# as a module.
if __name__ == '__main__':
    train_model()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training:   0%|          | 0/19125 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/2125 [00:00<?, ?it/s]


Epoch 1 Summary:
Time: 29.6 min
Train Loss: 0.3031 | Acc: 0.8785
Val Loss: 0.2534 | Acc: 0.9042
✅ Saved new best model!

Epoch 2/3


Training:   0%|          | 0/19125 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/2125 [00:00<?, ?it/s]


Epoch 2 Summary:
Time: 25.4 min
Train Loss: 0.2023 | Acc: 0.9313
Val Loss: 0.3024 | Acc: 0.9080
✅ Saved new best model!

Epoch 3/3


Training:   0%|          | 0/19125 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/2125 [00:00<?, ?it/s]


Epoch 3 Summary:
Time: 25.5 min
Train Loss: 0.1447 | Acc: 0.9602
Val Loss: 0.3748 | Acc: 0.9078

Testing Best Model...


  model.load_state_dict(torch.load(MODEL_PATH))


Evaluating:   0%|          | 0/3750 [00:00<?, ?it/s]


Test Results → Loss: 0.2784 | Accuracy: 0.9129
