# Initial Setup

In [1]:
from google.colab import drive
import os
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Oct 27 19:24:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   44C    P8             12W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import psutil

ram_gb = psutil.virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 56.9 gigabytes of available RAM

You are using a high-RAM runtime!


# Imports

In [4]:
!pip install torchview

from google.colab import drive
import os

import pickle
import json
import time
import psutil
import shutil

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import label_binarize
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    classification_report,
    roc_curve,
    auc,
    confusion_matrix
)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from transformers import BertModel, BertTokenizer, Trainer, TrainingArguments, AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, accuracy_score, f1_score
import numpy as np

from torchview import draw_graph
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

Collecting torchview
  Downloading torchview-0.2.7-py3-none-any.whl.metadata (13 kB)
Downloading torchview-0.2.7-py3-none-any.whl (26 kB)
Installing collected packages: torchview
Successfully installed torchview-0.2.7


# Setting up Paths

In [5]:
def setup_paths(model_name: str, base_dir="/content/drive/MyDrive/ITRPA_PROJ"):
    """
    Initialize directory structure for a given model.
    Returns base_dir, output_dir, model_dir.
    """
    output_dir = os.path.join(base_dir, "outputs_colab_final")
    model_dir = os.path.join(output_dir, model_name)

    os.makedirs(model_dir, exist_ok=True)

    print(f"Base directory: {base_dir}")
    print(f"Outputs directory: {output_dir}")
    print(f"Model directory: {model_dir}")

    return base_dir, output_dir, model_dir

# Loading Data

In [6]:
def load_datafile(config, data_filename="reviews.csv", use_cached=True):
    """
    Loads or reloads the dataset depending on 'use_cached' flag.

    If use_cached=True:
        - Tries to find dataset already loaded in memory (globals()).
        - Otherwise loads from CSV and constructs X, y, label_enc.

    If use_cached=False:
        - Always reloads dataset from disk.

    Parameters
    ----------
    config : dict
        Configuration dictionary containing BASE_DIR or DATA_DIR keys.
    data_filename : str, optional
        CSV filename (default: 'reviews.csv').
    use_cached : bool, optional
        If True, reuse dataset if already loaded in memory.

    Returns
    -------
    X : pd.Series
        Text reviews
    y_encoded : np.ndarray
        Encoded sentiment labels (0=negative, 1=neutral, 2=positive)
    label_enc : LabelEncoder
        Fitted encoder mapping sentiments to integers
    """

    # --- Step 1: Check if data already exists in memory
    if use_cached:
        if all(var in globals() for var in ["X", "y", "label_enc"]):
            print("Dataset found in memory — using cached variables.")
            return globals()["X"], globals()["y"], globals()["label_enc"]
        else:
            print("No cached dataset found. Reloading from disk...")

    # --- Step 2: Load dataset from disk
    base_dir = config.get("BASE_DIR", "/content/drive/MyDrive/ITRPA_PROJ")
    data_dir = os.path.join(base_dir, "data")
    os.makedirs(data_dir, exist_ok=True)

    data_path = os.path.join(data_dir, data_filename)
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Dataset not found at: {data_path}")

    df = pd.read_csv(data_path)
    if not {"Review", "Label"}.issubset(df.columns):
        raise ValueError("CSV must contain 'Review' and 'Label' columns.")

    # --- Step 3: Process dataset
    X = df["Review"].astype(str)
    y = pd.to_numeric(df["Label"], errors="coerce")

    if not y.dropna().between(1, 5).all():
        raise ValueError("Label column must contain numeric values 1–5.")

    mapping = {1: "negative", 2: "negative", 3: "neutral", 4: "positive", 5: "positive"}
    y_sentiment = y.map(mapping)

    label_enc = LabelEncoder()
    y_encoded = label_enc.fit_transform(y_sentiment)

    # --- Step 4: Store in globals for reuse
    globals()["X"], globals()["y"], globals()["label_enc"] = X, y_encoded, label_enc

    # --- Step 5: Summary output
    print(f"Loaded '{data_filename}' from {data_dir} | Shape: {df.shape}")
    print("Sentiment distribution:")
    print(y_sentiment.value_counts())
    print("Label encoding:", dict(zip(label_enc.classes_, label_enc.transform(label_enc.classes_))))

    return X, y_encoded, label_enc

# Train Test Split

In [7]:
def prepare_splits(X, y, train_size=0.7, random_state=42):
    """
    Splits X and y into train, validation, and test sets,
    then fits and applies a LabelEncoder consistently across all.

    Parameters
    ----------
    X : array-like or DataFrame
        Input features.
    y : array-like or Series
        Target labels.
    train_size : float, default=0.7
        Proportion of data to use for training.
    random_state : int, default=42
        Random seed for reproducibility.

    Returns
    -------
    X_train, X_val, X_test : arrays/DataFrames
        Feature splits.
    y_train_enc, y_val_enc, y_test_enc : arrays
        Encoded label splits.
    label_enc : LabelEncoder
        Fitted LabelEncoder instance.
    """

    # First split train vs temp (val+test)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y,
        train_size=train_size,
        random_state=random_state,
        stratify=y
    )

    # Split temp equally into val/test
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp,
        test_size=0.5,
        random_state=random_state,
        stratify=y_temp
    )

    # Label encode consistently across all sets
    label_enc = LabelEncoder()
    label_enc.fit(list(y_train) + list(y_val) + list(y_test))

    y_train_enc = label_enc.transform(y_train)
    y_val_enc   = label_enc.transform(y_val)
    y_test_enc  = label_enc.transform(y_test)

    # Print summary
    print(f" Train: {len(X_train)/len(X):.1%}")
    print(f" Val:   {len(X_val)/len(X):.1%}")
    print(f" Test:  {len(X_test)/len(X):.1%}")
    print("Classes found:", label_enc.classes_)

    return X_train, X_val, X_test, y_train_enc, y_val_enc, y_test_enc, label_enc

# Setting up Base Model

In [8]:
def load_base_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    base_model = AutoModel.from_pretrained(model_name)
    print(f"Loaded base model: {model_name}")
    return tokenizer, base_model

# Setting Dataset Class

In [9]:
class TextDataset(Dataset):
    def __init__(self, texts, labels=None, tokenizer=None, max_len=128, inference=False):
        """
        texts: list/array of strings
        labels: list/array of ints (optional)
        tokenizer: Hugging Face tokenizer
        max_len: maximum sequence length
        inference: bool, if True, skip labels entirely (for inference)
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.inference = inference

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

        # Only attach labels if not inference mode
        if not self.inference:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

# Create Optimizer and Loss Functions

In [10]:
def build_optimizer(
    model,
    optimizer_name="adamw",
    lr_base=2e-5,
    lr_lstm=1e-4,
    lr_classifier=1e-4,
    weight_decay_base=0.01,
    weight_decay_others=0.0,
    fine_tune_base=True  #  allow the config to control this
):
    """
    Build an optimizer with explicit learning rates for:
      - Transformer base (BERT/Roberta/etc.)
      - Recurrent block (RNN/LSTM/GRU)
      - Classifier head

    Args:
        model (nn.Module): Model with attributes like .bert, .roberta, .base_model, .rnn/.gru/.lstm, and .classifier
        optimizer_name (str): 'adamw', 'adam', or 'sgd'
        lr_base (float): learning rate for transformer/base model
        lr_lstm (float): learning rate for RNN/LSTM/GRU
        lr_classifier (float): learning rate for classifier head
        weight_decay_base (float): weight decay for base model
        weight_decay_others (float): weight decay for RNN/classifier
        fine_tune_base (bool): if False, excludes the base transformer from optimizer groups
    """
    name = optimizer_name.lower()
    opt_classes = {
        "adamw": torch.optim.AdamW,
        "adam": torch.optim.Adam,
        "sgd": torch.optim.SGD,
    }
    if name not in opt_classes:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Identify base transformer model
    base_model = None
    for attr in ["bert", "roberta", "distilbert", "base_model"]:
        if hasattr(model, attr):
            base_model = getattr(model, attr)
            break

    # Only include base params if fine_tune_base=True
    base_params = []
    if fine_tune_base and base_model is not None:
        base_params = list(base_model.parameters())

    # Identify recurrent layer
    rnn_params = []
    for attr in ["rnn", "gru", "bigru", "lstm"]:
        if hasattr(model, attr):
            rnn_params = list(getattr(model, attr).parameters())
            break

    # Identify classifier head
    cls_params = list(model.classifier.parameters()) if hasattr(model, "classifier") else []

    # Build parameter groups
    param_groups = []
    if base_params:
        param_groups.append({
            "params": base_params,
            "lr": lr_base,
            "weight_decay": weight_decay_base
        })
    if rnn_params:
        param_groups.append({
            "params": rnn_params,
            "lr": lr_lstm,
            "weight_decay": weight_decay_others
        })
    if cls_params:
        param_groups.append({
            "params": cls_params,
            "lr": lr_classifier,
            "weight_decay": weight_decay_others
        })

    if not param_groups:
        param_groups = [{
            "params": model.parameters(),
            "lr": lr_base,
            "weight_decay": weight_decay_base
        }]
        print("No specific groups found — using all model parameters as one group.")

    # Create optimizer
    if name == "sgd":
        optimizer = opt_classes[name](param_groups, momentum=0.9)
    else:
        optimizer = opt_classes[name](param_groups)

    # Summary
    print("Optimizer parameter groups:")
    for i, g in enumerate(optimizer.param_groups):
        n_params = sum(p.numel() for p in g["params"])
        print(f"  Group {i}: lr={g['lr']:.1e}, weight_decay={g['weight_decay']}, n_params={n_params}")

    if not fine_tune_base:
        print("Note: fine_tune_base=False → Transformer backbone excluded from training.")

    return optimizer


In [11]:
def build_loss(loss_name, class_weights=None):
    loss_name = loss_name.lower()

    if loss_name == "crossentropy":
        if class_weights is not None:
            return nn.CrossEntropyLoss(weight=class_weights)
        return nn.CrossEntropyLoss()

    elif loss_name == "bce":
        return nn.BCEWithLogitsLoss()

    elif loss_name == "mse":
        return nn.MSELoss()

    else:
        raise ValueError(f"Unsupported loss: {loss_name}")

In [12]:
def compute_class_weights(y_train, device):
    """
    Compute balanced class weights for CrossEntropyLoss.

    Args:
        y_train: array-like of encoded class labels (e.g., [0, 1, 2, 2, 0, ...])
        device: torch.device to move the tensor to (e.g., 'cuda' or 'cpu')

    Returns:
        class_weights: torch.Tensor of shape [num_classes]
        weights_dict:  dict mapping class_id -> weight (for logging)
    """
    y_train = np.array(y_train)
    classes = np.unique(y_train)

    weights = compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=y_train
    )

    class_weights = torch.tensor(weights, dtype=torch.float).to(device)
    weights_dict = dict(zip(classes.tolist(), weights.tolist()))

    print("Computed class weights:", weights_dict)
    return class_weights, weights_dict

# Create Dataloader Class

In [13]:
def create_dataloaders(X_train=None, y_train=None,
                       X_val=None, y_val=None,
                       X_test=None, y_test=None,
                       tokenizer=None, max_len=128, batch_size=16,
                       inference_texts=None,  # NEW
                       balance=True):
    """
    Creates DataLoaders for training, validation, testing, and optional inference.
    If `inference_texts` is provided, returns an inference DataLoader only.
    """

    def make_loader(X, y=None, shuffle=False, balance=False):
        if y is not None:
            # --- Training or validation dataset (with labels) ---
            dataset = TextDataset(
                texts=X.tolist() if hasattr(X, "tolist") else list(X),
                labels=y.tolist() if hasattr(y, "tolist") else list(y),
                tokenizer=tokenizer,
                max_len=max_len
            )
        else:
            # --- Inference dataset (no labels) ---
            dataset = TextDataset(
                texts=X.tolist() if hasattr(X, "tolist") else list(X),
                tokenizer=tokenizer,
                max_len=max_len
            )

        if balance and y is not None:
            labels = np.array(y)
            class_counts = np.bincount(labels)
            class_weights = 1.0 / np.maximum(class_counts, 1)
            sample_weights = class_weights[labels]
            sampler = WeightedRandomSampler(
                weights=torch.DoubleTensor(sample_weights),
                num_samples=len(sample_weights),
                replacement=True
            )
            return DataLoader(dataset, batch_size=batch_size, sampler=sampler)
        else:
            return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    # --- Handle inference-only mode ---
    if inference_texts is not None:
        infer_loader = make_loader(inference_texts, y=None, shuffle=False, balance=False)
        print(f"Inference DataLoader ready — {len(infer_loader.dataset)} samples.")
        return infer_loader

    # --- Normal supervised mode ---
    train_loader = make_loader(X_train, y_train, shuffle=True, balance=balance)
    train_eval_loader = make_loader(X_train, y_train, shuffle=False, balance=False)
    val_loader = make_loader(X_val, y_val, shuffle=False)
    test_loader = make_loader(X_test, y_test, shuffle=False)

    print(f"Dataloaders ready — Train: {len(train_loader.dataset)}, "
          f"Val: {len(val_loader.dataset)}, Test: {len(test_loader.dataset)}")

    return train_loader, train_eval_loader, val_loader, test_loader

# Create Model Classes

In [31]:
def build_model(base_model, config, device):
    name = config.get("model_name", "").lower()
    hidden_size = config.get("hidden_size", 320)
    dropout = config.get("dropout", 0.5)
    activation = config.get("activation", "relu")
    fine_tune = config.get("fine_tune_base", True)
    num_classes = config.get("num_classes", 3)
    input_size = base_model.config.hidden_size

    # --- Select architecture ---
    if "bilstm" in name:
        model = TransformerBiLSTMClassifier(
            base_model=base_model,
            num_classes=num_classes,
            input_size=input_size,
            hidden_size=hidden_size,
            dropout=dropout,
            activation=activation,
            fine_tune_base=fine_tune
        )

    elif "lstm" in name:
        model = TransformerLSTMClassifier(
            base_model=base_model,
            num_classes=num_classes,
            input_size=input_size,
            hidden_size=hidden_size,
            dropout=dropout,
            activation=activation,
            fine_tune_base=fine_tune
        )

    elif "bigru" in name:
        model = TransformerGRUClassifier(
            base_model=base_model,
            num_classes=num_classes,
            input_size=input_size,
            hidden_size=hidden_size,
            dropout=dropout,
            activation=activation,
            fine_tune_base=fine_tune,
            bidirectional=True
        )


    elif "gru" in name:
        model = TransformerGRUClassifier(
            base_model=base_model,
            num_classes=num_classes,
            input_size=input_size,
            hidden_size=hidden_size,
            dropout=dropout,
            activation=activation,
            fine_tune_base=fine_tune
        )

    elif "cnn" in name:
        model = TransformerCNNClassifier(
            base_model=base_model,
            num_classes=num_classes,
            input_size=input_size,
            dropout=dropout,
            activation=activation,
            fine_tune_base=fine_tune
        )

    else:
        # Default transformer classification head
        from transformers import AutoModelForSequenceClassification
        model = AutoModelForSequenceClassification.from_pretrained(
            config["base_model_name"], num_labels=num_classes
        )

    model.to(device)
    return model


In [15]:
class TransformerLSTMClassifier(nn.Module):
    def __init__(self, base_model, num_classes=3, input_size=768,
                 hidden_size=320, dropout=0.5, activation="relu",
                 fine_tune_base=True):
        super().__init__()

        self.base_model = base_model
        self.num_classes = num_classes
        self.input_size = input_size

        # Optionally freeze transformer layers
        for param in self.base_model.parameters():
            param.requires_grad = fine_tune_base

        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True
        )

        # Dynamic activation lookup
        activations = {
            "relu": nn.ReLU(),
            "tanh": nn.Tanh(),
            "sigmoid": nn.Sigmoid(),
            "gelu": nn.GELU(),
            "leakyrelu": nn.LeakyReLU()
        }
        act = activations.get(activation.lower(), nn.ReLU())

        # Classification head
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 80),
            act,
            nn.Linear(80, 20),
            act,
            nn.Linear(20, num_classes)
        )

    def forward(self, inputs):
        """
        inputs: dict from tokenizer (input_ids, attention_mask)
        returns: raw logits (not softmaxed)
        """
        transformer_out = self.base_model(**inputs)
        token_embeddings = transformer_out.last_hidden_state  # [batch, seq_len, hidden]
        lstm_out, _ = self.lstm(token_embeddings)
        last_hidden = lstm_out[:, -1, :]  # final timestep
        logits = self.classifier(last_hidden)
        return logits

In [16]:
class TransformerBiLSTMClassifier(nn.Module):
    def __init__(self, base_model, num_classes=3, input_size=768,
                 hidden_size=320, dropout=0.5, activation="relu",
                 fine_tune_base=True):
        super().__init__()

        self.base_model = base_model
        self.num_classes = num_classes
        self.input_size = input_size

        # Optionally freeze transformer layers
        for param in self.base_model.parameters():
            param.requires_grad = fine_tune_base

        # BiLSTM layer
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=True     # <- key change
        )

        # Dynamic activation lookup
        activations = {
            "relu": nn.ReLU(),
            "tanh": nn.Tanh(),
            "sigmoid": nn.Sigmoid(),
            "gelu": nn.GELU(),
            "leakyrelu": nn.LeakyReLU()
        }
        act = activations.get(activation.lower(), nn.ReLU())

        # Classification head (adjust input size since BiLSTM doubles hidden dim)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 2, 160),  # *2 for bidirectional
            act,
            nn.Linear(160, 40),
            act,
            nn.Linear(40, num_classes)
        )

    def forward(self, inputs):
        """
        inputs: dict from tokenizer (input_ids, attention_mask)
        returns: raw logits (not softmaxed)
        """
        transformer_out = self.base_model(**inputs)
        token_embeddings = transformer_out.last_hidden_state  # [batch, seq_len, hidden]
        lstm_out, _ = self.lstm(token_embeddings)

        # Concatenate last timestep from both directions
        last_hidden = torch.cat((lstm_out[:, -1, :self.lstm.hidden_size],
                                 lstm_out[:, 0, self.lstm.hidden_size:]), dim=1)

        logits = self.classifier(last_hidden)
        return logits

In [17]:
class TransformerGRUClassifier(nn.Module):
    def __init__(self, base_model, num_classes=3, input_size=768,
                 hidden_size=320, dropout=0.5, activation="relu",
                 fine_tune_base=True, bidirectional=False):
        super().__init__()

        self.base_model = base_model
        self.num_classes = num_classes
        self.input_size = input_size
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size

        # Optionally freeze transformer layers
        for param in self.base_model.parameters():
            param.requires_grad = fine_tune_base

        # GRU layer (with optional bidirectionality)
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=bidirectional
        )

        # Activation lookup
        activations = {
            "relu": nn.ReLU(),
            "tanh": nn.Tanh(),
            "sigmoid": nn.Sigmoid(),
            "gelu": nn.GELU(),
            "leakyrelu": nn.LeakyReLU()
        }
        act = activations.get(activation.lower(), nn.ReLU())

        # Adjust classifier input size depending on GRU directionality
        fc_input_dim = hidden_size * (2 if bidirectional else 1)

        # Classification head
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(fc_input_dim, 80),
            act,
            nn.Linear(80, 20),
            act,
            nn.Linear(20, num_classes)
        )

    def forward(self, inputs):
        """
        inputs: dict from tokenizer (input_ids, attention_mask)
        returns: raw logits (not softmaxed)
        """
        transformer_out = self.base_model(**inputs)
        token_embeddings = transformer_out.last_hidden_state  # [batch, seq_len, hidden]
        gru_out, _ = self.gru(token_embeddings)

        if self.bidirectional:
            # Concatenate last forward & backward hidden states
            last_hidden = torch.cat(
                (gru_out[:, -1, :self.hidden_size],
                 gru_out[:, 0, self.hidden_size:]), dim=1
            )
        else:
            # Use last timestep output for unidirectional GRU
            last_hidden = gru_out[:, -1, :]

        logits = self.classifier(last_hidden)
        return logits

# Create Training Loop

In [18]:
def train_and_evaluate(
    model, train_loader, val_loader,
    optimizer, criterion, device,
    model_name, model_dir,
    epochs=3
):
    """
    Train and evaluate a model, logging accuracy/loss/timing and saving state_dict + environment info.
    """
    os.makedirs(model_dir, exist_ok=True)

    train_losses, val_losses = [], []
    train_accs, val_accs = [], []
    epoch_times = []

    model.to(device)
    overall_start = time.time()

    for epoch in range(1, epochs + 1):
        epoch_start = time.time()

        # ---------------- TRAIN ----------------
        model.train()
        running_loss, correct, total = 0.0, 0, 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs} [Train]", leave=False):
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device)
            }
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * labels.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        train_loss = running_loss / total
        train_acc = correct / total
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        # ---------------- VALIDATE ----------------
        model.eval()
        val_running_loss, val_correct, val_total = 0.0, 0, 0

        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch}/{epochs} [Val]", leave=False):
                inputs = {
                    "input_ids": batch["input_ids"].to(device),
                    "attention_mask": batch["attention_mask"].to(device)
                }
                labels = batch["labels"].to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_running_loss += loss.item() * labels.size(0)
                preds = outputs.argmax(dim=1)
                val_correct += (preds == labels).sum().item()
                val_total += labels.size(0)

        val_loss = val_running_loss / val_total
        val_acc = val_correct / val_total
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        epoch_time = time.time() - epoch_start
        epoch_times.append(epoch_time)

        print(f"Epoch {epoch:02d}/{epochs} | "
              f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f} | "
              f"Train Acc: {train_acc*100:.2f}%, Val Acc: {val_acc*100:.2f}% | "
              f"Time: {epoch_time/60:.2f} min")

    total_time = time.time() - overall_start
    avg_epoch_time = total_time / epochs

    # ---------------- ENVIRONMENT INFO ----------------
    gpu_info = {}
    if torch.cuda.is_available():
        gpu_props = torch.cuda.get_device_properties(device)
        gpu_info = {
            "gpu_name": gpu_props.name,
            "total_vram_gb": round(gpu_props.total_memory / 1e9, 2),
            "allocated_vram_gb": round(torch.cuda.memory_allocated(device) / 1e9, 2),
            "reserved_vram_gb": round(torch.cuda.memory_reserved(device) / 1e9, 2),
            "cuda_version": torch.version.cuda
        }

    ram = psutil.virtual_memory()
    disk = shutil.disk_usage('/')
    env_info = {
        "python_version": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}",
        "torch_version": torch.__version__,
        "device": str(device),
        "gpu_info": gpu_info,
        "ram_used_gb": round((ram.total - ram.available) / 1e9, 2),
        "ram_total_gb": round(ram.total / 1e9, 2),
        "disk_used_gb": round(disk.used / 1e9, 2),
        "disk_total_gb": round(disk.total / 1e9, 2)
    }

    # ---------------- LOG TIMING + ENV ----------------
    timing_log = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "total_time_sec": round(total_time, 2),
        "total_time_min": round(total_time / 60, 2),
        "avg_epoch_time_sec": round(avg_epoch_time, 2),
        "epoch_times_sec": [round(t, 2) for t in epoch_times],
        "environment": env_info
    }

    log_path = os.path.join(model_dir, f"{model_name}_timing_log.json")
    with open(log_path, "w") as f:
        json.dump(timing_log, f, indent=4)

    print(f"\nTraining complete in {total_time/60:.2f} min "
          f"({avg_epoch_time:.2f} sec/epoch avg)")
    print(f"Timing and environment info saved to: {log_path}")

    # ---------------- SAVE STATE DICTIONARY ONLY ----------------
    model_state_path = os.path.join(model_dir, f"{model_name}_state_dict.pt")
    torch.save(model.state_dict(), model_state_path)
    print(f"Model state_dict saved to: {model_state_path}")

    # ---------------- PLOTS ----------------
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, epochs + 1), train_losses, label="Train Loss", marker='o')
    plt.plot(range(1, epochs + 1), val_losses, label="Val Loss", marker='o')
    plt.title(f"{model_name} Training Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(model_dir, f"{model_name}_training_loss.png"), bbox_inches="tight")
    plt.close()

    plt.figure(figsize=(8, 5))
    plt.plot(range(1, epochs + 1), train_accs, label="Train Accuracy", marker='o')
    plt.plot(range(1, epochs + 1), val_accs, label="Val Accuracy", marker='o')
    plt.title(f"{model_name} Training Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(model_dir, f"{model_name}_training_accuracy.png"), bbox_inches="tight")
    plt.close()

    return {
        "train_losses": train_losses,
        "val_losses": val_losses,
        "train_accs": train_accs,
        "val_accs": val_accs,
        "timing_log": timing_log,
        "model_paths": {
            "state_dict": model_state_path
        }
    }


# Model Graph Class

In [19]:
def save_model_graph(model, dataloader, model_name, model_dir, device, show_backbone=True):
    model.eval()
    batch = next(iter(dataloader))
    inputs = {
        "input_ids": batch["input_ids"].to(device),
        "attention_mask": batch["attention_mask"].to(device)
    }

    if not show_backbone:
        class DummyBase(nn.Module):
            def __init__(self, hidden_size=768):
                super().__init__()
                self.config = type('obj', (object,), {"hidden_size": hidden_size})

            def forward(self, **kwargs):
                batch_size = kwargs["input_ids"].shape[0]
                seq_len = kwargs["input_ids"].shape[1]
                hidden_size = self.config.hidden_size
                return type('obj', (object,), {
                    "last_hidden_state": torch.randn(
                        batch_size, seq_len, hidden_size,
                        device=kwargs["input_ids"].device
                    )
                })

        model.base_model = DummyBase()

    class TorchViewWrapper(nn.Module):
        def __init__(self, model):
            super().__init__()
            self.model = model
        def forward(self, inputs):
            return self.model(inputs)

    wrapped_model = TorchViewWrapper(model).to(device)
    wrapped_model.eval()

    graph = draw_graph(
        wrapped_model,
        input_data=(inputs,),
        expand_nested=True,
        depth=4,
        device=device,
        save_graph=True,
        directory=model_dir,
        filename=f"{model_name.replace('/', '_')}_graph"
    )

    print(f"Saved model graph to: {model_dir}/{model_name.replace('/', '_')}_graph.png")
    return graph


# Evaluation Loop

In [20]:
def make_json_serializable(obj):
    """
    Recursively convert numpy data types and other non-serializable
    values (e.g., np.int64, np.float32) into JSON-safe types.
    """
    if isinstance(obj, dict):
        return {str(k): make_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [make_json_serializable(v) for v in obj]
    elif isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj

In [21]:
def evaluate_model(
    model,
    test_loader,
    label_encoder,
    device,
    model_name,
    model_dir,
    num_classes=3,
    train_acc_final=None,
    config=None
):
    """
    Evaluate a trained model on the test set and visualize key metrics.

    Saves:
        - classification_report.json (with class names)
        - roc_auc.png (with label names)
        - predictions.pkl (includes decoded labels)
        - confusion_matrix.png, accuracy_barplot.png (optional)
    """
    os.makedirs(model_dir, exist_ok=True)

    # ---------------- Save CONFIG ----------------
    if config is not None:
        config_path = os.path.join(model_dir, f"{model_name}_config.json")
        with open(config_path, "w") as f:
            json.dump(config, f, indent=4, default=str)
        print(f"Config saved → {config_path}")

    model.to(device)
    model.eval()

    all_preds, all_probs, all_labels = [], [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Evaluating {model_name}", leave=False):
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device)
            }
            labels = batch["labels"].to(device)
            outputs = model(inputs)
            probs = torch.softmax(outputs, dim=1)
            preds = torch.argmax(probs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    preds = np.array(all_preds)
    probs = np.array(all_probs)
    labels = np.array(all_labels)

    # ---------------- Decode to class names ----------------
    class_names = label_encoder.classes_       # e.g. ['negative', 'neutral', 'positive']
    y_true = label_encoder.inverse_transform(labels)
    y_pred = label_encoder.inverse_transform(preds)

    # ---------------- Save raw predictions ----------------
    pred_dict = {
        "y_true_idx": labels.tolist(),
        "y_pred_idx": preds.tolist(),
        "y_true": y_true.tolist(),
        "y_pred": y_pred.tolist(),
        "probs": probs.tolist(),
        "class_names": class_names.tolist()
    }
    preds_path = os.path.join(model_dir, f"{model_name}_predictions.pkl")
    with open(preds_path, "wb") as f:
        pickle.dump(pred_dict, f)
    print(f"Predictions saved → {preds_path}")

    # ---------------- Classification report ----------------
    report_path = os.path.join(model_dir, f"{model_name}_classification_report.json")

    report = classification_report(
    y_true, y_pred, target_names=class_names,
    output_dict=True, zero_division=0
    )
    report_serializable = make_json_serializable(report)

    with open(report_path, "w") as f:
        json.dump(report_serializable, f, indent=4)


    # ---------------- ROC-AUC (string labels on plot) ----------------
    # Re-binarize using integer indices for computation
    y_true_bin = label_binarize(labels, classes=np.arange(num_classes))
    fpr, tpr, roc_auc = {}, {}, {}

    for i, label_name in enumerate(class_names):
        # Handle class absence safely
        if y_true_bin[:, i].sum() == 0:
            fpr[i], tpr[i], roc_auc[i] = [0], [0], 0.0
            continue
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], probs[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    macro_auc = roc_auc_score(
        y_true_bin, probs, average="macro", multi_class="ovr"
    )

    plt.figure(figsize=(7, 6))
    for i, label_name in enumerate(class_names):
        plt.plot(fpr[i], tpr[i], lw=2,
                 label=f"{label_name} (AUC={roc_auc[i]:.3f})")
    plt.plot([0, 1], [0, 1], "k--", lw=1)
    plt.title(f"{model_name} ROC Curves (Macro AUC={macro_auc:.3f})")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    plt.grid(True)

    roc_path = os.path.join(model_dir, f"{model_name}_roc_auc.png")
    plt.savefig(roc_path, bbox_inches="tight")
    plt.close()
    print(f"ROC-AUC plot saved → {roc_path}")

    # ---------------- Optional: Accuracy bar plot ----------------
    if train_acc_final is not None:
        plt.figure(figsize=(4, 4))
        plt.bar(["Train", "Test"], [train_acc_final * 100, report["accuracy"] * 100],
                color=["skyblue", "lightcoral"])
        plt.ylabel("Accuracy (%)")
        plt.title(f"{model_name} Train vs Test Accuracy")
        plt.grid(axis="y", linestyle="--", alpha=0.7)
        acc_path = os.path.join(model_dir, f"{model_name}_accuracy_barplot.png")
        plt.savefig(acc_path, bbox_inches="tight")
        plt.close()
        print(f"Accuracy comparison saved → {acc_path}")

    return {
        "report": report,
        "roc_auc": macro_auc,
        "predictions_file": preds_path,
        "report_file": report_path,
        "roc_plot": roc_path
    }


# Usage Function

In [22]:
def run_pipeline(
    *,
    use_cached=True,
    make_splits=True,
    load_model=True,
    make_datasets=True,
    build_opt=True,
    build_loss_fn=True,
    make_loaders=True,
    init_model=True,
    do_train=True,
    save_graph=True,
    do_eval=True,
    **config
):
    """
    Unified Deep Learning Pipeline

    Execution Stages
    ----------------
    1. Setup directories and device
    2. Load tokenizer and base transformer model
    3. Load dataset (with optional caching)
    4. Prepare train/val/test splits
    5. Create tokenized PyTorch DataLoaders
    6. Initialize TransformerLSTMClassifier
    7. Build optimizer and loss function
    8. Train the model (with logging and plots)
    9. Save model computation graph (optional)
    10. Evaluate on test set (classification report, ROC-AUC, etc.)
    """

    print("=== Starting pipeline setup ===")

    # -------------------- STAGE 1: Paths + Device --------------------
    base_dir = config.get("BASE_DIR", "/content/drive/MyDrive/ITRPA_PROJ")
    base_model_name = config.get("base_model_name", "bert-base-uncased")
    model_name = config.get("model_name", "bert-bi-lstm")
    device = torch.device(config.get("device", "cpu"))

    model_dir = os.path.join(base_dir, "outputs_colab_final", model_name)
    os.makedirs(model_dir, exist_ok=True)

    # -------------------- STAGE 2: Load tokenizer + transformer base --------------------
    tokenizer, base_model = load_base_model(base_model_name)
    base_model.to(device)

    # -------------------- STAGE 3–4: Load dataset + prepare splits --------------------
    X, y, label_enc = load_datafile(config, data_filename="reviews.csv", use_cached=use_cached)
    X_train, X_val, X_test, y_train, y_val, y_test, label_enc = prepare_splits(X, y)

    # -------------------- STAGE 5: Create Dataloaders --------------------
    train_loader, train_eval_loader, val_loader, test_loader = create_dataloaders(
        X_train, y_train, X_val, y_val, X_test, y_test,
        tokenizer, config["max_len"], config["batch_size"]
    )

    # -------------------- STAGE 6: Initialize model --------------------
    model = build_model(base_model, config, device)

    # -------------------- STAGE 7: Build optimizer + loss --------------------
    optimizer = build_optimizer(
        model,
        optimizer_name=config.get("optimizer", "adamw"),
        lr_base=config.get("lr_base", 2e-5),
        lr_lstm=config.get("lr_lstm", 1e-4),
        lr_classifier=config.get("lr_classifier", 1e-4),
        weight_decay_base=config.get("weight_decay_base", 0.01),
        weight_decay_others=config.get("weight_decay_others", 0.0),
        fine_tune_base=config.get("fine_tune_base", True)
    )


    loss_fn = build_loss(config["loss_fn"])

    # -------------------- STAGE 8: Train model --------------------
    train_acc_final = None
    if do_train:
        print("\n=== Beginning model training ===")
        train_log = train_and_evaluate(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            optimizer=optimizer,
            criterion=loss_fn,
            device=device,
            model_name=model_name,
            model_dir=model_dir,
            epochs=config["epochs"]
        )
        train_acc_final = train_log["train_accs"][-1] if len(train_log["train_accs"]) > 0 else None
    else:
        print("Skipping training (do_train=False)")

    # -------------------- STAGE 9: Save model graph --------------------
    if save_graph:
        print("\n=== Saving model graph ===")
        save_model_graph(
            model=model,
            dataloader=train_loader,
            model_name=model_name,
            model_dir=model_dir,
            device=device,
            show_backbone=True
        )
    else:
        print("Skipping graph generation (save_graph=False)")

    # -------------------- STAGE 10: Evaluate on test set --------------------
    if do_eval:
        print("\n=== Evaluating model on test set ===")
        results = evaluate_model(
            model=model,
            test_loader=test_loader,
            label_encoder=label_enc,
            device=device,
            model_name=model_name,
            model_dir=model_dir,
            num_classes=config.get("num_classes", 3),
            train_acc_final=train_acc_final,
            config=config
        )
    else:
        print("Skipping evaluation (do_eval=False)")
        results = None

    print("=== Pipeline complete ===")

    # -------------------- Return outputs --------------------
    return {
        "model": model,
        "tokenizer": tokenizer,
        "optimizer": optimizer,
        "loss_fn": loss_fn,
        "label_enc": label_enc,
        "dataloaders": {
            "train": train_loader,
            "val": val_loader,
            "test": test_loader
        },
        "results": results,
        "config": config
    }


# Example Usage

In [23]:
# CONFIG = {
#     # -------- Model & Architecture --------
#     "model_name": "roberta-lstm-gelu-lr-adj",
#     "base_model_name": "roberta-base",
#     "activation": "gelu",
#     "hidden_size": 320,
#     "dropout": 0.5,
#     "fine_tune_base": True,
#     "num_classes": 3,

#     # -------- Tokenization & Data --------
#     "max_len": 160,
#     "batch_size": 128,

#     # -------- Training --------
#     "epochs": 2,
#     "optimizer": "adamw",
#     "loss_fn": "crossentropy",

#     # -------- Learning Rates --------
#     "lr_base": 2e-05,
#     "lr_lstm": 1e-04,
#     "lr_classifier": 1e-03,

#     # -------- Weight Decay --------
#     "weight_decay_base": 0.035,
#     "weight_decay_others": 0.01,

#     # -------- Paths & Device --------
#     "BASE_DIR": "/content/drive/MyDrive/ITRPA_PROJ",
#     "device": str(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# }


# pipeline = run_pipeline(
#     use_cached=True,
#     make_splits=True,
#     load_model=True,
#     make_datasets=True,
#     build_opt=True,
#     build_loss_fn=True,
#     make_loaders=True,
#     init_model=True,
#     do_train=True,
#     save_graph=True,
#     do_eval=True,
#     **CONFIG
# )




# Run Inference

In [24]:
def infer_from_state_dict(
    model_path,
    base_model_name="bert-base-uncased",
    hidden_size=320,
    num_classes=3,
    activation="gelu",
    max_len=128,
    batch_size=16,
    fine_tune_base=True,
    device=None,
    texts=None,
    csv_path=None,
    text_column="review",
    id2label=None,
    save_path=None,
    verbose=True
):
    """
    Perform inference with a saved Transformer+LSTM classifier.
    Supports direct CSV loading and progress bar visualization.

    Returns:
        df_results (pd.DataFrame): DataFrame with predictions and confidences
        timing (dict): timing summary
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load data
    if csv_path:
        df = pd.read_csv(csv_path)
        if text_column not in df.columns:
            raise ValueError(f"Column '{text_column}' not found in {csv_path}.")
        texts = df[text_column].astype(str).tolist()
        if verbose:
            print(f"Loaded {len(texts)} rows from {csv_path}")
    elif texts is None:
        raise ValueError("Provide either 'texts' list or 'csv_path'.")

    # Load tokenizer and transformer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    base_model = AutoModel.from_pretrained(base_model_name)
    input_size = base_model.config.hidden_size  # dynamic input size from base model

    # Build model
    model = TransformerLSTMClassifier(
        base_model=base_model,
        num_classes=num_classes,
        input_size=input_size,
        hidden_size=hidden_size,
        dropout=0.5,
        activation=activation,
        fine_tune_base=fine_tune_base
    )


    # Load weights
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    # Build dataset and dataloader
    dataset = TextDataset(texts, tokenizer=tokenizer, max_len=max_len, inference=True)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    if verbose:
        print(f"Model loaded ({base_model_name}), running on {device}")
        print(f"Starting inference on {len(dataset)} samples...")

    preds, probs = [], []
    start = time.time()

    # Inference loop with progress bar
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader), desc="Inference", leave=True):
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(batch)
            p = F.softmax(logits, dim=1)
            preds.extend(torch.argmax(p, dim=1).cpu().numpy())
            probs.extend(p.cpu().numpy())

    total = time.time() - start
    timing = {
        "total_time_sec": total,
        "time_per_sample_sec": total / len(dataset),
        "time_per_batch_sec": total / len(loader)
    }

    if verbose:
        print(f"Total time: {total:.3f}s | Per sample: {timing['time_per_sample_sec']:.4f}s")

    # Prepare results
    df_results = pd.DataFrame({
        "text": texts,
        "predicted_class": preds,
        "confidence": [float(np.max(p)) for p in probs]
    })

    if id2label:
        df_results["label"] = [id2label[p] for p in preds]

    if csv_path:
        df_results.insert(0, "source_file", os.path.basename(csv_path))

    if save_path:
        df_results.to_csv(save_path, index=False)
        if verbose:
            print(f"Results saved to: {save_path}")

    return df_results, timing

# Inference Example

In [25]:
# BASE_DIR = "/content/drive/MyDrive/ITRPA_PROJ"
# DATA_PATH = os.path.join(BASE_DIR, "data", "reviews.csv")
# MODEL_PATH = os.path.join(BASE_DIR, "outputs_colab_final", "roberta-lstm-gelu-lr-adj", "roberta-lstm-gelu-lr-adj_state_dict.pt")
# OUTPUT_PATH = os.path.join(BASE_DIR, "outputs_colab_final", "roberta-lstm-gelu-lr-adj","reviews_with_predictions.csv")

# # Label mapping (adjust as per your training)
# id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}

# # Run inference
# df_results, timing = infer_from_state_dict(
#     model_path=MODEL_PATH,
#     base_model_name="roberta-base",
#     csv_path=DATA_PATH,
#     text_column="Review",   # change if your CSV uses 'text', 'comment', etc.
#     id2label=id2label,
#     batch_size=128,
#     save_path=OUTPUT_PATH
# )

# print("Done!")
# print(f"Processed {len(df_results)} rows")
# print(timing)
# df_results.head()

# Disconnect Runtime

In [26]:
# from google.colab import runtime
# runtime.unassign()


# BI LSTM ROBERTA

In [27]:
# CONFIG = {
#     # -------- Model & Architecture --------
#     "model_name": "roberta-bilstm-gelu-lr-adj",   # ← changed from "lstm" to "bilstm"
#     "base_model_name": "roberta-base",
#     "activation": "gelu",
#     "hidden_size": 320,
#     "dropout": 0.5,
#     "fine_tune_base": True,
#     "num_classes": 3,

#     # -------- Tokenization & Data --------
#     "max_len": 160,
#     "batch_size": 128,

#     # -------- Training --------
#     "epochs": 4,
#     "optimizer": "adamw",
#     "loss_fn": "crossentropy",

#     # -------- Learning Rates --------
#     "lr_base": 2e-05,
#     "lr_lstm": 5e-05,
#     "lr_classifier": 1e-03,

#     # -------- Weight Decay --------
#     "weight_decay_base": 0.035,
#     "weight_decay_others": 0.01,

#     # -------- Paths & Device --------
#     "BASE_DIR": "/content/drive/MyDrive/ITRPA_PROJ",
#     "device": str(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# }

# # --- Run the pipeline ---
# pipeline = run_pipeline(
#     use_cached=True,
#     make_splits=True,
#     load_model=True,
#     make_datasets=True,
#     build_opt=True,
#     build_loss_fn=True,
#     make_loaders=True,
#     init_model=True,
#     do_train=True,
#     save_graph=True,
#     do_eval=True,
#     **CONFIG
# )


# ROBERTA GRU

In [29]:
# CONFIG = {
#     # -------- Model & Architecture --------
#     "model_name": "roberta-gru-gelu-lr-adj",   # ← changed from "bilstm" to "gru"
#     "base_model_name": "roberta-base",
#     "activation": "gelu",
#     "hidden_size": 320,
#     "dropout": 0.5,
#     "fine_tune_base": True,
#     "num_classes": 3,

#     # -------- Tokenization & Data --------
#     "max_len": 160,
#     "batch_size": 128,

#     # -------- Training --------
#     "epochs": 5,
#     "optimizer": "adamw",
#     "loss_fn": "crossentropy",

#     # -------- Learning Rates --------
#     "lr_base": 2e-05,
#     "lr_gru": 5e-05,          # ← renamed to match GRU layer
#     "lr_classifier": 1e-03,

#     # -------- Weight Decay --------
#     "weight_decay_base": 0.035,
#     "weight_decay_others": 0.01,

#     # -------- Paths & Device --------
#     "BASE_DIR": "/content/drive/MyDrive/ITRPA_PROJ",
#     "device": str(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# }

# # --- Run the pipeline ---
# pipeline = run_pipeline(
#     use_cached=True,
#     make_splits=True,
#     load_model=True,
#     make_datasets=True,
#     build_opt=True,
#     build_loss_fn=True,
#     make_loaders=True,
#     init_model=True,
#     do_train=True,
#     save_graph=True,
#     do_eval=True,
#     **CONFIG
# )


=== Starting pipeline setup ===


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded base model: roberta-base
Dataset found in memory — using cached variables.
 Train: 70.0%
 Val:   15.0%
 Test:  15.0%
Classes found: [0 1 2]
Dataloaders ready — Train: 74912, Val: 16053, Test: 16053
Optimizer parameter groups:
  Group 0: lr=2.0e-05, weight_decay=0.035, n_params=124645632
  Group 1: lr=1.0e-04, weight_decay=0.01, n_params=1046400
  Group 2: lr=1.0e-03, weight_decay=0.01, n_params=27363

=== Beginning model training ===




Epoch 01/5 | Train Loss: 0.4447, Val Loss: 0.3173 | Train Acc: 81.57%, Val Acc: 88.59% | Time: 15.73 min




Epoch 02/5 | Train Loss: 0.1890, Val Loss: 0.3179 | Train Acc: 93.44%, Val Acc: 89.88% | Time: 15.73 min




Epoch 03/5 | Train Loss: 0.1184, Val Loss: 0.3780 | Train Acc: 96.04%, Val Acc: 90.11% | Time: 15.73 min




Epoch 04/5 | Train Loss: 0.0884, Val Loss: 0.4066 | Train Acc: 97.12%, Val Acc: 90.75% | Time: 15.72 min




Epoch 05/5 | Train Loss: 0.0750, Val Loss: 0.3946 | Train Acc: 97.52%, Val Acc: 90.21% | Time: 15.73 min

Training complete in 78.64 min (943.72 sec/epoch avg)
Timing and environment info saved to: /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-gru-gelu-lr-adj/roberta-gru-gelu-lr-adj_timing_log.json
Model state_dict saved to: /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-gru-gelu-lr-adj/roberta-gru-gelu-lr-adj_state_dict.pt

=== Saving model graph ===
Saved model graph to: /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-gru-gelu-lr-adj/roberta-gru-gelu-lr-adj_graph.png

=== Evaluating model on test set ===
Config saved → /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-gru-gelu-lr-adj/roberta-gru-gelu-lr-adj_config.json




Predictions saved → /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-gru-gelu-lr-adj/roberta-gru-gelu-lr-adj_predictions.pkl
ROC-AUC plot saved → /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-gru-gelu-lr-adj/roberta-gru-gelu-lr-adj_roc_auc.png
Accuracy comparison saved → /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-gru-gelu-lr-adj/roberta-gru-gelu-lr-adj_accuracy_barplot.png
=== Pipeline complete ===


# ROBERTA BI GRU GELU

In [32]:
CONFIG = {
    # -------- Model & Architecture --------
    "model_name": "roberta-bigru-gelu-lr-adj",   # changed from "gru" to "bigru"
    "base_model_name": "roberta-base",
    "activation": "gelu",
    "hidden_size": 320,
    "dropout": 0.5,
    "fine_tune_base": True,
    "num_classes": 3,

    # -------- Tokenization & Data --------
    "max_len": 160,
    "batch_size": 128,

    # -------- Training --------
    "epochs": 4,
    "optimizer": "adamw",
    "loss_fn": "crossentropy",

    # -------- Learning Rates --------
    "lr_base": 2e-05,
    "lr_gru": 4e-05,          # slightly lowered for BiGRU stability (was 5e-05)
    "lr_classifier": 7e-04,   # minor reduction to balance larger BiGRU output

    # -------- Weight Decay --------
    "weight_decay_base": 0.035,
    "weight_decay_others": 0.01,

    # -------- Paths & Device --------
    "BASE_DIR": "/content/drive/MyDrive/ITRPA_PROJ",
    "device": str(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
}

# --- Run the pipeline ---
pipeline = run_pipeline(
    use_cached=True,
    make_splits=True,
    load_model=True,
    make_datasets=True,
    build_opt=True,
    build_loss_fn=True,
    make_loaders=True,
    init_model=True,
    do_train=True,
    save_graph=True,
    do_eval=True,
    **CONFIG
)


=== Starting pipeline setup ===


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded base model: roberta-base
Dataset found in memory — using cached variables.
 Train: 70.0%
 Val:   15.0%
 Test:  15.0%
Classes found: [0 1 2]
Dataloaders ready — Train: 74912, Val: 16053, Test: 16053
Optimizer parameter groups:
  Group 0: lr=2.0e-05, weight_decay=0.035, n_params=124645632
  Group 1: lr=1.0e-04, weight_decay=0.01, n_params=2092800
  Group 2: lr=7.0e-04, weight_decay=0.01, n_params=52963

=== Beginning model training ===




Epoch 01/4 | Train Loss: 0.4446, Val Loss: 0.3409 | Train Acc: 81.78%, Val Acc: 86.89% | Time: 15.84 min




Epoch 02/4 | Train Loss: 0.1910, Val Loss: 0.3372 | Train Acc: 93.43%, Val Acc: 89.30% | Time: 15.83 min




Epoch 03/4 | Train Loss: 0.1143, Val Loss: 0.3551 | Train Acc: 96.12%, Val Acc: 91.60% | Time: 15.89 min




Epoch 04/4 | Train Loss: 0.0889, Val Loss: 0.3146 | Train Acc: 97.05%, Val Acc: 92.45% | Time: 15.90 min

Training complete in 63.46 min (951.91 sec/epoch avg)
Timing and environment info saved to: /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-bigru-gelu-lr-adj/roberta-bigru-gelu-lr-adj_timing_log.json
Model state_dict saved to: /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-bigru-gelu-lr-adj/roberta-bigru-gelu-lr-adj_state_dict.pt

=== Saving model graph ===
Saved model graph to: /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-bigru-gelu-lr-adj/roberta-bigru-gelu-lr-adj_graph.png

=== Evaluating model on test set ===
Config saved → /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-bigru-gelu-lr-adj/roberta-bigru-gelu-lr-adj_config.json




Predictions saved → /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-bigru-gelu-lr-adj/roberta-bigru-gelu-lr-adj_predictions.pkl
ROC-AUC plot saved → /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-bigru-gelu-lr-adj/roberta-bigru-gelu-lr-adj_roc_auc.png
Accuracy comparison saved → /content/drive/MyDrive/ITRPA_PROJ/outputs_colab_final/roberta-bigru-gelu-lr-adj/roberta-bigru-gelu-lr-adj_accuracy_barplot.png
=== Pipeline complete ===
