In [1]:
# Install dependencies (run only once)
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3
!pip install nlpaug

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-non

In [30]:
import logging
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
    RobertaModel,
    RobertaPreTrainedModel,
    AutoConfig
)
from datasets import load_dataset, Dataset, ClassLabel
import evaluate
import numpy as np
from peft import LoraConfig, get_peft_model
from torch import nn
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
import nlpaug.augmenter.word as naw
import torch
import pandas as pd
from tqdm import tqdm
import json
import matplotlib.pyplot as plt
import os

In [31]:
# ---------------------------
# Logging setup
# ---------------------------
# Request INFO-level output on the root logger, then create the logger that
# the helper functions and cells below use for their progress messages.
# Note: logging.basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)  # Set logging level to INFO
logger = logging.getLogger(__name__)     # Create a logger instance for this module

# ---------------------------
# Configuration
# ---------------------------
class Config:
    """Single place for every tunable value used by this notebook.

    All settings are plain class attributes, so they can be read either from
    the class itself or from an instance (``cfg = Config()``).
    """

    # Base model configuration
    base_model = "roberta-base"      # Checkpoint name passed to from_pretrained
    output_dir = "results_lora"      # Sub-folder used under ./trained_models/

    # Training hyperparameters
    early_stopping_patience = 3
    weight_decay_value = 0.01
    # Counts are integers on purpose: the MC-dropout loop passes the iteration
    # count to range(), which rejects floats such as 10.0.
    mc_dropout_iterations = 10  # Number of MC Dropout iterations for uncertainty estimation
    train_last_k_layers = 2     # Used to control fine-tuning depth
    max_seq_length = 512
    train_batch_size = 32
    eval_batch_size = 64
    num_train_epochs = 1
    learning_rate = 5e-6

    # LoRA Configuration
    lora_r = 2                                # Rank of the LoRA decomposition
    lora_alpha = 4                            # LoRA scaling factor
    lora_dropout = 0.05                       # Dropout applied inside LoRA layers
    lora_bias = "none"
    lora_target_modules = ["query", "value"]  # Module names that receive adapters
    lora_task_type = "SEQ_CLS"


In [37]:
# ---------------------------
# Utility Functions
# ---------------------------
def preprocess(tokenizer, dataset, max_length, padding="max_length"):
    """Tokenize the ``"text"`` column of ``dataset`` and drop the raw text.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, max_length=..., padding=...)``.
        dataset: Object exposing ``map(fn, batched=True, remove_columns=[...])``
            whose batches contain a ``"text"`` entry.
        max_length: Maximum sequence length; longer inputs are truncated.
        padding: Padding strategy forwarded to the tokenizer. The default
            ``"max_length"`` pads every example to ``max_length``. Pass
            ``False`` to leave padding to a batch-level collator instead.

    Returns:
        Whatever ``dataset.map`` returns: the tokenized dataset without the
        ``"text"`` column.
    """

    # Named differently from the enclosing function so the helper does not
    # shadow ``preprocess`` inside its own body.
    def _tokenize_batch(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding=padding,
        )

    # Apply tokenization in batches and remove the original 'text' column
    return dataset.map(_tokenize_batch, batched=True, remove_columns=["text"])

def compute_metrics(p):
    """Compute classification accuracy from a ``(predictions, labels)`` pair.

    Accuracy is computed directly with numpy, so no metric has to be loaded
    each time an evaluation runs.

    Args:
        p: Two-element iterable ``(predictions, labels)``. ``predictions``
            holds per-class scores with the class axis at position 1 (or a
            tuple whose first element does); ``labels`` holds the class ids.

    Returns:
        dict: ``{"accuracy": float}``, the fraction of rows whose arg-max
        class equals the label.
    """
    predictions, labels = p
    # Some models hand back a tuple of outputs; the class scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=1)
    return {"accuracy": float(np.mean(predicted_ids == np.asarray(labels)))}

def mc_dropout_predict(model, dataset, data_collator, device, iterations, batch_size=64):
    """Predict class ids by averaging logits over several dropout-enabled passes.

    Args:
        model: ``torch.nn.Module`` whose forward accepts the collated batch as
            keyword arguments and returns an object with a ``logits`` attribute.
        dataset: Dataset handed to ``DataLoader``.
        data_collator: ``collate_fn`` producing a dict of tensors per batch.
        device: Device the batch tensors are moved to before the forward pass.
        iterations: Number of stochastic forward passes over the whole dataset.
            Must be a positive whole number (an integral float such as ``10.0``
            is accepted).
        batch_size: Batch size of the internal ``DataLoader``.

    Returns:
        numpy.ndarray: Arg-max class id per example, taken over the logits
        averaged across all passes. Empty if the dataset is empty.

    Raises:
        ValueError: If ``iterations`` is not a positive whole number.
    """
    if int(iterations) != iterations or iterations < 1:
        raise ValueError(f"iterations must be a positive whole number, got {iterations!r}")
    iterations = int(iterations)

    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    all_logits = []

    was_training = model.training
    model.train()  # Enable dropout layers during inference for MC Dropout
    try:
        # Perform multiple stochastic forward passes
        with torch.no_grad():
            for _ in range(iterations):
                iteration_logits = []
                for batch in loader:
                    inputs = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(**inputs)
                    iteration_logits.append(outputs.logits.cpu().numpy())
                if not iteration_logits:
                    # Empty dataset: nothing to predict.
                    return np.empty(0, dtype=np.int64)
                all_logits.append(np.concatenate(iteration_logits, axis=0))
    finally:
        # Put the model back in the mode the caller had it in, so a later
        # evaluation is not silently run with dropout still active.
        model.train(was_training)

    # Average logits over all iterations and return final predictions
    mean_logits = np.mean(np.stack(all_logits), axis=0)
    return np.argmax(mean_logits, axis=1)

def freeze_model_parameters(model):
    """Turn off gradients for every parameter that is not LoRA or classifier.

    A parameter is kept untouched when its name contains ``"lora"`` or
    ``"classifier"``; every other parameter gets ``requires_grad = False``.
    Nothing is ever switched back on here.

    Args:
        model: Object exposing ``named_parameters()`` yielding
            ``(name, parameter)`` pairs.
    """
    logger.info("Freezing base model parameters")

    keep_markers = ("lora", "classifier")
    for param_name, tensor in model.named_parameters():
        # Adapter and classification-head weights stay as they are.
        if any(marker in param_name for marker in keep_markers):
            continue
        tensor.requires_grad = False

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """Run batched inference and optionally score it against ``"labels"``.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    eval mode; both changes remain in effect after the call.

    Args:
        inference_model: ``torch.nn.Module`` whose forward accepts the collated
            batch as keyword arguments and returns an object with ``logits``.
        dataset: Dataset handed to ``DataLoader``.
        labelled: When True, every batch must contain a ``"labels"`` tensor and
            accuracy is computed.
        batch_size: Batch size of the internal ``DataLoader``.
        data_collator: ``collate_fn`` producing a dict of tensors per batch.

    Returns:
        ``(metrics, predictions)`` when ``labelled`` is True, where ``metrics``
        is ``{"accuracy": float}``; otherwise just ``predictions``.
        ``predictions`` is a 1-D CPU tensor of arg-max class ids (empty when
        the dataset is empty, in which case accuracy is ``nan``).
    """
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inference_model.to(device).eval()  # Set model to eval mode

    all_predictions = []
    correct = 0
    total = 0

    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        # Move to CPU once and reuse the result for both storage and scoring.
        predictions = outputs.logits.argmax(dim=-1).cpu()
        all_predictions.append(predictions)

        if labelled:
            # Accumulate hit counts so accuracy covers the whole dataset.
            references = batch["labels"].cpu()
            correct += int((predictions == references).sum())
            total += references.numel()

    if all_predictions:
        all_predictions = torch.cat(all_predictions, dim=0)
    else:
        # torch.cat rejects an empty list; return an empty prediction tensor.
        all_predictions = torch.empty(0, dtype=torch.long)

    if not labelled:
        return all_predictions

    accuracy = correct / total if total else float("nan")
    return {"accuracy": accuracy}, all_predictions

In [38]:
# ---------------------------
# Tokenizer and Dataset Preparation
# ---------------------------

cfg = Config()

logger.info("Loading tokenizer and dataset")
# Build the tokenizer for the base checkpoint, fixing its maximum sequence
# length to the configured value at construction time.
tokenizer = RobertaTokenizer.from_pretrained(
    cfg.base_model,
    model_max_length=cfg.max_seq_length,
)


In [39]:
# Load AG News dataset from Hugging Face Datasets
dataset = load_dataset("ag_news")
train_dataset, test_dataset = dataset["train"], dataset["test"]

# Tokenize training and test datasets
encoded_train_data = preprocess(tokenizer, train_dataset, cfg.max_seq_length)
encoded_test_data = preprocess(tokenizer, test_dataset, cfg.max_seq_length)

# Rename label column to 'labels' to match model expectations
encoded_train_data = encoded_train_data.rename_column("label", "labels")
encoded_test_data = encoded_test_data.rename_column("label", "labels")

# Take label names from the ClassLabel feature when available, otherwise fall
# back to the known AG News classes
label_feature = encoded_train_data.features["labels"]
if isinstance(label_feature, ClassLabel):
    label_names = label_feature.names
else:
    label_names = ["World", "Sports", "Business", "Sci/Tech"]

# Derive the class count from the label names rather than from the label values
# seen in the training split: this avoids materialising every training label,
# and keeps num_labels consistent with id2label / label2id.
num_labels = len(label_names)

# Create dictionaries for label ID ↔ label name mappings
id2label = {i: name for i, name in enumerate(label_names)}
label2id = {name: i for i, name in enumerate(label_names)}

In [40]:
# Load a pre-trained RoBERTa model for sequence classification
# with the number of output labels and label mappings
model = RobertaForSequenceClassification.from_pretrained(
    cfg.base_model,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Create a LoRA configuration using values from Config
lora_cfg = LoraConfig(
    r=cfg.lora_r,                            # Rank of the LoRA decomposition
    lora_alpha=cfg.lora_alpha,               # Scaling factor
    lora_dropout=cfg.lora_dropout,           # Dropout probability for LoRA layers
    bias=cfg.lora_bias,                      # Whether to use bias ("none", "all", or "lora_only")
    target_modules=cfg.lora_target_modules,  # Layers to which LoRA is applied
    task_type=cfg.lora_task_type,            # Task type (e.g., "SEQ_CLS" for sequence classification)
)

# Apply PEFT (Parameter-Efficient Fine-Tuning) with LoRA to the base model
model = get_peft_model(model, lora_cfg)

# Report the number of trainable parameters after LoRA is applied.
# print_trainable_parameters() prints the summary itself and returns None, so
# it must not be wrapped in print() (that adds a stray "None" line).
model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 667,396 || all params: 125,316,104 || trainable%: 0.5326
None


In [41]:
# Freeze all model parameters except those related to LoRA and classifier.
# NOTE(review): the trainable-parameter summary printed right after
# get_peft_model (~0.53% trainable) suggests the base weights are already
# frozen at this point, so this call is presumably a defensive no-op.
# Confirm before removing it.
freeze_model_parameters(model)

In [42]:
# ---------------------------
# Training Arguments
# ---------------------------
training_args = TrainingArguments(
    output_dir=f'./trained_models/{cfg.output_dir}',  # Where to save the model
    eval_strategy='steps',                               # Evaluate every `eval_steps`
    save_strategy='steps',                               # Save checkpoint every `save_steps`
    eval_steps=500,                                      # Evaluation interval
    # Checkpoint at every evaluation step. The previous interval (4000) exceeds
    # the 3,750 optimisation steps of one epoch at batch size 32, so no
    # evaluated step was ever checkpointed and `load_best_model_at_end` had
    # nothing to restore.
    save_steps=500,                                      # Checkpoint saving interval
    learning_rate=cfg.learning_rate,                  # Learning rate
    per_device_train_batch_size=cfg.train_batch_size, # Batch size for training
    per_device_eval_batch_size=cfg.eval_batch_size,   # Batch size for evaluation
    num_train_epochs=cfg.num_train_epochs,            # Number of training epochs
    weight_decay=cfg.weight_decay_value,              # Weight decay for regularization
    logging_dir='./logs',                                # Logging directory
    logging_steps=100,                                   # Interval for logging
    save_total_limit=3,                                  # Max number of saved checkpoints
    load_best_model_at_end=True,                         # Automatically load the best checkpoint
    metric_for_best_model="accuracy",                    # Metric to choose best model
    greater_is_better=True,                              # Whether a higher metric is better
    lr_scheduler_type="cosine",                          # Learning rate scheduler
    warmup_ratio=0.1,                                    # Warmup steps as a ratio of total steps
    report_to="wandb",                                   # Report training to Weights & Biases
    # Trainer cannot infer the label column of a PEFT-wrapped model on its own
    # (it warns "No label_names provided ..."), so name it explicitly.
    label_names=["labels"],
)

# Data collator: pads each batch to its longest sequence and returns PyTorch
# tensors. Note that `preprocess` tokenizes with padding="max_length" by
# default, so examples prepared that way already share one length and the
# collator has no padding left to do.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Early stopping callback: stop training if no improvement for N evals
# (N = cfg.early_stopping_patience)
callbacks = [EarlyStoppingCallback(early_stopping_patience=cfg.early_stopping_patience)]

# ---------------------------
# Trainer Initialization
# ---------------------------
# NOTE(review): `eval_dataset` is the AG News *test* split. Because the same
# split also drives early stopping and best-model selection, the accuracy
# reported on it afterwards is an optimistic estimate; consider holding out a
# validation split from the training data instead.
logger.info("Initializing Trainer")
trainer = Trainer(
    model=model,                                   # Model to train
    args=training_args,                            # Training arguments
    train_dataset=encoded_train_data,         # Tokenized training set
    eval_dataset=encoded_test_data,           # Tokenized test split, used for periodic evaluation
    compute_metrics=compute_metrics,               # Function to compute metrics (e.g., accuracy)
    data_collator=data_collator,                   # Padding and batching handler
    callbacks=callbacks                            # Callbacks (e.g., early stopping)
)

# ---------------------------
# Training Execution
# ---------------------------
logger.info("Starting training")
trainer.train()  # Begin model fine-tuning

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Accuracy
500,1.3728,1.368935,0.652632
1000,1.3136,1.302323,0.857237
1500,1.0429,0.990065,0.881579
2000,0.5466,0.486958,0.880789
2500,0.4174,0.389235,0.884605
3000,0.3901,0.369729,0.885789
3500,0.3968,0.36503,0.885921


TrainOutput(global_step=3750, training_loss=0.8104265350341797, metrics={'train_runtime': 2021.9287, 'train_samples_per_second': 59.349, 'train_steps_per_second': 1.855, 'total_flos': 3.181992247296e+16, 'train_loss': 0.8104265350341797, 'epoch': 1.0})

In [None]:
# ---------------------------
# Model Evaluation on Test Set
# ---------------------------
logger.info("Evaluating the model")

# Evaluate the model using the evaluation dataset passed during Trainer initialization
# (no dataset is passed here; `eval_dataset` was set to `encoded_test_data`).
results = trainer.evaluate()

# Log the evaluation metrics (e.g., accuracy, loss)
logger.info(f"Evaluation results: {results}")

In [44]:
# Show the metrics dictionary stored in `results` by the evaluation step.
print(results)

{'eval_loss': 0.3648858368396759, 'eval_accuracy': 0.8859210526315789, 'eval_runtime': 48.7431, 'eval_samples_per_second': 155.919, 'eval_steps_per_second': 2.441, 'epoch': 1.0}


In [45]:
# ---------------------------
# Predicting on Unlabelled Test Data
# ---------------------------

# Load the unlabelled test set. The unpickled object must expose the Hugging
# Face `Dataset.map` API and a "text" column, because `preprocess` calls
# `.map(..., batched=True, remove_columns=["text"])` on it (a plain pandas
# DataFrame would not work here).
# NOTE: unpickling can execute arbitrary code. Only load this file if it comes
# from a trusted source.
unlabelled_dataset = pd.read_pickle("/content/test_unlabelled.pkl")

# Tokenize the unlabelled data (tokenization, truncation, padding) and drop the
# raw text. This rebinds `test_dataset`, which earlier held the AG News test split.
test_dataset = preprocess(tokenizer, unlabelled_dataset, cfg.max_seq_length)

# Generate predictions on the unlabelled data.
# `labelled=False` disables metric computation, so only the tensor of
# predicted class indices is returned.
preds = evaluate_model(
    model,
    test_dataset,
    labelled=False,
    batch_size=8,
    data_collator=DataCollatorWithPadding(tokenizer)
)

# ---------------------------
# Saving Predictions to CSV
# ---------------------------

# "ID" is the positional index of each example, "Label" the predicted class index
submission_df = pd.DataFrame({
    "ID": range(len(preds)),
    "Label": preds.numpy()
})

# Hand pandas the path and let it open the file itself. A text handle opened
# without newline="" can produce doubled line endings on Windows.
submission_df.to_csv("submission.csv", index=False)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

100%|██████████| 1000/1000 [00:57<00:00, 17.30it/s]
