<a href="https://colab.research.google.com/github/Dominickstephens/aLoRa/blob/main/Roberta_Emotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries (PEFT is the modern standard for LoRA)
!pip install transformers datasets accelerate evaluate peft

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import torch
import numpy as np
import evaluate
import time
from sklearn.metrics import f1_score


# Configuration
MODEL_NAME = "roberta-large"
NUM_LABELS = 28
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 3
LEARNING_RATE = 5e-5
FF_LEARNING_RATE = 1e-5
BATCH_SIZE = 16
# https://arxiv.org/pdf/2412.12148
THRESHOLD = 0.5

In [3]:
# Load the dataset
ds = load_dataset("google-research-datasets/go_emotions", "simplified")

# Tokenizer
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
max_length = 128

def tokenize(batch):
    encodings = tokenizer(batch['text'], truncation=True, padding='max_length', max_length=max_length)
    encodings['labels'] = batch['labels']
    return encodings

ds_encoded = ds.map(tokenize, batched=True)
ds_encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])

    labels = []
    for item in batch:
        multi_hot = torch.zeros(NUM_LABELS, dtype=torch.float)
        if item['labels'] is not None:
            for l in item['labels']:
                if 0 <= l < NUM_LABELS:
                    multi_hot[l] = 1.0
        labels.append(multi_hot)

    labels = torch.stack(labels)
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# Data Loaders
train_loader = DataLoader(ds_encoded['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(ds_encoded['validation'], batch_size=BATCH_SIZE, collate_fn=collate_fn)

# Evaluation Metric
f1_metric = evaluate.load("f1", config="multilabel")

README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [4]:
def prepare_model(method: str):

    model = RobertaForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        problem_type="multi_label_classification"
    )

    if method == "LoRA":
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["query", "value"]
        )

        model = get_peft_model(model, peft_config)

        print("\n LoRA Configuration")
        model.print_trainable_parameters()
        model.to(DEVICE)
        optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

    # Add loRa varient here, i like QRLora maybe

    else:
        print("\n Full Fine-Tuning Configuration ")
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in model.parameters())
        print(f"Total Parameters: {total_params / 1e6:.2f}M")
        print(f"Trainable Parameters: {trainable_params / 1e6:.2f}M (100.00%)")

        model.to(DEVICE)
        optimizer = AdamW(model.parameters(), lr=FF_LEARNING_RATE)

    return model, optimizer

In [5]:
def train_model(model, optimizer, method: str, train_loader, device, epochs, save_model):
    results = {}
    start_time = time.time()

    model.train()
    for epoch in range(epochs):
        loop = tqdm(train_loader, leave=True, desc=f"{method} Epoch {epoch+1}")
        for batch in loop:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            loop.set_postfix(loss=loss.item())

    total_train_time = time.time() - start_time
    results['train_time_sec'] = total_train_time

    cpt_str = method + "_checkpoint.pth"

    if save_model:
      torch.save({
      'epoch': epoch,
      'model_state_dict': model.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
      'loss': loss,
      }, cpt_str)

    return model, results

In [6]:
def evaluate_model(model, val_loader, f1_metric, threshold, device, method: str):
    model.eval()
    all_preds, all_targets = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            pred_probs = torch.sigmoid(logits)
            pred_labels = (pred_probs > threshold).long()

            all_preds.append(pred_labels.cpu().numpy())
            all_targets.append(labels.long().cpu().numpy())

    preds = np.concatenate(all_preds, axis=0)
    targets = np.concatenate(all_targets, axis=0)

    f1_results = {"f1": f1_score(targets, preds, average="macro")}
    exact_match = np.mean([np.all(p == t) for p, t in zip(preds, targets)])

    results = {
        'f1_macro': f1_results['f1'],
        'exact_match_accuracy': exact_match
    }

    print("-" * 50)
    print(f"| {method} Evaluation Results |")
    print("-" * 50)
    print(f"Validation Macro F1 Score: {results['f1_macro']:.4f}")
    print(f"Validation Exact Match Accuracy: {results['exact_match_accuracy']:.4f}")
    print("-" * 50)

    return results

In [7]:
def cleanup(full_model=None, full_optimizer=None):
    if full_model is not None:
        del full_model
    if full_optimizer is not None:
        del full_optimizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [8]:
results_table = []

print("Before FINE-TUNING")
print("="*60)
full_model, full_optimizer = prepare_model("Full Fine-Tuning")

# Evaluate
print("\nEvaluating model before training...")
pretrain_eval = evaluate_model(
    full_model,
    val_loader,
    f1_metric,
    THRESHOLD,
    DEVICE,
    "Full Fine-Tuning (Before Training)"
)

before_results = {"train_time_sec": 0.0, **pretrain_eval}
results_table.append({"Method": "LoRA", **before_results})

cleanup(full_model, full_optimizer)


Before FINE-TUNING


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Full Fine-Tuning Configuration 
Total Parameters: 355.39M
Trainable Parameters: 355.39M (100.00%)

Evaluating model before training...
--------------------------------------------------
| Full Fine-Tuning (Before Training) Evaluation Results |
--------------------------------------------------
Validation Macro F1 Score: 0.0165
Validation Exact Match Accuracy: 0.0000
--------------------------------------------------


In [9]:
full_method = "Full Fine-Tuning"

full_model, full_optimizer = prepare_model(full_method)

train_full_model = True

if (train_full_model):
  # Train
  full_model, full_train_results = train_model(
      full_model,
      full_optimizer,
      full_method,
      train_loader,
      DEVICE,
      EPOCHS,
      save_model=True
  )
else:
  # Load checkpoint
  cpt_string = full_method + "_checkpoint.pth"
  checkpoint = torch.load(cpt_string)
  full_model.load_state_dict(checkpoint['model_state_dict'])
  full_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  epoch = checkpoint['epoch']
  loss = checkpoint['loss']
  results = {}
  results['train_time_sec'] = 0



# Evaluate
full_eval_results = evaluate_model(
    full_model,
    val_loader,
    f1_metric,
    THRESHOLD,
    DEVICE,
    full_method
)

full_results = {**full_train_results, **full_eval_results}
results_table.append({"Method": "Full Fine-Tuning", **full_results})

cleanup(full_model, full_optimizer)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Full Fine-Tuning Configuration 
Total Parameters: 355.39M
Trainable Parameters: 355.39M (100.00%)


Full Fine-Tuning Epoch 1: 100%|██████████| 2714/2714 [11:56<00:00,  3.79it/s, loss=0.067]
Full Fine-Tuning Epoch 2: 100%|██████████| 2714/2714 [11:56<00:00,  3.79it/s, loss=0.0728]
Full Fine-Tuning Epoch 3: 100%|██████████| 2714/2714 [11:56<00:00,  3.79it/s, loss=0.0486]


--------------------------------------------------
| Full Fine-Tuning Evaluation Results |
--------------------------------------------------
Validation Macro F1 Score: 0.4511
Validation Exact Match Accuracy: 0.4751
--------------------------------------------------


In [10]:
lora_method = "LoRA"
train_lora_model = True

print("LoRA FINE-TUNING")
print("="*60)
lora_model, lora_optimizer = prepare_model(lora_method)

if (train_lora_model):

  # Train
  lora_model, lora_train_results = train_model(
      lora_model,
      lora_optimizer,
      lora_method,
      train_loader,
      DEVICE,
      EPOCHS,
      save_model=True
  )
else:
  # Load checkpoint
  cpt_string = lora_method + "_checkpoint.pth"
  checkpoint = torch.load(cpt_string)
  lora_model.load_state_dict(checkpoint['model_state_dict'])
  lora_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  epoch = checkpoint['epoch']
  loss = checkpoint['loss']
  results = {}
  results['train_time_sec'] = 0


# Evaluate
lora_eval_results = evaluate_model(
    lora_model,
    val_loader,
    f1_metric,
    THRESHOLD,
    DEVICE,
    lora_method
)

lora_results = {**lora_train_results, **lora_eval_results}
results_table.append({"Method": "LoRA", **lora_results})

cleanup(full_model, full_optimizer)

LoRA FINE-TUNING


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 LoRA Configuration
trainable params: 1,864,732 || all params: 357,253,176 || trainable%: 0.5220


LoRA Epoch 1: 100%|██████████| 2714/2714 [07:55<00:00,  5.71it/s, loss=0.0898]
LoRA Epoch 2: 100%|██████████| 2714/2714 [07:55<00:00,  5.71it/s, loss=0.0831]
LoRA Epoch 3: 100%|██████████| 2714/2714 [07:54<00:00,  5.72it/s, loss=0.0371]


--------------------------------------------------
| LoRA Evaluation Results |
--------------------------------------------------
Validation Macro F1 Score: 0.4452
Validation Exact Match Accuracy: 0.4394
--------------------------------------------------


In [21]:
results_print = {}

for result in results_table:
    method_name = result["Method"]
    model = prepare_model(method_name)[0]
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    train_time = result.get("train_time_sec", 0.0)
    f1_macro = result.get("f1_macro", 0.0)

    results_print[method_name] = [trainable_params/1e6, train_time, f1_macro]

print("\n\nCOMPARISON OF RESULTS")
print("#"*60)

print(f"| {'Method':<20} | {'Trainable Params (M)':<20} | {'Train Time (s)':<15} | {'Macro F1':<10} |")
print("-" * 75)

for key, value in results_print.items():
    print(f"|{key:<20}|", end = " ")
    for x in value:
      print(f"{x:20.4f}", end = " ")
    print("\n")

# print(f"| {method_name:<20} | {trainable_params/1e6:<20.2f}M | {train_time:<15.2f} | {f1_macro:<10.4f} |")

print("#" * 60)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 LoRA Configuration
trainable params: 1,864,732 || all params: 357,253,176 || trainable%: 0.5220


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Full Fine-Tuning Configuration 
Total Parameters: 355.39M
Trainable Parameters: 355.39M (100.00%)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 LoRA Configuration
trainable params: 1,864,732 || all params: 357,253,176 || trainable%: 0.5220


COMPARISON OF RESULTS
############################################################
| Method               | Trainable Params (M) | Train Time (s)  | Macro F1   |
---------------------------------------------------------------------------
|LoRA                |               1.8647            1424.7488               0.4452 

|Full Fine-Tuning    |             355.3884            2149.0254               0.4511 

############################################################
