<a href="https://colab.research.google.com/github/Dominickstephens/aLoRa/blob/main/Roberta_Emotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets accelerate evaluate pynvml
!pip install -U peft
!pip install -U bitsandbytes
!pip show peft transformers

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1
Name: peft
Version: 0.17.1
Summary: Parameter-Efficient Fine-Tuning (PEFT)
Home-page: https://github.com/huggingface/peft
Author: The HuggingFace team
Author-email: benjamin@huggingface.co
License: Apache
Location: /usr/local/lib/python3.12/dist-packages
Requires

In [15]:
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType, AdaLoraConfig
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import torch
import numpy as np
import evaluate
import time
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from pynvml import *



# Configuration
MODEL_NAME = "roberta-base"
NUM_LABELS = 28
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 3
LEARNING_RATE = 5e-5
FF_LEARNING_RATE = 1e-5
BATCH_SIZE = 16
# https://arxiv.org/pdf/2412.12148
THRESHOLD = 0.5

In [3]:
# Load the dataset
ds = load_dataset("google-research-datasets/go_emotions", "simplified")

# Tokenizer
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
max_length = 128

def tokenize(batch):
    encodings = tokenizer(batch['text'], truncation=True, padding='max_length', max_length=max_length)
    encodings['labels'] = batch['labels']
    return encodings

ds_encoded = ds.map(tokenize, batched=True)
ds_encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])

    labels = []
    for item in batch:
        multi_hot = torch.zeros(NUM_LABELS, dtype=torch.float)
        if item['labels'] is not None:
            for l in item['labels']:
                if 0 <= l < NUM_LABELS:
                    multi_hot[l] = 1.0
        labels.append(multi_hot)

    labels = torch.stack(labels)
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# Data Loaders
train_loader = DataLoader(ds_encoded['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(ds_encoded['validation'], batch_size=BATCH_SIZE, collate_fn=collate_fn)

# Evaluation Metric
f1_metric = evaluate.load("f1", config="multilabel")

README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [4]:
def prepare_model(method: str):
    """
    Prepares a RoBERTa model for fine-tuning using different PEFT strategies:
    - "Full"      : full fine-tuning
    - "LoRA"      : standard LoRA
    - "LoRA+"     : LoRA with Rescaled Stable adaptation
    - "AdaLoRA"   : Adaptive LoRA (dynamic rank allocation)
    - "DoRA"      : Weight-decomposed LoRA (Meta 2024)
    - "QLoRA"     : 4-bit quantized LoRA
    """

    # Load Quantization Config
    quantization_config = None
    if method == "QLoRA":
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    # Load base RoBERTa
    model = RobertaForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        problem_type="multi_label_classification",
        quantization_config=quantization_config,
        device_map="auto" if method == "QLoRA" else None,
    )

    # Choose PEFT variant
    if method == "LoRA":
        config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["query", "value"],
        )
        model = get_peft_model(model, config)
        print("\nUsing LoRA")

    elif method == "LoRA+":
        config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["query", "value"],
            use_rslora=True,  # LoRA+
        )
        model = get_peft_model(model, config)
        print("\n⚙️ Using LoRA+")

    elif method == "AdaLoRA":
        config = AdaLoraConfig(
            task_type=TaskType.SEQ_CLS,
            init_r=8,
            target_r=4,
            tinit=100,
            tfinal=500,
            deltaT=10,
            lora_alpha=32,
            lora_dropout=0.1,
            total_step=len(train_loader) * EPOCHS
        )
        model = get_peft_model(model, config)
        print("\nUsing AdaLoRA")

    elif method == "DoRA":
        config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["query", "value"],
            use_dora=True,  # enables DoRA
        )
        model = get_peft_model(model, config)
        print("\nUsing DoRA")

    elif method == "QLoRA":
        config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["query", "value"],
        )
        model = get_peft_model(model, config)
        print("\nUsing QLoRA (4-bit quantized + LoRA)")

    else:
        print("\nFull Fine-Tuning (no adapters)")
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in model.parameters())
        print(f"Total Parameters: {total_params / 1e6:.2f}M")
        print(f"Trainable Parameters: {trainable_params / 1e6:.2f}M (100%)")
        model.to(DEVICE)
        optimizer = AdamW(model.parameters(), lr=FF_LEARNING_RATE)
        return model, optimizer

    # Shared setup for PEFT variants
    model.print_trainable_parameters()
    model.to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    return model, optimizer


In [5]:
def train_model(model, optimizer, method: str, train_loader, device, epochs, save_model):
    results = {}
    start_time = time.time()

    gpu_utilization_history = []
    max_gpu_mem_allocated = 0.0

    if device.type == 'cuda':
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(device.index if device.index is not None else 0)

    model.train()
    for epoch in range(epochs):
        loop = tqdm(train_loader, leave=True, desc=f"{method} Epoch {epoch+1}")
        for batch in loop:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            if device.type == 'cuda':
                util = nvmlDeviceGetUtilizationRates(handle)
                gpu_util_percent = util.gpu
                gpu_utilization_history.append(gpu_util_percent)

                allocated_mem_bytes = torch.cuda.memory_allocated(device)

                max_gpu_mem_allocated = max(max_gpu_mem_allocated, allocated_mem_bytes / (1024**3))

                loop.set_postfix(
                    loss=loss.item(),
                    gpu_util_perc=f"{gpu_util_percent}%",
                    gpu_mem_gb=f"{allocated_mem_bytes / (1024**3):.2f}"
                )
            else:
                loop.set_postfix(loss=loss.item())

    total_train_time = time.time() - start_time
    results['train_time_sec'] = total_train_time

    if device.type == 'cuda':
        results['max_gpu_mem_gb'] = max_gpu_mem_allocated
        results['max_gpu_percent'] = max(gpu_utilization_history) if gpu_utilization_history else 0
        results['average_gpu_percent'] = sum(gpu_utilization_history) / len(gpu_utilization_history) if gpu_utilization_history else 0

        nvmlShutdown()


    if save_model:
      cpt_str = method + "_checkpoint.pth"
      torch.save({
      'epoch': epoch,
      'model_state_dict': model.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
      'loss': loss,
      }, cpt_str)

    return model, results

In [6]:
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    hamming_loss,
    roc_auc_score,
    average_precision_score
)
import numpy as np
import torch

def evaluate_model(model, val_loader, f1_metric, threshold, device, method: str):
    model.eval()
    all_preds, all_targets, all_probs = [], [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            pred_probs = torch.sigmoid(logits)
            pred_labels = (pred_probs > threshold).long()

            all_probs.append(pred_probs.cpu().numpy())
            all_preds.append(pred_labels.cpu().numpy())
            all_targets.append(labels.long().cpu().numpy())

    preds = np.concatenate(all_preds, axis=0)
    probs = np.concatenate(all_probs, axis=0)
    targets = np.concatenate(all_targets, axis=0)

    # Multi-label metrics
    results = {
        "f1_macro": f1_score(targets, preds, average="macro"),
        "f1_micro": f1_score(targets, preds, average="micro"),
        "f1_weighted": f1_score(targets, preds, average="weighted"),
        "precision_macro": precision_score(targets, preds, average="macro", zero_division=0),
        "recall_macro": recall_score(targets, preds, average="macro", zero_division=0),
        "hamming_loss": hamming_loss(targets, preds),
        "exact_match_accuracy": np.mean([np.all(p == t) for p, t in zip(preds, targets)]),
    }

    # Probabilistic metrics (optional)
    try:
        results["roc_auc_macro"] = roc_auc_score(targets, probs, average="macro")
        results["pr_auc_macro"] = average_precision_score(targets, probs, average="macro")
    except ValueError:
        results["roc_auc_macro"] = None
        results["pr_auc_macro"] = None

    print("-" * 50)
    print(f"| {method} Evaluation Results |")
    print("-" * 50)
    for k, v in results.items():
        if v is not None:
            print(f"{k.replace('_',' ').title():30}: {v:.4f}")
    print("-" * 50)

    return results


In [7]:
def cleanup(full_model=None, full_optimizer=None):
    if full_model is not None:
        del full_model
    if full_optimizer is not None:
        del full_optimizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [12]:
results_table = []

print("Before FINE-TUNING")
print("="*60)
full_model, full_optimizer = prepare_model("Full Fine-Tuning")

# Evaluate
print("\nEvaluating model before training...")
pretrain_eval = evaluate_model(
    full_model,
    val_loader,
    f1_metric,
    THRESHOLD,
    DEVICE,
    "Full Fine-Tuning (Before Training)"
)

before_results = {"train_time_sec": 0.0, **pretrain_eval}
results_table.append({"Method": "Before Training", **before_results})

cleanup(full_model, full_optimizer)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Before FINE-TUNING

Full Fine-Tuning (no adapters)
Total Parameters: 124.67M
Trainable Parameters: 124.67M (100%)

Evaluating model before training...
--------------------------------------------------
| Full Fine-Tuning (Before Training) Evaluation Results |
--------------------------------------------------
F1 Macro                      : 0.0509
F1 Micro                      : 0.0770
F1 Weighted                   : 0.1676
Precision Macro               : 0.0290
Recall Macro                  : 0.7075
Hamming Loss                  : 0.6950
Exact Match Accuracy          : 0.0000
Roc Auc Macro                 : 0.5010
Pr Auc Macro                  : 0.0444
--------------------------------------------------


In [9]:
full_method = "Full Fine-Tuning"

full_model, full_optimizer = prepare_model(full_method)

train_full_model = True

if (train_full_model):
  # Train
  full_model, full_train_results = train_model(
      full_model,
      full_optimizer,
      full_method,
      train_loader,
      DEVICE,
      EPOCHS,
      save_model=True
  )
else:
  # Load checkpoint
  cpt_string = full_method + "_checkpoint.pth"
  checkpoint = torch.load(cpt_string)
  full_model.load_state_dict(checkpoint['model_state_dict'])
  full_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  epoch = checkpoint['epoch']
  loss = checkpoint['loss']
  results = {}
  results['train_time_sec'] = 0



# Evaluate
full_eval_results = evaluate_model(
    full_model,
    val_loader,
    f1_metric,
    THRESHOLD,
    DEVICE,
    full_method
)

full_results = {**full_train_results, **full_eval_results}
results_table.append({"Method": "Full Fine-Tuning", **full_results})

cleanup(full_model, full_optimizer)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Full Fine-Tuning (no adapters)
Total Parameters: 124.67M
Trainable Parameters: 124.67M (100%)


Full Fine-Tuning Epoch 1:  10%|█         | 276/2714 [00:23<03:27, 11.72it/s, gpu_mem_gb=2.37, gpu_util_perc=92%, loss=0.202]


KeyboardInterrupt: 

In [24]:
del lora_model
del lora_optimizer
torch.cuda.empty_cache()

In [26]:
# ============================================================
# MULTI-RUN LoRA / QLoRA / AdaLoRA / DoRA BENCHMARK
# ============================================================
# import torch

methods_to_run = ["DoRA"]
# results_table = []

for lora_method in methods_to_run:
    print("\n" + "="*70)
    print(f"Starting Fine-Tuning with {lora_method}")
    print("="*70)

    train_lora_model = True

    # ---- Prepare model and optimizer ----
    try:
        lora_model, lora_optimizer = prepare_model(lora_method)
    except Exception as e:
        print(f"Failed to prepare {lora_method}: {e}")
        continue

    if train_lora_model:
        # ---- Train ----
        try:
            lora_model, lora_train_results = train_model(
                lora_model,
                lora_optimizer,
                lora_method,
                train_loader,
                DEVICE,
                EPOCHS,
                save_model=True
            )
        except torch.cuda.OutOfMemoryError:
            print(f"Skipping {lora_method} (Out of memory)")
            torch.cuda.empty_cache()
            continue
        except Exception as e:
            print(f"Training failed for {lora_method}: {e}")
            continue

    else:
        # ---- Load checkpoint ----
        cpt_string = f"{lora_method}_checkpoint.pth"
        checkpoint = torch.load(cpt_string)
        lora_model.load_state_dict(checkpoint["model_state_dict"])
        lora_optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        epoch = checkpoint["epoch"]
        loss = checkpoint["loss"]
        lora_train_results = {"train_time_sec": 0, "final_loss": loss}

    # ---- Evaluate ----
    try:
        lora_eval_results = evaluate_model(
            lora_model,
            val_loader,
            f1_metric,
            THRESHOLD,
            DEVICE,
            lora_method
        )
    except Exception as e:
        print(f"Evaluation failed for {lora_method}: {e}")
        continue

    # ---- Merge and store results ----
    lora_results = {**lora_train_results, **lora_eval_results}
    results_table.append({"Method": lora_method, **lora_results})

    # ---- Cleanup GPU memory ----
    del lora_model
    del lora_optimizer
    torch.cuda.empty_cache()

print("\n" + "="*70)
print("ALL EXPERIMENTS COMPLETE")
print("="*70)

# Convert to DataFrame for nice display (optional)
import pandas as pd
results_df = pd.DataFrame(results_table)
display(results_df)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting Fine-Tuning with DoRA

Using DoRA
trainable params: 925,468 || all params: 125,592,632 || trainable%: 0.7369


DoRA Epoch 1:   2%|▏         | 51/2714 [00:03<03:09, 14.03it/s, gpu_mem_gb=9.11, gpu_util_perc=91%, loss=0.269]


KeyboardInterrupt: 

In [None]:
print("\n\nCOMPARISON OF RESULTS")
print("#" * 100)

# Print table header with new metrics
print(f"| {'Method':<20} | {'Trainable Params (M)':<20} | {'Train Time (s)':<15} | "
      f"{'Macro F1':<10} | {'Accuracy':<10} | {'Precision':<10} | {'Recall':<10} |")
print("-" * 100)

# Print each row from results_table
for result in results_table:
    method_name = result["Method"]
    trainable_params = result.get("trainable_params", 0.0)
    train_time = result.get("train_time_sec", 0.0)
    f1_macro = result.get("f1_macro", 0.0)
    accuracy = result.get("accuracy", 0.0)
    precision = result.get("precision_macro", 0.0)
    recall = result.get("recall_macro", 0.0)

    print(f"| {method_name:<20} | {trainable_params / 1e6:<20.4f} | {train_time:<15.2f} | "
          f"{f1_macro:<10.4f} | {accuracy:<10.4f} | {precision:<10.4f} | {recall:<10.4f} |")

print("#" * 100)





COMPARISON OF RESULTS
############################################################
| Method               | Trainable Params (M) | Train Time (s)  | Macro F1   |
---------------------------------------------------------------------------
| Before Training      | 0.0000               | 0.00            | 0.0578     |
| LoRA                 | 0.0000               | 1780.28         | 0.4030     |
| LoRA+                | 0.0000               | 1777.49         | 0.4094     |
| AdaLoRA              | 0.0000               | 2583.87         | 0.3962     |
| DoRA                 | 0.0000               | 2143.37         | 0.4131     |
############################################################


In [None]:
import matplotlib.pyplot as plt

# Extract plotting data from results_table
methods = [r["Method"] for r in results_table]
params = [r.get("trainable_params", 0.0) / 1e6 for r in results_table]  # in millions
train_times = [r.get("train_time_sec", 0.0) for r in results_table]
f1_macros = [r.get("f1_macro", 0.0) for r in results_table]

plt.figure(figsize=(13, 5))
colors = plt.cm.tab10.colors

#  Plot 1: Training Time vs F1
plt.subplot(1, 2, 1)
for i, method in enumerate(methods):
    plt.scatter(train_times[i], f1_macros[i], s=120, label=method,
                color=colors[i % len(colors)], edgecolors='black', linewidth=1.2)
    plt.text(train_times[i]*1.01, f1_macros[i], method, fontsize=9, va='center')
plt.xlabel('Training Time (seconds)', fontsize=11)
plt.ylabel('Macro F1 Score', fontsize=11)
plt.title('Training Time vs Macro F1', fontsize=13, weight='bold')
plt.grid(True, linestyle='--', alpha=0.6)

# Plot 2: Parameters vs F1
plt.subplot(1, 2, 2)
for i, method in enumerate(methods):
    plt.scatter(params[i], f1_macros[i], s=120, label=method,
                color=colors[i % len(colors)], edgecolors='black', linewidth=1.2)
    plt.text(params[i]*1.01, f1_macros[i], method, fontsize=9, va='center')
plt.xlabel('Trainable Parameters (Millions)', fontsize=11)
plt.ylabel('Macro F1 Score', fontsize=11)
plt.title('Model Size vs Macro F1', fontsize=13, weight='bold')
plt.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()


NameError: name 'results_print' is not defined

In [None]:
DEFAULT_R = 8
DEFAULT_ALPHA = 16
DEFAULT_DROPOUT = 0.1
OPT_EPOCHS = 1

def _run_trial_training(trial_method: str, lr: float, r: int = None, lora_alpha: int = None, lora_dropout: float = None):

    # 1. Load Base Model
    model = RobertaForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        problem_type="multi_label_classification",
    )

    if trial_method == "LoRA":
        config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            target_modules=["query", "value"],
        )
        model = get_peft_model(model, config)
        model.print_trainable_parameters()

    # 2. Setup Optimizer and Device
    model.to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=lr)

    # 3. Train
    trained_model, _ = train_model(
        model,
        optimizer,
        trial_method,
        train_loader,
        DEVICE,
        OPT_EPOCHS,
        save_model=False
    )

    # 4. Evaluate
    eval_results = evaluate_model(
        trained_model,
        val_loader,
        f1_metric,
        THRESHOLD,
        DEVICE,
        trial_method
    )

    # 5. Cleanup
    cleanup(trained_model, optimizer)

    return eval_results["f1_macro"]

# STUDY 1: LoRA Learning Rate Optimization

def objective_lora_lr(trial):

    lr = trial.suggest_float('LEARNING_RATE_LoRA', 1e-6, 1e-4, log=True)

    r = DEFAULT_R
    lora_alpha = DEFAULT_ALPHA
    lora_dropout = DEFAULT_DROPOUT

    # Run the training
    f1_macro = _run_trial_training(
        trial_method='LoRA',
        lr=lr,
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout
    )

    trial.set_user_attr('lora_r', r)
    trial.set_user_attr('lora_alpha', lora_alpha)

    return f1_macro

print("\n" + "="*70)
print("STUDY 1: LoRA Learning Rate Optimization")
print(f"Fixed Parameters: r={DEFAULT_R}, alpha={DEFAULT_ALPHA}, dropout={DEFAULT_DROPOUT}")
print("="*70)

study_lora = optuna.create_study(direction='maximize', study_name="LoRA_LR_Study")
study_lora.optimize(objective_lora_lr, n_trials=25)

print("Done Lora")


# STUDY 2: Full Fine-Tuning Learning Rate Optimization

def objective_fullft_lr(trial):
    lr = trial.suggest_float('FF_LEARNING_RATE_FullFT', 5e-7, 5e-6, log=True)

    f1_macro = _run_trial_training(
        trial_method='Full Fine-Tuning',
        lr=lr
    )

    return f1_macro

print("\n" + "="*70)
print("STUDY 2: Full Fine-Tuning Learning Rate Optimization")
print("="*70)

study_fullft = optuna.create_study(direction='maximize', study_name="FullFT_LR_Study")
study_fullft.optimize(objective_fullft_lr, n_trials=25)

# LoRA Results
print("\n" + "#"*50)
print("BEST LoRA LEARNING RATE RESULT")
print("#"*50)
print(f"Best Macro F1: {study_lora.best_value:.4f}")
print(f"Optimal Learning Rate: {study_lora.best_params['LEARNING_RATE_LoRA']:.2e}")
print(f"Fixed Rank (r): {study_lora.best_trial.user_attrs['lora_r']}")
print(f"Fixed Alpha: {study_lora.best_trial.user_attrs['lora_alpha']}")
print("#"*50)

# --- Full Fine Tuning
print("\n" + "#"*50)
print("BEST FULL FINE-TUNING LEARNING RATE RESULT")
print("#"*50)
print(f"Best Macro F1: {study_fullft.best_value:.4f}")
print(f"Optimal Learning Rate: {study_fullft.best_params['FF_LEARNING_RATE_FullFT']:.2e}")
print("#"*50)