In [1]:
!pip install -q torch transformers datasets evaluate scikit-learn pandas matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m99.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import random
from sklearn.metrics import accuracy_score
from transformers import (
    DistilBertTokenizerFast, DistilBertForSequenceClassification,
    Trainer, TrainingArguments, TrainerCallback
)
from datasets import load_dataset
import evaluate
import gc

In [3]:
# Set Seed for reproducability
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

In [4]:
class AdaptiveLoRALinear(nn.Module):
    def __init__(self, base_layer, r=4, alpha=32, dropout=0.05):
        super().__init__()
        self.base = base_layer
        self.r = r
        self.alpha = alpha
        self.scaling = alpha / r
        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()

        self.lora_A = nn.Parameter(torch.randn(r, base_layer.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.randn(base_layer.out_features, r) * 0.01)

        device = self.lora_A.device
        self.initial_A = self.lora_A.detach().clone().to(device)
        self.initial_B = self.lora_B.detach().clone().to(device)

        self.grad_norm_history = []
        self.weight_change_history = []

        self.lora_A.register_hook(self._capture_grad_hook('A'))
        self.lora_B.register_hook(self._capture_grad_hook('B'))

    def forward(self, x):
        result = self.base(x)
        if self.r > 0:
            lora_out = self.dropout(x) @ self.lora_A.T @ self.lora_B.T
            result += self.scaling * lora_out
        return result

    def _capture_grad_hook(self, which):
        def hook(grad):
            norm = grad.norm().item()
            if which == 'A':
                self.grad_norm_history.append(('A', norm))
            elif which == 'B':
                self.grad_norm_history.append(('B', norm))
        return hook

    def compute_weight_change(self):
        device = self.lora_A.device
        if self.initial_A.device != device:
            self.initial_A = self.initial_A.to(device)
        if self.initial_B.device != device:
            self.initial_B = self.initial_B.to(device)
        delta_A = (self.lora_A - self.initial_A).norm().item()
        delta_B = (self.lora_B - self.initial_B).norm().item()
        total_change = delta_A + delta_B
        self.weight_change_history.append(total_change)
        return total_change

    def average_grad_norm(self):
        norms = [n for (w, n) in self.grad_norm_history]
        return sum(norms) / len(norms) if norms else 0.0


class AdaptiveLoRAMonitor:
    def __init__(self, model, alpha=0.5, beta=0.5, min_r=2, max_r=16):
        self.model = model
        self.alpha = alpha
        self.beta = beta
        self.min_r = min_r
        self.max_r = max_r
        self.modules = [m for m in model.modules() if isinstance(m, AdaptiveLoRALinear)]

    def assign_adaptive_ranks(self, run_id):
      grad_scores = []
      weight_scores = []

      # Step 1: Collect grad norms and weight changes separately
      for module in self.modules:
          grad_scores.append(module.average_grad_norm())
          weight_scores.append(module.compute_weight_change())

      # Step 2: Normalize grad scores
      finite_grad_scores = [g for g in grad_scores if np.isfinite(g)]
      min_grad, max_grad = min(finite_grad_scores), max(finite_grad_scores)
      normalized_grads = [
          (g - min_grad) / (max_grad - min_grad) if np.isfinite(g) and (max_grad > min_grad) else 0.5
          for g in grad_scores
      ]

      # Step 3: Normalize weight scores
      finite_weight_scores = [w for w in weight_scores if np.isfinite(w)]
      min_weight, max_weight = min(finite_weight_scores), max(finite_weight_scores)
      normalized_weights = [
          (w - min_weight) / (max_weight - min_weight) if np.isfinite(w) and (max_weight > min_weight) else 0.5
          for w in weight_scores
      ]

      # Step 4: Compute final combined score
      total_scores = [
          self.alpha * g_norm + self.beta * w_norm
          for g_norm, w_norm in zip(normalized_grads, normalized_weights)
      ]

      valid_total_scores = [s for s in total_scores if np.isfinite(s)]
      if not valid_total_scores:
          raise ValueError("All total scores are NaN or Inf — cannot assign ranks!")

      min_score, max_score = min(valid_total_scores), max(valid_total_scores)
      fallback_rank = (self.min_r + self.max_r) // 2

      print("\n📊 Layer-wise Stats and Rank Assignment:")
      print(f"{'Layer':<30} {'Grad Norm':>12} {'Weight Δ':>12} {'Total Score':>12} {'Norm Score':>12} {'Assigned r':>10}")

      rows = []
      for idx, module in enumerate(self.modules):
          grad = grad_scores[idx]
          weight = weight_scores[idx]
          score = total_scores[idx]

          if not np.isfinite(score):
              norm_score = 0.5  # fallback mid
              new_r = fallback_rank
          else:
              if min_score == max_score:
                  norm_score = 1.0
              else:
                  norm_score = (score - min_score) / (max_score - min_score)
              new_r = int(self.min_r + (self.max_r - self.min_r) * norm_score)

          new_r = max(self.min_r, min(self.max_r, new_r))  # Clamp safely

          # Reinitialize LoRA matrices
          device = module.lora_A.device
          module.lora_A = nn.Parameter(torch.randn(new_r, module.base.in_features, device=device) * 0.01)
          module.lora_B = nn.Parameter(torch.randn(module.base.out_features, new_r, device=device) * 0.01)
          module.initial_A = module.lora_A.detach().clone().to(device)
          module.initial_B = module.lora_B.detach().clone().to(device)
          module.r = new_r
          module.grad_norm_history = []
          module.weight_change_history = []

          print(f"{str(module.base)[:28]:<30} {grad:12.4f} {weight:12.4f} {score:12.4f} {norm_score:12.4f} {new_r:10d}")

          rows.append({
              "Layer": str(module.base),
              "Grad Norm": grad,
              "Weight Change": weight,
              "Score": score,
              "Norm Score": norm_score,
              "Assigned Rank": new_r,
          })

      # Save to CSV
      df_layer_stats = pd.DataFrame(rows)
      os.makedirs("rank_logs", exist_ok=True)
      df_layer_stats.to_csv(f"rank_logs/adaptive_lora_run_{run_id}.csv", index=False)


In [5]:
def inject_adaptive_lora(model, r=4, alpha=32, dropout=0.05):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and ('q_lin' in name or 'v_lin' in name):
            parent = model
            for part in name.split('.')[:-1]:
                parent = getattr(parent, part)
            layer_name = name.split('.')[-1]
            setattr(parent, layer_name, AdaptiveLoRALinear(module, r, alpha, dropout))
    return model

def freeze_model_except_lora(model):
    """
      Freezes all the layers except the LoRA Layers and Classification Layers
    """
    for name, param in model.named_parameters():
        if (
            'lora_A' in name or
            'lora_B' in name or
            'pre_classifier' in name or
            'classifier' in name
        ):
            param.requires_grad = True
        else:
            param.requires_grad = False

In [6]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
dataset = load_dataset("glue", "sst2")
dataset = dataset.rename_column("label", "labels")
dataset = dataset.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", max_length=128), batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [12]:
os.makedirs("rank_logs", exist_ok=True)

In [13]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
def log_phase(phase_name, run_id=None):
    ts = time.time()
    with open("gpu_phase_timestamps.log", "a") as f:
        f.write(f"{run_id},{phase_name},{ts}\n")

In [15]:
warmup_steps = [50, 100, 200, 500, 1000, 2000]
alpha = 0.5
beta = 0.5

In [16]:
all_results = []

for run_id in range(len(warmup_steps)):
    print(f"\n\nStarting Run {run_id} | Warmup Steps: {warmup_steps[run_id]}")
    gc.collect()
    torch.cuda.empty_cache()

    set_seed(42+run_id)

    # initialize base model
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


    # Initial LoRA rank for Warmup phase
    # alpha must be either 2x or 4x of the rank r
    # if r = 4, then 4x4 = 16, thus alpha=16
    model = inject_adaptive_lora(model,
                                 r=4,
                                 alpha=16,
                                 dropout=0.05)

    # freeze the model
    #   except the LoRA Layer
    #   except the Classification Heads
    freeze_model_except_lora(model)

    # move model to CUDA
    model = model.to("cuda")
    for name, param in model.named_parameters():
        if 'lora_' in name:
            param.data = param.data.to('cuda')

    # Assign Adaptive Ranks
    monitor = AdaptiveLoRAMonitor(model,
                                  alpha=alpha,
                                  beta=beta,
                                  min_r=2,
                                  max_r=16)

    # training arguments for warmup phase
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./warmup_run{run_id}",
            eval_strategy="no",
            max_steps=warmup_steps[run_id],
            per_device_train_batch_size=16,
            learning_rate=5e-4,
            weight_decay=0.01,
            logging_steps=50,
            report_to="none",
            fp16=True
        ),
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"]
    )

    #------------------Warmup START------------------
    log_phase("warmup_start", run_id)
    warmup_start = time.time()
    trainer.train()
    warmup_end = time.time()
    log_phase("warmup_end", run_id)
    #------------------Warmup END------------------

    #------------------Rank Assignment START------------------
    log_phase("rank_assignment_start", run_id)
    monitor.assign_adaptive_ranks(run_id=run_id)
    log_phase("rank_assignment_end", run_id)
    #------------------Rank Assignment END------------------

    # Training arguments for Fine Tuning
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./finetune_run{run_id}",
            eval_strategy="epoch",
            save_strategy="no",
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            learning_rate=5e-4,
            weight_decay=0.01,
            logging_steps=50,
            report_to="none",
            fp16=True
        ),
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        compute_metrics=compute_metrics
    )


    #------------------Training Phase START------------------
    log_phase("training_start", run_id)
    start_time = time.time()
    trainer.train()
    end_time = time.time()
    log_phase("training_end", run_id)
    #------------------Training Phase END------------------


    # final_memory = torch.cuda.memory_allocated() / 1e9
    outputs = trainer.predict(dataset["validation"])
    logits = outputs.predictions[1] if isinstance(outputs.predictions, tuple) else outputs.predictions
    preds = np.argmax(logits, axis=-1)
    labels = dataset["validation"]["labels"]
    acc = accuracy_score(labels.cpu().numpy(), preds)

    print(f"\nFinal Eval Accuracy: {acc:.4f}")

    warmup_time = round((warmup_end - warmup_start)/ 60, 2)
    training_time = round((end_time - start_time) / 60, 2)
    total_time = round(warmup_time + training_time,2)

    all_results.append({
        "Run ID": run_id,
        "Warmup Steps": warmup_steps,
        "Final Accuracy": round(acc, 4),
        "Warmup Time (min)": warmup_time,
        "Training Time (min)": training_time,
        "Total Time (min)": total_time
    })



Starting Run 0 | Warmup Steps: 50


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5279



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      1500.3137       0.3304       0.0274       0.0360          2
Linear(in_features=768, out_      8503.0968       0.4967       0.4578       0.6026         10
Linear(in_features=768, out_      1139.3659       0.3179       0.0000       0.0000          2
Linear(in_features=768, out_     13057.6460       0.4941       0.5925       0.7799         12
Linear(in_features=768, out_      1666.2338       0.3289       0.0305       0.0401          2
Linear(in_features=768, out_     17633.8021       0.5158       0.7596       1.0000         16
Linear(in_features=768, out_      3095.8223       0.4913       0.2869       0.3777          7
Linear(in_features=768, out_     14773.6968       0.5210       0.6798       0.8949         14
Linear(in_features=768, out_      7046.3539       0.5909       0.5373       0.7073         11
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2532,0.276533,0.897936
2,0.1565,0.333299,0.892202
3,0.1765,0.38441,0.904817



Final Eval Accuracy: 0.9048


Starting Run 1 | Warmup Steps: 100


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5357
100,0.3981



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      1580.9419       0.4368       0.0019       0.0000          2
Linear(in_features=768, out_      8169.5573       0.5913       0.3175       0.4886          8
Linear(in_features=768, out_      1520.6292       0.5073       0.0476       0.0707          2
Linear(in_features=768, out_     13228.3707       0.5807       0.4725       0.7285         12
Linear(in_features=768, out_      1871.7202       0.4735       0.0361       0.0529          2
Linear(in_features=768, out_            inf       0.5624       0.3349       0.5154          9
Linear(in_features=768, out_      3115.1043       0.6483       0.1940       0.2973          6
Linear(in_features=768, out_            inf       0.6559       0.3980       0.6131         10
Linear(in_features=768, out_      7940.2142       0.8761       0.5026       0.7751         12
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2412,0.260306,0.896789
2,0.1505,0.312236,0.897936
3,0.1614,0.356719,0.905963



Final Eval Accuracy: 0.9060


Starting Run 2 | Warmup Steps: 200


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5038
100,0.3634
150,0.31
200,0.3412



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      1842.6458       0.5956       0.0000       0.0000          2
Linear(in_features=768, out_     10984.5066       0.6766       0.2877       0.4365          8
Linear(in_features=768, out_      2041.5594       0.6613       0.0557       0.0845          3
Linear(in_features=768, out_     18550.2664       0.6819       0.4781       0.7253         12
Linear(in_features=768, out_      2552.2689       0.6370       0.0494       0.0750          3
Linear(in_features=768, out_     22145.8138       0.6970       0.5784       0.8773         14
Linear(in_features=768, out_      3754.9572       0.7381       0.1572       0.2385          5
Linear(in_features=768, out_            inf       0.7554       0.3735       0.5666          9
Linear(in_features=768, out_      9175.0185       0.9825       0.4796       0.7275         12
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2334,0.268794,0.897936
2,0.1531,0.342798,0.908257
3,0.174,0.347857,0.908257



Final Eval Accuracy: 0.9083


Starting Run 3 | Warmup Steps: 500


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5001
100,0.369
150,0.3183
200,0.3539
250,0.2947
300,0.3116
350,0.3337
400,0.3212
450,0.2995
500,0.3152



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      1894.7279       0.9504       0.0884       0.0000          2
Linear(in_features=768, out_      9102.4437       1.0893       0.4498       0.6730         11
Linear(in_features=768, out_      2341.7030       1.0842       0.2117       0.2296          5
Linear(in_features=768, out_     13331.4228       0.8996       0.4433       0.6609         11
Linear(in_features=768, out_      2447.1973       0.9809       0.1321       0.0814          3
Linear(in_features=768, out_     16339.7775       0.8407       0.5000       0.7664         12
Linear(in_features=768, out_      4721.6390       1.2269       0.4090       0.5969         10
Linear(in_features=768, out_            inf       0.9839       0.3654       0.5157          9
Linear(in_features=768, out_      6749.6052       1.4084       0.6255       1.0000         16
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2385,0.302814,0.885321
2,0.1532,0.343884,0.900229
3,0.1508,0.377984,0.90367



Final Eval Accuracy: 0.9037


Starting Run 4 | Warmup Steps: 1000


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.519
100,0.361
150,0.3183
200,0.3581
250,0.2991
300,0.3182
350,0.3427
400,0.3208
450,0.2996
500,0.328



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      2330.0464       1.5983       0.3664       0.4363          8
Linear(in_features=768, out_      6896.8834       1.3490       0.5026       0.6732         11
Linear(in_features=768, out_      1890.0756       1.4385       0.2440       0.2234          5
Linear(in_features=768, out_      9658.3892       1.1848       0.5770       0.8026         13
Linear(in_features=768, out_      1630.4327       1.2498       0.1156       0.0000          2
Linear(in_features=768, out_            inf       1.0940       0.2730       0.2737          5
Linear(in_features=768, out_      4666.8565       1.6394       0.5364       0.7319         12
Linear(in_features=768, out_            inf       1.3152       0.4045       0.5026          9
Linear(in_features=768, out_      5575.9388       1.7200       0.6410       0.9139         14
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2432,0.267273,0.896789
2,0.156,0.332984,0.895642
3,0.1637,0.386,0.896789



Final Eval Accuracy: 0.8968


Starting Run 5 | Warmup Steps: 2000


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5
100,0.3655
150,0.3142
200,0.3584
250,0.3049
300,0.3327
350,0.3422
400,0.3221
450,0.3089
500,0.3336



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      2897.5051       2.1601       0.5086       0.5154          9
Linear(in_features=768, out_      8108.5596       1.8188       0.7177       0.9320         15
Linear(in_features=768, out_      2061.4486       2.0555       0.3870       0.2730          5
Linear(in_features=768, out_      8611.9208       1.6097       0.6405       0.7782         12
Linear(in_features=768, out_      2682.6996       1.9422       0.3718       0.2427          5
Linear(in_features=768, out_            inf       1.3554       0.2500       0.0000          2
Linear(in_features=768, out_      5360.5549       2.2600       0.7518       1.0000         16
Linear(in_features=768, out_            inf       1.6988       0.4398       0.3782          7
Linear(in_features=768, out_      6244.7279       2.1331       0.7492       0.9948         15
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2463,0.289739,0.894495
2,0.1506,0.355203,0.893349
3,0.1435,0.374952,0.901376



Final Eval Accuracy: 0.9014


In [17]:
# --- Save all results to CSV ---
df = pd.DataFrame(all_results)
df.to_csv("adaptive_lora_sst2_benchmark.csv", index=False)

# --- Display nicely formatted results ---
from tabulate import tabulate
print("\nFinal Results Across All Runs:\n")
print(tabulate(df, headers='keys', tablefmt='pretty'))



Final Results Across All Runs:

+---+--------+---------------------------------+----------------+-------------------+---------------------+------------------+
|   | Run ID |          Warmup Steps           | Final Accuracy | Warmup Time (min) | Training Time (min) | Total Time (min) |
+---+--------+---------------------------------+----------------+-------------------+---------------------+------------------+
| 0 |   0    | [50, 100, 200, 500, 1000, 2000] |     0.9048     |       0.04        |        5.82         |       5.86       |
| 1 |   1    | [50, 100, 200, 500, 1000, 2000] |     0.906      |       0.07        |        5.98         |       6.05       |
| 2 |   2    | [50, 100, 200, 500, 1000, 2000] |     0.9083     |       0.12        |        5.95         |       6.07       |
| 3 |   3    | [50, 100, 200, 500, 1000, 2000] |     0.9037     |       0.27        |        5.98         |       6.25       |
| 4 |   4    | [50, 100, 200, 500, 1000, 2000] |     0.8968     |       0.53  