In [1]:
!pip install -q torch transformers datasets evaluate scikit-learn pandas matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import random
from sklearn.metrics import accuracy_score
from transformers import (
    DistilBertTokenizerFast, DistilBertForSequenceClassification,
    Trainer, TrainingArguments, TrainerCallback
)
from datasets import load_dataset
import evaluate
import gc

In [3]:
# Set Seed for reproducability
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [4]:
class AdaptiveLoRALinear(nn.Module):
    def __init__(self, base_layer, r=4, alpha=32, dropout=0.05):
        super().__init__()
        self.base = base_layer
        self.r = r
        self.alpha = alpha
        self.scaling = alpha / r
        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()

        self.lora_A = nn.Parameter(torch.randn(r, base_layer.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.randn(base_layer.out_features, r) * 0.01)

        device = self.lora_A.device
        self.initial_A = self.lora_A.detach().clone().to(device)
        self.initial_B = self.lora_B.detach().clone().to(device)

        self.grad_norm_history = []
        self.weight_change_history = []

        self.lora_A.register_hook(self._capture_grad_hook('A'))
        self.lora_B.register_hook(self._capture_grad_hook('B'))

    def forward(self, x):
        result = self.base(x)
        if self.r > 0:
            lora_out = self.dropout(x) @ self.lora_A.T @ self.lora_B.T
            result += self.scaling * lora_out
        return result

    def _capture_grad_hook(self, which):
        def hook(grad):
            norm = grad.norm().item()
            if which == 'A':
                self.grad_norm_history.append(('A', norm))
            elif which == 'B':
                self.grad_norm_history.append(('B', norm))
        return hook

    def compute_weight_change(self):
        device = self.lora_A.device
        if self.initial_A.device != device:
            self.initial_A = self.initial_A.to(device)
        if self.initial_B.device != device:
            self.initial_B = self.initial_B.to(device)
        delta_A = (self.lora_A - self.initial_A).norm().item()
        delta_B = (self.lora_B - self.initial_B).norm().item()
        total_change = delta_A + delta_B
        self.weight_change_history.append(total_change)
        return total_change

    def average_grad_norm(self):
        norms = [n for (w, n) in self.grad_norm_history]
        return sum(norms) / len(norms) if norms else 0.0


class AdaptiveLoRAMonitor:
    def __init__(self, model, alpha=0.5, beta=0.5, min_r=2, max_r=16):
        self.model = model
        self.alpha = alpha
        self.beta = beta
        self.min_r = min_r
        self.max_r = max_r
        self.modules = [m for m in model.modules() if isinstance(m, AdaptiveLoRALinear)]

    def assign_adaptive_ranks(self, run_id):
      grad_scores = []
      weight_scores = []

      # Step 1: Collect grad norms and weight changes separately
      for module in self.modules:
          grad_scores.append(module.average_grad_norm())
          weight_scores.append(module.compute_weight_change())

      # Step 2: Normalize grad scores
      finite_grad_scores = [g for g in grad_scores if np.isfinite(g)]
      min_grad, max_grad = min(finite_grad_scores), max(finite_grad_scores)
      normalized_grads = [
          (g - min_grad) / (max_grad - min_grad) if np.isfinite(g) and (max_grad > min_grad) else 0.5
          for g in grad_scores
      ]

      # Step 3: Normalize weight scores
      finite_weight_scores = [w for w in weight_scores if np.isfinite(w)]
      min_weight, max_weight = min(finite_weight_scores), max(finite_weight_scores)
      normalized_weights = [
          (w - min_weight) / (max_weight - min_weight) if np.isfinite(w) and (max_weight > min_weight) else 0.5
          for w in weight_scores
      ]

      # Step 4: Compute final combined score
      total_scores = [
          self.alpha * g_norm + self.beta * w_norm
          for g_norm, w_norm in zip(normalized_grads, normalized_weights)
      ]

      valid_total_scores = [s for s in total_scores if np.isfinite(s)]
      if not valid_total_scores:
          raise ValueError("All total scores are NaN or Inf — cannot assign ranks!")

      min_score, max_score = min(valid_total_scores), max(valid_total_scores)
      fallback_rank = (self.min_r + self.max_r) // 2

      print("\n📊 Layer-wise Stats and Rank Assignment:")
      print(f"{'Layer':<30} {'Grad Norm':>12} {'Weight Δ':>12} {'Total Score':>12} {'Norm Score':>12} {'Assigned r':>10}")

      rows = []
      for idx, module in enumerate(self.modules):
          grad = grad_scores[idx]
          weight = weight_scores[idx]
          score = total_scores[idx]

          if not np.isfinite(score):
              norm_score = 0.5  # fallback mid
              new_r = fallback_rank
          else:
              if min_score == max_score:
                  norm_score = 1.0
              else:
                  norm_score = (score - min_score) / (max_score - min_score)
              new_r = int(self.min_r + (self.max_r - self.min_r) * norm_score)

          new_r = max(self.min_r, min(self.max_r, new_r))  # Clamp safely

          # Reinitialize LoRA matrices
          device = module.lora_A.device
          module.lora_A = nn.Parameter(torch.randn(new_r, module.base.in_features, device=device) * 0.01)
          module.lora_B = nn.Parameter(torch.randn(module.base.out_features, new_r, device=device) * 0.01)
          module.initial_A = module.lora_A.detach().clone().to(device)
          module.initial_B = module.lora_B.detach().clone().to(device)
          module.r = new_r
          module.grad_norm_history = []
          module.weight_change_history = []

          print(f"{str(module.base)[:28]:<30} {grad:12.4f} {weight:12.4f} {score:12.4f} {norm_score:12.4f} {new_r:10d}")

          rows.append({
              "Layer": str(module.base),
              "Grad Norm": grad,
              "Weight Change": weight,
              "Score": score,
              "Norm Score": norm_score,
              "Assigned Rank": new_r,
          })

      # Save to CSV
      df_layer_stats = pd.DataFrame(rows)
      os.makedirs("rank_logs", exist_ok=True)
      df_layer_stats.to_csv(f"rank_logs/adaptive_lora_run_{run_id}.csv", index=False)


In [5]:
def inject_adaptive_lora(model, r=4, alpha=32, dropout=0.05):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and ('q_lin' in name or 'v_lin' in name):
            parent = model
            for part in name.split('.')[:-1]:
                parent = getattr(parent, part)
            layer_name = name.split('.')[-1]
            setattr(parent, layer_name, AdaptiveLoRALinear(module, r, alpha, dropout))
    return model

def freeze_model_except_lora(model):
    """
      Freezes all the layers except the LoRA Layers and Classification Layers
    """
    for name, param in model.named_parameters():
        if (
            'lora_A' in name or
            'lora_B' in name or
            'pre_classifier' in name or
            'classifier' in name
        ):
            param.requires_grad = True
        else:
            param.requires_grad = False

In [6]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
dataset = load_dataset("glue", "sst2")
dataset = dataset.rename_column("label", "labels")
dataset = dataset.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", max_length=128), batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [7]:
os.makedirs("rank_logs", exist_ok=True)

In [8]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [9]:
def log_phase(phase_name, run_id=None):
    ts = time.time()
    with open("gpu_phase_timestamps.log", "a") as f:
        f.write(f"{run_id},{phase_name},{ts}\n")

In [10]:
warmup_steps = 500
alpha = 0.5
beta = 0.5

In [11]:
all_results = []

for run_id in range(6):
    print(f"\n\nStarting Run {run_id} | Alpha = {alpha} | Beta = {beta}")
    gc.collect()
    torch.cuda.empty_cache()

    set_seed(42+run_id)

    # initialize base model
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


    # Initial LoRA rank for Warmup phase
    # alpha must be either 2x or 4x of the rank r
    # if r = 4, then 4x4 = 16, thus alpha=16
    model = inject_adaptive_lora(model,
                                 r=4,
                                 alpha=16,
                                 dropout=0.05)

    # freeze the model
    #   except the LoRA Layer
    #   except the Classification Heads
    freeze_model_except_lora(model)

    # move model to CUDA
    model = model.to("cuda")
    for name, param in model.named_parameters():
        if 'lora_' in name:
            param.data = param.data.to('cuda')

    # Assign Adaptive Ranks
    monitor = AdaptiveLoRAMonitor(model,
                                  alpha=alpha,
                                  beta=beta,
                                  min_r=2,
                                  max_r=16)

    # training arguments for warmup phase
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./warmup_run{run_id}",
            eval_strategy="no",
            max_steps=warmup_steps,
            per_device_train_batch_size=16,
            learning_rate=5e-4,
            weight_decay=0.01,
            logging_steps=50,
            report_to="none",
            fp16=True
        ),
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"]
    )

    #------------------Warmup START------------------
    log_phase("warmup_start", run_id)
    warmup_start = time.time()
    trainer.train()
    warmup_end = time.time()
    log_phase("warmup_end", run_id)
    #------------------Warmup END------------------

    #------------------Rank Assignment START------------------
    log_phase("rank_assignment_start", run_id)
    monitor.assign_adaptive_ranks(run_id=run_id)
    log_phase("rank_assignment_end", run_id)
    #------------------Rank Assignment END------------------

    # Training arguments for Fine Tuning
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./finetune_run{run_id}",
            eval_strategy="epoch",
            save_strategy="no",
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            learning_rate=5e-4,
            weight_decay=0.01,
            logging_steps=50,
            report_to="none",
            fp16=True
        ),
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        compute_metrics=compute_metrics
    )


    #------------------Training Phase START------------------
    log_phase("training_start", run_id)
    start_time = time.time()
    trainer.train()
    end_time = time.time()
    log_phase("training_end", run_id)
    #------------------Training Phase END------------------


    # final_memory = torch.cuda.memory_allocated() / 1e9
    outputs = trainer.predict(dataset["validation"])
    logits = outputs.predictions[1] if isinstance(outputs.predictions, tuple) else outputs.predictions
    preds = np.argmax(logits, axis=-1)
    labels = dataset["validation"]["labels"]
    acc = accuracy_score(labels.cpu().numpy(), preds)

    print(f"\nFinal Eval Accuracy: {acc:.4f}")

    warmup_time = round((warmup_end - warmup_start)/ 60, 2)
    training_time = round((end_time - start_time) / 60, 2)
    total_time = round(warmup_time + training_time,2)

    all_results.append({
        "Run ID": run_id,
        "Warmup Steps": warmup_steps,
        "Final Accuracy": round(acc, 4),
        "Warmup Time (min)": warmup_time,
        "Training Time (min)": training_time,
        "Total Time (min)": total_time
    })



Starting Run 0 | Alpha = 0.5 | Beta = 0.5


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5087
100,0.3613
150,0.3149
200,0.3635
250,0.2922
300,0.3224
350,0.3404
400,0.3223
450,0.3024
500,0.3173



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      2097.5599       1.0462       0.1397       0.1825          4
Linear(in_features=768, out_      8337.5685       0.9748       0.3282       0.4997          8
Linear(in_features=768, out_      2006.9954       1.0933       0.1736       0.2395          5
Linear(in_features=768, out_     12401.7395       0.9236       0.4472       0.7000         11
Linear(in_features=768, out_      2079.2703       0.9106       0.0313       0.0000          2
Linear(in_features=768, out_            inf       0.8747       0.2500       0.3681          7
Linear(in_features=768, out_      4232.2678       1.2049       0.3496       0.5357          9
Linear(in_features=768, out_            inf       0.9665       0.3229       0.4907          8
Linear(in_features=768, out_      7134.8524       1.2844       0.5268       0.8339         13
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2436,0.293192,0.885321
2,0.1371,0.328901,0.902523
3,0.148,0.340942,0.911697



Final Eval Accuracy: 0.9117


Starting Run 1 | Alpha = 0.5 | Beta = 0.5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5193
100,0.3626
150,0.3248
200,0.3536
250,0.2965
300,0.3152
350,0.3365
400,0.3222
450,0.3031
500,0.3166



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      1857.4861       1.0115       0.1039       0.0451          2
Linear(in_features=768, out_      7488.8164       1.0017       0.3896       0.5068          9
Linear(in_features=768, out_      1950.6023       1.0508       0.1370       0.0986          3
Linear(in_features=768, out_      9842.5633       0.9128       0.4481       0.6013         10
Linear(in_features=768, out_      1970.9588       0.9645       0.0760       0.0000          2
Linear(in_features=768, out_            inf       0.8670       0.2500       0.2812          5
Linear(in_features=768, out_      4359.9941       1.2289       0.3903       0.5080          9
Linear(in_features=768, out_            inf       0.9939       0.3412       0.4287          8
Linear(in_features=768, out_      6954.5780       1.3370       0.6029       0.8516         13
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2332,0.275964,0.896789
2,0.1482,0.31301,0.901376
3,0.1631,0.339573,0.90711


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Final Eval Accuracy: 0.9071


Starting Run 2 | Alpha = 0.5 | Beta = 0.5


Step,Training Loss
50,0.498
100,0.3662
150,0.3167
200,0.3544
250,0.295
300,0.3184
350,0.3388
400,0.3135
450,0.3005
500,0.3196



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      1539.7719       1.0019       0.1114       0.0417          2
Linear(in_features=768, out_      7547.5324       1.0008       0.4112       0.5720         10
Linear(in_features=768, out_      1667.3364       1.0729       0.1765       0.1568          4
Linear(in_features=768, out_     11277.2925       0.9403       0.5480       0.8139         13
Linear(in_features=768, out_      1917.0634       0.9505       0.0878       0.0000          2
Linear(in_features=768, out_     11528.0763       0.8774       0.5085       0.7442         12
Linear(in_features=768, out_      3648.6241       1.2533       0.4247       0.5958         10
Linear(in_features=768, out_            inf       0.9942       0.3550       0.4726          8
Linear(in_features=768, out_      6519.6592       1.3501       0.6484       0.9916         15
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2443,0.276487,0.888761
2,0.1382,0.361227,0.888761
3,0.1689,0.396453,0.895642



Final Eval Accuracy: 0.8956


Starting Run 3 | Alpha = 0.5 | Beta = 0.5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5001
100,0.369
150,0.3183
200,0.3539
250,0.2947
300,0.3116
350,0.3337
400,0.3212
450,0.2995
500,0.3152



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      1894.7279       0.9504       0.0884       0.0000          2
Linear(in_features=768, out_      9102.4437       1.0893       0.4498       0.6730         11
Linear(in_features=768, out_      2341.7030       1.0842       0.2117       0.2296          5
Linear(in_features=768, out_     13331.4228       0.8996       0.4433       0.6609         11
Linear(in_features=768, out_      2447.1973       0.9809       0.1321       0.0814          3
Linear(in_features=768, out_     16339.7775       0.8407       0.5000       0.7664         12
Linear(in_features=768, out_      4721.6390       1.2269       0.4090       0.5969         10
Linear(in_features=768, out_            inf       0.9839       0.3654       0.5157          9
Linear(in_features=768, out_      6749.6052       1.4084       0.6255       1.0000         16
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2385,0.302814,0.885321
2,0.1532,0.343884,0.900229
3,0.1508,0.377984,0.90367


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Final Eval Accuracy: 0.9037


Starting Run 4 | Alpha = 0.5 | Beta = 0.5


Step,Training Loss
50,0.5205
100,0.3613
150,0.3148
200,0.3512
250,0.2961
300,0.3148
350,0.339
400,0.3229
450,0.2959
500,0.3172



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      1712.0901       1.0245       0.1186       0.1412          3
Linear(in_features=768, out_      6658.6375       0.9774       0.3323       0.4841          8
Linear(in_features=768, out_      1585.0455       1.0491       0.1298       0.1592          4
Linear(in_features=768, out_     11590.8493       0.9228       0.5399       0.8173         13
Linear(in_features=768, out_      1747.5166       0.8983       0.0306       0.0000          2
Linear(in_features=768, out_            inf       0.8667       0.2500       0.3521          6
Linear(in_features=768, out_      4390.6975       1.2462       0.4102       0.6092         10
Linear(in_features=768, out_            inf       0.9998       0.3446       0.5040          9
Linear(in_features=768, out_      5612.4631       1.3105       0.5170       0.7805         12
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2443,0.271102,0.892202
2,0.1527,0.321255,0.896789
3,0.1611,0.360513,0.893349



Final Eval Accuracy: 0.8933


Starting Run 5 | Alpha = 0.5 | Beta = 0.5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5008
100,0.3629
150,0.3121
200,0.3579
250,0.3005
300,0.3188
350,0.3366
400,0.3141
450,0.2991
500,0.3195



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      1517.3959       0.9652       0.1141       0.0322          2
Linear(in_features=768, out_      7260.6183       0.9647       0.4222       0.5651          9
Linear(in_features=768, out_      1221.5392       0.9617       0.0955       0.0000          2
Linear(in_features=768, out_     10533.0557       0.8934       0.5423       0.7730         12
Linear(in_features=768, out_      2172.8666       0.9509       0.1382       0.0738          3
Linear(in_features=768, out_            inf       0.8391       0.2500       0.2673          5
Linear(in_features=768, out_      4273.4144       1.2757       0.5040       0.7067         11
Linear(in_features=768, out_            inf       1.0035       0.3780       0.4888          8
Linear(in_features=768, out_      5839.9196       1.3805       0.6698       0.9935         15
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.247,0.277821,0.893349
2,0.1583,0.349981,0.896789
3,0.1738,0.344531,0.90367



Final Eval Accuracy: 0.9037


In [12]:
# --- Save all results to CSV ---
df = pd.DataFrame(all_results)
df.to_csv("adaptive_lora_sst2_benchmark.csv", index=False)

# --- Display nicely formatted results ---
from tabulate import tabulate
print("\nFinal Results Across All Runs:\n")
print(tabulate(df, headers='keys', tablefmt='pretty'))



Final Results Across All Runs:

+---+--------+--------------+----------------+-------------------+---------------------+------------------+
|   | Run ID | Warmup Steps | Final Accuracy | Warmup Time (min) | Training Time (min) | Total Time (min) |
+---+--------+--------------+----------------+-------------------+---------------------+------------------+
| 0 |  0.0   |    500.0     |     0.9117     |       0.27        |        5.63         |       5.9        |
| 1 |  1.0   |    500.0     |     0.9071     |       0.25        |        5.69         |       5.94       |
| 2 |  2.0   |    500.0     |     0.8956     |       0.25        |        5.65         |       5.9        |
| 3 |  3.0   |    500.0     |     0.9037     |       0.26        |        5.78         |       6.04       |
| 4 |  4.0   |    500.0     |     0.8933     |       0.26        |        5.81         |       6.07       |
| 5 |  5.0   |    500.0     |     0.9037     |       0.26        |        5.78         |       6.04    