In [1]:
!pip install -q torch transformers datasets evaluate scikit-learn pandas matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m110.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import random
from sklearn.metrics import accuracy_score
from transformers import (
    DistilBertTokenizerFast, DistilBertForSequenceClassification,
    Trainer, TrainingArguments, TrainerCallback
)
from datasets import load_dataset
import evaluate
import gc

In [3]:
# Set Seed for reproducability
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

In [4]:
class AdaptiveLoRALinear(nn.Module):
    def __init__(self, base_layer, r=4, alpha=32, dropout=0.05):
        super().__init__()
        self.base = base_layer
        self.r = r
        self.alpha = alpha
        self.scaling = alpha / r
        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()

        self.lora_A = nn.Parameter(torch.randn(r, base_layer.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.randn(base_layer.out_features, r) * 0.01)

        device = self.lora_A.device
        self.initial_A = self.lora_A.detach().clone().to(device)
        self.initial_B = self.lora_B.detach().clone().to(device)

        self.grad_norm_history = []
        self.weight_change_history = []

        self.lora_A.register_hook(self._capture_grad_hook('A'))
        self.lora_B.register_hook(self._capture_grad_hook('B'))

    def forward(self, x):
        result = self.base(x)
        if self.r > 0:
            lora_out = self.dropout(x) @ self.lora_A.T @ self.lora_B.T
            result += self.scaling * lora_out
        return result

    def _capture_grad_hook(self, which):
        def hook(grad):
            norm = grad.norm().item()
            if which == 'A':
                self.grad_norm_history.append(('A', norm))
            elif which == 'B':
                self.grad_norm_history.append(('B', norm))
        return hook

    def compute_weight_change(self):
        device = self.lora_A.device
        if self.initial_A.device != device:
            self.initial_A = self.initial_A.to(device)
        if self.initial_B.device != device:
            self.initial_B = self.initial_B.to(device)
        delta_A = (self.lora_A - self.initial_A).norm().item()
        delta_B = (self.lora_B - self.initial_B).norm().item()
        total_change = delta_A + delta_B
        self.weight_change_history.append(total_change)
        return total_change

    def average_grad_norm(self):
        norms = [n for (w, n) in self.grad_norm_history]
        return sum(norms) / len(norms) if norms else 0.0


class AdaptiveLoRAMonitor:
    def __init__(self, model, alpha=0.5, beta=0.5, min_r=2, max_r=16):
        self.model = model
        self.alpha = alpha
        self.beta = beta
        self.min_r = min_r
        self.max_r = max_r
        self.modules = [m for m in model.modules() if isinstance(m, AdaptiveLoRALinear)]

    # def assign_adaptive_ranks(self, run_id):
    #     stats = []
    #     for module in self.modules:
    #         grad_score = module.average_grad_norm()
    #         weight_score = module.compute_weight_change()
    #         total_score = self.alpha * grad_score + self.beta * weight_score
    #         stats.append((module, grad_score, weight_score, total_score))

    #     scores = [s[-1] for s in stats]
    #     valid_scores = [s for s in scores if np.isfinite(s)]

    #     if not valid_scores:
    #         raise ValueError("All scores are NaN or Inf — cannot assign ranks!")

    #     min_score, max_score = min(valid_scores), max(valid_scores)

    #     fallback_rank = (self.min_r+self.max_r) // 2

    #     print("\n📊 Layer-wise Stats and Rank Assignment:")
    #     print(f"{'Layer':<30} {'Grad Norm':>12} {'Weight Δ':>12} {'Score':>12} {'Norm Score':>12} {'Assigned r':>10}")
    #     rows = []

    #     for (module, grad, weight, score) in stats:
    #         if not np.isfinite(score):
    #             # Exploded layer => directly assign mid rank
    #             new_r = fallback_rank
    #             norm_score = 0.5  # Just for logging/printing
    #         else:
    #             if min_score == max_score:
    #                 norm_score = 1.0
    #             else:
    #                 norm_score = (score - min_score) / (max_score - min_score)
    #             new_r = int(self.min_r + (self.max_r - self.min_r) * norm_score)

    #         new_r = max(self.min_r, min(self.max_r, new_r))  # Clamp safely

    #         # Reinitialize LoRA matrices
    #         device = module.lora_A.device
    #         module.lora_A = nn.Parameter(torch.randn(new_r, module.base.in_features, device=device) * 0.01)
    #         module.lora_B = nn.Parameter(torch.randn(module.base.out_features, new_r, device=device) * 0.01)
    #         module.initial_A = module.lora_A.detach().clone().to(device)
    #         module.initial_B = module.lora_B.detach().clone().to(device)
    #         module.r = new_r
    #         module.grad_norm_history = []
    #         module.weight_change_history = []

    #         print(f"{str(module.base)[:28]:<30} {grad:12.4f} {weight:12.4f} {score:12.4f} {norm_score:12.4f} {new_r:10d}")

    #         rows.append({
    #             "Layer": str(module.base),
    #             "Grad Norm": grad,
    #             "Weight Change": weight,
    #             "Score": score,
    #             "Norm Score": norm_score,
    #             "Assigned Rank": new_r,
    #         })

    #     # Save to CSV
    #     df_layer_stats = pd.DataFrame(rows)
    #     os.makedirs("rank_logs", exist_ok=True)
    #     df_layer_stats.to_csv(f"rank_logs/adaptive_lora_run_{run_id}.csv", index=False)

    def assign_adaptive_ranks(self, run_id):
      grad_scores = []
      weight_scores = []

      # Step 1: Collect grad norms and weight changes separately
      for module in self.modules:
          grad_scores.append(module.average_grad_norm())
          weight_scores.append(module.compute_weight_change())

      # Step 2: Normalize grad scores
      finite_grad_scores = [g for g in grad_scores if np.isfinite(g)]
      min_grad, max_grad = min(finite_grad_scores), max(finite_grad_scores)
      normalized_grads = [
          (g - min_grad) / (max_grad - min_grad) if np.isfinite(g) and (max_grad > min_grad) else 0.5
          for g in grad_scores
      ]

      # Step 3: Normalize weight scores
      finite_weight_scores = [w for w in weight_scores if np.isfinite(w)]
      min_weight, max_weight = min(finite_weight_scores), max(finite_weight_scores)
      normalized_weights = [
          (w - min_weight) / (max_weight - min_weight) if np.isfinite(w) and (max_weight > min_weight) else 0.5
          for w in weight_scores
      ]

      # Step 4: Compute final combined score
      total_scores = [
          self.alpha * g_norm + self.beta * w_norm
          for g_norm, w_norm in zip(normalized_grads, normalized_weights)
      ]

      valid_total_scores = [s for s in total_scores if np.isfinite(s)]
      if not valid_total_scores:
          raise ValueError("All total scores are NaN or Inf — cannot assign ranks!")

      min_score, max_score = min(valid_total_scores), max(valid_total_scores)
      fallback_rank = (self.min_r + self.max_r) // 2

      print("\n📊 Layer-wise Stats and Rank Assignment:")
      print(f"{'Layer':<30} {'Grad Norm':>12} {'Weight Δ':>12} {'Total Score':>12} {'Norm Score':>12} {'Assigned r':>10}")

      rows = []
      for idx, module in enumerate(self.modules):
          grad = grad_scores[idx]
          weight = weight_scores[idx]
          score = total_scores[idx]

          if not np.isfinite(score):
              norm_score = 0.5  # fallback mid
              new_r = fallback_rank
          else:
              if min_score == max_score:
                  norm_score = 1.0
              else:
                  norm_score = (score - min_score) / (max_score - min_score)
              new_r = int(self.min_r + (self.max_r - self.min_r) * norm_score)

          new_r = max(self.min_r, min(self.max_r, new_r))  # Clamp safely

          # Reinitialize LoRA matrices
          device = module.lora_A.device
          module.lora_A = nn.Parameter(torch.randn(new_r, module.base.in_features, device=device) * 0.01)
          module.lora_B = nn.Parameter(torch.randn(module.base.out_features, new_r, device=device) * 0.01)
          module.initial_A = module.lora_A.detach().clone().to(device)
          module.initial_B = module.lora_B.detach().clone().to(device)
          module.r = new_r
          module.grad_norm_history = []
          module.weight_change_history = []

          print(f"{str(module.base)[:28]:<30} {grad:12.4f} {weight:12.4f} {score:12.4f} {norm_score:12.4f} {new_r:10d}")

          rows.append({
              "Layer": str(module.base),
              "Grad Norm": grad,
              "Weight Change": weight,
              "Score": score,
              "Norm Score": norm_score,
              "Assigned Rank": new_r,
          })

      # Save to CSV
      df_layer_stats = pd.DataFrame(rows)
      os.makedirs("rank_logs", exist_ok=True)
      df_layer_stats.to_csv(f"rank_logs/adaptive_lora_run_{run_id}.csv", index=False)


In [5]:
def inject_adaptive_lora(model, r=4, alpha=32, dropout=0.05):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and ('q_lin' in name or 'v_lin' in name):
            parent = model
            for part in name.split('.')[:-1]:
                parent = getattr(parent, part)
            layer_name = name.split('.')[-1]
            setattr(parent, layer_name, AdaptiveLoRALinear(module, r, alpha, dropout))
    return model

def freeze_model_except_lora(model):
    """
      Freezes all the layers except the LoRA Layers and Classification Layers
    """
    for name, param in model.named_parameters():
        if (
            'lora_A' in name or
            'lora_B' in name or
            'pre_classifier' in name or
            'classifier' in name
        ):
            param.requires_grad = True
        else:
            param.requires_grad = False

In [6]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
dataset = load_dataset("glue", "sst2")
dataset = dataset.rename_column("label", "labels")
dataset = dataset.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", max_length=128), batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [7]:
os.makedirs("rank_logs", exist_ok=True)

In [8]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [9]:
def log_phase(phase_name, run_id=None):
    ts = time.time()
    with open("gpu_phase_timestamps.log", "a") as f:
        f.write(f"{run_id},{phase_name},{ts}\n")

In [10]:
warmup_steps = 500
# alpha = 0.2
# beta = 0.8

alpha_beta_combinations = [[0.5,0.5],[0.8,0.2],[0.2,0.8]]

In [12]:
all_results = []

for run_id in range(len(alpha_beta_combinations)):
    print(f"\n\nStarting Run {run_id} | Alpha = {alpha_beta_combinations[run_id][0]} | Beta = {alpha_beta_combinations[run_id][1]}")
    gc.collect()
    torch.cuda.empty_cache()

    set_seed(42)

    # initialize base model
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


    # Initial LoRA rank for Warmup phase
    # alpha must be either 2x or 4x of the rank r
    # if r = 4, then 4x4 = 16, thus alpha=16
    model = inject_adaptive_lora(model,
                                 r=4,
                                 alpha=16,
                                 dropout=0.05)

    # freeze the model
    #   except the LoRA Layer
    #   except the Classification Heads
    freeze_model_except_lora(model)

    # move model to CUDA
    model = model.to("cuda")
    for name, param in model.named_parameters():
        if 'lora_' in name:
            param.data = param.data.to('cuda')

    alpha = alpha_beta_combinations[run_id][0]
    beta = alpha_beta_combinations[run_id][1]

    # Assign Adaptive Ranks
    monitor = AdaptiveLoRAMonitor(model,
                                  alpha=alpha,
                                  beta=beta,
                                  min_r=2,
                                  max_r=16)

    # training arguments for warmup phase
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./warmup_run{run_id}",
            eval_strategy="no",
            max_steps=warmup_steps,
            per_device_train_batch_size=16,
            learning_rate=5e-4,
            weight_decay=0.01,
            logging_steps=50,
            report_to="none",
            fp16=True
        ),
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"]
    )

    #------------------Warmup START------------------
    log_phase("warmup_start", run_id)
    warmup_start = time.time()
    trainer.train()
    warmup_end = time.time()
    log_phase("warmup_end", run_id)
    #------------------Warmup END------------------

    #------------------Rank Assignment START------------------
    log_phase("rank_assignment_start", run_id)
    monitor.assign_adaptive_ranks(run_id=run_id)
    log_phase("rank_assignment_end", run_id)
    #------------------Rank Assignment END------------------

    # Training arguments for Fine Tuning
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./finetune_run{run_id}",
            eval_strategy="epoch",
            save_strategy="no",
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            learning_rate=5e-4,
            weight_decay=0.01,
            logging_steps=50,
            report_to="none",
            fp16=True
        ),
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        compute_metrics=compute_metrics
    )


    #------------------Training Phase START------------------
    log_phase("training_start", run_id)
    start_time = time.time()
    trainer.train()
    end_time = time.time()
    log_phase("training_end", run_id)
    #------------------Training Phase END------------------


    # final_memory = torch.cuda.memory_allocated() / 1e9
    outputs = trainer.predict(dataset["validation"])
    logits = outputs.predictions[1] if isinstance(outputs.predictions, tuple) else outputs.predictions
    preds = np.argmax(logits, axis=-1)
    labels = dataset["validation"]["labels"]
    acc = accuracy_score(labels.cpu().numpy(), preds)

    print(f"\nFinal Eval Accuracy: {acc:.4f}")

    warmup_time = round((warmup_end - warmup_start)/ 60, 2)
    training_time = round((end_time - start_time) / 60, 2)
    total_time = round(warmup_time + training_time,2)

    all_results.append({
        "Run ID": run_id,
        "Warmup Steps": warmup_steps,
        "Final Accuracy": round(acc, 4),
        "Warmup Time (min)": warmup_time,
        "Training Time (min)": training_time,
        "Total Time (min)": total_time
    })



Starting Run 0 | Alpha = 0.5 | Beta = 0.5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5115
100,0.3833
150,0.3192
200,0.347
250,0.2875
300,0.3207
350,0.3503
400,0.3049
450,0.3054
500,0.3129



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      2918.2235       0.9975       0.1407       0.0000          2
Linear(in_features=768, out_     14155.8683       1.1114       0.5599       0.7582         12
Linear(in_features=768, out_      2952.1606       1.0224       0.1614       0.0374          2
Linear(in_features=768, out_     15408.7932       0.9390       0.4607       0.5787         10
Linear(in_features=768, out_      3908.4742       0.9972       0.1695       0.0520          2
Linear(in_features=768, out_            inf       0.8191       0.2500       0.1977          4
Linear(in_features=768, out_      5853.9705       1.1953       0.3828       0.4378          8
Linear(in_features=768, out_            inf       0.9555       0.3576       0.3922          7
Linear(in_features=768, out_     10537.4976       1.3036       0.6055       0.8406         13
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2376,0.265201,0.905963
2,0.1621,0.305572,0.897936
3,0.139,0.350892,0.905963



Final Eval Accuracy: 0.9060


Starting Run 1 | Alpha = 0.8 | Beta = 0.2


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5115
100,0.3833
150,0.3192
200,0.347
250,0.2875
300,0.3207
350,0.3503
400,0.3049
450,0.3054
500,0.3129



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      2918.2235       0.9975       0.0563       0.0000          2
Linear(in_features=768, out_     14155.8683       1.1114       0.6192       0.6856         11
Linear(in_features=768, out_      2952.1606       1.0224       0.0657       0.0115          2
Linear(in_features=768, out_     15408.7932       0.9390       0.6236       0.6909         11
Linear(in_features=768, out_      3908.4742       0.9972       0.1026       0.0564          2
Linear(in_features=768, out_            inf       0.8191       0.4000       0.4186          7
Linear(in_features=768, out_      5853.9705       1.1953       0.2564       0.2437          5
Linear(in_features=768, out_            inf       0.9555       0.4430       0.4710          8
Linear(in_features=768, out_     10537.4976       1.3036       0.5102       0.5528          9
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2369,0.291266,0.891055
2,0.1721,0.317872,0.902523
3,0.1472,0.335981,0.901376



Final Eval Accuracy: 0.9014


Starting Run 2 | Alpha = 0.2 | Beta = 0.8


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5115
100,0.3833
150,0.3192
200,0.347
250,0.2875
300,0.3207
350,0.3503
400,0.3049
450,0.3054
500,0.3129



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ  Total Score   Norm Score Assigned r
Linear(in_features=768, out_      2918.2235       0.9975       0.2251       0.1668          4
Linear(in_features=768, out_     14155.8683       1.1114       0.5006       0.5341          9
Linear(in_features=768, out_      2952.1606       1.0224       0.2570       0.2094          4
Linear(in_features=768, out_     15408.7932       0.9390       0.2977       0.2636          5
Linear(in_features=768, out_      3908.4742       0.9972       0.2363       0.1818          4
Linear(in_features=768, out_            inf       0.8191       0.1000       0.0000          2
Linear(in_features=768, out_      5853.9705       1.1953       0.5092       0.5455          9
Linear(in_features=768, out_            inf       0.9555       0.2721       0.2294          5
Linear(in_features=768, out_     10537.4976       1.3036       0.7008       0.8010         13
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2427,0.287984,0.892202
2,0.1562,0.330662,0.892202
3,0.1714,0.386296,0.904817



Final Eval Accuracy: 0.9048


In [13]:
# --- Save all results to CSV ---
df = pd.DataFrame(all_results)
df.to_csv("adaptive_lora_sst2_benchmark.csv", index=False)

# --- Display nicely formatted results ---
from tabulate import tabulate
print("\nFinal Results Across All Runs:\n")
print(tabulate(df, headers='keys', tablefmt='pretty'))



Final Results Across All Runs:

+---+--------+--------------+----------------+-------------------+---------------------+------------------+
|   | Run ID | Warmup Steps | Final Accuracy | Warmup Time (min) | Training Time (min) | Total Time (min) |
+---+--------+--------------+----------------+-------------------+---------------------+------------------+
| 0 |  0.0   |    500.0     |     0.906      |       0.38        |        8.58         |       8.96       |
| 1 |  1.0   |    500.0     |     0.9014     |       0.37        |        8.58         |       8.95       |
| 2 |  2.0   |    500.0     |     0.9048     |       0.37        |        8.59         |       8.96       |
+---+--------+--------------+----------------+-------------------+---------------------+------------------+
