In [None]:
!pip install -q torch transformers datasets evaluate scikit-learn pandas matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m111.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import random
from sklearn.metrics import accuracy_score
from transformers import (
    DistilBertTokenizerFast, DistilBertForSequenceClassification,
    Trainer, TrainingArguments, TrainerCallback
)
from datasets import load_dataset
import evaluate
import gc

In [None]:
# Set Seed for reproducability
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

In [None]:
class AdaptiveLoRALinear(nn.Module):
    def __init__(self, base_layer, r=4, alpha=32, dropout=0.05):
        super().__init__()
        self.base = base_layer
        self.r = r
        self.alpha = alpha
        self.scaling = alpha / r
        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()

        self.lora_A = nn.Parameter(torch.randn(r, base_layer.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.randn(base_layer.out_features, r) * 0.01)

        device = self.lora_A.device
        self.initial_A = self.lora_A.detach().clone().to(device)
        self.initial_B = self.lora_B.detach().clone().to(device)

        self.grad_norm_history = []
        self.weight_change_history = []

        self.lora_A.register_hook(self._capture_grad_hook('A'))
        self.lora_B.register_hook(self._capture_grad_hook('B'))

    def forward(self, x):
        result = self.base(x)
        if self.r > 0:
            lora_out = self.dropout(x) @ self.lora_A.T @ self.lora_B.T
            result += self.scaling * lora_out
        return result

    def _capture_grad_hook(self, which):
        def hook(grad):
            norm = grad.norm().item()
            if which == 'A':
                self.grad_norm_history.append(('A', norm))
            elif which == 'B':
                self.grad_norm_history.append(('B', norm))
        return hook

    def compute_weight_change(self):
        device = self.lora_A.device
        if self.initial_A.device != device:
            self.initial_A = self.initial_A.to(device)
        if self.initial_B.device != device:
            self.initial_B = self.initial_B.to(device)
        delta_A = (self.lora_A - self.initial_A).norm().item()
        delta_B = (self.lora_B - self.initial_B).norm().item()
        total_change = delta_A + delta_B
        self.weight_change_history.append(total_change)
        return total_change

    def average_grad_norm(self):
        norms = [n for (w, n) in self.grad_norm_history]
        return sum(norms) / len(norms) if norms else 0.0


class AdaptiveLoRAMonitor:
    def __init__(self, model, alpha=0.5, beta=0.5, min_r=2, max_r=16):
        self.model = model
        self.alpha = alpha
        self.beta = beta
        self.min_r = min_r
        self.max_r = max_r
        self.modules = [m for m in model.modules() if isinstance(m, AdaptiveLoRALinear)]

    def assign_adaptive_ranks(self, run_id):
        stats = []
        for module in self.modules:
            grad_score = module.average_grad_norm()
            weight_score = module.compute_weight_change()
            total_score = self.alpha * grad_score + self.beta * weight_score
            stats.append((module, grad_score, weight_score, total_score))

        scores = [s[-1] for s in stats]
        valid_scores = [s for s in scores if np.isfinite(s)]


        # Clean NaN/inf from scores and set fallback normalization
        # scores = [s if np.isfinite(s) else 0.0 for s in scores]

        # scores_clean = [s if np.isfinite(s) else None for s in scores]
        # valid_scores = [s for s in scores_clean if s is not None]

        if not valid_scores:
            raise ValueError("All scores are NaN or Inf — cannot assign ranks!")

        min_score, max_score = min(valid_scores), max(valid_scores)

        fallback_rank = (self.min_r+self.max_r) // 2

        print("\n📊 Layer-wise Stats and Rank Assignment:")
        print(f"{'Layer':<30} {'Grad Norm':>12} {'Weight Δ':>12} {'Score':>12} {'Norm Score':>12} {'Assigned r':>10}")
        rows = []

        for (module, grad, weight, score) in stats:
            if not np.isfinite(score):
                # Exploded layer => directly assign mid rank
                new_r = fallback_rank
                norm_score = 0.5  # Just for logging/printing
            else:
                if min_score == max_score:
                    norm_score = 1.0
                else:
                    norm_score = (score - min_score) / (max_score - min_score)
                new_r = int(self.min_r + (self.max_r - self.min_r) * norm_score)

            new_r = max(self.min_r, min(self.max_r, new_r))  # Clamp safely

            # Reinitialize LoRA matrices
            device = module.lora_A.device
            module.lora_A = nn.Parameter(torch.randn(new_r, module.base.in_features, device=device) * 0.01)
            module.lora_B = nn.Parameter(torch.randn(module.base.out_features, new_r, device=device) * 0.01)
            module.initial_A = module.lora_A.detach().clone().to(device)
            module.initial_B = module.lora_B.detach().clone().to(device)
            module.r = new_r
            module.grad_norm_history = []
            module.weight_change_history = []

            print(f"{str(module.base)[:28]:<30} {grad:12.4f} {weight:12.4f} {score:12.4f} {norm_score:12.4f} {new_r:10d}")

            rows.append({
                "Layer": str(module.base),
                "Grad Norm": grad,
                "Weight Change": weight,
                "Score": score,
                "Norm Score": norm_score,
                "Assigned Rank": new_r,
            })

        # Save to CSV
        df_layer_stats = pd.DataFrame(rows)
        os.makedirs("rank_logs", exist_ok=True)
        df_layer_stats.to_csv(f"rank_logs/adaptive_lora_run_{run_id}.csv", index=False)

        # normalized_scores = []
        # fallback_rank = int((self.min_r + self.max_r) / 2)
        # fallback_norm_score = (fallback_rank - self.min_r) / (self.max_r - self.min_r)

        # for s in scores_clean:
        #     if s is None:
        #         normalized_scores.append(fallback_norm_score)
        #     else:
        #         if min_score == max_score:
        #             norm_s = 1.0  # Edge case: all scores same
        #         else:
        #             norm_s = (s - min_score) / (max_score - min_score)
        #         normalized_scores.append(norm_s)

        # print("\nLayer-wise Stats and Rank Assignment:")
        # print(f"{'Layer':<30} {'Grad Norm':>12} {'Weight Δ':>12} {'Score':>12} {'Norm Score':>12} {'Assigned r':>10}")
        # rows = []
        # for (module, g, w, score), norm_score in zip(stats, normalized_scores):
        #     new_r = int(self.min_r + (self.max_r - self.min_r) * norm_score)
        #     new_r = max(self.min_r, min(self.max_r, new_r))

        #     device = module.lora_A.device
        #     module.lora_A = nn.Parameter(torch.randn(new_r, module.base.in_features, device=device) * 0.01)
        #     module.lora_B = nn.Parameter(torch.randn(module.base.out_features, new_r, device=device) * 0.01)
        #     module.initial_A = module.lora_A.detach().clone().to(device)
        #     module.initial_B = module.lora_B.detach().clone().to(device)
        #     module.r = new_r
        #     module.grad_norm_history = []
        #     module.weight_change_history = []

        #     print(f"{str(module.base)[:28]:<30} {g:12.4f} {w:12.4f} {score:12.4f} {norm_score:12.4f} {new_r:10d}")

        #     rows.append({
        #         "Layer": str(module.base),
        #         "Grad Norm": g,
        #         "Weight Change": w,
        #         "Score": score,
        #         "Norm Score": norm_score,
        #         "Assigned Rank": new_r,
        #     })

        # df_layer_stats = pd.DataFrame(rows)
        # os.makedirs("rank_logs", exist_ok=True)
        # df_layer_stats.to_csv(f"rank_logs/adaptive_lora_run_{run_id}.csv", index=False)


In [None]:
def inject_adaptive_lora(model, r=4, alpha=32, dropout=0.05):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and ('q_lin' in name or 'v_lin' in name):
            parent = model
            for part in name.split('.')[:-1]:
                parent = getattr(parent, part)
            layer_name = name.split('.')[-1]
            setattr(parent, layer_name, AdaptiveLoRALinear(module, r, alpha, dropout))
    return model

def freeze_model_except_lora(model):
    """
      Freezes all the layers except the LoRA Layers and Classification Layers
    """
    for name, param in model.named_parameters():
        if (
            'lora_A' in name or
            'lora_B' in name or
            'pre_classifier' in name or
            'classifier' in name
        ):
            param.requires_grad = True
        else:
            param.requires_grad = False

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
dataset = load_dataset("glue", "sst2")
dataset = dataset.rename_column("label", "labels")
dataset = dataset.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", max_length=128), batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [None]:
os.makedirs("rank_logs", exist_ok=True)

In [None]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
def log_phase(phase_name, run_id=None):
    ts = time.time()
    with open("gpu_phase_timestamps.log", "a") as f:
        f.write(f"{run_id},{phase_name},{ts}\n")

In [None]:
warmup_steps = 500
alpha = 0.2
beta = 0.8

In [None]:
all_results = []

for run_id in range(6):
    print(f"\n\nStarting Run {run_id} | Warmup Steps = {warmup_steps}")
    gc.collect()
    torch.cuda.empty_cache()

    set_seed(42+run_id)

    # initialize base model
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


    # Initial LoRA rank for Warmup phase
    # alpha must be either 2x or 4x of the rank r
    # if r = 4, then 4x4 = 16, thus alpha=16
    model = inject_adaptive_lora(model,
                                 r=4,
                                 alpha=16,
                                 dropout=0.05)

    # freeze the model
    #   except the LoRA Layer
    #   except the Classification Heads
    freeze_model_except_lora(model)

    # move model to CUDA
    model = model.to("cuda")
    for name, param in model.named_parameters():
        if 'lora_' in name:
            param.data = param.data.to('cuda')

    # Assign Adaptive Ranks
    monitor = AdaptiveLoRAMonitor(model,
                                  alpha=alpha,
                                  beta=beta,
                                  min_r=2,
                                  max_r=16)

    # training arguments for warmup phase
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./warmup_run{run_id}",
            eval_strategy="no",
            max_steps=warmup_steps,
            per_device_train_batch_size=16,
            learning_rate=5e-4,
            weight_decay=0.01,
            logging_steps=50,
            report_to="none",
            fp16=True
        ),
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"]
    )

    #------------------Warmup START------------------
    log_phase("warmup_start", run_id)
    warmup_start = time.time()
    trainer.train()
    warmup_end = time.time()
    log_phase("warmup_end", run_id)
    #------------------Warmup END------------------

    #------------------Rank Assignment START------------------
    log_phase("rank_assignment_start", run_id)
    monitor.assign_adaptive_ranks(run_id=run_id)
    log_phase("rank_assignment_end", run_id)
    #------------------Rank Assignment END------------------

    # Training arguments for Fine Tuning
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./finetune_run{run_id}",
            eval_strategy="epoch",
            save_strategy="no",
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            learning_rate=5e-4,
            weight_decay=0.01,
            logging_steps=50,
            report_to="none",
            fp16=True
        ),
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        compute_metrics=compute_metrics
    )


    #------------------Training Phase START------------------
    log_phase("training_start", run_id)
    start_time = time.time()
    trainer.train()
    end_time = time.time()
    log_phase("training_end", run_id)
    #------------------Training Phase END------------------


    # final_memory = torch.cuda.memory_allocated() / 1e9
    outputs = trainer.predict(dataset["validation"])
    logits = outputs.predictions[1] if isinstance(outputs.predictions, tuple) else outputs.predictions
    preds = np.argmax(logits, axis=-1)
    labels = dataset["validation"]["labels"]
    acc = accuracy_score(labels.cpu().numpy(), preds)

    print(f"\nFinal Eval Accuracy: {acc:.4f}")

    warmup_time = round((warmup_end - warmup_start)/ 60, 2)
    training_time = round((end_time - start_time) / 60, 2)
    total_time = round(warmup_time + training_time,2)

    all_results.append({
        "Run ID": run_id,
        "Warmup Steps": warmup_steps,
        "Final Accuracy": round(acc, 4),
        "Warmup Time (min)": warmup_time,
        "Training Time (min)": training_time,
        "Total Time (min)": total_time
    })



Starting Run 0 | Warmup Steps = 500


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5087
100,0.3613
150,0.3149
200,0.3635
250,0.2922
300,0.3224
350,0.3404
400,0.3223
450,0.3024
500,0.3173



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ        Score   Norm Score Assigned r
Linear(in_features=768, out_      2097.5599       1.0462    1678.2571       0.0071          2
Linear(in_features=768, out_      8337.5685       0.9748    6670.2497       0.4974          8
Linear(in_features=768, out_      2006.9954       1.0933    1605.8150       0.0000          2
Linear(in_features=768, out_     12401.7395       0.9236    9921.5763       0.8168         13
Linear(in_features=768, out_      2079.2703       0.9106    1663.5984       0.0057          2
Linear(in_features=768, out_            inf       0.8747          inf       0.5000          9
Linear(in_features=768, out_      4232.2678       1.2049    3386.0552       0.1749          4
Linear(in_features=768, out_            inf       0.9665          inf       0.5000          9
Linear(in_features=768, out_      7134.8524       1.2844    5708.1388       0.4029          7
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2339,0.260288,0.897936
2,0.1489,0.331716,0.894495
3,0.1588,0.359134,0.909404



Final Eval Accuracy: 0.9094


Starting Run 1 | Warmup Steps = 500


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5193
100,0.3626
150,0.3248
200,0.3536
250,0.2965
300,0.3152
350,0.3365
400,0.3222
450,0.3031
500,0.3166



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ        Score   Norm Score Assigned r
Linear(in_features=768, out_      1857.4861       1.0115    1486.1912       0.0000          2
Linear(in_features=768, out_      7488.8164       1.0017    5991.2535       0.5855         10
Linear(in_features=768, out_      1950.6023       1.0508    1560.6920       0.0097          2
Linear(in_features=768, out_      9842.5633       0.9128    7874.2332       0.8302         13
Linear(in_features=768, out_      1970.9588       0.9645    1576.9600       0.0118          2
Linear(in_features=768, out_            inf       0.8670          inf       0.5000          9
Linear(in_features=768, out_      4359.9941       1.2289    3488.2410       0.2602          5
Linear(in_features=768, out_            inf       0.9939          inf       0.5000          9
Linear(in_features=768, out_      6954.5780       1.3370    5563.9298       0.5300          9
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2486,0.254042,0.899083
2,0.1625,0.317888,0.895642
3,0.1642,0.332196,0.905963



Final Eval Accuracy: 0.9060


Starting Run 2 | Warmup Steps = 500


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.498
100,0.3662
150,0.3167
200,0.3544
250,0.295
300,0.3184
350,0.3388
400,0.3135
450,0.3005
500,0.3196



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ        Score   Norm Score Assigned r
Linear(in_features=768, out_      1539.7719       1.0019    1232.0179       0.0000          2
Linear(in_features=768, out_      7547.5324       1.0008    6038.2261       0.6015         10
Linear(in_features=768, out_      1667.3364       1.0729    1334.0837       0.0128          2
Linear(in_features=768, out_     11277.2925       0.9403    9022.0221       0.9749         15
Linear(in_features=768, out_      1917.0634       0.9505    1533.8409       0.0378          2
Linear(in_features=768, out_     11528.0763       0.8774    9222.6365       1.0000         16
Linear(in_features=768, out_      3648.6241       1.2533    2919.1499       0.2111          4
Linear(in_features=768, out_            inf       0.9942          inf       0.5000          9
Linear(in_features=768, out_      6519.6592       1.3501    5215.9974       0.4986          8
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2471,0.284899,0.893349
2,0.1281,0.340022,0.904817
3,0.1525,0.363111,0.90367



Final Eval Accuracy: 0.9037


Starting Run 3 | Warmup Steps = 500


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5001
100,0.369
150,0.3183
200,0.3539
250,0.2947
300,0.3116
350,0.3337
400,0.3212
450,0.2995
500,0.3152



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ        Score   Norm Score Assigned r
Linear(in_features=768, out_      1894.7279       0.9504    1515.9724       0.0000          2
Linear(in_features=768, out_      9102.4437       1.0893    7282.1728       0.4990          8
Linear(in_features=768, out_      2341.7030       1.0842    1873.5793       0.0309          2
Linear(in_features=768, out_     13331.4228       0.8996   10665.3182       0.7917         13
Linear(in_features=768, out_      2447.1973       0.9809    1957.9540       0.0382          2
Linear(in_features=768, out_     16339.7775       0.8407   13071.9901       1.0000         16
Linear(in_features=768, out_      4721.6390       1.2269    3777.5565       0.1957          4
Linear(in_features=768, out_            inf       0.9839          inf       0.5000          9
Linear(in_features=768, out_      6749.6052       1.4084    5399.9658       0.3361          6
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2343,0.282096,0.897936
2,0.1576,0.327229,0.901376
3,0.1553,0.362788,0.905963



Final Eval Accuracy: 0.9060


Starting Run 4 | Warmup Steps = 500


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5205
100,0.3613
150,0.3148
200,0.3512
250,0.2961
300,0.3148
350,0.339
400,0.3229
450,0.2959
500,0.3172



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ        Score   Norm Score Assigned r
Linear(in_features=768, out_      1712.0901       1.0245    1369.8770       0.0127          2
Linear(in_features=768, out_      6658.6375       0.9774    5327.1055       0.5071          9
Linear(in_features=768, out_      1585.0455       1.0491    1268.2462       0.0000          2
Linear(in_features=768, out_     11590.8493       0.9228    9272.8640       1.0000         16
Linear(in_features=768, out_      1747.5166       0.8983    1398.1930       0.0162          2
Linear(in_features=768, out_            inf       0.8667          inf       0.5000          9
Linear(in_features=768, out_      4390.6975       1.2462    3512.8073       0.2804          5
Linear(in_features=768, out_            inf       0.9998          inf       0.5000          9
Linear(in_features=768, out_      5612.4631       1.3105    4490.2326       0.4025          7
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2477,0.278599,0.892202
2,0.1581,0.31003,0.905963
3,0.1655,0.349101,0.904817



Final Eval Accuracy: 0.9048


Starting Run 5 | Warmup Steps = 500


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5008
100,0.3629
150,0.3121
200,0.3579
250,0.3005
300,0.3188
350,0.3366
400,0.3141
450,0.2991
500,0.3195



📊 Layer-wise Stats and Rank Assignment:
Layer                             Grad Norm     Weight Δ        Score   Norm Score Assigned r
Linear(in_features=768, out_      1517.3959       0.9652    1214.1098       0.0318          2
Linear(in_features=768, out_      7260.6183       0.9647    5808.6876       0.6486         11
Linear(in_features=768, out_      1221.5392       0.9617     977.4237       0.0000          2
Linear(in_features=768, out_     10533.0557       0.8934    8426.6233       1.0000         16
Linear(in_features=768, out_      2172.8666       0.9509    1738.4835       0.1022          3
Linear(in_features=768, out_            inf       0.8391          inf       0.5000          9
Linear(in_features=768, out_      4273.4144       1.2757    3418.9866       0.3278          6
Linear(in_features=768, out_            inf       1.0035          inf       0.5000          9
Linear(in_features=768, out_      5839.9196       1.3805    4672.2118       0.4960          8
Linear(in_features=

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2328,0.27289,0.892202
2,0.1532,0.334888,0.90367
3,0.1508,0.343284,0.911697



Final Eval Accuracy: 0.9117


In [None]:
# --- Save all results to CSV ---
df = pd.DataFrame(all_results)
df.to_csv("adaptive_lora_sst2_benchmark.csv", index=False)

# --- Display nicely formatted results ---
from tabulate import tabulate
print("\nFinal Results Across All Runs:\n")
print(tabulate(df, headers='keys', tablefmt='pretty'))



Final Results Across All Runs:

+---+--------+--------------+----------------+-------------------+---------------------+------------------+
|   | Run ID | Warmup Steps | Final Accuracy | Warmup Time (min) | Training Time (min) | Total Time (min) |
+---+--------+--------------+----------------+-------------------+---------------------+------------------+
| 0 |  0.0   |    500.0     |     0.9094     |       0.31        |        6.87         |       7.18       |
| 1 |  1.0   |    500.0     |     0.906      |        0.3        |        6.61         |       6.91       |
| 2 |  2.0   |    500.0     |     0.9037     |        0.3        |         6.9         |       7.2        |
| 3 |  3.0   |    500.0     |     0.906      |        0.3        |         6.9         |       7.2        |
| 4 |  4.0   |    500.0     |     0.9048     |       0.31        |        6.96         |       7.27       |
| 5 |  5.0   |    500.0     |     0.9117     |       0.31        |        6.94         |       7.25    