In [1]:
!pip install -q datasets transformers evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m307.2/491.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import pandas as pd
import time
import numpy as np
import torch.nn as nn
from datasets import load_dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate

In [3]:
class AdaptiveLoRALinear(nn.Module):
    def __init__(self, base_layer, r=4, alpha=32, dropout=0.05):
        super().__init__()
        self.base = base_layer
        self.r = r
        self.alpha = alpha
        self.scaling = alpha / r
        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()

        self.lora_A = nn.Parameter(torch.randn(r, base_layer.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.randn(base_layer.out_features, r) * 0.01)

        device = self.lora_A.device
        self.initial_A = self.lora_A.detach().clone().to(device)
        self.initial_B = self.lora_B.detach().clone().to(device)

        self.grad_norm_history = []
        self.weight_change_history = []

        self.lora_A.register_hook(self._capture_grad_hook('A'))
        self.lora_B.register_hook(self._capture_grad_hook('B'))

    def forward(self, x):
        result = self.base(x)
        if self.r > 0:
            lora_out = self.dropout(x) @ self.lora_A.T @ self.lora_B.T
            result += self.scaling * lora_out
        return result

    def _capture_grad_hook(self, which):
        def hook(grad):
            norm = grad.norm().item()
            if which == 'A':
                self.grad_norm_history.append(('A', norm))
            elif which == 'B':
                self.grad_norm_history.append(('B', norm))
        return hook

    def compute_weight_change(self):
        delta_A = (self.lora_A - self.initial_A).norm().item()
        delta_B = (self.lora_B - self.initial_B).norm().item()
        return delta_A + delta_B

    def average_grad_norm(self):
        norms = [n for (w, n) in self.grad_norm_history]
        return sum(norms) / len(norms) if norms else 0.0

In [4]:
def inject_adaptive_lora(model, r=4, alpha=32, dropout=0.05):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and ('q_lin' in name or 'v_lin' in name):
            parent = model
            for part in name.split('.')[:-1]:
                parent = getattr(parent, part)
            layer_name = name.split('.')[-1]
            original_layer = getattr(parent, layer_name)
            lora_layer = AdaptiveLoRALinear(original_layer, r=r, alpha=alpha, dropout=dropout)
            setattr(parent, layer_name, lora_layer)
    return model

In [5]:
def freeze_model_except_lora(model):
    for name, param in model.named_parameters():
        param.requires_grad = 'lora_' in name

In [6]:
class AdaptiveLoRAMonitor:
    def __init__(self, model, alpha=0.5, beta=0.5, min_r=2, max_r=16):
        self.model = model
        self.alpha = alpha
        self.beta = beta
        self.min_r = min_r
        self.max_r = max_r
        self.modules = [m for m in self.model.modules() if isinstance(m, AdaptiveLoRALinear)]

    def gather_statistics(self):
        stats = []
        for module in self.modules:
            score = 0.0
            if self.alpha > 0.0:
                score += self.alpha * module.average_grad_norm()
            if self.beta > 0.0:
                score += self.beta * module.compute_weight_change()
            stats.append((module, score))
        return stats

    def assign_adaptive_ranks(self):
        stats = self.gather_statistics()
        scores = [score for (module, score) in stats]
        min_score, max_score = min(scores), max(scores)

        if min_score == max_score:
            normalized_scores = [1.0 for _ in scores]
        else:
            normalized_scores = [(s - min_score) / (max_score - min_score) for s in scores]

        for (module, norm_score) in zip(self.modules, normalized_scores):
            if not np.isfinite(norm_score):
                norm_score = 1.0

            adaptive_r = int(self.min_r + (self.max_r - self.min_r) * norm_score)
            adaptive_r = max(self.min_r, min(self.max_r, adaptive_r))
            print(f"Assigning new rank r={adaptive_r} to a layer")

            device = module.lora_A.device
            new_lora_A = torch.randn(adaptive_r, module.base.in_features, device=device) * 0.01
            new_lora_B = torch.randn(module.base.out_features, adaptive_r, device=device) * 0.01

            module.lora_A = nn.Parameter(new_lora_A)
            module.lora_B = nn.Parameter(new_lora_B)

            module.initial_A = new_lora_A.detach().clone()
            module.initial_B = new_lora_B.detach().clone()

            module.grad_norm_history = []
            module.weight_change_history = []

In [7]:
dataset = load_dataset('glue', 'mrpc')  # Change here if needed
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

dataset = dataset.rename_column('label', 'labels')
dataset = dataset.map(lambda e: tokenizer(e['sentence1'], e['sentence2'], truncation=True, padding='max_length', max_length=128), batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [8]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [9]:
experiments_list = [
    {"type": "full_finetune"},
    {"type": "static_lora"},
    {"type": "adaptive_lora", "alpha": 1.0, "beta": 0.0},
    {"type": "adaptive_lora", "alpha": 0.0, "beta": 1.0},
    {"type": "adaptive_lora", "alpha": 0.8, "beta": 0.2},
    {"type": "adaptive_lora", "alpha": 0.2, "beta": 0.8},
    {"type": "adaptive_lora", "alpha": 0.5, "beta": 0.5},
]

In [11]:
experiment_results = []

for idx, exp in enumerate(experiments_list, start=1):
    print(f"\n🔵 Starting Experiment {idx}: {exp}")

    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

    if exp["type"] == "static_lora":
        model = inject_adaptive_lora(model, r=4, alpha=32, dropout=0.05)
        freeze_model_except_lora(model)

    if exp["type"] == "adaptive_lora":
        model = inject_adaptive_lora(model, r=4, alpha=32, dropout=0.05)
        freeze_model_except_lora(model)

    model = model.to('cuda')

    # Patch to fix initial weights
    for module in model.modules():
        if isinstance(module, AdaptiveLoRALinear):
            module.initial_A = module.initial_A.to(module.lora_A.device)
            module.initial_B = module.initial_B.to(module.lora_B.device)

    monitor = None
    if exp["type"] == "adaptive_lora":
        monitor = AdaptiveLoRAMonitor(model, alpha=exp["alpha"], beta=exp["beta"], min_r=2, max_r=16)

    torch.cuda.empty_cache()
    initial_memory_gb = torch.cuda.memory_allocated() / 1e9
    start_time = time.time()

    # Warmup Phase
    if exp["type"] == "adaptive_lora":
        warmup_args = TrainingArguments(
            output_dir=f'./warmup_{idx}',
            eval_strategy="no",
            learning_rate=5e-4,
            per_device_train_batch_size=16,
            num_train_epochs=1,
            max_steps=500,
            weight_decay=0.01,
            fp16=True,
            report_to="none",
        )
        trainer = Trainer(
            model=model,
            args=warmup_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
            compute_metrics=compute_metrics,
        )
        print(f"\n🟡 Warmup Phase for Experiment {idx}")
        trainer.train()
        print(f"✅ Warmup Completed")

        print(f"\n🟠 Assigning Adaptive Ranks for Experiment {idx}")
        monitor.assign_adaptive_ranks()
        print(f"✅ Adaptive Ranks Assigned")

    # Full Finetuning Phase
    training_args = TrainingArguments(
        output_dir=f'./final_{idx}',
        eval_strategy="steps",
        eval_steps=100,
        logging_steps=50,
        save_steps=500,
        learning_rate=5e-4,
        per_device_train_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        fp16=True,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        compute_metrics=compute_metrics,
    )

    print(f"\n🟢 Full Fine-tuning Phase for Experiment {idx}")
    trainer.train()

    end_time = time.time()
    final_memory_gb = torch.cuda.memory_allocated() / 1e9

    metrics = trainer.evaluate()

    experiment_results.append({
        "Experiment": idx,
        "Type": exp["type"],
        "Alpha": exp.get("alpha", None),
        "Beta": exp.get("beta", None),
        "Eval Loss": metrics["eval_loss"],
        "Eval Accuracy": metrics["eval_accuracy"],
        "Training Time (min)": round((end_time - start_time)/60, 2),
        "Initial GPU Memory (GB)": round(initial_memory_gb, 2),
        "Final GPU Memory (GB)": round(final_memory_gb, 2),
    })


🔵 Starting Experiment 1: {'type': 'full_finetune'}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🟢 Full Fine-tuning Phase for Experiment 1


Step,Training Loss,Validation Loss,Accuracy
100,0.6401,0.645948,0.683824
200,0.638,0.625072,0.683824
300,0.6217,0.624564,0.683824
400,0.6263,0.624296,0.683824
500,0.6365,0.624379,0.683824
600,0.6357,0.624178,0.683824


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔵 Starting Experiment 2: {'type': 'static_lora'}

🟢 Full Fine-tuning Phase for Experiment 2


Step,Training Loss,Validation Loss,Accuracy
100,0.587,0.539699,0.708333
200,0.5394,0.467119,0.784314
300,0.4504,0.428335,0.811275
400,0.4221,0.447015,0.811275
500,0.385,0.386036,0.833333
600,0.3742,0.396079,0.838235


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔵 Starting Experiment 3: {'type': 'adaptive_lora', 'alpha': 1.0, 'beta': 0.0}

🟡 Warmup Phase for Experiment 3


Step,Training Loss
500,0.4907


✅ Warmup Completed

🟠 Assigning Adaptive Ranks for Experiment 3
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=16 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=16 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
✅ Adaptive Ranks Assigned

🟢 Full Fine-tuning Phase for Experiment 3


Step,Training Loss,Validation Loss,Accuracy
100,0.5955,0.549399,0.705882
200,0.526,0.47454,0.784314
300,0.4497,0.41161,0.801471
400,0.408,0.4211,0.803922
500,0.3733,0.388387,0.813725
600,0.3755,0.388108,0.830882


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔵 Starting Experiment 4: {'type': 'adaptive_lora', 'alpha': 0.0, 'beta': 1.0}

🟡 Warmup Phase for Experiment 4


Step,Training Loss
500,0.4907


✅ Warmup Completed

🟠 Assigning Adaptive Ranks for Experiment 4
Assigning new rank r=5 to a layer
Assigning new rank r=7 to a layer
Assigning new rank r=8 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=9 to a layer
Assigning new rank r=6 to a layer
Assigning new rank r=7 to a layer
Assigning new rank r=3 to a layer
Assigning new rank r=8 to a layer
Assigning new rank r=16 to a layer
Assigning new rank r=3 to a layer
Assigning new rank r=13 to a layer
✅ Adaptive Ranks Assigned

🟢 Full Fine-tuning Phase for Experiment 4


Step,Training Loss,Validation Loss,Accuracy
100,0.5647,0.487445,0.762255
200,0.4985,0.420879,0.816176
300,0.4332,0.404608,0.833333
400,0.39,0.407357,0.823529
500,0.35,0.351656,0.838235
600,0.3253,0.383385,0.838235


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔵 Starting Experiment 5: {'type': 'adaptive_lora', 'alpha': 0.8, 'beta': 0.2}

🟡 Warmup Phase for Experiment 5


Step,Training Loss
500,0.4907


✅ Warmup Completed

🟠 Assigning Adaptive Ranks for Experiment 5
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=16 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=16 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
✅ Adaptive Ranks Assigned

🟢 Full Fine-tuning Phase for Experiment 5


Step,Training Loss,Validation Loss,Accuracy
100,0.5955,0.549399,0.705882
200,0.526,0.47454,0.784314
300,0.4497,0.41161,0.801471
400,0.408,0.4211,0.803922
500,0.3733,0.388387,0.813725
600,0.3755,0.388108,0.830882


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔵 Starting Experiment 6: {'type': 'adaptive_lora', 'alpha': 0.2, 'beta': 0.8}

🟡 Warmup Phase for Experiment 6


Step,Training Loss
500,0.4907


✅ Warmup Completed

🟠 Assigning Adaptive Ranks for Experiment 6
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=16 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=16 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
✅ Adaptive Ranks Assigned

🟢 Full Fine-tuning Phase for Experiment 6


Step,Training Loss,Validation Loss,Accuracy
100,0.5955,0.549399,0.705882
200,0.526,0.47454,0.784314
300,0.4497,0.41161,0.801471
400,0.408,0.4211,0.803922
500,0.3733,0.388387,0.813725
600,0.3755,0.388108,0.830882


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔵 Starting Experiment 7: {'type': 'adaptive_lora', 'alpha': 0.5, 'beta': 0.5}

🟡 Warmup Phase for Experiment 7


Step,Training Loss
500,0.4907


✅ Warmup Completed

🟠 Assigning Adaptive Ranks for Experiment 7
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=16 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=16 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
Assigning new rank r=2 to a layer
✅ Adaptive Ranks Assigned

🟢 Full Fine-tuning Phase for Experiment 7


Step,Training Loss,Validation Loss,Accuracy
100,0.5955,0.549399,0.705882
200,0.526,0.47454,0.784314
300,0.4497,0.41161,0.801471
400,0.408,0.4211,0.803922
500,0.3733,0.388387,0.813725
600,0.3755,0.388108,0.830882


In [12]:
df_results = pd.DataFrame(experiment_results)
print("\nAll Experiments Completed!")
print("\nFinal Adaptive LoRA Experiments Summary:")
display(df_results)


All Experiments Completed!

Final Adaptive LoRA Experiments Summary:


Unnamed: 0,Experiment,Type,Alpha,Beta,Eval Loss,Eval Accuracy,Training Time (min),Initial GPU Memory (GB),Final GPU Memory (GB)
0,1,full_finetune,,,0.623979,0.683824,0.87,0.27,0.83
1,2,static_lora,,,0.38889,0.848039,0.54,1.1,1.1
2,3,adaptive_lora,1.0,0.0,0.382421,0.830882,0.87,1.38,0.3
3,4,adaptive_lora,0.0,1.0,0.375332,0.838235,0.89,0.57,0.3
4,5,adaptive_lora,0.8,0.2,0.382421,0.830882,0.9,0.58,0.3
5,6,adaptive_lora,0.2,0.8,0.382421,0.830882,0.89,0.58,0.3
6,7,adaptive_lora,0.5,0.5,0.382421,0.830882,0.89,0.57,0.3
