In [1]:
import os
import math
import inspect
import logging
import typing as ty
from functools import partial
from pathlib import Path
from collections import OrderedDict

import torch
import torch.optim
import torch.nn as nn
import numpy as np
import webbrowser
import graphviz
import minlora
from minlora import (
    LoRAParametrization,
    add_lora,
    merge_lora,
    remove_lora
)
from minlora.utils import get_params_by_name, name_is_lora
from minlora.model import add_lora_by_name, apply_lora
from torch.optim import AdamW
from spconv.pytorch.conv import SubMConv3d
graphviz.set_jupyter_format('svg')
from lora_pytorch import LoRA
assert torch.cuda.is_available()
from torchview import draw_graph
from torchviz import make_dot
from graphviz import Digraph

from pointcept.engines.defaults import (
    default_argument_parser,
    default_config_parser,
    default_setup,
)
from pointcept.engines.test import TESTERS
from pointcept.engines.launch import launch
from pointcept.engines.test import TesterBase, SemSegTester

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

repo_root = Path("../..")


def count_trainable_parameters(model):
    return dict(
        trainable=sum(p.numel() for p in model.parameters() if p.requires_grad),
        frozen=sum(p.numel() for p in model.parameters() if not p.requires_grad)
    )

def named_trainable_parameters(model):
    return dict(
        trainable=[n for n, p in model.named_parameters() if p.requires_grad],
        frozen=[n for n, p in model.named_parameters() if not p.requires_grad]
    )


def filter_named_params(model, name_filter: ty.Callable) -> tuple[str, nn.Parameter]:
    """
    generator which returns (parameter_name, weight tensor)
    for all tensors whose names match the filter function
    """
    for n, p in model.named_parameters():
        if name_filter(n):
            yield n, p

get_named_lora_params = partial(filter_named_params, name_filter=name_is_lora)
get_named_non_lora_params = partial(filter_named_params, name_filter=(lambda x: not name_is_lora(x)))

def count_lora_parameters(model):
    """use minlora directly"""
    return sum(p.numel() for p in minlora.get_lora_params(model))
    
def count_lora_params_manual(model):
    """just looking at weight tensor names manually as a cross check"""
    return sum(p.numel() for n, p in get_named_lora_params(model))

def check_lora_trainable(model):
    for param in minlora.get_lora_params(model):
        assert param.requires_grad

def freeze_non_lora_params(model):
    print("freezing non-lora parameters")
    frozen_param_names = set()
    for name, param in get_named_non_lora_params(model):
        param.requires_grad = False
        frozen_param_names.add(name)
    yield
    print("unfreezing")

def unfreeze_all_params(model):
    for name, param in model.named_parameters():
        param.requires_grad = True

def create_spoofed_input(batch_size=2, num_points=1000, n_classes=5, num_features=6, device='cpu'):
    return {
        'coord': torch.rand(num_points * batch_size, num_features, device=device),
        'feat': torch.rand(num_points * batch_size, num_features, device=device),
        'grid_coord': torch.randint(0, 100, (num_points * batch_size, 3), device=device),
        'batch': torch.arange(batch_size, device=device).repeat_interleave(num_points),
        'offset': torch.tensor([num_points * i for i in range(1, batch_size + 1)], device=device),
        'condition': ['ScanNet'] * batch_size,
        'grid_size': torch.tensor([0.01], device=device),
        'segment': torch.randint(low=0, high=n_classes-1, size=(num_points * batch_size,), device=device)
    }


def patch_cfg(cfg: dict, repo_root: Path = repo_root) -> dict:
    cfg = cfg.copy()
    cfg["my_data_root"] = repo_root / cfg["my_data_root"]
    cfg["weight"] = repo_root / cfg["weight"]
    cfg["batch_size_test_per_gpu"] = 1
    return cfg


repo_root = Path("../..")
cfg_file = Path("../../test/custom-ppt-config.py"); assert cfg_file.exists
device = "cuda"

args = default_argument_parser().parse_args(args=["--config-file", f"{cfg_file}"])
cfg = default_config_parser(args.config_file, args.options); cfg = patch_cfg(cfg)

tester = TESTERS.build(dict(type=cfg.test.type, cfg=cfg))
model = tester.model
model.to(device)
print("loaded")

[2024-09-02 17:11:05,584 INFO test.py line 41 36491] => Loading config ...
[2024-09-02 17:11:05,585 INFO test.py line 48 36491] => Building model ...
[2024-09-02 17:11:08,260 INFO test.py line 61 36491] Num params: 97447088
[2024-09-02 17:11:08,462 INFO test.py line 68 36491] Loading weight at: ../../models/PointTransformerV3/scannet-semseg-pt-v3m1-1-ppt-extreme/model/model_best.pth
[2024-09-02 17:11:08,961 INFO test.py line 84 36491] => Loaded weight '../../models/PointTransformerV3/scannet-semseg-pt-v3m1-1-ppt-extreme/model/model_best.pth' (epoch 94)
[2024-09-02 17:11:08,965 INFO test.py line 53 36491] => Building test dataset & dataloader ...
[2024-09-02 17:11:08,967 INFO scannet.py line 72 36491] Totally 0 x 1 samples in val set.


loaded


# Visualise netron

In [3]:
#torch.save(model, "model.pth")

Now install netron and open this file:

```bash
snap install netron
snap run netron
```

# LoRA

### lora-pytorch implementation

In [6]:
# lora_model = LoRA.from_module(model, rank=50)
# print("bare model: ", count_trainable_parameters(model))
# print("lora:", count_trainable_parameters(lora_model))
# torch.save(model, "model_lora.pth")

bare model:  110759388
lora: 13312300


### minlora implementation

In [2]:
# optimizer
def configure_optimizers_lora(
    model,
    weight_decay: float = 0.05,
    learning_rate: float = 0.005,
    betas: tuple[float, float] = (0.9, 0.999),
    device_type: str = "cuda"
):
    # we apply weight decay to all lora params
    optim_groups = [
        # note: .get_lora_params() returns a generator
        # we need to wrap it in a list so we can consume it twice
        {"params": list(minlora.get_lora_params(model)) , "weight_decay": weight_decay},
        # you can also add biases for fine-tuning,
        # but I want to make sure lora alone works
        # {"params": minlora.get_bias_params(model), "weight_decay": 0.0}, # bias params don't get weight decay
    ]

    def parameter_count(optim_groups):
        n = sum(p.numel() for group in optim_groups for p in group["params"])
        if n < 1e6:
            return f"{n/1e3:.1f}k"
        else:
            return f"{n/1e6:.1f}M"

    logger.info(f"optimizing {parameter_count(optim_groups)} parameters")

    # new PyTorch nightly has a new 'fused' option for AdamW that is much faster
    use_fused = (device_type == "cuda") and ("fused" in inspect.signature(torch.optim.AdamW).parameters)
    logger.info(f"using fused AdamW: {use_fused}")
    extra_args = dict(fused=True) if use_fused else dict()
    optimizer = torch.optim.AdamW(
        optim_groups,
        lr=learning_rate,
        betas=betas,
        **extra_args
    )

    return optimizer

lora_hparams = dict(
    lora_dropout_p = 0.0,
    rank=10,
    lora_alpha = 64
)

lora_config = {
    torch.nn.Embedding: {
        "weight": partial(LoRAParametrization.from_embedding, **lora_hparams),
    },
    torch.nn.Linear: {
        "weight": partial(LoRAParametrization.from_linear, **lora_hparams),
    },
    SubMConv3d: {
        "weight": partial(LoRAParametrization.from_sparseconv3d, **lora_hparams),
    }
}

print("before LoRA:", count_trainable_parameters(model))
named_trainable_initial = named_trainable_parameters(model)


freeze_non_lora_params(model)

minlora.add_lora(model, lora_config=lora_config)
print("after LoRA:", count_trainable_parameters(model))

assert count_lora_parameters(model) == count_lora_params_manual(model)

minlora.remove_lora(model)
print("after removing lora:", count_trainable_parameters(model))

#unfreeze_all_params(model)
#print("After unfreezing:", count_trainable_parameters(model))
# if use_lora:
#     optimizer = configure_optimizers_lora(model, weight_decay, learning_rate, (beta1, beta2), device_type)
# else:
#     optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
# if init_from == 'resume':
#     optimizer.load_state_dict(checkpoint['optimizer'])


before LoRA: {'trainable': 97447088, 'frozen': 1}
after LoRA: {'trainable': 100761978, 'frozen': 1}


NameError: name 'named_lora_params' is not defined

In [None]:
named_trainable_initial["frozen"]

In [11]:
for item in minlora.get_lora_params(model):
    print(type(item))
    print(item.requires_grad)
    break

<class 'torch.nn.parameter.Parameter'>
True


In [21]:
for item in get_params_by_name(model, name_filter=name_is_lora):
    print(type(item))
    print(item.requires_grad)
    break

<class 'torch.nn.parameter.Parameter'>
True


In [27]:
for n, p in named_lora_params(model):
    print(n)
    print(p, "\n")
    break

backbone.embedding.stem.conv.parametrizations.weight.0.lora_A
Parameter containing:
tensor([[ 0.0015, -0.0013, -0.0098,  ..., -0.0012, -0.0173,  0.0280],
        [-0.0241,  0.0332,  0.0262,  ...,  0.0046, -0.0024, -0.0134],
        [-0.0098, -0.0252,  0.0011,  ...,  0.0062, -0.0120,  0.0355],
        ...,
        [ 0.0060,  0.0095,  0.0280,  ...,  0.0277, -0.0291, -0.0006],
        [-0.0261,  0.0200,  0.0330,  ..., -0.0213, -0.0310, -0.0165],
        [-0.0139, -0.0235, -0.0101,  ...,  0.0103,  0.0028, -0.0225]],
       device='cuda:0', requires_grad=True) 



In [19]:
list(get_params_by_name(model, name_filter=name_is_lora))[:5]

[Parameter containing:
 tensor([[ 0.0015, -0.0013, -0.0098,  ..., -0.0012, -0.0173,  0.0280],
         [-0.0241,  0.0332,  0.0262,  ...,  0.0046, -0.0024, -0.0134],
         [-0.0098, -0.0252,  0.0011,  ...,  0.0062, -0.0120,  0.0355],
         ...,
         [ 0.0060,  0.0095,  0.0280,  ...,  0.0277, -0.0291, -0.0006],
         [-0.0261,  0.0200,  0.0330,  ..., -0.0213, -0.0310, -0.0165],
         [-0.0139, -0.0235, -0.0101,  ...,  0.0103,  0.0028, -0.0225]],
        device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        

In [20]:
def get_params_by_name(model, print_shapes=False, name_filter=None):
    for n, p in model.named_parameters():
        if name_filter is None or name_filter(n):
            if print_shapes:
                print(n, p.shape)
            yield p


In [13]:
check_lora_trainable(model)

In [4]:
X = create_spoofed_input(device="cuda", batch_size=16)

In [6]:
weight_decay = 0.05
learning_rate = 0.005
beta1, beta2 = 0.9, 0.999#0.95
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast

optimizer = configure_optimizers_lora(
    model,
    weight_decay,
    learning_rate,
    (beta1, beta2),
    device_type
)

INFO:__main__:optimizing 3.3M parameters
INFO:__main__:using fused AdamW: True


test backward pass

In [7]:
import torch
from minlora import LoRAParametrization


def inspect_lora_gradients(model, x, num_steps=5):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    def check_grads():
        a_no_grad, b_no_grad = [], []
        a_with_grad, b_with_grad = 0, 0
        total_a, total_b = 0, 0
        trainable_params_with_grad = 0
        frozen_params = 0
        total_params = 0

        for name, param in model.named_parameters():
            total_params += param.numel()
            if not param.requires_grad:
                frozen_params += param.numel()
            elif param.grad is not None and torch.any(param.grad != 0):
                trainable_params_with_grad += param.numel()

            if 'lora_A' in name:
                total_a += 1
                if param.grad is None or torch.all(param.grad == 0):
                    a_no_grad.append(name)
                else:
                    a_with_grad += 1
            elif 'lora_B' in name:
                total_b += 1
                if param.grad is None or torch.all(param.grad == 0):
                    b_no_grad.append(name)
                else:
                    b_with_grad += 1

        return (a_with_grad, b_with_grad, total_a, total_b, a_no_grad, b_no_grad, 
                trainable_params_with_grad, frozen_params, total_params)

    # Initial forward and backward pass
    y = model(x)
    loss = y["loss"].sum()
    loss.backward()
    
    results = check_grads()
    (
        a_grad,
        b_grad,
        total_a,
        total_b,
        a_no_grad,
        b_no_grad,
        trainable_grad,
        frozen,
        total
    ) = results

    print("*** First Pass ***")
    print(f"Initial gradients: A: {a_grad}/{total_a}, B: {b_grad}/{total_b}")
    print(f"Trainable parameters with gradients: {trainable_grad:,}")
    print(f"Frozen parameters: {frozen:,}")
    print(f"Total parameters: {total:,}")
    if a_no_grad:
        print(f"Total A matrices without gradients: {len(a_no_grad)}")
    if b_no_grad:
        print(f"Total B matrices without gradients: {len(b_no_grad)}")

    # Perform several optimization steps
    for i in range(num_steps):
        optimizer.step()
        optimizer.zero_grad()
        
        y = model(x)
        loss = y["loss"].sum()
        loss.backward()
        
        results = check_grads()
        a_grad, b_grad, total_a, total_b, a_no_grad, b_no_grad, trainable_grad, frozen, total = results

        print(f"\nGradients after step {i+1}:")
        print(f"A: {a_grad}/{total_a}, B: {b_grad}/{total_b}")
        print(f"Trainable parameters with gradients: {trainable_grad:,}")
        print(f"Frozen parameters: {frozen:,}")
        print(f"Total parameters: {total:,}")
        if a_no_grad:
            print(f"A matrices without gradients: {a_no_grad}")
        if b_no_grad:
            print(f"B matrices without gradients: {b_no_grad}")
            
X = create_spoofed_input(device="cuda", batch_size=16)
inspect_lora_gradients(model, X)

*** First Pass ***
Initial gradients: A: 0/195, B: 194/195
Trainable parameters with gradients: 808,320
Frozen parameters: 97,447,089
Total parameters: 100,761,979
Total A matrices without gradients: 195
Total B matrices without gradients: 1

Gradients after step 1:
A: 194/195, B: 194/195
Trainable parameters with gradients: 3,312,300
Frozen parameters: 97,447,089
Total parameters: 100,761,979
A matrices without gradients: ['embedding_table.parametrizations.weight.0.lora_A']
B matrices without gradients: ['embedding_table.parametrizations.weight.0.lora_B']

Gradients after step 2:
A: 194/195, B: 194/195
Trainable parameters with gradients: 3,312,300
Frozen parameters: 97,447,089
Total parameters: 100,761,979
A matrices without gradients: ['embedding_table.parametrizations.weight.0.lora_A']
B matrices without gradients: ['embedding_table.parametrizations.weight.0.lora_B']

Gradients after step 3:
A: 194/195, B: 194/195
Trainable parameters with gradients: 3,312,300
Frozen parameters: 97

In [8]:
def showlora(model):
    for name, module in model.named_modules():
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.MultiheadAttention)):
            print(f"Module {name}:")
            if hasattr(module, 'parametrizations'):
                for param_name, param in module.parametrizations.items():
                    print(f"  - {param_name} LoRA parameters:")
                    for lora_name, lora_param in param.named_parameters():
                        print(f"    - {lora_name}: device = {lora_param.device}")
            elif isinstance(module, nn.MultiheadAttention):
                if hasattr(module.out_proj, 'parametrizations'):
                    for param_name, param in module.out_proj.parametrizations.items():
                        print(f"  - out_proj.{param_name} LoRA parameters:")
                        for lora_name, lora_param in param.named_parameters():
                            print(f"    - {lora_name}: device = {lora_param.device}")

showlora(model)

Module backbone.enc.enc0.block0.cpe.1:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.attn.qkv:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.attn.proj:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.mlp.0.fc1:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.mlp.0.fc2:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block1.cpe.1:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module 

In [9]:
torch.save(model, "model_minlora.pth")

RuntimeError: Serialization of parametrized modules is only supported through state_dict(). See:
https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-a-general-checkpoint-for-inference-and-or-resuming-training