In [1]:
import os
import math
import inspect
import logging
from pathlib import Path
from collections import OrderedDict

import torch
import torch.optim
import torch.nn as nn
import numpy as np
import webbrowser
import graphviz
graphviz.set_jupyter_format('svg')
from lora_pytorch import LoRA
assert torch.cuda.is_available()
from torchview import draw_graph
from torchviz import make_dot
from graphviz import Digraph

from pointcept.engines.defaults import (
    default_argument_parser,
    default_config_parser,
    default_setup,
)
from pointcept.engines.test import TESTERS
from pointcept.engines.launch import launch
from pointcept.engines.test import TesterBase, SemSegTester

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

repo_root = Path("../..")


def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def create_spoofed_input(batch_size=2, num_points=1000, n_classes=5, num_features=6, device='cpu'):
    return {
        'coord': torch.rand(num_points * batch_size, num_features, device=device),
        'feat': torch.rand(num_points * batch_size, num_features, device=device),
        'grid_coord': torch.randint(0, 100, (num_points * batch_size, 3), device=device),
        'batch': torch.arange(batch_size, device=device).repeat_interleave(num_points),
        'offset': torch.tensor([num_points * i for i in range(1, batch_size + 1)], device=device),
        'condition': ['ScanNet'] * batch_size,
        'grid_size': torch.tensor([0.01], device=device),
        'segment': torch.randint(low=0, high=n_classes-1, size=(num_points * batch_size,), device=device)
    }


def patch_cfg(cfg: dict, repo_root: Path = repo_root) -> dict:
    cfg = cfg.copy()
    cfg["my_data_root"] = repo_root / cfg["my_data_root"]
    cfg["weight"] = repo_root / cfg["weight"]
    cfg["batch_size_test_per_gpu"] = 1
    return cfg


repo_root = Path("../..")
cfg_file = Path("../../test/custom-ppt-config.py"); assert cfg_file.exists
device = "cuda"

args = default_argument_parser().parse_args(args=["--config-file", f"{cfg_file}"])
cfg = default_config_parser(args.config_file, args.options); cfg = patch_cfg(cfg)

tester = TESTERS.build(dict(type=cfg.test.type, cfg=cfg))
model = tester.model
model.to(device)
print("loaded")

[2024-09-02 15:38:58,385 INFO test.py line 41 32766] => Loading config ...
[2024-09-02 15:38:58,386 INFO test.py line 48 32766] => Building model ...
[2024-09-02 15:39:00,996 INFO test.py line 61 32766] Num params: 97447088
[2024-09-02 15:39:01,197 INFO test.py line 68 32766] Loading weight at: ../../models/PointTransformerV3/scannet-semseg-pt-v3m1-1-ppt-extreme/model/model_best.pth
[2024-09-02 15:39:01,853 INFO test.py line 84 32766] => Loaded weight '../../models/PointTransformerV3/scannet-semseg-pt-v3m1-1-ppt-extreme/model/model_best.pth' (epoch 94)
[2024-09-02 15:39:01,857 INFO test.py line 53 32766] => Building test dataset & dataloader ...
[2024-09-02 15:39:01,859 INFO scannet.py line 72 32766] Totally 0 x 1 samples in val set.


DITCHING CLASS EMBEDDING
loaded


# Visualise netron

In [3]:
#torch.save(model, "model.pth")

Now install netron and open this file:

```bash
snap install netron
snap run netron
```

# LoRA

### lora-pytorch implementation

In [6]:
# lora_model = LoRA.from_module(model, rank=50)
# print("bare model: ", count_trainable_parameters(model))
# print("lora:", count_trainable_parameters(lora_model))
# torch.save(model, "model_lora.pth")

bare model:  110759388
lora: 13312300


### minlora implementation

#### for PPT+PTvt

In [2]:
from functools import partial

import minlora
from minlora import (
    LoRAParametrization,
    add_lora,
    merge_lora,
    remove_lora
)
from minlora.model import add_lora_by_name, apply_lora
from torch.optim import AdamW

from spconv.pytorch.conv import SubMConv3d

In [3]:
# optimizer
def configure_optimizers_lora(
    model,
    weight_decay: float = 0.05,
    learning_rate: float = 0.005,
    betas: tuple[float, float] = (0.9, 0.999),
    device_type: str = "cuda"
):
    # we apply weight decay to all lora params
    optim_groups = [
        # note: .get_lora_params() returns a generator
        # we need to wrap it in a list so we can consume it twice
        {"params": list(minlora.get_lora_params(model)) , "weight_decay": weight_decay},
        # you can also add biases for fine-tuning,
        # but I want to make sure lora alone works
        # {"params": minlora.get_bias_params(model), "weight_decay": 0.0}, # bias params don't get weight decay
    ]

    def parameter_count(optim_groups):
        n = sum(p.numel() for group in optim_groups for p in group["params"])
        if n < 1e6:
            return f"{n/1e3:.1f}k"
        else:
            return f"{n/1e6:.1f}M"

    logger.info(f"optimizing {parameter_count(optim_groups)} parameters")

    # new PyTorch nightly has a new 'fused' option for AdamW that is much faster
    use_fused = (device_type == "cuda") and ("fused" in inspect.signature(torch.optim.AdamW).parameters)
    logger.info(f"using fused AdamW: {use_fused}")
    extra_args = dict(fused=True) if use_fused else dict()
    optimizer = torch.optim.AdamW(
        optim_groups,
        lr=learning_rate,
        betas=betas,
        **extra_args
    )

    return optimizer

lora_hparams = dict(
    lora_dropout_p = 0.0,
    rank=10,
    lora_alpha = 64
)

lora_config = {
    torch.nn.Embedding: {
        "weight": partial(LoRAParametrization.from_embedding, **lora_hparams),
    },
    torch.nn.Linear: {
        "weight": partial(LoRAParametrization.from_linear, **lora_hparams),
    },
    SubMConv3d: {
        "weight": partial(LoRAParametrization.from_sparseconv3d, **lora_hparams),
    }
}

print("before LoRA:", count_trainable_parameters(model))

def freeze_non_lora_params(model):
    for name, param in model.named_parameters():
        if 'lora' not in name:
            param.requires_grad = False

def unfreeze_all_params(model):
    for name, param in model.named_parameters():
        param.requires_grad = True

freeze_non_lora_params(model)

minlora.add_lora(model, lora_config=lora_config)
print("after LoRA:", count_trainable_parameters(model))
# if use_lora:
#     optimizer = configure_optimizers_lora(model, weight_decay, learning_rate, (beta1, beta2), device_type)
# else:
#     optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
# if init_from == 'resume':
#     optimizer.load_state_dict(checkpoint['optimizer'])


before LoRA: 97447088
after LoRA: 3314890


In [4]:
X = create_spoofed_input(device="cuda", batch_size=16)

In [6]:
weight_decay = 0.05
learning_rate = 0.005
beta1, beta2 = 0.9, 0.999#0.95
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast

optimizer = configure_optimizers_lora(
    model,
    weight_decay,
    learning_rate,
    (beta1, beta2),
    device_type
)

INFO:__main__:optimizing 3.3M parameters
INFO:__main__:using fused AdamW: True


test backward pass

In [7]:
import torch
from minlora import LoRAParametrization


def inspect_lora_gradients(model, x, num_steps=5):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    def check_grads():
        a_no_grad, b_no_grad = [], []
        a_with_grad, b_with_grad = 0, 0
        total_a, total_b = 0, 0
        trainable_params_with_grad = 0
        frozen_params = 0
        total_params = 0

        for name, param in model.named_parameters():
            total_params += param.numel()
            if not param.requires_grad:
                frozen_params += param.numel()
            elif param.grad is not None and torch.any(param.grad != 0):
                trainable_params_with_grad += param.numel()

            if 'lora_A' in name:
                total_a += 1
                if param.grad is None or torch.all(param.grad == 0):
                    a_no_grad.append(name)
                else:
                    a_with_grad += 1
            elif 'lora_B' in name:
                total_b += 1
                if param.grad is None or torch.all(param.grad == 0):
                    b_no_grad.append(name)
                else:
                    b_with_grad += 1

        return (a_with_grad, b_with_grad, total_a, total_b, a_no_grad, b_no_grad, 
                trainable_params_with_grad, frozen_params, total_params)

    # Initial forward and backward pass
    y = model(x)
    loss = y["loss"].sum()
    loss.backward()
    
    results = check_grads()
    (
        a_grad,
        b_grad,
        total_a,
        total_b,
        a_no_grad,
        b_no_grad,
        trainable_grad,
        frozen,
        total
    ) = results

    print("*** First Pass ***")
    print(f"Initial gradients: A: {a_grad}/{total_a}, B: {b_grad}/{total_b}")
    print(f"Trainable parameters with gradients: {trainable_grad:,}")
    print(f"Frozen parameters: {frozen:,}")
    print(f"Total parameters: {total:,}")
    if a_no_grad:
        print(f"Total A matrices without gradients: {len(a_no_grad)}")
    if b_no_grad:
        print(f"Total B matrices without gradients: {len(b_no_grad)}")

    # Perform several optimization steps
    for i in range(num_steps):
        optimizer.step()
        optimizer.zero_grad()
        
        y = model(x)
        loss = y["loss"].sum()
        loss.backward()
        
        results = check_grads()
        a_grad, b_grad, total_a, total_b, a_no_grad, b_no_grad, trainable_grad, frozen, total = results

        print(f"\nGradients after step {i+1}:")
        print(f"A: {a_grad}/{total_a}, B: {b_grad}/{total_b}")
        print(f"Trainable parameters with gradients: {trainable_grad:,}")
        print(f"Frozen parameters: {frozen:,}")
        print(f"Total parameters: {total:,}")
        if a_no_grad:
            print(f"A matrices without gradients: {a_no_grad}")
        if b_no_grad:
            print(f"B matrices without gradients: {b_no_grad}")
            
X = create_spoofed_input(device="cuda", batch_size=16)
inspect_lora_gradients(model, X)

*** First Pass ***
Initial gradients: A: 0/195, B: 194/195
Trainable parameters with gradients: 808,320
Frozen parameters: 97,447,089
Total parameters: 100,761,979
Total A matrices without gradients: 195
Total B matrices without gradients: 1

Gradients after step 1:
A: 194/195, B: 194/195
Trainable parameters with gradients: 3,312,300
Frozen parameters: 97,447,089
Total parameters: 100,761,979
A matrices without gradients: ['embedding_table.parametrizations.weight.0.lora_A']
B matrices without gradients: ['embedding_table.parametrizations.weight.0.lora_B']

Gradients after step 2:
A: 194/195, B: 194/195
Trainable parameters with gradients: 3,312,300
Frozen parameters: 97,447,089
Total parameters: 100,761,979
A matrices without gradients: ['embedding_table.parametrizations.weight.0.lora_A']
B matrices without gradients: ['embedding_table.parametrizations.weight.0.lora_B']

Gradients after step 3:
A: 194/195, B: 194/195
Trainable parameters with gradients: 3,312,300
Frozen parameters: 97

In [10]:
def inspect_lora_params(model):
    lora_param_count = 0
    non_lora_param_count = 0
    lora_param_sizes = {}
    
    for name, param in model.named_parameters():
        param_size = param.numel()
        if 'lora_A' in name or 'lora_B' in name:
            lora_param_count += param_size
            lora_param_sizes[name] = param_size
        else:
            non_lora_param_count += param_size
    
    logger.debug("LoRA parameters:")
    for name, size in lora_param_sizes.items():
        logger.debug(f"  {name}: {size:,} elements")
    
    logger.info(f"\nTotal LoRA parameters: {lora_param_count:,}")
    logger.info(f"Total non-LoRA parameters: {non_lora_param_count:,}")
    logger.info(f"Total parameters: {lora_param_count + non_lora_param_count:,}")
    
    lora_percentage = (lora_param_count / (lora_param_count + non_lora_param_count)) * 100
    logger.info(f"Percentage of LoRA parameters: {lora_percentage:.2f}%")
    
    return lora_param_count, non_lora_param_count

# Usage
lora_params, non_lora_params = inspect_lora_params(model)

INFO:__main__:
Total LoRA parameters: 3,314,890
INFO:__main__:Total non-LoRA parameters: 97,447,089
INFO:__main__:Total parameters: 100,761,979
INFO:__main__:Percentage of LoRA parameters: 3.29%


In [7]:
loss = model(X)

loss["loss"].backward()
optimizer.step()
#optimizer.zero_grad(set_to_none=True)

torch.Size([1, 256])=
feat.shape=torch.Size([16000, 512])
self.class_embedding.shape=torch.Size([13, 512])
sim.shape=torch.Size([16000, 13])


In [17]:
from minlora import LoRAParametrization

def test_lora_gradients(model, X, num_steps=5):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    def check_gradients():
        for name, param in model.named_parameters():
            if isinstance(param, LoRAParametrization):
                print(f"{name}:")
                print("  A grad:", param.lora_A.grad)
                print("  B grad:", param.lora_B.grad)

    # Initial forward and backward pass
    y = model(X)
    loss = y["loss"].sum()
    loss.backward()
    
    print("Initial gradients:")
    check_gradients()

    # Perform several optimization steps
    for i in range(num_steps):
        optimizer.step()
        optimizer.zero_grad()
        
        y = model(x)
        loss = y["loss"].sum()
        loss.backward()
        
        print(f"\nGradients after step {i+1}:")
        check_gradients()

test_lora_gradients(model, X)

torch.Size([1, 256])=


OutOfMemoryError: CUDA out of memory. Tried to allocate 46.00 MiB. GPU 0 has a total capacity of 23.55 GiB of which 62.38 MiB is free. Including non-PyTorch memory, this process has 22.06 GiB memory in use. Of the allocated memory 21.17 GiB is allocated by PyTorch, and 435.82 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [11]:
# def inspect_lora_params_and_gradients(model, X):
#     # Forward and backward pass
#     loss = model(X)
#     loss["loss"].backward()
    
#     lora_param_count = 0
#     lora_param_with_grad_count = 0
#     non_lora_param_count = 0
#     non_lora_param_with_grad_count = 0
    
#     for name, param in model.named_parameters():
#         param_size = param.numel()
#         is_lora = 'lora_A' in name or 'lora_B' in name
        
#         if is_lora:
#             lora_param_count += param_size
#             if param.grad is not None and param.grad.abs().sum().item() > 0:
#                 lora_param_with_grad_count += param_size
#         else:
#             non_lora_param_count += param_size
#             if param.grad is not None and param.grad.abs().sum().item() > 0:
#                 non_lora_param_with_grad_count += param_size
        
#         if is_lora:
#             grad_status = "with grad" if param.grad is not None and param.grad.abs().sum().item() > 0 else "without grad"
#             print(f"LoRA parameter: {name} ({param_size:,} elements) - {grad_status}")
    
#     print(f"\nTotal LoRA parameters: {lora_param_count:,}")
#     print(f"LoRA parameters with gradients: {lora_param_with_grad_count:,}")
#     print(f"Total non-LoRA parameters: {non_lora_param_count:,}")
#     print(f"Non-LoRA parameters with gradients: {non_lora_param_with_grad_count:,}")
#     print(f"Total parameters: {lora_param_count + non_lora_param_count:,}")
    
#     lora_percentage = (lora_param_count / (lora_param_count + non_lora_param_count)) * 100
#     print(f"Percentage of LoRA parameters: {lora_percentage:.2f}%")
    
#     # Optional: zero out gradients
#     model.zero_grad()
    
#     return lora_param_count, lora_param_with_grad_count, non_lora_param_count, non_lora_param_with_grad_count

# # Usage
# lora_params, lora_grads, non_lora_params, non_lora_grads = inspect_lora_params_and_gradients(model, X)

torch.Size([1, 256])=
feat.shape=torch.Size([16000, 512])
self.class_embedding.shape=torch.Size([13, 512])
sim.shape=torch.Size([16000, 13])
LoRA parameter: backbone.embedding.stem.conv.parametrizations.weight.0.lora_A (7,500 elements) - without grad
LoRA parameter: backbone.embedding.stem.conv.parametrizations.weight.0.lora_B (480 elements) - with grad
LoRA parameter: backbone.enc.enc0.block0.cpe.0.parametrizations.weight.0.lora_A (12,960 elements) - without grad
LoRA parameter: backbone.enc.enc0.block0.cpe.0.parametrizations.weight.0.lora_B (480 elements) - with grad
LoRA parameter: backbone.enc.enc0.block0.cpe.1.parametrizations.weight.0.lora_A (480 elements) - without grad
LoRA parameter: backbone.enc.enc0.block0.cpe.1.parametrizations.weight.0.lora_B (480 elements) - with grad
LoRA parameter: backbone.enc.enc0.block0.attn.qkv.parametrizations.weight.0.lora_A (480 elements) - without grad
LoRA parameter: backbone.enc.enc0.block0.attn.qkv.parametrizations.weight.0.lora_B (1,440 elem

In [8]:
def showlora(model):
    for name, module in model.named_modules():
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.MultiheadAttention)):
            print(f"Module {name}:")
            if hasattr(module, 'parametrizations'):
                for param_name, param in module.parametrizations.items():
                    print(f"  - {param_name} LoRA parameters:")
                    for lora_name, lora_param in param.named_parameters():
                        print(f"    - {lora_name}: device = {lora_param.device}")
            elif isinstance(module, nn.MultiheadAttention):
                if hasattr(module.out_proj, 'parametrizations'):
                    for param_name, param in module.out_proj.parametrizations.items():
                        print(f"  - out_proj.{param_name} LoRA parameters:")
                        for lora_name, lora_param in param.named_parameters():
                            print(f"    - {lora_name}: device = {lora_param.device}")

showlora(model)

Module backbone.enc.enc0.block0.cpe.1:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.attn.qkv:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.attn.proj:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.mlp.0.fc1:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.mlp.0.fc2:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block1.cpe.1:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module 

In [9]:
torch.save(model, "model_minlora.pth")

RuntimeError: Serialization of parametrized modules is only supported through state_dict(). See:
https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-a-general-checkpoint-for-inference-and-or-resuming-training

### custom implementation (claude, unchecked but it runs lol)

In [3]:
class LoRALayer(nn.Module):
    def __init__(self, in_features, out_features, rank=4):
        super().__init__()
        self.lora_A = nn.Parameter(torch.zeros(rank, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, rank))
        self.scale = 0.01
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        return (x @ self.lora_A.T @ self.lora_B.T) * self.scale

class AdaptiveLoRAWrapper(nn.Module):
    def __init__(self, base_layer, rank=4):
        super().__init__()
        self.base_layer = base_layer
        if hasattr(base_layer, 'weight'):
            weight = base_layer.weight
            in_features, out_features = weight.shape[1], weight.shape[0]
        elif hasattr(base_layer, 'in_features') and hasattr(base_layer, 'out_features'):
            in_features, out_features = base_layer.in_features, base_layer.out_features
        else:
            raise ValueError(f"Unable to determine in_features and out_features for {type(base_layer)}")
        self.lora = LoRALayer(in_features, out_features, rank)

    def forward(self, x):
        return self.base_layer(x) + self.lora(x)

def get_in_out_features(layer):
    if hasattr(layer, 'in_features') and hasattr(layer, 'out_features'):
        return layer.in_features, layer.out_features
    elif hasattr(layer, 'weight'):
        return layer.weight.shape[1], layer.weight.shape[0]
    else:
        raise ValueError(f"Unable to determine in_features and out_features for {type(layer)}")

class LoRAQKV(nn.Module):
    def __init__(self, qkv_layer, rank=4):
        super().__init__()
        self.qkv_layer = qkv_layer
        in_features, out_features = get_in_out_features(qkv_layer)
        self.lora = LoRALayer(in_features, out_features, rank)

    def forward(self, x):
        return self.qkv_layer(x) + self.lora(x)

def apply_lora_to_ptv3(model, rank=4):
    for name, module in model.named_modules():
        if isinstance(module, SerializedAttention):
            module.qkv = LoRAQKV(module.qkv, rank)
            module.proj = AdaptiveLoRAWrapper(module.proj, rank)
        elif isinstance(module, MLP):
            module.fc1 = AdaptiveLoRAWrapper(module.fc1, rank)
            module.fc2 = AdaptiveLoRAWrapper(module.fc2, rank)

def apply_lora_to_ppt(model, rank=4):
    # Apply LoRA to PT-v3 backbone
    apply_lora_to_ptv3(model.backbone, rank)
    
    # Apply LoRA to the projection head
    model.proj_head = AdaptiveLoRAWrapper(model.proj_head, rank)

    def freeze_non_lora_params(model):
        for name, param in model.named_parameters():
            if 'lora' not in name:
                param.requires_grad = False

    freeze_non_lora_params(model)
    return model

# Usage:
# ppt_model = PointPromptTraining(...)
# ppt_model_with_lora = apply_lora_to_ppt(ppt_model)

In [8]:
ppt_model_with_lora = apply_lora_to_ppt(model) 

In [7]:
from pointcept.models.point_transformer_v3 import SerializedAttention, MLP

In [10]:
count_trainable_parameters(ppt_model_with_lora)

453888

In [5]:
count_trainable_parameters(model)

97979580