In [1]:
import os
import math
import inspect
from pathlib import Path
from collections import OrderedDict

import torch
import torch.optim
import torch.nn as nn
import numpy as np
import webbrowser
import graphviz
graphviz.set_jupyter_format('svg')
from lora_pytorch import LoRA
assert torch.cuda.is_available()
from torchview import draw_graph
from torchviz import make_dot
from graphviz import Digraph

from pointcept.engines.defaults import (
    default_argument_parser,
    default_config_parser,
    default_setup,
)
from pointcept.engines.test import TESTERS
from pointcept.engines.launch import launch
from pointcept.engines.test import TesterBase, SemSegTester

repo_root = Path("../..")


def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def create_spoofed_input(batch_size=2, num_points=1000, n_classes=5, num_features=6, device='cpu'):
    return {
        'coord': torch.rand(num_points * batch_size, num_features, device=device),
        'feat': torch.rand(num_points * batch_size, num_features, device=device),
        'grid_coord': torch.randint(0, 100, (num_points * batch_size, 3), device=device),
        'batch': torch.arange(batch_size, device=device).repeat_interleave(num_points),
        'offset': torch.tensor([num_points * i for i in range(1, batch_size + 1)], device=device),
        'condition': ['ScanNet'] * batch_size,
        'grid_size': torch.tensor([0.01], device=device),
        'segment': torch.randint(low=0, high=n_classes-1, size=(num_points * batch_size,), device=device)
    }


def patch_cfg(cfg: dict, repo_root: Path = repo_root) -> dict:
    cfg = cfg.copy()
    cfg["my_data_root"] = repo_root / cfg["my_data_root"]
    cfg["weight"] = repo_root / cfg["weight"]
    cfg["batch_size_test_per_gpu"] = 1
    return cfg


repo_root = Path("../..")
cfg_file = Path("../../test/custom-ppt-config.py"); assert cfg_file.exists
device = "cuda"

args = default_argument_parser().parse_args(args=["--config-file", f"{cfg_file}"])
cfg = default_config_parser(args.config_file, args.options); cfg = patch_cfg(cfg)

tester = TESTERS.build(dict(type=cfg.test.type, cfg=cfg))
model = tester.model
model.to(device)
print("loaded")

[2024-08-27 17:32:46,017 INFO test.py line 41 33468] => Loading config ...
[2024-08-27 17:32:46,018 INFO test.py line 48 33468] => Building model ...


proj_head shape says Linear(in_features=64, out_features=512, bias=True)


[2024-08-27 17:32:49,630 INFO test.py line 61 33468] Num params: 97447088
[2024-08-27 17:32:49,862 INFO test.py line 68 33468] Loading weight at: ../../models/PointTransformerV3/scannet-semseg-pt-v3m1-1-ppt-extreme/model/model_best.pth
[2024-08-27 17:32:51,405 INFO test.py line 84 33468] => Loaded weight '../../models/PointTransformerV3/scannet-semseg-pt-v3m1-1-ppt-extreme/model/model_best.pth' (epoch 94)
[2024-08-27 17:32:51,409 INFO test.py line 53 33468] => Building test dataset & dataloader ...
[2024-08-27 17:32:51,411 INFO scannet.py line 72 33468] Totally 0 x 1 samples in val set.


DITCHING CLASS EMBEDDING
loaded


In [6]:
for m in model.named_modules():
    print(len(m))
    print(m[0])
    break

2


IndexError: tuple index out of range

# Visualise netron

In [3]:
torch.save(model, "model.pth")

Now install netron and open this file:

```bash
snap install netron
snap run netron
```

# LoRA

### lora-pytorch implementation

In [6]:
# lora_model = LoRA.from_module(model, rank=50)
# print("bare model: ", count_trainable_parameters(model))
# print("lora:", count_trainable_parameters(lora_model))
# torch.save(model, "model_lora.pth")

bare model:  110759388
lora: 13312300


### minlora implementation

In [26]:
import torch
import torch.nn as nn
from minlora import add_lora, LoRAParametrization

# Example custom architecture
class CustomBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.attention = nn.MultiheadAttention(dim, 8)
        self.ffn = nn.Sequential(
            nn.Linear(dim, 4*dim),
            nn.GELU(),
            nn.Linear(4*dim, dim)
        )
        self.conv = nn.Conv2d(dim, dim, 3, padding=1)
    
    def forward(self, x):
        x = self.attention(x, x, x)[0] + x
        x = self.ffn(x) + x
        x = x.permute(0, 2, 1).unsqueeze(-1)  # reshape for 2D conv
        x = self.conv(x).squeeze(-1).permute(0, 2, 1)
        return x

class CustomModel(nn.Module):
    def __init__(self, dim, num_blocks):
        super().__init__()
        self.blocks = nn.ModuleList([CustomBlock(dim) for _ in range(num_blocks)])
        self.final_layer = nn.Linear(dim, dim)
    
    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return self.final_layer(x)

# Custom LoRA configuration
custom_lora_config = {
    nn.Linear: {
        "weight": LoRAParametrization.from_linear
    },
    # nn.MultiheadAttention: {
    #     "out_proj.weight": LoRAParametrization.from_linear
    # },
    nn.Conv2d: {
        "weight": LoRAParametrization.from_conv2d
    }
}

# Create and apply LoRA
model = CustomModel(dim=256, num_blocks=3)
add_lora(model, lora_config=custom_lora_config)

# Verify LoRA application
for name, module in model.named_modules():
    if isinstance(module, (nn.Linear, nn.Conv2d)) or (isinstance(module, nn.MultiheadAttention) and 'weight' in name):
        print(f"LoRA applied to {name}: {hasattr(module, 'parametrizations')}")

# # Training loop (pseudo-code)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# for epoch in range(num_epochs):
#     for batch in dataloader:
#         optimizer.zero_grad()
#         loss = criterion(model(batch), targets)
#         loss.backward()
#         optimizer.step()

LoRA applied to blocks.0.attention.out_proj: False
LoRA applied to blocks.0.ffn.0: True
LoRA applied to blocks.0.ffn.2: True
LoRA applied to blocks.0.conv: True
LoRA applied to blocks.1.attention.out_proj: False
LoRA applied to blocks.1.ffn.0: True
LoRA applied to blocks.1.ffn.2: True
LoRA applied to blocks.1.conv: True
LoRA applied to blocks.2.attention.out_proj: False
LoRA applied to blocks.2.ffn.0: True
LoRA applied to blocks.2.ffn.2: True
LoRA applied to blocks.2.conv: True
LoRA applied to final_layer: True


#### for PPT+PTvt

In [2]:
from functools import partial

import minlora
from minlora import (
    LoRAParametrization,
    add_lora,
    merge_lora,
    remove_lora
)
from minlora.model import add_lora_by_name, apply_lora


from spconv.pytorch.conv import SubMConv3d

In [3]:
# optimizer
def configure_optimizers_lora(self, weight_decay, learning_rate, betas, device_type):
    # we apply weight decay to all lora params
    optim_groups = [
        # note: .get_lora_params() returns a generator
        # we need to wrap it in a list so we can consume it twice
        {"params": list(minlora.get_lora_params(self)) , "weight_decay": weight_decay},
        # you can also add biases for fine-tuning,
        # but I want to make sure lora alone works
        # {"params": minlora.get_bias_params(self), "weight_decay": 0.0}, # bias params don't get weight decay
    ]

    def parameter_count(optim_groups):
        n = sum(p.numel() for group in optim_groups for p in group["params"])
        if n < 1e6:
            return f"{n/1e3:.1f}k"
        else:
            return f"{n/1e6:.1f}M"

    print(f"optimizing {parameter_count(optim_groups)} parameters")

    # new PyTorch nightly has a new 'fused' option for AdamW that is much faster
    use_fused = (device_type == "cuda") and ("fused" in inspect.signature(torch.optim.AdamW).parameters)
    print(f"using fused AdamW: {use_fused}")
    extra_args = dict(fused=True) if use_fused else dict()
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)

    return optimizer

lora_hparams = dict(
    lora_dropout_p = 0.0,
    rank=10,
    lora_alpha = 64
)

lora_config = {
    torch.nn.Embedding: {
        "weight": partial(LoRAParametrization.from_embedding, **lora_hparams),
    },
    torch.nn.Linear: {
        "weight": partial(LoRAParametrization.from_linear, **lora_hparams),
    },
    SubMConv3d: {
        "weight": partial(LoRAParametrization.from_sparseconv3d, **lora_hparams),
    }
}

print("before LoRA:", count_trainable_parameters(model))

def freeze_non_lora_params(model):
    for name, param in model.named_parameters():
        if True:#'lora' not in name:
            param.requires_grad = False

freeze_non_lora_params(model)

minlora.add_lora(model, lora_config=lora_config)
print("after LoRA:", count_trainable_parameters(model))
# if use_lora:
#     optimizer = configure_optimizers_lora(model, weight_decay, learning_rate, (beta1, beta2), device_type)
# else:
#     optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
# if init_from == 'resume':
#     optimizer.load_state_dict(checkpoint['optimizer'])


before LoRA: 97447088
after LoRA: 3314890


In [4]:
X = create_spoofed_input(device="cuda", batch_size=16)

In [5]:
model(X)

torch.Size([1, 256])=
feat.shape=torch.Size([16000, 512])
self.class_embedding.shape=torch.Size([13, 512])
sim.shape=torch.Size([16000, 13])


{'loss': tensor(4.7081, device='cuda:0', grad_fn=<AddBackward0>)}

In [6]:
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
learning_rate = 6e-4 # max learning rate
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast

optimizer = configure_optimizers_lora(model, weight_decay, learning_rate, (beta1, beta2), device_type)

optimizing 3.3M parameters
using fused AdamW: True


test backward pass

In [7]:
loss = model(X)

loss["loss"].backward()
optimizer.step()
#optimizer.zero_grad(set_to_none=True)

torch.Size([1, 256])=
feat.shape=torch.Size([16000, 512])
self.class_embedding.shape=torch.Size([13, 512])
sim.shape=torch.Size([16000, 13])


In [8]:
def showlora(model):
    for name, module in model.named_modules():
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.MultiheadAttention)):
            print(f"Module {name}:")
            if hasattr(module, 'parametrizations'):
                for param_name, param in module.parametrizations.items():
                    print(f"  - {param_name} LoRA parameters:")
                    for lora_name, lora_param in param.named_parameters():
                        print(f"    - {lora_name}: device = {lora_param.device}")
            elif isinstance(module, nn.MultiheadAttention):
                if hasattr(module.out_proj, 'parametrizations'):
                    for param_name, param in module.out_proj.parametrizations.items():
                        print(f"  - out_proj.{param_name} LoRA parameters:")
                        for lora_name, lora_param in param.named_parameters():
                            print(f"    - {lora_name}: device = {lora_param.device}")

showlora(model)

Module backbone.enc.enc0.block0.cpe.1:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.attn.qkv:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.attn.proj:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.mlp.0.fc1:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block0.mlp.0.fc2:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module backbone.enc.enc0.block1.cpe.1:
  - weight LoRA parameters:
    - original: device = cuda:0
    - 0.lora_A: device = cuda:0
    - 0.lora_B: device = cuda:0
Module 

In [9]:
torch.save(model, "model_minlora.pth")

RuntimeError: Serialization of parametrized modules is only supported through state_dict(). See:
https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-a-general-checkpoint-for-inference-and-or-resuming-training

### custom implementation (claude, unchecked but it runs lol)

In [3]:
class LoRALayer(nn.Module):
    def __init__(self, in_features, out_features, rank=4):
        super().__init__()
        self.lora_A = nn.Parameter(torch.zeros(rank, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, rank))
        self.scale = 0.01
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        return (x @ self.lora_A.T @ self.lora_B.T) * self.scale

class AdaptiveLoRAWrapper(nn.Module):
    def __init__(self, base_layer, rank=4):
        super().__init__()
        self.base_layer = base_layer
        if hasattr(base_layer, 'weight'):
            weight = base_layer.weight
            in_features, out_features = weight.shape[1], weight.shape[0]
        elif hasattr(base_layer, 'in_features') and hasattr(base_layer, 'out_features'):
            in_features, out_features = base_layer.in_features, base_layer.out_features
        else:
            raise ValueError(f"Unable to determine in_features and out_features for {type(base_layer)}")
        self.lora = LoRALayer(in_features, out_features, rank)

    def forward(self, x):
        return self.base_layer(x) + self.lora(x)

def get_in_out_features(layer):
    if hasattr(layer, 'in_features') and hasattr(layer, 'out_features'):
        return layer.in_features, layer.out_features
    elif hasattr(layer, 'weight'):
        return layer.weight.shape[1], layer.weight.shape[0]
    else:
        raise ValueError(f"Unable to determine in_features and out_features for {type(layer)}")

class LoRAQKV(nn.Module):
    def __init__(self, qkv_layer, rank=4):
        super().__init__()
        self.qkv_layer = qkv_layer
        in_features, out_features = get_in_out_features(qkv_layer)
        self.lora = LoRALayer(in_features, out_features, rank)

    def forward(self, x):
        return self.qkv_layer(x) + self.lora(x)

def apply_lora_to_ptv3(model, rank=4):
    for name, module in model.named_modules():
        if isinstance(module, SerializedAttention):
            module.qkv = LoRAQKV(module.qkv, rank)
            module.proj = AdaptiveLoRAWrapper(module.proj, rank)
        elif isinstance(module, MLP):
            module.fc1 = AdaptiveLoRAWrapper(module.fc1, rank)
            module.fc2 = AdaptiveLoRAWrapper(module.fc2, rank)

def apply_lora_to_ppt(model, rank=4):
    # Apply LoRA to PT-v3 backbone
    apply_lora_to_ptv3(model.backbone, rank)
    
    # Apply LoRA to the projection head
    model.proj_head = AdaptiveLoRAWrapper(model.proj_head, rank)

    def freeze_non_lora_params(model):
        for name, param in model.named_parameters():
            if 'lora' not in name:
                param.requires_grad = False

    freeze_non_lora_params(model)
    return model

# Usage:
# ppt_model = PointPromptTraining(...)
# ppt_model_with_lora = apply_lora_to_ppt(ppt_model)

In [8]:
ppt_model_with_lora = apply_lora_to_ppt(model) 

In [7]:
from pointcept.models.point_transformer_v3 import SerializedAttention, MLP

In [10]:
count_trainable_parameters(ppt_model_with_lora)

453888

In [5]:
count_trainable_parameters(model)

97979580