# Dinov3

In [1]:
import torch 
import torch.nn.functional as F
from models.backbones import Dinov3ConvNext, Dinov3ViT, convnext_sizes, vit_sizes

convnext_ckps = {
    'tiny': './checkpoints/dinov3/convnext/dinov3_convnext_tiny_pretrain_lvd1689m-21b726bb.pth', 
    'small': './checkpoints/dinov3/convnext/dinov3_convnext_small_pretrain_lvd1689m-296db49d.pth', 
    'base': './checkpoints/dinov3/convnext/dinov3_convnext_base_pretrain_lvd1689m-801f2ba9.pth', 
    'large': './checkpoints/dinov3/convnext/dinov3_convnext_large_pretrain_lvd1689m-61fa432d.pth'}

vit_ckps = {
    'small': './checkpoints/dinov3/vit/dinov3_vits16_pretrain_lvd1689m-08c60483.pth', 
    'base': './checkpoints/dinov3/vit/dinov3_vit7b16_pretrain_lvd1689m-a955f4ea.pth', 
    'large': None}

model_size = 'small'

model = Dinov3ConvNext(
    depths=convnext_sizes[model_size]["depths"],
    dims=convnext_sizes[model_size]["dims"],    
    weights=convnext_ckps[model_size],
    )
inputs = torch.randn(3, 3, 640, 640)
feature_list = model.forward_features([inputs], [None])
output = model(inputs)
print(output.shape)
feature_list['x_norm_patchtokens'].shape

model = Dinov3ViT(
    patch_size=vit_sizes[model_size]["patch_size"],
    embed_dim=vit_sizes[model_size]["embed_dim"],
    depth=vit_sizes[model_size]["depth"],
    num_heads=vit_sizes[model_size]["num_heads"],
    ffn_ratio=vit_sizes[model_size]["ffn_ratio"],
    weights=vit_ckps[model_size],
    )
inputs = torch.randn(3, 3, 640, 640)
feature_list = model.forward_features_list([inputs], [None])

torch.Size([3, 768])


In [19]:
feature_list[1][0][0].shape

torch.Size([3, 1601, 384])

# Meta Space

In [None]:
import torch
import torch 
import torch.nn.functional as F
from models.backbones import Dinov3ViT, vit_sizes
from models.modules import MetaSpace

vit_ckps = {
    'small': './checkpoints/dinov3/vit/dinov3_vits16_pretrain_lvd1689m-08c60483.pth', 
    'base': './checkpoints/dinov3/vit/dinov3_vit7b16_pretrain_lvd1689m-a955f4ea.pth', 
    'large': None}

model_size = 'small'

# ===== Usage Example =====
model = Dinov3ViT(
    patch_size=vit_sizes[model_size]["patch_size"],
    embed_dim=vit_sizes[model_size]["embed_dim"],
    depth=vit_sizes[model_size]["depth"],
    num_heads=vit_sizes[model_size]["num_heads"],
    ffn_ratio=vit_sizes[model_size]["ffn_ratio"],
    weights=vit_ckps[model_size],
    )
print("=" * 60)
print("MetaSpace with Keypoint Features - Example")
print("=" * 60)

# Hyperparameters
batch_size = 4
num_kpts = 17  # e.g., human pose keypoints
original_size = (256, 256)

# Multi-scale feature maps (e.g., from backbone)
feature_maps = [
    torch.randn(batch_size, 256, 64, 64),   # Level 0: 1/4 scale
    torch.randn(batch_size, 512, 32, 32),   # Level 1: 1/8 scale
    torch.randn(batch_size, 1024, 16, 16),  # Level 2: 1/16 scale
]

# Keypoints in original image coordinates
keypoints = torch.rand(batch_size, num_kpts, 2) * 256  # Random [0, 256]

# Valid mask (e.g., some keypoints are occluded)
valid_mask = torch.rand(batch_size, num_kpts) > 0.2

# Initialize MetaSpace
meta_space = MetaSpace(
    original_size=original_size,
    feature_dims=[256, 512, 1024],
    num_kpts=num_kpts,
    num_heads=8,
    momentum=0.9
)

print("\n1. Initial forward pass (training mode):")
meta_space.train()
fused_features = meta_space(feature_maps, keypoints, valid_mask)

for i, feats in enumerate(fused_features):
    print(f"   Level {i}: {feats.shape}")

print("\n2. Update meta spaces:")
meta_space.update_meta_spaces()
print("   Meta spaces updated with accumulated features")

print("\n3. Check meta space statistics:")
for i, meta in enumerate(meta_space.meta_spaces):
    print(f"   Level {i} meta space: {meta.shape}")
    print(f"      Mean: {meta.mean().item():.4f}, Std: {meta.std().item():.4f}")

print("\n4. Inference mode:")
meta_space.eval()
with torch.no_grad():
    fused_features_eval = meta_space(feature_maps, keypoints)
print("   Features fused without accumulation")

print("\n" + "=" * 60)
print("주요 기능:")
print("- Multi-scale keypoint feature extraction")
print("- Gaussian pooling for robust local features")
print("- EMA-based meta feature learning")
print("- Gated attention fusion")
print("- Valid mask support for occluded keypoints")
print("=" * 60)

MetaSpace with Keypoint Features - Example

1. Initial forward pass (training mode):
   Level 0: torch.Size([4, 17, 256])
   Level 1: torch.Size([4, 17, 512])
   Level 2: torch.Size([4, 17, 1024])

2. Update meta spaces:
   Meta spaces updated with accumulated features

3. Check meta space statistics:
   Level 0 meta space: torch.Size([17, 256])
      Mean: 0.0005, Std: 0.0774
   Level 1 meta space: torch.Size([17, 512])
      Mean: -0.0006, Std: 0.0565
   Level 2 meta space: torch.Size([17, 1024])
      Mean: -0.0000, Std: 0.0411

4. Inference mode:
   Features fused without accumulation

주요 기능:
- Multi-scale keypoint feature extraction
- Gaussian pooling for robust local features
- EMA-based meta feature learning
- Gated attention fusion
- Valid mask support for occluded keypoints


# FSKD

In [None]:
class FSKD(nn.Module):
    def __init__(
            self, 
            in_channels: int, 
            out_channels: int
        ):
        super(FSKD, self).__init__()
        self.backbone = torch.hub.load("models/backbone/dinov3", 
                                       'dinov3_convnext_small', 
                                       source='local', 
                                       weights='./checkpoints/dinov3_convnext_small_pretrain_lvd1689m-296db49d.pth')
        
        self.neck = nn.Sequential()
        self.head = nn.Sequential()

    def forward_features(
            self, 
            x: torch.Tensor, 
            masks: Optional[torch.Tensor] = None
        ) -> List[Dict[str, torch.Tensor]]:
        scaled_features = self.backbone.forward_features_list([x], [masks])[1:]
        pose_feature = self.neck(scaled_features)
        result = self.head(pose_feature)
        return result
            
    def forward(self, x: torch.Tensor) -> List[Dict[str, torch.Tensor]]:
        result = self.forward_features(x)
        return result

# Autocast

In [None]:
import torch.amp as amp 
device = 'cuda' if torch.cuda.is_available() else 'cpu'
amp.autocast(device, enabled=False)

<torch.amp.autocast_mode.autocast at 0x76d12b671e10>

# DeepPose

In [None]:
import timm
import timm.models.resnet as resnet

In [51]:
from timm.models.resnet import Bottleneck
from timm.models.resnet import BasicBlock
from timm.models.helpers import load_pretrained, resolve_pretrained_cfg

resnet_args = {
    'resnet18': dict(block=BasicBlock, layers=(2, 2, 2, 2)),
    'resnet34': dict(block=BasicBlock, layers=(3, 4, 6, 3)),
    'resnet50': dict(block=Bottleneck, layers=(3, 4, 6, 3)),
    'resnet50s': dict(block=Bottleneck, layers=(3, 4, 6, 3), stem_width=64, stem_type='deep'),
    'resnet50t': dict(block=Bottleneck, layers=(3, 4, 6, 3), stem_width=32, stem_type='deep_tiered', avg_down=True),
    'resnet101': dict(block=Bottleneck, layers=(3, 4, 23, 3))
}

resnet_ckps = {
    'resnet18': './checkpoints/resnet/resnet18-5c106cde.pth',
    'resnet34': './checkpoints/resnet/resnet34-333f7ec4.pth',
    'resnet50': './checkpoints/resnet/resnet50-0676ba61.pth',
    'resnet50s': './checkpoints/resnet/resnet50s-3cf99910.pth',
    'resnet50t': './checkpoints/resnet/resnet50t-1f8793b8.pth',
    'resnet101': './checkpoints/resnet/resnet101-5d3b4d8f.pth'
}

class DeepPose(timm.models.ResNet):
    def __init__(self, num_joints=8, backbone='resnet50', checkpoint=None, download=False, **kwargs):
        model_args = resnet_args.get(backbone)
        super(DeepPose, self).__init__(**dict(model_args, **kwargs))
        features = False

        # resolve and update model pretrained config and model kwargs
        pretrained_cfg = resolve_pretrained_cfg(
            backbone,
            pretrained_cfg=None,
            pretrained_cfg_overlay=None
        )
        pretrained_cfg = pretrained_cfg.to_dict()
        
        # For classification models, check class attr, then kwargs, then default to 1k, otherwise 0 for feats
        num_classes_pretrained = 0 if features else getattr(self, 'num_classes', kwargs.get('num_classes', 1000))
        if download:
            checkpoint = load_pretrained(
                timm.models.ResNet,
                pretrained_cfg=pretrained_cfg,
                num_classes=num_classes_pretrained,
                in_chans=kwargs.get('in_chans', 3),
                filter_fn=None,
                strict=None,
                cache_dir=None,
                )
        print(checkpoint)
        if checkpoint:
            self.load_state_dict(torch.load(checkpoint))

        self.njoints = num_joints
        self.fc = nn.Linear(self.fc.in_features, self.njoints*2)

In [52]:
if __name__=='__main__':
    import torch 

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = DeepPose(8, 'resnet18', download=True)
    model = model.to(device)

    random_noise = torch.randn((32, 3, 224, 224)).to(device)
    result = model(random_noise)

    print(f'INPUT SIZE : {random_noise.shape}')
    print(f'OUTPUT SIZE : {result.shape}')

TypeError: Module.load_state_dict() missing 1 required positional argument: 'state_dict'

# FCMAE

In [None]:
import os
import sys 
sys.path.append(os.path.join(os.path.dirname(__file__), "thirdparty", "ConvNeXt"))
from thirdparty.ConvNeXt import FCMAE

ImportError: cannot import name 'MinkowskiConvolution' from 'MinkowskiEngine' (unknown location)