In [2]:
from models.dinov2.dinov2 import DINOv2
import cv2
import torch
import numpy as np
import argparse
import sys, os
from torch import nn

DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

In [3]:
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}

In [4]:
encoder='vits'

intermediate_layer_idx = {
'vits': [2, 5, 8, 11],
'vitb': [2, 5, 8, 11], 
'vitl': [4, 11, 17, 23], 
'vitg': [9, 19, 29, 39]
}

dino_backbone = DINOv2(model_name=encoder).to(DEVICE)

In [5]:
# image = np.random.rand(518, 518, 3)
image = np.random.rand(490, 644, 3)

image = image.astype('float32')/255
images_arr = np.expand_dims(image, axis=0)
print(images_arr.shape)
input_tensor = torch.Tensor(np.transpose(images_arr, [0, 3, 2, 1])).to(DEVICE)



(1, 490, 644, 3)


In [7]:
result = dino_backbone.forward_features(input_tensor)
dino_features = result['x_norm_patchtokens'].permute(0, 2, 1).cpu().detach().numpy()
print("x_norm_clstoken: ", result['x_norm_clstoken'].cpu().detach().numpy().shape)
print("x_norm_regtokens: ", result['x_norm_regtokens'].cpu().detach().numpy().shape)
print("x_norm_patchtokens: ", result['x_norm_patchtokens'].cpu().detach().numpy().shape)
print("x_prenorm: ", result['x_prenorm'].cpu().detach().numpy().shape)

ret = dino_backbone(input_tensor)
print("ret: ", ret)
print("dino_features: ", dino_features.shape)


x_norm_clstoken:  (1, 384)
x_norm_regtokens:  (1, 0, 384)
x_norm_patchtokens:  (1, 1610, 384)
x_prenorm:  (1, 1611, 384)
ret:  {'x_norm_clstoken': tensor([[ 0.4053, -2.4967, -0.3243, -1.8044, -0.6511,  0.0798,  2.2217, -0.6900,
         -0.4979, -0.0444, -0.8815, -0.4863, -0.2996,  0.1270,  1.2281, -1.9720,
         -0.8986,  0.7999,  0.1510, -1.3546, -1.0754, -1.2510, -0.8343, -0.8719,
         -0.7795, -0.1154,  0.0317,  0.1778,  1.1575,  0.1968,  1.1554,  0.5081,
         -0.3818, -0.0678, -1.6300,  0.9168,  0.9229,  0.0439, -0.5499,  1.2073,
          0.7164, -0.4153,  1.6352, -0.6176,  0.9814, -0.1579,  0.3655,  0.0869,
         -1.3129,  0.2041, -0.8349, -0.1806,  0.9245, -0.0254, -0.7807,  0.0168,
         -0.6424, -1.5370,  1.5536,  0.1421,  0.4377,  0.3102, -0.2822,  0.4147,
         -2.7648,  0.1212,  0.2114,  1.9111,  1.5413,  0.0321, -1.6190,  0.8200,
         -0.1481,  0.8727,  0.5020, -0.4007,  1.3255,  0.4614,  0.7335, -0.0741,
          0.7543, -0.9212,  0.9231,  0.6398

In [1]:
# CVAE backbone

In [8]:
from backbone import build_backbone, Backbone, Joiner
from position_encoding import build_position_encoding

In [9]:

args = argparse.Namespace(hidden_dim=384, position_embedding='sine')

print(args.hidden_dim)  # Output: 384

384


In [10]:
position_embedding = build_position_encoding(args)
position_embedding

PositionEmbeddingSine()

In [15]:
x_np = np.random.rand(8, 512, 15, 20)
x_np = np.random.rand(8, 384, 1369)

x_tensor = torch.Tensor(x_np).to(DEVICE)
x_tensor.shape



torch.Size([8, 384, 1369])

In [16]:
pe = position_embedding(x_tensor)
pe.shape


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [22]:
net = Joiner(dino_backbone, position_embedding)

In [None]:
train_backbone = False
return_interm_layers = False
dilation = False
backbone = Backbone('resnet18', train_backbone, return_interm_layers, dilation).to(DEVICE) 
backbone