In [19]:
import argparse
import yaml
from easydict import EasyDict
from diffmot import DiffMOT

class Args:
    """Hard-coded run options exposed as attributes.

    Every attribute set in ``__init__`` is later copied into the loaded YAML
    config, replacing any key of the same name.
    """

    def __init__(self):
        self.config = 'configs/dancetrack.yaml'
        self.dataset = 'dancetrack'
        self.skip_connection = False
        self.network = 'New_ReUNet'
        self.filters = [16, 32, 64, 128, 256]

    def items(self):
        """Return the ``(attribute name, value)`` pairs of this instance."""
        return vars(self).items()


args = Args()

# Open the file named by args.config rather than a second hard-coded literal,
# so the YAML that is loaded and the exp_name derived below stay in sync.
with open(args.config) as f:
    config = yaml.safe_load(f)

# Overlay every Args attribute on the loaded config; this already includes
# `dataset`, so no separate assignment is needed for it.
config.update(args.items())

# exp_name: last path component of the config path, cut at its first dot.
config["exp_name"] = args.config.split("/")[-1].split(".")[0]
config = EasyDict(config)

In [20]:
from models.autoencoder import D2MP
from models.condition_embedding import History_motion_embedding

# Instantiate the history-motion embedding and pass it to D2MP as its encoder.
encoder = History_motion_embedding()
model = D2MP(config, encoder=encoder)

# Sum the element counts of all parameters that have requires_grad set.
params = sum(
    weight.numel()
    for weight in model.parameters()
    if weight.requires_grad
)

# Report the build and the network selected in the config.
print("> Model built!")
print('Network Version: ', config.network)
print(f'Network Params: {params}')

> Model built!
Network Version:  New_ReUNet
Network Params: 18650191


In [4]:
import torch

# Synthetic input: two tensors of uniform random values from torch.rand.
# NOTE(review): presumably the leading 16 is the batch size — confirm against
# the model's expected input layout.
batch = dict(
    condition=torch.rand(16, 5, 8),
    delta_bbox=torch.rand(16, 4),
)

In [21]:
# Call the model on the synthetic batch and bind the returned value to `loss`.
loss = model(batch)

In [22]:
# Display the value returned by the model call.
loss

tensor(1.7654, grad_fn=<MeanBackward0>)

In [18]:
# Display the `filters` attribute of the diffusion network.
# NOTE(review): this cell's execution count (18) is lower than that of the
# config cell (19), so the displayed list may come from an earlier `filters`
# setting — restart the kernel and run all cells to confirm.
model.diffusion.net.filters

[16, 64, 128, 256]

In [16]:
# Display the repr of the diffusion network held by the model.
# NOTE(review): this cell's execution count (16) is lower than those of the
# config and model-building cells (19, 20), so the output may describe an
# earlier model instance — re-run to confirm.
model.diffusion.net

New_ReUNet(
  (shared_ctx_mlp): MLP(
    (layer_norm): LayerNorm((259,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (dense_layer): Sequential(
      (0): Linear(in_features=259, out_features=518, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1, inplace=False)
      (3): Linear(in_features=518, out_features=259, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.1, inplace=False)
      (6): Linear(in_features=259, out_features=256, bias=True)
    )
  )
  (up_blocks): ModuleList(
    (0): LSTMAoA(
      (out_dropout): Dropout(p=0.1, inplace=False)
      (norm): LayerNorm((260,), eps=1e-05, elementwise_affine=True)
      (att_lstm): LSTM(260, 256, num_layers=2)
      (multi_head): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (aoa_layer): Sequential(
        (0): Linear(in_features=512, out_features=512, bias=True)
        (1): GLU(dim=-1)
      )
      (r