In [1]:
from main_model import MainModel
import torch
from config import T, C, N_MFCC, H, W, NUM_CLASSES

# Override three of the values imported from `config` for this session;
# C, N_MFCC and NUM_CLASSES keep whatever `config` provides.
T, H, W = 2, 128, 128

# Instantiate the network with the (partly overridden) settings.
model = MainModel(T=T, C=C, H=H, W=W, N_MFCC=N_MFCC, num_classes=NUM_CLASSES)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [3]:
def count_parameters(model):
    """Return the total number of trainable scalar parameters in ``model``.

    Only parameters with ``requires_grad=True`` are counted; frozen
    parameters contribute nothing to the total.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Display the trainable-parameter count of the model instantiated above.
count_parameters(model)

13575247

In [4]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# Adam with lr=1e-4; StepLR scales the learning rate by 0.1 every 10 scheduler steps.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.1)

In [16]:
# Random stand-in tensors for a single sample (leading dimension 1).
# The last audio dimension follows N_MFCC -- the value the model is constructed
# with -- instead of a hard-coded 13, so the input cannot drift from the model.
audio = torch.randn(1, 4 * T, N_MFCC)
visual = torch.randn(1, T, C, H, W)
# torch.randint's upper bound is exclusive: values fall in [-1, NUM_CLASSES - 1].
# NOTE(review): -1 presumably marks an empty/ignored slot -- confirm with the loss code.
targets = torch.randint(-1, NUM_CLASSES, (1, T, 25))
# 25 entries per frame, each with 4 values drawn uniformly from [0, 1).
boxes = torch.rand(1, T, 25, 4)

In [17]:
# Copy both inputs to the device the model was moved to.
audio_on_device = audio.to(device)
visual_on_device = visual.to(device)

# Run the model one stage at a time: the two encoders, then fusion, then the head.
audio_features = model.forward_audio_encoder(audio_on_device)
visual_features = model.forward_visual_encoder(visual_on_device)
fused_features = model.forward_fusion(visual_features, audio_features)
head_output = model.forward_head(fused_features)

## Contrastive Loss

In [None]:
import torch.nn as nn

class ContrastiveLoss(nn.Module):
    """Audio-to-visual contrastive (InfoNCE) loss over the active frames.

    Every (batch, time) position is treated as one sample. A frame's visual
    feature map is reduced to a single channel by a 1x1 convolution, flattened,
    and projected to ``out_dim`` by a linear layer. Audio and visual vectors
    are L2-normalised and compared with a dot product; for each active frame
    the matching visual vector is the positive and the visual vectors of all
    other active frames are the negatives.

    Args:
        fmap_size: Number of spatial positions in one visual feature map
            (``hi * wi``); this is the input width of the linear projection.
        in_channels: Channel count ``C`` of the visual feature maps.
        out_dim: Width of the projected visual vector. It must equal the
            per-frame audio feature width so the two can be compared.
        tau: Temperature that divides the similarities.
    """

    def __init__(self, fmap_size, in_channels=128, out_dim=128, tau=1):
        super().__init__()
        self.fmap_size = fmap_size
        self.in_channels = in_channels
        self.out_dim = out_dim
        self.tau = tau
        # Alias of ``tau``; both attribute names existed on the original class.
        self.temperature = tau
        # 1x1 convolution: (N, C, hi, wi) -> (N, 1, hi, wi).
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=1,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False
        )
        # Flattened single-channel map (hi * wi) -> out_dim.
        self.fc = nn.Linear(fmap_size, out_dim)

    def forward(self, audio_features, visual_features, active_frames):
        """Compute the mean contrastive loss over the active frames.

        Args:
            audio_features: (B, T, C)
            visual_features: (B, T, C, hi, wi)
            active_frames: (B, T) mask; non-zero / True marks frames that
                take part in the loss.

        Returns:
            A scalar tensor. It is exactly zero when no frame is active or
            when only one frame is active (there is nothing to contrast).

        Raises:
            ValueError: if the audio feature width differs from ``out_dim``.
        """
        batch_size, num_frames = audio_features.shape[:2]
        num_samples = batch_size * num_frames

        # Fold (B, T) into a single sample axis and drop inactive frames up
        # front, so the projection below only runs on frames that are used.
        keep = active_frames.reshape(num_samples).to(torch.bool)
        A = audio_features.reshape(num_samples, -1)[keep]
        V = visual_features.reshape(num_samples, *visual_features.shape[2:])[keep]

        if A.size(0) == 0:
            # Sums over empty tensors are exactly zero and stay attached to
            # the autograd graph, so a later backward() still works.
            return A.sum() + V.sum()

        V = self.conv(V)                      # (N, 1, hi, wi)
        V = self.fc(V.flatten(start_dim=1))   # (N, out_dim)

        if A.size(1) != V.size(1):
            raise ValueError(
                "audio feature width ({}) must equal out_dim ({})".format(
                    A.size(1), V.size(1)
                )
            )

        # normalize() guards against division by zero for all-zero vectors.
        A = nn.functional.normalize(A, dim=1)
        V = nn.functional.normalize(V, dim=1)

        # (N, N) sample-by-sample similarities: row i = audio i vs. every visual j.
        logits = A @ V.t() / self.tau

        # The positive for row i sits on the diagonal. cross_entropy evaluates
        # -log(exp(s_ii) / sum_j exp(s_ij)) via log-sum-exp, which is
        # numerically stable and keeps the positive in the denominator so the
        # loss is bounded below by zero and defined for a single sample.
        positives = torch.arange(logits.size(0), device=logits.device)
        return nn.functional.cross_entropy(logits, positives)


In [None]:

# ContrastiveLoss requires `fmap_size`, so calling it with no arguments raises
# TypeError. Size the module from the encoder outputs computed above instead.
# NOTE(review): assumes visual_features is (B, T, C, hi, wi) and audio_features
# is (B, T, C), as the ContrastiveLoss.forward docstring states -- confirm
# against what MainModel's encoders actually return.
loss = ContrastiveLoss(
    fmap_size=visual_features.shape[-2] * visual_features.shape[-1],
    in_channels=visual_features.shape[-3],
    out_dim=audio_features.shape[-1],
).to(device)  # keep the loss's layers on the same device as the features

# Test epoch

In [1]:
from dataset import DummyDataset, DummyDataLoader
from train import Trainer
import torch
from main_model import MainModel

In [2]:
# Dimensions used for the dummy-data run.
T = 2
C = 3
H = 128
W = 128
N_MFCC = 13
NUM_CLASSES = 2

# DummyDataset configured with the dimensions above.
train_dataset = DummyDataset(T=T, C=C, H=H, W=W, N_MFCC=N_MFCC)

# Loader over that dataset: batch size 1, shuffle=True, num_workers=0.
train_loader = DummyDataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0)

In [3]:
# Build the network from the constants defined in the previous cell.
model = MainModel(T=T, C=C, H=H, W=W, N_MFCC=N_MFCC, num_classes=NUM_CLASSES)

# Use CUDA when PyTorch reports it as available, else the CPU, and move the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# Adam with lr=1e-4; StepLR scales the learning rate by 0.1 every 10 scheduler steps.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.1)

# Hand the model, optimizer, scheduler and device to the project's Trainer.
trainer = Trainer(model=model, optimizer=optimizer, scheduler=scheduler, device=device)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
