In [1]:
import torch
import torch.nn as nn
from deciwatch import PositionEmbeddingSine_1D

In [118]:
class LSTM(nn.Module):
    def __init__(self, num_keypoints, keypoints_dim, num_layers, dropout, bidirectional):
        super(LSTM, self).__init__()
        
        self.num_layers = num_layers
        self.num_keypoints = num_keypoints
        self.keypoints_dim = keypoints_dim
        self.D = 2 if bidirectional else 1
        
        self.lstm = nn.LSTM(input_size=num_keypoints * keypoints_dim, hidden_size=num_keypoints * keypoints_dim, num_layers=num_layers, batch_first=True, dropout=dropout, bidirectional=bidirectional)
        self.linear = nn.Linear(self.D * keypoints_dim * num_keypoints, keypoints_dim * num_keypoints)
        
        self.relu = nn.ReLU()
        
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    
    def forward(self, x):
        out, _ = self.lstm(x)
            
        out = self.linear(out)
        out = self.relu(out)
        
        return out
        

In [2]:
class Transformer(nn.Module):
    def __init__(self, d_model: int, nhead: int, num_layers: int, dim_feedforward: int, dropout: float, keypoints_numel: int, batch_size: int, window_size: int, device: torch.device):
        super(Transformer, self).__init__()
        
        self.device = device
        
        self.embedder = nn.Linear(keypoints_numel, d_model)
        self.transformer_encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True), num_layers, nn.LayerNorm(d_model))
        self.mapper = nn.Linear(d_model, keypoints_numel)
        
        self.positional_encoding = PositionEmbeddingSine_1D(d_model//2, device)(batch_size, window_size).permute(1, 0, 2)
        self.dropout = nn.Dropout(dropout)
        
        self.x_mask = nn.Transformer.generate_square_subsequent_mask(window_size, device=device) # NOTE: Mask er måske ikke nødvendig
        
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, x: torch.Tensor):
        x_embed = self.dropout(self.embedder(x) + self.positional_encoding)
        
        preds = self.transformer_encoder(x_embed, self.x_mask)
        preds = self.mapper(preds)
        
        return preds

In [3]:
batch_size = 8
num_keypoints = 16
keypoints_dim = 2
keypoints_numel = num_keypoints * keypoints_dim
window_size = 5
video_sequence = torch.rand(batch_size, window_size, num_keypoints * keypoints_dim)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
d_model = 512
nhead = 8
num_layers = 6
dim_feedforward = 2048
dropout = 0.1

transformer = Transformer(d_model, nhead, num_layers, dim_feedforward, dropout, keypoints_numel, batch_size, window_size, device)
pred = transformer(video_sequence)
print(pred.shape)

torch.Size([8, 5, 32])


In [None]:
a = torch.rand(batch_size, window_size, num_keypoints * keypoints_dim)
print(a.shape)
b = torch.rand(window_size, num_keypoints * keypoints_dim)
print(torch.concat((b, a), dim=1).shape)

torch.Size([8, 5, 32])


RuntimeError: Tensors must have same number of dimensions: got 2 and 3

In [119]:
num_layers = 2
dropout = 0
bidirectional = False
lstm = LSTM(num_keypoints, keypoints_dim, num_layers, dropout, bidirectional)

#pred_old = lstm.forward_old(video_sequence)
#print(pred_old.shape)

pred_new = lstm(video_sequence)
print(pred_new.shape)

torch.Size([1, 5, 32])
torch.Size([1, 5, 32])


In [209]:
a = torch.ones((video_sequence.shape[0], 1, video_sequence.shape[2]))
print(video_sequence.shape)
torch.cat((video_sequence, a), dim=1).shape

torch.Size([8, 5, 32])


torch.Size([8, 6, 32])