In [2]:
import torch

In [3]:
class TransformerModel(torch.nn.Module):
    def __init__(self, output_size, num_layers, d_model, num_heads, dff, dropout_rate):
        super().__init__()

        # Define the transformer encoder
        self.encoder_layer = torch.nn.TransformerEncoderLayer(d_model, num_heads, dff, dropout_rate, batch_first=True)
        self.transformer_encoder = torch.nn.TransformerEncoder(self.encoder_layer, num_layers)

        # Define the output layer
        self.final_layer = torch.nn.Linear(d_model, output_size)

    def forward(self, x):
        # Apply the transformer encoder
        x = self.transformer_encoder(x)

        # Apply the final layer
        x = self.final_layer(x)

        return x


In [4]:
# Define the input and output sizes
input_size = 8
output_size = 6

# Define the transformer model
num_layers = 2
d_model = 128
num_heads = 4
dff = 64
dropout_rate = 0.1
model = TransformerModel(output_size, num_layers, d_model, num_heads, dff, dropout_rate)

# Generate some example input data
batch_size = 1
seq_length = input_size
input_data = torch.randn(batch_size, seq_length, d_model)

# print the parameters of the model
total_params = sum(p.numel() for p in model.parameters())
print(total_params)

# Pass the input data through the model to get the output
print(input_data.shape)
output_data = model(input_data)
# expect 16, 10, 6
print(output_data.shape)

250182
torch.Size([1, 8, 128])
torch.Size([1, 8, 6])


In [10]:
from torchinfo import summary
seq_length = 16000
summary(model, input_size=(batch_size, seq_length, d_model))

Layer (type:depth-idx)                        Output Shape              Param #
TransformerModel                              [1, 16000, 6]             83,136
├─TransformerEncoder: 1-1                     [1, 16000, 128]           --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerEncoderLayer: 3-1      [1, 16000, 128]           83,136
│    │    └─TransformerEncoderLayer: 3-2      [1, 16000, 128]           83,136
├─Linear: 1-2                                 [1, 16000, 6]             774
Total params: 250,182
Trainable params: 250,182
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 8.19
Forward/backward pass size (MB): 0.77
Params size (MB): 0.00
Estimated Total Size (MB): 8.96

In [19]:
import torch.nn as nn
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(10, 32, 512)
out = transformer_encoder(src)

In [20]:
out.shape

torch.Size([10, 32, 512])

In [24]:
transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
src = torch.rand((10, 32, 512))
tgt = torch.rand((20, 32, 512))
out = transformer_model(src, tgt)
print(out.shape)


torch.Size([20, 32, 512])
tensor([[[0.8831, 0.7192, 0.7406,  ..., 0.8938, 0.6347, 0.5211],
         [0.2122, 0.4283, 0.1966,  ..., 0.2230, 0.0964, 0.0918],
         [0.7867, 0.8161, 0.5503,  ..., 0.8816, 0.2052, 0.5661],
         ...,
         [0.4690, 0.1957, 0.8169,  ..., 0.3704, 0.0157, 0.3309],
         [0.3612, 0.8310, 0.9841,  ..., 0.4858, 0.4641, 0.8028],
         [0.0825, 0.4360, 0.0565,  ..., 0.1524, 0.3171, 0.8398]],

        [[0.4719, 0.3796, 0.5296,  ..., 0.8090, 0.1761, 0.2643],
         [0.0157, 0.2695, 0.6866,  ..., 0.6252, 0.8721, 0.7033],
         [0.2062, 0.2626, 0.8744,  ..., 0.4858, 0.9292, 0.9802],
         ...,
         [0.1701, 0.1103, 0.1372,  ..., 0.7573, 0.0362, 0.4542],
         [0.7662, 0.3510, 0.6453,  ..., 0.9619, 0.7399, 0.8724],
         [0.3191, 0.2178, 0.6184,  ..., 0.6606, 0.6076, 0.3059]],

        [[0.7307, 0.7800, 0.7666,  ..., 0.7604, 0.5445, 0.5923],
         [0.7473, 0.8057, 0.5010,  ..., 0.7684, 0.9257, 0.0859],
         [0.9730, 0.5786, 0.2559