In [3]:
%pip install torch

import torch
import torch.nn as nn
import torch.optim as optim
import math

# PositionalEncoding

We define a PositionalEncoding class that adds positional information to the input embeddings.

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to sequence-first embeddings.

    A table of shape ``(max_len, 1, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(position * div_term)`` and odd feature columns hold
    ``cos(position * div_term)``.

    Args:
        d_model: Size of the embedding (last) dimension. Both even and odd
            values are supported.
        max_len: Number of positions to precompute. Inputs whose first
            dimension exceeds this value are not supported.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd (cosine) column than even
        # (sine) columns, so drop the surplus frequency to keep shapes aligned.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Insert a singleton axis at dim 1 so the table broadcasts over the
        # second dimension of the input in forward().
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        Args:
            x: Tensor whose dim 0 indexes positions and whose last dim is
                ``d_model``, e.g. ``(seq_len, batch, d_model)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        return x + self.pe[:x.size(0), :]

# SimpleTransformer

We create a `SimpleTransformer` class that includes:

- An embedding layer
- Positional encoding
- A transformer encoder
- A final linear layer

In [None]:
class SimpleTransformer(nn.Module):
    """Token-level encoder model: embed, add positions, encode, project.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output features of the final linear layer.
        d_model: Embedding / hidden feature size.
        nhead: Number of attention heads passed to each encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
    """

    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers):
        super().__init__()
        self.d_model = d_model
        # Submodules are created in a fixed order so weight initialisation
        # consumes the random number generator the same way every time.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead),
            num_encoder_layers,
        )
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, src):
        """Map integer token ids to per-position scores over the vocabulary.

        Args:
            src: Integer tensor of token ids; dim 0 is treated as the
                position axis by the positional encoder.

        Returns:
            Tensor shaped like ``src`` with a trailing ``vocab_size`` axis.
        """
        # Scale embeddings by sqrt(d_model) before adding positional terms.
        scaled = self.embedding(src) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(scaled))
        return self.linear(encoded)

# Hyperparameters

We set up hyperparameters for the model.

In [None]:
# Seed PyTorch's global RNG so weight initialisation and the random dummy
# tensors created later are reproducible on a fresh Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Model size
vocab_size = 1000        # number of distinct token ids
d_model = 512            # embedding / hidden feature size
nhead = 8                # attention heads per encoder layer
num_encoder_layers = 3   # stacked encoder layers

# Dummy-data dimensions
batch_size = 32
seq_length = 20

# Multi-head attention splits d_model evenly across heads; fail early here
# rather than deep inside model construction.
assert d_model % nhead == 0, "d_model must be divisible by nhead"

We instantiate the model, define a loss function (CrossEntropyLoss), and set up an optimizer (Adam).

In [None]:
# Build the network from the hyperparameters defined above.
model = SimpleTransformer(
    vocab_size=vocab_size,
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
)

# Cross-entropy over vocabulary scores; Adam with its default settings.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

We generate some dummy data for training.

In [None]:
# Random token ids in [0, vocab_size), laid out as (seq_length, batch_size).
token_shape = (seq_length, batch_size)
src = torch.randint(low=0, high=vocab_size, size=token_shape)
tgt = torch.randint(low=0, high=vocab_size, size=token_shape)

# Training iteration

We perform one training iteration (for demonstration purposes).

In [None]:
# One optimisation step (demonstration only, not a full training loop).
model.train()
optimizer.zero_grad()

output = model(src)

# Collapse the position and batch axes so each row is one prediction
# over the vocabulary, matched with one target id.
flat_logits = output.view(-1, vocab_size)
flat_targets = tgt.view(-1)
loss = criterion(flat_logits, flat_targets)

loss.backward()
optimizer.step()

print(f"Loss: {loss.item()}")

# Inference

Finally, we do a simple inference with random input data.

In [None]:
# Switch to evaluation mode and run a single random sequence through the model.
model.eval()
test_input = torch.randint(0, vocab_size, (seq_length, 1))

with torch.no_grad():  # no gradients are needed for inference
    test_output = model(test_input)
    # Highest-scoring vocabulary index at each position (last axis is dim 2).
    predicted = test_output.argmax(dim=2)

for label, tensor in (
    ("Input shape:", test_input),
    ("Output shape:", test_output),
    ("Predicted shape:", predicted),
):
    print(label, tensor.shape)