<a href="https://colab.research.google.com/github/ChandrashekharGhanokar/attention_is_all_you_need/blob/main/attention_is_all_you_need.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Positional Encoding


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim

# Adds positional information to the input embeddings
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, max_len, embed_size)`` is precomputed once: even
    feature columns hold ``sin(pos * freq)`` and odd feature columns hold
    ``cos(pos * freq)``, with ``freq = 10000 ** (-2i / embed_size)``.

    The table is stored as a non-persistent buffer named ``pe``. It therefore
    follows ``module.to(device)`` / ``module.cuda()`` together with the rest
    of the model, but it is not written to ``state_dict()``, so checkpoint
    keys are unaffected.

    Args:
        embed_size: Size of the last (feature) dimension of the inputs.
            Both even and odd sizes are supported.
        max_len: Number of positions to precompute; inputs passed to
            ``forward`` must not have more than this many steps in dim 1.
    """

    def __init__(self, embed_size: int, max_len: int) -> None:
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(
            torch.arange(0, embed_size, 2).float()
            * (-torch.log(torch.tensor(10000.0)) / embed_size)
        )
        angles = position * div_term  # (max_len, ceil(embed_size / 2))

        pe = torch.zeros(max_len, embed_size)
        pe[:, 0::2] = torch.sin(angles)  # even feature columns
        # An odd embed_size has one fewer odd column than even columns, so
        # trim the angle table to the number of odd columns actually present.
        pe[:, 1::2] = torch.cos(angles[:, : embed_size // 2])  # odd feature columns

        # Leading singleton dim lets the table broadcast over the batch dim.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, seq_len, embed_size)``: the table is
        sliced along dim 1 and broadcast over dim 0.
        """
        # `.to` is a no-op when the buffer already lives on x's device; it is
        # kept so the module still works if x was moved but the module was not.
        return x + self.pe[:, : x.size(1), :].to(x.device)

# Encoder Layer


In [19]:
# Single transformer encoder layer with attention and feed-forward network
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention, then a position-wise feed-forward net.

    Each sub-layer is wrapped as ``dropout(layer_norm(sublayer(x) + x))``.

    Inputs and outputs are batch-first, ``(batch, seq_len, embed_size)``, so
    attention mixes information between the positions of one sequence and
    never between different samples of a batch.

    Args:
        embed_size: Feature size of the inputs and outputs.
        heads: Number of attention heads given to ``nn.MultiheadAttention``.
        forward_expansion: The feed-forward hidden width is
            ``forward_expansion * embed_size``.
        dropout: Dropout probability, used both inside the attention module
            and after each normalisation.
    """

    def __init__(self, embed_size: int, heads: int, forward_expansion: int, dropout: float) -> None:
        super().__init__()
        # batch_first=True is required here: nn.MultiheadAttention defaults to
        # (seq_len, batch, embed) and would otherwise attend across the batch
        # dimension of the (batch, seq_len, embed) tensors this layer receives.
        self.attention = nn.MultiheadAttention(
            embed_size, heads, dropout=dropout, batch_first=True
        )
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),  # expand
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),  # project back
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the layer to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_size)``.
            mask: Passed unchanged as ``attn_mask`` to the attention module;
                ``None`` disables masking.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention: queries, keys and values are all x. The attention
        # weights returned as the second element are not needed.
        attn_output, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.dropout(self.norm1(attn_output + x))  # residual + norm 1
        forward_output = self.feed_forward(x)
        x = self.dropout(self.norm2(forward_output + x))  # residual + norm 2
        return x

# Transformer Model


In [20]:
# Full transformer model with embedding, encoder layers, and final output
class Transformer(nn.Module):
    """Encoder-only model: embedding, positional encoding, encoder stack, projection.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits per position.
        embed_size: Feature size used throughout the model.
        num_layers: Number of ``EncoderLayer`` modules stacked in sequence.
        heads: Attention head count passed to every encoder layer.
        forward_expansion: Feed-forward width multiplier passed to every
            encoder layer.
        dropout: Dropout probability for the embedding dropout and for every
            encoder layer.
        max_len: Number of positions given to ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, max_len):
        super().__init__()
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = PositionalEncoding(embed_size, max_len)

        # Build the encoder stack one layer at a time, in order.
        encoder_stack = []
        for _ in range(num_layers):
            encoder_stack.append(EncoderLayer(embed_size, heads, forward_expansion, dropout))
        self.layers = nn.ModuleList(encoder_stack)

        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Map token indices ``x`` to per-position vocabulary logits.

        ``mask`` is handed unchanged to every encoder layer.
        """
        embedded = self.word_embedding(x)
        # Dropout is applied to the word embeddings before positions are added.
        hidden = self.position_embedding(self.dropout(embedded))
        for encoder in self.layers:
            hidden = encoder(hidden, mask)
        return self.fc_out(hidden)

# Hyperparameters


In [21]:
# Model and training hyperparameters, consumed by the model-construction
# and training cells below.

# Number of distinct token ids: embedding rows and output logits per position.
VOCAB_SIZE = 50
# Feature width of the embeddings and of every encoder layer.
EMBED_SIZE = 512
# Number of stacked encoder layers.
NUM_LAYERS = 3
# Attention heads per encoder layer.
HEADS = 8
# Feed-forward hidden width is FORWARD_EXPANSION * EMBED_SIZE.
FORWARD_EXPANSION = 4
# Dropout probability passed to the model.
DROPOUT = 0.1
# Number of positions precomputed by the positional encoding.
MAX_LEN = 100
# Learning rate for the Adam optimizer.
LR = 0.001

# Training Loop


In [22]:
# Instantiate the model from the hyperparameter constants
model = Transformer(
    vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    num_layers=NUM_LAYERS,
    heads=HEADS,
    forward_expansion=FORWARD_EXPANSION,
    dropout=DROPOUT,
    max_len=MAX_LEN,
)

# One random (input, target) pair of token ids: batch size 1, sequence length 5
sample_shape = (1, 5)
input_tensor = torch.randint(0, VOCAB_SIZE, sample_shape)
target_tensor = torch.randint(0, VOCAB_SIZE, sample_shape)

# Cross-entropy over vocabulary logits, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# Repeatedly fit the single sample, reporting the loss every 100 epochs
epochs = 500
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # Flatten to (batch * seq_len, vocab) logits and (batch * seq_len,) targets,
    # the layout CrossEntropyLoss is given here.
    output = model(input_tensor, mask=None).view(-1, VOCAB_SIZE)
    target = target_tensor.view(-1)

    loss = criterion(output, target)
    loss.backward()
    optimizer.step()

    if epoch % 100 != 0:
        continue
    print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 4.071639060974121
Epoch 100, Loss: 2.9158100005588494e-05
Epoch 200, Loss: 2.281638080603443e-05
Epoch 300, Loss: 2.0360714188427664e-05
Epoch 400, Loss: 2.2196469217306003e-05


# Evaluation


In [23]:
# Run the trained model on the training input and print the highest-scoring
# token id at each position.
model.eval()
with torch.no_grad():
    pred = model(input_tensor, mask=None).argmax(dim=2)  # argmax over the vocabulary axis
    first_sequence_ids = pred[0].tolist()  # plain Python ints for the first batch element
    predicted_sentence = " ".join(str(token_id) for token_id in first_sequence_ids)
    print(f"Predicted Sentence: {predicted_sentence}")

Predicted Sentence: 29 42 7 6 26
