In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np
import matplotlib.pyplot as plt

# Set random seed for reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x7c5029fb2730>

In [4]:
# Part 1: Understanding the Problem
# ---------------------------------
# Let's create a simple long-term dependency task:
# Predict the last token of a sequence given two tokens that should match
# Example: [1, 2, 3, 4, 1] should predict 1 because the first token is 1

def generate_matching_data(seq_length=10, num_samples=1000, vocab_size=5):
    """Generate data where the target matches the first token"""
    X = torch.randint(1, vocab_size, (num_samples, seq_length))
    # Target is the same as the first token
    y = X[:, 0]
    return X, y

# Generate training data
seq_length = 20  # Longer sequences to demonstrate long-term dependency
X_train, y_train = generate_matching_data(seq_length, 1000)
X_test, y_test = generate_matching_data(seq_length, 200)

print("=" * 50)
print("DATA GENERATION RESULTS:")
print(f"Input shape: {X_train.shape}, Target shape: {y_train.shape}")
print(f"Sample input sequence: {X_train[0]}")
print(f"Sample target (should match first token): {y_train[0]}")
print(f"First token in sequence: {X_train[0][0]}")
print("This is a long-term dependency task: model must remember the first token")
print("=" * 50)

DATA GENERATION RESULTS:
Input shape: torch.Size([1000, 20]), Target shape: torch.Size([1000])
Sample input sequence: tensor([2, 4, 3, 3, 1, 1, 3, 2, 3, 4, 1, 1, 3, 2, 2, 4, 1, 3, 3, 1])
Sample target (should match first token): 2
First token in sequence: 2
This is a long-term dependency task: model must remember the first token


In [5]:
# Part 2: Self-Attention - The Key to Solving Long-Term Dependencies
# -----------------------------------------------------------------

class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (self.head_dim * heads == embed_size), "Embedding size needs to be divisible by heads"

        # Linear projections for Q, K, V
        self.query = nn.Linear(embed_size, embed_size)
        self.key = nn.Linear(embed_size, embed_size)
        self.value = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, x):
        # x shape: (batch_size, seq_length, embed_size)
        batch_size = x.shape[0]
        seq_length = x.shape[1]

        # Perform linear projections
        queries = self.query(x)  # (batch_size, seq_length, embed_size)
        keys = self.key(x)       # (batch_size, seq_length, embed_size)
        values = self.value(x)   # (batch_size, seq_length, embed_size)

        # Reshape for multi-head attention
        queries = queries.reshape(batch_size, seq_length, self.heads, self.head_dim)
        keys = keys.reshape(batch_size, seq_length, self.heads, self.head_dim)
        values = values.reshape(batch_size, seq_length, self.heads, self.head_dim)

        # Transpose to compute attention across seq_length
        # New shape: (batch_size, heads, seq_length, head_dim)
        queries = queries.permute(0, 2, 1, 3)
        keys = keys.permute(0, 2, 1, 3)
        values = values.permute(0, 2, 1, 3)

        # Compute attention scores
        # (batch_size, heads, seq_length, seq_length)
        energy = torch.matmul(queries, keys.permute(0, 1, 3, 2))

        # Scale attention scores
        scaling = self.head_dim ** 0.5
        attention = energy / scaling

        # Apply softmax to get attention weights
        attention = torch.softmax(attention, dim=-1)

        # Multiply with values
        # (batch_size, heads, seq_length, head_dim)
        out = torch.matmul(attention, values)

        # Reshape and concatenate heads
        # (batch_size, seq_length, embed_size)
        out = out.permute(0, 2, 1, 3).reshape(batch_size, seq_length, self.embed_size)

        # Final linear layer
        out = self.fc_out(out)
        return out, attention


In [6]:
# Demonstrate Self-Attention with a simple example
def demonstrate_self_attention():
    print("=" * 50)
    print("SELF-ATTENTION DEMONSTRATION:")

    # Create a small example batch
    batch_size = 1
    seq_length = 5
    embed_size = 8
    heads = 2

    # Create a random embedding tensor
    x = torch.randn(batch_size, seq_length, embed_size)
    print(f"Input shape: {x.shape}")

    # Create self-attention module
    attention = SelfAttention(embed_size, heads)

    # Forward pass
    output, attention_weights = attention(x)

    print(f"Output shape: {output.shape}")
    print(f"Attention weights shape: {attention_weights.shape}")

    # Print attention weights for one head
    print("Attention weights for head 0:")
    print(attention_weights[0, 0])
    print("This shows how each position attends to all other positions")

    # Check if first token (position 0) has high attention score with last token
    print(f"Attention from last position to first: {attention_weights[0, 0, -1, 0].item():.4f}")
    print("Higher values mean stronger attention between positions")
    print("=" * 50)

# Run the demonstration
demonstrate_self_attention()

SELF-ATTENTION DEMONSTRATION:
Input shape: torch.Size([1, 5, 8])
Output shape: torch.Size([1, 5, 8])
Attention weights shape: torch.Size([1, 2, 5, 5])
Attention weights for head 0:
tensor([[0.1460, 0.1890, 0.3634, 0.1926, 0.1089],
        [0.0897, 0.2782, 0.3777, 0.0855, 0.1690],
        [0.1175, 0.3344, 0.2613, 0.1146, 0.1722],
        [0.0894, 0.1968, 0.5028, 0.1089, 0.1022],
        [0.1708, 0.2192, 0.2385, 0.1758, 0.1957]], grad_fn=<SelectBackward0>)
This shows how each position attends to all other positions
Attention from last position to first: 0.1708
Higher values mean stronger attention between positions


In [7]:
# Part 3: Visualizing Self-Attention
# ----------------------------------

def visualize_attention(model, input_seq):
    """Visualize the attention weights for a single sequence"""
    # Convert to tensor and add batch dimension
    input_tensor = input_seq.unsqueeze(0)  # (1, seq_length)

    # Get embeddings
    embeddings = model.embedding(input_tensor)  # (1, seq_length, embed_size)

    # Pass through self-attention
    _, attention = model.attention(embeddings)  # attention: (1, heads, seq_length, seq_length)

    # Average attention across heads
    avg_attention = attention.mean(dim=1).squeeze(0).detach().cpu().numpy()

    # Plotting
    plt.figure(figsize=(10, 8))
    plt.imshow(avg_attention, cmap='viridis')
    plt.colorbar()
    plt.title("Self-Attention Weights")
    plt.xlabel("Key Positions")
    plt.ylabel("Query Positions")
    plt.show()

    return avg_attention

In [8]:
# Part 4: The Transformer Encoder
# ------------------------------
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Self-attention block
        attention_out, attention_weights = self.attention(x)

        # Add & norm (residual connection)
        x = self.norm1(attention_out + x)
        x = self.dropout(x)

        # Feed forward block
        forward_out = self.feed_forward(x)

        # Add & norm (residual connection)
        x = self.norm2(forward_out + x)
        x = self.dropout(x)

        return x, attention_weights

# Demonstrate Transformer Block functioning
def demonstrate_transformer_block():
    print("=" * 50)
    print("TRANSFORMER BLOCK DEMONSTRATION:")

    # Create a sample input
    batch_size = 2
    seq_length = 5
    embed_size = 8
    heads = 2

    # Create random embedding tensor
    x = torch.randn(batch_size, seq_length, embed_size)
    print(f"Input shape: {x.shape}")

    # Create transformer block
    block = TransformerBlock(
        embed_size=embed_size,
        heads=heads,
        dropout=0.1,
        forward_expansion=2
    )

    # Forward pass
    out, attention = block(x)

    print(f"Output shape: {out.shape}")
    print(f"Attention weights shape: {attention.shape}")

    # Demonstrate residual connections by checking if output is similar to input
    input_norm = torch.norm(x).item()
    output_norm = torch.norm(out).item()
    diff_norm = torch.norm(x - out).item()

    print("\nResidual connection demonstration:")
    print(f"Input norm: {input_norm:.4f}")
    print(f"Output norm: {output_norm:.4f}")
    print(f"Difference norm: {diff_norm:.4f}")
    print("Residual connections allow information to flow directly through the network")
    print("This helps prevent the vanishing gradient problem in deep networks")

    # Demonstrate layer normalization
    sample_vector = x[0, 0]
    normalized = block.norm1(sample_vector.unsqueeze(0)).squeeze(0)

    print("\nLayer normalization demonstration:")
    print(f"Original vector mean: {sample_vector.mean().item():.4f}, std: {sample_vector.std().item():.4f}")
    print(f"Normalized vector mean: {normalized.mean().item():.4f}, std: {normalized.std().item():.4f}")
    print("Layer normalization stabilizes training by standardizing activations")
    print("=" * 50)

# Run the demonstration
demonstrate_transformer_block()

TRANSFORMER BLOCK DEMONSTRATION:
Input shape: torch.Size([2, 5, 8])
Output shape: torch.Size([2, 5, 8])
Attention weights shape: torch.Size([2, 2, 5, 5])

Residual connection demonstration:
Input norm: 10.6661
Output norm: 9.1173
Difference norm: 6.1627
Residual connections allow information to flow directly through the network
This helps prevent the vanishing gradient problem in deep networks

Layer normalization demonstration:
Original vector mean: 0.2208, std: 1.8771
Normalized vector mean: 0.0000, std: 1.0690
Layer normalization stabilizes training by standardizing activations
