In [3]:
import numpy as np
import math

class TransformerExplained:
    """
    A simplified implementation of the Transformer architecture
    with detailed mathematical explanations.
    """
    
    def __init__(self, d_model=512, n_heads=8, d_ff=2048, vocab_size=10000):
        """
        Initialize transformer parameters.
        
        Args:
            d_model: Dimension of the model (embedding size)
            n_heads: Number of attention heads
            d_ff: Dimension of feed-forward network
            vocab_size: Size of vocabulary
        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads  # Dimension per head
        self.d_ff = d_ff
        self.vocab_size = vocab_size
        
        # Initialize random weights (in practice, these would be learned)
        self.embedding = np.random.randn(vocab_size, d_model) * 0.1
        
    def positional_encoding(self, seq_len, d_model):
        """
        Create positional encodings using sine and cosine functions.
        
        Mathematical formula:
        PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
        
        Where:
        - pos is the position in the sequence
        - i is the dimension index
        """
        PE = np.zeros((seq_len, d_model))
        
        for pos in range(seq_len):
            for i in range(0, d_model, 2):
                # Calculate the division term
                div_term = 10000 ** (i / d_model)
                
                # Apply sine to even indices
                PE[pos, i] = math.sin(pos / div_term)
                
                # Apply cosine to odd indices
                if i + 1 < d_model:
                    PE[pos, i + 1] = math.cos(pos / div_term)
        
        return PE
    
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """
        Compute scaled dot-product attention.
        
        Mathematical formula:
        Attention(Q, K, V) = softmax(QK^T / sqrt(d_k))V
        
        Where:
        - Q: Query matrix (seq_len, d_k)
        - K: Key matrix (seq_len, d_k)
        - V: Value matrix (seq_len, d_k)
        - d_k: Dimension of each head
        
        Steps:
        1. Compute QK^T (matrix multiplication)
        2. Scale by 1/sqrt(d_k) to prevent gradient vanishing
        3. Apply mask (optional, for decoder self-attention)
        4. Apply softmax to get attention weights
        5. Multiply by V to get weighted values
        """
        # Step 1: Compute QK^T
        # Shape: (seq_len, seq_len)
        scores = np.dot(Q, K.T)
        
        # Step 2: Scale by sqrt(d_k)
        # This prevents the dot products from growing too large
        scores = scores / math.sqrt(self.d_k)
        
        # Step 3: Apply mask if provided (e.g., for causal attention)
        if mask is not None:
            scores = scores + mask * -1e9
        
        # Step 4: Apply softmax to get attention weights
        # Each row sums to 1
        attention_weights = self.softmax(scores)
        
        # Step 5: Apply attention weights to values
        # Shape: (seq_len, d_k)
        output = np.dot(attention_weights, V)
        
        return output, attention_weights
    
    def multi_head_attention(self, query, key, value, mask=None):
        """
        Implement multi-head attention mechanism.
        
        Mathematical process:
        1. Project Q, K, V into h different subspaces using linear transformations
        2. Apply scaled dot-product attention in parallel for each head
        3. Concatenate the outputs
        4. Apply final linear transformation
        
        Formula:
        MultiHead(Q, K, V) = Concat(head_1, ..., head_h)W^O
        where head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)
        """
        seq_len = query.shape[0]
        
        # Initialize weight matrices (normally learned)
        W_q = np.random.randn(self.d_model, self.d_model) * 0.1
        W_k = np.random.randn(self.d_model, self.d_model) * 0.1
        W_v = np.random.randn(self.d_model, self.d_model) * 0.1
        W_o = np.random.randn(self.d_model, self.d_model) * 0.1
        
        # Project to get Q, K, V for all heads at once
        Q = np.dot(query, W_q)  # (seq_len, d_model)
        K = np.dot(key, W_k)    # (seq_len, d_model)
        V = np.dot(value, W_v)  # (seq_len, d_model)
        
        # Reshape to separate heads
        Q = Q.reshape(seq_len, self.n_heads, self.d_k)
        K = K.reshape(seq_len, self.n_heads, self.d_k)
        V = V.reshape(seq_len, self.n_heads, self.d_k)
        
        # Apply attention for each head
        all_heads = []
        attention_weights_all = []
        
        for i in range(self.n_heads):
            # Extract each head
            Q_head = Q[:, i, :]  # (seq_len, d_k)
            K_head = K[:, i, :]  # (seq_len, d_k)
            V_head = V[:, i, :]  # (seq_len, d_k)
            
            # Apply scaled dot-product attention
            head_output, attn_weights = self.scaled_dot_product_attention(
                Q_head, K_head, V_head, mask
            )
            all_heads.append(head_output)
            attention_weights_all.append(attn_weights)
        
        # Concatenate all heads
        # Shape: (seq_len, d_model)
        concat_output = np.concatenate(all_heads, axis=1)
        
        # Final linear transformation
        output = np.dot(concat_output, W_o)
        
        return output, attention_weights_all
    
    def feed_forward_network(self, x):
        """
        Implement position-wise feed-forward network.
        
        Mathematical formula:
        FFN(x) = max(0, xW_1 + b_1)W_2 + b_2
        
        This is a simple 2-layer neural network with ReLU activation.
        Applied to each position separately and identically.
        """
        # Initialize weights and biases (normally learned)
        W1 = np.random.randn(self.d_model, self.d_ff) * 0.1
        b1 = np.zeros(self.d_ff)
        W2 = np.random.randn(self.d_ff, self.d_model) * 0.1
        b2 = np.zeros(self.d_model)
        
        # First linear transformation
        hidden = np.dot(x, W1) + b1
        
        # ReLU activation
        hidden = np.maximum(0, hidden)
        
        # Second linear transformation
        output = np.dot(hidden, W2) + b2
        
        return output
    
    def layer_norm(self, x, epsilon=1e-6):
        """
        Apply layer normalization.
        
        Mathematical formula:
        LayerNorm(x) = γ * (x - μ) / σ + β
        
        Where:
        - μ: mean of x
        - σ: standard deviation of x
        - γ: learned scale parameter (set to 1 here)
        - β: learned shift parameter (set to 0 here)
        """
        # Calculate mean and variance
        mean = np.mean(x, axis=-1, keepdims=True)
        variance = np.var(x, axis=-1, keepdims=True)
        
        # Normalize
        x_norm = (x - mean) / np.sqrt(variance + epsilon)
        
        # In practice, γ and β would be learned parameters
        gamma = np.ones(x.shape[-1])
        beta = np.zeros(x.shape[-1])
        
        return gamma * x_norm + beta
    
    def encoder_layer(self, x, mask=None):
        """
        Implement a single encoder layer.
        
        Components:
        1. Multi-head self-attention
        2. Add & Norm
        3. Feed-forward network
        4. Add & Norm
        """
        # Self-attention
        attn_output, _ = self.multi_head_attention(x, x, x, mask)
        
        # Add & Norm (residual connection + layer normalization)
        x = self.layer_norm(x + attn_output)
        
        # Feed-forward network
        ff_output = self.feed_forward_network(x)
        
        # Add & Norm
        x = self.layer_norm(x + ff_output)
        
        return x
    
    def decoder_layer(self, x, encoder_output, src_mask=None, tgt_mask=None):
        """
        Implement a single decoder layer.
        
        Components:
        1. Masked multi-head self-attention
        2. Add & Norm
        3. Multi-head cross-attention (attending to encoder output)
        4. Add & Norm
        5. Feed-forward network
        6. Add & Norm
        """
        # Masked self-attention
        self_attn_output, _ = self.multi_head_attention(x, x, x, tgt_mask)
        x = self.layer_norm(x + self_attn_output)
        
        # Cross-attention to encoder output
        cross_attn_output, _ = self.multi_head_attention(
            x, encoder_output, encoder_output, src_mask
        )
        x = self.layer_norm(x + cross_attn_output)
        
        # Feed-forward network
        ff_output = self.feed_forward_network(x)
        x = self.layer_norm(x + ff_output)
        
        return x
    
    def create_causal_mask(self, seq_len):
        """
        Create a causal mask for decoder self-attention.
        This prevents the decoder from attending to future positions.
        
        Returns upper triangular matrix with -inf values.
        """
        mask = np.triu(np.ones((seq_len, seq_len)), k=1)
        return mask
    
    def softmax(self, x):
        """Compute softmax values for each row of x."""
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
    
    def demonstrate_attention(self):
        """
        Demonstrate attention mechanism with a simple example.
        """
        print("=== TRANSFORMER ARCHITECTURE DEMONSTRATION ===\n")
        
        # Example: Simple sequence of 4 tokens
        seq_len = 4
        
        print("1. INPUT EMBEDDING AND POSITIONAL ENCODING")
        print("-" * 50)
        
        # Simulate token IDs (must be within vocab_size range)
        token_ids = np.array([101, 253, 37, 102])  # Example token IDs
        print(f"Token IDs: {token_ids}")
        
        # Get embeddings
        embeddings = self.embedding[token_ids]  # (seq_len, d_model)
        print(f"Embedding shape: {embeddings.shape}")
        
        # Add positional encoding
        pos_encoding = self.positional_encoding(seq_len, self.d_model)
        input_embeddings = embeddings + pos_encoding
        print(f"After adding positional encoding: {input_embeddings.shape}")
        
        print("\n2. SELF-ATTENTION MECHANISM")
        print("-" * 50)
        
        # Simplified attention with small dimensions for visualization
        d_k_demo = 4
        Q = np.random.randn(seq_len, d_k_demo)
        K = np.random.randn(seq_len, d_k_demo)
        V = np.random.randn(seq_len, d_k_demo)
        
        print(f"Query (Q) shape: {Q.shape}")
        print(f"Key (K) shape: {K.shape}")
        print(f"Value (V) shape: {V.shape}")
        
        # Compute attention scores
        scores = np.dot(Q, K.T) / math.sqrt(d_k_demo)
        print(f"\nAttention scores (QK^T/√d_k):")
        print(scores)
        
        # Apply softmax
        attention_weights = self.softmax(scores)
        print(f"\nAttention weights (after softmax):")
        print(attention_weights)
        print(f"Row sums (should be 1): {attention_weights.sum(axis=1)}")
        
        # Apply attention to values
        output = np.dot(attention_weights, V)
        print(f"\nAttention output shape: {output.shape}")
        
        print("\n3. CAUSAL MASKING (for Decoder)")
        print("-" * 50)
        
        # Create causal mask
        causal_mask = self.create_causal_mask(seq_len)
        print("Causal mask (0 = allowed, 1 = masked):")
        print(causal_mask)
        
        # Apply mask to scores
        masked_scores = scores + causal_mask * -1e9
        masked_attention = self.softmax(masked_scores)
        print("\nMasked attention weights:")
        print(masked_attention)
        print("Note: Future positions have ~0 attention weight")
        
        print("\n4. MULTI-HEAD ATTENTION")
        print("-" * 50)
        print(f"Number of heads: {self.n_heads}")
        print(f"Dimension per head: {self.d_k}")
        print(f"Total dimension: {self.d_model}")
        
        # Run multi-head attention
        mha_output, attn_weights = self.multi_head_attention(
            input_embeddings[:seq_len], 
            input_embeddings[:seq_len], 
            input_embeddings[:seq_len]
        )
        print(f"Multi-head attention output shape: {mha_output.shape}")
        
        print("\n5. FEED-FORWARD NETWORK")
        print("-" * 50)
        
        # Example FFN computation
        x_example = np.random.randn(1, self.d_model)
        ff_output = self.feed_forward_network(x_example)
        print(f"FFN input shape: {x_example.shape}")
        print(f"FFN output shape: {ff_output.shape}")
        print(f"Hidden layer dimension: {self.d_ff}")
        
        print("\n6. COMPLETE ENCODER LAYER")
        print("-" * 50)
        
        encoder_output = self.encoder_layer(input_embeddings[:seq_len])
        print(f"Encoder layer output shape: {encoder_output.shape}")
        
        print("\n=== KEY MATHEMATICAL CONCEPTS ===")
        print("-" * 50)
        print("1. Attention allows the model to focus on different parts of the input")
        print("2. Scaling by √d_k prevents gradient vanishing in softmax")
        print("3. Multi-head attention learns different types of relationships")
        print("4. Residual connections help with gradient flow")
        print("5. Layer normalization stabilizes training")
        print("6. FFN adds non-linearity and increases model capacity")


# Run the demonstration
if __name__ == "__main__":
    # Small dimensions keep the printed matrices readable
    transformer = TransformerExplained(d_model=64, n_heads=4, d_ff=256, vocab_size=1000)

    # Walk through each building block with printed intermediate results
    transformer.demonstrate_attention()

    # Closing cheat sheet of the core equations, emitted as one block of text
    print(
        "\n".join(
            [
                "\n\n=== MATHEMATICAL SUMMARY ===",
                "-" * 50,
                "Core Transformer Equations:",
                "",
                "1. Scaled Dot-Product Attention:",
                "   Attention(Q,K,V) = softmax(QK^T/√d_k)V",
                "",
                "2. Multi-Head Attention:",
                "   MultiHead(Q,K,V) = Concat(head_1,...,head_h)W^O",
                "   where head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)",
                "",
                "3. Position-wise FFN:",
                "   FFN(x) = max(0, xW_1 + b_1)W_2 + b_2",
                "",
                "4. Positional Encoding:",
                "   PE(pos,2i) = sin(pos/10000^(2i/d_model))",
                "   PE(pos,2i+1) = cos(pos/10000^(2i/d_model))",
                "",
                "5. Layer Normalization:",
                "   LayerNorm(x) = γ * (x-μ)/σ + β",
            ]
        )
    )

=== TRANSFORMER ARCHITECTURE DEMONSTRATION ===

1. INPUT EMBEDDING AND POSITIONAL ENCODING
--------------------------------------------------
Token IDs: [101 253  37 102]
Embedding shape: (4, 64)
After adding positional encoding: (4, 64)

2. SELF-ATTENTION MECHANISM
--------------------------------------------------
Query (Q) shape: (4, 4)
Key (K) shape: (4, 4)
Value (V) shape: (4, 4)

Attention scores (QK^T/√d_k):
[[ 0.22537311 -0.51454192  0.63765767  0.37202592]
 [ 0.41856289  1.63407252 -0.05978984  0.25628838]
 [-0.4239514   0.27390864 -0.64142233 -2.16428676]
 [-0.61971087  0.6148871  -1.88436126 -1.83555439]]

Attention weights (after softmax):
[[0.24123296 0.11510538 0.36432549 0.27933616]
 [0.17117357 0.57719935 0.10609398 0.1455331 ]
 [0.25066046 0.50368915 0.20166912 0.04398127]
 [0.19937022 0.68523473 0.05628979 0.05910526]]
Row sums (should be 1): [1. 1. 1. 1.]

Attention output shape: (4, 4)

3. CAUSAL MASKING (for Decoder)
------------------------------------------------