### Implementation of Multi-Head Attention

In [1]:
import numpy as np

# --- 1. Define Hyperparameters ---
S = 4          # Sequence length (number of tokens)
d_model = 6    # Dimensionality of the model/embeddings
d_k = d_model  # Dimensionality of Keys and Queries (for single head)
d_v = d_model  # Dimensionality of Values (for single head)

print(f"--- Hyperparameters ---")
print(f"Sequence Length (S): {S}")
print(f"Model/Embedding Dimension (d_model): {d_model}")
print(f"Key/Query Dimension (d_k): {d_k}")
print(f"Value Dimension (d_v): {d_v}\n")


# --- 2. Create Sample Input Data (Token Embeddings) ---
# In a real model, this would be the output of an embedding layer.
# For our purpose, we'll create a random matrix.
# We use a fixed seed for reproducibility.
np.random.seed(42)
X = np.random.randn(S, d_model)

print(f"--- Input Matrix X (Shape: {X.shape}) ---")
print("This matrix represents a sequence of 4 tokens, each with a 6-dimensional embedding.")
print(X)

--- Hyperparameters ---
Sequence Length (S): 4
Model/Embedding Dimension (d_model): 6
Key/Query Dimension (d_k): 6
Value Dimension (d_v): 6

--- Input Matrix X (Shape: (4, 6)) ---
This matrix represents a sequence of 4 tokens, each with a 6-dimensional embedding.
[[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696]
 [ 1.57921282  0.76743473 -0.46947439  0.54256004 -0.46341769 -0.46572975]
 [ 0.24196227 -1.91328024 -1.72491783 -0.56228753 -1.01283112  0.31424733]
 [-0.90802408 -1.4123037   1.46564877 -0.2257763   0.0675282  -1.42474819]]


In [2]:
# We continue from Step 1, using the variables S, d_model, d_k, d_v, and X.

# --- Part A: Linear Projections ---
# In a real model, these weight matrices are learned during training.
# We'll initialize them randomly for this demonstration.
np.random.seed(0) # Use a different seed for weights
W_Q = np.random.randn(d_model, d_k)
W_K = np.random.randn(d_model, d_k)
W_V = np.random.randn(d_model, d_v)

# Project X into Q, K, and V spaces
Q = X @ W_Q
K = X @ W_K
V = X @ W_V

print("--- Part A: Query, Key, and Value Matrices ---")
print(f"Shape of Q: {Q.shape}")
print(f"Shape of K: {K.shape}")
print(f"Shape of V: {V.shape}\n")


# --- Part B: Calculating Attention Scores ---
scores = Q @ K.T
# The @ operator in NumPy is used for matrix multiplication.
# K.T is the transpose of the Key matrix.

print("--- Part B: Raw Attention Scores (Q @ K.T) ---")
print(f"Shape of scores: {scores.shape}")
print("This [S, S] matrix shows the raw compatibility between each query (row) and each key (column).")
print(scores)
print("\n")


# --- Part C: Scaling ---
# The scaling factor prevents the dot products from becoming too large.
scaled_scores = scores / np.sqrt(d_k)

print("--- Part C: Scaled Attention Scores ---")
print(f"Shape of scaled_scores: {scaled_scores.shape}")
print(f"Scaling factor (sqrt(d_k)): {np.sqrt(d_k):.4f}")
print("Scores after dividing by the scaling factor.")
print(scaled_scores)
print("\n")


# --- Part D: Softmax Normalization ---
# We define a helper function for numerical stability, though it's less critical here.
def softmax(x, axis=-1):
    # Subtracting the max for numerical stability (prevents overflow)
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)

attention_weights = softmax(scaled_scores)

print("--- Part D: Attention Weights (after Softmax) ---")
print(f"Shape of attention_weights: {attention_weights.shape}")
print("Each row now represents a probability distribution and sums to 1.")
print("This matrix A tells us 'where to look'.")
print(attention_weights)
print(f"Sum of the first row of weights: {attention_weights[0].sum():.4f}") # Should be 1.0
print("\n")


# --- Part E: Weighted Sum of Values ---
# This is the final step where information is aggregated.
O = attention_weights @ V

print("--- Part E: Final Output Matrix O (A @ V) ---")
print(f"Shape of O: {O.shape}")
print("This is the final contextualized output. Each row is a weighted sum of all value vectors.")
print(O)

--- Part A: Query, Key, and Value Matrices ---
Shape of Q: (4, 6)
Shape of K: (4, 6)
Shape of V: (4, 6)

--- Part B: Raw Attention Scores (Q @ K.T) ---
Shape of scores: (4, 4)
This [S, S] matrix shows the raw compatibility between each query (row) and each key (column).
[[  2.51889084  -7.29616433  13.79556179  12.64020041]
 [  4.97930562   1.55709123  28.89221706  11.53667062]
 [ 14.57857978  14.59243409 -59.0346232  -17.30294836]
 [  0.92217982  -6.4552404   -4.47717683   5.1656522 ]]


--- Part C: Scaled Attention Scores ---
Shape of scaled_scores: (4, 4)
Scaling factor (sqrt(d_k)): 2.4495
Scores after dividing by the scaling factor.
[[  1.02833288  -2.97864661   5.63201452   5.16034021]
 [  2.03279301   0.63567983  11.79519822   4.70982606]
 [  5.95168027   5.95733627 -24.100784    -7.06389909]
 [  0.37647834  -2.63534086  -1.82779979   2.10886868]]


--- Part D: Attention Weights (after Softmax) ---
Shape of attention_weights: (4, 4)
Each row now represents a probability distribut

In [3]:
# --- Setup for Multi-Head Attention ---
# We use the same S and d_model from Step 1.
h = 3          # Number of attention heads
# d_k and d_v must be divisible by h
# We ensure d_model is divisible by h. Let's redefine it for this example.
d_model = 12
h = 3
d_k = d_v = d_model // h # d_k = d_v = 4

print(f"--- Multi-Head Attention Hyperparameters ---")
print(f"Number of Heads (h): {h}")
print(f"Model Dimension (d_model): {d_model}")
print(f"Key/Query/Value Dimension per Head (d_k, d_v): {d_k}\n")

# Recreate input X with the new d_model
np.random.seed(42)
X = np.random.randn(S, d_model)
print(f"--- New Input Matrix X (Shape: {X.shape}) ---")
print(X)
print("\n")

# --- Part A & B: Parallel Linear Projections ---
# We now need h sets of weight matrices.
# In a real implementation, this would be one large weight matrix
# that is then reshaped, but we'll create them separately for clarity.

W_Q_heads = []
W_K_heads = []
W_V_heads = []
heads_output = []

np.random.seed(1) # Seed for head weights
for i in range(h):
    W_Q_heads.append(np.random.randn(d_model, d_k))
    W_K_heads.append(np.random.randn(d_model, d_k))
    W_V_heads.append(np.random.randn(d_model, d_v))

print("--- Part A & B: Created projection matrices for each of the 3 heads ---\n")

# --- Part C: Parallel Attention Calculation ---
print("--- Part C: Calculating attention for each head in parallel ---\n")
for i in range(h):
    print(f"--- Processing Head {i+1} ---")
    # Project X for the current head
    Q_i = X @ W_Q_heads[i]
    K_i = X @ W_K_heads[i]
    V_i = X @ W_V_heads[i]
    print(f"Shape of Q_{i+1}, K_{i+1}, V_{i+1}: {Q_i.shape}, {K_i.shape}, {V_i.shape}")

    # Calculate attention scores for the current head
    scores_i = Q_i @ K_i.T
    scaled_scores_i = scores_i / np.sqrt(d_k)
    attention_weights_i = softmax(scaled_scores_i)

    # Calculate the output of the current head
    head_i_output = attention_weights_i @ V_i
    print(f"Shape of Head {i+1} output: {head_i_output.shape}\n")
    heads_output.append(head_i_output)

# --- Part D: Concatenation ---
# Concatenate the outputs of all heads along the last dimension
concatenated_heads = np.concatenate(heads_output, axis=-1)

print("--- Part D: Concatenation ---")
print("Shape of outputs from Head 1, 2, 3:", [h.shape for h in heads_output])
print(f"Shape of concatenated matrix: {concatenated_heads.shape}")
print("This matrix contains the combined knowledge from all heads.\n")

# --- Part E: Final Linear Projection ---
# Create the final output projection matrix W_O
# It maps the concatenated dimension back to d_model
np.random.seed(2)
W_O = np.random.randn(h * d_v, d_model)

# Compute the final multi-head attention output
multi_head_output = concatenated_heads @ W_O

print("--- Part E: Final Multi-Head Attention Output ---")
print(f"Shape of W_O: {W_O.shape}")
print(f"Shape of final output: {multi_head_output.shape}")
print("This is the final, context-aware output matrix, ready for the next layer.")
print(multi_head_output)

--- Multi-Head Attention Hyperparameters ---
Number of Heads (h): 3
Model Dimension (d_model): 12
Key/Query/Value Dimension per Head (d_k, d_v): 4

--- New Input Matrix X (Shape: (4, 12)) ---
[[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696
   1.57921282  0.76743473 -0.46947439  0.54256004 -0.46341769 -0.46572975]
 [ 0.24196227 -1.91328024 -1.72491783 -0.56228753 -1.01283112  0.31424733
  -0.90802408 -1.4123037   1.46564877 -0.2257763   0.0675282  -1.42474819]
 [-0.54438272  0.11092259 -1.15099358  0.37569802 -0.60063869 -0.29169375
  -0.60170661  1.85227818 -0.01349722 -1.05771093  0.82254491 -1.22084365]
 [ 0.2088636  -1.95967012 -1.32818605  0.19686124  0.73846658  0.17136828
  -0.11564828 -0.3011037  -1.47852199 -0.71984421 -0.46063877  1.05712223]]


--- Part A & B: Created projection matrices for each of the 3 heads ---

--- Part C: Calculating attention for each head in parallel ---

--- Processing Head 1 ---
Shape of Q_1, K_1, V_1: (4, 4), (4, 4), (4, 4

### Implementation of Transformer Architecture

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class ScaledDotProductAttention(nn.Module):
    """
    Computes the Scaled Dot-Product Attention as described in "Attention is All You Need".
    This module does not have any learnable parameters itself, but it's the core
    computation block for the Multi-Head Attention layer.
    """

    def __init__(self, dropout_rate=0.1):
        """
        Initializes the ScaledDotProductAttention module.

        Args:
            dropout_rate (float): The dropout probability to be applied to the attention
                                  weights. Default is 0.1.
        """
        # ----------------- Line-by-Line Explanation -----------------
        # 1. super(ScaledDotProductAttention, self).__init__()
        #    This is a standard and necessary line in the constructor of any PyTorch module.
        #    It calls the constructor of the parent class, nn.Module, which sets up
        #    the necessary internal state for the module (like tracking parameters and sub-modules).
        super(ScaledDotProductAttention, self).__init__()

        # 2. self.dropout = nn.Dropout(dropout_rate)
        #    Here, we initialize a Dropout layer. Dropout is a regularization technique
        #    to prevent overfitting. During training, it randomly sets a fraction of
        #    its input elements to zero with probability `dropout_rate`. This is applied
        #    to the attention weights before they are multiplied by the Value matrix.
        #    `nn.Dropout` is a module, so we instantiate it here.
        self.dropout = nn.Dropout(dropout_rate)


    def forward(self, query, key, value, mask=None):
        """
        Performs the forward pass of the attention mechanism.

        Args:
            query (torch.Tensor): The query tensor. Shape: [batch_size, n_heads, seq_len_q, d_k]
            key (torch.Tensor): The key tensor. Shape: [batch_size, n_heads, seq_len_k, d_k]
            value (torch.Tensor): The value tensor. Shape: [batch_size, n_heads, seq_len_v, d_v]
                                  Note: seq_len_k and seq_len_v must be the same.
            mask (torch.Tensor, optional): A mask tensor to prevent attention to certain positions.
                                           Shape should be broadcastable to the scores tensor's shape.
                                           Example shape: [batch_size, 1, 1, seq_len_k] for padding mask.
                                           Example shape: [batch_size, 1, seq_len_q, seq_len_k] for look-ahead mask.

        Returns:
            torch.Tensor: The output of the attention mechanism. Shape: [batch_size, n_heads, seq_len_q, d_v]
            torch.Tensor: The attention weights. Shape: [batch_size, n_heads, seq_len_q, seq_len_k]
        """
        # ----------------- Line-by-Line Explanation -----------------
        # 1. d_k = query.size(-1)
        #    We retrieve the dimensionality of the key/query vectors, d_k.
        #    `query.size()` returns a tuple of the tensor's dimensions.
        #    Using -1 as an index retrieves the size of the *last* dimension, which is d_k.
        #    This is robust and works regardless of the number of preceding dimensions (batch, heads).
        d_k = query.size(-1)

        # 2. scores = torch.matmul(query, key.transpose(-2, -1))
        #    This computes the dot product compatibility scores: Q * K^T.
        #    `torch.matmul` performs matrix multiplication.
        #    `key.transpose(-2, -1)` is crucial. It transposes the last two dimensions of the key tensor.
        #    For a key tensor of shape [batch, n_heads, seq_len_k, d_k], this transpose
        #    swaps the `seq_len_k` and `d_k` dimensions, resulting in a shape of
        #    [batch, n_heads, d_k, seq_len_k].
        #    Now, the matrix multiplication between `query` ([..., seq_len_q, d_k]) and the transposed `key`
        #    ([..., d_k, seq_len_k]) is mathematically valid and produces a `scores` tensor of shape
        #    [batch_size, n_heads, seq_len_q, seq_len_k].
        scores = torch.matmul(query, key.transpose(-2, -1))

        # 3. scores = scores / math.sqrt(d_k)
        #    Here, we apply the scaling factor as per the formula. We divide all the scores
        #    by the square root of d_k to stabilize the gradients.
        scores = scores / math.sqrt(d_k)

        # 4. if mask is not None:
        #    We check if a mask was provided. The mask is used to hide certain positions
        #    from the attention mechanism (e.g., padding tokens or future tokens in a decoder).
        if mask is not None:
            # 5. scores = scores.masked_fill(mask == 0, -1e9)
            #    `masked_fill` is a PyTorch operation that fills elements of a tensor with a
            #    specified value where a condition (the mask) is True.
            #    Our convention is that the mask has a value of 1 for positions we want to
            #    keep and 0 for positions we want to mask out.
            #    `mask == 0` creates a boolean tensor that is `True` for all positions to be masked.
            #    We fill these positions in the `scores` tensor with a very large negative number
            #    (-1e9, which is -1,000,000,000). This is a practical stand-in for negative infinity.
            #    When softmax is applied next, e^(-1e9) will be effectively zero.
            scores = scores.masked_fill(mask == 0, -1e9)

        # 6. p_attn = F.softmax(scores, dim=-1)
        #    We apply the softmax function to the scores to get the attention weights.
        #    `F.softmax` is the functional version of the softmax layer.
        #    `dim=-1` is critical. It specifies that the softmax operation should be
        #    applied along the last dimension of the `scores` tensor (the `seq_len_k` dimension).
        #    This normalizes the weights for each query independently, ensuring that for a given
        #    query, the sum of its attention weights across all keys is 1.
        p_attn = F.softmax(scores, dim=-1)

        # 7. p_attn = self.dropout(p_attn)
        #    We apply the dropout layer we initialized earlier to the attention weights.
        #    This helps in regularizing the model. During evaluation (`model.eval()`),
        #    dropout is automatically disabled.
        p_attn = self.dropout(p_attn)

        # 8. return torch.matmul(p_attn, value), p_attn
        #    Finally, we compute the weighted sum of the values by multiplying the attention
        #    weights `p_attn` ([..., seq_len_q, seq_len_k]) with the `value` tensor
        #    ([..., seq_len_v, d_v], where seq_len_k == seq_len_v).
        #    The result is a tensor of shape [batch_size, n_heads, seq_len_q, d_v].
        #    We also return the attention weights `p_attn`. This is very useful for
        #    debugging and for visualizing what the model is "paying attention to".
        output = torch.matmul(p_attn, value)

        return output, p_attn

In [5]:
# --- Configuration for our test run ---
batch_size = 2   # How many sequences we process at once
n_heads = 8      # Number of attention heads (we'll see this in the next step, for now it's just a dimension)
seq_len = 10     # Length of our input sequence
d_k = 64         # Dimension of Key/Query vectors (d_model / n_heads)
d_v = 64         # Dimension of Value vectors (d_model / n_heads)

# --- Create Dummy Tensors ---
# In a real Transformer, these would be the outputs of linear projection layers.
# torch.randn creates a tensor with random numbers from a standard normal distribution.
query = torch.randn(batch_size, n_heads, seq_len, d_k)
key = torch.randn(batch_size, n_heads, seq_len, d_k)
value = torch.randn(batch_size, n_heads, seq_len, d_v)

print("--- Input Tensor Shapes ---")
print(f"Query (Q) shape: {query.shape}")
print(f"Key (K)   shape: {key.shape}")
print(f"Value (V) shape: {value.shape}")
print("-" * 30)


# --- Demonstrate with a Padding Mask ---
# Let's pretend in our batch of 2 sequences, the first sequence has length 10
# and the second sequence has length 7 (and 3 padding tokens).
# The mask should be 1 for real tokens and 0 for padding tokens.
# Shape: [batch_size, 1, 1, seq_len] so it can be broadcast across heads and query length.
mask = torch.ones(batch_size, 1, 1, seq_len)
mask[1, :, :, 7:] = 0  # Set the last 3 tokens of the second sequence in the batch to 0 (masked)

print("--- Mask Tensor ---")
print(f"Mask shape: {mask.shape}")
print(f"Mask content for the second sequence:\n{mask[1]}")
print("-" * 30)


# --- Instantiate and run the attention module ---
attention_module = ScaledDotProductAttention(dropout_rate=0.1)
output, attention_weights = attention_module(query, key, value, mask=mask)


# --- Examine the Output ---
print("--- Output Tensor Shapes ---")
print(f"Output shape: {output.shape}")
print(f"Attention Weights shape: {attention_weights.shape}")
print("-" * 30)

# Let's verify the mask worked. The attention weights for the second sequence's
# queries (rows) should be zero for the last 3 keys (columns).
print("--- Verifying the Mask ---")
print("Attention weights for the first query of the second sequence (sum should be 1):")
print(attention_weights[1, 0, 0, :])
# The sum of weights for any given query should be 1.0
print(f"Sum of these weights: {torch.sum(attention_weights[1, 0, 0, :])}")

--- Input Tensor Shapes ---
Query (Q) shape: torch.Size([2, 8, 10, 64])
Key (K)   shape: torch.Size([2, 8, 10, 64])
Value (V) shape: torch.Size([2, 8, 10, 64])
------------------------------
--- Mask Tensor ---
Mask shape: torch.Size([2, 1, 1, 10])
Mask content for the second sequence:
tensor([[[1., 1., 1., 1., 1., 1., 1., 0., 0., 0.]]])
------------------------------
--- Output Tensor Shapes ---
Output shape: torch.Size([2, 8, 10, 64])
Attention Weights shape: torch.Size([2, 8, 10, 10])
------------------------------
--- Verifying the Mask ---
Attention weights for the first query of the second sequence (sum should be 1):
tensor([0.0892, 0.2347, 0.1509, 0.2802, 0.1258, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000])
Sum of these weights: 0.8808539509773254


In [8]:
import torch
import torch.nn as nn
import math
# We assume the ScaledDotProductAttention class from Step 1 is available
# from step1_scaled_dot_product_attention import ScaledDotProductAttention

class MultiHeadAttention(nn.Module):
    """
    Implements the Multi-Head Attention mechanism as described in "Attention is All You Need".
    This module contains the learnable linear projections for Q, K, V, and the final output.
    """
    def __init__(self, d_model, n_heads, dropout_rate=0.1):
        """
        Initializes the MultiHeadAttention module.

        Args:
            d_model (int): The dimensionality of the input and output vectors.
            n_heads (int): The number of attention heads.
            dropout_rate (float): The dropout probability.
        """
        # ----------------- Line-by-Line Explanation -----------------
        # 1. Standard nn.Module constructor call.
        super(MultiHeadAttention, self).__init__()

        # 2. assert d_model % n_heads == 0
        #    This is a critical sanity check. The model's architecture requires that
        #    the embedding dimension `d_model` can be evenly split among the attention heads.
        #    If not, the logic for reshaping and splitting dimensions will fail.
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        # 3. Store hyperparameters.
        self.d_model = d_model
        self.n_heads = n_heads
        # 4. self.d_k = d_model // n_heads
        #    Calculate the dimension of the key/query/value vectors for each head.
        #    For example, if d_model=512 and n_heads=8, then each head works with
        #    smaller 64-dimensional vectors.
        self.d_k = d_model // n_heads

        # 5. self.W_q = nn.Linear(d_model, d_model)
        #    Initialize the four learnable linear layers. `nn.Linear(in_features, out_features)`
        #    creates a fully connected layer that performs a matrix multiplication (W*x + b).
        #    - W_q: Projects the input query into a representation for all heads.
        #    - W_k: Projects the input key into a representation for all heads.
        #    - W_v: Projects the input value into a representation for all heads.
        #    - W_o: The final output projection layer that combines the heads' outputs.
        #    We use `d_model` as both input and output size for the first three layers
        #    because we will handle the splitting into heads via reshaping.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        # 6. self.attention = ScaledDotProductAttention(dropout_rate)
        #    Instantiate the attention mechanism module we built in Step 1. This module
        #    will perform the core computation for all heads in parallel.
        self.attention = ScaledDotProductAttention(dropout_rate)

    def forward(self, query, key, value, mask=None):
        """
        Performs the forward pass of the Multi-Head Attention mechanism.

        Args:
            query (torch.Tensor): The query tensor. Shape: [batch_size, seq_len_q, d_model]
            key (torch.Tensor): The key tensor. Shape: [batch_size, seq_len_k, d_model]
            value (torch.Tensor): The value tensor. Shape: [batch_size, seq_len_v, d_model]
            mask (torch.Tensor, optional): A mask tensor.

        Returns:
            torch.Tensor: The output of the MHA mechanism. Shape: [batch_size, seq_len_q, d_model]
            torch.Tensor: The attention weights. Shape: [batch_size, n_heads, seq_len_q, seq_len_k]
        """
        # ----------------- Line-by-Line Explanation -----------------
        # 1. batch_size = query.size(0)
        #    Get the batch size from the first dimension of the query tensor. This will be
        #    needed for reshaping the tensors later.
        batch_size = query.size(0)

        # 2. q_proj = self.W_q(query)
        #    Perform the initial linear projections for Q, K, and V.
        #    Input query shape:  [batch_size, seq_len_q, d_model]
        #    Output q_proj shape: [batch_size, seq_len_q, d_model]
        q_proj = self.W_q(query)
        k_proj = self.W_k(key)
        v_proj = self.W_v(value)

        # 3. q_split = q_proj.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        #    *** This is the crucial "split into separated heads" step. ***
        #    a) .view(batch_size, -1, self.n_heads, self.d_k): Reshapes the tensor.
        #       - `batch_size` remains the same.
        #       - `-1` tells PyTorch to automatically infer the sequence length (seq_len_q).
        #       - The `d_model` dimension is split into `n_heads` and `d_k`.
        #       - Shape becomes: [batch_size, seq_len_q, n_heads, d_k]
        #    b) .transpose(1, 2): Swaps the second and third dimensions.
        #       - This brings the `n_heads` dimension forward.
        #       - Final shape: [batch_size, n_heads, seq_len_q, d_k]
        #    Now the tensor is ready for our ScaledDotProductAttention module. Each of the `n_heads`
        #    "slices" along the second dimension is an independent query matrix for one head.
        q_split = q_proj.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        k_split = k_proj.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        v_split = v_proj.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

        # 4. if mask is not None: ...
        #    The mask needs to be properly shaped to be broadcastable to the attention scores.
        #    The scores will have shape [batch_size, n_heads, seq_len_q, seq_len_k].
        #    A typical padding mask might have shape [batch_size, 1, 1, seq_len_k].
        #    `unsqueeze(1)` adds a new dimension for the `n_heads`, allowing the same
        #    mask to be applied to all heads.
        if mask is not None:
            mask = mask.unsqueeze(1)

        # 5. context, attention_weights = self.attention(q_split, k_split, v_split, mask=mask)
        #    Apply Scaled Dot-Product Attention.
        #    - `context` (the output) will have shape: [batch_size, n_heads, seq_len_q, d_k]
        #    - `attention_weights` will have shape: [batch_size, n_heads, seq_len_q, seq_len_k]
        context, attention_weights = self.attention(q_split, k_split, v_split, mask=mask)

        # 6. context_concat = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        #    *** This is the "concatenate the heads" step. ***
        #    a) .transpose(1, 2): Reverses the earlier transpose, swapping `n_heads` and `seq_len_q`.
        #       - Shape becomes: [batch_size, seq_len_q, n_heads, d_k]
        #    b) .contiguous(): This is an important memory layout operation. A `transpose`
        #       operation can make a tensor non-contiguous in memory. `view` requires a
        #       contiguous tensor, so we call `.contiguous()` to create a contiguous copy.
        #    c) .view(batch_size, -1, self.d_model): Reshapes the tensor back.
        #       - It merges the `n_heads` and `d_k` dimensions back into the single `d_model` dimension.
        #       - This effectively concatenates the outputs of all heads.
        #       - Final shape: [batch_size, seq_len_q, d_model]
        context_concat = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        # 7. output = self.W_o(context_concat)
        #    Apply the final linear projection layer to mix the information from all heads.
        #    The shape remains [batch_size, seq_len_q, d_model].
        output = self.W_o(context_concat)

        # 8. Return the final output and the attention weights for analysis.
        return output, attention_weights

In [10]:
# --- Configuration for our test run ---
batch_size = 2
seq_len = 5
d_model = 512
n_heads = 8

# --- Create Dummy Tensors for a self-attention scenario ---
# In self-attention, Q, K, and V are all the same. This tensor represents
# the output from the previous layer (e.g., the input embedding layer).
x = torch.randn(batch_size, seq_len, d_model)

# The mask will be for padding. Let's say the first sequence has length 5,
# and the second has length 3 (2 padding tokens).
# Shape: [batch_size, 1, seq_len]
mask = torch.ones(batch_size, 1, seq_len)
mask[1, :, 3:] = 0 # Mask out the last 2 tokens of the second sequence

# --- Instantiate and run the MHA module ---
mha_module = MultiHeadAttention(d_model=d_model, n_heads=n_heads)
output, attention_weights = mha_module(query=x, key=x, value=x, mask=mask)


# --- Examine the Information Flow via Tensor Shapes ---
print("--- MHA Information Flow ---")
print(f"1. Initial Input (X) shape: \t\t{x.shape}")

# Inside the forward pass...
q_proj = mha_module.W_q(x)
print(f"2. After Linear Projection (e.g., Q_proj) shape: {q_proj.shape}")

q_split = q_proj.view(batch_size, -1, n_heads, d_model // n_heads).transpose(1, 2)
print(f"3. After Splitting into Heads (e.g., Q_split) shape: {q_split.shape}")

context, _ = mha_module.attention(q_split, q_split, q_split, mask.unsqueeze(1))
print(f"4. After Scaled Dot-Product Attention (context) shape: {context.shape}")

context_concat = context.transpose(1, 2).contiguous().view(batch_size, -1, d_model)
print(f"5. After Concatenating Heads (context_concat) shape: {context_concat.shape}")

final_output = mha_module.W_o(context_concat)
print(f"6. After Final Linear Projection (Output) shape: {final_output.shape}")
print("-" * 30)

print("--- Final Output Shapes ---")
print(f"Final MHA Output shape: {output.shape}")
print(f"Attention Weights shape: {attention_weights.shape}")

# Let's check the mask again on the final attention weights.
# The weights for the second sequence should be zero for the 4th and 5th key positions.
print("\n--- Verifying Mask on Final Attention Weights ---")
print("Attention weights for the 1st query of the 2nd sequence in the batch (head 0):")
print(attention_weights[1, 0, 0, :])
print(f"Sum of these weights: {torch.sum(attention_weights[1, 0, 0, :])}")

--- MHA Information Flow ---
1. Initial Input (X) shape: 		torch.Size([2, 5, 512])
2. After Linear Projection (e.g., Q_proj) shape: torch.Size([2, 5, 512])
3. After Splitting into Heads (e.g., Q_split) shape: torch.Size([2, 8, 5, 64])
4. After Scaled Dot-Product Attention (context) shape: torch.Size([2, 8, 5, 64])
5. After Concatenating Heads (context_concat) shape: torch.Size([2, 5, 512])
6. After Final Linear Projection (Output) shape: torch.Size([2, 5, 512])
------------------------------
--- Final Output Shapes ---
Final MHA Output shape: torch.Size([2, 5, 512])
Attention Weights shape: torch.Size([2, 8, 5, 5])

--- Verifying Mask on Final Attention Weights ---
Attention weights for the 1st query of the 2nd sequence in the batch (head 0):
tensor([0.5158, 0.3034, 0.2919, 0.0000, 0.0000], grad_fn=<SliceBackward0>)
Sum of these weights: 1.1111112833023071


In [11]:
import torch
import torch.nn as nn

class PositionwiseFeedForward(nn.Module):
    """
    Implements the Position-wise Feed-Forward Network (FFN) sub-layer of the Transformer.
    This consists of two linear transformations with a ReLU activation in between.
    """
    def __init__(self, d_model, d_ff, dropout_rate=0.1):
        """
        Initializes the PositionwiseFeedForward module.

        Args:
            d_model (int): The dimensionality of the input and output vectors.
            d_ff (int): The dimensionality of the inner-layer (the "feed-forward" dimension).
            dropout_rate (float): The dropout probability.
        """
        # ----------------- Line-by-Line Explanation -----------------
        # 1. Standard nn.Module constructor call.
        super(PositionwiseFeedForward, self).__init__()

        # 2. self.linear1 = nn.Linear(d_model, d_ff)
        #    This is the first linear layer. It takes an input of size `d_model`
        #    and projects it to the larger, inner dimension `d_ff`.
        self.linear1 = nn.Linear(d_model, d_ff)

        # 3. self.linear2 = nn.Linear(d_ff, d_model)
        #    This is the second linear layer. It takes the `d_ff`-dimensional output
        #    from the first layer and projects it back down to the original `d_model` size.
        self.linear2 = nn.Linear(d_ff, d_model)

        # 4. self.relu = nn.ReLU()
        #    We instantiate the ReLU activation function. This will be applied element-wise
        #    to the output of the first linear layer. It introduces non-linearity by
        #    clamping all negative values to zero: f(x) = max(0, x).
        self.relu = nn.ReLU()

        # 5. self.dropout = nn.Dropout(dropout_rate)
        #    Initialize a Dropout layer. It is typically applied after the activation function
        #    to regularize the network and prevent overfitting.
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """
        Performs the forward pass of the FFN.

        Args:
            x (torch.Tensor): The input tensor from the previous sub-layer.
                              Shape: [batch_size, seq_len, d_model]

        Returns:
            torch.Tensor: The output tensor. Shape: [batch_size, seq_len, d_model]
        """
        # ----------------- Line-by-Line Explanation -----------------
        # The input `x` has shape [batch_size, seq_len, d_model].
        # Because we use `nn.Linear`, PyTorch automatically applies the same linear
        # transformation to every vector along the `seq_len` dimension, which is exactly
        # what "position-wise" means.

        # 1. x = self.linear1(x)
        #    The input `x` is passed through the first linear layer.
        #    Shape changes from [batch_size, seq_len, d_model] to [batch_size, seq_len, d_ff].
        x = self.linear1(x)

        # 2. x = self.relu(x)
        #    The ReLU activation is applied element-wise. The shape remains the same.
        #    Shape: [batch_size, seq_len, d_ff].
        x = self.relu(x)

        # 3. x = self.dropout(x)
        #    Dropout is applied to the activations. Shape remains the same.
        #    Shape: [batch_size, seq_len, d_ff].
        x = self.dropout(x)

        # 4. x = self.linear2(x)
        #    The result is passed through the second linear layer, projecting it back.
        #    Shape changes from [batch_size, seq_len, d_ff] to [batch_size, seq_len, d_model].
        x = self.linear2(x)

        # 5. return x
        #    Return the final processed tensor.
        return x

In [12]:
# --- Configuration for our test run ---
batch_size = 2
seq_len = 5
d_model = 512
d_ff = 2048  # Standard practice: d_ff is 4 * d_model

# --- Create a Dummy Input Tensor ---
# This simulates the output from the previous sub-layer.
# Shape: [batch_size, seq_len, d_model]
input_tensor = torch.randn(batch_size, seq_len, d_model)

# --- Instantiate and run the FFN module ---
ffn_module = PositionwiseFeedForward(d_model=d_model, d_ff=d_ff)
output_tensor = ffn_module(input_tensor)


# --- Examine the Information Flow via Tensor Shapes ---
print("--- FFN Information Flow ---")
print(f"1. Initial Input shape: \t\t{input_tensor.shape}")

# Let's trace the shapes inside the forward pass manually to see the flow clearly
intermediate_tensor = ffn_module.linear1(input_tensor)
print(f"2. After 1st Linear Layer (Expansion): \t{intermediate_tensor.shape}")

intermediate_tensor = ffn_module.relu(intermediate_tensor)
print(f"3. After ReLU Activation: \t\t{intermediate_tensor.shape}")

# Dropout is applied here but doesn't change the shape.

final_tensor = ffn_module.linear2(intermediate_tensor)
print(f"4. After 2nd Linear Layer (Projection): {final_tensor.shape}")
print("-" * 30)

print("--- Final Output Shape ---")
print(f"Final FFN Output shape: {output_tensor.shape}")

--- FFN Information Flow ---
1. Initial Input shape: 		torch.Size([2, 5, 512])
2. After 1st Linear Layer (Expansion): 	torch.Size([2, 5, 2048])
3. After ReLU Activation: 		torch.Size([2, 5, 2048])
4. After 2nd Linear Layer (Projection): torch.Size([2, 5, 512])
------------------------------
--- Final Output Shape ---
Final FFN Output shape: torch.Size([2, 5, 512])


In [13]:
import torch
import torch.nn as nn
# We assume the classes from previous steps are available
# from step2_multi_head_attention import MultiHeadAttention
# from step3_positionwise_feed_forward import PositionwiseFeedForward

class EncoderLayer(nn.Module):
    """
    Implements a single Encoder Layer of the Transformer.
    It consists of a Multi-Head Self-Attention sub-layer and a Position-wise
    Feed-Forward Network sub-layer. Each sub-layer is followed by a
    residual connection and layer normalization.
    """
    def __init__(self, d_model, n_heads, d_ff, dropout_rate=0.1):
        """
        Initializes the EncoderLayer.

        Args:
            d_model (int): The dimensionality of the input and output vectors.
            n_heads (int): The number of attention heads.
            d_ff (int): The dimensionality of the inner-layer of the FFN.
            dropout_rate (float): The dropout probability.
        """
        # ----------------- Line-by-Line Explanation -----------------
        # 1. Standard nn.Module constructor call.
        super(EncoderLayer, self).__init__()

        # 2. self.self_attention = MultiHeadAttention(...)
        #    Instantiate the Multi-Head Attention module we built in Step 2. This will
        #    be our first sub-layer.
        self.self_attention = MultiHeadAttention(d_model, n_heads, dropout_rate)

        # 3. self.feed_forward = PositionwiseFeedForward(...)
        #    Instantiate the Position-wise Feed-Forward Network module from Step 3. This
        #    will be our second sub-layer.
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout_rate)

        # 4. self.norm1 = nn.LayerNorm(d_model)
        #    Initialize the two Layer Normalization modules. `nn.LayerNorm` takes the size
        #    of the dimension to normalize over, which is `d_model`. PyTorch handles the
        #    calculation of mean/std and the learnable gamma/beta parameters internally.
        #    - `norm1` is for the output of the attention sub-layer.
        #    - `norm2` is for the output of the FFN sub-layer.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # 5. self.dropout = nn.Dropout(dropout_rate)
        #    We use dropout on the output of each sub-layer *before* the residual connection.
        #    This is a common implementation choice following the original paper's design.
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask):
        """
        Performs the forward pass of the EncoderLayer.

        Args:
            x (torch.Tensor): The input tensor from the previous layer.
                              Shape: [batch_size, seq_len, d_model]
            mask (torch.Tensor): The mask for the self-attention mechanism.

        Returns:
            torch.Tensor: The output tensor. Shape: [batch_size, seq_len, d_model]
        """
        # ----------------- Line-by-Line Explanation -----------------
        # ----- Sub-layer 1: Multi-Head Self-Attention -----

        # 1. residual = x
        #    Store the original input `x` for the first residual connection.
        residual = x

        # 2. x, _ = self.self_attention(query=x, key=x, value=x, mask=mask)
        #    Pass the input `x` through the self-attention layer. For self-attention,
        #    the query, key, and value are all the same tensor. We also pass the mask.
        #    The MHA module returns the output and the attention weights. We only need
        #    the output for the forward pass, so we can ignore the weights with `_`.
        #    Output `x` shape: [batch_size, seq_len, d_model]
        x, _ = self.self_attention(query=x, key=x, value=x, mask=mask)

        # 3. x = self.dropout(x)
        #    Apply dropout to the output of the attention layer.
        x = self.dropout(x)

        # 4. x = self.norm1(x + residual)
        #    This is the "Add & Norm" step.
        #    a) `x + residual`: The residual connection is performed. The output of the
        #       sub-layer is added to its original input.
        #    b) `self.norm1(...)`: The result is passed through the first layer normalization module.
        attention_output = x

        # ----- Sub-layer 2: Position-wise Feed-Forward Network -----

        # 5. residual = x
        #    Store the output from the first sub-layer (`attention_output`) for the
        #    second residual connection.
        residual = attention_output

        # 6. x = self.feed_forward(x)
        #    Pass the tensor through the FFN.
        #    Output `x` shape: [batch_size, seq_len, d_model]
        x = self.feed_forward(attention_output)

        # 7. x = self.dropout(x)
        #    Apply dropout to the output of the FFN.
        x = self.dropout(x)

        # 8. x = self.norm2(x + residual)
        #    Perform the second "Add & Norm" step.
        #    a) `x + residual`: Add the FFN output to its input.
        #    b) `self.norm2(...)`: Normalize the result.
        ffn_output = x

        # 9. return x
        #    Return the final output of the encoder layer.
        return ffn_output