In [1]:
import torch
import torch.nn as nn

In [7]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [14]:
class DummyGPTModel(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        ) 

        self.final_norm = DummyLayerNorm(cfg['"emb_dim'])
        self.out_head = nn.Linear(
            cfg['emb_dim'], cfg['vocab_size'], bias=False
        )

    

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits




class DummyTransformerBlock(nn.Module):
    
    def __init__(self) -> None:
        super().__init__()

    def forward(self, x):
        return x

class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()

    def forward(self, x):
        return x

In [17]:
import torch
import torch.nn as nn


class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        
        # Use a placeholder for TransformerBlock
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        
        # Use a placeholder for LayerNorm
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # A simple placeholder

    def forward(self, x):
        # This block does nothing and just returns its input.
        return x


class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        # The parameters here are just to mimic the LayerNorm interface.

    def forward(self, x):
        # This layer does nothing and just returns its input.
        return x

In [18]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))

batch = torch.stack(batch, dim=0)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [19]:
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print(f"Output shape: {logits.shape}")
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)


In [20]:
import torch
import torch.nn as nn

# Set the manual seed for random number generation to 123
# This ensures reproducibility; all random operations will produce the same results on every run
# No output here, just initializes the random state
torch.manual_seed(123)

# Generate a random tensor 'batch_example' of shape (2, 5)
# - 2: Batch size (number of examples)
# - 5: Number of features per example
# Values are sampled from a standard normal distribution (mean=0, std=1)
# Due to the seed, the exact values produced are:
# tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
#         [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])
# This tensor represents a small batch of data, e.g., inputs to a model
batch_example = torch.randn(2, 5)  # Labelled as #A in the original code, perhaps for reference later

# Define a sequential neural network layer
# nn.Sequential: Chains modules in order; output of one is input to the next
# - nn.Linear(5, 6): Linear (fully connected) layer
#   - Input features: 5 (matches the last dimension of batch_example)
#   - Output features: 6
#   - Applies: output = input @ weight.T + bias
#   - Weights and bias are randomly initialized (reproducibly due to seed)
# - nn.ReLU(): Rectified Linear Unit activation
#   - Applies element-wise: max(0, x) to introduce non-linearity
# No computation yet; this just builds the model structure
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())

# Pass the batch_example through the layer
# Step 1: Linear transformation
# - Input shape: (2, 5) -> Output shape: (2, 6)
# - Computations happen matrix-wise for the batch
# - Exact intermediate values after linear (before ReLU):
#   tensor([[ 0.2260,  0.3470, -0.4727,  0.2216, -0.1220, -0.8747],
#           [ 0.2133,  0.2394, -0.1502,  0.5198,  0.3297, -0.2985]])
# Step 2: ReLU activation
# - Replaces all negative values with 0, keeps positives unchanged
# - Final output shape remains (2, 6)
# - Exact values:
#   tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
#           [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]])
# The 'grad_fn' indicates this tensor is part of a computation graph for backpropagation
out = layer(batch_example)

# Print the output tensor
# This would display the final tensor as shown above in the console
print(out)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [None]:
import torch
import torch.nn as nn
torch.manual_seed(123)
batch_example = torch.randn(2,5)

layer = nn.Sequential(nn.Linear(5,6))

# out=layer(batch_example)

# out = layer.__call__(out)

out = layer.forward(batch_example)

print(out)

tensor([[ 0.2260,  0.3470, -0.4727,  0.2216, -0.1220, -0.8747],
        [ 0.2133,  0.2394, -0.1502,  0.5198,  0.3297, -0.2985]],
       grad_fn=<AddmmBackward0>)


In [None]:
batch_example = torch.randn(2, 5)  # 2 samples(batch), 5 features each

# Correct: nn.Linear(input_features, output_features)
layer = nn.Sequential(nn.Linear(5, 6))  # 5 input features, 6 output features

out = layer(batch_example)

print("Input shape:", batch_example.shape)
print("Output shape:", out.shape)
print("Output:", out)

Input shape: torch.Size([2, 5])
Output shape: torch.Size([2, 6])
Output: tensor([[ 0.2260,  0.3470, -0.4727,  0.2216, -0.1220, -0.8747],
        [ 0.2133,  0.2394, -0.1502,  0.5198,  0.3297, -0.2985]],
       grad_fn=<AddmmBackward0>)


In [25]:
batch_example = torch.randn(2, 5)

# Manual implementation
weight = torch.randn(6, 5)  # (output_features, input_features)
bias = torch.randn(6)       # (output_features)

# Matrix multiplication: (2,5) × (5,6) = (2,6)
out = torch.matmul(batch_example, weight.t()) + bias

print("Input shape:", batch_example.shape)
print("Weight shape:", weight.shape)
print("Output shape:", out.shape)

Input shape: torch.Size([2, 5])
Weight shape: torch.Size([6, 5])
Output shape: torch.Size([2, 6])


In [None]:
batch_example = torch.randn(2, 5)

# Create the linear layer
linear_layer = nn.Linear(5, 6)  # 5 in, 6 out

# Explicit forward pass
out = linear_layer.forward(batch_example)

print("Input shape:", batch_example.shape)
print("Layer weight shape:", linear_layer.weight.shape)  # Should be (6, 5)
print("Layer bias shape:", linear_layer.bias.shape)      # Should be (6,)
print("Output shape:", out.shape)

Input shape: torch.Size([2, 5])
Layer weight shape: torch.Size([6, 5])
Layer bias shape: torch.Size([6])
Output shape: torch.Size([2, 6])


In [28]:
out

tensor([[ 0.2260,  0.3470, -0.4727,  0.2216, -0.1220, -0.8747],
        [ 0.2133,  0.2394, -0.1502,  0.5198,  0.3297, -0.2985]],
       grad_fn=<AddmmBackward0>)

In [29]:
# Assume 'out' is the tensor from the previous code
# out: tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
#             [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]])
# Shape: (2, 6), where 2 is the batch size and 6 is the number of features per example

# Compute the mean along the last dimension (dim=-1, i.e., dim=1)
# - For each batch example (row), calculate the mean across the 6 features
# - keepdim=True ensures the output keeps the dimension as (2, 1) instead of (2,)
# - Calculation:
#   - First row: (0.2260 + 0.3470 + 0.0000 + 0.2216 + 0.0000 + 0.0000) / 6 = 0.7946 / 6 ≈ 0.1324
#   - Second row: (0.2133 + 0.2394 + 0.0000 + 0.5198 + 0.3297 + 0.0000) / 6 = 1.3022 / 6 ≈ 0.2170
# - Resulting mean tensor:
#   tensor([[0.1324],
#           [0.2170]])
mean = out.mean(dim=-1, keepdim=True)

# Compute the variance along the last dimension (dim=-1, i.e., dim=1)
# - Variance (unbiased, uses n-1 denominator by default in PyTorch):
#   - For each row, compute: sum((x_i - mean)^2) / (n-1), where n=6
#   - First row:
#     - Mean = 0.1324
#     - Deviations: [0.2260-0.1324, 0.3470-0.1324, 0.0000-0.1324, 0.2216-0.1324, 0.0000-0.1324, 0.0000-0.1324]
#                = [0.0936, 0.2146, -0.1324, 0.0892, -0.1324, -0.1324]
#     - Squared deviations: [0.008761, 0.046053, 0.017530, 0.007956, 0.017530, 0.017530]
#     - Sum of squared deviations: 0.11536
#     - Variance: 0.11536 / (6-1) = 0.11536 / 5 ≈ 0.0231
#   - Second row:
#     - Mean = 0.2170
#     - Deviations: [0.2133-0.2170, 0.2394-0.2170, 0.0000-0.2170, 0.5198-0.2170, 0.3297-0.2170, 0.0000-0.2170]
#                = [-0.0037, 0.0224, -0.2170, 0.3028, 0.1127, -0.2170]
#     - Squared deviations: [0.000014, 0.000502, 0.047089, 0.091688, 0.012701, 0.047089]
#     - Sum of squared deviations: 0.199083
#     - Variance: 0.199083 / (6-1) = 0.199083 / 5 ≈ 0.0398
# - Resulting variance tensor:
#   tensor([[0.0231],
#           [0.0398]])
var = out.var(dim=-1, keepdim=True)

# Print the mean tensor
# Output will be:
# Mean:
# tensor([[0.1324],
#         [0.2170]])
print("Mean:\n", mean)

# Print the variance tensor
# Output will be:
# Variance:
# tensor([[0.0231],
#         [0.0398]])
print("Variance:\n", var)

Mean:
 tensor([[-0.1125],
        [ 0.1423]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[0.2296],
        [0.0944]], grad_fn=<VarBackward0>)


In [None]:
# Assume inputs from previous code:
# out: tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
#             [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]])
# Shape: (2, 6)
# mean: tensor([[0.1324],
#               [0.2170]])
# Shape: (2, 1)
# var: tensor([[0.0231],
#              [0.0398]])
# Shape: (2, 1)

# Normalize the output: (out - mean) / sqrt(var)
# - This is the core of layer normalization (without epsilon or learnable parameters gamma, beta)
# - Broadcasting occurs: mean (2, 1) and var (2, 1) are applied to each feature in out (2, 6)
# - Step 1: Subtract mean
#   - First row: [0.2260-0.1324, 0.3470-0.1324, 0.0000-0.1324, 0.2216-0.1324, 0.0000-0.1324, 0.0000-0.1324]
#              = [0.0936, 0.2146, -0.1324, 0.0892, -0.1324, -0.1324]
#   - Second row: [0.2133-0.2170, 0.2394-0.2170, 0.0000-0.2170, 0.5198-0.2170, 0.3297-0.2170, 0.0000-0.2170]
#               = [-0.0037, 0.0224, -0.2170, 0.3028, 0.1127, -0.2170]
# - Step 2: Divide by sqrt(var)
#   - sqrt(var): [sqrt(0.0231) ≈ 0.1520, sqrt(0.0398) ≈ 0.1995]
#   - First row: [0.0936/0.1520, 0.2146/0.1520, -0.1324/0.1520, 0.0892/0.1520, -0.1324/0.1520, -0.1324/0.1520]
#              ≈ [0.6158, 1.4118, -0.8711, 0.5868, -0.8711, -0.8711]
#   - Second row: [-0.0037/0.1995, 0.0224/0.1995, -0.2170/0.1995, 0.3028/0.1995, 0.1127/0.1995, -0.2170/0.1995]
#               ≈ [-0.0185, 0.1123, -1.0877, 1.5173, 0.5649, -1.0877]
# - Resulting normalized tensor:
#   tensor([[ 0.6158,  1.4118, -0.8711,  0.5868, -0.8711, -0.8711],
#           [-0.0185,  0.1123, -1.0877,  1.5173,  0.5649, -1.0877]])
# - Shape remains (2, 6)
out_norm = (out - mean) / torch.sqrt(var)

# Compute the mean of the normalized output along the last dimension (dim=-1)
# - For each batch example, calculate the mean across the 6 features
# - keepdim=True ensures output shape is (2, 1)
# - Calculation:
#   - First row: (0.6158 + 1.4118 + (-0.8711) + 0.5868 + (-0.8711) + (-0.8711)) / 6
#              ≈ 0.0011 / 6 ≈ 0.0002 (very close to 0 due to normalization)
#   - Second row: (-0.0185 + 0.1123 + (-1.0877) + 1.5173 + 0.5649 + (-1.0877)) / 6
#               ≈ -0.0014 / 6 ≈ -0.0002 (very close to 0 due to normalization)
# - Resulting mean tensor:
#   tensor([[ 0.0002],
#           [-0.0002]])
# - Values are not exactly 0 due to floating-point precision
mean = out_norm.mean(dim=-1, keepdim=True)

# Compute the variance of the normalized output along the last dimension (dim=-1)
# - Variance (unbiased, n-1 denominator):
#   - For each row, compute: sum((x_i - mean)^2) / (n-1), where n=6
#   - First row:
#     - Mean ≈ 0.0002
#     - Deviations: [0.6158-0.0002, 1.4118-0.0002, -0.8711-0.0002, 0.5868-0.0002, -0.8711-0.0002, -0.8711-0.0002]
#                ≈ [0.6156, 1.4116, -0.8713, 0.5866, -0.8713, -0.8713]
#     - Squared deviations: [0.3790, 1.9936, 0.7592, 0.3441, 0.7592, 0.7592]
#     - Sum of squared deviations: 4.9943
#     - Variance: 4.9943 / (6-1) ≈ 0.9989 (close to 1 due to normalization)
#   - Second row:
#     - Mean ≈ -0.0002
#     - Deviations: [-0.0185-(-0.0002), 0.1123-(-0.0002), -1.0877-(-0.0002), 1.5173-(-0.0002), 0.5649-(-0.0002), -1.0877-(-0.0002)]
#                ≈ [-0.0183, 0.1125, -1.0875, 1.5175, 0.5651, -1.0875]
#     - Squared deviations: [0.0003, 0.0127, 1.1829, 2.3026, 0.3193, 1.1829]
#     - Sum of squared deviations: 5.0007
#     - Variance: 5.0007 / (6-1) ≈ 1.0001 (close to 1 due to normalization)
# - Resulting variance tensor:
#   tensor([[0.9989],
#           [1.0001]])
var = out_norm.var(dim=-1, keepdim=True)

# Print the normalized output tensor
# Output will be:
# Normalized layer outputs:
# tensor([[ 0.6158,  1.4118, -0.8711,  0.5868, -0.8711, -0.8711],
#         [-0.0185,  0.1123, -1.0877,  1.5173,  0.5649, -1.0877]])
print("Normalized layer outputs:\n", out_norm)

# Print the mean tensor
# Output will be:
# Mean:
# tensor([[ 0.0002],
#         [-0.0002]])
print("Mean:\n", mean)

# Print the variance tensor
# Output will be:
# Variance:
# tensor([[0.9989],
#         [1.0001]])
print("Variance:\n", var)

In [30]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [31]:

# Define the LayerNorm class, inheriting from nn.Module
class LayerNorm(nn.Module):
    # Constructor: Initialize the layer normalization module
    def __init__(self, emb_dim):
        # Call the parent class (nn.Module) constructor
        super().__init__()
        # Define epsilon for numerical stability in normalization
        # eps = 1e-5 prevents division by zero in sqrt(var + eps)
        self.eps = 1e-5
        # Define learnable scale parameter (gamma in layer norm)
        # Initialized as a tensor of ones, shape (emb_dim,)
        # nn.Parameter makes it trainable, e.g., for emb_dim=6: tensor([1., 1., 1., 1., 1., 1.])
        self.scale = nn.Parameter(torch.ones(emb_dim))
        # Define learnable shift parameter (beta in layer norm)
        # Initialized as a tensor of zeros, shape (emb_dim,)
        # For emb_dim=6: tensor([0., 0., 0., 0., 0., 0.])
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    # Forward pass: Define how input tensor x is processed
    def forward(self, x):
        # Compute mean across the last dimension (dim=-1, i.e., features)
        # For input shape (2, 6), output shape is (2, 1)
        # Using previous out tensor:
        # - First row: (0.2260 + 0.3470 + 0.0000 + 0.2216 + 0.0000 + 0.0000) / 6 ≈ 0.1324
        # - Second row: (0.2133 + 0.2394 + 0.0000 + 0.5198 + 0.3297 + 0.0000) / 6 ≈ 0.2170
        # Result: tensor([[0.1324], [0.2170]])
        mean = x.mean(dim=-1, keepdim=True)
        
        # Compute variance across the last dimension (dim=-1)
        # unbiased=False uses n (not n-1) in denominator for population variance
        # Formula: sum((x_i - mean)^2) / n, where n=6
        # - First row:
        #   - Deviations: [0.2260-0.1324, 0.3470-0.1324, ..., 0.0000-0.1324] = [0.0936, 0.2146, -0.1324, 0.0892, -0.1324, -0.1324]
        #   - Squared: [0.008761, 0.046053, 0.017530, 0.007956, 0.017530, 0.017530]
        #   - Sum: 0.11536
        #   - Variance: 0.11536 / 6 ≈ 0.0192
        # - Second row:
        #   - Deviations: [-0.0037, 0.0224, -0.2170, 0.3028, 0.1127, -0.2170]
        #   - Squared: [0.000014, 0.000502, 0.047089, 0.091688, 0.012701, 0.047089]
        #   - Sum: 0.199083
        #   - Variance: 0.199083 / 6 ≈ 0.0332
        # Result: tensor([[0.0192], [0.0332]])
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        
        # Normalize: (x - mean) / sqrt(var + eps)
        # - Add eps=1e-5 to variance for numerical stability
        # - sqrt(var + eps): [sqrt(0.0192 + 1e-5) ≈ 0.1386, sqrt(0.0332 + 1e-5) ≈ 0.1822]
        # - Subtract mean (broadcasting (2, 1) to (2, 6)):
        #   - First row: [0.0936, 0.2146, -0.1324, 0.0892, -0.1324, -0.1324]
        #   - Second row: [-0.0037, 0.0224, -0.2170, 0.3028, 0.1127, -0.2170]
        # - Divide by sqrt(var + eps):
        #   - First row: [0.0936/0.1386, 0.2146/0.1386, -0.1324/0.1386, 0.0892/0.1386, -0.1324/0.1386, -0.1324/0.1386]
        #              ≈ [0.6753, 1.5483, -0.9554, 0.6437, -0.9554, -0.9554]
        #   - Second row: [-0.0037/0.1822, 0.0224/0.1822, -0.2170/0.1822, 0.3028/0.1822, 0.1127/0.1822, -0.2170/0.1822]
        #               ≈ [-0.0203, 0.1229, -1.1909, 1.6615, 0.6184, -1.1909]
        # - Result: tensor([[ 0.6753,  1.5483, -0.9554,  0.6437, -0.9554, -0.9554],
        #                  [-0.0203,  0.1229, -1.1909,  1.6615,  0.6184, -1.1909]])
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        
        # Apply learnable scale and shift
        # - scale: tensor([1., 1., 1., 1., 1., 1.]) (shape (6,))
        # - shift: tensor([0., 0., 0., 0., 0., 0.]) (shape (6,))
        # - Broadcasting: scale and shift are applied to each feature across the batch
        # - Since scale=1 and shift=0, the output is unchanged: norm_x * 1 + 0 = norm_x
        # - Output shape: (2, 6)
        # - Final output:
        #   tensor([[ 0.6753,  1.5483, -0.9554,  0.6437, -0.9554, -0.9554],
        #           [-0.0203,  0.1229, -1.1909,  1.6615,  0.6184, -1.1909]])
        return self.scale * norm_x + self.shift

# Example usage (not in the provided code, but to complete the dry run):
# Instantiate LayerNorm with emb_dim=6 (since out has 6 features)
layer_norm = LayerNorm(emb_dim=6)

# Pass the previous out tensor through the layer
# out: tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
#              [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]])
out_normalized = layer_norm(out)

# The output would be as computed above:
# tensor([[ 0.6753,  1.5483, -0.9554,  0.6437, -0.9554, -0.9554],
#         [-0.0203,  0.1229, -1.1909,  1.6615,  0.6184, -1.1909]])

# To verify, compute mean and variance of out_normalized (similar to previous code):
mean = out_normalized.mean(dim=-1, keepdim=True)
# - First row: (0.6753 + 1.5483 + (-0.9554) + 0.6437 + (-0.9554) + (-0.9554)) / 6 ≈ 0.0002
# - Second row: (-0.0203 + 0.1229 + (-1.1909) + 1.6615 + 0.6184 + (-1.1909)) / 6 ≈ 0.0001
# Result: tensor([[0.0002], [0.0001]]) (very close to 0)

var = out_normalized.var(dim=-1, keepdim=True, unbiased=False)
# - First row: sum([0.6753-0.0002, ...]^2) / 6 ≈ 1.0000
# - Second row: sum([-0.0203-0.0001, ...]^2) / 6 ≈ 1.0000
# Result: tensor([[1.0000], [1.0000]]) (very close to 1)

In [33]:
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)
torch.set_printoptions(sci_mode=False)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
