<a href="https://colab.research.google.com/github/Darshanbreddy/LLM/blob/main/Layer_normilization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn


# Step 1: Create a batch of data
# Let's say batch_size=2, features=4

In [3]:
# Toy batch: 2 samples (rows) x 4 features (columns), holding the values
# 1.0 through 8.0 in row-major order.
x = torch.arange(1.0, 9.0).reshape(2, 4)

print("Original Input:", x, sep="\n")

Original Input:
tensor([[1., 2., 3., 4.],
        [5., 6., 7., 8.]])


# Step 2: Print mean and variance across features for each sample



In [4]:
# Per-sample statistics: reduce over the feature axis (dim=1) and keep that
# axis with size 1. unbiased=False selects the population variance (divide by N).
mean = torch.mean(x, dim=1, keepdim=True)
var = torch.var(x, dim=1, unbiased=False, keepdim=True)

print("\nMean before LayerNorm (per sample):", mean, sep="\n")
print("\nVariance before LayerNorm (per sample):", var, sep="\n")


Mean before LayerNorm (per sample):
tensor([[2.5000],
        [6.5000]])

Variance before LayerNorm (per sample):
tensor([[1.2500],
        [1.2500]])


# Step 3: Apply Layer Normalization
# Normalize over last dimension (features)

In [5]:
# Build PyTorch's built-in LayerNorm over the feature axis (size x.shape[1]).
# The module gets its own name so it cannot be confused with -- or silently
# replaced by -- the hand-written `layer_norm` function defined further down
# in this notebook when cells are re-run out of order.
torch_layer_norm = nn.LayerNorm(normalized_shape=x.shape[1])
x_norm = torch_layer_norm(x)

print("\nAfter Layer Normalization:")
print(x_norm)


After Layer Normalization:
tensor([[-1.3416, -0.4472,  0.4472,  1.3416],
        [-1.3416, -0.4472,  0.4472,  1.3416]],
       grad_fn=<NativeLayerNormBackward0>)


In [6]:
# Re-measure the per-sample mean and population variance on the normalized tensor.
mean_after = torch.mean(x_norm, dim=1, keepdim=True)
var_after = torch.var(x_norm, dim=1, unbiased=False, keepdim=True)

In [7]:
# Report the statistics of the normalized tensor: a blank line, a label,
# then the tensor itself on the following line(s).
print("\nMean after LayerNorm (per sample):", mean_after, sep="\n")
print("\nVariance after LayerNorm (per sample):", var_after, sep="\n")


Mean after LayerNorm (per sample):
tensor([[0.],
        [0.]], grad_fn=<MeanBackward1>)

Variance after LayerNorm (per sample):
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [8]:
# Seed the global RNG so this random batch -- and every statistic derived
# from it in later cells -- is identical on each Restart & Run All.
torch.manual_seed(42)
x = torch.randn(3, 5)  # 3 samples, 5 features each

print("Original Input:")
print(x)

Original Input:
tensor([[-0.9036,  0.2232, -1.6372, -0.0054,  0.4290],
        [-1.0337, -0.9208,  1.4874,  0.5683, -0.7738],
        [ 1.6702, -0.2791,  0.5996, -0.8356,  0.6950]])


In [9]:
def layer_norm(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    """
    Apply Layer Normalization across features (last dimension) for each sample.

    Statistics are reduced over dim=-1, so the input may have any number of
    leading dimensions, e.g. (batch_size, num_features) or
    (batch_size, seq_len, num_features). No learnable scale or shift is applied.

    Args:
        x (Tensor): input tensor of shape (..., num_features)
        eps (float): small value added to the variance to avoid division by zero

    Returns:
        Tensor: normalized tensor of same shape
    """
    # dim=-1 always targets the feature axis; a hard-coded dim=1 would only be
    # the feature axis for 2-D inputs.
    mean = x.mean(dim=-1, keepdim=True)                # Per-sample mean
    var = x.var(dim=-1, keepdim=True, unbiased=False)  # Per-sample population variance
    return (x - mean) / torch.sqrt(var + eps)          # Normalize

In [10]:
# Per-sample mean and population variance of x, measured before normalizing.
mean_before = torch.mean(x, dim=1, keepdim=True)
var_before = torch.var(x, dim=1, keepdim=True, unbiased=False)

print("\nMean before LayerNorm (per sample):", mean_before, sep="\n")
print("\nVariance before LayerNorm (per sample):", var_before, sep="\n")


Mean before LayerNorm (per sample):
tensor([[-0.3788],
        [-0.1345],
        [ 0.3700]])

Variance before LayerNorm (per sample):
tensor([[0.6027],
        [0.9920],
        [0.7447]])


In [11]:
# Normalize every sample of x with the layer_norm function.
x_norm = layer_norm(x)

print("\nAfter Layer Normalization:", x_norm, sep="\n")


After Layer Normalization:
tensor([[-0.6760,  0.7755, -1.6210,  0.4810,  1.0405],
        [-0.9028, -0.7894,  1.6284,  0.7057, -0.6418],
        [ 1.5066, -0.7522,  0.2660, -1.3970,  0.3766]])


In [12]:
# Check the result: per-sample mean and population variance of the
# normalized tensor.
mean_after = torch.mean(x_norm, dim=1, keepdim=True)
var_after = torch.var(x_norm, dim=1, keepdim=True, unbiased=False)

print("\nMean after LayerNorm (per sample):", mean_after, sep="\n")
print("\nVariance after LayerNorm (per sample):", var_after, sep="\n")



Mean after LayerNorm (per sample):
tensor([[-1.1921e-08],
        [-5.9605e-08],
        [ 2.3842e-08]])

Variance after LayerNorm (per sample):
tensor([[1.0000],
        [1.0000],
        [1.0000]])


eps=1e-5: A small constant added to the variance before taking the square root; it prevents division by zero when all features of a sample are identical (variance 0).

keepdim=True: Keeps the reduced dimension as a size-1 axis (shape (batch_size, 1) instead of (batch_size,)), so the mean and variance broadcast correctly against the input.

unbiased=False: Computes population variance (divides by N) instead of sample variance (N-1), aligning with standard LayerNorm behavior.