In [3]:
import torch
import torch.nn as nn

# We define an input batch of 2 samples with 5 features each and send it through a linear layer (5 -> 6) followed by a ReLU activation, which produces a 2x6 output

In [4]:
# Seed first so both the sampled batch and the layer's initial weights are reproducible.
torch.manual_seed(123)

# A batch of 2 samples with 5 features each.
inputs = torch.randn(2, 5)

# Linear projection from 5 to 6 features, followed by a ReLU.
layer = nn.Sequential(
    nn.Linear(in_features=5, out_features=6),
    nn.ReLU(),
)
output = layer(inputs)

In [6]:
output.shape

torch.Size([2, 6])

In [7]:
# Per-sample statistics over the feature axis (the last axis of the 2-D `output`).
# keepdim=True keeps a size-1 trailing axis so the results broadcast against `output`.
# Note: `var` uses torch's default variance estimator here (no `unbiased=False`).
mean = output.mean(dim=-1, keepdim=True)
var = output.var(dim=-1, keepdim=True)

In [8]:
mean

tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)

In [9]:
var

tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)

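# This is the formula for Layer Norm: $\hat{x} = \dfrac{x - \mu}{\sqrt{\sigma^2}}$, where $\mu$ and $\sigma^2$ are the mean and variance of each row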

In [10]:
# Centre each row on its mean, then divide by its standard deviation.
output_norm = (output - mean) / var.sqrt()

In [11]:
output_norm

tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)

In [None]:
# Sanity check: every normalized row should have a variance of 1.
output_norm.var(dim=-1, keepdim=True)

tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)

In [17]:
# Sanity check: every normalized row should have a mean of (approximately) 0.
output_norm.mean(dim=-1, keepdim=True)


tensor([[9.9341e-09],
        [0.0000e+00]], grad_fn=<MeanBackward1>)

In [19]:
# The mean printed above is not exactly zero because of floating-point rounding
# (a value such as 9.9341e-09 is effectively zero); it is not a Python error.
# Turning off scientific notation only changes how tensors are printed, so such
# tiny values are displayed as 0.0000 -- the stored values themselves are unchanged.
torch.set_printoptions(sci_mode=False)

In [22]:
# With scientific notation disabled the per-row mean now prints as 0.
# Together with the variance check (which printed 1), this shows that we have
# successfully achieved layer-norm behaviour: standard deviation 1 and mean 0.
output_norm.mean(dim=-1, keepdim=True)

tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)

# Layer normalization helps with two issues during training:
1. Convergence: it helps training converge much faster.
2. Stability: it makes learning more stable by reducing the vanishing- and exploding-gradient problems.

# Let's code a Layer Norm class

In [36]:
class LayerNorm(nn.Module):
    """Layer normalization over the last (feature) dimension.

    Each feature vector is shifted to zero mean and scaled to unit variance
    (using the biased variance estimate), then passed through a learnable
    per-feature affine transform: ``scale * x_hat + shift``.

    Args:
        embedding_dim: Size of the last dimension of the inputs; also the
            length of the learnable ``scale`` and ``shift`` vectors.
        eps: Small constant added to the variance *inside* the square root so
            that a zero-variance input does not cause a division by zero.

    Attributes:
        eps: The numerical-stability constant.
        shift: Learnable additive offset, initialised to zeros.
        scale: Learnable multiplicative gain, initialised to ones.
    """

    def __init__(self, embedding_dim, eps=1e-5):
        super().__init__()

        self.eps = eps
        # Zeros/ones initialisation makes the affine step start as the identity.
        self.shift = nn.Parameter(torch.zeros(embedding_dim))
        self.scale = nn.Parameter(torch.ones(embedding_dim))

    def forward(self, y):
        """Normalize ``y`` along its last dimension.

        Args:
            y: Tensor whose last dimension has size ``embedding_dim``; any
                number of leading dimensions is allowed.

        Returns:
            Tensor of the same shape as ``y``.
        """
        # Normalize over the last axis so the statistics line up with the
        # per-feature `scale`/`shift`, regardless of how many leading axes exist.
        mean = y.mean(dim=-1, keepdim=True)
        var = y.var(dim=-1, keepdim=True, unbiased=False)

        # eps belongs inside the square root: it keeps the denominator non-zero
        # and must not be added to the normalized output itself.
        out_norm = (y - mean) / torch.sqrt(var + self.eps)

        return self.scale * out_norm + self.shift
        

# We defined three attributes: shift, scale and eps (epsilon)
`eps` is a small constant meant to guard against division by zero when the variance is 0, which would otherwise break the normalization.
`scale` and `shift` are trainable parameters with one value per input feature (the embedding dimension); they are adjusted during training whenever doing so improves the model's performance.

In [37]:
inputs


tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])

In [38]:
# The feature size is the length of the last axis of `inputs`.
embed_dim = inputs.size(-1)
lay_n = LayerNorm(embed_dim)

In [40]:
# Apply the LayerNorm module to the raw `inputs` batch.
# Note: this rebinds `output_norm`, which previously held the manual
# normalization of `output`; re-running earlier cells afterwards will see the new value.
output_norm = lay_n(inputs)

In [41]:
# Sanity check: after LayerNorm each row's mean should be approximately 0.
output_norm.mean(dim=-1, keepdim=True)

tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)