Here, we will learn about layer normalization.

Layer Normalization is a technique used in machine learning and artificial intelligence to normalize the inputs of a neural network layer. It ensures that the inputs have a consistent distribution and reduces the internal covariate shift problem that can occur during training.

The main benefits of layer normalization are:
*    Stabilizes training by reducing internal covariate shift.
*    Improves gradient flow, preventing vanishing or exploding gradients.
*    Better suited for sequential data compared to batch normalization.
*    Eliminates dependency on batch size, enabling small batch training.
*    Enhances convergence speed and model performance.
*    Normalizes across each layer's activations rather than across the batch.
*    Helps models generalize better by reducing overfitting.





In [2]:
import torch
from torch import nn
# B = batch size, S = sequence length, E = embedding dimension
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])

B, S, E = inputs.size()
# Swap the batch and sequence axes to get an (S, B, E) layout.
# permute() moves whole axes; reshape() only reinterprets the flat memory,
# which gives the same result here (B == 1) but would mix values from
# different sequences as soon as B > 1.
inputs = inputs.permute(1, 0, 2)
inputs.size()


torch.Size([2, 1, 3])

In [3]:
# Display the tensor after it was rearranged to the (S, B, E) layout.
inputs

tensor([[[0.2000, 0.1000, 0.3000]],

        [[0.5000, 0.1000, 0.1000]]])

In [18]:
# gamma (scale) and beta (shift) are the learnable parameters. They start
# as ones and zeros, so the affine step is initially the identity.
# Their shape covers the last two axes of `inputs`.
parameters_shape = inputs.shape[-2:]
gamma = nn.Parameter(torch.ones(parameters_shape))
beta = nn.Parameter(torch.zeros(parameters_shape))


In [8]:
# Inspect both parameters together with their shapes.
gamma, gamma.shape, beta, beta.shape

(Parameter containing:
 tensor([[1., 1., 1.]], requires_grad=True),
 torch.Size([1, 3]),
 Parameter containing:
 tensor([[0., 0., 0.]], requires_grad=True),
 torch.Size([1, 3]))

In [9]:
# Why dims? It lists, counted from the end, the axes that gamma/beta span
# (one negative index per entry of parameters_shape). The mean and the
# variance are reduced over exactly these axes.
dims = list(range(-1, -len(parameters_shape) - 1, -1))
dims

[-1, -2]

In [10]:
# Now let's calculate the mean over the normalized axes.
# keepdim=True keeps those axes with size 1 so the result broadcasts
# against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean

tensor([[[0.2000]],

        [[0.2333]]])

In [12]:
# Now let's calculate the variance: the mean of the squared deviations
# (biased estimator, no Bessel correction).
variance = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
# epsilon keeps the square root (and the later division) away from zero.
epsilon = 1e-5
SD = torch.sqrt(variance + epsilon)
SD


tensor([[[0.0817]],

        [[0.1886]]])

In [15]:
# Standardize: zero mean and unit variance over the normalized axes.
y = (inputs - mean) / SD

In [17]:
# Apply the learnable affine transform (scale by gamma, shift by beta).
output = gamma * y + beta
output

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

In [35]:
# Now let's do all of this using a class

class LayerNormalization(nn.Module):
    """Layer normalization over the trailing ``len(parameters_shape)`` axes.

    Every slice spanning those trailing axes is shifted to zero mean and
    scaled to unit (biased) variance, then passed through the learnable
    affine transform ``gamma * y + beta``.

    Args:
        parameters_shape: Shape of the trailing axes to normalize over. It is
            also the shape of the ``gamma`` and ``beta`` parameters.
        eps: Small constant added to the variance before the square root so
            the division never hits zero.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        # Scale starts at one and shift at zero, so the affine step is
        # initially the identity.
        self.gamma = nn.Parameter(torch.ones(self.parameters_shape))
        self.beta = nn.Parameter(torch.zeros(self.parameters_shape))

    def forward(self, input):
        """Normalize ``input`` and print each intermediate tensor.

        Args:
            input: Tensor whose trailing axes match ``parameters_shape``.
                The name shadows the builtin only inside this method and is
                kept so existing keyword callers keep working.

        Returns:
            Tensor of the same shape as ``input``.
        """
        # Negative indices of the axes to reduce over, e.g. [-1] or [-1, -2].
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]

        # All statistics are computed from the `input` argument, never from
        # a module-level variable, so the result depends only on what the
        # caller passes in.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")

        # Biased variance (plain mean of squared deviations).
        variance = (input - mean).pow(2).mean(dim=dims, keepdim=True)
        SD = (variance + self.eps).sqrt()
        print(f"Standard Deviation \n ({SD.size()}): \n {SD}")

        y = (input - mean) / SD
        print(f"y \n ({y.size()}) = \n {y}")

        output = self.gamma * y + self.beta
        print(f"out \n ({output.size()}) = \n {output}")

        return output


In [36]:

# Random example batch laid out as (sentence_length, batch_size, embedding_dim).
sentence_length, batch_size, embedding_dim = 5, 3, 8
inputs = torch.randn(sentence_length, batch_size, embedding_dim)

In [37]:
# Print the shape and the values of the random input tensor.
print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 1.6015,  0.0672, -2.1790,  0.5983, -0.0319, -0.2776,  0.1698,
          -0.4997],
         [ 0.4115, -0.5325,  1.5310, -0.3325,  1.3894, -1.4370, -1.5425,
           1.4672],
         [ 1.5566,  0.1266, -0.8267,  1.4627, -0.8166,  0.3438,  0.7982,
           0.8133]],

        [[ 0.6343, -0.8681, -1.1520, -1.0864,  0.2574,  1.1658, -0.5718,
          -1.0948],
         [ 0.1874, -1.5801,  0.8071, -0.2802, -0.6323,  0.7004,  0.7074,
          -1.2898],
         [ 0.4410,  0.0846,  0.5472, -0.2635, -1.0006,  0.8925, -1.1859,
          -1.7671]],

        [[ 0.6052,  0.4204,  1.1780,  0.3145,  0.8102,  0.9300,  1.9683,
          -0.1992],
         [ 1.6946,  0.2956, -0.0725,  1.0973, -0.6916, -0.4401,  2.1340,
           0.1986],
         [-2.0556,  0.8018,  0.8110, -1.1832, -0.1048, -0.2836, -0.0252,
          -0.1234]],

        [[ 0.1273,  0.2326,  0.4985, -0.1163, -0.7773, -0.5526, -0.5036,
          -1.1406],
         [ 0.0606, -0.3598, 

In [38]:
# Normalize over the last axis only (the embedding dimension).
layer_norm = LayerNormalization(inputs.shape[-1:])

In [39]:
# Call the module itself rather than .forward(): nn.Module.__call__ runs any
# registered hooks and then dispatches to forward().
output = layer_norm(inputs)

Mean 
 (torch.Size([5, 3, 1])): 
 tensor([[[-0.0689],
         [ 0.1193],
         [ 0.4322]],

        [[-0.3394],
         [-0.1725],
         [-0.2815]],

        [[ 0.7534],
         [ 0.5270],
         [-0.2704]],

        [[-0.2790],
         [-0.2411],
         [-0.4966]],

        [[ 0.3767],
         [-0.4461],
         [-0.1413]]])
Standard Deviation 
 (torch.Size([5, 3, 1])): 
 tensor([[[0.9996],
         [1.1896],
         [0.8552]],

        [[0.8435],
         [0.8716],
         [0.8848]],

        [[0.6045],
         [0.9495],
         [0.8978]],

        [[0.5213],
         [0.8804],
         [0.7326]],

        [[0.8588],
         [0.6193],
         [0.8933]]])
y 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 1.6710,  0.1362, -2.1109,  0.6674,  0.0371, -0.2087,  0.2389,
          -0.4310],
         [ 0.2456, -0.5479,  1.1867, -0.3798,  1.0676, -1.3083, -1.3970,
           1.1331],
         [ 1.3147, -0.3574, -1.4721,  1.2050, -1.4603, -0.1035,  0.4279,
           0.4456]],



In [40]:
# Sanity check on the first sequence position: the mean should be ~0.
# Tensor.std() applies Bessel's correction by default, while the layer
# normalizes with the biased variance, so std comes out slightly above 1
# (about sqrt(24/23) for the 24 values in output[0]).
output[0].mean(), output[0].std()

(tensor(1.9868e-08, grad_fn=<MeanBackward0>),
 tensor(1.0215, grad_fn=<StdBackward0>))