In [1]:
import torch

In [2]:
import torch.nn as nn

## Applying Layer Normalization on a sample batch

In [3]:
# Seed the global RNG first so that both the random batch and the layer's
# initial weights come out the same on every run.
torch.manual_seed(123)

# Toy input: 2 rows with 5 features each.
batch = torch.randn(2, 5)

# Tiny network: a 5 -> 6 linear projection followed by ReLU.
layer = nn.Sequential(
    nn.Linear(in_features=5, out_features=6),
    nn.ReLU(),
)
output = layer(batch)

In [4]:
# Display the layer's activations for the sample batch; ReLU clamps negative
# pre-activations to exactly 0.
output

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)

In [34]:
# Per-row statistics over the last (feature) dimension. keepdim=True keeps a
# trailing axis of size 1 so the results broadcast against `output`.
mean = torch.mean(output, dim=-1, keepdim=True)
# NOTE: var() applies Bessel's correction by default, whereas the LayerNorm
# class defined further down passes unbiased=False.
var = torch.var(output, dim=-1, keepdim=True)

print(mean, var)

tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>) tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [35]:
# Standardize each row: subtract its mean, then divide by its standard deviation.
# No epsilon is added here, so a row with zero variance would divide by zero.
norm_out = (output - mean) / var.sqrt()

In [36]:
# Display the standardized activations.
norm_out

tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)

In [37]:
# Sanity check: recompute the per-row statistics on the standardized
# activations, using the same default variance estimator as before.
nmean = torch.mean(norm_out, dim=-1, keepdim=True)
nvar = torch.var(norm_out, dim=-1, keepdim=True)

print(nmean, nvar)

tensor([[9.9341e-09],
        [0.0000e+00]], grad_fn=<MeanBackward1>) tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


## Making the LayerNorm class

In [23]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension of the input.

    Every slice along the last axis is shifted to zero mean and scaled by its
    biased (population) standard deviation, then passed through a learnable
    element-wise affine transform ``scale * x_hat + shift``.

    Args:
        embedding_dim: Length of the ``scale`` and ``shift`` parameter
            vectors; they are multiplied/added against the last dimension
            of the input.
        eps: Small constant added to the variance before taking the square
            root, so a zero-variance slice does not cause a division by
            zero. Defaults to ``1e-5``.
    """

    def __init__(self, embedding_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialised to the identity transform: scale of ones, shift of zeros.
        self.scale = nn.Parameter(torch.ones(embedding_dim))
        self.shift = nn.Parameter(torch.zeros(embedding_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension and apply scale/shift.

        Args:
            x: Input tensor; statistics are taken over ``dim=-1``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # Biased variance (divide by N rather than N - 1).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)

        return self.scale * norm_x + self.shift


In [24]:
# Run the LayerNorm module on the raw batch and confirm that every row ends up
# with (approximately) zero mean and unit biased variance.
ln = LayerNorm(5)
out_ln = ln(batch)

# NOTE: `mean` and `var` rebind the names used by the earlier manual
# normalization cell.
mean = torch.mean(out_ln, dim=-1, keepdim=True)
var = torch.var(out_ln, dim=-1, keepdim=True, unbiased=False)

print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[-2.9802e-08],
        [ 0.0000e+00]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
