## 4.2 In Book

In [19]:
import torch
import torch.nn as nn

In [6]:
# Toy input: a 3x6 tensor of standard-normal samples. Each row is treated
# below as one embedding vector to be normalized.
outputs = torch.randn(3, 6)
outputs

tensor([[-0.9529, -1.4324, -0.3990,  0.8220,  1.1236, -0.2656],
        [ 0.3344, -0.0415,  0.6060, -1.7030, -0.1129, -1.3708],
        [ 0.2600,  1.0295,  0.9568, -1.7038, -0.0626, -1.1690]])

In [10]:
# We want to normalize each row (embedding vector) independently, so reduce
# over the last dimension. keepdim=True keeps the result as shape (3, 1) so it
# broadcasts against `outputs` in the subtraction later.

mean = outputs.mean(dim =-1, keepdim=True)
mean

tensor([[-0.1840],
        [-0.3813],
        [-0.1148]])

In [11]:
# Per-row standard deviation. NOTE: with default arguments torch.std applies
# Bessel's correction (divides by n - 1), whereas the LayerNorm class further
# down uses the population variance (unbiased=False, divides by n).
sd = outputs.std(dim = -1, keepdim=True)
sd

tensor([[0.9924],
        [0.9382],
        [1.1169]])

In [None]:
# Compute z-scores: subtract each row's mean and divide by that row's standard
# deviation, so every row ends up with mean 0 and standard deviation 1.
normalized_outputs = (outputs - mean)/sd
normalized_outputs

tensor([[-0.7747, -1.2579, -0.2166,  1.0137,  1.3177, -0.0822],
        [ 0.7629,  0.3622,  1.0524, -1.4089,  0.2861, -1.0548],
        [ 0.3356,  1.0246,  0.9595, -1.4227,  0.0468, -0.9438]])

In [17]:
# Sanity check: each row's mean should be (numerically) zero after normalization.
normalized_outputs.mean(dim=-1, keepdim=True)

tensor([[     0.0000],
        [    -0.0000],
        [     0.0000]])

In [14]:
# Sanity check: each row's standard deviation should be 1 after normalization.
normalized_outputs.std(dim=-1, keepdim=True)

tensor([[1.],
        [1.],
        [1.]])

In [None]:
# Turn off scientific notation when printing tensors (display only; the stored
# values are unchanged), so near-zero results are easier to read.
torch.set_printoptions(sci_mode=False)

In [18]:
# Actual layer norm class
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable affine map.

    Each vector along the last dimension is standardized to zero mean and unit
    (population) variance, then multiplied element-wise by ``scale`` and
    offset by ``shift``.

    Args:
        emb_dim: Size of the last dimension of the input (embedding width).
        eps: Small constant added to the variance before the square root so
            the division stays numerically stable when the variance is close
            to zero. Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim: int, eps: float = 1e-5) -> None:
        super().__init__()
        self.eps = eps
        # Trainable per-feature gain and bias. They start at 1 and 0 so the
        # layer is initially a pure z-score; training can then rescale or
        # re-centre the features if plain standardization is not optimal.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` along its last dimension and apply scale/shift.

        Args:
            x: Tensor whose last dimension has size ``emb_dim``.

        Returns:
            A tensor of the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # Population variance (divide by n, no Bessel correction).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        # eps keeps the denominator away from zero for (near-)constant rows.
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift