# LayerNorm的原理
LayerNorm对每个样本的所有特征进行归一化处理：
- 计算所有特征的均值$\mu$和方差$\sigma^2$
- 归一化：$y=\cfrac{y-\mu}{\sqrt{\sigma^2 + \epsilon}}$
- 缩放和平移：$y=\gamma*y + \beta$（$\gamma$和$\beta$都是可以学习的参数）

# pre-norm和post-norm

## BatchNorm的原理

In [17]:
import torch
import torch.nn as nn
import numpy as np

## 1. pytorch LayerNorm

In [15]:
layer_norm = nn.LayerNorm(normalized_shape = 8)
input_tensor = torch.randn(3, 2, 8)
print(f"{input_tensor=}")

output_tensor = layer_norm(input_tensor)
print(f"{output_tensor=}")

input_tensor=tensor([[[ 0.2377,  2.6934, -0.8230,  0.0745,  1.0357, -0.6654,  0.0215,
           0.4228],
         [-0.8215, -0.6647,  1.2379, -1.4070,  0.3720,  0.9331, -0.0539,
           0.5375]],

        [[-0.2841,  0.6137,  0.9781, -0.0870, -0.0379,  0.2336,  0.3014,
          -0.0584],
         [-1.2883, -0.6146, -0.6878,  0.6659,  0.9041, -0.8015,  0.5935,
           0.2950]],

        [[ 0.1814,  0.4158,  0.8810,  0.7697, -0.8246, -0.5746, -0.2295,
           1.5199],
         [ 0.0037, -1.3735, -0.0878, -0.8832, -0.6735, -0.2526,  1.2561,
           0.5447]]])
output_tensor=tensor([[[-0.1324,  2.2409, -1.1574, -0.2901,  0.6388, -1.0051, -0.3413,
           0.0466],
         [-0.9738, -0.7916,  1.4188, -1.6539,  0.4128,  1.0647, -0.0820,
           0.6050]],

        [[-1.2546,  1.0370,  1.9673, -0.7516, -0.6263,  0.0667,  0.2400,
          -0.6785],
         [-1.5214, -0.6466, -0.7416,  1.0163,  1.3256, -0.8893,  0.9223,
           0.5347]],

        [[-0.1163,  0.2006,  0.82

## 2. 基于numpy和pytorch实现LayerNorm

In [59]:
def layer_norm_np(x):
    """
    batch, seq_len, d_model
    """
    m = np.mean(x, axis=-1, keepdims=True)
    sigma = np.std(x, axis=2, keepdims=True)
    n = (a - m)/sigma
    return n
    
layer_norm_np(input_tensor.numpy())

array([[[-1.3235222e-01,  2.2409232e+00, -1.1574111e+00, -2.9011825e-01,
          6.3881660e-01, -1.0051035e+00, -3.4130612e-01,  4.6551220e-02],
        [ 2.5678658e-01,  3.1096728e+00, -9.7542435e-01,  6.7137904e-02,
          1.1837994e+00, -7.9233730e-01,  5.6056031e-03,  4.7184429e-01]],

       [[ 7.7306367e-02,  6.3458858e+00, -2.6302016e+00, -3.3940420e-01,
          2.1142097e+00, -2.2279088e+00, -4.7460771e-01,  5.4984754e-01],
        [ 4.6021900e-01,  3.6491699e+00, -9.1714418e-01,  2.4823001e-01,
          1.4964322e+00, -7.1248984e-01,  1.7944930e-01,  7.0061010e-01]],

       [[-4.0149562e-02,  3.2803783e+00, -1.4743431e+00, -2.6088524e-01,
          1.0388181e+00, -1.2612447e+00, -3.3250386e-01,  2.1016012e-01],
        [ 5.4358447e-01,  3.7147055e+00, -8.2607758e-01,  3.3278081e-01,
          1.5740042e+00, -6.2256747e-01,  2.6438469e-01,  7.8263152e-01]]],
      dtype=float32)

#### 有偏方差和无偏方差

- unbiased = True（默认）无偏方差：使用自由度n-1进行归一化$var = \cfrac{\sum(x - mean)^2}{n-1}$
- unbiased = False有偏方差：使用元素数量n进行归一化$var = \cfrac{\sum(x - mean)^2}{n}$

In [60]:
def layer_norm_torch(x):
    """
    batch, seq_len, d_model
    """
    m = input_tensor.mean(dim=-1, keepdim=True)
    std = input_tensor.std(dim=-1, keepdim=True, unbiased = False)
    n = (input_tensor - m) / std
    return n

In [61]:
layer_norm_torch(input_tensor)

tensor([[[-0.1324,  2.2409, -1.1574, -0.2901,  0.6388, -1.0051, -0.3413,
           0.0466],
         [-0.9738, -0.7916,  1.4188, -1.6539,  0.4128,  1.0647, -0.0820,
           0.6051]],

        [[-1.2547,  1.0370,  1.9674, -0.7517, -0.6263,  0.0667,  0.2400,
          -0.6785],
         [-1.5214, -0.6466, -0.7416,  1.0163,  1.3256, -0.8893,  0.9223,
           0.5347]],

        [[-0.1163,  0.2006,  0.8298,  0.6792, -1.4766, -1.1385, -0.6719,
           1.6937],
         [ 0.2414, -1.5370,  0.1233, -0.9039, -0.6330, -0.0896,  1.8587,
           0.9401]]])