# The difference between LayerNorm and RMSNorm

LayerNorm: $ y = \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} * \gamma + \beta $

RMSNorm: $ y = \frac{x}{\sqrt{\frac{1}{d}\sum_{i=1}^{d} x_i^2 + \epsilon}} * \gamma $

* mean/var 都是 reduce（归约）操作；设 x 的 shape 为 (a, b, c)：
    * x.mean(dim=0) => 返回的 shape 是 (b, c)
    * x.mean(dim=0, keepdim=True) => (1, b, c)：方便 broadcast
    * x.mean(dim=(0, 1)) => 返回的 shape 是 (c,)
    * x.mean(dim=(0, 1), keepdim=True) => (1, 1, c)：方便 broadcast

In [1]:
import torch
import torch.nn as nn

# Taken from the LLaMA source code
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learnable gain.

    Args:
        dim: Length of the 1-D ``weight`` parameter (initialized to ones).
        eps: Constant added to the mean square before taking the reciprocal
            square root.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def _norm(self, x):  # leading underscore marks it private by convention only
        """Scale ``x`` by the reciprocal RMS of its last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)  # reciprocal of the square root
        return x * inv_rms

    def forward(self, x):
        """Normalize in float32, cast back to the dtype of ``x``, then apply the gain."""
        normed = self._norm(x.float())
        restored = normed.type_as(x)  # back to the same dtype as the input x
        return restored * self.weight

In [3]:
# Compact RMS normalization (no learnable parameters)
def my_rms(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    """Divide ``x`` by the root mean square of its last dimension.

    Args:
        x: Input tensor; the mean square is reduced over the last dimension.
        eps: Constant added to the mean square before the square root, so an
            all-zero row does not divide by zero. Defaults to ``1e-5``, the
            value that was previously hard-coded.

    Returns:
        A tensor with the same shape as ``x``.
    """
    # keepdim=True keeps the reduced axis so the result broadcasts against x.
    mean_square = torch.mean(x ** 2, dim=-1, keepdim=True)
    return x / torch.sqrt(mean_square + eps)

In [4]:
# Quick check: run the module and the one-liner on the same random input
batch_size = 2
seq_len = 3
hidden_size = 4
x = torch.randn((batch_size, seq_len, hidden_size))
rms = RMSNorm(dim=hidden_size)
rms(x), my_rms(x)

(tensor([[[ 0.3321,  1.1807, -1.3575, -0.8080],
          [ 0.6407, -1.0380,  1.1444, -1.0965],
          [-0.1252, -0.3735, -1.0646,  1.6466]],
 
         [[-1.4560, -0.2368,  0.1816, -1.3383],
          [-0.8618,  0.7659, -1.5711, -0.4498],
          [-1.2662, -0.4932, -0.9945,  1.0791]]], grad_fn=<MulBackward0>),
 tensor([[[ 0.3321,  1.1807, -1.3575, -0.8080],
          [ 0.6407, -1.0380,  1.1444, -1.0965],
          [-0.1252, -0.3735, -1.0646,  1.6466]],
 
         [[-1.4560, -0.2368,  0.1816, -1.3383],
          [-0.8618,  0.7659, -1.5711, -0.4498],
          [-1.2662, -0.4932, -0.9945,  1.0791]]]))