In [1]:
import torch
from transformers import LlamaForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [101]:
"""
Llama-3.2-1B model structure
"""

model_name = "meta-llama/Llama-3.2-1B"

# The tokenizer and the causal-LM weights are both resolved from the same
# checkpoint name; the two loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_hf = LlamaForCausalLM.from_pretrained(model_name)

# Printing the module lists its nested sub-modules (embedding, decoder
# layers, norms), which is the structure this cell is meant to show.
print(model_hf)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [5]:
torch.manual_seed(42)

# Shape of the randomly generated input batch
batch_size = 1
sequence_length = 4  # adjust this length as needed

# Draw random token IDs in the half-open range [0, tokenizer.vocab_size)
random_input = torch.randint(
    low=0,
    high=tokenizer.vocab_size,
    size=(batch_size, sequence_length),
)

# Show the generated input
print("Random Input Tensor:")
print(random_input)

Random Input Tensor:
tensor([[61542, 70067, 86876,  6414]])


In [4]:
"""
RMSNorm
input: (batch, sequence_length, embedding_size)
algorithm:
    formula: RMS(x) = sqrt(mean(x_i^2) + epsilon)
    output: norm_x = (x / RMS(x)) * gamma
"""
eps = 1e-6
gamma = torch.tensor([2, 2, 2])
x = torch.tensor([[[1, 2, 3], [1, 2, 3]]], dtype=torch.float32)
print(x.shape)

# Mean of squares over the last (feature) axis; keepdim=True leaves a size-1
# axis so the result broadcasts against x in the division below.
RMS = x.pow(2).mean(dim=-1, keepdim=True).add(eps).sqrt()
print(RMS.shape)

# Normalise each feature vector, then apply the per-feature scale gamma.
out = (x / RMS) * gamma
print(out)

torch.Size([1, 2, 3])
torch.Size([1, 2, 1])
tensor([[[0.9258, 1.8516, 2.7775],
         [0.9258, 1.8516, 2.7775]]])


In [78]:
"""
RoPE (rotary position embedding): build the sin/cos tables
"""
hidden_state = torch.tensor(
    [[[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]]],
    dtype=torch.float32,
)
b, t, c = hidden_state.shape

# Column vector of positions 0..t-1, shape (t, 1).
position = torch.arange(t, dtype=torch.float32)[:, None]

# One frequency per feature pair: theta_i = 10000^(-2i / c), i in [0, c/2).
ids = torch.arange(c // 2, dtype=torch.float)
theta = torch.pow(10000, -2 * ids / c)

# Broadcasting (t, 1) * (c/2,) gives the angle position * theta_i, shape (t, c/2).
embeddings = position * theta

# Tile the half-width tables along the feature axis so they are (t, c) and
# laid out as [theta_0..theta_{c/2-1}, theta_0..theta_{c/2-1}].
sin_embeddings = torch.cat([embeddings.sin()] * 2, dim=-1)
cos_embeddings = torch.cat([embeddings.cos()] * 2, dim=-1)

In [86]:
def half_rotate(x):
    """Swap the two halves of the last axis, negating the second half.

    The last dimension is split at ``x.shape[-1] // 2`` into ``(x1, x2)``
    and the result is ``(-x2, x1)`` concatenated along that same axis.

    Args:
        x: tensor whose last axis is split in two.

    Returns:
        Tensor of the same shape as ``x``.
    """
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat([back.neg(), front], dim=-1)

In [95]:
# Prepend a size-1 batch axis so the (t, c) tables broadcast against the
# (b, t, c) hidden_state.
cos = cos_embeddings[None, ...]
sin = sin_embeddings[None, ...]
print(cos)
print(sin)

tensor([[[ 1.0000,  1.0000,  1.0000,  1.0000],
         [ 0.5403,  0.9999,  0.5403,  0.9999],
         [-0.4161,  0.9998, -0.4161,  0.9998]]])
tensor([[[0.0000, 0.0000, 0.0000, 0.0000],
         [0.8415, 0.0100, 0.8415, 0.0100],
         [0.9093, 0.0200, 0.9093, 0.0200]]])


In [98]:
# Apply the rotation: every feature is scaled by its cosine, and its
# half-rotated partner by the matching sine.
out = hidden_state * cos + half_rotate(hidden_state) * sin

tensor([[[ 1.,  2.,  3.,  4.],
         [ 4.,  5.,  6.,  7.],
         [ 7.,  8.,  9., 10.]]])

In [100]:
# NOTE(review): looks like a hand check of out[0, 1, 2] — hidden_state[0, 1, 2]
# is 6 and is paired with the printed cos value 0.5403, while
# half_rotate(hidden_state)[0, 1, 2] is 4 and is paired with the printed sin
# value 0.8415. Confirm against the actual `out` tensor.
6 * 0.5403 + 4 * 0.8415

6.6078