In [1]:
import torch
import torch.nn as nn
import numpy as np
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer: reuse the vocabulary of the pretrained bert-base-chinese model.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Location of the pre-tokenized corpus; replace with your own file path.
save_path = '/egr/research-slim/liangqi1/LLM/transformer-study/data/encoded_texts.pt'

# The saved object is read below through the `.input_ids` attribute, so it is a pickled
# Python object (presumably a tokenizer BatchEncoding -- confirm), not a plain tensor or
# state_dict. Full unpickling is therefore required: passing weights_only=False explicitly
# keeps this cell working on PyTorch >= 2.6, where the default switched to True, and
# silences the FutureWarning emitted by earlier 2.x releases.
# NOTE(security): unpickling can execute arbitrary code -- only load files you trust.
encoded_texts = torch.load(save_path, weights_only=False)

vocab_size = tokenizer.vocab_size  # 21128 (size of the pretrained BERT vocabulary)
embedding_dim = 256  # 256-dim embeddings here; smaller than BERT's 768

# Embedding layer: maps every token id to a trainable embedding_dim-sized vector.
embedding_layer = nn.Embedding(vocab_size, embedding_dim)
sample = encoded_texts.input_ids[0]  # token ids of the first encoded text
embedded_tokens = embedding_layer(sample)  # one embedding row per token of the sample

  encoded_texts = torch.load(save_path)


#### Positional Encoding

✅ **Positional Encoding** is a **(seq_len, embedding_dim)** shaped matrix used for encoding token positions.  
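✅ It is computed with **sin/cos functions** at different frequencies (high-frequency dimensions change quickly with position, low-frequency ones slowly), which helps the model capture both **short-range and long-range dependencies**.  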
✅ The **Positional Encoding** is added element-wise to the **token embeddings** before they enter the Transformer, allowing the model to perceive **token order information**.  
✅ The **code example computes a (512, 256) Positional Encoding** and adds it to the embedding result.  
✅ **Positional Encoding visualization** helps observe the pattern of **sin/cos values changing with token positions**.

---

✅ Positional Encoding 是一个 (seq_len, embedding_dim) 形状的矩阵，用于为 token 位置编码。<br>
✅ 它的计算基于 sin/cos 函数，确保模型能学习短期和长期依赖关系。<br>
✅ Transformer 的 embedding 结果会加上 Positional Encoding，使模型感知到 token 的顺序信息。<br>
✅ 代码示例计算了 (512, 256) 的 Positional Encoding 并将其加到 embedding 结果上。<br>
✅ 可视化 Positional Encoding 变化，观察随 token 位置变化的 sin/cos 规律。<br>



Final Output = Embedded Tokens (512, 256) + Positional Encoding (512, 256)


In [11]:
# Build the sinusoidal positional encoding for the embedded sample and add it to the
# token embeddings. The target shape is read off `embedded_tokens` so the two always match.
seq_len = embedded_tokens.shape[0]
embedding_dim = embedded_tokens.shape[1]

# Empty (seq_len, embedding_dim) matrix that will hold the encoding.
pe = torch.zeros(seq_len, embedding_dim)

# Position indices 0 .. seq_len-1 as a column vector -> shape (seq_len, 1).
position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
print(f'shape of the positions: {position.shape}')

# Per-pair frequency 1 / 10000^(2i / d), evaluated as exp(-ln(10000) * 2i / d).
# torch.arange(0, d, 2) already yields the values 2i, one per sin/cos column pair.
div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-np.log(10000.0) / embedding_dim))

# Angle for every (position, frequency) pair; broadcasting gives (seq_len, ceil(d / 2)).
angles = position * div_term

# Even columns take the sine, odd columns the cosine of the same angle.
pe[:, 0::2] = torch.sin(angles)
# With an odd embedding_dim there is one fewer odd column than even columns, so trim the
# angles to embedding_dim // 2 columns; for an even embedding_dim the slice is a no-op.
pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])

print("Positional Encoding Shape:", pe.shape)  # (512, 256)


# Element-wise sum: token embedding + position information.
final_embedding = embedded_tokens + pe
print("Final Embedding Shape:", final_embedding.shape)  # (512, 256)


shape of the positions: torch.Size([512, 1])
Positional Encoding Shape: torch.Size([512, 256])
Final Embedding Shape: torch.Size([512, 256])
