In [2]:
import torch
import torch.nn as nn
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the tokenizer from the pretrained 'bert-base-chinese' checkpoint.
# Its vocab_size is read later in the notebook to size the embedding table.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [5]:
# Load the previously saved, pre-tokenized texts from a .pt file.
# NOTE(review): this absolute path is machine-specific — replace it with your own file path.
save_path = '/egr/research-slim/liangqi1/LLM/transformer-study/data/encoded_texts.pt'
# weights_only is spelled out instead of relying on torch.load's default:
#   * the file holds a pickled dict-like tokenizer output (it is read below via
#     .keys() and .input_ids), not a plain tensor/state_dict, so the restricted
#     weights-only unpickler would refuse it by default;
#   * being explicit avoids the FutureWarning about the changing default and keeps
#     this cell working on PyTorch releases where weights_only defaults to True.
# SECURITY: weights_only=False performs full unpickling, which can execute
# arbitrary code. Only load files that you created yourself or otherwise trust.
encoded_texts = torch.load(save_path, weights_only=False)

  encoded_texts = torch.load(save_path)


In [7]:
# Token-embedding table: one learnable vector per entry in the tokenizer vocabulary.
#   * embedding_dim is 256 in this notebook (smaller than the 768 used by BERT-base).
#   * vocab_size comes from the tokenizer (21128 for bert-base-chinese, as the
#     printed layer shows).
embedding_dim = 256
vocab_size = tokenizer.vocab_size

embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
print("Embedding layer:", embedding_layer)


Embedding layer: Embedding(21128, 256)


In [8]:
# Inspect the contents of the loaded .pt file (typically a dict-like object
# containing 'input_ids', 'attention_mask', etc.).
print(encoded_texts.keys())
# Take the token ids of the first encoded sentence as a working sample.
sample = encoded_texts.input_ids[0]
print(sample.shape)  # number of tokens in one encoded sentence

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
torch.Size([512])


#### After the embedding layer, the shape of one sample becomes (512, 256)

* 512 is the number of tokens in one sample
* 256 is the number of feature dimensions for each token

In [17]:
# Pass the sample's token ids through the embedding layer: each id is replaced
# by its embedding vector.
embedded_tokens = embedding_layer(sample)
print(embedded_tokens.shape)  # (tokens in the sample, embedding_dim)

torch.Size([512, 256])


### Display the input data and attention mask

In [9]:
# Show the attention mask of the sample at index 4.
# NOTE(review): presumably 1 marks a real token and 0 marks padding (the usual
# tokenizer convention) — confirm against how the file was encoded.
encoded_texts.attention_mask[4]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
# Show the raw token ids of the first sample (the same expression that was
# assigned to `sample` earlier in the notebook).
encoded_texts.input_ids[0]

tensor([ 101, 2769,  812, 4495,  772, 4638, 7608, 1501, 3867, 3796, 1177, 8024,
        1072, 3300, 1377,  809, 2571, 6862, 3867, 7370, 3796, 3773, 4638, 4294,
        4157,  511,  710,  691, 7608, 1501, 3867, 3796, 1177, 4685, 1068, 1079,
        2159, 8038,  671, 5663, 5445, 6241, 8024, 5283, 3717, 1469, 5283, 6134,
        7481, 3833, 2595, 1177,  679, 6629, 3796, 8024, 6821, 3221, 1728,  711,
        2124,  812, 4638, 6134, 7481, 1469, 1079, 6956, 3221, 1772, 1258, 4638,
        8024, 2523, 7410, 2501, 2768, 2486, 2595, 5946, 5606, 8024, 1315,  886,
        2501, 2768,  771,  679, 4937, 2137, 8024,  833, 4746, 7313, 3867, 1927,
         511,  710,  691, 7608, 1501, 3867, 3796, 1177, 6848, 2885, 8038,  122,
         119, 4685, 2159, 2595, 8038, 4685, 2159, 2595, 3221, 2900,  697, 4905,
        2772, 5442,  697, 4905,  809,  677, 4289, 6574, 3921, 1394, 3198, 8024,
         679,  772, 4495, 4685, 3166, 1146, 4895, 4385, 6496, 4638, 5543, 1213,
        8024, 4685, 2159, 2595, 1962, 80