In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np

# Load a Llama checkpoint and its tokenizer from the Hugging Face Hub.
# Another causal-LM checkpoint name (e.g. "meta-llama/Llama-2-13b-hf") can be substituted.
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Sample sentence used for the encode/decode round trip.
text = "Hello, how are you?"

# Text -> token-ID sequence, returned as a PyTorch tensor.
input_ids = tokenizer.encode(text, return_tensors="pt")

# Token-ID sequence -> text, dropping special tokens from the decoded string.
decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print(f"Decoded text from token IDs: {decoded_text}")

# Look up the input-embedding vector of every token in the sequence.
# Gradients are not needed for a plain lookup, so autograd is switched off.
embedding_layer = model.get_input_embeddings()
with torch.no_grad():
    embeddings = embedding_layer(input_ids)

# Collapse the sequence into a single vector by averaging over dim=1, and
# fetch the full embedding table so the average can be compared to every row.
average_embedding = embeddings.mean(dim=1).squeeze().numpy()
vocab_embeddings = embedding_layer.weight.detach().numpy()

# Cosine similarity between the averaged vector and each vocabulary row:
# dot product divided by the product of the two L2 norms.
vocab_norms = np.linalg.norm(vocab_embeddings, axis=1)
query_norm = np.linalg.norm(average_embedding)
cosine_similarities = vocab_embeddings.dot(average_embedding) / (
    vocab_norms * query_norm
)

# The row with the highest similarity is reported as the "closest" token.
closest_token_id = cosine_similarities.argmax()
closest_token = tokenizer.decode([closest_token_id])

print(f"Closest token by embedding: {closest_token}")


Downloading shards: 100%|██████████| 2/2 [04:13<00:00, 126.88s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.73s/it]


Decoded text from token IDs: Hello, how are you?
Closest token by embedding: ,


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np

# Initialise the tokenizer and the model. The checkpoint weights must be
# downloadable or already cached; adjust the name to the model version in use.
model_name = "meta-llama/Llama-3.2-3B-Instruct"  # or another Llama checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Input text
text = "Hello, how are you?"

# Tokenize: convert the text into a tensor of token IDs.
input_ids = tokenizer.encode(text, return_tensors="pt")

# Show the token IDs.
print("Token IDs:", input_ids)

# Decode the token IDs back into text (special tokens are dropped).
decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print("Decoded Text:", decoded_text)

# Look up the input embeddings of the tokens. The embedding layer is fetched
# once and reused below; no gradients are needed for a lookup.
embedding_layer = model.get_input_embeddings()
with torch.no_grad():
    embeddings = embedding_layer(input_ids)

print("Embedding shape:", embeddings.shape)  # shape of the embedding tensor

# Reverse-mapping example (illustration only): build a lookup table over the
# vocabulary. To find the token for an arbitrary embedding, compare it against
# these vectors, e.g. with cosine similarity.
#
# Despite its name, the dictionary is keyed by the decoded token text and
# stores that token's embedding tensor as the value. If two IDs decode to the
# same text, the later ID overwrites the earlier entry.
#
# NOTE(review): `tokenizer.vocab_size` is documented as the size of the base
# vocabulary without added tokens; if added/special tokens should be covered
# too, `len(tokenizer)` may be the right bound -- confirm before changing.
vocab_size = tokenizer.vocab_size
embedding_to_token_map = {}
with torch.no_grad():  # keep the stored tensors free of autograd state
    # Iterate over every token ID in [0, vocab_size). The original iterated
    # over a negated int, which raises TypeError ('int' object is not iterable).
    for token_id in range(vocab_size):
        token_text = tokenizer.decode([token_id])
        token_embedding = embedding_layer(torch.tensor([token_id]))
        embedding_to_token_map[token_text] = token_embedding

# Show the results.
print("Original Text:", text)
print("Decoded Text:", decoded_text)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.27s/it]


Token IDs: tensor([[128000,   9906,     11,   1268,    527,    499,     30]])
Decoded Text: Hello, how are you?
Embedding shape: torch.Size([1, 7, 3072])
Original Text: Hello, how are you?
Decoded Text: Hello, how are you?
