In [2]:
# Install the transformers library (if it is not already installed).
# Prefer the `%pip` magic over `!pip` so the install targets the running kernel's environment.
# %pip install transformers

In [1]:
from transformers import AutoTokenizer

In [None]:
# Name of the pretrained checkpoint whose tokenizer is loaded.
model_name = "bert-base-uncased"  # other checkpoints to try: "gpt2", "roberta-base"

# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Some checkpoints (e.g. "gpt2") ship without a padding token, which makes the
# later `padding=True` batch-encoding cell raise. Fall back to the
# end-of-sequence token so the notebook keeps working when `model_name` is
# swapped; this is a no-op for tokenizers that already define a pad token.
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer for {model_name}: {tokenizer}")

In [None]:
# Basic tokenization
text = "Hello, how are you today?"

# Split the text into tokens with the tokenizer
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

In [None]:
# Encode the text into model inputs
encoded_input = tokenizer(text, return_tensors="pt") # return_tensors='pt' returns PyTorch tensors
print("Encoded Input:", encoded_input)
# Note that encoded_input contains input_ids (the numeric representation of the tokens) and attention_mask (which marks the tokens that should be attended to).

In [None]:
# Decode token IDs back into text
# Grab the input_ids; index 0 selects the first sequence of the encoded input
input_ids = encoded_input["input_ids"][0]

In [None]:
# Decode the token IDs
decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True) # skip_special_tokens=True leaves special tokens out of the result
print("Decoded Text:", decoded_text)

In [None]:
# Encode several sentences in a single call.
sentences = ["This is the first sentence.", "Here is another one."]

# padding=True pads the sequences to a common length, truncation=True cuts off
# sequences that exceed the model's maximum length, and return_tensors="pt"
# asks for PyTorch tensors.
encoded_batch = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print("Encoded Batch:", encoded_batch)


In [None]:
# Inspect the vocabulary size and the special tokens configured on the tokenizer.
print("Vocabulary Size:", tokenizer.vocab_size)
print("Special Tokens Map:", tokenizer.special_tokens_map)

# Each special token is printed as its token attribute followed by the matching
# `<name>_id` attribute, in the order: padding, separator, classification, unknown.
for _label, _attr in (
    ("Padding Token:", "pad_token"),
    ("Separator Token:", "sep_token"),
    ("Classification Token:", "cls_token"),
    ("Unknown Token:", "unk_token"),
):
    print(_label, getattr(tokenizer, _attr), getattr(tokenizer, _attr + "_id"))