# MateConv mini tokenizer训练

- Step 1.导入必要的库

In [1]:
import random
from tqdm import tqdm
from transformers import AutoTokenizer
import json
from datasets import load_dataset
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
import os

  from .autonotebook import tqdm as notebook_tqdm


- Step 2.读取 tokenizer_train.jsonl 文件

In [2]:
def read_texts_from_jsonl(file_path):
    """Lazily yield the ``text`` field of every record in a JSON Lines file.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON object per
            line, each with a ``text`` key.

    Yields:
        The value stored under ``text`` for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``text`` key.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line at the end of the file)
            # are not JSON documents; skip them instead of letting json.loads
            # abort the whole iteration.
            if not line.strip():
                continue
            data = json.loads(line)
            yield data['text']

# Try reading the data; change this path to the directory you created.
data_path = '/root/autodl-tmp/MateGenConv/dataset/Data/tokenizer_train.jsonl'
# read_texts_from_jsonl is a generator function, so this only builds the lazy
# iterator; the file is not opened until the iterator is first consumed.
texts = read_texts_from_jsonl(data_path)

- Step 3.初始化分词器

In [3]:
# Special tokens reserved in the vocabulary: unknown, sequence start, sequence end.
special_tokens = ["<unk>", "<s>", "</s>"]

# BPE trainer configuration; the special tokens above are handed to the trainer
# so that all three are included in the resulting vocabulary.
trainer = trainers.BpeTrainer(
    special_tokens=special_tokens,
    vocab_size=6400,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    show_progress=True,
)

# Untrained BPE tokenizer with a byte-level pre-tokenizer that does not
# prepend a space to the input.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

- Step 4.训练分词器

In [4]:
# Build a fresh iterator over the training texts. read_texts_from_jsonl is a
# generator function, so each iterator it returns can be consumed only once.
texts = read_texts_from_jsonl(data_path)

In [5]:
# Train the tokenizer.
# A new generator is created on every run of this cell: generators are
# single-use, so re-running the cell with an already consumed `texts` would
# silently retrain the tokenizer on an empty corpus.
tokenizer.train_from_iterator(read_texts_from_jsonl(data_path), trainer=trainer)






- Step 5.保存分词器

在训练完毕之后，还需要设置解码器 (`tokenizer.decoder = decoders.ByteLevel()`) ，这是为了在生成文本时正确地将分词器产生的 token 序列还原回原始文本。

In [7]:
# Attach the byte-level decoder so that decoding token ids restores the
# original text instead of the byte-level surface form.
tokenizer.decoder = decoders.ByteLevel()

# Look up the ids the trained tokenizer actually assigned to the special
# tokens instead of assuming they are 0/1/2, and fail before writing anything
# if one of them is missing from the vocabulary.
special_token_ids = {token: tokenizer.token_to_id(token) for token in special_tokens}
missing_tokens = [token for token, token_id in special_token_ids.items() if token_id is None]
if missing_tokens:
    raise ValueError(f"Special tokens missing from the trained vocabulary: {missing_tokens}")

# Save the tokenizer: the complete tokenizer.json plus the BPE model files.
tokenizer_dir = "/root/autodl-tmp/MateGenConv/model/mateconv_tokenizer"
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer.save(os.path.join(tokenizer_dir, "tokenizer.json"))
tokenizer.model.save(tokenizer_dir)

# Hand-written tokenizer_config.json consumed when the tokenizer is loaded
# through transformers.
config = {
    "add_bos_token": False,
    "add_eos_token": False,
    # Kept in sync with pre_tokenizers.ByteLevel(add_prefix_space=False) used
    # when the tokenizer was built; a mismatched value here can make the loaded
    # tokenizer prepend a space that training never saw.
    "add_prefix_space": False,
    # One entry per special token, keyed by the id assigned during training.
    "added_tokens_decoder": {
        str(token_id): {
            "content": token,
            "lstrip": False,
            "normalized": False,
            "rstrip": False,
            "single_word": False,
            "special": True
        }
        for token, token_id in special_token_ids.items()
    },
    "bos_token": "<s>",
    "clean_up_tokenization_spaces": False,
    "eos_token": "</s>",
    "legacy": True,
    # Very large sentinel value: effectively no maximum sequence length.
    "model_max_length": 1000000000000000019884624838656,
    "pad_token": None,
    "sp_model_kwargs": {},
    "spaces_between_special_tokens": False,
    "tokenizer_class": "PreTrainedTokenizerFast",
    "unk_token": "<unk>",
    "use_default_system_prompt": False,
    # Jinja chat template: an optional leading system message is emitted as-is,
    # each user turn is wrapped as "<s>user\n...</s>\n<s>assistant\n", and each
    # assistant turn is closed with "</s>\n".
    "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<s>user\\n' + content + '</s>\\n<s>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}"
}

# Write the config next to tokenizer.json; ensure_ascii=False keeps non-ASCII
# characters readable in the file.
with open(os.path.join(tokenizer_dir, "tokenizer_config.json"), "w", encoding="utf-8") as config_file:
    json.dump(config, config_file, ensure_ascii=False, indent=4)

print("Tokenizer 保存成功！")

Tokenizer 保存成功！


- Step 6.评估分词器

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer back from the directory it was just saved to. Using
# `tokenizer_dir` instead of a path relative to the current working directory
# means this cell no longer depends on where the notebook was launched from.
# Note: this rebinds `tokenizer` from the raw tokenizers.Tokenizer trained
# above to the transformers wrapper loaded from disk.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

# Sample conversation used to exercise the chat template.
messages = [
    {"role": "system", "content": "你是一个优秀的聊天机器人，总是给我正确的回应！"},
    {"role": "user", "content": '是椭圆形的'},
    {"role": "assistant", "content": '456'},
    {"role": "user", "content": '456'},
    {"role": "assistant", "content": '789'}
]

# Render the conversation through the chat template and tokenize it.
new_prompt = tokenizer.apply_chat_template(messages, tokenize=True)
print(new_prompt)

[608, 1589, 4835, 269, 4833, 954, 4725, 270, 1170, 345, 4584, 5204, 1273, 648, 2207, 1, 320, 275, 201, 345, 1390, 258, 3852, 1081, 269, 2, 201, 1, 1078, 538, 501, 201, 22, 23, 24, 2, 201, 1, 320, 275, 201, 22, 23, 24, 2, 201, 1, 1078, 538, 501, 201, 25, 26, 27, 2, 201]


整体评估：先用 apply_chat_template 渲染对话并分词，统计总 token 数、字符/字节数，计算 CPT/BPT，并用 decode==rendered 检查可逆性，同时查看特殊 token 与尾部 token。

逐轮增量：逐步截取前 i 条消息，计算每次新增后的 token 总数与增量，用于观察哪一轮消息（或模板收尾）导致 token 激增。

In [9]:
# —— A) Overall cost & round-trip check ——
rendered = tokenizer.apply_chat_template(messages, tokenize=False)  # rendered plain text
ids = tokenizer.apply_chat_template(messages, tokenize=True)

total_tokens = len(ids)
total_chars = len(rendered)
total_bytes = len(rendered.encode("utf-8"))
# Characters-per-token and bytes-per-token; report NaN rather than raising
# ZeroDivisionError if the template produced no tokens at all.
cpt = total_chars / total_tokens if total_tokens else float("nan")
bpt = total_bytes / total_tokens if total_tokens else float("nan")

print("== Overall ==")
print("tokens:", total_tokens)
print("chars :", total_chars)
print("bytes :", total_bytes)
print("CPT (chars/token):", round(cpt, 4))
print("BPT (bytes/token):", round(bpt, 4))

# Round-trip check: decoding the ids should reproduce the rendered text exactly.
decoded = tokenizer.decode(ids)
print("reversible:", decoded == rendered)
print("tail 10 token ids:", ids[-10:])

# Special token ids (prints None for any that is not configured).
print("\n== Special token ids ==")
print("bos_token_id:", tokenizer.bos_token_id)
print("eos_token_id:", tokenizer.eos_token_id)
print("unk_token_id:", tokenizer.unk_token_id)
print("pad_token_id:", tokenizer.pad_token_id)

# —— B) Incremental token cost per turn ——
# Tokenize the first i messages for every i and report how many tokens each
# additional message contributes. Only the token ids are needed here, so the
# prefix is not rendered to text a second time.
print("\n== Incremental tokens per turn ==")
prefix_ids_prev = []
for i in range(1, len(messages)+1):
    prefix_ids = tokenizer.apply_chat_template(messages[:i], tokenize=True)
    inc = len(prefix_ids) - len(prefix_ids_prev)
    print(f"after #{i} ({messages[i-1]['role']}): total={len(prefix_ids)}  +{inc}")
    prefix_ids_prev = prefix_ids

== Overall ==
tokens: 56
chars : 99
bytes : 155
CPT (chars/token): 1.7679
BPT (bytes/token): 2.7679
reversible: True
tail 10 token ids: [1, 1078, 538, 501, 201, 25, 26, 27, 2, 201]

== Special token ids ==
bos_token_id: 1
eos_token_id: 2
unk_token_id: 0
pad_token_id: None

== Incremental tokens per turn ==
after #1 (system): total=15  +15
after #2 (user): total=32  +17
after #3 (assistant): total=37  +5
after #4 (user): total=51  +14
after #5 (assistant): total=56  +5
