## Tokenizer基本使用

In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
sen = "弱小的我也有大梦想！"

## Step1 加载与保存

In [2]:
# 从huggingface加载，输入模型名称，即可加载对应的分词器
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [2]:
# tokenizer保存到本地
tokenizer.save_pretrained("./roberta_tokenizer")

NameError: name 'tokenizer' is not defined

In [3]:
# 从本地加载tokenizer
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Step2 句子分词器

In [9]:
tokens = tokenizer.tokenize(sen)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '！']

## Step3 查看词典

In [10]:
tokenizer.vocab

{'##蘇': 19036,
 '##貶': 19581,
 '##銭': 20131,
 '##unch': 11294,
 '》': 518,
 '0l': 10973,
 '##hm': 13130,
 '##膠': 18665,
 '355': 11753,
 '##獵': 17421,
 '##锢': 20292,
 '堿': 1845,
 '邨': 6931,
 '##蜜': 19114,
 '池': 3737,
 '靜': 7477,
 '##犁': 17355,
 '385': 11959,
 '##いた': 10526,
 'angela': 12822,
 'secret': 11634,
 '##咘': 14535,
 '319': 10865,
 '##71': 9097,
 '##62': 9290,
 '醉': 7004,
 '##这': 19878,
 '詡': 6272,
 '##40': 8660,
 '疇': 4539,
 'coffee': 9706,
 '##邱': 19994,
 '莢': 5808,
 'day2': 11144,
 '緣': 5225,
 '##340': 12149,
 '搪': 3020,
 '菀': 5820,
 'って': 9127,
 '饕': 7642,
 'super': 8988,
 '訴': 6260,
 '##墩': 14932,
 '##奔': 15001,
 '萵': 5859,
 '矩': 4762,
 '瀏': 4104,
 '##徘': 15591,
 'sdk': 10302,
 '260': 9044,
 '◇': 471,
 '锁': 7219,
 '426': 12408,
 '##×': 9569,
 '##泳': 16864,
 '涇': 3866,
 '##ⅱ': 13520,
 '淫': 3915,
 'ocean': 12546,
 '##てお': 12878,
 '士': 1894,
 'most': 11344,
 '絹': 5191,
 '嚟': 1708,
 '##佐': 13915,
 '##耽': 18517,
 '##薇': 19005,
 '##餮': 20689,
 '摆': 3030,
 '##吡': 14471,
 '##嘶': 147

In [11]:
# 词表的大小
tokenizer.vocab_size

21128

## Step4 索引转换

In [12]:
# 将词序列转换为id序列
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 8013]

In [13]:
# 将id序列转换为token序列
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '！']

In [14]:
# 将token序列转换为string
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'弱 小 的 我 也 有 大 梦 想 ！'

### 更便捷的方式

In [18]:
# 将字符串转换为id序列，又称为编码
# add_special_tokens 是否添加特殊token
ids = tokenizer.encode(sen)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 8013, 102]

In [17]:
ids = tokenizer.encode(sen, add_special_tokens=False)
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 8013]

In [20]:
# 将id序列转换为字符串，又称为解码
# skip_special_tokens 是否跳过特殊token
str_sen = tokenizer.decode(ids, skip_special_tokens=True)
str_sen

'弱 小 的 我 也 有 大 梦 想 ！'

In [19]:
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 弱 小 的 我 也 有 大 梦 想 ！ [SEP]'

## Step5 填充与截断

In [22]:
# 填充
ids = tokenizer.encode(sen,add_special_tokens=True, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 8013, 102, 0, 0, 0]

In [24]:
# 截断 truncation=True
ids = tokenizer.encode(sen, add_special_tokens=False, max_length=5, truncation=True)
ids

[2483, 2207, 4638, 2769, 738]

## Step6 其他输入部分

In [25]:
ids = tokenizer.encode(sen,add_special_tokens=True, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 8013, 102, 0, 0, 0]

In [26]:
attention_mask = [1 if idx != 0 else 0 for idx in ids]
token_type_ids = [0] * len(ids) # 标记是第几个句子
ids, attention_mask, token_type_ids

([101,
  2483,
  2207,
  4638,
  2769,
  738,
  3300,
  1920,
  3457,
  2682,
  8013,
  102,
  0,
  0,
  0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Step7 快速调用方式

In [27]:
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15, add_special_tokens=True)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 8013, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

In [28]:
inputs = tokenizer(sen, padding="max_length", max_length=15, add_special_tokens=True)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 8013, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

## Step8 处理batch数据

In [4]:
sens = ["弱小的我也有大梦想",
        "有梦想谁也了不起",
        "追逐梦想的心，比梦想本身更可贵"]

res = tokenizer(sens)
res

{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102], [101, 3300, 3457, 2682, 6443, 738, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 3291, 1377, 6586, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [8]:
sen = "弱小的我也有大梦想！"

In [9]:
%%time
for i in range(1000):
    tokenizer(sen)

CPU times: total: 62.5 ms
Wall time: 73.8 ms


In [10]:
%%time
res = tokenizer([sen] * 1000)

CPU times: total: 109 ms
Wall time: 14 ms


In [11]:
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Fast/Slow Tokenizer
- FastTokenizer
    - 基于Rust实现，速度快
    - offset_mapping, word_ids
- SlowTokenizer
    - 基于python实现，速度慢

In [53]:
sen = "弱小的我也有大 Dreaming！"

In [14]:
fast_tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
fast_tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [15]:
slow_tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/", use_fast=False)
slow_tokenizer

BertTokenizer(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [16]:
%%time
# 单条循环处理
for i in range(10000):
    fast_tokenizer(sen)

CPU times: total: 641 ms
Wall time: 639 ms


In [17]:
%%time
# 单条循环处理
for i in range(10000):
    slow_tokenizer(sen)

CPU times: total: 1.44 s
Wall time: 1.44 s


In [18]:
%%time
# 处理batch数据
res = fast_tokenizer([sen] * 10000)

CPU times: total: 531 ms
Wall time: 215 ms


In [19]:
%%time
# 处理batch数据
res = slow_tokenizer([sen] * 10000)

CPU times: total: 1.52 s
Wall time: 1.56 s


In [23]:
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
# slow_tokenizer不能配置return_offsets_mapping=True
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 8013, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (8, 13), (13, 16), (16, 17), (0, 0)]}

In [24]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

## 特殊Tokenizer的加载

In [55]:
from transformers import AutoTokenizer

In [56]:
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
tokenizer

ChatGLMTokenizer(name_or_path='THUDM/chatglm3-6b', vocab_size=64798, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	
}

In [51]:
tokenizer.save_pretrained("chatglm_tokenizer")

('chatglm_tokenizer\\tokenizer_config.json',
 'chatglm_tokenizer\\special_tokens_map.json',
 'chatglm_tokenizer\\tokenizer.model',
 'chatglm_tokenizer\\added_tokens.json')

In [52]:
tokenizer = AutoTokenizer.from_pretrained("chatglm_tokenizer", trust_remote_code=True)

AttributeError: can't set attribute

In [57]:
tokenizer.decode(tokenizer.encode(sen))

'[gMASK]sop 弱小的我也有大 Dreaming！'