# Tokenizer 基本使用

In [1]:
from transformers import AutoTokenizer

In [2]:
# Sample sentence used by the cells below.
sen = "弱小的我也有大梦想!"

## Step1 加载与保存

In [3]:
# Load from the Hugging Face Hub: passing a model name loads the matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer



BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Save the tokenizer to a local directory.
tokenizer.save_pretrained("./roberta_tokenizer")

In [None]:
# Load the tokenizer from the local directory.
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
tokenizer

## Step2 句子分词

In [4]:
# Split the sentence into tokens (for this sample: one token per character).
tokens = tokenizer.tokenize(sen)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

## Step3 查看词典

In [5]:
# Token -> id mapping of the tokenizer's vocabulary.
tokenizer.vocab

{'##壤': 14949,
 '￥': 8101,
 'well': 12010,
 '##巩': 15400,
 '##資': 19593,
 '##ο': 13392,
 '▲topfeb': 10663,
 '葵': 5878,
 'jonathan': 11559,
 '##笃': 18062,
 '##ᄆ': 13459,
 '##mar': 11800,
 '##頑': 20579,
 '##頗': 20582,
 '漣': 4032,
 '##般': 18720,
 '嗬': 1637,
 '傅': 987,
 'ш': 256,
 '##ound': 11477,
 '##鼻': 21022,
 'rosie': 10487,
 '##ニ': 13689,
 '##帶': 15437,
 '##瓜': 17535,
 '擁': 3075,
 '砗': 4779,
 '旅': 3180,
 '衡': 6130,
 '∞': 381,
 'ə': 198,
 '幡': 2393,
 '##news': 10040,
 '##de': 8510,
 '##⑥': 13561,
 '##浦': 16912,
 '##ura': 10238,
 '##欣': 16672,
 '痤': 4583,
 'eco': 12791,
 '窑': 4965,
 '185': 9560,
 '##ist': 9527,
 '砥': 4783,
 '潔': 4049,
 '蟹': 6101,
 '1989': 8528,
 '##曦': 16343,
 '##邨': 19988,
 '##悶': 15710,
 '虐': 5990,
 '##士': 14951,
 'chi': 12205,
 '弭': 2481,
 '##ま': 9774,
 'vista': 9847,
 'soc': 11405,
 '##卡': 14362,
 '##潇': 17102,
 '##羌': 18457,
 '暉': 3261,
 '卜': 1301,
 '##chat': 10720,
 '辨': 6795,
 '坍': 1774,
 '##哉': 14564,
 '泻': 3811,
 '舌': 5649,
 '##勖': 14298,
 '##紹': 18228,
 '##nex

In [6]:
# Size of the vocabulary as reported by the tokenizer.
tokenizer.vocab_size

21128

## Step4 索引转换

In [7]:
# Convert the token sequence into an id sequence.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]

In [8]:
# Convert the id sequence back into a token sequence.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

In [9]:
# Convert the token sequence into a single string.
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'弱 小 的 我 也 有 大 梦 想!'

### 更便捷的实现方式

In [10]:
# Convert a string directly into an id sequence, i.e. encoding.
# With add_special_tokens=True the ids are wrapped in [CLS] (101) and [SEP] (102).
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102]

In [11]:
# Convert an id sequence directly back into a string, i.e. decoding.
# skip_special_tokens=False keeps [CLS] / [SEP] in the decoded text.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 弱 小 的 我 也 有 大 梦 想! [SEP]'

## Step5 填充与截断

In [12]:
# Padding: extend the sequence to max_length=15 by appending [PAD] ids (0) on the right.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [13]:
# Truncation: cut the sequence down to max_length=5, special tokens included.
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

[101, 2483, 2207, 4638, 102]

## Step6 其他输入部分

In [None]:
# Padded encoding of the sample sentence; `ids` is the input for the
# hand-built attention mask in the following cell.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

In [14]:
# Build the auxiliary model inputs by hand from `ids`.
# Compare against the tokenizer's own pad id instead of a literal 0, so the
# mask stays correct for tokenizers whose [PAD] token is not id 0.
attention_mask = [0 if idx == tokenizer.pad_token_id else 1 for idx in ids]  # 1 = real token, 0 = padding
# All zeros, matching what the tokenizer itself returns for a single sentence.
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids

([101, 2483, 2207, 4638, 102], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0])

## Step7 快速调用方式

In [15]:
# encode_plus returns input_ids, token_type_ids and attention_mask in one call.
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

In [16]:
# Calling the tokenizer object directly with the same arguments as encode_plus.
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

## Step8 处理batch数据

In [17]:
# Three sentences of different lengths, tokenized together in a single call.
sens = [
    "弱小的我也有大梦想",
    "有梦想谁都了不起",
    "追逐梦想的心，比梦想本身，更可贵",
]
res = tokenizer(sens)
res

{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102], [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [18]:
%%time
# Process the sentence one call at a time in a loop.
for i in range(1000):
    tokenizer(sen)

CPU times: total: 62.5 ms
Wall time: 75.4 ms


In [19]:
%%time
# Process the same 1000 sentences as one batch.
res = tokenizer([sen] * 1000)

CPU times: total: 15.6 ms
Wall time: 15.4 ms


In [None]:
# Display the tokenizer currently in use.
tokenizer

# Fast / Slow Tokenizer

In [None]:
# New sample sentence mixing Chinese characters with an English word.
sen = "弱小的我也有大Dreaming!"

In [None]:
# Default load; the identical call earlier in this notebook reported is_fast=True.
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

In [None]:
# use_fast=False requests the slow tokenizer implementation instead.
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

In [None]:
%%time
# Process the sentence one call at a time in a loop (fast tokenizer).
for i in range(10000):
    fast_tokenizer(sen)

In [None]:
%%time
# Process the sentence one call at a time in a loop (slow tokenizer).
for i in range(10000):
    slow_tokenizer(sen)

In [None]:
%%time
# Process 10000 copies of the sentence as one batch (fast tokenizer).
res = fast_tokenizer([sen] * 10000)

In [None]:
%%time
# Process 10000 copies of the sentence as one batch (slow tokenizer).
res = slow_tokenizer([sen] * 10000)

In [None]:
# Ask the fast tokenizer to return the offset mapping in addition to the usual fields.
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

In [None]:
# Query word_ids() on the encoding produced by the fast tokenizer in the previous cell.
# NOTE(review): presumably yields, per token, the index of the word it belongs to — confirm in the transformers docs.
inputs.word_ids()

In [None]:
# Same request against the slow tokenizer.
# NOTE(review): slow (pure-Python) tokenizers are documented as not supporting
# return_offsets_mapping, so this call is expected to raise NotImplementedError
# (presumably shown on purpose to contrast with the fast tokenizer) — confirm
# against the installed transformers version.
inputs = slow_tokenizer(sen, return_offsets_mapping=True)

# 特殊Tokenizer的加载

In [None]:
from transformers import AutoTokenizer

In [None]:
# Newer transformers releases (>4.34) raise an error when loading THUDM/chatglm,
# so the Skywork model is used here instead.
# NOTE(review): trust_remote_code=True allows tokenizer code shipped in the model
# repository to run locally — only enable it for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
tokenizer

In [None]:
# Save this tokenizer to a local directory.
tokenizer.save_pretrained("skywork_tokenizer")

In [None]:
# Load the tokenizer from the local directory; trust_remote_code=True is passed here as well.
tokenizer = AutoTokenizer.from_pretrained("skywork_tokenizer", trust_remote_code=True)

In [None]:
# Round trip: encode the sentence to ids, then decode the ids back to text.
tokenizer.decode(tokenizer.encode(sen))