# Tokenizer 基本使用

In [2]:
from transformers import AutoTokenizer

In [3]:
# Sample sentence reused by the tokenizer examples in the following cells.
sen = "弱小的我也有大梦想!"

## Step1 加载与保存

In [4]:
# Load from the Hugging Face Hub: passing the model name is enough to fetch
# the tokenizer that corresponds to that model.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
# Bare expression so the notebook displays the tokenizer's repr.
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [5]:
# Save the tokenizer files to a local directory; the call returns the paths
# of the files it wrote.
tokenizer.save_pretrained("./roberta_tokenizer")

('./roberta_tokenizer\\tokenizer_config.json',
 './roberta_tokenizer\\special_tokens_map.json',
 './roberta_tokenizer\\vocab.txt',
 './roberta_tokenizer\\added_tokens.json',
 './roberta_tokenizer\\tokenizer.json')

In [6]:
# Load the tokenizer from the local directory instead of the Hub.
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
# Bare expression so the notebook displays the tokenizer's repr.
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

## Step2 句子分词

In [None]:
# Split the sentence into a sequence of tokens (strings, not ids yet).
tokens = tokenizer.tokenize(sen)
tokens

## Step3 查看词典

In [7]:
# Show the vocabulary: a dict mapping each token string to its integer id.
tokenizer.vocab

{'椅': 3488,
 '285': 10938,
 '##阳': 20402,
 '└': 435,
 '##!': 13317,
 '睽': 4727,
 '##甾': 17572,
 '莺': 5817,
 'http': 8184,
 '滂': 3992,
 '潭': 4059,
 '##back': 10102,
 '##itor': 12449,
 '##酮': 20050,
 '##bu': 11381,
 '[unused69]': 69,
 '厢': 1334,
 '主': 712,
 '碓': 4812,
 '##腋': 18630,
 '##渎': 16989,
 '##诋': 19460,
 '体': 860,
 '##坠': 14842,
 '[unused44]': 44,
 '葆': 5863,
 '藿': 5977,
 '##泯': 16861,
 '俾': 940,
 '##评': 19454,
 '##缺': 18432,
 '諱': 6325,
 '酵': 6997,
 'ㄅ': 647,
 'lonzo': 12688,
 '剖': 1189,
 '讶': 6385,
 '浚': 3852,
 'い': 536,
 '028': 11842,
 'q2': 9898,
 '##睜': 17772,
 '霉': 7450,
 '##铠': 20257,
 '##潮': 17117,
 'br': 8575,
 'ol': 8972,
 '嘌': 1650,
 '##蝌': 19126,
 'lp': 11195,
 '迪': 6832,
 '##师': 15417,
 '脾': 5569,
 'chrome': 8812,
 '準': 3976,
 '##闻': 20376,
 '##ho': 9570,
 '毅': 3675,
 '錶': 7100,
 '1932': 9737,
 '##icon': 12829,
 'tw': 8351,
 '报': 2845,
 '##祭': 17931,
 'ah': 10785,
 'overdope': 9964,
 '##驭': 20774,
 '##娛': 15081,
 '##ニア': 12650,
 '##鸟': 20938,
 '##倬': 14019,
 '##批': 

In [8]:
# Show the size of the vocabulary.
tokenizer.vocab_size

21128

## Step4 索引转换

In [None]:
# Convert the token sequence into the corresponding id sequence.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

In [None]:
# Convert the id sequence back into a token sequence.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

In [None]:
# Join the token sequence back into a single string.
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

###  更便捷的实现方式

In [None]:
# Convert the string directly into an id sequence ("encoding").
# add_special_tokens=True asks for the model's special tokens to be included.
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

In [None]:
# Convert the id sequence back into a string ("decoding").
# skip_special_tokens=False keeps the special tokens in the decoded text.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

## Step5 填充与截断

In [None]:
# Padding: pad the encoded sequence up to max_length=15.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

In [None]:
# Truncation: cut the encoded sequence down to max_length=5.
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

## Step6 其他输入部分

In [None]:
# Re-encode with padding to max_length=15; these ids are the starting point
# for building the other model inputs by hand.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

In [None]:
# Build the remaining model inputs by hand from the padded ids.
# Real tokens get 1 and padding positions get 0. Compare against the
# tokenizer's own pad id rather than a hard-coded 0 so the mask stays correct
# for tokenizers whose padding token is not id 0.
attention_mask = [1 if idx != tokenizer.pad_token_id else 0 for idx in ids]
# Single-sentence input: every position is assigned segment 0.
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids

## Step7 快速调用方式

In [None]:
# encode_plus returns the ids together with the other model inputs in a
# single call, instead of building them by hand.
# NOTE(review): the transformers docs mark encode_plus as deprecated in favor
# of calling the tokenizer directly — confirm for the installed version.
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs

In [None]:
# Calling the tokenizer object directly, with the same padding arguments.
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

## Step8 处理batch数据

In [None]:
# A list of sentences can be passed in one call; no padding or truncation
# arguments are given here.
sens = ["弱小的我也有大梦想",
        "有梦想谁都了不起",
        "追逐梦想的心，比梦想本身，更可贵"]
res = tokenizer(sens)
res

In [None]:
%%time
# Tokenize one sentence per call, 1000 calls in a Python loop.
for i in range(1000):
    tokenizer(sen)

In [None]:
%%time
# Tokenize the same sentence 1000 times as one batched call, for comparison
# with the per-sentence loop.
res = tokenizer([sen] * 1000)

In [None]:
# Display the tokenizer's repr (class and configuration).
tokenizer

# Fast / Slow Tokenizer

In [None]:
# Rebind the sample sentence to mixed Chinese/English text for the
# fast-vs-slow tokenizer comparison.
sen = "弱小的我也有大Dreaming!"

In [None]:
# Load with default arguments; use_fast is not passed, so this is expected
# to be the fast implementation (the repr shows is_fast).
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

In [None]:
# use_fast=False requests the slow (pure-Python) tokenizer implementation.
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

In [None]:
%%time
# Fast tokenizer: one sentence per call, 10000 calls in a Python loop.
for i in range(10000):
    fast_tokenizer(sen)

In [None]:
%%time
# Slow tokenizer: one sentence per call, 10000 calls in a Python loop.
for i in range(10000):
    slow_tokenizer(sen)

In [None]:
%%time
# Fast tokenizer: the same 10000 sentences as a single batched call.
res = fast_tokenizer([sen] * 10000)

In [None]:
%%time
# Slow tokenizer: the same 10000 sentences as a single batched call.
res = slow_tokenizer([sen] * 10000)

In [None]:
# return_offsets_mapping=True additionally returns, per the transformers
# docs, the (char_start, char_end) span of each token in the input string.
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

In [None]:
# Per the transformers docs, word_ids() lists for each token the index of
# the word it came from (None for special tokens).
inputs.word_ids()

In [None]:
# NOTE: per the transformers docs, return_offsets_mapping is only available
# on fast tokenizers; with the slow (Python) tokenizer this call is expected
# to raise NotImplementedError. Presumably kept here to contrast with the
# fast tokenizer above — confirm before "fixing" it.
inputs = slow_tokenizer(sen, return_offsets_mapping=True)

# 特殊Tokenizer的加载

In [None]:
from transformers import AutoTokenizer

In [None]:
# With newer transformers versions (>4.34), loading THUDM/chatglm raises an
# error, so the Skywork model is used here instead.
# NOTE: trust_remote_code=True allows Python code shipped in the model
# repository to be executed locally; only enable it for sources you trust.
tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
tokenizer

In [None]:
# Save the current tokenizer to the local directory "skywork_tokenizer".
tokenizer.save_pretrained("skywork_tokenizer")

In [None]:
# Reload the tokenizer from the local directory. trust_remote_code=True is
# passed again — presumably because the tokenizer class is custom code that
# travels with the saved files; verify against the saved directory.
tokenizer = AutoTokenizer.from_pretrained("skywork_tokenizer", trust_remote_code=True)

In [None]:
# Round-trip check: encode the sample sentence to ids, then decode the ids
# back to text.
tokenizer.decode(tokenizer.encode(sen))