In [None]:
import torch
import numpy as np
from transformers import BertTokenizer

In [None]:
# Load the tokenizer published under the 'bert-base-chinese' checkpoint name.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    cache_dir=None,
    force_download=False,
)

# Sample sentences that the encoding cells below index into.
sents = [
    '选择珠江花园的原因就是方便。',
    '笔记本的键盘确实爽。',
    '房间太小。其他的都一般。',
    '今天才知道这书还有第6卷，真有点郁闷。',
    '机器背面似乎被撕了张什么标签，残胶还在。',
]

# Bare expression: the tokenizer's repr becomes the cell output.
tokenizer

In [None]:
# Encode two sentences together as a single pair.
out = tokenizer.encode(
    text=sents[0],
    text_pair=sents[1],
    # Truncate when the sequence is longer than max_length.
    truncation=True,
    # Always pad up to max_length.
    padding='max_length',
    add_special_tokens=True,
    max_length=30,
    # Accepts 'tf', 'pt' or 'np'; a plain list is returned by default.
    return_tensors='pt',
)

print(out)

# First row of the returned tensor, converted to a plain Python list.
token_ids = out.numpy().tolist()[0]

# Cell output: the decoded text and the number of ids.
tokenizer.decode(token_ids), len(token_ids)

In [None]:
# Enhanced encoding function: same inputs as `encode`, plus the extra
# outputs requested by the `return_*` flags below.
out = tokenizer.encode_plus(
    text=sents[0],
    text_pair=sents[1],
    # Truncate when the sequence is longer than max_length.
    truncation=True,
    # Always pad up to max_length.
    padding='max_length',
    add_special_tokens=True,
    max_length=30,
    # Accepts 'tf', 'pt' or 'np'; a plain list is returned by default.
    return_tensors=None,
    # Include token_type_ids in the result.
    return_token_type_ids=True,
    # Include attention_mask in the result.
    return_attention_mask=True,
    # Include special_tokens_mask (marks the special symbols).
    return_special_tokens_mask=True,
    # offset_mapping (start/end position of each token) can only be
    # requested from BertTokenizerFast, so it stays disabled here:
    # return_offset_mapping=True,
    # Include length in the result.
    return_length=True,
)

print(out)

# Cell output: the ids decoded back into text.
tokenizer.decode(out['input_ids'])

In [None]:
# Encode a batch of sentences in one call.
out = tokenizer.batch_encode_plus(
    # Alternative input, single sentences instead of pairs:
    # batch_text_or_text_pairs=[sents[0], sents[1]],
    # Here each batch item is a (text, text_pair) tuple.
    batch_text_or_text_pairs=[
        (sents[0], sents[1]),
        (sents[2], sents[3]),
    ],
    # Truncate when the sequence is longer than max_length.
    truncation=True,
    # Always pad up to max_length.
    padding='max_length',
    add_special_tokens=True,
    max_length=30,
    # Accepts 'tf', 'pt' or 'np'; a plain list is returned by default.
    return_tensors=None,
    # Include token_type_ids in the result.
    return_token_type_ids=True,
    # Include attention_mask in the result.
    return_attention_mask=True,
    # Include special_tokens_mask (marks the special symbols).
    return_special_tokens_mask=True,
    # offset_mapping (start/end position of each token) can only be
    # requested from BertTokenizerFast, so it stays disabled here:
    # return_offset_mapping=True,
    # Include length in the result.
    return_length=True,
)

print(out)

# Cell output: one decoded string per encoded pair (two pairs were passed in).
tuple(tokenizer.decode(pair_ids) for pair_ids in out['input_ids'])

In [None]:
# Fetch the tokenizer's vocabulary.
dic = tokenizer.get_vocab()

# Cell output: the vocabulary's type, its size, and whether '月光' is already an entry.
(type(dic), len(dic), '月光' in dic)

In [None]:
# Add new words to the tokenizer.
tokenizer.add_tokens(new_tokens=['月光', '希望'])

# Add a new special symbol.
tokenizer.add_special_tokens(dict(eos_token='[EOS]'))

# Re-read the vocabulary so the lookups below see the additions.
dic = tokenizer.get_vocab()

# Cell output: type, size, membership of '月光', and the ids of the two new entries.
(type(dic), len(dic), '月光' in dic, dic['月光'], dic['[EOS]'])

In [None]:
# Encode text that contains the newly added words.
out = tokenizer.encode(
    text='月光的新希望[EOS]',
    text_pair=None,
    # Truncate when the sequence is longer than max_length.
    truncation=True,
    # Always pad up to max_length.
    padding='max_length',
    add_special_tokens=True,
    max_length=8,
    # Accepts 'tf', 'pt' or 'np'; a plain list is returned by default.
    return_tensors='pt',
)

print(out)

# First row of the returned tensor, converted to a plain Python list.
token_ids = out.numpy().tolist()[0]

# Cell output: the decoded text and the number of ids.
tokenizer.decode(token_ids), len(token_ids)