In [1]:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File         :Bertology.ipynb
@Description  :
@Time         :2022/04/29 06:10:09
@Author       :Hedwig
@Version      :1.0
'''
from transformers import *
import torch

# bert使用

In [None]:
# Transformers库中，每个预训练的模型都会被分成3个子文件，这三个子文件分别在不同的代码文件中加载运行
# 词表文件：输入单词转具体数字
# 配置文件：放置模型超参数，实例化模型时使用
# 模型权重文件：模型的权值

In [None]:
# transformers文件夹下
# 以configuration开头的是bertology的配置文件
# 以modeling开头的是bertology的模型代码文件
# 以tokenization开头的是词表代码文件
# 以bert为例，
# (1)根据唯一标识符找到词表和配置文件
# 配置文件是configuration_bert文件，打开能看到模型下载链接放在BERT_PRETRAINED_CONFIG_ARCHIVE_MAP字典里
# tokenization_bert.py文件里有对应的PRETRAINED_VOCAB_FILES_MAP存储词汇表下载链接
# (2)根据唯一标识符找到模型文件
# 模型文件的下载链接命名统一为{下载渠道}/{模型id}/{文件名} 
# 模型文件的下载在transformers目录下的file_utils.py文件里，有多个下载渠道，通过函数hf_bucket_url实现
# 将参数列表use_cdn指定为True实现cdn链接，向mirror传入tuna或bfsu实现两种清华大学镜像
# 使用时传入from_pretrained函数就可以，它是hf_bucket_url的上层函数
# (3)加载预训练模型
# 加载预训练模型需要模型代码文件、配置代码文件和词表代码文件，通过三个类实现
# 配置文件类：configuration class
# 模型类：model class
# 词表工具类：tokenizer class
# 除了from_pretrained()方法外，还有save_pretrained()方法将模型代码文件、配置代码文件和词表代码文件保存本地

In [3]:
# Automatic loading: resolve the vocabulary, configuration and weights from the model identifier.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)  # vocabulary
model = BertModel.from_pretrained(pretrained_name)  # configuration + model weights
# Downloaded files are cached locally (the log of this cell shows
# ~/.cache/huggingface/transformers on this machine); pass cache_dir=... to
# from_pretrained() to use a different location.

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /home/mist/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /home/mist/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/mist/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c3446

In [8]:
# Save BOTH the tokenizer files and the model (config + weights) into the same
# local directory. The next cell loads the tokenizer from this directory, so
# skipping the tokenizer save only works when the files are left over from an
# earlier session and breaks on a fresh "Restart & Run All".
tokenizer.save_pretrained(save_directory='./bert-base-uncased')
model.save_pretrained(save_directory='./bert-base-uncased')


Configuration saved in ./bert-base-uncased/config.json
Model weights saved in ./bert-base-uncased/pytorch_model.bin


In [9]:
# Manual loading: same calls as the automatic case, but the argument is a
# local directory path instead of a model identifier.
local_dir = './bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(local_dir)  # vocabulary
model = BertModel.from_pretrained(local_dir)  # configuration + weights in one step

Didn't find file ./bert-base-uncased/added_tokens.json. We won't load it.
loading file ./bert-base-uncased/vocab.txt
loading file None
loading file ./bert-base-uncased/special_tokens_map.json
loading file ./bert-base-uncased/tokenizer_config.json
loading configuration file ./bert-base-uncased/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading wei

In [None]:
# The models usable from Transformers are exposed through the package's __init__.py.
# Print the library-level constant naming the BERT-family checkpoints; per the
# original note, other families (e.g. BART) are listed in a similar way.
bert_checkpoint_names = BERT_PRETRAINED_MODEL_ARCHIVE_LIST
print(bert_checkpoint_names)

# 词表工具使用

In [None]:
# Tokenizer词表工具是用Rust编写的，提供了多个不同组件
# Normalizer：输入字符规范化转换，如文本大小写转换、Unicode规范化
# PreTokenizer：输入数据的预处理。如基于字节、空格、字符等分割
# Model：生成和使用子词模型如WordLevel、BPE、WordPiece
# Post-Processor：文本二次处理，如在Bert中用BertProcessor为文本添加特殊标识
# Decoder：输入向量转字符串
# Trainer：为每个模型提供训练能力
# Tokenizer中，主要通过PreTrainedTokenizer类实现对外接口的使用 

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # load the vocabulary
# Print every special-token attribute of the tokenizer together with its value.
# getattr() looks the attribute up by name directly; building a source string
# and passing it to eval() is unnecessary and unsafe as a habit.
for tokenstr in tokenizer.SPECIAL_TOKENS_ATTRIBUTES:
    print(tokenstr, getattr(tokenizer, tokenstr))
# unk_token: marker for unknown words
# sep_token: marker for the end of a sentence
# pad_token: padding marker
# cls_token: start marker
# mask_token: marker for a masked word
# additional_special_tokens: user-defined extensions
print("mask_token", tokenizer.mask_token, tokenizer.mask_token_id)  # show the token and its id

Didn't find file bert-base-uncased/added_tokens.json. We won't load it.
loading file bert-base-uncased/vocab.txt
loading file None
loading file bert-base-uncased/special_tokens_map.json
loading file bert-base-uncased/tokenizer_config.json
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


bos_token None
eos_token None
unk_token [UNK]
sep_token [SEP]
pad_token [PAD]
cls_token [CLS]
mask_token [MASK]
additional_special_tokens []
mask_token [MASK] 103


In [6]:
# tokenizer.tokenize() only splits text into tokens; encode() is the usual
# one-step call that adds the special tokens, tokenizes and maps to token ids.
first_sentence = "Who is Li Jinhong ? "
second_sentence = "Li Jinhong is a programmer"
one_toind = tokenizer.encode(first_sentence)
two_toind = tokenizer.encode(second_sentence)
# Drop the first id of the second sentence (its [CLS]) before concatenating.
all_toidx = [*one_toind, *two_toind[1:]]
print(all_toidx)
print(tokenizer.convert_ids_to_tokens(one_toind))
print(tokenizer.decode(all_toidx))  # ids back to text in one step

[101, 2040, 2003, 5622, 9743, 19991, 1029, 102, 5622, 9743, 19991, 2003, 1037, 20273, 102]
['[CLS]', 'who', 'is', 'li', 'jin', '##hong', '?', '[SEP]']
[CLS] who is li jinhong? [SEP] li jinhong is a programmer [SEP]


In [7]:
# Parameters of encode(), as listed in the original notes:
#   text                - the first sentence
#   text_pair           - the second sentence
#   add_special_tokens  - whether to add the special tokens
#   max_length          - maximum length; longer input is truncated, keeping the marker tokens
#   stride              - (left undocumented in the original notes)
#   truncation_strategy - truncation strategy
#                         NOTE(review): recent transformers releases appear to name this
#                         `truncation` - confirm against the installed version
#   padding             - whether to pad short sentences; use 'max_length' to pad
#   return_tensors      - type of tensor to return: None, 'tf' or 'pt'
# The truncation strategy has four values: 'longest_first' (with two sentences,
# trim the longer one first until the total is under max_length), 'only_first',
# 'only_second' (truncate just one of the two), and no truncation.
sentence = "Li Jinhong is a programmer"
padded_sequence = tokenizer.encode(sentence, add_special_tokens=False)
print(tokenizer.decode(padded_sequence))

li jinhong is a programmer


In [11]:
# encode_plus() encodes like encode() but returns a dict; in the output of this
# cell it holds input_ids, token_type_ids and attention_mask, with the sentence
# padded to max_length.
encode_options = dict(max_length=10, padding='max_length')
padded_plus_sequence_id = tokenizer.encode_plus("Li Jinhong is a programmer", **encode_options)
print(padded_plus_sequence_id)

{'input_ids': [101, 5622, 9743, 19991, 2003, 1037, 20273, 102, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]}


In [14]:
# batch_encode_plus(): encode several sentences in a single call.
batch_sentences = ["This is a sample", "This is another longer sample text"]
tokens = tokenizer.batch_encode_plus(batch_sentences)
print(tokens)

{'input_ids': [[101, 2023, 2003, 1037, 7099, 102], [101, 2023, 2003, 2178, 2936, 7099, 3793, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}


In [17]:
# Adding special tokens to a PreTrainedTokenizer.
# add_tokens() adds ordinary tokens; add_special_tokens() adds special tokens.
def _report_special_token_state(tok, sample):
    """Print the tokenizer's extra special tokens, the tokens of `sample`, and the vocabulary size.

    Returns the ids produced by encoding `sample`.
    """
    print(tok.additional_special_tokens, tok.additional_special_tokens_ids)
    ids = tok.encode(sample)
    print(tok.convert_ids_to_tokens(ids))  # tokens obtained after encoding
    print(len(tok))  # vocabulary size
    return ids


sample_text = '<#> yes <#>'
# Before registration: the extra-token lists are empty and '<#>' is split into pieces.
toind = _report_special_token_state(tokenizer, sample_text)
# Register '<#>' as an additional special token.
special_token_dict = {'additional_special_tokens': ["<#>"]}
tokenizer.add_special_tokens(special_token_dict)
# After registration: '<#>' is kept as one token and the vocabulary grows by one.
toind = _report_special_token_state(tokenizer, sample_text)

Assigning ['<#>'] to the additional_special_tokens key of the tokenizer
Adding <#> to the vocabulary


[] []
['[CLS]', '<', '#', '>', 'yes', '<', '#', '>', '[SEP]']
30522
['<#>'] [30522]
['[CLS]', '<#>', 'yes', '<#>', '[SEP]']
30523


# 手动加载GPT-2模型权值的方式将句子补充完整

In [None]:
# 见GPT2Test.py
# 