In [None]:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File         :AutoModel.ipynb
@Description  :用automodel方式完成多种NLP任务
@Time         :2022/04/28 20:40:19
@Author       :Hedwig
@Version      :1.0
'''


In [None]:
# AutoModel类用于管理Transformers库中处理相同的NLP任务的底层具体模型，为上层应用管道方式提供统一的接口
# 按Bertology系列的应用场景，Transformers库中的AutoModel类被划分为以下6个子类
# AutoModel：基本载入类，适用于Transformers中的任何模型，也适用于特征提取
# AutoModelForPreTraining：特征提取任务的模型载入类，适用于Transformers库中所有的预训练模型
# AutoModelForSequenceClassification：文本分类的模型载入类，适用于Transformers中所有文本分类模型
# AutoModelForQuestionAnswering：阅读理解任务的模型载入类，适用于Transformers中所有抽取式问答模型
# AutoModelWithLMHead：完形填空任务的模型载入类，适用于Transformers中所有遮蔽语言模型（该类在新版本中已被弃用，建议改用AutoModelForMaskedLM）
# AutoModelForTokenClassification：实体词识别的模型载入类，适用于Transformers库中所有实体词识别模型

In [1]:
from transformers import *

In [None]:
# transformers的models/auto路径下有modeling_auto.py源文件，可以找到模型载入类和具体的模型映射关系
# 以AutoModelWithLMHead类为例，MODEL_WITH_LM_HEAD_MAPPING_NAMES代表了载入类与系列模型的映射
# 在这里列出的所有元素都可以实现AutoModelWithLMHead类所完成的完形填空
# 它的元素包括两部分，具体模型的配置文件和具体模型的实现类
# 每一个具体模型的实现类会通过不同数据集被训练成多套预训练模型文件
# 每套模型训练文件又由3-4个子文件组成：词表文件、词表扩展文件(可选)、配置文件和模型权值文件
# 这些文件共用一个统一的字符串标识
# 用自动加载方式调用模型时，系统会根据统一的预训练模型字符串标识，找到对应的预训练模型文件，通过网络下载并载入 

In [None]:
# 具体地，pipeline的任务字符串(以fill-mask为例)传入后，pipeline会把传入的任务作为SUPPORTED_TASKS的键，
# 找到对应的配置字典，用get_default_model方法根据配置字典中default键找到相应model的模型字符串
# (这里是distilroberta-base)，这个字符串用于infer_framework_load_model函数转化为模型的类
# 在默认的SUPPORTED_TASKS中就是AutoModelForMaskedLM类 
# 同时还会用配置字典的impl键找到子pipeline的类名(这里是FillMaskPipeline，如果pipeline_class有指定就用
# 指定的这个)，并调用它的call方法，将模型的类作为参数传入
# 模型的类又调用相应的任务，(在这里是RobertaForMaskedLM)，加载相应的配置文件
# transformers目录下的modelcard.py有任务字符串的键(fill-mask)与对应的
# (MODEL_FOR_MASKED_LM_MAPPING_NAMES)的映射


In [12]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Fill-mask demo: tokenize a two-sentence input, mask one token of the second
# sentence, and let a pretrained masked language model predict it.

# Checkpoint identifier shared by the tokenizer and the model.
# 'distilbert-base-uncased' and 'bert-base-cased' have both been tried with this cell.
model_identity = 'bert-base-cased'

# Load the vocabulary/tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_identity)

# Input text; the special tokens are written out by hand, so the plain
# tokenize() call below is enough and no extra special tokens are added.
text = "[CLS] who is Li Jinhong ? [SEP] Li JinHong is a programmer [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# Mask the first token of the second sentence. Its position is derived from the
# first separator instead of being hard-coded, so it stays correct when the
# text or the tokenizer changes (for the text above it is index 8).
first_sep_index = tokenized_text.index(tokenizer.sep_token)
masked_index = first_sep_index + 1
tokenized_text[masked_index] = tokenizer.mask_token
print(tokenized_text)

# Convert the tokens to vocabulary ids and add a batch dimension.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
print(tokens_tensor)

# Pick the device: first GPU when available, CPU otherwise.
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
print(device)

# Load the pretrained model. AutoModelForMaskedLM replaces the deprecated
# AutoModelWithLMHead; for this checkpoint it resolves to BertForMaskedLM,
# so BertForMaskedLM.from_pretrained(model_identity) would work as well.
model = AutoModelForMaskedLM.from_pretrained(model_identity)

# When loading from a local directory instead, remember to pass
# config=AutoConfig.from_pretrained('<local path>').
# To try another checkpoint, look in the transformers "models" folder: each
# model's tokenization file lists the available identifiers.

model.eval()  # inference mode: disables dropout
model.to(device)

# Segment ids: 0 for the first sentence up to and including its [SEP],
# 1 for everything after it. The list is built from the token list so its
# length always matches the input (the previous hard-coded 8 + 7 = 15 entries
# did not match the 16 tokens produced above).
segments_ids = [0] * (first_sep_index + 1) + [1] * (len(tokenized_text) - first_sep_index - 1)
segments_tensor = torch.tensor([segments_ids]).to(device)
tokens_tensor = tokens_tensor.to(device)

# Only pass segment ids to models that accept them (DistilBERT, for example,
# has no token_type_ids input).
model_inputs = {"input_ids": tokens_tensor}
if "token_type_ids" in tokenizer.model_input_names:
    model_inputs["token_type_ids"] = segments_tensor

with torch.no_grad():
    outputs = model(**model_inputs)

# Prediction scores, shape [batch, seq_len, vocab_size];
# [1, 16, 28996] for bert-base-cased with the text above.
predictions = outputs[0]
predict_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predict_index])[0]
print('Predicted token is:', predicted_token)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /home/mist/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/bert-base-cased/resolve/mai

['[CLS]', 'who', 'is', 'Li', 'Jin', '##hong', '?', '[SEP]', 'Li', 'Jin', '##H', '##ong', 'is', 'a', 'programmer', '[SEP]']
['[CLS]', 'who', 'is', 'Li', 'Jin', '##hong', '?', '[SEP]', '[MASK]', 'Jin', '##H', '##ong', 'is', 'a', 'programmer', '[SEP]']
tensor([[  101,  1150,  1110,  5255, 10922, 15564,   136,   102,   103, 10922,
          3048,  4553,  1110,   170, 23981,   102]])
cuda:0


loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /home/mist/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file https://huggingface.co/bert-base-cased/res

Predicted token is: Li
