In [1]:
import re
import torch
import torch.nn as nn
from transformers import BertForTokenClassification, BertTokenizer
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm, trange
from keras.preprocessing.sequence import pad_sequences  # padding
import gc
import os

Using TensorFlow backend.


In [32]:
file = "dh_msra.txt"

In [33]:
# 读取文件

all_sentences_separate = []
all_letter_labels = []
label_set = set()
with open(file, encoding="utf-8") as f:
    single_sentence = []
    single_sentence_labels = []
    for s in f.readlines():
        if s != "\n":
            word, label = s.split("\t")
            label = label.strip("\n")
            single_sentence.append(word)
            single_sentence_labels.append(label)
            label_set.add(label)
        elif s == "\n":
            all_sentences_separate.append(single_sentence)
            all_letter_labels.append(single_sentence_labels)
            single_sentence = []
            single_sentence_labels = []

In [34]:
print(all_sentences_separate[0:2])
print(all_letter_labels[0:2])
print(f"\n所有的标签：{label_set}")

[['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！'], ['藏', '书', '本', '来', '就', '是', '所', '有', '传', '统', '收', '藏', '门', '类', '中', '的', '第', '一', '大', '户', '，', '只', '是', '我', '们', '结', '束', '温', '饱', '的', '时', '间', '太', '短', '而', '已', '。']]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]

所有的标签：{'B-PER', 'O', 'B-ORG', 'I-LOC', 'I-ORG', 'I-PER', 'B-LOC'}


In [304]:
print(len(all_sentences_separate))

55289


In [35]:
# 构建 tag 到 索引 的字典
tag_to_ix = {"B-LOC": 0,
             "I-LOC": 1, 
             "B-ORG": 2, 
             "I-ORG": 3,
             "B-PER": 4,
             "I-PER": 5,
             "O": 6,
             "[CLS]":7,
             "[SEP]":8,
             "[PAD]":9}

ix_to_tag = {0:"B-LOC", 
             1:"I-LOC", 
             2:"B-ORG", 
             3:"I-ORG",
             4:"B-PER",
             5:"I-PER",
             6:"O",
             7:"[CLS]",
             8:"[SEP]",
             9:"[PAD]"}

In [36]:
all_sentences = []  # 句子

for one_sentence in all_sentences_separate:
    sentence = "".join(one_sentence)
    all_sentences.append(sentence)

print(all_sentences[0:2])

['当希望工程救助的百万儿童成长起来，科教兴国蔚然成风时，今天有收藏价值的书你没买，明日就叫你悔不当初！', '藏书本来就是所有传统收藏门类中的第一大户，只是我们结束温饱的时间太短而已。']


In [37]:
all_labels = []  # labels
for letter_labels in all_letter_labels:
    labels = [tag_to_ix[t] for t in letter_labels]
    all_labels.append(labels)

print(all_labels[0:2])
print(len(all_labels[0]))

[[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]]
50


In [38]:
# word2token
tokenizer = BertTokenizer.from_pretrained('./bert-chinese/', do_lower_case=True)
tokenized_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_sentences]

In [119]:
print(tokenized_texts[0])
print(len(tokenized_texts[0]))

[101, 2496, 2361, 3307, 2339, 4923, 3131, 1221, 4638, 4636, 674, 1036, 4997, 2768, 7270, 6629, 3341, 8024, 4906, 3136, 1069, 1744, 5917, 4197, 2768, 7599, 3198, 8024, 791, 1921, 3300, 3119, 5966, 817, 966, 4638, 741, 872, 3766, 743, 8024, 3209, 3189, 2218, 1373, 872, 2637, 679, 2496, 1159, 8013, 102]
52


In [40]:
# 句子padding
# 句子最长长度
MAX_LEN = 128

# 输入padding
# 此函数在keras里面
input_ids = pad_sequences([txt for txt in tokenized_texts],
                          maxlen=MAX_LEN, 
                          dtype="long", 
                          truncating="post", 
                          padding="post")

In [41]:
# [3] 代表 O 实体
for label in all_labels:
    label.insert(len(label), 8)  # [SEP]
    label.insert(0, 7) # [CLS]
    if MAX_LEN > len(label) -1:
        for i in range(MAX_LEN - len(label)):
            label.append(9)  # [PAD]

In [118]:
print(len(all_labels[0]))
print(all_labels[0])

128
[7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]


In [122]:
test = [7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8]
len(test)

52

In [43]:
# 创建attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

In [120]:
print(attention_masks[0])
print(attention_masks[0].count(1.0))

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
52


In [44]:
# train-test-split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, 
                                                                                    all_labels, 
                                                                                    random_state=2019, 
                                                                                    test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                      input_ids,
                                                      random_state=2019, 
                                                       test_size=0.1)

In [45]:
# tensor化
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [46]:
# dataloader
# batch size
batch_size = 64

# 形成训练数据集
train_data = TensorDataset(train_inputs, train_masks, train_labels)  
# 随机采样
train_sampler = RandomSampler(train_data) 
# 读取数据
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


# 形成验证数据集
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# 随机采样
validation_sampler = SequentialSampler(validation_data)
# 读取数据
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [47]:
# GPU
print("Is CUDA available: ", torch.cuda.is_available())
if torch.cuda.is_available():
    n_gpu = torch.cuda.device_count()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("GPU numbers: ", n_gpu)
    print("device_name: ", torch.cuda.get_device_name(0))
    torch.cuda.set_device(1)
    print(f"Current device: {torch.cuda.current_device()}")

# 读取模型
# Load a trained model and vocabulary that you have fine-tuned
output_dir = "./model_save"
model = BertForTokenClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
model.to(device)

Is CUDA available:  True
GPU numbers:  2
device_name:  Tesla M40 24GB
Current device: 1


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [123]:
# 验证状态
model.eval()

# 建立变量
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch

In [280]:
# 验证集的读取也要batch
all_scores = []  # 保存全部分数
y_true = []
x_input = []

for batch in tqdm(validation_dataloader):
    # 元组打包放进GPU
    # 
    batch = tuple(t.to(device) for t in batch)  # 64
    # 解开元组
    b_input_ids, b_input_mask, b_labels = batch
    # 预测
    with torch.no_grad():
        # segment embeddings，如果没有就是全0，表示单句
        # position embeddings，[0,句子长度-1]
        outputs = model(input_ids=b_input_ids, 
                       attention_mask=b_input_mask,
                       token_type_ids=None,
                       position_ids=None)  
                       
    # print(logits[0])
    # Move logits and labels to CPU
    scores = outputs[0].detach().cpu().numpy()  # 每个batch，字的标签的概率
    for score in scores:
        all_scores.append(score)
        
    # 真实label
    label_ids = b_labels.to('cpu').numpy()
    for true_label in label_ids:
        true_labels = [ix_to_tag[n] for n in list(true_label)]
        y_true.append(true_labels)
    # score 维度应该是，batchsize*padding后句子长度*10个标签

    # 保存输入
    input_ids = b_input_ids.to('cpu').numpy()
    for one_sent in input_ids:
        one_sent_list = list(one_sent)
        x_input.append(tokenizer.decode(one_sent_list))
    
    
    
#    for one_sent_score in scores:        
#        pred_flat = np.argmax(one_sent_score, axis=1).flatten()  # 一句话里面每个字的标签
#        pre_labels = [ix_to_tag[n] for n in pred_flat]           # 转换为label
#        
#        # pre_labels_cut = pre_labels[0:len(test_sententce)+2]     # 截断
#        y_pred.append(pre_labels)                            # 存入
#
#        label_ids = b_labels.to('cpu').numpy()  # 真实labels，一个batch
#        true_labels = [ix_to_tag[n] for n in label_ids]
#        # true_labels_cut = true_labels[0:len(test_sententce)+2]
#        y_true.append(true_labels)

100%|██████████| 87/87 [00:38<00:00,  2.24it/s]


In [283]:
print(len(all_scores))
print(len(y_true))
print(len(x_input))

5529
5529
5529


In [284]:
print(len(all_scores[0]))

128


In [285]:
print(scores[0])

[[-1.6202939  -1.2211432  -1.4908961  ... 12.054188   -0.9071342
  -0.53540987]
 [-0.48261884 -3.1787622  10.726759   ... -0.9759311  -0.63543904
  -0.8629871 ]
 [-2.0440245  -1.291023    0.1454812  ... -1.3998073  -1.5697947
  -1.397507  ]
 ...
 [ 0.02825457 -1.1788284  -0.73603225 ... -1.8228445  -3.029034
  -2.5747526 ]
 [ 0.14460203 -1.20441    -0.6588846  ... -1.7593207  -3.0693283
  -2.618497  ]
 [ 0.30255833 -1.304818   -0.590544   ... -1.7114282  -3.1273003
  -2.6204689 ]]


In [286]:
# 保存结果
y_pred = []
# y_true = []

for one_sent_score in all_scores:        
    pred_flat = np.argmax(one_sent_score, axis=1).flatten()  # 一句话里面每个字的标签
    pre_labels = [ix_to_tag[n] for n in list(pred_flat)]           # 转换为label

    # pre_labels_cut = pre_labels[0:len(test_sententce)+2]     # 截断
    y_pred.append(pre_labels)                            # 存入

print(len(y_pred))

5529


In [287]:
# label_ids = b_labels.to('cpu').numpy()  # 真实labels，一个batch,64个
# for true_label in label_ids:
#     true_labels = [ix_to_tag[n] for n in list(true_label)]
# true_labels_cut = true_labels[0:len(test_sententce)+2]
#     y_true.append(true_labels)

In [288]:
# input_ids = b_input_ids.to('cpu').numpy()
# print(input_ids)

In [260]:
# x_input = []
# for one_sent in input_ids:
#     one_sent_list = list(one_sent)
#     x_input.append(tokenizer.decode(one_sent_list))

In [289]:
index = 0
print(y_pred[index])
print("\n")
print(y_true[index])
print("\n")
print(x_input[index])

['[CLS]', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '[SEP]', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'I-PER']


['[CLS]', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 

In [290]:
cut_y_true = []
for y in y_true:
    for label in y:
        if label != '[SEP]':
            cut_y_one.append(label)
        else:
            cut_y_true.append(cut_y_one)
            cut_y_one = []
            break

In [291]:
cut_y_pred = []
for index, y in enumerate(y_pred):
    cut_y_one_pred = []
    cut_y_pred.append(y[0:len(cut_y_true[index])])

In [292]:
cut_index = 0
print(cut_y_true[cut_index])
print(len(cut_y_true[cut_index]))
print(cut_y_pred[cut_index])
print(len(cut_y_pred[cut_index]))
# assert len(cut_y_true[cut_index]) == len(cut_y_pred[cut_index])

['[CLS]', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
31
['[CLS]', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
31


In [293]:
print(len(cut_y_true))
print(len(cut_y_pred))
print(len(x_input))

5529
5529
5529


In [299]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from seqeval.metrics import recall_score

In [300]:
print(f1_score(cut_y_true, cut_y_pred))

0.9521963351742804


In [301]:
print(recall_score(cut_y_true, cut_y_pred))

0.9559374311522362


In [302]:
print(accuracy_score(cut_y_true, cut_y_pred))

0.9916312389113916


In [303]:
print(classification_report(cut_y_true, cut_y_pred))

           precision    recall  f1-score   support

      ORG       0.88      0.91      0.89      2114
      LOC       0.91      0.92      0.91      4037
      PER       0.96      0.96      0.96      1937
    [CLS]       1.00      1.00      1.00      5529

micro avg       0.95      0.96      0.95     13617
macro avg       0.95      0.96      0.95     13617

