In [2]:
import re
import torch
import torch.nn as nn
from transformers import BertForTokenClassification, BertTokenizer
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm.notebook import tqdm
from keras.preprocessing.sequence import pad_sequences  # padding
import gc
import os

Using TensorFlow backend.


In [2]:
gc.collect()

20

In [3]:
file = "dh_msra.txt"

In [5]:
# GPU
print("Is CUDA available: ", torch.cuda.is_available())
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU numbers: ", n_gpu)
print("device_name: ", torch.cuda.get_device_name(0))
torch.cuda.set_device(1)
print(f"Current device: {torch.cuda.current_device()}")

Is CUDA available:  True
GPU numbers:  2
device_name:  Tesla M40 24GB
Current device: 1


In [4]:
all_sentences_separate = []
all_letter_labels = []
label_set = set()
with open(file, encoding="utf-8") as f:
    single_sentence = []
    single_sentence_labels = []
    for s in f.readlines():
        if s != "\n":
            word, label = s.split("\t")
            label = label.strip("\n")
            single_sentence.append(word)
            single_sentence_labels.append(label)
            label_set.add(label)
        elif s == "\n":
            all_sentences_separate.append(single_sentence)
            all_letter_labels.append(single_sentence_labels)
            single_sentence = []
            single_sentence_labels = []

In [5]:
print(all_sentences_separate[0:2])
print(all_letter_labels[0:2])
print(f"\n所有的标签：{label_set}")

[['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！'], ['藏', '书', '本', '来', '就', '是', '所', '有', '传', '统', '收', '藏', '门', '类', '中', '的', '第', '一', '大', '户', '，', '只', '是', '我', '们', '结', '束', '温', '饱', '的', '时', '间', '太', '短', '而', '已', '。']]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]

所有的标签：{'I-LOC', 'B-ORG', 'I-PER', 'I-ORG', 'O', 'B-PER', 'B-LOC'}


In [7]:
# 构建 tag 到 索引 的字典
tag_to_ix = {"B-LOC": 0,
             "I-LOC": 1, 
             "B-ORG": 2, 
             "I-ORG": 3,
             "B-PER": 4,
             "I-PER": 5,
             "O": 6,
             "[CLS]":7,
             "[SEP]":8,
             "[PAD]":9}

ix_to_tag = {0:"B-LOC", 
             1:"I-LOC", 
             2:"B-ORG", 
             3:"I-ORG",
             4:"B-PER",
             5:"I-PER",
             6:"O",
             7:"[CLS]",
             8:"[SEP]",
             9:"[PAD]"}

In [8]:
all_sentences = []  # 句子

for one_sentence in all_sentences_separate:
    sentence = "".join(one_sentence)
    all_sentences.append(sentence)

print(all_sentences[0:2])

['当希望工程救助的百万儿童成长起来，科教兴国蔚然成风时，今天有收藏价值的书你没买，明日就叫你悔不当初！', '藏书本来就是所有传统收藏门类中的第一大户，只是我们结束温饱的时间太短而已。']


In [9]:
all_labels = []  # labels
for letter_labels in all_letter_labels:
    labels = [tag_to_ix[t] for t in letter_labels]
    all_labels.append(labels)

print(all_labels[0:2])
print(len(all_labels[0]))

[[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]]
50


In [10]:
print(len(all_labels))

55289


In [11]:
# word2token
tokenizer = BertTokenizer.from_pretrained('./bert-chinese/', do_lower_case=True)
tokenized_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_sentences]

In [12]:
print(tokenized_texts[0])

[101, 2496, 2361, 3307, 2339, 4923, 3131, 1221, 4638, 4636, 674, 1036, 4997, 2768, 7270, 6629, 3341, 8024, 4906, 3136, 1069, 1744, 5917, 4197, 2768, 7599, 3198, 8024, 791, 1921, 3300, 3119, 5966, 817, 966, 4638, 741, 872, 3766, 743, 8024, 3209, 3189, 2218, 1373, 872, 2637, 679, 2496, 1159, 8013, 102]


In [13]:
# 句子padding
# 句子最长长度
MAX_LEN = 128

# 输入padding
# 此函数在keras里面
input_ids = pad_sequences([txt for txt in tokenized_texts],
                          maxlen=MAX_LEN, 
                          dtype="long", 
                          truncating="post", 
                          padding="post")

In [14]:
print(len(input_ids[0]))
print(input_ids[0])

128
[ 101 2496 2361 3307 2339 4923 3131 1221 4638 4636  674 1036 4997 2768
 7270 6629 3341 8024 4906 3136 1069 1744 5917 4197 2768 7599 3198 8024
  791 1921 3300 3119 5966  817  966 4638  741  872 3766  743 8024 3209
 3189 2218 1373  872 2637  679 2496 1159 8013  102    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [15]:
# [3] 代表 O 实体
for label in all_labels:
    label.insert(len(label), 8)  # [SEP]
    label.insert(0, 7) # [CLS]
    if MAX_LEN > len(label) -1:
        for i in range(MAX_LEN - len(label)):
            label.append(9)  # [PAD]

In [16]:
print(len(all_labels[0]))
print(all_labels[0])

128
[7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]


In [17]:
# 创建attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

In [18]:
# 第一句话的 attention_masks
print(np.array(attention_masks[0]))
print(len(np.array(attention_masks[0])))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
128


In [19]:
# train-test-split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, 
                                                                                    all_labels, 
                                                                                    random_state=2019, 
                                                                                    test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=2019, 
                                                       test_size=0.1)

In [20]:
print(len(train_inputs))
print(len(validation_inputs))

print(train_inputs[0])
print(validation_inputs[0])

49760
5529
[ 101 3616 3828 6598 3315 2356 1767 4638 2600  817  966 8024 1315 2828
 3616 4673 8115 1744 2792 3300 4638 5500 4873  510 6395 1171 1469 7213
 6121 2100 3621 1217 1762  671 6629 8024 2347 5307 6631 6814 5401 1744
  511  102    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
[ 101 3655 4904  510 7029 3941 1469 2166 3360 3918 3884 8024 3354 2768
  749 4263 3173 6230 5384  185 6825 5307  704 1744 4514 4638  712 6206
 7579 3332  511  102    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    

In [21]:
# tensor化
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

In [22]:
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

In [23]:
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [24]:
# dataloader
# batch size
batch_size = 64

# 形成训练数据集
train_data = TensorDataset(train_inputs, train_masks, train_labels)  
# 随机采样
train_sampler = RandomSampler(train_data) 
# 读取数据
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


# 形成验证数据集
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# 随机采样
validation_sampler = SequentialSampler(validation_data)
# 读取数据
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [25]:
model = BertForTokenClassification.from_pretrained("./bert-chinese/", num_labels=10)
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [26]:
# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']

# 权重衰减
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
     'weight_decay': 0.0}]

In [27]:
# 优化器
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=5e-5)

In [28]:
# 保存loss
train_loss_set = []
# epochs 
epochs = 5
# epochs = 4

In [29]:
# BERT training loop
for _ in range(epochs): 
    ## 训练
    print(f"当前epoch： {_}")
    # 开启训练模式
    model.train()
    tr_loss = 0  # train loss
    nb_tr_examples, nb_tr_steps = 0, 0
    # Train the data for one epoch
    for step, batch in tqdm(enumerate(train_dataloader)):
        # 把batch放入GPU
        batch = tuple(t.to(device) for t in batch)
        # 解包batch
        b_input_ids, b_input_mask, b_labels = batch
        # 梯度归零
        optimizer.zero_grad()
        # 前向传播loss计算
        output = model(input_ids=b_input_ids, 
                       attention_mask=b_input_mask, 
                       labels=b_labels)  
        loss = output[0]
        # print(loss)
        # 反向传播
        loss.backward()
        # Update parameters and take a step using the computed gradient
        # 更新模型参数
        optimizer.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
    print(f"当前 epoch 的 Train loss: {tr_loss/nb_tr_steps}")

  0%|          | 0/5 [00:00<?, ?it/s]

当前epoch： 0


 20%|██        | 1/5 [17:01<1:08:06, 1021.52s/it]

当前 epoch 的 Train loss: 0.08692683487086722
当前epoch： 1


 40%|████      | 2/5 [34:03<51:05, 1021.77s/it]  

当前 epoch 的 Train loss: 0.028457486720403758
当前epoch： 2


 60%|██████    | 3/5 [51:05<34:03, 1021.64s/it]

当前 epoch 的 Train loss: 0.018437476925657233
当前epoch： 3


 80%|████████  | 4/5 [1:08:07<17:01, 1021.76s/it]

当前 epoch 的 Train loss: 0.014582799714816385
当前epoch： 4


100%|██████████| 5/5 [1:25:09<00:00, 1021.96s/it]

当前 epoch 的 Train loss: 0.011921173068431413





In [83]:
# 验证状态
model.eval()

# 建立变量
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch

In [84]:
# 验证集的读取也要batch
for batch in tqdm(validation_dataloader):
    # 元组打包放进GPU
    batch = tuple(t.to(device) for t in batch)
    # 解开元组
    b_input_ids, b_input_mask, b_labels = batch
    # 预测
    with torch.no_grad():
        # segment embeddings，如果没有就是全0，表示单句
        # position embeddings，[0,句子长度-1]
        outputs = model(input_ids=b_input_ids, 
                       attention_mask=b_input_mask,
                       token_type_ids=None,
                       position_ids=None)  
                       
    # print(logits[0])
    # Move logits and labels to CPU
    scores = outputs[0].detach().cpu().numpy()  # 每个字的标签的概率
    pred_flat = np.argmax(scores[0], axis=1).flatten()
    label_ids = b_labels.to('cpu').numpy()  # 真实labels
    # print(logits, label_ids)


  0%|          | 0/346 [00:00<?, ?it/s][A
  0%|          | 1/346 [00:00<00:43,  7.85it/s][A
  1%|          | 2/346 [00:00<00:44,  7.81it/s][A
  1%|          | 3/346 [00:00<00:42,  7.99it/s][A
  1%|          | 4/346 [00:00<00:41,  8.18it/s][A
  1%|▏         | 5/346 [00:00<00:40,  8.33it/s][A
  2%|▏         | 6/346 [00:00<00:40,  8.45it/s][A
  2%|▏         | 7/346 [00:00<00:39,  8.53it/s][A
  2%|▏         | 8/346 [00:00<00:39,  8.58it/s][A
  3%|▎         | 9/346 [00:01<00:39,  8.60it/s][A
  3%|▎         | 10/346 [00:01<00:38,  8.63it/s][A
  3%|▎         | 11/346 [00:01<00:38,  8.65it/s][A
  3%|▎         | 12/346 [00:01<00:38,  8.67it/s][A
  4%|▍         | 13/346 [00:01<00:38,  8.69it/s][A
  4%|▍         | 14/346 [00:01<00:38,  8.70it/s][A
  4%|▍         | 15/346 [00:01<00:38,  8.71it/s][A
  5%|▍         | 16/346 [00:01<00:37,  8.71it/s][A
  5%|▍         | 17/346 [00:01<00:37,  8.71it/s][A
  5%|▌         | 18/346 [00:02<00:37,  8.71it/s][A
  5%|▌         | 19/346 [00:0

 90%|████████▉ | 310/346 [00:35<00:04,  8.70it/s][A
 90%|████████▉ | 311/346 [00:35<00:04,  8.70it/s][A
 90%|█████████ | 312/346 [00:35<00:03,  8.70it/s][A
 90%|█████████ | 313/346 [00:35<00:03,  8.70it/s][A
 91%|█████████ | 314/346 [00:36<00:03,  8.70it/s][A
 91%|█████████ | 315/346 [00:36<00:03,  8.70it/s][A
 91%|█████████▏| 316/346 [00:36<00:03,  8.70it/s][A
 92%|█████████▏| 317/346 [00:36<00:03,  8.70it/s][A
 92%|█████████▏| 318/346 [00:36<00:03,  8.69it/s][A
 92%|█████████▏| 319/346 [00:36<00:03,  8.69it/s][A
 92%|█████████▏| 320/346 [00:36<00:02,  8.69it/s][A
 93%|█████████▎| 321/346 [00:36<00:02,  8.70it/s][A
 93%|█████████▎| 322/346 [00:37<00:02,  8.70it/s][A
 93%|█████████▎| 323/346 [00:37<00:02,  8.70it/s][A
 94%|█████████▎| 324/346 [00:37<00:02,  8.70it/s][A
 94%|█████████▍| 325/346 [00:37<00:02,  8.70it/s][A
 94%|█████████▍| 326/346 [00:37<00:02,  8.70it/s][A
 95%|█████████▍| 327/346 [00:37<00:02,  8.70it/s][A
 95%|█████████▍| 328/346 [00:37<00:02,  8.70it

In [199]:
# 保存模型
# They can then be reloaded using `from_pretrained()`



output_dir = "./model_save"
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
torch.save(model_to_save.state_dict(), os.path.join(output_dir, 'training_args.bin'))

In [6]:
# 读取模型
# Load a trained model and vocabulary that you have fine-tuned
output_dir = "./model_save"
model = BertForTokenClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [84]:
# 单句测试

# test_sententce = "在北京市朝阳区的一家网吧，我亲眼看见卢本伟和孙笑川一起开挂。"
test_sententce = "史源源的房子租在滨江区南环路税友大厦附近。"

In [85]:
# 构建 tag 到 索引 的字典
tag_to_ix = {"B-LOC": 0,
             "I-LOC": 1, 
             "B-ORG": 2, 
             "I-ORG": 3,
             "B-PER": 4,
             "I-PER": 5,
             "O": 6,
             "[CLS]":7,
             "[SEP]":8,
             "[PAD]":9}

ix_to_tag = {0:"B-LOC", 
             1:"I-LOC", 
             2:"B-ORG", 
             3:"I-ORG",
             4:"B-PER",
             5:"I-PER",
             6:"O",
             7:"[CLS]",
             8:"[SEP]",
             9:"[PAD]"}

In [86]:
# word2token
tokenized_texts = [tokenizer.encode(test_sententce, add_special_tokens=True)]
print(tokenized_texts)
print(len(tokenized_texts[0]))

[[101, 1380, 3975, 3975, 4638, 2791, 2094, 4909, 1762, 4012, 3736, 1277, 1298, 4384, 6662, 4925, 1351, 1920, 1336, 7353, 6818, 511, 102]]
23


In [87]:
# 句子padding
# 句子最长长度
MAX_LEN = 128

# 输入padding
# 此函数在keras里面
test_input_ids = pad_sequences([txt for txt in tokenized_texts],
                          maxlen=MAX_LEN, 
                          dtype="long", 
                          truncating="post", 
                          padding="post")
print(test_input_ids)

[[ 101 1380 3975 3975 4638 2791 2094 4909 1762 4012 3736 1277 1298 4384
  6662 4925 1351 1920 1336 7353 6818  511  102    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]


In [88]:
# 创建attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in test_input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

print(np.array(attention_masks))

# 0-51 为1

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]]


In [89]:
# tensor化
test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(attention_masks)

In [90]:
print(test_inputs)
print(test_masks)

tensor([[ 101, 1380, 3975, 3975, 4638, 2791, 2094, 4909, 1762, 4012, 3736, 1277,
         1298, 4384, 6662, 4925, 1351, 1920, 1336, 7353, 6818,  511,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0

In [91]:
# 形成验证数据集
batch_size = 64

test_data = TensorDataset(test_inputs, test_masks)
# 随机采样
test_sampler = SequentialSampler(test_data)
# 读取数据
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [92]:
# 验证状态
model.eval()

# 建立变量
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch

In [93]:
# 验证集的读取也要batch
for batch in tqdm(test_dataloader):
    # 元组打包放进GPU
    batch = tuple(t.to(device) for t in batch)
    # 解开元组
    b_input_ids, b_input_mask = batch
    # 预测
    with torch.no_grad():
        # segment embeddings，如果没有就是全0，表示单句
        # position embeddings，[0,句子长度-1]
        outputs = model(input_ids=b_input_ids, 
                       attention_mask=None,
                       token_type_ids=None,
                       position_ids=None)  
                       
    # Move logits and labels to CPU
    scores = outputs[0].detach().cpu().numpy()  # 每个字的标签的概率
    pred_flat = np.argmax(scores[0], axis=1).flatten()
    # label_ids = b_labels.to('cpu').numpy()  # 真实labels
    print(pred_flat)  # 预测值

100%|██████████| 1/1 [00:00<00:00, 58.83it/s]

[7 4 5 5 6 6 6 6 6 0 1 1 0 1 1 0 1 1 1 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]





In [94]:
pre_labels = [ix_to_tag[n] for n in pred_flat]
print(f"测试句子: {test_sententce}")
print(len(test_sententce))
print(pre_labels)

测试句子: 史源源的房子租在滨江区南环路税友大厦附近。
21
['[CLS]', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [95]:
pre_labels_cut = pre_labels[0:len(test_sententce)+2]
pre_labels_cut

['[CLS]',
 'B-PER',
 'I-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'O',
 'O',
 'O',
 'O']

In [96]:
person = []  # 临时栈
persons = []

location = []
locations = []


for i in range(len(pre_labels_cut) - 1):
    # Person
    # 单字情况
    if pre_labels[i] == 'B-PER' and pre_labels[i+1] != 'I-PER' and len(location) == 0:
        person.append(i)  
        persons.append(person)
        person = []  # 清空
        continue    
        
    # 非单字
    # 如果前面有连着的 PER 实体    
    if pre_labels[i] == 'B-PER'and pre_labels[i+1] == 'I-PER' and len(person) != 0:
        person.append(i)
        
    # 如果前面没有连着的 B-PER 实体
    elif pre_labels[i] == 'B-PER'and pre_labels[i+1] == 'I-PER' and len(location) == 0:
        person.append(i)  # 加入新的 B-PER
    elif pre_labels[i] != 'I-PER' and len(person) != 0:
        persons.append(person)  # 临时栈内容放入正式栈
        person = []  # 清空临时栈
    elif pre_labels[i] == 'I-PER' and len(person) != 0:
        person.append(i)
    else:  # 极少数情况会有 I-PER 开头的，不理
        pass

    # Location
    # 单字情况
    if pre_labels[i] == 'B-LOC' and pre_labels[i+1] != 'I-LOC' and len(location) == 0:
        location.append(i)  
        locations.append(location)
        location = []  # 清空
        continue
        
    # 非单字
    # 如果前面有连着的 LOC 实体
    
    if pre_labels[i] == 'B-LOC' and pre_labels[i+1] == 'I-LOC' and len(location) != 0:
        locations.append(location)
        location = []  # 清空栈
        location.append(i)  # 加入新的 B-LOC
        
    # 如果前面没有连着的 B-LOC 实体
    elif pre_labels[i] == 'B-LOC' and pre_labels[i+1] == 'I-LOC' and len(location) == 0:
        location.append(i)  # 加入新的 B-LOC
    elif pre_labels[i] == 'I-LOC' and len(location) != 0:
        location.append(i)
    # 结尾
    elif pre_labels[i] != 'I-LOC' and len(location) != 0:
        locations.append(location)  # 临时栈内容放入正式栈
        location = []  # 清空临时栈
    else:  # 极少数情况会有 I-LOC 开头的，不理
        pass
    
print(persons)
print(locations)

[[1, 2, 3]]
[[9, 10, 11], [12, 13, 14], [15, 16, 17, 18]]


In [97]:
# 从文字中提取
# 人物
NER_PER = []
for word_idx in persons:
    ONE_PER = []
    for letter_idx in word_idx: 
        ONE_PER.append(test_sententce[letter_idx - 1])
    NER_PER.append(ONE_PER)

NER_PER_COMBINE = []
for w in NER_PER:
    PER = "".join(w)
    NER_PER_COMBINE.append(PER)
    
# 地点

NER_LOC = []
for word_idx in locations:
    ONE_LOC = []
    for letter_idx in word_idx: 
        # print(letter_idx)
        # print(test_sententce[letter_idx])
        ONE_LOC.append(test_sententce[letter_idx - 1])
    NER_LOC.append(ONE_LOC)

NER_LOC_COMBINE = []
for w in NER_LOC:
    LOC = "".join(w)
    NER_LOC_COMBINE.append(LOC)

# 组织

In [98]:
print(f"当前句子：{test_sententce}\n")
print(f"    人物：{NER_PER_COMBINE}\n")
print(f"    地点：{NER_LOC_COMBINE}\n")

当前句子：史源源的房子租在滨江区南环路税友大厦附近。

    人物：['史源源']

    地点：['滨江区', '南环路', '税友大厦']

