In [3]:
import os
from pprint import pprint

import torch
import torch.nn as nn
from transformers import BertForTokenClassification, BertTokenizer
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm.notebook import tqdm

## 读取MSRA实体识别数据集

In [4]:
file = "../datasets/dh_msra.txt"

## 检查GPU情况

In [5]:
# GPUcheck

print("CUDA Available: ", torch.cuda.is_available())
n_gpu = torch.cuda.device_count()

if torch.cuda.is_available():
    print("GPU numbers: ", n_gpu)
    print("device_name: ", torch.cuda.get_device_name(0))
    device = torch.device("cuda:0")  # 注意选择
    torch.cuda.set_device(0) 
    print(f"当前设备：{torch.cuda.current_device()}")
else :
    device = torch.device("cpu")
    print(f"当前设备：{device}")

CUDA Available:  True
GPU numbers:  1
device_name:  GeForce RTX 3090
当前设备：0


## 配置参数

规范化配置参数，方便使用。

In [6]:
class Config(object):
    """配置参数"""
    def __init__(self):
        self.model_name = 'Bert_NER.bin'
        self.bert_path = './bert-chinese/'
        self.ner_file = '../datasets/dh_msra.txt'
        
        self.num_classes = 10                    # 类别数(按需修改)，这里有10种实体类型
        self.hidden_size = 768                   # 隐藏层输出维度
        self.hidden_dropout_prob = 0.1           # dropout比例
        self.batch_size = 128                    # mini-batch大小
        self.max_len = 103                       # 句子的最长padding长度
        
        self.epochs = 3                          # epoch数
        self.learning_rate = 2e-5                # 学习率        

        self.save_path = './saved_model/'        # 模型训练结果保存路径
        
        # self.fp16 = False
        # self.fp16_opt_level = 'O1'
        # self.gradient_accumulation_steps = 1
        # self.warmup_ratio = 0.06
        # self.warmup_steps = 0
        # self.max_grad_norm = 1.0
        # self.adam_epsilon = 1e-8
        # self.class_list = class_list                              # 类别名单
        # self.require_improvement = 1000                                # 若超过1000batch效果还没提升，则提前结束训练
        
config = Config()

In [7]:
all_sentences_separate = []
all_letter_labels = []
label_set = set()

with open(config.ner_file, encoding="utf-8") as f:
    single_sentence = []
    single_sentence_labels = []
    for s in f.readlines():
        if s != "\n":
            word, label = s.split("\t")
            label = label.strip("\n")
            single_sentence.append(word)
            single_sentence_labels.append(label)
            label_set.add(label)
        elif s == "\n":
            all_sentences_separate.append(single_sentence)
            all_letter_labels.append(single_sentence_labels)
            single_sentence = []
            single_sentence_labels = []

In [8]:
print(all_sentences_separate[0:2])
print(all_letter_labels[0:2])

print(f"\n所有的标签：{label_set}")

[['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！'], ['藏', '书', '本', '来', '就', '是', '所', '有', '传', '统', '收', '藏', '门', '类', '中', '的', '第', '一', '大', '户', '，', '只', '是', '我', '们', '结', '束', '温', '饱', '的', '时', '间', '太', '短', '而', '已', '。']]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]

所有的标签：{'I-PER', 'B-PER', 'O', 'I-ORG', 'B-LOC', 'B-ORG', 'I-LOC'}


In [9]:
# 构建 tag 到 索引 的字典
tag_to_ix = {"B-LOC": 0,
             "I-LOC": 1, 
             "B-ORG": 2, 
             "I-ORG": 3,
             "B-PER": 4,
             "I-PER": 5,
             "O": 6,
             "[CLS]":7,
             "[SEP]":8,
             "[PAD]":9}

ix_to_tag = {0:"B-LOC", 
             1:"I-LOC", 
             2:"B-ORG", 
             3:"I-ORG",
             4:"B-PER",
             5:"I-PER",
             6:"O",
             7:"[CLS]",
             8:"[SEP]",
             9:"[PAD]"}

## 数据示例

这里简单查看一些数据例子，其中很多都是数字6。

数字6说明是 O 类型的实体。

In [10]:
all_sentences = []  # 句子

for one_sentence in all_sentences_separate:
    sentence = "".join(one_sentence)
    all_sentences.append(sentence)

print(all_sentences[0:2])

['当希望工程救助的百万儿童成长起来，科教兴国蔚然成风时，今天有收藏价值的书你没买，明日就叫你悔不当初！', '藏书本来就是所有传统收藏门类中的第一大户，只是我们结束温饱的时间太短而已。']


In [11]:
all_labels = []  # labels
for letter_labels in all_letter_labels:
    labels = [tag_to_ix[t] for t in letter_labels]
    all_labels.append(labels)

print(all_labels[0:2])
print(len(all_labels[0]))

[[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]]
50


In [12]:
print(len(all_labels))

55289


### input数据准备

In [13]:
# word2token
tokenizer = BertTokenizer.from_pretrained('./bert-chinese/', do_lower_case=True)

# 新版代码，一次性处理好输入
encoding = tokenizer(all_sentences, 
                     return_tensors='pt',  # pt 指 pytorch，tf 就是 tensorflow 
                     padding='max_length',  # padding 到 max_length
                     truncation=True,       # 激活并控制截断
                     max_length=config.max_len)

input_ids = encoding['input_ids']

In [14]:
# 这句话的input_ids
print(f"Tokenize 前的第一句话：\n{all_sentences[0]}\n")
print(f"Tokenize + Padding 后的第一句话: \n{input_ids[0]}")

Tokenize 前的第一句话：
当希望工程救助的百万儿童成长起来，科教兴国蔚然成风时，今天有收藏价值的书你没买，明日就叫你悔不当初！

Tokenize + Padding 后的第一句话: 
tensor([ 101, 2496, 2361, 3307, 2339, 4923, 3131, 1221, 4638, 4636,  674, 1036,
        4997, 2768, 7270, 6629, 3341, 8024, 4906, 3136, 1069, 1744, 5917, 4197,
        2768, 7599, 3198, 8024,  791, 1921, 3300, 3119, 5966,  817,  966, 4638,
         741,  872, 3766,  743, 8024, 3209, 3189, 2218, 1373,  872, 2637,  679,
        2496, 1159, 8013,  102,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0])


In [15]:
# 新版代码
attention_masks = encoding['attention_mask']
token_type_ids = encoding['token_type_ids']

In [16]:
# 第一句话的 attention_masks
print(attention_masks[0])

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0])


## 准备labels

由于我们的input_ids是带有`[CLS]`和`[SEP]`的，所以在准备label的同时也要考虑这些情况。

In [17]:
# [3] 代表 O 实体
for label in all_labels:
    label.insert(len(label), 8)  # [SEP]
    label.insert(0, 7) # [CLS]
    if config.max_len > len(label) -1:
        for i in range(config.max_len - len(label)):  #+2的原因是扣除多出来的CLS和SEP
            label.append(9)  # [PAD]

In [18]:
print(len(all_labels[0]))
print(all_labels[0])

103
[7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]


In [19]:
# 统计最长的段落
max_len_label = 0
max_len_text = 0

for label in all_labels:
    if len(label) > max_len_text:
        max_len_label = len(label)
print(max_len_label)
        
for one_input in input_ids:
    if len(one_input) > max_len_text:
        max_len_text = len(one_input)    
        
print(max_len_text)

103
103


## 切分训练和测试集

In [20]:
# train-test-split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, 
                                                                                    all_labels, 
                                                                                    random_state=2021, 
                                                                                    test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=2021, 
                                                       test_size=0.1)

In [21]:
print(len(train_inputs))
print(len(validation_inputs))

print(train_inputs[0])
print(validation_inputs[0])

49760
5529
tensor([ 101, 5632, 5356,  510, 5632, 2193,  510, 5632, 4028, 8024, 2961, 5298,
        1469, 4028, 1139, 6963, 3221, 1164, 4500, 5688,  969, 3189, 1469,  689,
         865, 3198, 7313,  511,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0])
tensor([ 101, 1728, 2349, 1239, 3172, 1788, 1469,  809, 5682, 1154, 1146, 3637,
         898, 3191, 8024,  677, 3299, 1159, 4638,  840, 3142,  833, 6379,  679,
        3614, 5445, 3141, 8024, 1400, 3341, 5401, 1744, 2456, 6379, 4638, 1290,
        4670, 7561,  833, 6379, 1348, 1728,  809, 2600, 44

这里把输入的labels变为tensor形式。

In [22]:
train_labels = torch.tensor(train_labels).clone().detach()
validation_labels = torch.tensor(validation_labels).clone().detach()

In [23]:
print(train_labels[0])

print(len(train_labels))
print(len(train_inputs))

tensor([7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9])
49760
49760


In [24]:
# dataloader

# 形成训练数据集
train_data = TensorDataset(train_inputs, train_masks, train_labels)  
# 随机采样
train_sampler = RandomSampler(train_data) 
# 读取数据
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=config.batch_size)


# 形成验证数据集
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# 随机采样
validation_sampler = SequentialSampler(validation_data)
# 读取数据
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=config.batch_size)

In [25]:
model = BertForTokenClassification.from_pretrained(config.bert_path, num_labels=config.num_classes)
model.cuda()

# 注意：
# 在新版的 Transformers 中会给出警告
# 原因是我们导入的预训练参数权重是不包含模型最终的线性层权重的
# 不过我们本来就是要“微调”它，所以这个情况是符合期望的

Some weights of the model checkpoint at ./bert-chinese/ were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at ./bert-chines

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [26]:
# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']

# 权重衰减
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
     'weight_decay': 0.0}]

In [27]:
# 优化器
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=5e-5)

In [28]:
# 保存loss
train_loss_set = []


In [29]:
# BERT training loop
for _ in range(config.epochs): 
    ## 训练
    print(f"当前epoch： {_}")
    # 开启训练模式
    model.train()
    tr_loss = 0  # train loss
    nb_tr_examples, nb_tr_steps = 0, 0
    # Train the data for one epoch
    for step, batch in tqdm(enumerate(train_dataloader)):
        # 把batch放入GPU
        batch = tuple(t.to(device) for t in batch)
        # 解包batch
        b_input_ids, b_input_mask, b_labels = batch
        # 梯度归零
        optimizer.zero_grad()
        # 前向传播loss计算
        output = model(input_ids=b_input_ids, 
                       attention_mask=b_input_mask, 
                       labels=b_labels)  
        loss = output[0]
        # print(loss)
        # 反向传播
        loss.backward()
        # Update parameters and take a step using the computed gradient
        # 更新模型参数
        optimizer.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
    print(f"当前 epoch 的 Train loss: {tr_loss/nb_tr_steps}")

当前epoch： 0


0it [00:00, ?it/s]

当前 epoch 的 Train loss: 0.11691816494462576
当前epoch： 1


0it [00:00, ?it/s]

当前 epoch 的 Train loss: 0.03204243875331315
当前epoch： 2


0it [00:00, ?it/s]

当前 epoch 的 Train loss: 0.020234778930449854


In [30]:
# 验证状态
model.eval()

# 建立变量
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch

In [31]:
# 验证集的读取也要batch
for batch in tqdm(validation_dataloader):
    # 元组打包放进GPU
    batch = tuple(t.to(device) for t in batch)
    # 解开元组
    b_input_ids, b_input_mask, b_labels = batch
    # 预测
    with torch.no_grad():
        # segment embeddings，如果没有就是全0，表示单句
        # position embeddings，[0,句子长度-1]
        outputs = model(input_ids=b_input_ids, 
                       attention_mask=b_input_mask,
                       token_type_ids=None,
                       position_ids=None)  
                       
    # print(logits[0])
    # Move logits and labels to CPU
    scores = outputs[0].detach().cpu().numpy()  # 每个字的标签的概率
    pred_flat = np.argmax(scores[0], axis=1).flatten()
    label_ids = b_labels.to('cpu').numpy()  # 真实labels
    # print(logits, label_ids)

  0%|          | 0/44 [00:00<?, ?it/s]

In [33]:
# 保存模型
# They can then be reloaded using `from_pretrained()`
# 创建文件夹
if not os.path.exists(config.save_path):
    os.makedirs(config.save_path)
    print("文件夹不存在，创建文件夹!")
else:
    pass


output_dir = config.save_path
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training


# Good practice: save your training arguments together with the trained model
torch.save(model_to_save.state_dict(), os.path.join(output_dir, config.model_name))

文件夹不存在，创建文件夹!


In [6]:
# 读取模型
# Load a trained model and vocabulary that you have fine-tuned
output_dir = config.save_path
model = BertForTokenClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [34]:
# 单句测试

# test_sententce = "在北京市朝阳区的一家网吧，我亲眼看见卢本伟和孙笑川一起开挂。"
test_sententce = "史源源的房子租在滨江区南环路税友大厦附近。"

In [35]:
# 构建 tag 到 索引 的字典
tag_to_ix = {"B-LOC": 0,
             "I-LOC": 1, 
             "B-ORG": 2, 
             "I-ORG": 3,
             "B-PER": 4,
             "I-PER": 5,
             "O": 6,
             "[CLS]":7,
             "[SEP]":8,
             "[PAD]":9}

ix_to_tag = {0:"B-LOC", 
             1:"I-LOC", 
             2:"B-ORG", 
             3:"I-ORG",
             4:"B-PER",
             5:"I-PER",
             6:"O",
             7:"[CLS]",
             8:"[SEP]",
             9:"[PAD]"}

In [39]:
encoding = tokenizer(test_sententce, 
                     return_tensors='pt',  # pt 指 pytorch，tf 就是 tensorflow 
                     padding=True,  # padding到最长的那句话
                     truncation=True,  # 激活并控制截断
                     max_length=50)

test_input_ids = encoding['input_ids']
# 创建attention masks
test_attention_masks = encoding['attention_mask']

In [41]:
# 形成验证数据集
# 为了通用，这里还是用了 DataLoader 的形式
test_data = TensorDataset(test_input_ids, test_attention_masks)
# 随机采样
test_sampler = SequentialSampler(test_data)
# 读取数据
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=config.batch_size)

In [42]:
# 验证状态
model.eval()

# 建立变量
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch

In [43]:
# 验证集的读取也要batch
for batch in tqdm(test_dataloader):
    # 元组打包放进GPU
    batch = tuple(t.to(device) for t in batch)
    # 解开元组
    b_input_ids, b_input_mask = batch
    # 预测
    with torch.no_grad():
        # segment embeddings，如果没有就是全0，表示单句
        # position embeddings，[0,句子长度-1]
        outputs = model(input_ids=b_input_ids, 
                       attention_mask=None,
                       token_type_ids=None,
                       position_ids=None)  
                       
    # Move logits and labels to CPU
    scores = outputs[0].detach().cpu().numpy()  # 每个字的标签的概率
    pred_flat = np.argmax(scores[0], axis=1).flatten()
    # label_ids = b_labels.to('cpu').numpy()  # 真实labels
    print(pred_flat)  # 预测值

  0%|          | 0/1 [00:00<?, ?it/s]

[7 4 5 5 6 6 6 6 6 0 1 1 0 1 1 0 1 1 1 6 6 6 8]


In [44]:
pre_labels = [ix_to_tag[n] for n in pred_flat]
print(f"测试句子: {test_sententce}")
print(len(test_sententce))
print(pre_labels)

测试句子: 史源源的房子租在滨江区南环路税友大厦附近。
21
['[CLS]', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'I-LOC', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', '[SEP]']


In [45]:
pre_labels_cut = pre_labels[0:len(test_sententce)+2]
pre_labels_cut

['[CLS]',
 'B-PER',
 'I-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'O',
 'O',
 'O',
 '[SEP]']

In [46]:
person = []  # 临时栈
persons = []

location = []
locations = []


for i in range(len(pre_labels_cut) - 1):
    # Person
    # 单字情况
    if pre_labels[i] == 'B-PER' and pre_labels[i+1] != 'I-PER' and len(location) == 0:
        person.append(i)  
        persons.append(person)
        person = []  # 清空
        continue    
        
    # 非单字
    # 如果前面有连着的 PER 实体    
    if pre_labels[i] == 'B-PER'and pre_labels[i+1] == 'I-PER' and len(person) != 0:
        person.append(i)
        
    # 如果前面没有连着的 B-PER 实体
    elif pre_labels[i] == 'B-PER'and pre_labels[i+1] == 'I-PER' and len(location) == 0:
        person.append(i)  # 加入新的 B-PER
    elif pre_labels[i] != 'I-PER' and len(person) != 0:
        persons.append(person)  # 临时栈内容放入正式栈
        person = []  # 清空临时栈
    elif pre_labels[i] == 'I-PER' and len(person) != 0:
        person.append(i)
    else:  # 极少数情况会有 I-PER 开头的，不理
        pass

    # Location
    # 单字情况
    if pre_labels[i] == 'B-LOC' and pre_labels[i+1] != 'I-LOC' and len(location) == 0:
        location.append(i)  
        locations.append(location)
        location = []  # 清空
        continue
        
    # 非单字
    # 如果前面有连着的 LOC 实体
    
    if pre_labels[i] == 'B-LOC' and pre_labels[i+1] == 'I-LOC' and len(location) != 0:
        locations.append(location)
        location = []  # 清空栈
        location.append(i)  # 加入新的 B-LOC
        
    # 如果前面没有连着的 B-LOC 实体
    elif pre_labels[i] == 'B-LOC' and pre_labels[i+1] == 'I-LOC' and len(location) == 0:
        location.append(i)  # 加入新的 B-LOC
    elif pre_labels[i] == 'I-LOC' and len(location) != 0:
        location.append(i)
    # 结尾
    elif pre_labels[i] != 'I-LOC' and len(location) != 0:
        locations.append(location)  # 临时栈内容放入正式栈
        location = []  # 清空临时栈
    else:  # 极少数情况会有 I-LOC 开头的，不理
        pass
    
print(persons)
print(locations)

[[1, 2, 3]]
[[9, 10, 11], [12, 13, 14], [15, 16, 17, 18]]


In [47]:
# 从文字中提取
# 人物
NER_PER = []
for word_idx in persons:
    ONE_PER = []
    for letter_idx in word_idx: 
        ONE_PER.append(test_sententce[letter_idx - 1])
    NER_PER.append(ONE_PER)

NER_PER_COMBINE = []
for w in NER_PER:
    PER = "".join(w)
    NER_PER_COMBINE.append(PER)
    
# 地点

NER_LOC = []
for word_idx in locations:
    ONE_LOC = []
    for letter_idx in word_idx: 
        # print(letter_idx)
        # print(test_sententce[letter_idx])
        ONE_LOC.append(test_sententce[letter_idx - 1])
    NER_LOC.append(ONE_LOC)

NER_LOC_COMBINE = []
for w in NER_LOC:
    LOC = "".join(w)
    NER_LOC_COMBINE.append(LOC)

# 组织

In [48]:
print(f"当前句子：{test_sententce}\n")
print(f"    人物：{NER_PER_COMBINE}\n")
print(f"    地点：{NER_LOC_COMBINE}\n")

当前句子：史源源的房子租在滨江区南环路税友大厦附近。

    人物：['史源源']

    地点：['滨江区', '南环路', '税友大厦']

