# Keras-NER

利用Keras做NER，数据集是建筑安装数据集。

参考：https://androidkt.com/name-entity-recognition-with-bert-in-tensorflow/

In [1]:
import re
import os
import time

import numpy as np
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences  # padding句子用
from tqdm.notebook import tqdm
from transformers import create_optimizer
from transformers import TFBertModel, BertTokenizer, TFBertForTokenClassification

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
file = "建安1600-标记.txt"

In [3]:
all_ner_data = []
with open(file, encoding="utf-8") as f:
    for s in f.readlines():
        s = s.strip('\n')
        ner_data = []
        result_1 = re.finditer(r'\[\@', s)
        result_2 = re.finditer(r'\*\]', s)
        begin = []
        end = []
        for each in result_1:
            begin.append(each.start())
        for each in result_2:
            end.append(each.end())
        # assert len(begin) == len(end)
        if not len(begin) == len(end):
            raise AssertionError(f"sentence {s} ,begin != end")
        i = 0
        j = 0
        while i < len(s):
            if i not in begin:
                ner_data.append([s[i], 'O'])
                i = i + 1
            else:
                ann = s[i + 2:end[j] - 2]
                entity, ner = ann.rsplit('#')
                if (len(entity) == 1):
                    ner_data.append([entity, 'S-' + ner])
                else:
                    if (len(entity) == 2):
                        ner_data.append([entity[0], 'B-' + ner])
                        ner_data.append([entity[1], 'I-' + ner])
                    else:
                        ner_data.append([entity[0], 'B-' + ner])
                        for n in range(1, len(entity) - 1):
                            ner_data.append([entity[n], 'I-' + ner])
                        ner_data.append([entity[-1], 'I-' + ner])
        
                i = end[j]
                j = j + 1
        all_ner_data.append(ner_data)
f.close()

In [4]:
all_ner_data[0]

[['工', 'O'],
 ['程', 'O'],
 ['名', 'O'],
 ['称', 'O'],
 ['：', 'O'],
 ['金', 'B-Program'],
 ['隅', 'I-Program'],
 ['世', 'I-Program'],
 ['纪', 'I-Program'],
 ['城', 'I-Program'],
 ['2', 'O'],
 ['#', 'O'],
 ['6', 'O'],
 ['#', 'O'],
 ['9', 'O'],
 ['#', 'O'],
 ['1', 'O'],
 ['0', 'O'],
 ['#', 'O'],
 ['天', 'O'],
 ['然', 'O'],
 ['气', 'O'],
 ['工', 'O'],
 ['程', 'O'],
 [' ', 'O'],
 ['工', 'O'],
 ['程', 'O'],
 ['地', 'O'],
 ['址', 'O'],
 ['：', 'O'],
 ['沙', 'O'],
 ['河', 'O'],
 ['市', 'O'],
 ['东', 'O'],
 ['环', 'O'],
 ['路', 'O'],
 ['西', 'O'],
 ['侧', 'O']]

In [5]:
all_ner_data_list = []
for seq_list in all_ner_data:
    zi = []
    mark = []
    for zi_mark in seq_list:
        zi.append(zi_mark[0])
        mark.append(zi_mark[1])
        seq_tuple = (zi, mark)
    all_ner_data_list.append(seq_tuple)

In [6]:
all_ner_data_list[0]

(['工',
  '程',
  '名',
  '称',
  '：',
  '金',
  '隅',
  '世',
  '纪',
  '城',
  '2',
  '#',
  '6',
  '#',
  '9',
  '#',
  '1',
  '0',
  '#',
  '天',
  '然',
  '气',
  '工',
  '程',
  ' ',
  '工',
  '程',
  '地',
  '址',
  '：',
  '沙',
  '河',
  '市',
  '东',
  '环',
  '路',
  '西',
  '侧'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'B-Program',
  'I-Program',
  'I-Program',
  'I-Program',
  'I-Program',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'])

### 输入处理

由于BERT模型的特殊性，需要再处理一下输入:

- input_ids(padding)
- attention_masks
- labels

In [7]:
class Config(object):
    """配置参数"""
    def __init__(self):
        self.model_name = 'BERT_SPBM'           # 模型名称
        self.bert_path = './bert-chinese/'     # BERT 文件路径

        self.batch_size = 128                   # mini-batch大小
                                                #
        self.epsilon = 1e-08                    # adam参数
        self.hidden_size = 768                  # 隐藏层神经元
        self.hidden_dropout_prob = 0.1          # dropout比率
        self.learning_rate = 2e-5               # 学习率
        self.max_len = 256                         # 句子的最长padding长度，注意这里尽量大，以防和laebl不匹配。尚未加入label匹配功能。 
                                                # 但是不能超过512
        self.num_classes = 7                    # 类别数(实体)
        self.num_epoch = 2                      # epoch

In [10]:
config = Config()

In [11]:
# 构建 tag 到 索引 的字典
tag_to_ix = {"B-Program": 0,
             "I-Program": 1, 
             "E-Program": 2, 
             "O": 3,
             "[CLS]":4,
             "[SEP]":5,
             "[PAD]":6}

ix_to_tag = {0:"B-Program", 
             1:"I-Program", 
             2:"E-Program", 
             3:"O",
             4:"[CLS]",
             5:"[SEP]",
             6:"[PAD]"}

In [12]:
all_sentences = []  # 句子
all_labels = []  # labels
for seq_pair in all_ner_data_list:
    sentence = "".join(seq_pair[0])
    labels = [tag_to_ix[t] for t in seq_pair[1]]
    all_sentences.append(sentence)
    all_labels.append(labels)

print(all_sentences[0])
print(all_labels[0])

print(len(all_sentences[0]))
print(len(all_labels[0]))

工程名称：金隅世纪城2#6#9#10#天然气工程 工程地址：沙河市东环路西侧
[3, 3, 3, 3, 3, 0, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
38
38


In [15]:
# 
tokenizer = BertTokenizer.from_pretrained(config.bert_path, do_lower_case=True)
tokenized_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_sentences]

In [16]:
print(tokenized_texts[0])
print(len(tokenized_texts[0]))

[101, 2339, 4923, 1399, 4917, 8038, 7032, 7383, 686, 5279, 1814, 123, 108, 127, 108, 130, 108, 8108, 108, 1921, 4197, 3698, 2339, 4923, 2339, 4923, 1765, 1770, 8038, 3763, 3777, 2356, 691, 4384, 6662, 6205, 904, 102]
38


In [17]:
# 输入padding
# 此函数在keras里面
input_ids = pad_sequences([txt for txt in tokenized_texts],
                          maxlen=config.max_len, 
                          dtype="long", 
                          truncating="post", 
                          padding="post")

In [18]:
print(len(input_ids[0]))
print(input_ids[0])
print(type(input_ids[0]))

256
[ 101 2339 4923 1399 4917 8038 7032 7383  686 5279 1814  123  108  127
  108  130  108 8108  108 1921 4197 3698 2339 4923 2339 4923 1765 1770
 8038 3763 3777 2356  691 4384 6662 6205  904  102    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  

In [19]:
# [3] 代表 O 实体

for label in all_labels:
    label.insert(len(label), 5)  # [SEP]
    label.insert(0, 4) # [CLS]
    if config.max_len > len(label) -1:
        for i in range(config.max_len - len(label)):
            label.append(6)  # [PAD]

In [20]:
# check
for check in all_labels:
     if len(check) != 256:
        print(len(check))

In [21]:
# print(len(all_labels_array[0]))
# print(all_labels_array[0])

In [22]:
# 创建attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

In [23]:
# 第一句话的 attention_masks
print(np.array(attention_masks[0]))
print(len(np.array(attention_masks[0])))
print(attention_masks[0].count(1))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
256
38


In [24]:
# 创建segment id
def prepare_token_type(input_ids=attention_masks):
    """
    创建attention masks
    
    输入：
    @ input_ids:padding 之后的 attention_masks
    输出：
    @ attention_masks：对padding部分mask后的标记（attention_masks）
    """
    # 创建segment ids
    token_type_ids = []

    for seq in input_ids:
        # sent_token_type = [0 if i == 1 else 1 for i in seq]
        sent_token_type = [0 if i == 1 else 1 for i in seq]
        token_type_ids.append(sent_token_type)
    return token_type_ids  

In [25]:
# token_type_ids = prepare_token_type(input_ids=attention_masks)

In [26]:
# print(token_type_ids[0])
# print(len(np.array(token_type_ids[0])))
# print(token_type_ids[0].count(0))

In [27]:
input_ids = np.asarray(input_ids)
all_labels = np.asarray(all_labels)
attention_masks = np.asarray(attention_masks)
# token_type_ids = np.asarray(token_type_ids)

In [28]:
print(type(all_labels))
print(type(all_labels[0]))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [29]:
# train-test-split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, 
                                                                                    all_labels, 
                                                                                    random_state=2019, 
                                                                                    test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=2019, 
                                                       test_size=0.1)

#train_segment_ids, validation_segment_ids, _, _ = train_test_split(token_type_ids, 
#                                                       input_ids,
#                                                       random_state=2019, 
#                                                       test_size=0.1)

In [30]:
print(train_inputs[0])
print(train_labels[0])
print(train_masks[0])
# print(train_segment_ids[0])

print("\n\n")
print(type(train_inputs[0]))
print(type(train_labels[0]))
print(type(train_masks[0]))
# print(type(train_segment_ids[0]))

print("\n\n")
print(len(train_inputs[0]))
print(len(train_labels[0]))
print(len(train_masks[0]))
# print(len(train_segment_ids[0]))

[ 101 2456 5029 3302 1218 1355 4495 1765 8038 2398 1333 1344 1862 1079
 7555 4680 1399 4917 8038 2398 1333 1344 4689 2137 6577 1737 3333 1350
 1071  800 2456 1169 3333 2145 6817 4991 4157 2456 6392 7555 4680  102
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [31]:
print("\n\n")
print(len(train_inputs))
print(len(train_labels))
print(len(train_masks))
# print(len(train_segment_ids))




1485
1485
1485


## Keras-Bert 输入准备

In [32]:
train_X = [train_inputs, train_masks]
train_y = train_labels

In [33]:
print(len(train_X[0]))
print(len(train_X[1]))
# print(len(train_X[2]))
print(len(train_y))

1485
1485
1485


In [34]:
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
optimizer = Adam(learning_rate=config.learning_rate, epsilon=config.epsilon)

In [35]:
def build_model():
    """构建模型"""
    token_inputs = tf.keras.layers.Input((config.max_len,), dtype=tf.int32, name='Input_word_ids')
    mask_inputs = tf.keras.layers.Input((config.max_len,), dtype=tf.int32, name='Input_masks')
    #seg_inputs = tf.keras.layers.Input((config.max_len,), dtype=tf.int32, name='Input_segments')

    # going with pooled output since seq_output results in ResourceExhausted Error even with GPU
    # 导入 BERT 模型
    bert_model = TFBertForTokenClassification.from_pretrained(config.bert_path, num_labels=config.num_classes)
    outputs = bert_model(inputs=token_inputs, 
                           attention_mask=mask_inputs, 
                           token_type_ids=None,
                           position_ids=None)
    scores = outputs[0]

    # 输入输出确定
    bert_model_ner = tf.keras.models.Model([token_inputs, mask_inputs], scores)
    
    print(bert_model_ner.summary())
    
    # 编译模型
    bert_model_ner.compile(optimizer=optimizer, loss=loss_function)
    
    return bert_model_ner

In [36]:
K.clear_session()
model = build_model()







Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input_word_ids (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
Input_masks (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_bert_for_token_classificatio ((None, 256, 7),)    102273031   Input_word_ids[0][0]             
Total params: 102,273,031
Trainable params: 102,273,031
Non-trainable params: 0
__________________________________________________________________________________________________
None


In [None]:
model.fit(train_X, train_y, epochs=2, batch_size = 32)

Train on 1485 samples
Epoch 1/2
Epoch 2/2

In [34]:
valid_X = [validation_inputs, validation_masks]
valid_y = validation_labels

# 模型预测
result = model.predict(valid_X)

In [35]:
# 保存结果
y_pred = []
# y_true = []

for one_sent_score in result:        
    pred_flat = np.argmax(one_sent_score, axis=1).flatten()  # 一句话里面每个字的标签
    pre_labels = [ix_to_tag[n] for n in list(pred_flat)]           # 转换为label

    # pre_labels_cut = pre_labels[0:len(test_sententce)+2]     # 截断
    y_pred.append(pre_labels)                            # 存入

print(len(y_pred))

166


### 预测标签

In [36]:
print(y_pred[1])

['[CLS]', 'O', 'O', 'O', 'O', 'O', 'O', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

### 实际标签

In [37]:
y_true = []
for true_label in valid_y:
    true_labels = [ix_to_tag[n] for n in list(true_label)]
    y_true.append(true_labels)

In [38]:
print(y_true[1])

['[CLS]', 'O', 'O', 'O', 'O', 'O', 'B-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'I-Program', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'