In [38]:
import json
import codecs
import pandas as pd
import numpy as np
import tensorflow as tf
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K

In [39]:
maxlen = 250
BATCH_SIZE = 4
config_path = 'uncased_L-12_H-512_A-8/bert_config.json'
checkpoint_path = 'uncased_L-12_H-512_A-8/bert_model.ckpt'
dict_path = 'uncased_L-12_H-512_A-8/vocab.txt'
#config_path = 'biobert_v1.1_pubmed/bert_config.json'
#checkpoint_path = 'biobert_v1.1_pubmed/model.ckpt'
#dict_path = 'biobert_v1.1_pubmed/vocab.txt'


In [40]:
token_dict = {}
with codecs.open(dict_path, 'r', 'utf-8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

#分词
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            else:
                R.append('[UNK]')   # 剩余的字符是[UNK]
        return R


tokenizer = OurTokenizer(token_dict)

#填充成一样长
def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])

In [41]:
print("begin data processing...")
train_df = pd.read_csv("data/train.csv").fillna(value="")
valid_df = pd.read_csv("data/valid.csv").fillna(value="")
test_df = pd.read_csv("data/test.csv").fillna(value="")
#删除label为空的行
train_df.drop(train_df[train_df['label']==''].index)
valid_df.drop(valid_df[valid_df['label']==''].index)
test_df.drop(valid_df[test_df['label']==''].index)

select_labels = train_df["label"].unique()
labels = []
#把8个label都提取出来，dict：0:treatment,1:...
for label in select_labels:
    if ";" not in label:
        if label not in labels and label:
            labels.append(label)

# with open("label.json", "w", encoding="utf-8") as f:
#     f.write(json.dumps(dict(zip(range(len(labels)), labels)), ensure_ascii=False, indent=2))

train_data = []
valid_data = []
test_data = []
for i in range(train_df.shape[0]):
    pmid, journal, title, abstract, keywords, label,pub_type, authors, date1, doi, date2, label_category = train_df.iloc[i, :]
    label_id = [0] * len(labels)
    for j, _ in enumerate(labels):
        for separate_label in label.split(";"):
            if _ == separate_label:
                label_id[j] = 1
    #pmid,date1,date2,doi,label_category删除
    train_data.append(( title, abstract, journal,keywords, pub_type, authors, label_id))

for i in range(valid_df.shape[0]):
    pmid, journal, title, abstract, keywords, label,pub_type, authors, date1, doi, date2 = valid_df.iloc[i, :]
    label_id = [0] * len(labels)
    for j, _ in enumerate(labels):
        for separate_label in label.split(";"):
            if _ == separate_label:
                label_id[j] = 1
    valid_data.append(( title, abstract,journal, keywords, pub_type, authors, label_id))
for i in range(test_df.shape[0]):
    pmid, journal, title, abstract, keywords, label,pub_type, authors, date1, doi, date2 = test_df.iloc[i, :]
    label_id = [0] * len(labels)
    for j, _ in enumerate(labels):
        for separate_label in label.split(";"):
            if _ == separate_label:
                label_id[j] = 1
    test_data.append(( title, abstract,journal, keywords, pub_type, authors, label_id))

# print(train_data[:10])
print("finish data processing!")

begin data processing...


  


finish data processing!


In [42]:
class DataGenerator:

    def __init__(self, data, batch_size=BATCH_SIZE):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, X3, X4,X5, X6,Y = [], [], [] ,[], [], [] ,[]
            for i in idxs:
                d = self.data[i]
                #title
                text1 = d[0][:maxlen]
                #abstract
                text2 = d[1][:maxlen]
                #journal
                text3 = d[2][:maxlen]
                #keyword
                text4 = d[3][:maxlen]
                #pub_type
                text5 = d[4][:maxlen]
                #authors
                text6 = d[5][:maxlen]
                #文本数据将输入bert中，encode返回word_embedding和segmen_embedding(0是句子1,1是句子2
                x1, x2 = tokenizer.encode(first=text1,second=text2)
                x3, x4 = tokenizer.encode(first=text3,second=text4)
                x5, x6 = tokenizer.encode(first=text5,second=text6)
                y = d[6]
                X1.append(x1)
                X2.append(x2)
                X3.append(x3)
                X4.append(x4)
                X5.append(x5)
                X6.append(x6)
                Y.append(y)
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    X3 = seq_padding(X3)
                    X4 = seq_padding(X4)
                    X5 = seq_padding(X5)
                    X6 = seq_padding(X6)
                    Y = seq_padding(Y)
                    yield [X1, X2, X3, X4, X5, X6], Y
                    [X1, X2, X3, X4,X5, X6, Y] = [], [], [], [], [], [], []

In [6]:
def create_cls_model(num_labels):
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

    for layer in bert_model.layers:
        layer.trainable = True

    x1_in = Input(shape=(None,))
    x2_in = Input(shape=(None,))
    x3_in = Input(shape=(None,))
    x4_in = Input(shape=(None,))
    x5_in = Input(shape=(None,))
    x6_in = Input(shape=(None,))
    

    bert_layer1 = bert_model([x1_in,x2_in])
    bert_layer2 = bert_model([x3_in,x4_in])
    bert_layer3 = bert_model([x5_in,x6_in])
    cls_layer1 = Lambda(lambda x: x[:, 0])(bert_layer1)    # 取出[CLS]对应的向量用来做分类
    cls_layer2 = Lambda(lambda x: x[:, 0])(bert_layer2)
    cls_layer3 = Lambda(lambda x: x[:, 0])(bert_layer3)

    x = Add()([cls_layer1,cls_layer2,cls_layer3])
    x = Dropout(0.2)(x)
    p = Dense(num_labels, activation='sigmoid')(x)     # 多分类

    model = Model([x1_in,x2_in,x3_in,x4_in,x5_in,x6_in], p)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(1e-5), # 用足够小的学习率
        metrics=['accuracy']
    )
    model.summary()

    return model

In [7]:
#train_D = DataGenerator(train_data[:2600])
#valid_D = DataGenerator(valid_data[:1000])
train_D = DataGenerator(train_data)
valid_D = DataGenerator(valid_data)

In [8]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
model = create_cls_model(len(labels))
print("begin model training...")

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
____________________________________________________________________________________________

In [157]:
model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=3,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D)
    )

print("finish model training!")

Epoch 1/3
Epoch 2/3
Epoch 3/3
finish model training!


In [159]:
model.save('multi-label-ee.h5')
print("Model saved!")

test_D = DataGenerator(test_data)
result = model.evaluate_generator(test_D.__iter__(), steps=len(test_D))
print("模型评估结果:", result)

Model saved!
Instructions for updating:
Please use Model.evaluate, which supports generators.
模型评估结果: [0.1631319373846054, 0.7978361248970032]


In [37]:
from keras.models import load_model
from keras_bert import get_custom_objects
import time
# 加载训练好的模型
model = load_model("multi-label-ee.h5", custom_objects=get_custom_objects())
tokenizer = OurTokenizer(token_dict)
with open("label.json", "r", encoding="utf-8") as f:
    label_dict = json.loads(f.read())

maxlen = 250
s_time = time.time()
# 预测示例语句
document = pd.read_csv('data/single text.csv').fillna(value="")
pmid, journal, title, abstract, keywords, label,pub_type, authors, date1, doi, date2, label_category = document.iloc[0, :]


# 利用BERT进行tokenize
journal = journal[:maxlen]
title = title[:maxlen]
abstract = abstract[:maxlen]
keywords = keywords[:maxlen]
pub_type = pub_type[:maxlen]
authors = authors[:maxlen]
x1, x2 = tokenizer.encode(first=title,second=abstract)
x3, x4 = tokenizer.encode(first=journal,second=keywords)
x5, x6 = tokenizer.encode(first=pub_type,second=authors)

X1 = x1 + [0] * (maxlen-len(x1)) if len(x1) < maxlen else x1
X2 = x2 + [0] * (maxlen-len(x2)) if len(x2) < maxlen else x2
X3 = x3 + [0] * (maxlen-len(x3)) if len(x3) < maxlen else x3
X4 = x4 + [0] * (maxlen-len(x4)) if len(x4) < maxlen else x4
X5 = x5 + [0] * (maxlen-len(x5)) if len(x5) < maxlen else x5
X6 = x6 + [0] * (maxlen-len(x6)) if len(x6) < maxlen else x6

X1 = np.array(X1[:maxlen])
X2 = np.array(X2[:maxlen])
X3 = np.array(X3)
X4 = np.array(X4)
X5 = np.array(X5)
X6 = np.array(X6)




# 模型预测并输出预测结果
prediction = model.predict([X1, X2, X3, X4, X5, X6])
one_hot = np.where(prediction > 0.5, 1, 0)[0]


print("标题: %s" % title)
print("作者: %s" % authors)
print("预测标签: %s" % [label_dict[str(i)] for i in range(len(one_hot)) if one_hot[i]])
print("实际标签: %s" % label)
e_time = time.time()
print("cost time:", e_time-s_time)

标题: Beating severe covid-19.
作者: Wilson, Clare
预测标签: ['General Info']
实际标签: General Info
cost time: 9.325701713562012
