### **模型训练代码**

#### 数据内容存在于./Data中，模型保存在./Model_save中

In [1]:
import numpy as np
from sklearn.metrics import *
import copy
import utils
import time
import os
from keras.models import load_model
from keras.utils import multi_gpu_model
from keras.layers import *
from keras import Model
import keras.backend as K
import tensorflow as tf
from model import Capsule

# 关闭警告信息
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import warnings
warnings.filterwarnings("ignore")

##########################################################

label_to_ix=np.load('Data/label_to_ix.npy',allow_pickle=True).item()
training_data=np.load('Data/training_data.npy',allow_pickle=True)
test_data=np.load('Data/test_data.npy',allow_pickle=True)
val_data=np.load('Data/val_data.npy',allow_pickle=True)
word_to_ix=np.load('Data/word_to_ix.npy',allow_pickle=True).item()
label_emb_weight = np.load('Data/label_embedding.npy',allow_pickle=True) # 标签信息获取在'标签引入预处理.ipynb'
embed_weight = utils.build_embeedding_news("../../emb/PubMed-and-PMC-w2v.bin",word_to_ix,"Data/.")

Using TensorFlow backend.


#### 设置超参数

其中EMB_DIM为embedding尺寸，不能修改。

可调参参数：BATCH_SIZE、HIDDEN_DIM、N_CAP、CAP_DIM、ROUTINGS、LABEL_EMBEDDING_TRAIN、WORD_EMBEDDING_TRAIN；

+ 其中N_CAP、CAP_DIM、ROUTINGS为胶囊网络参数，代码见./model.py；
+ LABEL_EMBEDDING_TRAIN表示训练中更新label的embedding；
+ WORD_EMBEDDING_TRAIN表示训练时更新word的embedding；

本调参结果较好，建议备份后再调参。

In [2]:
# 超参数区域
PATIENT = 5
MIN_EPOCH = 10
MAX_EPOCH = 100
BATCH_SIZE = 16
SAVE_MODEL = 'Model_save/best_model_capsule_label.h5'
LABEL_NUM = len(label_to_ix.keys())
EMB_DIM = 200
HIDDEN_DIM = 300
N_CAP = 50
CAP_DIM = 50
ROUTINGS = 3
LABEL_EMBEDDING_TRAIN = True
WORD_EMBEDDING_TRAIN = False

def set_seed(THATSEED):
    np.random.seed(THATSEED)
    tf.compat.v1.set_random_seed(THATSEED)
set_seed(10)

#### 数据初始化

In [3]:
batch_train = utils.preprocessing(training_data,label_to_ix,BATCH_SIZE,word_to_ix)
batch_val = utils.preprocessing(val_data,label_to_ix,BATCH_SIZE,word_to_ix)
batch_test = utils.preprocessing(test_data,label_to_ix,BATCH_SIZE,word_to_ix)

#### **构造模型部分**

+ 首先将label的embedding与每个word的embedding求cos相似度，然后将相似度与word embedding连接，送入BiLSTM；
+ 然后利用胶囊网络进行特征提取，输出[N_CAP,CAP_DIM]的特征，然后划归平面为2500维特征，进行EMB_DIM的编码句子后与label embedding求cos相似度
+ 最后将相似度直接加到分类效果中；

In [None]:
# 整体模型部分
def build_model(vocab,n_cap,cap_dim,n_class):
    word_input = Input(shape=(None,), dtype="int32")
    label_input = Input(shape=(LABEL_NUM,),dtype="int32")
    
    label_emb = Embedding(LABEL_NUM,EMB_DIM,weights=[label_emb_weight],trainable=LABEL_EMBEDDING_TRAIN)
    embed = Embedding(len(vocab) + 1,
                        EMB_DIM,
                        weights=[embed_weight],
                        trainable=WORD_EMBEDDING_TRAIN,
                        )
    
    word_embed = embed(word_input)
    label_embed = label_emb(label_input)
    
    # 计算每个单词和每个标签的cos相似度，然后缀加到单词embedding后
    word_label_embed = dot([word_embed,label_embed],axes=(2,2),normalize=True)
    word_embed = concatenate([word_embed,word_label_embed])
    
    # 利用capsule的特征提取
    x = Bidirectional(CuDNNLSTM(HIDDEN_DIM,return_sequences=True))(word_embed)
    x = Capsule(
        num_capsule=n_cap,dim_capsule=cap_dim,
        routings=ROUTINGS, share_weights=True)(x)
    x = Flatten()(x)
    x = Dropout(0.2)(x)
    
    # 利用50*50的特征进行编码文档，然后与标签的embedding求cos相似度
    x_ = Dense(EMB_DIM,activation='relu')(x)
    x_ = Dropout(0.2)(x_) 
    label_x = dot([x_,label_embed],axes=(1,2),normalize=True)
    
    outputs = Dense(n_class)(x)
    
    # 将分类器的结果与标签文本相似度概率累加
    outputs = add([outputs,label_x])
    outputs = Activation('sigmoid')(outputs)
    
    model = Model(inputs=(word_input,label_input), outputs=outputs)

    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
    # print(model.summary())
    model.summary()
    return model

model = build_model(word_to_ix,N_CAP,CAP_DIM,LABEL_NUM)

#### 训练过程

选取micro_f1最好的epoch作为最终选择的模型。

In [5]:
# 训练过程
bound = 0
best_scores = -10
leasttime = time.time()
label_list =  np.expand_dims(np.arange(LABEL_NUM), axis=0)
label_list = np.repeat(label_list,BATCH_SIZE,axis=0)
for epoch in range(1, MAX_EPOCH + 1):
    #break
    epoch_train = []
    epoch_val = []
    print("The epoch spend time %.1f s." % (time.time()-leasttime))
    print("*"*20,epoch,"*"*20)
    leasttime = time.time()
    # print("-"*20)
    print('{} Epoch {}/{} training'.format(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()),epoch, MAX_EPOCH))
    for i,batch_data in enumerate(batch_train):
        item = model.train_on_batch([batch_data[0],label_list], batch_data[2])
        epoch_train.append(item)
    train_loss, train_acc = np.mean(epoch_train, axis=0)
    print("train loss: ", train_loss, " , train acc: ", train_acc)

    # 验证集验证效果
    y_pred = [] #model.predict(val_x1)
    y_val = []
    for batch_data in batch_val:
        epoch_val.append(model.evaluate([batch_data[0],label_list], batch_data[2],verbose=0))
        y_pred.append(model.predict([batch_data[0],label_list]))
        y_val.append(batch_data[2])
    
    val_loss,val_acc = np.mean(epoch_val, axis=0)
    y_pred = np.concatenate(y_pred,axis=0)
    y_val = np.concatenate(y_val,axis=0)
    y_pred = np.array([[1 if i >= 0.5 else 0 for i in t] for t in y_pred])
    
    y_true_label = y_val
    print(y_pred.shape,y_true_label.shape)
    micro_f1 = f1_score(y_true_label, y_pred, average='micro')
    macro_f1 = f1_score(y_true_label, y_pred, average='macro')
    print('macro F1', f1_score(y_true_label, y_pred, average='macro'))
    print('micro F1', f1_score(y_true_label, y_pred, average='micro'))
    print('loss',val_loss)
    scores = micro_f1 # 将验证集loss作为择优标准
    # 每个epoch判断是否上升
    if scores > best_scores:
        best_scores = scores
        bound = 0
        # with open(os.path.join(OUTPUT_PATH,'logs.log'),'a+') as f:
        #     f.write("Epoch %.d : macro_f1 = %.4f, micro_f1 = %.4f \n" % (epoch,macro_f1,micro_f1))
        model.save(SAVE_MODEL)
    else:
        bound += 1
    if bound > PATIENT and epoch > MIN_EPOCH:
        print("The best scores is ",best_scores)
        break

The epoch spend time 0.0 s.
******************** 1 ********************
2019-12-16 05:13:53 Epoch 1/100 training
train loss:  0.04778787  , train acc:  0.98593354
(5264, 344) (5264, 344)
macro F1 0.04961002516898733
micro F1 0.4790391176333131
loss 0.04048001003596015
The epoch spend time 871.2 s.
******************** 2 ********************
2019-12-16 05:28:25 Epoch 2/100 training
train loss:  0.03390002  , train acc:  0.9893054
(5264, 344) (5264, 344)
macro F1 0.11933904332628274
micro F1 0.6116720432359966
loss 0.03160148017153733
The epoch spend time 447.0 s.
******************** 3 ********************
2019-12-16 05:35:52 Epoch 3/100 training
train loss:  0.029481605  , train acc:  0.99031365
(5264, 344) (5264, 344)
macro F1 0.17495324655186362
micro F1 0.6530788225610473
loss 0.029815750530606706
The epoch spend time 447.4 s.
******************** 4 ********************
2019-12-16 05:43:19 Epoch 4/100 training
train loss:  0.0270852  , train acc:  0.99089175
(5264, 344) (5264, 344)


In [6]:
# 测试过程
print("*"*20,"test","*"*20)
model = load_model(SAVE_MODEL,custom_objects={'Capsule':Capsule})
test_y_pred = []
test_y = []
for batch_data in batch_test:
    test_y_pred.append(model.predict([batch_data[0],label_list]))
    test_y.append(batch_data[2])
test_y_pred = np.concatenate(test_y_pred,axis=0)
test_y = np.concatenate(test_y,axis=0)
test_y_pred = np.array([[1 if i >= 0.5 else 0 for i in t] for t in test_y_pred])
micro_f1 = f1_score(test_y_pred, test_y, average='micro')
macro_f1 = f1_score(test_y_pred, test_y, average='macro')
print('macro F1: %.4f' % macro_f1)
print('micro F1: %.4f' % micro_f1)
model.save("Model_save/best_model_capsule_label-%.4f.h5" % (micro_f1))
print("Saved model: Model_save/best_model_capsule_label-%.4f.h5" % (micro_f1))

******************** test ********************
macro F1: 0.2645
micro F1: 0.6753
Saved model: Model_save/best_model_capsule_label-0.6753.h5
