# 几个变化
- 在 dense 层前面加了 batch-norm 层
- batch 大小为 2048
- 让 embedding 层可训练

# 效果
- 加入 batch-norm 层之后，训练速度大大提高。
- embedding 层可训练之后，AUC 不断提升。原来 embedding 层不可训练的时候，AUC 在第三个 epoch 的时候就停止增加了。


# 需要调整的参数
- batch-size
- split 比例
- dropout rate：由于测试集和训练集的数据量差不多大，我们不能过分依赖网络对训练集的拟合，而需要让网络具有更好的泛化能力。所以这里将 dropout rate 设置得稍微大一些，取值范围在 0.1 到 0.6 之间。

# 策略
- step 1: 现有参数
    - two LSTM layers both with 128 hidden units
    - one dense layer doing classification
    - batch-norm before each layer
    - dropout after each layer and the dropout rate is 0.1 (note: `mother_model` below currently uses 0.5)
    - epochs is 13
    - batch-size is 2048
- step 2: 接下来要做的事情
    - observe auc, stop when it reaches 0.999
    - use the epoch in which auc is 0.999 to train all models
    - save models
    - evaluate
    - predict
    - save the results and combine them together
    - submit the results
    - change dropout rate to 0.4

# 1. 加载数据

In [1]:
import pickle
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from keras import backend as K 
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# The training CSV is the label source (its 'toxic' column is used when fitting).
train = pd.read_csv("../data/train.csv")

# data.pkl holds five objects pickled back-to-back; they have to be read in
# exactly the order in which they were written.
# NOTE(review): pickle.load can execute arbitrary code -- only load a data.pkl
# that was produced locally by the preprocessing step.
# The `with` block guarantees the handle is closed even if a load fails.
with open('./data.pkl', 'rb') as datafile:
    # text of train
    X_train = pickle.load(datafile)
    # text of test
    X_test = pickle.load(datafile)
    word_to_index = pickle.load(datafile)
    index_to_word = pickle.load(datafile)
    word_to_vec_map = pickle.load(datafile)

# 2. 构建模型

## AUC 定义

In [4]:
# AUC for a binary classifier, approximated on a fixed grid of thresholds
def auc(y_true, y_pred):
    """Approximate the area under the ROC curve as a Keras metric.

    The true-alert rate (binary_PTA) and false-alert rate (binary_PFA) are
    evaluated at 1000 evenly spaced thresholds in [0, 1]; each true-alert rate
    is then weighted by the drop in false-alert rate across its bin and the
    products are summed.

    Arguments:
    y_true -- tensor of ground-truth labels
    y_pred -- tensor of predicted scores

    Returns:
    a tensor holding the summed (rate * bin width) products
    """
    thresholds = np.linspace(0, 1, 1000)
    true_alert_rates = tf.stack(
        [binary_PTA(y_true, y_pred, t) for t in thresholds], axis=0)
    false_alert_rates = tf.stack(
        [binary_PFA(y_true, y_pred, t) for t in thresholds], axis=0)
    # Prepend 1.0 so that the first bin starts from a false-alert rate of 1.
    padded_rates = tf.concat([tf.ones((1,)), false_alert_rates], axis=0)
    # Width of each bin: previous false-alert rate minus the current one.
    bin_widths = padded_rates[:-1] - padded_rates[1:]
    weighted = true_alert_rates * bin_widths
    return K.sum(weighted, axis=0)
#-----------------------------------------------------------------------------------------
# PFA: probability of a false alert for a binary classifier
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Return the false-alert rate FP / N at the given threshold.

    Arguments:
    y_true -- tensor of ground-truth labels
    y_pred -- tensor of predicted scores
    threshold -- scores >= threshold count as an alert

    Returns:
    a scalar tensor: false alerts divided by the sum of (1 - y_true)
    """
    # Hard decisions: 1.0 where the score reaches the threshold, else 0.0.
    alerts = K.cast(y_pred >= threshold, 'float32')
    negatives = K.sum(1 - y_true)
    # Alerts that do not coincide with a positive label.
    false_alerts = K.sum(alerts - alerts * y_true)
    return false_alerts / negatives
#-----------------------------------------------------------------------------------------
# PTA: probability of a true alert for a binary classifier
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Return the true-alert rate TP / P at the given threshold.

    Arguments:
    y_true -- tensor of ground-truth labels
    y_pred -- tensor of predicted scores
    threshold -- scores >= threshold count as an alert

    Returns:
    a scalar tensor: true alerts divided by the sum of y_true
    """
    # Hard decisions: 1.0 where the score reaches the threshold, else 0.0.
    alerts = K.cast(y_pred >= threshold, 'float32')
    positives = K.sum(y_true)
    # Alerts that coincide with a positive label.
    true_alerts = K.sum(alerts * y_true)
    return true_alerts / positives

## 构建预训练的 embedding 层

In [3]:
# The pretrained word embeddings we use cover 400k words with 50 features each
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Create a Keras Embedding() layer and load the pretrained embeddings into it.

    Arguments:
    word_to_vec_map -- dict mapping each word to its embedding vector
    word_to_index -- dict mapping each word to its integer index

    Returns:
    embedding_layer -- a trainable Keras Embedding layer whose weights are
                       initialised from word_to_vec_map

    Raises:
    ValueError -- if word_to_vec_map is empty
    KeyError -- if a word in word_to_index has no entry in word_to_vec_map
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty")

    # Number of words in the vocabulary + 1; the +1 is required by Keras
    vocab_len = len(word_to_index) + 1
    # Read the feature dimension from any vector instead of relying on one
    # specific word being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Initialise the embedding matrix with zeros, shape (vocab_len, emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # The row number of emb_matrix is the word index, so storing each word's
    # vector in its row loads the pretrained embeddings.
    # Word indices start at 1, so row 0 stays without an embedding -- this is
    # why vocab_len has the +1 above.
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Create the Keras Embedding layer
    embedding_layer = Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=True)

    # The layer has to be built before its weights can be set
    embedding_layer.build((None,))

    # Use emb_matrix as the layer's weights; the layer is now pretrained
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

## 构建模型

In [5]:
# All the other classification models can be created from this function
def mother_model(input_shape, word_to_vec_map, word_to_index, dropout_rate=0.5):
    """
    Build the shared network: embedding -> 2 x LSTM -> 2 x Dense.

    Arguments:
    input_shape -- shape of one input sample, e.g. (MAX_COMMENT_TEXT_SEQ,)
    word_to_vec_map -- dict mapping each word to its embedding vector
    word_to_index -- dict mapping each word to its integer index
    dropout_rate -- rate used by every Dropout layer (default 0.5, the value
                    that was previously hard-coded)

    Returns:
    model -- an uncompiled Keras Model
    """

    # Input layer: each sample is a list of word indices
    sentence_indices = Input(shape=input_shape, dtype=np.int32)
    # Word embedding layer initialised from the pretrained vectors
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    # Look up the embeddings for the word indices
    embeddings = embedding_layer(sentence_indices)

    # First LSTM returns the whole sequence so the second LSTM can consume it
    X = BatchNormalization()(embeddings)
    X = LSTM(128, return_sequences=True)(X)
    X = Dropout(dropout_rate)(X)

    # Second LSTM: only its output is used, so the extra state tensors
    # are not requested.
    X = BatchNormalization()(X)
    X = LSTM(128)(X)
    X = Dropout(dropout_rate)(X)

    X = BatchNormalization()(X)
    X = Dense(64, activation='relu')(X)
    X = Dropout(dropout_rate)(X)

    # Single sigmoid unit for binary classification
    X = BatchNormalization()(X)
    X = Dense(1, activation='sigmoid')(X)

    model = Model(inputs=sentence_indices, outputs=X)

    return model

## 创建 toxic 分类模型

In [6]:
# Length of every input sequence of word indices fed to the model
MAX_COMMENT_TEXT_SEQ = 200

# Build the 'toxic' classifier from the shared architecture ...
toxic_model = mother_model(
    input_shape=(MAX_COMMENT_TEXT_SEQ,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=word_to_index,
)
# ... and compile it with binary cross-entropy, Adam and the custom AUC metric
toxic_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[auc],
)

# 3. 训练模型、评估模型、保存模型

In [None]:
import os

model_dir = './models'
# Make sure the checkpoint directory exists before training starts, so that
# saving a checkpoint cannot fail on a missing path.
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# One file per epoch number; with save_best_only=True a file is only written
# when the monitored val_loss improves.
filepath = model_dir + '/model-{epoch:02d}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', save_best_only=True, verbose=1)
callbacks_list = [checkpoint]
train_result = toxic_model.fit(X_train['comment_text'], train[['toxic']],
                    epochs=50,
                    batch_size=2048,
                    validation_split=0.07,
                    callbacks=callbacks_list,
                    verbose=1)

# Keras stores the per-epoch losses under 'loss' and 'val_loss'
# ('train_loss' / 'validation_loss' do not exist and raise KeyError).
plt.plot(train_result.history['loss'])
plt.plot(train_result.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

Train on 148401 samples, validate on 11170 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.25927, saving model to ./models/model-01.h5
Epoch 2/50

Epoch 00002: val_loss improved from 0.25927 to 0.17151, saving model to ./models/model-02.h5
Epoch 3/50
