In [1]:
from keras.layers import *
from keras.models import Model 
from keras.optimizers import Nadam
import keras.backend as K 
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
import numpy as np

Using TensorFlow backend.


### 膨胀门卷积
来源：https://spaces.ac.cn/archives/5409

#### Y = Conv1D1(X) ⊗ σ(Conv1D2(X))
两个Conv1D形式一样（比如卷积核数、窗口大小都一样），但权值不共享，即参数翻倍，其中一个用sigmoid函数激活，
另外一个不加激活函数，然后将它们逐位相乘。

因为sigmoid函数的值域是(0,1)，相当于给Conv1D的每个输出都加一个“阀门”来控制流量。
这就是GCNN的结构，或可以将这种结构看成一个激活函数，称为GLU（Gated Linear Unit）。

除直观意义外，用GCNN的一个好处是几乎不用担心梯度消失问题，因为其中一个卷积不加任何激活函数，
所以对这部分求导得到的是一个常数（再乘以门），可以说梯度消失的概率非常小。
如果输入和输出的维度大小一致，那么我们就把输入也加到里边，即使用残差结构：
#### Y = X + Conv1D1(X) ⊗ σ(Conv1D2(X))
值得一提的是，使用残差结构，并不只是为了解决梯度消失，而是使得信息能够在多通道传输。
可以将上式改写为更形象的等价形式，以便我们更清晰看到信息是如何流动的：
#### Y = X ⊗ (1−σ) + Conv1D1(X) ⊗ σ
#### σ = σ(Conv1D2(X))
上式中我们能更清楚看到信息的流向：以1−σ的概率直接通过，以σ的概率经过变换后才通过。
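##### 补充推导  Y = X ⊗ (1−σ(Conv1D2(X))) + Conv1D1(X)⊗σ(Conv1D2(X)) = X + (Conv1D1(X)−X)⊗σ(Conv1D2(X))
由于Conv1D1不加激活函数，Conv1D1(X)−X 仍然是一个线性卷积（减去的恒等映射可以被卷积核吸收，相当于对Conv1D1的重新参数化），因此该式与残差形式 Y = X + Conv1D1(X) ⊗ σ(Conv1D2(X)) 等价。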

In [2]:
def Dilated_gated_conv1d(seq, mask, dilation_rate=1, kernel_size=3, dropout_rate=0.1):
    """Residual dilated gated 1-D convolution block.

    One Conv1D with ``2 * dim`` filters is applied to ``seq`` and its output
    is split along the channel axis into a gate half and a candidate half.
    The block returns

        (g * seq + (1 - g) * candidate) * mask,   g = sigmoid(gate half)

    so ``g`` weights the untouched input and ``1 - g`` weights the
    convolved candidate. Dropout is applied to the gate pre-activation
    during training only.

    Args:
        seq: 3-D tensor (batch, seq_len, dim); the last dimension must be
            statically known because it sets the number of conv filters.
        mask: tensor multiplied element-wise with the block output
            (in this notebook it has shape (batch, seq_len, 1)).
        dilation_rate: dilation rate passed to Conv1D.
        kernel_size: convolution window width (default 3, the value that
            was previously hard-coded).
        dropout_rate: dropout level applied to the gate pre-activation
            (default 0.1, the value that was previously hard-coded);
            a falsy value disables the dropout.

    Returns:
        Tensor with the same shape as ``seq``.

    Raises:
        ValueError: if the channel dimension of ``seq`` is unknown.
    """
    dim = K.int_shape(seq)[-1]
    if dim is None:
        raise ValueError('Dilated_gated_conv1d needs a statically known channel dimension.')

    # A single convolution yields both halves: (batch, seq_len, 2 * dim).
    conv_out = Conv1D(dim * 2, kernel_size, padding='same', dilation_rate=dilation_rate)(seq)

    def _gate(inputs):
        residual, both = inputs
        # First dim channels -> gate, remaining dim channels -> candidate.
        gate, candidate = both[:, :, :dim], both[:, :, dim:]
        if dropout_rate:
            # Dropout only in the training phase; inference uses the raw gate.
            gate = K.in_train_phase(K.dropout(gate, dropout_rate), gate)
        gate = K.sigmoid(gate)
        return gate * residual + (1 - gate) * candidate

    gated = Lambda(_gate)([seq, conv_out])
    # Multiply by the mask so masked positions come out as zeros.
    return Lambda(lambda pair: pair[0] * pair[1])([gated, mask])

In [5]:
# Vocabulary cap (passed to imdb.load_data as num_words) and the fixed
# sequence length used later for padding and for the model input.
max_feature = 20000
max_len = 300

# load_data returns a (train, test) pair; each is an (inputs, labels) pair.
train_split, test_split = imdb.load_data(num_words=max_feature)
x_train, y_train = train_split
x_test, y_test = test_split

In [7]:
# Pad and truncate at the end of each review so every row has max_len ids.
pad_options = dict(maxlen=max_len, padding='post', truncating='post')
x_train = pad_sequences(x_train, **pad_options)
x_test = pad_sequences(x_test, **pad_options)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(25000, 300) (25000,) (25000, 300) (25000,)


In [8]:
# Integer token ids, one row of max_len ids per review.
x_in = Input(shape=(max_len,))

# 1.0 where the token id is greater than 0, otherwise 0.0; the extra trailing
# axis lets the mask broadcast over the channel dimension.
mask = Lambda(lambda ids: K.cast(K.greater(K.expand_dims(ids, 2), 0), 'float32'))(x_in)
print(K.int_shape(mask))

# Embedding front-end.
x = Embedding(max_feature+1, 100)(x_in)
x = BatchNormalization()(x)
x = SpatialDropout1D(0.2)(x)

# Stack of residual gated convolutions; dilation grows, then falls back to 1.
for rate in (1, 2, 4, 1, 1):
    x = Dilated_gated_conv1d(x, mask, dilation_rate=rate)

# NOTE(review): x was multiplied by mask, so padded positions are zeros, but
# both pooling layers still include those positions - confirm this is intended.
x_avg = GlobalAveragePooling1D()(x)
x_max = GlobalMaxPooling1D()(x)
x = Concatenate()([x_avg, x_max])

# Classifier head with a single sigmoid output.
x = Dense(100, activation='relu')(x)
x = Dropout(0.5)(x)
x_out = Dense(1, activation='sigmoid')(x)

model = Model(x_in, x_out)
model.compile(
    optimizer=Nadam(lr=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# model.summary()

(None, 300, 1)


In [11]:
import os
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

# Make sure the checkpoint directory exists before training starts; otherwise
# the first checkpoint write can fail on a fresh checkout.
weights_path = "save_model/model_weights.h5"
os.makedirs(os.path.dirname(weights_path), exist_ok=True)

# Multiply the learning rate by sqrt(0.1) after 2 epochs without improvement,
# never going below 0.5e-6.
lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0, patience=2, min_lr=0.5e-6)

# Stop after 5 epochs whose improvement is smaller than 0.001.
early_stopper = EarlyStopping(min_delta=0.001, patience=5)

# Keep only the weights of the best epoch as measured by val_loss.
model_checkpoint = ModelCheckpoint(weights_path,
                                   monitor="val_loss",
                                   save_best_only=True,
                                   save_weights_only=True, mode='auto')

callbacks_list = [lr_reducer, early_stopper, model_checkpoint]

In [13]:
# NOTE(review): the test split is used as validation data, so the LR schedule,
# early stopping and checkpoint selection all see test data - confirm intended.
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_test, y_test),
    callbacks=callbacks_list,
)