In [None]:
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
# Attention layer used on top of the (bidirectional) GRU network
class AttLayer(Layer):
    """Attention pooling over the time axis of a 3D tensor.

    For an input ``x`` of shape ``(batch, timesteps, features)`` the layer
    scores every timestep with ``tanh(x . W)``, normalises the scores with a
    softmax over the time axis and returns the attention-weighted sum of the
    timesteps, i.e. a tensor of shape ``(batch, features)``.

    If an upstream layer provides a mask (e.g. ``Embedding(mask_zero=True)``),
    masked timesteps get zero attention weight. The mask is consumed here and
    not propagated further, because the time axis is reduced away.
    """

    def __init__(self, **kwargs):
        # Call the base constructor first: in Keras 2 it assigns its own
        # defaults to ``input_spec`` and ``supports_masking``, which would
        # silently overwrite anything set before it.
        super(AttLayer, self).__init__(**kwargs)
        self.init = initializers.get('normal')
        self.input_spec = [InputSpec(ndim=3)]
        self.supports_masking = True

    def build(self, input_shape):
        """Create the scoring vector ``W`` of shape ``(features, 1)``."""
        assert len(input_shape) == 3, 'AttLayer expects a 3D input'
        # Keyword arguments keep this call valid across Keras 2 releases,
        # where the positional order of ``add_weight`` changed
        # (``shape`` first vs. ``name`` first). ``add_weight`` already
        # registers the variable as trainable, so ``trainable_weights`` must
        # not be assigned by hand.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], 1),
                                 initializer=self.init)
        self.input_spec = [InputSpec(shape=input_shape)]
        super(AttLayer, self).build(input_shape)  # marks the layer as built

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the time axis.

        Args:
            x: tensor of shape ``(batch, timesteps, features)``.
            mask: optional tensor of shape ``(batch, timesteps)``; falsy
                entries are excluded from the attention distribution.

        Returns:
            Tensor of shape ``(batch, features)``.
        """
        # (batch, timesteps, 1) -> (batch, timesteps)
        scores = K.squeeze(K.tanh(K.dot(x, self.W)), axis=-1)

        # tanh bounds the scores to [-1, 1], so exp() cannot overflow and no
        # max-subtraction is needed for a stable softmax.
        exp_scores = K.exp(scores)
        if mask is not None:
            exp_scores = exp_scores * K.cast(mask, K.floatx())
            # A fully masked sequence sums to zero; epsilon avoids 0/0.
            normaliser = K.sum(exp_scores, axis=1, keepdims=True) + K.epsilon()
        else:
            normaliser = K.sum(exp_scores, axis=1, keepdims=True)
        weights = exp_scores / normaliser

        # Broadcast the per-timestep weights over the feature axis and pool.
        weighted_input = x * K.expand_dims(weights)
        return K.sum(weighted_input, axis=1)

    def compute_mask(self, inputs, mask=None):
        """Drop the mask: the output no longer has a time axis."""
        return None

    def compute_output_shape(self, input_shape):
        """Map ``(batch, timesteps, features)`` to ``(batch, features)``."""
        return (input_shape[0], input_shape[-1])

from __future__ import print_function
import numpy
import sys
sys.path.append('datasets')

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Bidirectional
from keras.layers import Embedding, GRU
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.datasets import imdb
import codeforces

# Fix the NumPy seed so runs are as repeatable as the backend allows.
numpy.random.seed(7)

# set parameters:
top_words = 20000   # vocabulary size kept by imdb.load_data
max_len = 1500      # every review is padded/truncated to this many tokens
# NOTE: fit() previously hard-coded batch_size=32 and ignored this setting
# (which said 64). The value is set to 32 here so that wiring the variable
# into fit() leaves the effective training configuration unchanged.
batch_size = 32
embed_dim = 100
# filters / kernel_size / pool_size / dropout are not used by the attention
# GRU model below; they are kept in case other cells rely on them.
filters = 128
kernel_size = 3
pool_size = 2
epochs = 10
dropout = 0.2

print('Loading data...')
(x_train, y_train), (x_val, y_val) = imdb.load_data(num_words=top_words)

print(len(x_train), 'train sequences')
print(len(x_val), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_val = sequence.pad_sequences(x_val, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_val.shape)

print('Build model...')
model = Sequential()

# Embedding maps vocabulary indices to dense embed_dim-dimensional vectors;
# the bidirectional GRU returns the full sequence so that AttLayer can pool
# over the time axis before the sigmoid classifier.
model.add(Embedding(top_words+3, embed_dim, input_length=max_len))
model.add(Bidirectional(GRU(100, return_sequences=True)))
model.add(AttLayer())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

print("model fitting - attention GRU network")
model.summary()
# Use the parameters declared above instead of repeating literal values, so
# changing them in one place actually affects training.
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=epochs, batch_size=batch_size)

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 1500)
x_test shape: (25000, 1500)
Build model...
model fitting - attention GRU network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1500, 100)         2000300   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1500, 200)         120600    
_________________________________________________________________
att_layer_2 (AttLayer)       (None, 200)               200       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 201       
Total params: 2,121,301.0
Trainable params: 2,121,301
Non-trainable params: 0.0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/10