In [44]:
from keras import backend as K
from keras.models import Model
from keras.layers import Layer, Input, Recurrent, LSTM, LSTMCell, Embedding, Dense
from keras.layers import Bidirectional, TimeDistributed

In [45]:

def get_context_vec( context_mat , att_weigts ):
    """Collapse a context matrix into one vector using attention weights.

    The weights get a new axis 2, are repeated along it ``context_mat.shape[2]``
    times, multiplied element-wise with ``context_mat`` and summed over axis 1.

    # assumes context_mat is (batch, time, dim) and att_weigts is (batch, time);
    # inferred from the axes used here -- TODO confirm against callers.
    """
    feature_dim = context_mat.shape[2]
    weights = K.expand_dims(att_weigts, axis=2)
    weights = K.repeat_elements(weights, feature_dim, axis=2)
    weighted_context = weights * context_mat
    return K.sum(weighted_context, axis=1)


def attend( key_vec , context_mat , contextMatTimeSteps , w1 , w2 ):
    """Score every row of ``context_mat`` against ``key_vec`` and pool them.

    The key is repeated ``contextMatTimeSteps`` times, concatenated with the
    context matrix along the last axis, and every (key, context) row is
    projected through ``w1`` and then ``w2``. The scores are reshaped to
    ``(-1, contextMatTimeSteps)``, passed through ReLU and softmax, and used
    to weight ``context_mat`` via ``get_context_vec``.

    Returns a tuple ``(context_vector, attention_weights)``.

    # NOTE(review): there is no non-linearity between the w1 and w2
    # projections, so the two dot products compose into a single linear map;
    # confirm that this is intended.
    """
    # One copy of the key for every context time step.
    tiled_key = K.repeat(key_vec, contextMatTimeSteps)
    pairs = K.concatenate([tiled_key, context_mat], axis=-1)

    # Flatten to 2D rows so the projections are plain matrix products.
    flat_pairs = K.reshape(pairs, (-1, pairs.shape[-1]))
    hidden = K.dot(flat_pairs, w1)
    scores = K.dot(hidden, w2)

    # Back to one row of scores per sample.
    scores = K.reshape(scores, (-1, contextMatTimeSteps))
    att_weigts = K.softmax(K.relu(scores))

    context_vec = get_context_vec(context_mat, att_weigts)
    return context_vec, att_weigts
    


# AttentionDecoder is called on a list of two tensors: [ input , context_matrix ]

class AttentionDecoder(Layer):
    """Recurrent decoder layer that attends over a context matrix at each step.

    The layer is called on a list ``[inputs, context_mat]``. At every time
    step the first recurrent state is used as the attention key (see
    ``attend``), the resulting context vector is concatenated to the step
    input, and the concatenation is fed to the wrapped ``rnn_cell``.

    Only the per-step cell outputs are returned; the attention weights computed
    inside the step function are discarded.

    # assumes inputs is (batch, time, in_dim) and context_mat is
    # (batch, ctx_time, ctx_dim); inferred from how build() indexes
    # input_shape -- TODO confirm.
    """

    def __init__(self, rnn_cell  , **kwargs):
        """Wrap ``rnn_cell``; extra keyword arguments go to ``Layer.__init__``."""
        # state_size may be a sequence (one entry per state) or a single
        # scalar. get_initial_state() already accepts both forms, so the
        # constructor must not index a scalar.
        state_size = rnn_cell.state_size
        if hasattr(state_size, '__len__'):
            self.output_dim = state_size[0]
        else:
            self.output_dim = state_size
        self.rnn_cell = rnn_cell
        super(AttentionDecoder, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the attention kernels and build the wrapped cell.

        ``input_shape`` must be a list of exactly two shapes:
        ``[inputs_shape, context_mat_shape]``.
        """
        assert type( input_shape ) is list
        assert len(input_shape) == 2

        context_dim = input_shape[1][2]

        # Projects the concatenated [key, context row] down to context_dim.
        self.att_kernel = self.add_weight(name='att_kernel_1',
                                      shape=( self.output_dim+context_dim ,  context_dim ),
                                      initializer='uniform',
                                      trainable=True)

        # Reduces each projected row to a single attention score.
        self.att_kernel_2 = self.add_weight(name='att_kernel_2',
                                      shape=( context_dim ,  1 ),
                                      initializer='uniform',
                                      trainable=True)

        # The cell sees the step input concatenated with the context vector:
        # (batch_size, in_dim + contextVecDim).
        step_input_shape = (input_shape[0][0], input_shape[0][2]+context_dim )
        self.rnn_cell.build(step_input_shape)

        # Expose the cell's weights as weights of this layer.
        self._trainable_weights += ( self.rnn_cell.trainable_weights )
        self._non_trainable_weights += (  self.rnn_cell.non_trainable_weights )

        self.contextMatTimeSteps = input_shape[1][1]

        super(AttentionDecoder, self).build(input_shape)

    def get_initial_state(self, inputs):
        """Return all-zero initial states, one tensor per cell state.

        The zeros are derived from ``inputs`` (zeros_like, summed over axes 1
        and 2, expanded, then tiled to each state width).
        """
        initial_state = K.zeros_like(inputs)
        initial_state = K.sum(initial_state, axis=(1, 2))
        initial_state = K.expand_dims(initial_state)

        state_size = self.rnn_cell.state_size
        if not hasattr(state_size, '__len__'):
            state_size = [state_size]
        return [K.tile(initial_state, [1, dim]) for dim in state_size]

    def call(self, input ):
        """Run the attentive recurrence and return the outputs of every step."""
        inputs , context_mat = input

        def step(inputs, states):
            # The first state is the attention key for this step.
            hid = states[0]
            # The attention weights are computed but not used any further.
            ctx_vec , att_weigts = attend( hid , context_mat, self.contextMatTimeSteps , self.att_kernel , self.att_kernel_2 )
            rnn_inp = K.concatenate( (inputs , ctx_vec ), axis=1 )
            return self.rnn_cell.call( rnn_inp , states )

        timesteps = inputs.shape[ 1 ]

        initial_state = self.get_initial_state(inputs )

        last_output, outputs, states = K.rnn(step,
                                             inputs,
                                             initial_state,
                                             input_length=timesteps)

        return outputs

    def compute_output_shape(self, input_shape ):
        """Output is (batch, input time steps, output_dim)."""
        return (input_shape[0][0], input_shape[0][1] , self.output_dim)

In [46]:
def vatex_baseline_model(dim_embedding=150, dim_decoder=256):
    """Build the encoder / attentive-decoder baseline model.

    Reads the module-level constants NUM_FEATURE, DIM_FEATURE, DIM_HIDDEN,
    LEN_DEC_SEQ and SIZE_VOCAB, so they must be defined before the call.

    Args:
        dim_embedding: output size of the decoder token Embedding.
        dim_decoder: number of units of the LSTMCell wrapped by
            AttentionDecoder.

    Returns:
        A Model mapping ``[encoder_inputs, decoder_inputs]`` to a softmax over
        SIZE_VOCAB entries for every decoder position.
    """
    # Encoder: bidirectional LSTM that returns its full output sequence.
    encoder_inputs = Input(shape=(NUM_FEATURE, DIM_FEATURE))
    encoder_states = Bidirectional(LSTM(DIM_HIDDEN, return_sequences=True))(encoder_inputs)

    # Decoder input: token ids, embedded before entering the decoder.
    decoder_inputs = Input((LEN_DEC_SEQ,))
    decoder_embedded = Embedding(SIZE_VOCAB, dim_embedding)(decoder_inputs)

    # The decoder attends over the encoder output sequence at every step.
    decoded_attn = AttentionDecoder(LSTMCell(dim_decoder))([decoder_embedded, encoder_states])
    decoded = TimeDistributed(Dense(SIZE_VOCAB, activation='softmax'))(decoded_attn)

    model = Model([encoder_inputs, decoder_inputs], decoded)

    return model

In [47]:
# Sizes read as module-level globals by vatex_baseline_model().
NUM_FEATURE = 100 # L: first dimension of the encoder Input shape (encoder sequence length)
DIM_FEATURE = 200 # second dimension of the encoder Input shape (size of one feature vector)
DIM_HIDDEN = 300 # units of the encoder LSTM that is wrapped in Bidirectional
LEN_DEC_SEQ = 300 # length of the decoder input sequence
SIZE_VOCAB = 10000 # Embedding input size and width of the softmax output layer

# Build the model and print its layer summary.
model = vatex_baseline_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 100, 200)     0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 300, 150)     1500000     input_9[0][0]                    
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, 100, 600)     1202400     input_8[0][0]                    
__________________________________________________________________________________________________
attention_

# preprocess the data

In [None]:
import os
import pickle

import numpy as np
from keras.preprocessing.text import Tokenizer

In [None]:
# TODO: tokenize the data and store the tokenizer.
# NOTE(review): `docs` is not defined anywhere in the notebook as shown; it
# has to be assigned before this cell can run.

# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)
# keep the encoded documents instead of discarding the result
sequences = t.texts_to_sequences(docs)

# pickle writes bytes, so the file must be opened for binary writing
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(t, f)

# lookup tables in both directions
word2index = t.word_index
index2word = {index: word for word, index in word2index.items()}


In [None]:
# TODO: load the I3D features and preprocess the text.
# The lines below are an unfinished sketch. They are kept as comments because
# they are not valid Python and would stop the whole cell from running:
#
#   for ... in os.walk ...
#   x_all = np.load("...")
#   y_all = ..

def shuffle_split_data(X, y, train_fraction=0.7):
    """Randomly split ``X`` and ``y`` into train and test parts.

    Each row is assigned to the train part independently with probability
    ``train_fraction``, so the split sizes vary from call to call and either
    part may be empty. Rows keep their original relative order; despite the
    function name nothing is permuted.

    Args:
        X: array indexable by a boolean mask along its first axis.
        y: array with the same first-axis length as ``X``.
        train_fraction: probability that a row goes to the train part.

    Returns:
        Tuple ``(X_Train, y_Train, X_Test, y_Test)``.
    """
    # One uniform draw per row; True sends the row to the train part.
    split = np.random.rand(X.shape[0]) < train_fraction

    X_Train = X[split]
    y_Train = y[split]
    X_Test = X[~split]
    y_Test = y[~split]

    # Formatted string so the output is the same on Python 2 and Python 3.
    print("%d %d %d %d" % (len(X_Train), len(y_Train), len(X_Test), len(y_Test)))
    return X_Train, y_Train, X_Test, y_Test


# train the model

In [None]:
# When training the model, encoder_inputs are just the I3D features.
# decoder_inputs and the decoded targets are the same data: the tokenized clip descriptions.
# Prediction is a little trickier: to visualize the result we also need the tokenizer's vocabulary.



In [None]:
def predict(model, v_inputs, LEN_DEC_SEQ):
    """Greedily decode a token sequence for one encoder input.

    The decoder input starts as a ``(1, LEN_DEC_SEQ)`` array of zeros. For
    each position the model is re-run on the tokens chosen so far, the arg-max
    of the previous position's output is taken as the next token and written
    back into the decoder input. Decoding stops early when token 0 is chosen.

    Args:
        model: object whose ``predict`` accepts ``[v_inputs, decoder_tokens]``.
        v_inputs: encoder input passed through to the model unchanged.
        LEN_DEC_SEQ: length of the decoder input sequence.

    Returns:
        List of chosen token ids, at most ``LEN_DEC_SEQ - 1`` long.
    """
    decoder_tokens = np.zeros((1, LEN_DEC_SEQ))
    m_input = [v_inputs, decoder_tokens]

    res = []
    position = 1
    while position < LEN_DEC_SEQ:
        probabilities = model.predict(m_input)
        # Output at position - 1 predicts the token for `position`.
        token = probabilities[0][position - 1].argmax()
        if token == 0:
            break
        res.append(token)
        decoder_tokens[0, position] = token
        position += 1
    return res