In [2]:
import numpy as np
import pandas as pd
import os, sys
import pickle
import json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.externals import joblib
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Permute, Reshape, RepeatVector, Activation
from keras.models import Sequential, load_model, model_from_json
from keras.models import model_from_json

import pydot
import graphviz
from keras.utils.vis_utils import plot_model

In [5]:
# Caption annotations: a JSON list of records, each read for its 'id' and its
# list of 'caption' strings by the loaders below.
TRAIN_LABEL_PATH = "MLDS_hw2_data/training_label.json"

# Passed to Tokenizer(num_words=...) and used as the one-hot width in to_categorical.
MAX_WORDS = 1024
    
def read_data(path):
    """Load caption annotations and flatten them to one entry per caption.

    The JSON file at ``path`` must hold a list of records, each with an
    ``'id'`` and a ``'caption'`` list. Every caption is wrapped as
    ``"<bos> " + caption + " <eos>"``.

    Args:
        path: location of the JSON label file.

    Returns:
        ``(videoId, videoSeq)``: two parallel lists, the id of the record a
        caption came from and the wrapped caption text. Records with an empty
        caption list contribute nothing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding="utf-8") as data_file:
        records = json.load(data_file)

    videoId = []
    videoSeq = []
    for record in records:
        for caption in record['caption']:
            videoId.append(record['id'])
            videoSeq.append("<bos> " + caption + " <eos>")

    return videoId, videoSeq


def getVId(path):
    """Return the video ids kept for training: at most two per run of equal ids.

    Reads the captions at ``path``, runs ``tokenize`` on them, then applies the
    same two-captions-per-video cap that ``tokenize`` applies to its targets,
    so the returned ids line up with the targets it dumps.

    Args:
        path: location of the JSON label file.

    Returns:
        List of video ids, each id appearing at most twice per consecutive run.

    Raises:
        IndexError: if the label file contains no captions.
    """
    videoId, videoSeq = read_data(path)
    # Called for its side effects (it dumps targets and tokenizer to disk);
    # its return value is not used here.
    tokenize(videoId, videoSeq)

    y_videoId = []
    curFilename = videoId[0]
    vCount = 0
    for vid in videoId:
        if vid == curFilename:
            vCount += 1
            if vCount > 2:
                continue
        else:
            vCount = 1
            curFilename = vid
        y_videoId.append(vid)

    # The capped list is the result; returning the unfiltered `videoId`
    # discarded all the work above.
    return y_videoId

def tokenize(videoId, videoSeq):
    """Fit a tokenizer on the captions and build one-hot targets, two per video.

    Args:
        videoId: video id of each caption, parallel to ``videoSeq``. The cap
            counts consecutive equal ids, so captions of one video must be
            adjacent for it to act per video.
        videoSeq: caption strings.

    Returns:
        ``(y_videoId, y_data, tokenizer)``: the ids of the captions that were
        kept, the array of their one-hot encodings (each padded caption passed
        through ``to_categorical(..., MAX_WORDS)``), and the fitted Tokenizer.

    Side effects:
        Prints progress information and dumps ``y_data`` to
        ``data/y_data1024_post.jlib`` and the tokenizer to
        ``data/tokinzer1024_post`` (the ``data`` directory must exist).

    Raises:
        IndexError: if ``videoId`` is empty.
    """
    tokenizer = Tokenizer(num_words=MAX_WORDS)
    tokenizer.fit_on_texts(videoSeq)

    print ('Convert to index sequences.')
    # Keep the variable-length sequences as a plain list: wrapping ragged lists
    # in np.array builds an object array (an error on current NumPy) and
    # pad_sequences accepts the list as is.
    train_sequences = tokenizer.texts_to_sequences(videoSeq)
    print(train_sequences[0])

    train_sequences = pad_sequences(train_sequences, padding='post',truncating='post')
    print(train_sequences[0])
    max_seq_length = train_sequences.shape[1]
    print(max_seq_length)

    y_data = []
    y_videoId = []
    curFilename = videoId[0]
    vCount = 0
    for vid, seq in zip(videoId, train_sequences):
        if vid == curFilename:
            vCount += 1
            if vCount > 2:
                # Third and later captions of the same video are dropped.
                continue
        else:
            vCount = 1
            curFilename = vid
        y_data.append(to_categorical(seq, MAX_WORDS))
        y_videoId.append(vid)

    y_data = np.array(y_data)
    print(y_data.shape)
    with open('data/y_data1024_post.jlib', 'wb') as file:
        joblib.dump(y_data, file)
    with open('data/tokinzer1024_post', 'wb') as file:
        joblib.dump(tokenizer, file)

    return y_videoId, y_data, tokenizer

def genX_data():
    """Stack the feature arrays of the selected videos and dump them with joblib.

    Every file in the feature directory is loaded with ``np.load`` and keyed by
    its file name minus the last four characters. The arrays are then gathered
    in the order of the module-level ``y_videoId`` list and written to
    ``x_data2.jlib`` in the current directory.

    Raises:
        NameError: if no module-level ``y_videoId`` exists.
        KeyError: if an id in ``y_videoId`` has no feature file.
    """
    TRAIN_FEATURE_DIR = "./MLDS_hw2_data/training_data/feat/"
    x_data = {}
    for filename in os.listdir(TRAIN_FEATURE_DIR):
        x_data[filename[:-4]] = np.load(TRAIN_FEATURE_DIR + filename)

    # NOTE(review): y_videoId is read from module scope, not passed in — it has
    # to be assigned (e.g. from getVId/tokenize) before this is called.
    # Built once, after loading; the original reset the list inside the loading
    # loop and left it undefined for an empty directory.
    X_data = np.array([x_data[vid] for vid in y_videoId])

    # NOTE(review): the loading cell reads 'data/X_data2.jlib'; this writes
    # 'x_data2.jlib' in the working directory — confirm which path is intended.
    with open('x_data2.jlib', 'wb') as file:
        joblib.dump(X_data, file)


        

def data_generator(batchsize):
    """Endlessly yield training batches built from the first 20000 captions.

    Captions are read from ``TRAIN_LABEL_PATH``, wrapped in ``<bos>``/``<eos>``,
    tokenized with a ``Tokenizer(num_words=MAX_WORDS)`` fitted on all captions,
    post-padded, and one-hot encoded per batch. Video features are loaded once
    from the feature directory.

    Args:
        batchsize: number of captions per batch.

    Yields:
        ``([encoder_input_data, decoder_input_data], decoder_target_data)``:
        the video features of the batch, the one-hot captions without their
        last step, and the one-hot captions without their first step.

    Raises:
        ValueError: if ``batchsize`` is not between 1 and the number of
            available captions (the loop could never produce a batch).
    """
    with open(TRAIN_LABEL_PATH) as data_file:
        records = json.load(data_file)

    videoId = []
    videoSeq = []
    for record in records:
        for cap in record['caption']:
            videoId.append(record['id'])
            videoSeq.append("<bos> " + cap + " <eos>")

    TRAIN_FEATURE_DIR = "./MLDS_hw2_data/training_data/feat/"
    x_data = {}
    for filename in os.listdir(TRAIN_FEATURE_DIR):
        x_data[filename[:-4]] = np.load(TRAIN_FEATURE_DIR + filename)

    tokenizer = Tokenizer(num_words=MAX_WORDS)
    tokenizer.fit_on_texts(videoSeq)

    print ('Convert to index sequences.')
    # Ragged list goes straight into pad_sequences; no np.array on ragged data.
    train_sequences = tokenizer.texts_to_sequences(videoSeq)
    print(train_sequences[0])
    train_sequences = pad_sequences(train_sequences, padding='post',truncating='post')

    # Training split: the first 20000 captions.
    videoId = videoId[:20000]
    train_sequences = train_sequences[:20000]
    print(train_sequences.shape)
    filesize = len(train_sequences)

    if not 0 < batchsize <= filesize:
        raise ValueError(
            "batchsize must be between 1 and %d, got %r" % (filesize, batchsize))

    while 1:
        n_entries = 0
        # Emit every full batch; `<=` keeps the last one when the data size is
        # an exact multiple of batchsize.
        while n_entries + batchsize <= filesize:
            y_videoId = videoId[n_entries:n_entries + batchsize]
            captions = train_sequences[n_entries:n_entries + batchsize]

            encoder_input_data = []
            decoder_input_data = []
            decoder_target_data = []
            # Pair each feature array with the caption of the *same* batch row;
            # indexing train_sequences directly reused the first batch's
            # captions for every batch.
            for vid, caption in zip(y_videoId, captions):
                encoder_input_data.append(x_data[vid])
                y = to_categorical(caption, MAX_WORDS)
                decoder_input_data.append(y[:-1])
                decoder_target_data.append(y[1:])

            encoder_input_data = np.array(encoder_input_data)
            decoder_input_data = np.array(decoder_input_data)
            decoder_target_data = np.array(decoder_target_data)
            n_entries = n_entries + batchsize
            yield [encoder_input_data, decoder_input_data], decoder_target_data


def validation_generator(batchsize):
    """Endlessly yield validation batches built from the captions after the first 20000.

    Same pipeline as the training generator (captions from ``TRAIN_LABEL_PATH``
    wrapped in ``<bos>``/``<eos>``, a ``Tokenizer(num_words=MAX_WORDS)`` fitted
    on all captions, post-padding, per-batch one-hot encoding), applied to the
    slice ``[20000:]``.

    Args:
        batchsize: number of captions per batch.

    Yields:
        ``([encoder_input_data, decoder_input_data], decoder_target_data)``:
        the video features of the batch, the one-hot captions without their
        last step, and the one-hot captions without their first step.

    Raises:
        ValueError: if ``batchsize`` is not between 1 and the number of
            available captions (the loop could never produce a batch).
    """
    with open(TRAIN_LABEL_PATH) as data_file:
        records = json.load(data_file)

    videoId = []
    videoSeq = []
    for record in records:
        for cap in record['caption']:
            videoId.append(record['id'])
            videoSeq.append("<bos> " + cap + " <eos>")

    TRAIN_FEATURE_DIR = "./MLDS_hw2_data/training_data/feat/"
    x_data = {}
    for filename in os.listdir(TRAIN_FEATURE_DIR):
        x_data[filename[:-4]] = np.load(TRAIN_FEATURE_DIR + filename)

    tokenizer = Tokenizer(num_words=MAX_WORDS)
    tokenizer.fit_on_texts(videoSeq)

    print ('Convert to index sequences.')
    # Ragged list goes straight into pad_sequences; no np.array on ragged data.
    train_sequences = tokenizer.texts_to_sequences(videoSeq)
    train_sequences = pad_sequences(train_sequences, padding='post',truncating='post')
    print(train_sequences.shape)

    # Validation split: everything after the first 20000 captions.
    videoId = videoId[20000:]
    train_sequences = train_sequences[20000:]
    print(train_sequences.shape)
    filesize = len(train_sequences)

    if not 0 < batchsize <= filesize:
        raise ValueError(
            "batchsize must be between 1 and %d, got %r" % (filesize, batchsize))

    while 1:
        n_entries = 0
        # Emit every full batch; `<=` keeps the last one when the data size is
        # an exact multiple of batchsize.
        while n_entries + batchsize <= filesize:
            y_videoId = videoId[n_entries:n_entries + batchsize]
            captions = train_sequences[n_entries:n_entries + batchsize]

            encoder_input_data = []
            decoder_input_data = []
            decoder_target_data = []
            # Pair each feature array with the caption of the *same* batch row;
            # indexing train_sequences directly reused the first batch's
            # captions for every batch.
            for vid, caption in zip(y_videoId, captions):
                encoder_input_data.append(x_data[vid])
                y = to_categorical(caption, MAX_WORDS)
                decoder_input_data.append(y[:-1])
                decoder_target_data.append(y[1:])

            encoder_input_data = np.array(encoder_input_data)
            decoder_input_data = np.array(decoder_input_data)
            decoder_target_data = np.array(decoder_target_data)
            n_entries = n_entries + batchsize
            yield [encoder_input_data, decoder_input_data], decoder_target_data

            
def validation():
    """Build the whole validation set in memory, at most five captions per video.

    Uses the captions after the first 20000 from ``TRAIN_LABEL_PATH``, a
    ``Tokenizer(num_words=MAX_WORDS)`` fitted on all captions, post-padding and
    one-hot encoding. The cap counts consecutive equal video ids.

    Returns:
        ``([encoder_input_data, decoder_input_data], decoder_target_data)``:
        the video features, the one-hot captions without their last step, and
        the one-hot captions without their first step, as arrays.

    Raises:
        IndexError: if there are no captions beyond the first 20000.
    """
    with open(TRAIN_LABEL_PATH) as data_file:
        records = json.load(data_file)

    videoId = []
    videoSeq = []
    for record in records:
        for cap in record['caption']:
            videoId.append(record['id'])
            videoSeq.append("<bos> " + cap + " <eos>")

    TRAIN_FEATURE_DIR = "./MLDS_hw2_data/training_data/feat/"
    x_data = {}
    for filename in os.listdir(TRAIN_FEATURE_DIR):
        x_data[filename[:-4]] = np.load(TRAIN_FEATURE_DIR + filename)

    tokenizer = Tokenizer(num_words=MAX_WORDS)
    tokenizer.fit_on_texts(videoSeq)

    print ('Convert to index sequences.')
    # Ragged list goes straight into pad_sequences; no np.array on ragged data.
    train_sequences = tokenizer.texts_to_sequences(videoSeq)
    train_sequences = pad_sequences(train_sequences, padding='post',truncating='post')
    print(train_sequences.shape)

    # Validation split: everything after the first 20000 captions.
    videoId = videoId[20000:]
    train_sequences = train_sequences[20000:]
    print(train_sequences.shape)

    encoder_input_data = []
    decoder_input_data = []
    decoder_target_data = []

    vCount = 0
    curFilename = videoId[0]
    for vid, caption in zip(videoId, train_sequences):
        if vid == curFilename:
            vCount += 1
            if vCount > 5:
                # Sixth and later captions of the same video are dropped.
                continue
        else:
            vCount = 1
            curFilename = vid
        encoder_input_data.append(x_data[vid])
        y = to_categorical(caption, MAX_WORDS)
        decoder_input_data.append(y[:-1])
        decoder_target_data.append(y[1:])

    encoder_input_data = np.array(encoder_input_data)
    decoder_input_data = np.array(decoder_input_data)
    decoder_target_data = np.array(decoder_target_data)
    return [encoder_input_data, decoder_input_data], decoder_target_data



def getWords(word_index, seq):
    """Print the word for each step of a one-hot (or score) sequence.

    Args:
        word_index: mapping from word to integer id.
        seq: iterable of vectors; the arg-max position of each vector is taken
            as the word id. Id 0 is skipped.

    Raises:
        ValueError: if an arg-max id is not a value of ``word_index``.
    """
    # Invert the mapping once instead of rebuilding key/value lists per word.
    # setdefault keeps the first word for an id, matching list.index().
    index_to_word = {}
    for word, index in word_index.items():
        index_to_word.setdefault(index, word)

    for step in seq:
        idx = int(np.argmax(step))
        if idx == 0:
            continue
        if idx not in index_to_word:
            raise ValueError("%d is not in word_index" % idx)
        print(index_to_word[idx])
        
# print(list(mydict.keys())[list(mydict.values()).index(16)]) # Prints george
    






In [3]:


# load data
# NOTE: joblib.load unpickles its input — only load files produced by this
# notebook, never files from an untrusted source.
encoder_input_data = None
decoder_data = None
tokinzer = None
with open('data/X_data2.jlib', 'rb') as file:
    encoder_input_data = joblib.load(file)
    print(encoder_input_data.shape)
with open('data/y_data1024_post.jlib', 'rb') as file:
    decoder_data = joblib.load(file)
    print(decoder_data.shape)
with open('data/tokinzer1024_post', 'rb') as file:
    tokinzer = joblib.load(file)
    print(len(tokinzer.word_index))

# The decoding helpers further down read the tokenizer under the name
# `tokenizer`; expose the loaded object under that name as well.
tokenizer = tokinzer

# Teacher forcing: the decoder input is each caption without its last step,
# the target is the same caption without its first step.
decoder_input_data = np.array([caption[:-1] for caption in decoder_data])
decoder_target_data = np.array([caption[1:] for caption in decoder_data])


(2900, 80, 4096)
(2900, 36, 1024)
6018


In [11]:
import keras.layers.merge as merge
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Flatten
import keras
# Last dimension of the encoder input (see the encoder Input layer below).
num_encoder_tokens = 4096
# Last dimension of the decoder input and size of the softmax output.
num_decoder_tokens = 1024
# Number of units in the encoder and decoder LSTMs.
latent_dim = 512

# Alternative: derive both sizes from the loaded arrays instead of hard-coding them.
# num_encoder_tokens = encoder_input_data.shape[2]
# num_decoder_tokens = decoder_input_data.shape[2]

# Arguments passed to model.fit below.
batch_size = 200
epochs = 100
# Number of time steps in the encoder input / decoder input respectively.
TIME_STEPS_ENCODER = 80
DECODER_MAX_LENGTH = 35

def attention_3d_block(inputs):
    """Map a (batch, time, features) tensor to softmax scores per decoder step.

    The time axis is resized to ``DECODER_MAX_LENGTH`` by a Dense layer applied
    on the transposed tensor, then the feature axis is projected to
    ``num_decoder_tokens`` with a softmax. The input shape and its feature
    width are printed.

    Args:
        inputs: Keras tensor with three dimensions.

    Returns:
        Keras tensor produced by the final softmax Dense layer.
    """
    print(inputs.shape)
    feature_width = int(inputs.shape[2])
    print(feature_width)

    # Swap time and feature axes so the Dense layer acts along time.
    time_last = Permute((2, 1))(inputs)
    resized = Dense(DECODER_MAX_LENGTH, name='softmax_dense')(time_last)
    # Swap back before projecting the feature axis.
    time_first = Permute((2, 1), name='attention_vec')(resized)
    project = Dense(num_decoder_tokens, name='dim2_num_of_decoders', activation='softmax')
    return project(time_first)

# ---- Encoder -----------------------------------------------------------
# The encoder returns its full output sequence (fed to the attention
# sub-model) as well as its final states (initial state of the decoder).
encoder_inputs = Input(shape=(TIME_STEPS_ENCODER, num_encoder_tokens), name="encoder_inputs")
encoder = LSTM(latent_dim, return_state=True,return_sequences=True, name='endcoder_lstm')

encoder_output, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]
encoder_outputs = [encoder_output, state_h, state_c]

# ---- Attention sub-model -------------------------------------------------
# Input: (TIME_STEPS_ENCODER, latent_dim) encoder outputs.
# Output: (DECODER_MAX_LENGTH, num_decoder_tokens).
attention_inputs = Input(shape=(TIME_STEPS_ENCODER, latent_dim))
at = Permute((2, 1))(attention_inputs)
# Softmax over the time axis (last axis after the permute).
at = Dense(TIME_STEPS_ENCODER, activation='softmax',name='dense')(at)
at = Permute((2, 1), name='attention_vec')(at)

# Multiply/Concatenate replace the deprecated legacy `merge(..., mode=...)`.
output_attention_mul = keras.layers.Multiply(name='attention_mul')([attention_inputs, at])
hidden = keras.layers.Permute((2,1))(output_attention_mul)
hidden = keras.layers.Dense(DECODER_MAX_LENGTH, activation='relu')(hidden)
hidden = keras.layers.Permute((2,1))(hidden)
hidden = keras.layers.Dense(num_decoder_tokens)(hidden)

attention_model = Model(attention_inputs, hidden, name='attention_model')
attention_model.summary()

attention_result = attention_model(encoder_output)

# ---- Decoder ---------------------------------------------------------------
# The decoder LSTM reads the attention result concatenated with the
# teacher-forced caption along the last axis, starting from the encoder states.
decoder_inputs = Input(shape=(DECODER_MAX_LENGTH, num_decoder_tokens), name= "decoder_inputs")
decoder_con_inputs = keras.layers.Concatenate(name='output_mul')([attention_result, decoder_inputs])

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_con_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
result = decoder_dense(decoder_outputs)

# Model that turns `encoder_input_data` & `decoder_input_data`
# into `decoder_target_data`.
model = Model([encoder_inputs, decoder_inputs], result)
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)

# Stop once val_loss has not improved for 4 epochs.
earlystopping = EarlyStopping(monitor='val_loss', patience = 4, verbose=1, mode='min')

# Run training
opt = keras.optimizers.Adam(lr = 0.001)

model.compile(metrics=['accuracy'], optimizer=opt, loss='categorical_crossentropy')
# An earlier variant trained with model.fit_generator(data_generator(100), ...)
# and validation data from validation(); the in-memory arrays are used here.
try:
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.15,
              callbacks=[earlystopping])
except KeyboardInterrupt:
    # Allow stopping the run by hand without losing the notebook state.
    print("\nW: interrupt received, stopping…")



  name=name)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_13 (InputLayer)            (None, 80, 512)       0                                            
____________________________________________________________________________________________________
permute_7 (Permute)              (None, 512, 80)       0           input_13[0][0]                   
____________________________________________________________________________________________________
dense (Dense)                    (None, 512, 80)       6480        permute_7[0][0]                  
____________________________________________________________________________________________________
attention_vec (Permute)          (None, 80, 512)       0           dense[0][0]                      
___________________________________________________________________________________________

Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 00041: early stopping


In [18]:
def genModel():
    """Build the encoder/decoder model variant that merges attention at the output.

    The decoder LSTM reads only the teacher-forced caption (initialised with
    the encoder states). Its softmax output is concatenated with the output of
    the attention sub-model along the last axis and passed through a final
    softmax Dense layer named ``output_dense``.

    Relies on the module-level ``TIME_STEPS_ENCODER``, ``DECODER_MAX_LENGTH``,
    ``num_encoder_tokens``, ``num_decoder_tokens`` and ``latent_dim``. Prints
    the model summary.

    Returns:
        Uncompiled Keras ``Model`` taking ``[encoder_inputs, decoder_inputs]``.
    """
    encoder_inputs = Input(shape=(TIME_STEPS_ENCODER, num_encoder_tokens), name="encoder_inputs")
    encoder = LSTM(latent_dim, return_state=True,return_sequences=True, name='endcoder_lstm')

    # Both the output sequence (for attention) and the final states (for the
    # decoder's initial state) are used.
    encoder_output, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]

    # Attention sub-model: (TIME_STEPS_ENCODER, latent_dim) ->
    # (DECODER_MAX_LENGTH, num_decoder_tokens).
    attention_inputs = Input(shape=(TIME_STEPS_ENCODER, latent_dim))
    at = Permute((2, 1))(attention_inputs)
    at = Dense(TIME_STEPS_ENCODER, activation='softmax',name='dense')(at)
    at = Permute((2, 1), name='attention_vec')(at)

    # Multiply/Concatenate replace the deprecated legacy `merge(..., mode=...)`.
    output_attention_mul = keras.layers.Multiply(name='attention_mul')([attention_inputs, at])
    hidden = keras.layers.Permute((2,1))(output_attention_mul)
    hidden = keras.layers.Dense(DECODER_MAX_LENGTH, activation='relu')(hidden)
    hidden = keras.layers.Permute((2,1))(hidden)
    hidden = keras.layers.Dense(num_decoder_tokens)(hidden)

    attention_model = Model(attention_inputs, hidden, name='attention_model')

    # Decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(DECODER_MAX_LENGTH, num_decoder_tokens), name= "decoder_inputs")

    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                         initial_state=encoder_states)
    decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Combine decoder prediction and attention result, then re-project.
    attention_result = attention_model(encoder_output)
    result = keras.layers.Concatenate(name='output_mul')([decoder_outputs, attention_result])
    output_dense = Dense(num_decoder_tokens, activation='softmax', name='output_dense')
    output_dense = output_dense(result)

    # Model that turns `encoder_input_data` & `decoder_input_data`
    # into `decoder_target_data`.
    model = Model([encoder_inputs, decoder_inputs], output_dense)
    model.summary()
    return model

In [53]:
# Rebuild the network and restore the weights of a saved run.
model = genModel()
model.load_weights('models/' + LOAD_MODEL_NUM + '/model.h5')

# The last layer of the model is its final Dense projection.
output_dense = model.layers[-1]
print(output_dense.input_shape, output_dense.output_shape)

# Copy kernel and bias of that layer into a stand-alone one-layer model.
W, b = (np.array(weights) for weights in output_dense.get_weights()[:2])
out_model = Sequential()
out_model.add(Dense(1024, weights=[W, b], input_shape=(35, 2048)))
out_model.summary()



  name=name)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
encoder_inputs (InputLayer)      (None, 80, 4096)      0                                            
____________________________________________________________________________________________________
decoder_inputs (InputLayer)      (None, 35, 1024)      0                                            
____________________________________________________________________________________________________
endcoder_lstm (LSTM)             [(None, 80, 512), (No 9439232     encoder_inputs[0][0]             
____________________________________________________________________________________________________
decoder_lstm (LSTM)              [(None, 35, 512), (No 3147776     decoder_inputs[0][0]             
                                                                   endcoder_lstm[0][1]     

In [12]:
# Concatenate replaces the deprecated legacy `merge(..., mode='concat')`.
from keras.layers import Concatenate

# Inference-time encoder: features -> [output sequence, state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_outputs)

# At inference the decoder states are fed in explicitly instead of coming
# from the encoder inside the same graph.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# The attention result is likewise supplied as an input; it is concatenated
# with the decoder input in the same order as in the training graph.
att_in = Input(shape=(DECODER_MAX_LENGTH, num_decoder_tokens), name= "att_in")
decoder_concat_input = Concatenate()([att_in, decoder_inputs])

# Reuse the trained decoder LSTM and Dense layers.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_concat_input, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs + [att_in],
    [decoder_outputs] + decoder_states)

encoder_model.summary()
decoder_model.summary()
attention_model.summary()
# hidden_model.summary()







_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_inputs (InputLayer)  (None, 80, 4096)          0         
_________________________________________________________________
endcoder_lstm (LSTM)         [(None, 80, 512), (None,  9439232   
Total params: 9,439,232
Trainable params: 9,439,232
Non-trainable params: 0
_________________________________________________________________
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
att_in (InputLayer)              (None, 35, 1024)      0                                            
____________________________________________________________________________________________________
decoder_inputs (InputLayer)      (None, 35, 1024)      0                                            
_______________________

  name=name)


In [15]:

# Number the next run after the sub-directories that already exist in ./models/.
path, dirs, files = next(os.walk("./models/"))
MODEL_NUM = str(int(len(dirs)+1)) + "concat"

directory = os.path.join('models',MODEL_NUM)
if not os.path.exists(directory):
    os.makedirs(directory)
# NOTE(review): hard-coded override — everything below is written to
# ./models/10concat, not to the numbered directory created above. Confirm
# whether the override is still wanted.
directory = "./models/10concat"
# The directory actually written to may differ from the one created above.
os.makedirs(directory, exist_ok=True)
# decode_sequence(input_seq, encoder_model,decoder_model )
encoder_model.save(os.path.join(directory, 'encoder_model.h5'))
decoder_model.save_weights(os.path.join(directory, 'decoder_model_weights.h5'))
attention_model.save(os.path.join(directory, 'attention_model.h5'))
# hidden_model.save(os.path.join(directory, 'hidden_model.h5'))
model.save_weights(os.path.join(directory, 'model.h5'))


10concat


In [53]:
def att_decode_model(num_encoder_tokens, num_decoder_tokens, latent_dim):
    """Build an inference decoder that takes the attention result as an extra input.

    A new LSTM is created here; the output projection is the module-level
    ``decoder_dense`` layer, which must already exist.

    Args:
        num_encoder_tokens: unused; kept for a signature parallel to
            ``decode_model``.
        num_decoder_tokens: last dimension of the decoder and attention inputs.
        latent_dim: number of LSTM units and size of the state inputs.

    Returns:
        Keras ``Model`` with inputs ``[decoder_inputs, state_h, state_c, att_in]``
        and outputs ``[decoder_outputs, state_h, state_c]``.
    """
    # Concatenate replaces the deprecated legacy `merge(..., mode='concat')`.
    from keras.layers import Concatenate

    decoder_inputs = Input(shape=(DECODER_MAX_LENGTH, num_decoder_tokens))
    att_in = Input(shape=(DECODER_MAX_LENGTH, num_decoder_tokens), name= "att_in")
    decoder_concat_input = Concatenate()([att_in, decoder_inputs])

    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_concat_input, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    # NOTE(review): decoder_dense is read from module scope, not created here.
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs + [att_in],
        [decoder_outputs] + decoder_states)
    return decoder_model


def att_decode_sequence(input_seq, encoder_model, decoder_model, att_model):
    """Greedily decode 35 tokens for one input using the attention decoder.

    Relies on the module-level ``tokenizer`` and ``num_decoder_tokens``.

    Args:
        input_seq: input passed to ``encoder_model.predict``.
        encoder_model: model whose ``predict`` returns
            ``(output_sequence, state_h, state_c)``.
        decoder_model: model whose ``predict`` takes
            ``[target_seq, state_h, state_c, att_prob]`` and returns
            ``(output_tokens, state_h, state_c)``.
        att_model: model whose ``predict`` maps the encoder output sequence to
            the attention input of the decoder.

    Returns:
        List of 35 words; token id 0 is reported as ``'pad'``. Decoding does
        not stop at ``'eos'``.
    """
    return_seq, states_h, states_c = encoder_model.predict(input_seq)
    att_prob = att_model.predict(return_seq)

    # Invert the vocabulary once instead of rebuilding key/value lists on
    # every step (first word wins for an id, as with list.index()).
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    # NOTE(review): target_seq has a single time step while att_decode_model
    # declares DECODER_MAX_LENGTH-step inputs — confirm the decoder passed in
    # accepts this shape.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, tokenizer.word_index['bos']] = 1
    decoded_sentence = []

    # Fixed-length decoding: stop only once 35 tokens have been produced.
    while len(decoded_sentence) <= 34:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + [states_h, states_c] + [att_prob])

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        if sampled_token_index == 0:
            sampled_char = 'pad'
        else:
            sampled_char = index_to_word[int(sampled_token_index)]
        decoded_sentence.append(sampled_char)

        # Feed the sampled token back as the next decoder input.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the LSTM states forward; previously they were stored in an
        # unused variable, so every step restarted from the encoder states.
        states_h, states_c = h, c

    return decoded_sentence

def decode_model(num_encoder_tokens, num_decoder_tokens, latent_dim):
    """Build a stand-alone inference decoder with freshly created layers.

    Args:
        num_encoder_tokens: not used by this function.
        num_decoder_tokens: last dimension of the decoder input and width of
            the softmax output.
        latent_dim: number of LSTM units and size of the two state inputs.

    Returns:
        Keras ``Model`` with inputs ``[decoder_inputs, state_h, state_c]`` and
        outputs ``[decoder_outputs, state_h, state_c]``.
    """
    # Layers are created in the same order as before so that automatically
    # generated layer names do not change.
    token_input = Input(shape=(None, num_decoder_tokens))
    projection = Dense(num_decoder_tokens, activation='softmax')
    recurrent = LSTM(latent_dim, return_sequences=True, return_state=True)

    state_inputs = [Input(shape=(latent_dim,)), Input(shape=(latent_dim,))]

    sequence_out, hidden_state, cell_state = recurrent(
        token_input, initial_state=state_inputs)
    token_probs = projection(sequence_out)

    return Model(
        [token_input] + state_inputs,
        [token_probs] + [hidden_state, cell_state])


def decode_sequence(input_seq, encoder_model, decoder_model):
    """Greedily decode a caption until ``'eos'`` or more than 50 tokens.

    Relies on the module-level ``tokenizer`` and ``num_decoder_tokens``.

    Args:
        input_seq: input passed to ``encoder_model.predict``.
        encoder_model: model whose ``predict`` returns the list of initial
            decoder states.
        decoder_model: model whose ``predict`` takes ``[target_seq] + states``
            and returns ``(output_tokens, state_h, state_c)``.

    Returns:
        List of decoded words, including the final ``'eos'`` when one was
        produced; token id 0 is reported as an empty string.
    """
    states_value = encoder_model.predict(input_seq)

    # Invert the vocabulary once instead of rebuilding key/value lists on
    # every step (first word wins for an id, as with list.index()).
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    # Start from the one-hot 'bos' token.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, tokenizer.word_index['bos']] = 1

    decoded_sentence = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: most probable token of the last step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        if sampled_token_index == 0:
            sampled_char = ''
        else:
            sampled_char = index_to_word[int(sampled_token_index)]
        decoded_sentence.append(sampled_char)

        # Exit condition: either hit max length or find the stop token.
        if sampled_char == 'eos' or len(decoded_sentence) > 50:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return decoded_sentence

# Module-level state written by beam_search via `global`:
# the path stored for the highest probability product seen so far ...
decode_seq = None
# ... and that product; beam_search replaces both whenever it computes a larger product.
p_MAX = -1
def make_sen(sen):
    """Return the longest run of tokens that is closed by a ``"bos"``/``"eos"`` marker.

    The token list is split at every ``"eos"`` or ``"bos"``; a segment counts
    only once its closing marker has been seen, so trailing tokens after the
    last marker are ignored. On ties the earliest segment wins.

    Args:
        sen: sequence of token strings.

    Returns:
        The longest marker-free segment (a slice of ``sen``), or an empty list
        when ``sen`` contains no marker.
    """
    longest = []
    last_marker = -1  # index of the most recent marker; -1 means "before the start"

    for i, token in enumerate(sen):
        if token == "eos" or token == "bos":
            if len(longest) < i - last_marker - 1:
                longest = sen[last_marker + 1:i]
            # Advance on every marker, not only when a new longest segment was
            # found; otherwise a later segment would span earlier markers.
            last_marker = i

    return longest

def beam_search(model, X_test, target_seq, prob, path, lens):
    """Recursively explore the two most probable next words at each step.

    Every finished path (one whose next word is 'eos', or that has reached
    the depth limit) is scored by the product of its step probabilities.
    The best one is stored in the module-level ``decode_seq`` and its score
    in ``p_MAX``. Nothing is returned. ``p_MAX`` / ``decode_seq`` are not
    reset here, so the caller must re-initialise them between inputs.

    Relies on the module-level ``tokenizer`` and on ``functools`` /
    ``operator`` (imported locally because the file does not import them).

    NOTE(review): assumes ``model.predict([X_test, target_seq])`` returns an
    array indexable as ``[0, step, token]`` and that ``target_seq`` is the
    one-hot decoder input of shape ``(1, steps, tokens)`` with ``path``
    already encoded in its first ``len(path)`` positions -- the same layout
    ``decoder_str`` uses. Confirm against the model actually passed in.

    Args:
        model: Model taking ``[X_test, target_seq]``.
        X_test: Encoder input, passed through to ``model.predict``.
        target_seq: One-hot decoder input holding the tokens of ``path``.
        prob: Probabilities of the steps taken so far.
        path: Words chosen so far; expected to start with 'bos'.
        lens: Current recursion depth; expansion stops once it exceeds 9.
    """
    import functools
    import operator

    node = 2  # beam width: candidates expanded per step
    index_to_word = {
        index: word for word, index in tokenizer.word_index.items()
    }

    def expand(target_seq, prob, path, lens):
        global p_MAX
        global decode_seq

        # Distribution over the word that follows the last word of `path`.
        y_pred = np.asarray(model.predict([X_test, target_seq]))
        y_pred = y_pred[0, len(path) - 1, :]
        sampled_token_index = y_pred.argsort()[-node:][::-1]

        for token_index in sampled_token_index:
            token_index = int(token_index)
            if token_index == 0:
                sampled_char = ''
            else:
                sampled_char = index_to_word[token_index]

            prob_new = list(prob)
            prob_new.append(y_pred[token_index])

            if sampled_char != 'eos' and lens <= 9:
                path_new = list(path)
                path_new.append(sampled_char)
                # Feed this candidate (not just the arg-max) as the next
                # decoder input; copy so sibling branches stay independent.
                target_new = np.array(target_seq, copy=True)
                target_new[0, len(path), token_index] = 1
                expand(target_new, prob_new, path_new, lens + 1)
            else:
                p = functools.reduce(operator.mul, prob_new, 1)
                if p > p_MAX:
                    decode_seq = path
                    p_MAX = p

    expand(target_seq, prob, path, lens)

def decoder_str(model, Xtest):
    """Greedily decode a caption with a model fed the whole prefix each step.

    ``Xtest`` is reshaped to ``(1, 80, 4096)``. The decoder input ``Ydata``
    starts with a one-hot 'bos' at position 0; after every prediction the
    arg-max word for the current position is written into the next position
    of ``Ydata`` and appended to the answer.

    Relies on the module-level ``tokenizer``, ``num_decoder_tokens`` and
    ``DECODER_MAX_LENGTH``.

    Args:
        model: Model whose ``predict([Xtest, Ydata])`` output is indexed as
            ``[0, step, token]``.
        Xtest: Array reshapeable to ``(1, 80, 4096)``.

    Returns:
        list of str: starts with 'bos'; ends with 'eos' if it was predicted
        before the length limit (30 tokens, or ``DECODER_MAX_LENGTH`` if
        that is smaller). Index 0 is emitted as ``''``, matching
        ``decode_sequence``.
    """
    Xtest = Xtest.reshape(1, 80, 4096)
    Ydata = np.zeros((1, DECODER_MAX_LENGTH, num_decoder_tokens))
    Ydata[0, 0, tokenizer.word_index['bos']] = 1

    # Built once; the lookup used to rebuild two lists on every step.
    index_to_word = {
        index: word for word, index in tokenizer.word_index.items()
    }
    # Never write past the end of Ydata's time axis.
    max_len = min(30, DECODER_MAX_LENGTH)

    ans = ['bos']
    while len(ans) < max_len and ans[-1] != 'eos':
        y_pred = model.predict([Xtest, Ydata])
        index = int(np.argmax(y_pred[0, len(ans) - 1, :]))
        Ydata[0, len(ans), index] = 1
        # Index 0 has no entry in word_index; emit an empty word instead of
        # failing the lookup.
        ans.append('' if index == 0 else index_to_word[index])

    return ans

def getX_test(path):
    """Load every array file in ``path`` and return them with their names.

    Files are visited in sorted name order so the result is reproducible
    (``os.listdir`` makes no ordering guarantee).

    Args:
        path: Directory containing files loadable by ``np.load``.

    Returns:
        tuple: ``(X_test, X_test_filename)`` where ``X_test`` is
        ``np.array`` of the loaded arrays and ``X_test_filename`` is the
        list of file names with their final extension removed, in the same
        order.
    """
    X_test = []
    X_test_filename = []
    for filename in sorted(os.listdir(path)):
        X_test.append(np.load(os.path.join(path, filename)))
        # Drop only the last extension, e.g. "clip.avi.npy" -> "clip.avi".
        X_test_filename.append(os.path.splitext(filename)[0])
    return np.array(X_test), X_test_filename

# Root folder of the data set used below for the test features.
data_directory = "MLDS_hw2_data"

# Restore the tokenizer object stored with joblib and report the size of
# its vocabulary.
with open('data/tokinzer1024', 'rb') as file:
    tokenizer = joblib.load(file)
print(len(tokenizer.word_index))

# Count the immediate sub-directories of ./models/ and print that count.
path, dirs, files = next(os.walk("./models/"))
LOAD_MODEL_NUM = str(len(dirs))
print(LOAD_MODEL_NUM)

# The printed count is informational only: the run to load is pinned here.
LOAD_MODEL_NUM = "10concat"
_model_prefix = 'models/' + LOAD_MODEL_NUM + '/'

# Encoder and attention models are loaded whole; the decoder is rebuilt by
# att_decode_model and only its weights are read from disk.
enc_model = load_model(_model_prefix + 'encoder_model.h5')
dec_model = att_decode_model(num_encoder_tokens, num_decoder_tokens, latent_dim)
dec_model.load_weights(_model_prefix + 'decoder_model_weights.h5')
att_model = load_model(_model_prefix + 'attention_model.h5')
# hid_model = load_model('models/'+LOAD_MODEL_NUM+'/hidden_model.h5')

# Test features and their file names (extension stripped).
X_test, X_test_filename = getX_test(
    os.path.join(data_directory, "testing_data/feat/"))




6018
10


  name=name)
  return cls(**config)


In [54]:
# special_vid = ["klteYv1Uv9A_27_33.avi", "5YJaS2Eswg0_22_26.avi", "UbmZAe5u5FI_132_141.avi", "JntMAcTlOF0_50_70.avi", "tJHUH9tpqPg_113_118.avi"]

# Decode a caption for every test clip, clean it up and print it.
# NOTE: opening in 'w' mode creates/truncates resultc.csv even though the
# write at the bottom of the loop is currently disabled.
with open("resultc.csv", 'w') as file:
    # Sentence markers that must never appear in the printed caption.
    filter_string = ['bos', 'eos']

    for idx, x in enumerate(X_test):
        decoded_sentence = decoder_str(model, x)
        # Alternative decoders, currently disabled:
        # decoded_sentence = att_decode_sequence(x.reshape(-1, 80, 4096), enc_model,dec_model,att_model)
        # decoded_sentence = decode_sequence(x.reshape(-1, 80, 4096), enc_model,dec_model)

        counter = {}
        kept_words = []
        for c in decoded_sentence:
            counter[c] = counter.get(c, 0) + 1

            # Suppress repetitions: 'a' is kept for its first four
            # occurrences, every other word only for its first. This also
            # covers immediately repeated words, so no separate
            # previous-token check is needed.
            if counter[c] >= (5 if c == 'a' else 2):
                continue
            # Drop the sentence markers and empty (padding) words.
            if c in filter_string or not c:
                continue
            kept_words.append(c)

        # Every kept word is followed by a single space, including the last.
        decode_str = ''.join(word + ' ' for word in kept_words)
        print(X_test_filename[idx] + ' > ' + decode_str)
        # file.write(X_test_filename[idx] + ',' + decode_str[:-1] + '\n')

04Gt01vatkk_248_265.avi > a woman is slicing a potato 
04Gt01vatkk_308_321.avi > a man is slicing a pork chop 
0lh_UWF9ZP4_27_31.avi > a person is slicing a potato 
0lh_UWF9ZP4_62_69.avi > a woman is adding eggs in a bowl 
1Sp2__RCT0c_11_15.avi > a man is dancing in a 
30GeJHYoerk_121_126.avi > a man is riding a horse 
3qqEKTPxLNs_1_15.avi > a baby is playing a 
4PcL6-mjRNk_11_18.avi > a man is riding a 
4xVGpDmA4lE_23_33.avi > a man is running in a 
5HAf_INrFy0_3_25.avi > a man is sitting on a large of a 
5YJaS2Eswg0_22_26.avi > a man is riding a horse 
6JnGBs88sL0_4_10.avi > a man is riding a horse 
6q1dX6thX3E_286_295.avi > a man is playing a 
71soiLO6I9U_15_24.avi > a man is eating 
HV12kTtdTT4_5_14.avi > a man is riding a cigarette 
IhwPQL9dFYc_124_129.avi > a woman is slicing a pork chop 
inzk2fTUe1w_1_15.avi > a man is slicing a potato 
J---aiyznGQ_0_6.avi > a man is playing a piano 
j2Dhf-xFUxU_13_20.avi > a woman is slicing a potato 
Jag7oTemldY_12_25.avi > a group of are 
jbz