# Code to train and test

### Write the captions from json file:

In [1]:
import json
import os, os.path
import pickle

# Load the dataset annotation file; the context manager closes the handle
# as soon as the JSON has been parsed.
with open('videodatainfo_2017.json', 'r') as info_file:
    train_val = json.load(info_file)

# Every entry of 'sentences' is read below through its 'video_id' key.
sentences = train_val['sentences']

# Group the annotation records by numeric video id so that all captions of
# a video can be fetched with a single dictionary lookup.
itoa = {}
for s in sentences:
    # The first five characters of the id are dropped before int();
    # presumably ids look like 'video123' -- confirm against the JSON file.
    videoid = int(s['video_id'][5:])
    itoa.setdefault(videoid, []).append(s)

# Persist the grouped captions for the later cells. The directory is created
# on demand so that a fresh checkout does not fail on the open() call.
os.makedirs('./DATA/word_features', exist_ok=True)
with open('./DATA/word_features/captions.pkl', 'wb') as output:
    pickle.dump(itoa, output)


### Auxiliary functions to handle captions

In [2]:
import numpy as np

"""Functions to do the following:
            * Create vocabulary
            * Create dictionary mapping from word to word_id
            * Map words in captions to word_ids"""

def build_vocab(word_count_thresh, captions_per_video=20):
    """Count caption words and drop those rarer than a threshold.

    The grouped captions are read from './DATA/word_features/captions.pkl'.
    Each caption is wrapped as '<BOS> ... <EOS>' and split on single spaces
    before counting, so the two markers are counted as words as well.

        Input:
                word_count_thresh: Words whose count is strictly below this
                                   value are removed from the result
                captions_per_video: At most this many captions are read per
                                    video (default 20, the number the
                                    original loop was hard-coded to);
                                    None reads all of them
        Output:
                word_counts: Dictionary mapping each kept word to its count
                unk_required: True if at least one word was removed, i.e. an
                              unknown-word token is needed to encode captions
    """
    # NOTE: pickle.load executes arbitrary code from the file; only load a
    # captions.pkl that was produced locally.
    with open('./DATA/word_features/captions.pkl', 'rb') as pkl_file:
        sentences = pickle.load(pkl_file)

    word_counts = {}
    for video_captions in sentences.values():
        # Slicing tolerates videos that have fewer captions than the limit.
        for entry in video_captions[:captions_per_video]:
            caption = '<BOS> ' + entry['caption'] + ' <EOS>'
            for word in caption.split(' '):
                word_counts[word] = word_counts.get(word, 0) + 1

    # Collect the rare words first: deleting from a dict while iterating
    # over it raises RuntimeError on Python 3.
    rare_words = [word for word, count in word_counts.items()
                  if count < word_count_thresh]
    for word in rare_words:
        del word_counts[word]
    unk_required = bool(rare_words)
    return word_counts,unk_required

def word_to_word_ids(word_counts,unk_required, vocab_size):
    """Assign an integer id to each of the most frequent words.

        Input:
                word_counts: Dictionary with words mapped to their counts
                unk_required: Accepted for call compatibility; not read here
                vocab_size: Number of most frequent words that receive an id
        Output:
                word_to_id: Dictionary with words mapped to their id's
                id_to_word: List where position i holds the word with id i

    Ids follow descending count, except that '<EOS>' is swapped into id 0.
    A KeyError is raised if '<EOS>' is not among the selected words.
    """
    vocab = np.array(list(word_counts.keys()))
    freqs = list(word_counts.values())

    # argsort is ascending, so reverse it and keep the first vocab_size entries
    descending = np.argsort(freqs)[::-1][0:vocab_size]
    id_to_word = list(vocab[descending])
    word_to_id = {token: position for position, token in enumerate(id_to_word)}

    # Report where '<EOS>' landed and which word currently holds id 0
    eos_index = word_to_id['<EOS>']
    print(eos_index)
    first_word = id_to_word[0]
    print(eos_index,first_word)

    # Swap the two so that '<EOS>' owns id 0
    word_to_id['<EOS>'], word_to_id[first_word] = 0, eos_index
    id_to_word[0], id_to_word[eos_index] = '<EOS>', first_word

    return word_to_id,id_to_word

def convert_caption(caption,word_to_id,max_caption_length):
    """Function to map each word in a caption to its id and to build caption masks.

    Every caption is wrapped as '<BOS> ... <EOS>', cut down when it has
    max_caption_length words or more, and then right-padded with '<EOS>'
    up to exactly max_caption_length words.

        Input:
                caption: A single caption string or a list of caption strings
                word_to_id: Dictionary mapping words to their respective id's
                max_caption_length: Number of word ids produced per caption
        Output:
                caps: Array of shape (n_captions, max_caption_length) of word id's
                cap_masks: Array of the same shape with 1.0 at the positions of
                           real words (including '<BOS>' and the closing
                           '<EOS>') and 0.0 at pad positions
        Raises:
                KeyError: a word is missing from word_to_id and word_to_id
                          has no '<UNK>' entry to fall back on
    """
    # The original compared type(caption) with the string 'str', which is
    # never true, so a bare string was iterated character by character.
    if isinstance(caption, str):
        caption = [caption]

    caps,cap_masks = [],[]
    for cap in caption:
        cap = '<BOS> ' + cap + ' <EOS>'
        n_words = cap.count(' ') + 1
        if n_words >= max_caption_length:
            # Keep the first max_caption_length-2 words and re-append '<EOS>',
            # which always leaves at least one pad slot at the end.
            kept = cap.split(' ')[0:(max_caption_length-2)]
            cap = ' '.join(kept) + ' <EOS>'
            n_words = cap.count(' ') + 1
        n_pad = max_caption_length - n_words
        cap = cap + ' <EOS>'*n_pad
        cap_masks.append([1.0]*n_words + [0.0]*n_pad)
        caps.append([word_to_id[word] if word in word_to_id else word_to_id['<UNK>']
                     for word in cap.split(' ')])
    return np.array(caps),np.array(cap_masks)

### Train / Test / Validation Split

In [3]:
## Get the list of the files we have extracted features
import os
from sklearn.model_selection import train_test_split

# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make the seeded splits below reproducible across machines and runs.
# NOTE(review): sorting changes which videos land in which split compared
# with an unsorted listing; checkpoints trained on an earlier split may have
# seen videos that are now in the validation or test set.
FEATURE_SUFFIX = '-30-features.npy'
video_list = sorted(os.listdir('./DATA/features'))

# Keep only feature files (skips hidden files or checkpoint folders) and
# reduce each file name to the part before the first '-'.
videos = [item.split('-')[0] for item in video_list if item.endswith(FEATURE_SUFFIX)]

# 10% of all videos are held out for testing, then 10% of the remainder
# for validation.
video_train, video_test = train_test_split(videos, test_size=0.1, random_state=42)
video_train, video_val = train_test_split(video_train, test_size=0.1, random_state=42)

In [4]:
# Report how many videos ended up in each split.
for split_label, split_videos in (('Training Videos -', video_train),
                                  ('Testing Videos -', video_test),
                                  ('Validation Videos -', video_val)):
    print(split_label, len(split_videos))

Training Videos - 5890
Testing Videos - 728
Validation Videos - 655


### Auxiliary functions to handle model build

In [5]:
import numpy as np
import tensorflow as tf
import glob
import cv2
import imageio
import pickle
# Seed numpy's global RNG; the batch-fetching helpers draw from it.
np.random.seed(0)

# Global initializations
n_lstm_steps = 30  # length every caption is padded / cut to by convert_caption
DATA_DIR = './DATA/'
VIDEO_DIR = DATA_DIR + 'features/'
YOUTUBE_CLIPS_DIR = DATA_DIR + 'videos/'
TEXT_DIR = DATA_DIR+'word_features/'

# Captions grouped by numeric video id, as written by the first cell.
# NOTE: pickle.load executes arbitrary code from the file; only load a
# captions.pkl that was produced locally.
with open(TEXT_DIR + 'captions.pkl', 'rb') as pkl_file:
    sentences = pickle.load(pkl_file)

# A threshold of 0 removes no word, and every counted word receives an id.
word_counts,unk_required = build_vocab(0)
word2id,id2word = word_to_word_ids(word_counts,unk_required, len(word_counts.keys()))

# Pools the batch helpers sample from.
video_files = video_train
val_files = video_val

print ("{0} files processed".format(len(video_files)))

def get_bias_vector():
    """Build the initial bias for the hidden_dim -> vocab_size projection.

    Entry i is the log relative frequency of the word with id i (looked up
    through the module-level id2word and word_counts), shifted so that the
    largest entry is 0.
    Borrowed from neuraltalk by Andrej Karpathy"""
    counts = np.array([1.0*word_counts[id2word[idx]] for idx in range(len(id2word))])
    log_freq = np.log(counts / np.sum(counts))  # counts -> frequencies -> log
    return log_freq - np.max(log_freq)

def _fetch_batch(file_pool, batch_size):
    """Sample batch_size videos (with replacement) from file_pool and load
    their features plus one randomly chosen caption each.

    Returns (features, caption ids, caption masks, sampled video names)."""
    batch_vids = np.random.choice(file_pool,batch_size)
    feats = np.array([np.load(VIDEO_DIR + vid+'-30-features' + '.npy') for vid in batch_vids])
    # The numeric part of the name (after its first five characters) is the
    # key into the grouped captions; one caption record is drawn per video.
    captions = [np.random.choice(sentences[int(vid[5:])],1)[0]['caption'] for vid in batch_vids]
    caps,masks = convert_caption(captions,word2id,n_lstm_steps)
    return feats,caps,masks,batch_vids

def fetch_data_batch(batch_size):
    """Function to fetch a batch of video features, captions and caption masks
    from the training pool (video_files).
        Input:
                batch_size: Size of batch to load
        Output:
                curr_vids: Features of the randomly selected batch of video_files
                curr_caps: Ground truth (padded) captions for the selected videos
                curr_masks: Mask for the pad locations in curr_caps"""
    curr_vids,curr_caps,curr_masks,_ = _fetch_batch(video_files,batch_size)
    return curr_vids,curr_caps,curr_masks

def fetch_data_batch_val(batch_size):
    """Function to fetch a batch of video features from the validation set and its captions.
        Input:
                batch_size: Size of batch to load
        Output:
                curr_vids: Features of the randomly selected batch of val_files
                curr_caps: Ground truth (padded) captions for the selected videos
                curr_masks: Mask for the pad locations in curr_caps
                curr_batch_vids: Names of the sampled videos, in batch order"""
    return _fetch_batch(val_files,batch_size)


def print_in_english(caption_idx):
    """Print captions given as sequences of word ids.

    Ids are translated through the module-level id2word; each caption is cut
    at its first '<EOS>' and printed with a 1-based running number, followed
    by a dotted separator line."""
    # Decode everything up front so a bad id fails before anything is printed
    decoded = []
    for id_sequence in caption_idx:
        decoded.append([id2word[word_id] for word_id in id_sequence])

    for number, words in enumerate(decoded, start=1):
        if '<EOS>' in words:
            words = words[0:words.index('<EOS>')]
        print (str(number) + ' ' + ' '.join(words))
        print ('..................................................')

def playVideo(video_urls):
    """Play the first video named in video_urls in an OpenCV window.

        Input:
                video_urls: Sequence of video names (without extension);
                            only video_urls[0] is played, loaded from
                            YOUTUBE_CLIPS_DIR as an .mp4 file

    Playback stops at the end of the clip or when 'q' is pressed."""
    video = imageio.get_reader(YOUTUBE_CLIPS_DIR + video_urls[0] + '.mp4','ffmpeg')
    try:
        for frame in video:
            # Reverse the channel order of each decoded frame before display.
            fr = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
            cv2.imshow('frame',fr)
            # Wait 40 ms per frame; 'q' aborts playback.
            if cv2.waitKey(40) & 0xFF == ord('q'):
                break
    finally:
        # Release the reader and the window even if playback is interrupted
        # or raises.
        video.close()
        cv2.destroyAllWindows()

2
2 a
5890 files processed


In [12]:
# Number of words that were assigned an id.
print(len(word2id))

20001


In [57]:
# Spot check: draw one random caption record for a single video and show its
# text, using the same lookup as the batch-fetching helpers.
tmp_val = 'video3707'
np.random.choice(sentences[int(tmp_val[5:])],1)[0]['caption']

'a woman showing how to make cookies'

In [25]:
# Spot check: shape of the stored feature array of one video.
tdata = np.load(VIDEO_DIR+'video0-30-features.npy')
tdata.shape

(30, 2048)

In [18]:
# Number of distinct words that were counted in the captions.
len(word_counts.keys())

29325

In [60]:
# Sanity check of the id mapping: the word stored at id 0 and the id of 'a'.
print(id2word[0], word2id['a'])

<EOS> 1


### Build the model to train

In [9]:
import numpy as np
import tensorflow as tf
import sys
# Global hyper-parameters read by build_model(), train() and test()
n_steps = 30        # time steps per video in the 'Input_Video' placeholder
hidden_dim = 500    # LSTM size; also the width of the frame and word embeddings
frame_dim = 2048    # length of one per-frame feature vector
# NOTE(review): the training log recorded further down reports Batch_size:30,
# so this value was presumably different for that run. test() fetches a
# single video per step and therefore needs batch_size == 1.
batch_size = 1
vocab_size = len(word2id)  # one logit / embedding row per word id
bias_init_vector = get_bias_vector()  # initial value of the output bias b_H2vocab
n_steps_vocab = 30  # caption length in the 'GT_Caption' / 'Caption_Mask' placeholders

def build_model():
    """Build the captioning graph in the current default TensorFlow graph.

    The function creates:
            * placeholders for a batch of videos, captions, caption masks and
              the dropout keep probability
            * weights mapping frames to hidden_dim (W_im2cap, b_im2cap)
            * weights mapping the hidden state to the vocabulary
              (W_H2vocab, b_H2vocab)
            * the word embedding matrix
            * two LSTMs: one over the frame embeddings, one over the
              concatenation of word embeddings and the first LSTM's outputs

    All sizes come from the module-level n_steps, n_steps_vocab, hidden_dim,
    frame_dim, batch_size and vocab_size; b_H2vocab is initialised from the
    module-level bias_init_vector.

    Returns:
            video: float32 placeholder [batch_size, n_steps, frame_dim]
            caption: int32 placeholder [batch_size, n_steps_vocab]
            caption_mask: float32 placeholder [batch_size, n_steps_vocab]
            output_logits: vocabulary logits with one row per batch entry and
                           predicted position (n_steps_vocab-1 per caption)
            loss: mask-weighted mean cross entropy (scalar)
            dropout_prob: placeholder fed as keep probability to every
                          dropout op in the graph
    """

    print ("Network config: \nN_Steps: {}\nHidden_dim:{}\nFrame_dim:{}\nBatch_size:{}\nVocab_size:{}\n".format(n_steps,
                                                                                                    hidden_dim,
                                                                                                    frame_dim,
                                                                                                    batch_size,
                                                                                                    vocab_size))

    #Create placeholders for holding a batch of videos, captions and caption masks
    video = tf.placeholder(tf.float32,shape=[batch_size,n_steps,frame_dim],name='Input_Video')
    caption = tf.placeholder(tf.int32,shape=[batch_size,n_steps_vocab],name='GT_Caption')
    caption_mask = tf.placeholder(tf.float32,shape=[batch_size,n_steps_vocab],name='Caption_Mask')
    dropout_prob = tf.placeholder(tf.float32,name='Dropout_Keep_Probability')

    # Frame features -> hidden_dim projection
    with tf.variable_scope('Im2Cap') as scope:
        W_im2cap = tf.get_variable(name='W_im2cap',shape=[frame_dim,
                                                    hidden_dim],
                                                    initializer=tf.random_uniform_initializer(minval=-0.08,maxval=0.08))
        b_im2cap = tf.get_variable(name='b_im2cap',shape=[hidden_dim],
                                                    initializer=tf.constant_initializer(0.0))
    # Hidden state -> vocabulary projection; the bias starts from bias_init_vector
    with tf.variable_scope('Hid2Vocab') as scope:
        W_H2vocab = tf.get_variable(name='W_H2vocab',shape=[hidden_dim,vocab_size],
                                                         initializer=tf.random_uniform_initializer(minval=-0.08,maxval=0.08))
        b_H2vocab = tf.Variable(name='b_H2vocab',initial_value=bias_init_vector.astype(np.float32))

    # One hidden_dim-sized embedding row per word id
    with tf.variable_scope('Word_Vectors') as scope:
        word_emb = tf.get_variable(name='Word_embedding',shape=[vocab_size,hidden_dim],
                                                                initializer=tf.random_uniform_initializer(minval=-0.08,maxval=0.08))
    print ("Created weights")

    #Build two LSTMs, one for processing the video and another for generating the caption
    with tf.variable_scope('LSTM_Video',reuse=None) as scope:
        lstm_vid = tf.nn.rnn_cell.BasicLSTMCell(hidden_dim)
        lstm_vid = tf.nn.rnn_cell.DropoutWrapper(lstm_vid,output_keep_prob=dropout_prob)
    with tf.variable_scope('LSTM_Caption',reuse=None) as scope:
        lstm_cap = tf.nn.rnn_cell.BasicLSTMCell(hidden_dim)
        lstm_cap = tf.nn.rnn_cell.DropoutWrapper(lstm_cap,output_keep_prob=dropout_prob)

    #Prepare input for lstm_video
    # Flatten to one frame per row, apply dropout, project to hidden_dim and
    # restore the [batch, time, hidden] layout.
    video_rshp = tf.reshape(video,[-1,frame_dim])
    video_rshp = tf.nn.dropout(video_rshp,keep_prob=dropout_prob)
    video_emb = tf.nn.xw_plus_b(video_rshp,W_im2cap,b_im2cap)
    video_emb = tf.reshape(video_emb,[batch_size,n_steps,hidden_dim])
    # n_steps frame embeddings followed by n_steps-1 all-zero steps
    padding = tf.zeros([batch_size,n_steps-1,hidden_dim])
    video_input = tf.concat([video_emb,padding],1)
    #video_input=video_emb
    print ("Video_input: {}".format(video_input.get_shape()))
    #Run lstm_vid for 2*n_steps-1 timesteps
    with tf.variable_scope('LSTM_Video') as scope:
        out_vid,state_vid = tf.nn.dynamic_rnn(lstm_vid,video_input,dtype=tf.float32)
    print ("Video_output: {}".format(out_vid.get_shape()))

    #Prepare input for lstm_cap
    # n_steps_vocab all-zero steps followed by the embeddings of every caption
    # token except the last one
    padding = tf.zeros([batch_size,n_steps_vocab,hidden_dim])
    caption_vectors = tf.nn.embedding_lookup(word_emb,caption[:,0:n_steps_vocab-1])
    caption_vectors = tf.nn.dropout(caption_vectors,keep_prob=dropout_prob)
    caption_2n = tf.concat([padding,caption_vectors],1)
    #caption_2n = caption_vectors
    # Concatenation along the feature axis: both tensors need the same number
    # of time steps, which holds only when n_steps == n_steps_vocab.
    caption_input = tf.concat([caption_2n,out_vid],2)
    print ("Caption_input: {}".format(caption_input.get_shape()))
    #Run lstm_cap for 2*n_steps-1 timesteps
    with tf.variable_scope('LSTM_Caption') as scope:
        out_cap,state_cap = tf.nn.dynamic_rnn(lstm_cap,caption_input,dtype=tf.float32)
    print ("Caption_output: {}".format(out_cap.get_shape()))

    #Compute masked loss
    # Only the time steps from index n_steps_vocab onward (those that received
    # word embeddings as input) are scored.
    output_captions = out_cap[:,n_steps_vocab:,:]
    output_logits = tf.reshape(output_captions,[-1,hidden_dim])
    output_logits = tf.nn.dropout(output_logits,keep_prob=dropout_prob)
    output_logits = tf.nn.xw_plus_b(output_logits,W_H2vocab,b_H2vocab)
    # Labels are the caption shifted left by one token: the step fed token t
    # is scored against token t+1.
    output_labels = tf.reshape(caption[:,1:],[-1])
    caption_mask_out = tf.reshape(caption_mask[:,1:],[-1])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output_logits,labels=output_labels)
    # Zero out the loss at masked positions and average over the rest
    masked_loss = loss*caption_mask_out
    loss = tf.reduce_sum(masked_loss)/tf.reduce_sum(caption_mask_out)
    return video,caption,caption_mask,output_logits,loss,dropout_prob

# Last batch that was fed to the network (features, caption ids, masks);
# kept at module level so it can be inspected after a failure.
db1 = None
db2 = None
db3 = None
def train(ckpt_path='./ckpt_v4/model_58000.ckpt', save_dir='./ckpt_v5/'):
    """Train the captioning model, optionally resuming from a checkpoint.

        Input:
                ckpt_path: Checkpoint prefix to restore before training.
                           Pass None (or '') to start from freshly
                           initialised variables instead.
                save_dir: Directory that receives 'model_<iteration>.ckpt'
                          every 2000 iterations

    Every 1000 iterations the current training loss is printed, one
    validation batch is evaluated with dropout disabled, and its first two
    predicted and ground-truth captions are printed.
    Iteration numbers, and therefore checkpoint names, start at 0 even when
    training resumes from a checkpoint."""
    import os  # local import: this cell does not import os itself

    global db1,db2,db3
    with tf.Graph().as_default():
        learning_rate = 0.0001
        video,caption,caption_mask,output_logits,loss,dropout_prob = build_model()
        optim = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss)
        nEpoch = 300
        nIter = int(nEpoch*6000/batch_size)

        # Created after the optimizer so that its variables are saved and
        # restored together with the model weights.
        saver = tf.train.Saver()
        with tf.Session() as sess:
            if ckpt_path:
                # Restore through the Saver of the graph built above.
                # Importing the checkpoint's .meta file at this point would
                # add a second copy of the network to this graph.
                saver.restore(sess,ckpt_path)
                print ("Restored model")
            else:
                sess.run(tf.global_variables_initializer())
            for i in range(nIter):
                vids,caps,caps_mask = fetch_data_batch(batch_size=batch_size)
                db1,db2,db3 = vids, caps, caps_mask
                # Training step with a dropout keep probability of 0.5
                _,curr_loss = sess.run([optim,loss],feed_dict={video:vids,
                                                               caption:caps,
                                                               caption_mask:caps_mask,
                                                               dropout_prob:0.5})

                if i%1000 == 0:
                    print ("\nIteration {} \n".format(i))
                    print ("Current train loss: {} ".format(curr_loss))
                    vids,caps,caps_mask,_ = fetch_data_batch_val(batch_size=batch_size)
                    db1,db2,db3 = vids,caps,caps_mask
                    # Evaluation only: no optimizer step, dropout disabled
                    curr_loss,o_l = sess.run([loss,output_logits],feed_dict={video:vids,
                                                                            caption:caps,
                                                                            caption_mask:caps_mask,
                                                                            dropout_prob:1.0})
                    # Logits come back flattened; restore [batch, position, vocab]
                    out_logits = o_l.reshape([batch_size,n_steps_vocab-1,vocab_size])
                    output_captions = np.argmax(out_logits,2)
                    print_in_english(output_captions[0:2])
                    print ("GT Captions")
                    print_in_english(caps[0:2])
                    print ("Current validation loss: {} ".format(curr_loss))

                if i%2000 == 0:
                    saver.save(sess,os.path.join(save_dir,'model_'+str(i)+'.ckpt'))
                    print ('Saved {}'.format(i))

### Training Begins !!!

In [7]:
# Start training: losses and sample captions are printed every 1000
# iterations and a checkpoint is written every 2000 iterations.
train()

Network config: 
N_Steps: 30
Hidden_dim:500
Frame_dim:2048
Batch_size:30
Vocab_size:29325

Created weights
Video_input: (30, 59, 500)
Video_output: (30, 59, 500)
Caption_input: (30, 59, 1000)
Caption_output: (30, 59, 500)
INFO:tensorflow:Restoring parameters from ./ckpt_v4/model_58000.ckpt
Restored model

Iteration 0 

Current train loss: 5.710939407348633 
1 man are <BOS>
..................................................
2 man men are <BOS> <BOS> on on <BOS>
..................................................
GT Captions
1 <BOS> men drawing guns and then football player playing
..................................................
2 <BOS> two men driving a ferrari
..................................................
Current validation loss: 6.091744422912598 
Saved 0

Iteration 1000 

Current train loss: 3.989447593688965 
1 a are a
..................................................
2 a are on a
..................................................
GT Captions
1 <BOS> montage of baseball play

1 a talking talking and playing
..................................................
2 a animated cartoon is at
..................................................
GT Captions
1 <BOS> women are singing and dancing
..................................................
2 <BOS> an old man looks around suspiciously
..................................................
Current validation loss: 3.7483720779418945 
Saved 16000

Iteration 17000 

Current train loss: 3.6277105808258057 
1 a man is around a video
..................................................
2 a are in and costumes
..................................................
GT Captions
1 <BOS> a man running in the wilderness
..................................................
2 <BOS> children dressed up in costumes
..................................................
Current validation loss: 3.73954176902771 

Iteration 18000 

Current train loss: 3.2740445137023926 
1 a animated man is in a a video
.............................................

Saved 32000

Iteration 33000 

Current train loss: 3.666600227355957 
1 a cartoon character is to a
..................................................
2 a of to and a background and
..................................................
GT Captions
1 <BOS> a cartoon character speaks to people
..................................................
2 <BOS> numbers go up in the right corner
..................................................
Current validation loss: 3.9967732429504395 

Iteration 34000 

Current train loss: 3.3666536808013916 
1 a man is a hat shirt is a man of a
..................................................
2 a man of on stage stage show
..................................................
GT Captions
1 <BOS> a man in a black tshirt shows a clip of halle sallasee running a race in slow motion
..................................................
2 <BOS> a couple dances on a tv show
..................................................
Current validation loss: 3.937234401702881 
Save


Iteration 50000 

Current train loss: 3.4279513359069824 
1 two men playing playing a ping of ping tennis
..................................................
2 a man is playing a the online video game game
..................................................
GT Captions
1 <BOS> two player teams play a game of table tennis against one another
..................................................
2 <BOS> a person is playing on an xbox one video game console
..................................................
Current validation loss: 3.692199468612671 
Saved 50000

Iteration 51000 

Current train loss: 3.653903007507324 
1 a players are running a scoring a goal
..................................................
2 a woman is in a
..................................................
GT Captions
1 <BOS> soccer team is playing and makes a goal everyone cheers
..................................................
2 <BOS> a doll dressed in bridal weal holding a goat in hand
...............................

### Testing

In [10]:
def test(ckpt_path='./ckpt_v5/model_58000.ckpt'):
    """Interactively caption random validation videos with a trained model.

        Input:
                ckpt_path: Checkpoint prefix to restore. Pass None (or '')
                           to run with freshly initialised variables.

    For each sampled video the caption is generated greedily, one word per
    forward pass, and the partial caption is printed after every word.
    Generation stops at '<EOS>' or once the caption buffer is full. The
    ground-truth caption is printed afterwards and the user is asked whether
    to play the video and whether to run again.

        Raises:
                ValueError: the module-level batch_size is not 1; the graph
                            is built for batch_size videos but a single
                            video is fed per step."""
    if batch_size != 1:
        raise ValueError('test() feeds one video at a time; set batch_size = 1 '
                         '(got {})'.format(batch_size))
    with tf.Graph().as_default():
        # Inference only: no optimizer is built, so the Saver covers just the
        # model variables, a subset of what training checkpoints contain.
        video,caption,caption_mask,output_logits,loss,dropout_prob = build_model()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            if ckpt_path:
                # Restore through the Saver of the graph built above.
                # Importing the checkpoint's .meta file at this point would
                # add a second copy of the network to this graph.
                saver.restore(sess,ckpt_path)
                print ("Restored model")
            else:
                sess.run(tf.global_variables_initializer())
            while True:
                vid,caption_GT,_,current_batch_vids = fetch_data_batch_val(1)
                # Caption buffer seeded with '<BOS>'; every later slot is
                # overwritten below with the predicted word.
                caps,caps_mask = convert_caption(['<BOS>'],word2id,n_steps_vocab)

                # The graph yields n_steps_vocab-1 predictions, for caption
                # slots 1..n_steps_vocab-1. Looping any further would index
                # past the end of both arrays.
                for i in range(n_steps_vocab-1):
                    o_l = sess.run(output_logits,feed_dict={video:vid,
                                                            caption:caps,
                                                            caption_mask:caps_mask,
                                                            dropout_prob:1.0})
                    out_logits = o_l.reshape([batch_size,n_steps_vocab-1,vocab_size])
                    output_captions = np.argmax(out_logits,2)
                    caps[0][i+1] = output_captions[0][i]
                    print_in_english(caps)
                    if id2word[output_captions[0][i]] == '<EOS>':
                        break
                print ('............................\nGT Caption:\n')
                print_in_english(caption_GT)
                play_video = input('Should I play the video? ')
                if play_video.lower() == 'y':
                    playVideo(current_batch_vids)
                test_again = input('Want another test run? ')
                if test_again.lower() == 'n':
                    break
test()

Network config: 
N_Steps: 30
Hidden_dim:500
Frame_dim:2048
Batch_size:1
Vocab_size:29325

Created weights
Video_input: (1, 59, 500)
Video_output: (1, 59, 500)
Caption_input: (1, 59, 1000)
Caption_output: (1, 59, 500)
INFO:tensorflow:Restoring parameters from ./ckpt_v5/model_58000.ckpt
Restored model
1 <BOS> a
..................................................
1 <BOS> a person
..................................................
1 <BOS> a person is
..................................................
1 <BOS> a person is cooking
..................................................
1 <BOS> a person is cooking a
..................................................
1 <BOS> a person is cooking a dish
..................................................
1 <BOS> a person is cooking a dish in
..................................................
1 <BOS> a person is cooking a dish in a
..................................................
1 <BOS> a person is cooking a dish in a pot
.............................

### Attention

In [None]:
# coding: utf-8

# In[1]:

import tensorflow as tf
import numpy as np
import json
import os

from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers import Dense, Activation, Input, GRU, Dropout
from keras.optimizers import RMSprop
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing import sequence
from keras.models import Model
from keras import backend as K
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

from Attention import Attention_Layer
from Multimodel_layer import Multimodel_Layer


# In[2]:

class Caption_Generator:
    """Caption generator trained on pre-extracted per-video feature arrays.

    Typical use: ``create_vocabulary()`` once, then ``train()``.  ``train``
    walks the training videos in shuffled mini-batches; for every batch it
    loads captions and features (``read_data``), repeats each video's features
    once per caption (``transform_inputs``), one-hot encodes the captions
    (``one_of_N_encoding``), projects them with a fixed dense layer
    (``embedding_layer``) and fits the Keras model from ``build_model``.
    """

    # Locations of the dataset and of the artefacts written by this class.
    TRAIN_LABELS_PATH = 'MLDS_HW2/MLDS_hw2_data/training_label.json'
    TRAIN_FEATURES_DIR = "MLDS_HW2/MLDS_hw2_data/training_data/feat/"
    TEST_LABELS_PATH = 'MLDS_HW2/MLDS_hw2_data/testing_public_label.json'
    TEST_FEATURES_DIR = "MLDS_HW2/MLDS_hw2_data/testing_data/feat/"
    MODEL_DIR = "Sentence_Generator_Model_Results"
    RESULTS_DIR = "Model_Results/Results"

    # Shape of one video's ``.npy`` feature array (presumably frames x feature
    # dimension -- TODO confirm against the feature extractor).
    FEATURE_SHAPE = (80, 4096)
    # Number of training videos indexed by ``train``.
    N_TRAIN_VIDEOS = 1450
    # Upper bound on the number of samples ``test`` decodes and writes out.
    MAX_TEST_SAMPLES = 200

    def __init__(self):
        self.captions = []                 # tokenised captions of the current batch
        self.captions_in_each_video = []   # caption count per video of the current batch
        self.word2id = {}
        self.id2word = {}
        self.max_sentence_length = 0
        self.vocabulary_size = 0
        self.batch_size = 25               # videos per mini-batch in train()
        self.embedding_output_shape = 512
        self.embedding_weights = None      # set by embedding_layer()
        self._embedding_model = None       # cached projection, see embedding_layer()

    ################################################################################################
    @staticmethod
    def _tokenize(caption):
        """Wrap a raw caption in <s>/<e> markers, lower-case it and split on single spaces."""
        return ("<s> " + caption + " <e>").lower().split(' ')

    @staticmethod
    def _load_features(feature_dir, video_ids):
        """Load ``<feature_dir><id>.npy`` for every id into one (n,) + FEATURE_SHAPE array."""
        video_features = np.zeros((len(video_ids),) + Caption_Generator.FEATURE_SHAPE)
        for row, video_id in enumerate(video_ids):
            video_features[row] = np.load(feature_dir + video_id + ".npy")
        return video_features

    @staticmethod
    def _expand_features(video_features, captions_per_video, n_rows):
        """Repeat each video's features once per caption, in caption order.

        Rows beyond ``n_rows`` are dropped; rows not covered by
        ``captions_per_video`` stay zero.
        """
        expanded = np.zeros((n_rows,) + tuple(video_features.shape[1:]))
        offset = 0
        for video_index, n_captions in enumerate(captions_per_video):
            # A running offset places video i's rows right after video i-1's
            # instead of overwriting rows 0..n-1 for every video.
            expanded[offset:offset + n_captions] = video_features[video_index]
            offset += n_captions
        return expanded

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` (and parents) if it does not exist yet."""
        if not os.path.isdir(path):
            os.makedirs(path)

    ################################################################################################
    def read_data(self, n_batch):
        """Load captions and features of the training videos indexed by ``n_batch``.

        Appends the tokenised captions to ``self.captions`` (the caller resets
        that list per batch) and replaces ``self.captions_in_each_video`` with
        the caption count of each selected video.

        Args:
            n_batch: iterable of integer indices into the training label list.

        Returns:
            Array of shape (len(n_batch),) + FEATURE_SHAPE with one row per video.
        """
        print("loading Data for new Batch... ")

        #reading captions
        with open(self.TRAIN_LABELS_PATH) as data_file:
            training_labels = json.load(data_file)

        files = []
        self.captions_in_each_video = []
        for i in n_batch:
            entry = training_labels[i]
            files.append(entry['id'])
            for caption in entry['caption']:
                self.captions.append(self._tokenize(caption))
            self.captions_in_each_video.append(len(entry['caption']))

        #reading video features
        video_features = self._load_features(self.TRAIN_FEATURES_DIR, files)

        print("Data Loaded Successfully.....")

        return video_features

    ################################################################################################
    def create_vocabulary(self):
        """Build ``word2id``/``id2word`` from every training caption.

        Also sets ``max_sentence_length`` (longest tokenised caption plus one)
        and ``vocabulary_size``.  Ids are handed out in order of first
        appearance, so "<s>" receives id 0 on a fresh instance.
        """
        print("creating vocabulary...")
        with open(self.TRAIN_LABELS_PATH) as data_file:
            training_labels = json.load(data_file)

        labels = [self._tokenize(caption)
                  for entry in training_labels
                  for caption in entry['caption']]

        self.max_sentence_length = 1 + max(len(caption) for caption in labels)
        print("\t Max sentence length : ", self.max_sentence_length)

        #computing word2id and id2word vocabulary
        # Continue after any ids already assigned so a second call cannot hand
        # out an id that is already in use.
        index = len(self.word2id)
        for caption in labels:
            for word in caption:
                if word not in self.word2id:
                    self.word2id[word] = index
                    self.id2word[index] = word
                    index += 1

        self.vocabulary_size = len(self.word2id)

    ################################################################################################
    def transform_inputs(self, video_features):
        """Repeat each video's features so there is one row per caption.

        Uses ``self.captions_in_each_video`` for the repeat counts and
        ``len(self.captions)`` for the number of output rows.

        Args:
            video_features: array with one leading row per video.

        Returns:
            Array of shape (len(self.captions),) + video_features.shape[1:].
        """
        #transforming the no of samples of video features equal to no of samples of captions
        return self._expand_features(video_features,
                                     self.captions_in_each_video,
                                     len(self.captions))

    ################################################################################################
    def one_of_N_encoding(self):
        """One-hot encode ``self.captions`` and the matching next-word targets.

        Returns:
            (encoded_tensor, label_tensor), both float16 of shape
            (numCaptions, max_sentence_length, vocabulary_size).
            ``label_tensor[i, j]`` is the one-hot of word j+1 of caption i;
            positions at and after the last word stay all-zero.
        """
        print("encoding inputs...")
        tensor_shape = (len(self.captions), self.max_sentence_length, self.vocabulary_size)
        encoded_tensor = np.zeros(tensor_shape, dtype=np.float16)
        label_tensor = np.zeros(tensor_shape, dtype=np.float16)
        #one-hot-encoding
        for i, caption in enumerate(self.captions):
            word_ids = [self.word2id[word] for word in caption]
            for j, word_id in enumerate(word_ids):
                encoded_tensor[i, j, word_id] = 1
                if j < len(word_ids) - 1:
                    label_tensor[i, j, word_ids[j + 1]] = 1

        return encoded_tensor, label_tensor

    ################################################################################################
    def embedding_layer(self, input_data):
        """Project one-hot captions to ``embedding_output_shape`` dimensions.

        The projection is an untrained Dense+ReLU layer.  It is created on the
        first call and reused afterwards so that every batch is embedded with
        the same weights; building a new randomly initialised layer per call
        would give each batch a different, unrelated embedding.

        Side effect: stores ``self.embedding_weights`` as
        [projection kernel transposed, vector of ones of length vocabulary_size],
        which ``build_model`` passes as initial weights to the output layer.

        NOTE(review): the projection is not saved with the model, so a new
        Python session gets a different random embedding -- confirm whether it
        should be persisted alongside the weights.

        Args:
            input_data: array of shape (n, max_sentence_length, vocabulary_size).

        Returns:
            Array of shape (n, max_sentence_length, embedding_output_shape).
        """
        print("embedding inputs....")
        expected_shape = (self.max_sentence_length, self.vocabulary_size)
        model = getattr(self, '_embedding_model', None)
        # Rebuild only when there is no cached layer or the vocabulary changed.
        if model is None or tuple(model.input_shape[1:]) != expected_shape:
            model = Sequential()
            model.add(Dense(self.embedding_output_shape, input_shape = expected_shape))
            model.add(Activation('relu'))
            model.compile('rmsprop','mse')
            self._embedding_model = model

        output_array = model.predict(input_data)
        kernel = np.asarray(model.get_weights()[0])
        self.embedding_weights = [kernel.T, np.ones((self.vocabulary_size,))]
        return output_array

    ################################################################################################
    def data_preprocessing(self, n_batch):
        """Produce the model inputs and targets for one batch of videos.

        Args:
            n_batch: iterable of integer indices into the training label list.

        Returns:
            (video_features, embedded_input, label_tensor), each with one
            leading row per caption of the batch.
        """
        video_features = self.read_data(n_batch)
        video_features = self.transform_inputs(video_features)

        # one-hot encoding of captions
        encoded_tensor, label_tensor = self.one_of_N_encoding()

        # embedding the one-hot encoding of each word into embedding_output_shape dims
        embedded_input = self.embedding_layer(encoded_tensor)

        return video_features, embedded_input, label_tensor

    ################################################################################################
    def build_model(self, video_features, embedded_input):
        """Build and compile the sentence generator.

        Layer order: GRU over the embedded caption, the project's
        ``Attention_Layer`` on [GRU output, video features], the project's
        ``Multimodel_Layer`` on [GRU output, attention output], dropout, a
        time-distributed tanh Dense and a softmax over the vocabulary that is
        initialised from ``self.embedding_weights``.

        Args:
            video_features: array whose shape[1:] defines the video input shape.
            embedded_input: array whose shape[1:] defines the caption input shape.

        Returns:
            The compiled Keras ``Model`` taking [caption_input, video_input].
        """
        print('Building Sentence Generator Model...')

        caption_shape = (embedded_input.shape[1], embedded_input.shape[2])
        video_shape = (video_features.shape[1], video_features.shape[2])

        input1 = Input(shape=caption_shape, dtype='float32')
        input2 = Input(shape=video_shape, dtype='float32')

        layer1 = GRU(512, return_sequences = True, input_shape = caption_shape, activation = 'relu')(input1)

        attention_layer = Attention_Layer(output_dim = 32)([layer1, input2])

        multimodel_layer = Multimodel_Layer(output_dim = 1024)([layer1,attention_layer])

        dropout = Dropout(0.5)(multimodel_layer)

        layer2 = TimeDistributed(Dense(activation = 'tanh', units = 512))(dropout)

        softmax_layer = Dense(units = self.vocabulary_size, activation = 'softmax', weights = self.embedding_weights)(layer2)

        model = Model(inputs = [input1, input2], outputs = [softmax_layer])

        # RMSprop with learning rate 0.001; categorical_crossentropy is the
        # usual loss for a softmax classification output.
        model.compile(loss = 'categorical_crossentropy', optimizer = RMSprop(lr=0.001))

        # summary() prints by itself and returns None, so it is not wrapped in print().
        model.summary()

        return model

    ################################################################################################
    def train(self):
        """Train for 10 epochs over all training videos in shuffled mini-batches.

        The model is built lazily from the shapes of the first batch.  A weight
        checkpoint is written during fitting and the full weights are saved
        after every epoch via ``save_model``.

        Returns:
            The trained Keras model.
        """
        batches = np.arange(self.N_TRAIN_VIDEOS)
        # Integer ceiling division: a trailing partial batch is still trained on.
        n_iterations = (len(batches) + self.batch_size - 1) // self.batch_size
        self._ensure_dir(self.MODEL_DIR)

        model = None
        for epoch in range(10):
            print("\n\n\nEpoch : ", epoch+1)
            np.random.shuffle(batches)
            for iteration in range(n_iterations):
                start = iteration * self.batch_size
                # Slicing clamps at the end of the array, so the last batch
                # keeps its final video (a [start:-1] slice would drop it).
                n_batch = batches[start:start + self.batch_size]
                self.captions = []
                video_features, embedded_input, label_tensor = self.data_preprocessing(n_batch)
                if model is None:
                    model = self.build_model(video_features, embedded_input)
                # define the checkpoint
                filepath = os.path.join(self.MODEL_DIR, "word-weights-improvement-{epoch:02d}-{loss:.4f}.hdf5")
                checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
                callbacks_list = [checkpoint]

                print("\n\n###########Training the model on epoch : ", epoch+1, " batch : ", iteration+1, " ###########\n\n")
                model.fit(x = [embedded_input,video_features], y = label_tensor, batch_size = 256, epochs= 5, callbacks = callbacks_list)
            self.save_model(model,epoch)

        return model

    ################################################################################################
    def save_model(self, model, epoch):
        """Save ``model``'s weights to MODEL_DIR/model_epoch_<epoch>.h5."""
        self._ensure_dir(self.MODEL_DIR)
        filename = os.path.join(self.MODEL_DIR, "model_epoch_"+str(epoch)+".h5")
        # serialize weights to HDF5
        model.save_weights(filename)
        print("Saved model to disk")

    ################################################################################################
    def load_model(self, model, epoch):
        """Load the weights saved for ``epoch`` into ``model`` and return it."""
        filename = os.path.join(self.MODEL_DIR, "model_epoch_"+str(epoch)+".h5")
        model.load_weights(filename)
        print("Loaded model from disk")
        return model

    ################################################################################################
    def test(self, model, epoch):
        """Generate text for the public test set and append it to a results file.

        At most MAX_TEST_SAMPLES (video, caption) rows are decoded.  Each row
        is fed a caption input that contains only the "<s>" token, and the
        arg-max word at every output position is written out.

        NOTE(review): prediction is a single forward pass, not word-by-word
        decoding, so positions after the first never see a previously
        generated word -- confirm this is intended.
        NOTE(review): video features are fed in the same per-video layout used
        in ``train`` so the saved weights match the model's input shapes.

        Args:
            model: an already built model to load the weights into, or None to
                build one from the test input shapes.
            epoch: epoch number selecting the weight file and the output file.
        """
        print("word : ",self.id2word[0])
        with open(self.TEST_LABELS_PATH) as data_file:
            testing_labels = json.load(data_file)

        files = []
        test_captions = []
        self.captions_in_each_video = []
        for entry in testing_labels:
            files.append(entry['id'])
            for caption in entry['caption']:
                test_captions.append(caption.lower().split(' '))
            # Store the caption count, not the last loop index.
            self.captions_in_each_video.append(len(entry['caption']))

        n_samples = min(self.MAX_TEST_SAMPLES, len(test_captions))

        # Every sample starts from the begin-of-sentence token only.
        encoded_tensor = np.zeros((n_samples, self.max_sentence_length, self.vocabulary_size), dtype=np.float16)
        encoded_tensor[:, 0, self.word2id.get('<s>', 0)] = 1

        print("number of files : ",len(files))
        #reading video features
        video_features = self._load_features(self.TEST_FEATURES_DIR, files)
        print("shape : ", video_features.shape[1:])

        # One feature row per test caption, truncated to the rows actually decoded.
        new_features = self._expand_features(video_features,
                                             self.captions_in_each_video,
                                             n_samples)

        embedded_input = self.embedding_layer(encoded_tensor)

        print("embedded_input : ", embedded_input.shape)
        print("video_features : ", new_features.shape)

        if model is None:
            model = self.build_model(new_features, embedded_input)
        model = self.load_model(model, epoch)

        output = model.predict([embedded_input, new_features])

        self._ensure_dir(self.RESULTS_DIR)
        results_path = os.path.join(self.RESULTS_DIR, "generated_text_epoch"+str(epoch)+".txt")
        with open(results_path, "a") as fileHandler:
            for i in range(output.shape[0]):
                text = ""
                for j in range(output.shape[1]):
                    word = int(np.argmax(output[i,j,:]))
                    text += self.id2word[word]
                    text += " "
                # write() takes a single string, so the line is formatted first.
                fileHandler.write("Generated text for example %d : %s\n" % (i, text))

    ################################################################################################



# In[3]:

# Create the generator with its default hyper-parameters (see Caption_Generator.__init__).
caption_generator = Caption_Generator()


# In[ ]:

# Build word2id / id2word, max_sentence_length and vocabulary_size from the
# training labels. This has to run before train(), whose one-hot encoding step
# reads all of them.
caption_generator.create_vocabulary()


# In[ ]:

# Run the full training loop; train() returns the fitted Keras model.
model = caption_generator.train()


# In[ ]:

In [2]:
!top

[?1h=[H[2J[mtop - 06:22:12 up  1:41,  4 users,  load average: 1.11, 0.35, 0.12[m[m[m[m[K
Tasks:[m[m[1m 193 [m[mtotal,[m[m[1m   1 [m[mrunning,[m[m[1m 192 [m[msleeping,[m[m[1m   0 [m[mstopped,[m[m[1m   0 [m[mzombie[m[m[m[m[K
%Cpu(s):[m[m[1m  0.4 [m[mus,[m[m[1m  0.3 [m[msy,[m[m[1m  0.0 [m[mni,[m[m[1m 99.0 [m[mid,[m[m[1m  0.3 [m[mwa,[m[m[1m  0.0 [m[mhi,[m[m[1m  0.0 [m[msi,[m[m[1m  0.0 [m[mst[m[m[m[m[K
KiB Mem :[m[m[1m 26752224 [m[mtotal,[m[m[1m 24606748 [m[mfree,[m[m[1m   584804 [m[mused,[m[m[1m  1560672 [m[mbuff/cache[m[m[m[m[K
KiB Swap:[m[m[1m        0 [m[mtotal,[m[m[1m        0 [m[mfree,[m[m[1m        0 [m[mused.[m[m[1m 25745880 [m[mavail Mem [m[m[m[m[K
[K
[7m  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND     [m[m[K
[m 3645 narain.+  20   0  304328  56088  12332 S   6.7  0.2   0:01.78 jupyter-no+ [m[m[K
[m    1 root

[m   14 root      rt   0       0      0      0 S   0.0  0.0   0:00.00 watchdog/1  [m[m[K[H[mtop - 06:22:24 up  1:41,  4 users,  load average: 0.94, 0.34, 0.12[m[m[m[m[K

%Cpu(s):[m[m[1m  0.4 [m[mus,[m[m[1m  0.1 [m[msy,[m[m[1m  0.0 [m[mni,[m[m[1m 99.3 [m[mid,[m[m[1m  0.2 [m[mwa,[m[m[1m  0.0 [m[mhi,[m[m[1m  0.0 [m[msi,[m[m[1m  0.0 [m[mst[m[m[m[m[K
KiB Mem :[m[m[1m 26752224 [m[mtotal,[m[m[1m 24606180 [m[mfree,[m[m[1m   583892 [m[mused,[m[m[1m  1562152 [m[mbuff/cache[m[m[m[m[K
KiB Swap:[m[m[1m        0 [m[mtotal,[m[m[1m        0 [m[mfree,[m[m[1m        0 [m[mused.[m[m[1m 25746772 [m[mavail Mem [m[m[m[m[K
[K

[m 3741 narain.+  20   0  598636  47232  11280 S   1.7  0.2   0:00.98 python3     [m[m[K
[m    1 root      20   0  119952   6100   3976 S   0.0  0.0   0:07.39 systemd     [m[m[K
[m    2 root      20   0       0      0      0 S   0.0  0.0   0:00.00 kthreadd    [m[m[K
[