In [1]:
import tensorflow as tf
from tensorflow.python.client import device_lib

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
device_lib.list_local_devices()

Num GPUs Available:  1


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 5531667082195390905
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 4152360960
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 16909906045692684308
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5"
 xla_global_id: 416903419]

# **Imports**


In [1]:
import shutil
import random
import tqdm
import numpy as np
import cv2
import os
import json
import time
# from vgg16 import VGG16
from keras.applications.vgg16 import VGG16
from keras.models import Model, load_model
from keras.preprocessing.text import Tokenizer
from keras.layers import Input, LSTM, Dense
from keras.utils.data_utils import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import joblib
import config

# **Extract Features**


In [2]:
def video_to_frames(video):
    path = os.path.join(config.train_path, 'temporary_images')
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)

    video_path = os.path.join(config.train_path, 'video', video)
    count = 0
    image_list = []

    # Path to video file
    cap = cv2.VideoCapture(video_path)
    while cap.isOpened():
        ret, frame = cap.read()
        if ret is False:
            break
        
        cv2.imwrite(os.path.join(config.train_path, 'temporary_images', 'frame%d.jpg' % count), frame)
        image_list.append(os.path.join(config.train_path, 'temporary_images', 'frame%d.jpg' % count))
        count += 1

    cap.release()
    cv2.destroyAllWindows()
    return image_list

In [7]:
def model_cnn_load():
    model = VGG16(weights="imagenet", include_top=True, input_shape=(224, 224, 3))
    out = model.layers[-2].output
    model_final = Model(inputs=model.input, outputs=out)
    return model_final

vgg_model = model_cnn_load()
vgg_model.save('vgg16.h5')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [3]:
vgg_model = load_model('vgg16.h5')



In [4]:
def load_image(path):
    img = cv2.imread(path)
    img = cv2.resize(img, (224, 224))
    return img


def extract_features(video, model):
    """
    :param video: The video whose frames are to be extracted to convert into a numpy array
    :param model: the pretrained vgg16 model
    :return: numpy array of size 4096x80
    """
    video_id = video.split(".")[0]
    print(video_id)
    print(f'Processing video {video}')

    image_list = video_to_frames(video)
    samples = np.round(np.linspace(0, len(image_list) - 1, 80))
    image_list = [image_list[int(sample)] for sample in samples]
    images = np.zeros((len(image_list), 224, 224, 3))
    for i in range(len(image_list)):
        img = load_image(image_list[i])
        images[i] = img
    images = np.array(images)
    
    fc_feats = model.predict(images, batch_size=128)
    img_feats = np.array(fc_feats)
    
    # cleanup
    shutil.rmtree(os.path.join(config.train_path, 'temporary_images'))
    return img_feats


def extract_feats_pretrained_cnn():
    """
    saves the numpy features from all the videos
    """

    if not os.path.isdir(os.path.join(config.train_path, 'feat')):
        os.mkdir(os.path.join(config.train_path, 'feat'))

    video_list = os.listdir(os.path.join(config.train_path, 'video'))
    
    #ًWhen running the script on Colab an item called '.ipynb_checkpoints' 
    #is added to the beginning of the list causing errors later on, so the next line removes it.
    # video_list.remove('.ipynb_checkpoints')
    
    for video in video_list:

        outfile = os.path.join(config.train_path, 'feat', video + '.npy')
        img_feats = extract_features(video, vgg_model)
        np.save(outfile, img_feats)

In [12]:
extract_feats_pretrained_cnn()

-4wsuPCjDBc_5_15
Processing video -4wsuPCjDBc_5_15.avi
-7KMZQEsJW4_205_208
Processing video -7KMZQEsJW4_205_208.avi
-8y1Q0rA3n8_108_115
Processing video -8y1Q0rA3n8_108_115.avi
-8y1Q0rA3n8_95_102
Processing video -8y1Q0rA3n8_95_102.avi
-9CUm-2cui8_39_44
Processing video -9CUm-2cui8_39_44.avi
-AwoiGR6c8M_10_14
Processing video -AwoiGR6c8M_10_14.avi
-bjOB4zS0uE_100_105
Processing video -bjOB4zS0uE_100_105.avi
-Cv5LsqKUXc_17_25
Processing video -Cv5LsqKUXc_17_25.avi
-Cv5LsqKUXc_71_76
Processing video -Cv5LsqKUXc_71_76.avi
-DKuLXYoY3g_14_20
Processing video -DKuLXYoY3g_14_20.avi
-dm-ds5rRaM_44_52
Processing video -dm-ds5rRaM_44_52.avi
-DRy7rBg0IQ_31_37
Processing video -DRy7rBg0IQ_31_37.avi
-FugkxLmGO4_5_16
Processing video -FugkxLmGO4_5_16.avi
-joBOHlg5J0_72_80
Processing video -joBOHlg5J0_72_80.avi
-mAoVOhKy0c_4_9
Processing video -mAoVOhKy0c_4_9.avi
-Ms9tsWmhyU_80_95
Processing video -Ms9tsWmhyU_80_95.avi
-pUwIypksfE_13_23
Processing video -pUwIypksfE_13_23.avi
-rkErLY0rRc_26_35
Process

# **CLEANING AND PREPROCESSING CAPTIONS**


In [9]:
train_path = config.train_path
TRAIN_LABEL_PATH = os.path.join(train_path, 'training_label.json')
# mentioning the train test split
train_split = 0.85
# loading the json file for training
with open(TRAIN_LABEL_PATH) as data_file:    
    y_data = json.load(data_file)

len(y_data)

1450

In [10]:
# train_list contains all the captions with their video ID
# vocab_list contains all the vocabulary from training data
train_list = []
vocab_list = []
for y in y_data:
    for caption in y['caption']:
        caption = "<bos> " + caption + " <eos>"
        # we are only using sentences whose length lie between 6 and 10
        if len(caption.split())>10 or len(caption.split())<6:
            continue
        else:
            train_list.append([caption, y['id']])
print(len(train_list))

random.shuffle(train_list)
training_list = train_list[:int(len(train_list)*train_split)]
validation_list = train_list[int(len(train_list)*train_split):]
for train in training_list:
    vocab_list.append(train[0])
# Tokenizing the words
tokenizer = Tokenizer(num_words=1500)
tokenizer.fit_on_texts(vocab_list)

x_data = {}
TRAIN_FEATURE_DIR = os.path.join(config.train_path, 'feat')
# Loading all the numpy arrays at once and saving them in a dictionary
for filename in os.listdir(TRAIN_FEATURE_DIR):
    if filename == '.gitignore':
        continue
    f = np.load(os.path.join(TRAIN_FEATURE_DIR, filename), allow_pickle=True)
    x_data[filename[:-4]] = f

len(x_data)

17216


1970

# **MODEL FOR TRAINING**

In [11]:
"""
time_steps_encoder is the number of frames per video we will be using for training
num_encoder_tokens is the number of features from each frame
latent_dim is the number of hidden features for lstm
time_steps_decoder is the maximum length of each sentence
num_decoder_tokens is the final number of tokens in the softmax layer
batch size
"""
time_steps_encoder=80
num_encoder_tokens=4096
latent_dim=512
time_steps_decoder=10
num_decoder_tokens=1500
batch_size=320

encoder_inputs = Input(shape=(time_steps_encoder, num_encoder_tokens), name="encoder_inputs")
encoder = LSTM(latent_dim, return_state=True,return_sequences=True, name='endcoder_lstm')
_, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]


# Set up the decoder
decoder_inputs = Input(shape=(time_steps_decoder, num_decoder_tokens), name= "decoder_inputs")
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_relu')
decoder_outputs = decoder_dense(decoder_outputs)


model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 80, 4096)]   0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, 10, 1500)]   0           []                               
                                                                                                  
 endcoder_lstm (LSTM)           [(None, 80, 512),    9439232     ['encoder_inputs[0][0]']         
                                 (None, 512),                                                     
                                 (None, 512)]                                                     
                                                                                              

# **LOADING THE DATASET**

In [12]:
def load_dataset(training_list):
    """
    Loads the dataset in batches for training
    :return: batch of data
    """
    encoder_input_data = []
    decoder_input_data = []
    decoder_target_data = []
    videoId = []
    videoSeq = []
    for idx, cap in enumerate(training_list):
        caption = cap[0]
        videoId.append(cap[1])
        videoSeq.append(caption)
    train_sequences = tokenizer.texts_to_sequences(videoSeq)
    train_sequences = np.array(train_sequences)
    train_sequences = pad_sequences(train_sequences, padding='post', truncating='post', maxlen=config.max_length)
    file_size = len(train_sequences)
    n = 0
    for i in range(config.epochs):
        for idx in range(0, file_size):
            n += 1
            encoder_input_data.append(x_data[videoId[idx]])
            y = to_categorical(train_sequences[idx], config.num_decoder_tokens)
            decoder_input_data.append(y[:-1])
            decoder_target_data.append(y[1:])
            if n == config.batch_size:
                encoder_input = np.array(encoder_input_data)
                decoder_input = np.array(decoder_input_data)
                decoder_target = np.array(decoder_target_data)
                encoder_input_data = []
                decoder_input_data = []
                decoder_target_data = []
                n = 0
                yield ([encoder_input, decoder_input], decoder_target)

# **TRAIN THE MODEL**

In [13]:
train = load_dataset(training_list)
valid = load_dataset(validation_list)

early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1, mode='min')

# Run training
opt = Adam(lr=0.0003)
reduce_lr = ReduceLROnPlateau(monitor="val_loss",
                            factor=0.1, patience=5, verbose=0,
                            mode="auto")
model.compile(metrics=['accuracy'], optimizer=opt, loss='categorical_crossentropy')

validation_steps = len(validation_list)//config.batch_size
steps_per_epoch = len(training_list)//config.batch_size

model.fit(train, validation_data=valid, validation_steps=validation_steps,
            epochs=config.epochs, steps_per_epoch=steps_per_epoch,
            callbacks=[reduce_lr, early_stopping])

  super().__init__(name, **kwargs)
  train_sequences = np.array(train_sequences)


Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 65: early stopping


<keras.callbacks.History at 0x1ecf58e78e0>

In [15]:
if not os.path.exists(config.save_model_path):
    os.makedirs(config.save_model_path)

encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(config.latent_dim,))
decoder_state_input_c = Input(shape=(config.latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)
encoder_model.summary()
decoder_model.summary()

# saving the models
encoder_model.save(os.path.join(config.save_model_path, 'encoder_model.h5'))
decoder_model.save_weights(os.path.join(config.save_model_path, 'decoder_model_weights.h5'))
with open(os.path.join(config.save_model_path, 'tokenizer' + str(config.num_decoder_tokens)), 'wb') as file:
    joblib.dump(tokenizer, file)

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_inputs (InputLayer)  [(None, 80, 4096)]       0         
                                                                 
 endcoder_lstm (LSTM)        [(None, 80, 512),         9439232   
                              (None, 512),                       
                              (None, 512)]                       
                                                                 
Total params: 9,439,232
Trainable params: 9,439,232
Non-trainable params: 0
_________________________________________________________________
Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_inputs (InputLayer)    [(None, 10, 1500)]   0           []                               

# **TEST**

In [2]:
with open(os.path.join(config.save_model_path, 'tokenizer' + str(config.num_decoder_tokens)), 'rb') as file:
        tokenizer = joblib.load(file)
    # loading encoder model. This remains the same
inf_encoder_model = load_model(os.path.join(config.save_model_path, 'encoder_model.h5'))

# inference decoder model loading
decoder_inputs = Input(shape=(None, config.num_decoder_tokens))
decoder_dense = Dense(config.num_decoder_tokens, activation='softmax')
decoder_lstm = LSTM(config.latent_dim, return_sequences=True, return_state=True)
decoder_state_input_h = Input(shape=(config.latent_dim,))
decoder_state_input_c = Input(shape=(config.latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
inf_decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)
inf_decoder_model.load_weights(os.path.join(config.save_model_path, 'decoder_model_weights.h5'))



In [6]:
def index_to_word():
    # inverts word tokenizer
    index_to_word = {value: key for key, value in tokenizer.word_index.items()}
    return index_to_word

def greedy_search(loaded_array):
    """

    :param f: the loaded numpy array after creating videos to frames and extracting features
    :return: the final sentence which has been predicted greedily
    """
    inv_map = index_to_word()
    states_value = inf_encoder_model.predict(loaded_array.reshape(-1, 80, 4096))
    target_seq = np.zeros((1, 1, 1500))
    sentence = ''
    target_seq[0, 0, tokenizer.word_index['bos']] = 1
    for i in range(15):
        output_tokens, h, c = inf_decoder_model.predict([target_seq] + states_value)
        states_value = [h, c]
        output_tokens = output_tokens.reshape(config.num_decoder_tokens)
        y_hat = np.argmax(output_tokens)
        if y_hat == 0:
            continue
        if inv_map[y_hat] is None:
            break
        else:
            sentence = sentence + inv_map[y_hat] + ' '
            target_seq = np.zeros((1, 1, 1500))
            target_seq[0, 0, y_hat] = 1
    return ' '.join(sentence.split()[:-1])

In [7]:
def get_test_data():
    """
    loads all the numpy files
    :return: two lists containing all the video arrays and the video Id
    """
    X_test = []
    X_test_filename = []
    with open(os.path.join(config.test_path, 'testing_id.txt')) as testing_file:
        lines = testing_file.readlines()
        for filename in lines:
            filename = filename.strip()
            f = np.load(os.path.join(config.train_path, 'feat', filename + '.npy'))
            X_test.append(f)
            X_test_filename.append(filename[:-4])
        X_test = np.array(X_test)
    return X_test, X_test_filename

In [8]:
X_test, X_test_filename = get_test_data()

# generate inference test outputs
with open(os.path.join(config.test_path, 'test_%s.txt' % config.search_type), 'w') as file:
    for idx, x in enumerate(X_test):
        file.write(X_test_filename[idx] + ',')
        if config.search_type == 'greedy':
            start = time.time()
            decoded_sentence = greedy_search(x.reshape(-1, 80, 4096))
            file.write(decoded_sentence + ',{:.2f}'.format(time.time()-start))
        # else:
        #     start = time.time()
        #     decoded_sentence = decode_sequence2bs(x.reshape(-1, 80, 4096))
        #     decode_str = decoded_sentence_tuning(decoded_sentence)
        #     for d in decode_str:
        #         file.write(d + ' ')
        #     file.write(',{:.2f}'.format(time.time() - start))
        file.write('\n')

        # re-init max prob
        config.max_probability = -1

  if config.search_type is 'greedy':


