In [None]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding,BatchNormalization, Dropout, Input, TimeDistributed, Dense, add, Merge, RepeatVector, Activation, Flatten
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from tqdm import tqdm
import numpy as np
import pandas as pd
import pickle
from keras.preprocessing import image
import keras
from keras import backend 
from keras.models import load_model
import time
from PIL import Image
from keras.utils import plot_model
from nltk.translate.bleu_score import corpus_bleu


In [None]:
# Build the caption dictionary: keys are image file names, values are the
# list of captions for that image.
token_dir = "Flickr8k_text/Flickr8k.token.txt"

# Read through a context manager so the file handle is closed deterministically.
with open(token_dir) as token_file:
    image_captions = token_file.read().split('\n')

caption = {}
for token_line in image_captions:
    if not token_line.strip():
        continue  # skip blank lines (e.g. the empty string after the final newline)
    image_ref, caption_text = token_line.split("\t", 1)
    # Strip the "#0".."#4" caption-number suffix from the image reference.
    image_name = image_ref.rsplit("#", 1)[0]
    caption.setdefault(image_name, []).append(caption_text)

 <h3> Two files, "trainimgs.txt" and "testimgs.txt", are written; in them every caption is wrapped in a start token and an end token. </h3>

In [None]:
def _write_tagged_captions(out_file, img_ids):
    """Write one "<image id>\\t<start> caption <end>" line per caption of each image id."""
    for img_id in img_ids:
        for captions in caption[img_id]:
            desc = "<start> " + captions + " <end>"
            out_file.write(img_id + "\t" + desc + "\n")


# One image file name per line; the element after the final newline is dropped.
with open("Flickr8k_text/Flickr_8k.trainImages.txt") as train_ids_file:
    train_imgs_id = train_ids_file.read().split('\n')[:-1]
# The context manager flushes and closes the output file, so no per-line flush is needed.
with open("Flickr8k_text/trainimgs.txt", 'w') as train_imgs_captions:
    _write_tagged_captions(train_imgs_captions, train_imgs_id)

with open("Flickr8k_text/Flickr_8k.testImages.txt") as test_ids_file:
    test_imgs_id = test_ids_file.read().split('\n')[:-1]
with open("Flickr8k_text/testimgs.txt", 'w') as test_imgs_captions:
    _write_tagged_captions(test_imgs_captions, test_imgs_id)

In [None]:
# Read the tagged test captions back as a list of lines; the context manager
# closes the file handle once the content has been read.
with open("Flickr8k_text/testimgs.txt") as testimgs_file:
    test_imgs_captions = testimgs_file.read().split('\n')

In [None]:
# Normalize the image: rescale pixel values from [0, 255] to [-1, 1].
# NOTE(review): this definition shadows the `preprocess_input` imported from
# keras.applications.vgg16 in the import cell, so that function is never used
# by this notebook -- confirm that this custom scaling is what is intended.
def preprocess_input(x):
    """Rescale pixel values from the [0, 255] range to [-1, 1].

    Parameters
    ----------
    x : array-like
        Pixel data. A floating-point ndarray is modified in place and returned
        (the behaviour callers already rely on). Any other input (integer
        arrays, lists) is first converted to a new float32 array, because
        in-place true division is not defined for integer arrays.

    Returns
    -------
    numpy.ndarray
        The rescaled array.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Work on a float copy instead of failing on (or mutating) integer input.
        x = x.astype('float32')
    x /= 255.
    x -= 0.5
    x *= 2.
    return x

In [None]:
def preprocess(image_path):
    """Load the image at `image_path` and return it as a normalised batch of one.

    The image is loaded with a 224x224 target size, converted to an array,
    given a leading batch axis and passed through `preprocess_input`.
    """
    loaded = image.load_img(image_path, target_size=(224, 224))
    batch = np.expand_dims(image.img_to_array(loaded), axis=0)
    return preprocess_input(batch)

<h3> We use the VGG16 model to extract features from the images. Only the output is changed: the final classification layer is dropped, so the model's output becomes its second-to-last layer, a vector of shape (4096,). </h3>

In [None]:
# Feature extractor: VGG16 cut off just before its final classification layer.
# The second-to-last layer is selected explicitly rather than by popping from
# `layers`: in Keras versions where `layers` returns a copy, the pop has no
# effect and the model would silently keep emitting class probabilities.
base_model = VGG16()
model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)

<h3> The encode function takes an image path and returns the image's feature vector computed by the VGG model.</h3>

In [None]:
def encode(image):
    """Return the flat feature vector the global `model` predicts for an image path.

    The single-image batch returned by `model.predict` is reshaped to a 1-D
    array whose length is the size of the prediction's second axis.
    """
    features = model.predict(preprocess(image))
    return features.reshape(features.shape[1])

In [None]:
# Directory prefix of the image files; image file names are appended to it
# directly, so the trailing slash is required.
images = 'Flickr8k_Dataset/Flicker8k_Dataset/'

In [None]:
# Load the train/test image id lists (one file name per line; the element
# after the final newline is dropped). Context managers close the handles.
with open("Flickr8k_text/Flickr_8k.trainImages.txt") as train_ids_file:
    train_imgs_id = train_ids_file.read().split('\n')[:-1]
with open("Flickr8k_text/Flickr_8k.testImages.txt") as test_ids_file:
    test_imgs_id = test_ids_file.read().split('\n')[:-1]

# Encode every training image, keyed by file name.
encoding_train = {}
for img in tqdm(train_imgs_id):  # tqdm wraps the iterable to show a progress meter
    path = images + str(img)
    encoding_train[img] = encode(path)

In [None]:
# Cache the training-image encodings on disk so they can be reloaded without
# re-running the encoding loop.
with open("encoded_train_images_vgg.p", "wb") as encoded_pickle: 
    pickle.dump(encoding_train, encoded_pickle)  # serialise the {file name: encoding} dict

In [None]:
# Reload the cached training-image encodings; the context manager closes the file.
# Only unpickle files this notebook wrote itself: pickle can execute arbitrary code.
with open('encoded_train_images_vgg.p', 'rb') as encoded_pickle:
    encoding_train = pickle.load(encoded_pickle)

In [None]:
# Sanity check: display the shape of one image's encoding.
encoding_train['3556792157_d09d42bef7.jpg'].shape

In [None]:
# Encode every test image, keyed by file name (tqdm shows a progress meter).
encoding_test = {img: encode(images + str(img)) for img in tqdm(test_imgs_id)}

In [None]:
# Cache the test-image encodings on disk so they can be reloaded without
# re-running the encoding loop.
with open("encoded_test_images_vgg.p", "wb") as encoded_pickle:
    pickle.dump(encoding_test, encoded_pickle)

In [None]:
# Reload the cached test-image encodings; the context manager closes the file.
# Only unpickle files this notebook wrote itself: pickle can execute arbitrary code.
with open('encoded_test_images_vgg.p', 'rb') as encoded_pickle:
    encoding_test = pickle.load(encoded_pickle)

In [None]:
# trainimgs.txt is written without a header row, so header=None is required;
# otherwise read_csv consumes the first caption line as column names and that
# training sample is silently lost.
dataframe = pd.read_csv('Flickr8k_text/trainimgs.txt', delimiter='\t',
                        header=None, names=['image_id', 'caption'])
# Shuffle the rows. NOTE(review): no random_state is set, so the order differs between runs.
dataframe = dataframe.sample(frac=1)

# Parallel lists: img_id[k] is the image file name of captionz[k].
img_id = dataframe['image_id'].tolist()
captionz = dataframe['caption'].tolist()

In [None]:
# Calculate the number of training samples: a caption of n tokens yields n-1
# (partial caption -> next word) pairs.
tokens = [text.split() for text in captionz]
# A generator expression is used so that no loop variable named `caption`
# leaks into the notebook namespace and overwrites the caption dictionary
# built in the first cell.
no_samples = sum(len(words) - 1 for words in tokens)

In [None]:
# testimgs.txt is written without a header row, so header=None is required;
# otherwise read_csv consumes the first caption line as column names and that
# test caption is silently lost.
dataframe = pd.read_csv('Flickr8k_text/testimgs.txt', delimiter='\t',
                        header=None, names=['image_id', 'caption'])
# Shuffle the rows. NOTE(review): no random_state is set, so the order differs between runs.
dataframe = dataframe.sample(frac=1)

# Parallel lists: test_img_id[k] is the image file name of test_captionz[k].
test_img_id = dataframe['image_id'].tolist()
test_captionz = dataframe['caption'].tolist()

In [None]:
# Calculate the vocabulary: every distinct token of the training captions.
# It is sorted so the token -> index mapping derived from it is the same on
# every run; the iteration order of a plain set of strings is not stable
# between interpreter sessions.
vocab = sorted({word for caption_tokens in tokens for word in caption_tokens})
with open("vocab.p", "wb") as pickle_d:
    pickle.dump(vocab, pickle_d)

In [None]:
# Reload the cached vocabulary; the context manager closes the file.
# Only unpickle files this notebook wrote itself: pickle can execute arbitrary code.
with open('vocab.p', 'rb') as pickle_d:
    vocab = pickle.load(pickle_d)

In [None]:
# Number of distinct tokens in the vocabulary list.
vocab_size = len(vocab)

In [None]:
# Tokenize: build the word <-> integer index lookups.
# Index 0 is reserved for padding. The batch generators pad sequences with 0
# and the models' Embedding layers use mask_zero=True, so a real word mapped
# to index 0 would be indistinguishable from padding and masked out.
PAD_TOKEN = '<pad>'
word_idx = {PAD_TOKEN: 0}
word_idx.update((val, index) for index, val in enumerate(vocab, start=1))
idx_word = {index: val for val, index in word_idx.items()}
# Layers and one-hot targets sized by vocab_size must also cover the reserved
# padding index, so the size is the vocabulary plus one.
vocab_size = len(word_idx)

In [None]:
# Find the longest caption (in tokens); samples are padded to this length for training.
caption_length = list(map(lambda text: len(text.split()), captionz))
max_length = max(caption_length)
max_length  # maximum length of a caption

In [None]:
def data_process(captionz, encoding_train, word_idx, batch_size, max_length):
    """Endlessly yield training batches for the two-input (image, sequence) models.

    Each caption of n tokens contributes n-1 samples: the first i+1 tokens as
    the partial caption and token i+1 as the one-hot target.

    Parameters
    ----------
    captionz : sequence of str
        Whitespace-tokenisable captions; captionz[k] belongs to the image
        named by the global img_id[k].
    encoding_train : mapping
        Image file name -> encoding of that image.
    word_idx : mapping
        Token -> integer index.
    batch_size : int
        Number of samples per yielded batch.
    max_length : int
        Length the partial captions are post-padded to.

    Yields
    ------
    list
        [[images, partial_captions], next_words]: the image encodings, the
        padded partial captions and the one-hot next-word targets.

    Raises
    ------
    ValueError
        If `captionz` is empty (the loop would otherwise spin forever).

    Notes
    -----
    Reads the globals `img_id` and `vocab_size`. Batches are filled across
    caption and epoch boundaries, so every yielded batch is full.
    """
    if len(captionz) == 0:
        raise ValueError("captionz is empty; no batch could ever be produced")

    partial_captions, next_words, images = [], [], []
    while True:
        for image_counter, caption in enumerate(captionz):
            current_image = encoding_train[img_id[image_counter]]
            # Tokenise and index the caption once instead of once per prefix.
            token_ids = [word_idx[word] for word in caption.split()]

            for i in range(len(token_ids) - 1):
                partial_captions.append(token_ids[:i + 1])
                target = np.zeros(vocab_size)  # one-hot vector of the next word
                target[token_ids[i + 1]] = 1
                next_words.append(target)
                images.append(current_image)

                if len(next_words) >= batch_size:
                    padded = sequence.pad_sequences(partial_captions, maxlen=max_length, padding='post')
                    yield [[np.asarray(images), padded], np.asarray(next_words)]
                    partial_captions, next_words, images = [], [], []

In [None]:
def data_process_par_inject(captionz, encoding_train, word_idx, batch_size, max_length):
    """Endlessly yield training batches for the single-input par-inject model.

    Each sample is the image encoding (as a list) concatenated with the first
    i+1 token indices of a caption; the target is the one-hot vector of token
    i+1. A caption of n tokens contributes n-1 samples.

    Parameters
    ----------
    captionz : sequence of str
        Whitespace-tokenisable captions; captionz[k] belongs to the image
        named by the global img_id[k].
    encoding_train : mapping
        Image file name -> encoding (must provide `.tolist()`).
    word_idx : mapping
        Token -> integer index.
    batch_size : int
        Number of samples per yielded batch.
    max_length : int
        Maximum caption length; inputs are post-padded to 4096 + max_length.

    Yields
    ------
    list
        [partial_captions, next_words]: the padded combined inputs and the
        one-hot next-word targets.

    Raises
    ------
    ValueError
        If `captionz` is empty (the loop would otherwise spin forever).

    Notes
    -----
    Reads the globals `img_id` and `vocab_size`.
    NOTE(review): pad_sequences is called without a dtype; if its default is
    an integer type, the float image features are truncated -- confirm that
    this is intended.
    """
    if len(captionz) == 0:
        raise ValueError("captionz is empty; no batch could ever be produced")

    partial_captions, next_words = [], []
    while True:
        for image_counter, caption in enumerate(captionz):
            current_image = encoding_train[img_id[image_counter]].tolist()
            # Tokenise and index the caption once instead of once per prefix.
            token_ids = [word_idx[word] for word in caption.split()]

            for i in range(len(token_ids) - 1):
                partial_captions.append(current_image + token_ids[:i + 1])
                target = np.zeros(vocab_size)  # one-hot vector of the next word
                target[token_ids[i + 1]] = 1
                next_words.append(target)

                if len(next_words) >= batch_size:
                    padded = sequence.pad_sequences(partial_captions, maxlen=4096+max_length, padding='post')
                    yield [padded, np.asarray(next_words)]
                    partial_captions, next_words = [], []

In [None]:
def data_process_pre_inject(captionz, encoding_train, word_idx, batch_size):
    """Endlessly yield training batches for the pre-inject model.

    Each caption of n tokens contributes n-1 samples: the first i+1 tokens as
    the partial caption and token i+1 as the one-hot target. Partial captions
    are post-padded to a fixed length of 4096.

    Parameters
    ----------
    captionz : sequence of str
        Whitespace-tokenisable captions; captionz[k] belongs to the image
        named by the global img_id[k].
    encoding_train : mapping
        Image file name -> encoding of that image.
    word_idx : mapping
        Token -> integer index.
    batch_size : int
        Number of samples per yielded batch.

    Yields
    ------
    list
        [[images, partial_captions], next_words]: the image encodings, the
        padded partial captions and the one-hot next-word targets.

    Raises
    ------
    ValueError
        If `captionz` is empty (the loop would otherwise spin forever).

    Notes
    -----
    Reads the globals `img_id` and `vocab_size`.
    """
    if len(captionz) == 0:
        raise ValueError("captionz is empty; no batch could ever be produced")

    partial_captions, next_words, images = [], [], []
    while True:
        for image_counter, caption in enumerate(captionz):
            current_image = encoding_train[img_id[image_counter]]
            # Tokenise and index the caption once instead of once per prefix.
            token_ids = [word_idx[word] for word in caption.split()]

            for i in range(len(token_ids) - 1):
                partial_captions.append(token_ids[:i + 1])
                target = np.zeros(vocab_size)  # one-hot vector of the next word
                target[token_ids[i + 1]] = 1
                next_words.append(target)
                images.append(current_image)

                if len(next_words) >= batch_size:
                    padded = sequence.pad_sequences(partial_captions, maxlen=4096, padding='post')
                    yield [[np.asarray(images), padded], np.asarray(next_words)]
                    partial_captions, next_words, images = [], [], []

In [None]:
# Pre-inject model: the projected image features are added to the word
# embeddings before the LSTM.
# image branch: 4096-wide encoding projected to 128 units
inputs1 = Input(shape=(4096,))
fe1 = Dense(128, activation='relu')(inputs1)
# sequence model: token indices padded to length 4096 (the padding length used
# by data_process_pre_inject), embedded to 128 dimensions with index 0 masked
inputs2 = Input(shape=(4096,))
se1 = Embedding(vocab_size, 128, mask_zero=True)(inputs2)
# combine the image projection with the embedded sequence, then run the LSTM
inputs = add([fe1, se1])
se2 = LSTM(128)(inputs)
se3 = Dropout(0.5)(se2)
# decoder model: hidden layer followed by a softmax over the vocabulary
decoder1 = Dense(64, activation='relu')(se3)
outputs = Dense(vocab_size, activation='softmax')(decoder1)
# tie it together [image, seq] [word]
pre_inject_model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# write a diagram of the architecture to pre_inject_model.png
plot_model(pre_inject_model, to_file='pre_inject_model.png', show_shapes=True)
pre_inject_model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [None]:
batch_size = 128
epoch = 1
# Keras expects an integer number of steps per epoch; round up so a trailing
# fraction of a batch still gets a step.
pre_inject_model.fit_generator(data_process_pre_inject(captionz, encoding_train, word_idx, batch_size),
                               steps_per_epoch=int(np.ceil(no_samples / batch_size)),
                               epochs=epoch, verbose=1, callbacks=None)

In [None]:
# Merge model: the image features and the LSTM's encoding of the partial
# caption are computed separately and added together after the LSTM.
# image branch: 4096-wide encoding projected to 128 units
inputs1 = Input(shape=(4096,))
fe1 = Dense(128, activation='relu')(inputs1)
# sequence model: token indices padded to max_length, embedded to 128
# dimensions with index 0 masked, then encoded by an LSTM
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 128, mask_zero=True)(inputs2)
se2 = LSTM(128)(se1)
se3 = Dropout(0.5)(se2)
# decoder model: sum of the two branches, hidden layer, softmax over the vocabulary
decoder1 = add([fe1, se3])
decoder2 = Dense(64, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
# tie it together [image, seq] [word]
merge_model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# write a diagram of the architecture to merge_model.png
plot_model(merge_model, to_file='merge_model.png', show_shapes=True)
merge_model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [None]:
# Keras expects an integer number of steps per epoch; round up so a trailing
# fraction of a batch still gets a step.
merge_model.fit_generator(data_process(captionz, encoding_train, word_idx, batch_size, max_length),
                          steps_per_epoch=int(np.ceil(no_samples / batch_size)),
                          epochs=epoch, verbose=1, callbacks=None)

In [None]:
# Init-inject model: the projected image features are used as the initial
# state of the LSTM that reads the partial caption.
# image branch: dropout, then projection to 128 units (the LSTM state size)
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(128, activation='relu')(fe1)
# sequence model: token indices padded to max_length, embedded to 128
# dimensions with index 0 masked
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 128, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
# the image projection is passed for both entries of the LSTM's initial state
se3 = LSTM(128)(se2, initial_state=[fe2, fe2])
# decoder model: no merge step here, the image only enters through the
# LSTM's initial state
decoder2 = Dense(128, activation='relu')(se3)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
# tie it together [image, seq] [word]
init_inject_model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# write a diagram of the architecture to init_inject_model.png
plot_model(init_inject_model, to_file='init_inject_model.png', show_shapes=True)
init_inject_model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [None]:
# Keras expects an integer number of steps per epoch; round up so a trailing
# fraction of a batch still gets a step.
init_inject_model.fit_generator(data_process(captionz, encoding_train, word_idx, batch_size, max_length),
                                steps_per_epoch=int(np.ceil(no_samples / batch_size)),
                                epochs=epoch, verbose=1, callbacks=None)

In [None]:
# par_inject model: a single input holding the image encoding followed by the
# partial caption (4096 + max_length values, as produced by
# data_process_par_inject), fed through one Embedding and one LSTM.
inputs1 = Input(shape=(4096+max_length,))
se1 = Embedding(vocab_size, 128, mask_zero=True)(inputs1)
se2 = Dropout(0.5)(se1)
se3 = LSTM(128)(se2)
# decoder model: no separate image branch to merge; hidden layer followed by
# a softmax over the vocabulary
decoder2 = Dense(64, activation='relu')(se3)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
# tie it together [image + seq] [word]
par_inject_model = Model(inputs= inputs1, outputs=outputs)
# write a diagram of the architecture to par_inject_model.png
plot_model(par_inject_model, to_file='par_inject_model.png', show_shapes=True)
par_inject_model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [None]:
# Keras expects an integer number of steps per epoch; round up so a trailing
# fraction of a batch still gets a step.
par_inject_model.fit_generator(data_process_par_inject(captionz, encoding_train, word_idx, batch_size, max_length),
                               steps_per_epoch=int(np.ceil(no_samples / batch_size)),
                               epochs=epoch, verbose=1, callbacks=None)

<h3>To generate predictions, I used greedy search.</h3>

In [None]:
# evaluate the skill of the model
def evaluate_model(test_imgs_id, test_captionz, word_idx, idx_word, sequence, max_length, encoding_test, model, method = "not par"):
    """Print corpus BLEU-1..4 of the model's greedy captions against the test captions.

    Parameters
    ----------
    test_imgs_id : sequence of str
        Image file name of each caption; test_imgs_id[k] belongs to test_captionz[k].
    test_captionz : sequence of str
        Reference captions; the first and last token of each (the start and
        end markers) are stripped before scoring.
    word_idx, idx_word, sequence, max_length, encoding_test, model, method
        Passed through unchanged to `predict_captions`.

    Raises
    ------
    ValueError
        If the two sequences differ in length.

    Notes
    -----
    Every caption is scored as its own (single reference, hypothesis) pair.
    NOTE(review): the BLEU-3 weights (0.3, 0.3, 0.3, 0) do not sum to 1 --
    confirm whether thirds were intended.
    """
    if len(test_imgs_id) != len(test_captionz):
        raise ValueError("test_imgs_id and test_captionz must have the same length")

    actual_caption = []
    predicted_caption = []
    # The image names come from the `test_imgs_id` argument, not a notebook
    # global. Each image has several captions but a single prediction, so
    # predictions are cached per image instead of re-running the model for
    # every caption.
    prediction_cache = {}
    for image_name, caption in zip(test_imgs_id, test_captionz):
        if image_name not in prediction_cache:
            prediction_cache[image_name] = predict_captions(
                image_name, word_idx, idx_word, sequence, max_length, encoding_test, model, method).split()
        predicted_caption.append(prediction_cache[image_name])
        actual_caption.append([caption.split()[1:-1]])
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual_caption, predicted_caption, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual_caption, predicted_caption, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual_caption, predicted_caption, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual_caption, predicted_caption, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
def predict_captions(image_file, word_idx, idx_word, sequence, max_length, encoding_test, model, method = 'not par'):
    """Generate a caption for one image by greedy search.

    Starting from "<start>", the most probable next word is appended
    repeatedly until the model emits "<end>" or more than `max_length` tokens
    (including "<start>") have been collected.

    Parameters
    ----------
    image_file : str
        Key of the image in `encoding_test`.
    word_idx : mapping
        Token -> integer index.
    idx_word : mapping
        Integer index -> token.
    sequence : object
        Provides `pad_sequences(seqs, maxlen=..., padding=...)`.
    max_length : int
        Length the partial caption is post-padded to, and the length limit.
    encoding_test : mapping
        Image file name -> encoding of that image.
    model : object
        Provides `predict`. With method 'not par' it is called with
        [image batch, caption batch]; otherwise with a single batch holding
        the encoding followed by the padded caption.
    method : str
        'not par' for the two-input models, anything else for par-inject.

    Returns
    -------
    str
        The generated words joined by spaces, without "<start>" and "<end>".
    """
    image_encoding = encoding_test[image_file]  # loop-invariant lookup
    words = ["<start>"]
    while True:
        token_ids = [word_idx[word] for word in words]
        padded = sequence.pad_sequences([token_ids], maxlen=max_length, padding='post')
        if method == 'not par':
            preds = model.predict([np.array([image_encoding]), np.array(padded)])
        else:
            combined = image_encoding.tolist() + padded[0].tolist()
            preds = model.predict(np.array([combined]))
        word_pred = idx_word[np.argmax(preds[0])]

        if word_pred == "<end>":
            break
        words.append(word_pred)
        # When the length limit stops the search, the last word is a real
        # prediction and is kept; only the "<end>" marker is ever left out.
        if len(words) > max_length:
            break
    return ' '.join(words[1:])

In [None]:
# Evaluate init_inject_model: print its BLEU-1..4 scores on the test captions.
evaluate_model(test_img_id, test_captionz, word_idx, idx_word, sequence, max_length, encoding_test, init_inject_model)

In [None]:
# Evaluate pre_inject_model: print its BLEU-1..4 scores on the test captions.
# NOTE(review): pre_inject_model declares its sequence input with shape
# (4096,), while predict_captions pads the caption to max_length -- confirm
# that this call runs with matching shapes.
evaluate_model(test_img_id, test_captionz, word_idx, idx_word, sequence, max_length, encoding_test, pre_inject_model)

In [None]:
# Evaluate par_inject_model: print its BLEU-1..4 scores on the test captions.
# method='par' makes predict_captions feed the image encoding and the caption
# as one combined input.
evaluate_model(test_img_id, test_captionz, word_idx, idx_word, sequence, max_length, encoding_test, par_inject_model, method='par')

In [None]:
# Evaluate merge_model: print its BLEU-1..4 scores on the test captions.
evaluate_model(test_img_id, test_captionz, word_idx, idx_word, sequence, max_length, encoding_test, merge_model)

Since the merge model outperforms the others on the BLEU scores, I trained it for a further 5 epochs to get reasonable predictions.

In [None]:
epoch = 5
# Continue training the merge model. Keras expects an integer number of steps
# per epoch; round up so a trailing fraction of a batch still gets a step.
merge_model.fit_generator(data_process(captionz, encoding_train, word_idx, batch_size, max_length),
                          steps_per_epoch=int(np.ceil(no_samples / batch_size)),
                          epochs=epoch, verbose=1, callbacks=None)

# Prediction examples

In [None]:
# Pick a test image and open it; as the cell's last expression the image is
# shown as the cell output.
image_file ="3430607596_7e4f74e3ff.jpg"
test_image =  images + image_file
Image.open(test_image)

In [None]:
# Print the original captions of the image (up to five). Iterating over the
# matches avoids an IndexError when an image has fewer than five captions.
image_index = [i for i in range(len(test_img_id)) if test_img_id[i] == image_file]
print('Original five sentences')
for caption_position in image_index[:5]:
    print(test_captionz[caption_position])
# print prediction of the merge model
print('Greedy search prediction:', predict_captions(image_file, word_idx, idx_word, sequence, max_length, encoding_test, merge_model))

In [None]:
# Pick a test image and open it; as the cell's last expression the image is
# shown as the cell output.
image_file ="3255482333_5bcee79f7e.jpg"
test_image =  images + image_file
Image.open(test_image)

In [None]:
# Print the original captions of the image (up to five). Iterating over the
# matches avoids an IndexError when an image has fewer than five captions.
image_index = [i for i in range(len(test_img_id)) if test_img_id[i] == image_file]
print('Original five sentences')
for caption_position in image_index[:5]:
    print(test_captionz[caption_position])
# print prediction of the merge model
print('Greedy search prediction:', predict_captions(image_file, word_idx, idx_word, sequence, max_length, encoding_test, merge_model))

In [None]:
# Pick a test image and open it; as the cell's last expression the image is
# shown as the cell output.
image_file ="3168123064_d1983b8f92.jpg"
test_image =  images + image_file
Image.open(test_image)

In [None]:
# Print the original captions of the image (up to five). Iterating over the
# matches avoids an IndexError when an image has fewer than five captions.
image_index = [i for i in range(len(test_img_id)) if test_img_id[i] == image_file]
print('Original five sentences')
for caption_position in image_index[:5]:
    print(test_captionz[caption_position])
# print prediction of the merge model
print('Greedy search prediction:', predict_captions(image_file, word_idx, idx_word, sequence, max_length, encoding_test, merge_model))

In [None]:
# Pick a test image and open it; as the cell's last expression the image is
# shown as the cell output.
image_file ="3316725440_9ccd9b5417.jpg"
test_image =  images + image_file
Image.open(test_image)

In [None]:
# Print the original captions of the image (up to five). Iterating over the
# matches avoids an IndexError when an image has fewer than five captions.
image_index = [i for i in range(len(test_img_id)) if test_img_id[i] == image_file]
print('Original five sentences')
for caption_position in image_index[:5]:
    print(test_captionz[caption_position])
# print prediction of the merge model
print('Greedy search prediction:', predict_captions(image_file, word_idx, idx_word, sequence, max_length, encoding_test, merge_model))

In [None]:
# Pick a test image and open it; as the cell's last expression the image is
# shown as the cell output.
image_file ="3218480482_66af7587c8.jpg"
test_image =  images + image_file
Image.open(test_image)

In [None]:
# Print the original captions of the image (up to five). Iterating over the
# matches avoids an IndexError when an image has fewer than five captions.
image_index = [i for i in range(len(test_img_id)) if test_img_id[i] == image_file]
print('Original five sentences')
for caption_position in image_index[:5]:
    print(test_captionz[caption_position])
# print prediction of the merge model
print('Greedy search prediction:', predict_captions(image_file, word_idx, idx_word, sequence, max_length, encoding_test, merge_model))

In [None]:
# Pick a test image and open it; as the cell's last expression the image is
# shown as the cell output.
image_file ="2541104331_a2d65cfa54.jpg"
test_image =  images + image_file
Image.open(test_image)

In [None]:
# Print the original captions of the image (up to five). Iterating over the
# matches avoids an IndexError when an image has fewer than five captions.
image_index = [i for i in range(len(test_img_id)) if test_img_id[i] == image_file]
print('Original five sentences')
for caption_position in image_index[:5]:
    print(test_captionz[caption_position])
# print prediction of the merge model
print('Greedy search prediction:', predict_captions(image_file, word_idx, idx_word, sequence, max_length, encoding_test, merge_model))

In [None]:
# Pick a test image and open it; as the cell's last expression the image is
# shown as the cell output.
image_file ="309687244_4bdf3b591f.jpg"
test_image =  images + image_file
Image.open(test_image)

In [None]:
# Print the original captions of the image (up to five). Iterating over the
# matches avoids an IndexError when an image has fewer than five captions.
image_index = [i for i in range(len(test_img_id)) if test_img_id[i] == image_file]
print('Original five sentences')
for caption_position in image_index[:5]:
    print(test_captionz[caption_position])
# print prediction of the merge model
print('Greedy search prediction:', predict_captions(image_file, word_idx, idx_word, sequence, max_length, encoding_test, merge_model))

In [None]:
# Pick a test image and open it; as the cell's last expression the image is
# shown as the cell output.
image_file ="2542662402_d781dd7f7c.jpg"
test_image =  images + image_file
Image.open(test_image)

In [None]:
# Print the original captions of the image (up to five). Iterating over the
# matches avoids an IndexError when an image has fewer than five captions.
image_index = [i for i in range(len(test_img_id)) if test_img_id[i] == image_file]
print('Original five sentences')
for caption_position in image_index[:5]:
    print(test_captionz[caption_position])
# print prediction of the merge model
print('Greedy search prediction:', predict_captions(image_file, word_idx, idx_word, sequence, max_length, encoding_test, merge_model))

In [None]:
# Pick a test image and open it; as the cell's last expression the image is
# shown as the cell output.
image_file ="2654514044_a70a6e2c21.jpg"
test_image =  images + image_file
Image.open(test_image)

In [None]:
# Print the original captions of the image (up to five). Iterating over the
# matches avoids an IndexError when an image has fewer than five captions.
image_index = [i for i in range(len(test_img_id)) if test_img_id[i] == image_file]
print('Original five sentences')
for caption_position in image_index[:5]:
    print(test_captionz[caption_position])
# print prediction of the merge model
print('Greedy search prediction:', predict_captions(image_file, word_idx, idx_word, sequence, max_length, encoding_test, merge_model))