In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
import string
import random
from tqdm import tqdm
from PIL import Image 

# Keras Libraries
import tensorflow
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras import Input, layers
from tensorflow.keras import optimizers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import add
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Dense, Activation, Flatten, Reshape, Dropout
from nltk.translate.bleu_score import sentence_bleu



In [2]:
# Load the train / validation / test image-name lists (one file name per line)
# and remember where the image files live.

def _read_lines(path):
    """Return the lines of the text file at `path`, without line terminators."""
    # The context manager closes the handle; a bare open(...).read() leaves it
    # open until the garbage collector gets to it.
    with open(path, 'r') as handle:
        return handle.read().splitlines()


train_image_names = _read_lines('/kaggle/input/flickr/Flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt')
val_image_names = _read_lines('/kaggle/input/flickr/Flickr8k/Flickr8k_text/Flickr_8k.valImages.txt')
test_image_names = _read_lines('/kaggle/input/flickr/Flickr8k/Flickr8k_text/Flickr_8k.testImages.txt')
images_path = '/kaggle/input/flickr/Flickr8k/Flicker8k_Images/'

In [3]:
# Read the caption file, one entry per line. Each line is later split on a tab
# into "<image name>#<suffix>" and the caption text.
# NOTE(review): the variable name says "lemma", but nothing in this notebook
# lemmatizes the text — confirm whether the file is pre-lemmatized.
with open('/kaggle/input/flickr/Flickr8k/Flickr8k_text/Flickr8k.token.txt', 'r') as desc_file:
    lemma_desc_list = desc_file.read().splitlines()

### Cleaning of text descriptions

In [4]:
def preprocess_text(line):
    """Normalise one raw caption and wrap it in sequence markers.

    The caption is split on whitespace and lower-cased; every token that is
    not purely alphabetic (numbers, punctuation tokens, hyphenated words, ...)
    is dropped. Remaining punctuation characters are then stripped and the
    result is returned as "startseq <caption> endseq".
    """
    kept_tokens = []
    for token in line.split():
        lowered = token.lower()
        if lowered.isalpha():
            kept_tokens.append(lowered)

    punctuation_table = str.maketrans("", "", string.punctuation)
    cleaned = " ".join(kept_tokens).translate(punctuation_table)
    return "startseq " + cleaned + " endseq"

In [5]:
# Group the cleaned captions by image file name:
#   {image file name: [caption, caption, ...]}
lemmatized_text_desc = {}
for entry in lemma_desc_list:
    fields = entry.split('\t')
    # The first field looks like "<image name>#<suffix>"; keep only the name.
    image_name = fields[0].split('#')[0]
    caption = preprocess_text(fields[1])
    lemmatized_text_desc.setdefault(image_name, []).append(caption)

In [6]:
# Split the caption dictionary into train / validation / test subsets,
# keyed by the image names listed in the corresponding split file.
train_text = {name: lemmatized_text_desc[name] for name in train_image_names}
val_text = {name: lemmatized_text_desc[name] for name in val_image_names}
test_text = {name: lemmatized_text_desc[name] for name in test_image_names}

In [7]:
# Find the longest caption (in whitespace-separated tokens) over ALL images.
# max_string / max_list keep the first caption that reaches that length.
max_length = 0
for captions in lemmatized_text_desc.values():
    for caption in captions:
        tokens = caption.split()
        if len(tokens) > max_length:
            max_length = len(tokens)
            max_string = caption
            max_list = tokens

In [8]:
# Longest caption length in tokens (the startseq/endseq markers are counted);
# used below as the length of the padded caption input.
max_length

36

In [9]:
# Count word occurrences over the training captions only.
# `vocabulary` lists the distinct words in order of first appearance.
word_counts = {}
nsents = 0
for captions in train_text.values():
    nsents += len(captions)
    for caption in captions:
        for token in caption.split(' '):
            if token in word_counts:
                word_counts[token] += 1
            else:
                word_counts[token] = 1
vocabulary = list(word_counts)
print(len(vocabulary))

7276


In [10]:
# Distinct training-caption words in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order in linear time;
# the previous `w not in vocabulary1` test on a list rescanned the whole list
# for every word (quadratic in the vocabulary size).
vocabulary1 = list(dict.fromkeys(
    word
    for captions in train_text.values()
    for caption in captions
    for word in caption.split(' ')
))
print(len(vocabulary1))

7276


In [11]:
# Read the GloVe embedding file into a list of raw text lines.
# The encoding is given explicitly (matching the second GloVe reader further
# down) so the result does not depend on the platform's default encoding.
with open("/kaggle/input/glove6b200d/glove.6B.200d.txt", "r", encoding="utf-8") as f:
    glove = f.read().split("\n")

In [13]:
# Parse the GloVe lines into {word: embedding vector}.
glove_dict = {}

for line in glove:
    elements = line.split()
    if not elements:
        # Blank line (e.g. the empty string left after the final newline).
        continue
    try:
        vector = np.array([float(value) for value in elements[1:]])
    except ValueError:
        # Malformed line whose tail is not numeric: skip it. Only this error is
        # caught so that KeyboardInterrupt and genuine bugs are not swallowed.
        continue
    glove_dict[elements[0]] = vector

In [14]:
# Second {word: float32 vector} view of the same GloVe file, read line by line.
# NOTE(review): no later visible cell reads embeddings_index (glove_dict is
# used instead) — confirm whether this cell is still needed.
embeddings_index = {}
with open('/kaggle/input/glove6b200d/glove.6B.200d.txt', encoding="utf-8") as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [15]:
# Word <-> integer index lookup tables built from `vocabulary`.
# Indices start at 1; index 0 is left free (pad_sequences pads with 0),
# which is why vocab_size is one larger than the number of words.
wordtoix = {word: index for index, word in enumerate(vocabulary, start=1)}
ixtoword = {index: word for word, index in wordtoix.items()}

vocab_size = len(ixtoword) + 1

In [16]:
# Same lookup tables as above, built from `vocabulary1`.
# Indices start at 1, so vocab_size1 is one larger than the number of words.
ixtoword1 = dict(enumerate(vocabulary1, start=1))
wordtoix1 = {word: index for index, word in ixtoword1.items()}

vocab_size1 = len(ixtoword1) + 1

In [19]:
# Embedding matrix: row i holds the 200-d GloVe vector of the word with index i.
# Rows for words missing from GloVe (and row 0) keep their random
# uniform[0, 1) initialisation.
glove_weights1 = np.random.uniform(0, 1, (vocab_size1, 200))
# Iterate wordtoix1 — the mapping that matches vocab_size1 and that the data
# loader uses — rather than the parallel wordtoix table.
for word, index in wordtoix1.items():
    embedding_vector = glove_dict.get(word)
    if embedding_vector is not None:
        glove_weights1[index] = embedding_vector

In [None]:
# CNN model (ResNet50) for feature extraction from images.
# include_top=False drops the classifier head and pooling='avg' applies global
# average pooling; downstream cells reshape each prediction to 2048 values.
resnet_model = ResNet50(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(224, 224, 3),
)
resnet_model.summary()

In [21]:
from torchvision import transforms

# Resize-only transform pipeline (224x224).
# It is referenced only by a commented-out feature-extraction cell below.
img_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
    ]
)

In [22]:
def img_preprocess(img_path):
    """Load an image and return it as a single-image batch for the CNN.

    Parameters
    ----------
    img_path : str
        Image file name, appended to the module-level `images_path`.

    Returns
    -------
    numpy.ndarray
        The image resized to 224x224 with a leading batch axis added.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read the file. cv2.imread signals failure by
        returning None, which previously surfaced as an opaque error inside
        cv2.resize.
    """
    full_path = images_path + img_path
    im = cv2.imread(full_path)
    if im is None:
        raise FileNotFoundError(f"Could not read image: {full_path}")
    # NOTE(review): no BGR->RGB conversion or ResNet preprocess_input is applied
    # here, whereas the single-image inference cell converts to RGB — confirm
    # which convention the cached training features were computed with.
    im_res = cv2.resize(im, (224, 224))
    return np.expand_dims(im_res, axis=0)

In [None]:
# # Predict the feature vectors for each training image 

# train_data = {}
# ctr=0
# for ix in train_image_names:
#     if ix == "":
#         continue
#     ctr+=1
#     if ctr%500==0:
#         print(ctr)
#     path = ix
#     img = img_preprocess(path)
#     pred = resnet_model.predict(img).reshape(2048)
#     train_data[ix] = pred

In [None]:
# # Predict the feature vectors for each training image 

# train_data = {}
# ctr=0
# for ix in train_image_names:
#     if ix == "":
#         continue
#     ctr+=1
#     if ctr%500==0:
#         print(ctr)
#     path = ix
#     img = Image.open(images_path + path)
#     img = img_transform(img)
#     # Add a batch dimension to the tensor
#     img = np.expand_dims(img, axis=0)
#     pred = resnet_model.predict(img).reshape(2048)
#     train_data[ix] = pred
#     # print(img.shape)

In [None]:
# filename = 'cnn_train_features1.pickle'
# file = open(filename, 'wb')
# pkl.dump(train_data,file)

In [30]:
# Load the pre-computed CNN features for the training images
# (indexed by image name in the data loader).
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
filename = '/kaggle/input/features1/cnn_train_features1.pickle'
with open(filename, 'rb') as file:
    trainImg_features = pkl.load(file)

In [31]:
from tensorflow.keras.layers import concatenate, LSTM, Attention, MultiHeadAttention, LayerNormalization

# # Baseline - without attention
# inputs1 = Input(shape=(2048,))
# inputs2 = Input(shape=(max_length,))

# fe1 = Dropout(0.5)(inputs1)
# fe2 = Dense(200, activation='relu')(fe1)
# fe3 = Reshape((1, 200), input_shape=(200,))(fe2)

# se1 = Embedding(vocab_size, 200, mask_zero=False)(inputs2)
# merged = concatenate([fe3, se1], axis = 1)
# se2 = LSTM(200, return_sequences = True)(merged)
# se3 = Dropout(0.5)(se2)

# decoder1 = add([fe2, se3])
# decoder2 = Dense(200, activation='relu')(decoder1)
# outputs = Dense(vocab_size, activation='softmax')(decoder2)

# model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# model.summary()

# Attention-1: the projected image feature is prepended to the embedded caption
# tokens, the joint sequence goes through a Keras Attention layer and three
# stacked LSTMs, and the result is merged with the image feature to predict
# the next word.

# image input -> first path
inputs1 = Input(shape=(2048,))
in1 = Reshape((1, 2048), input_shape=(2048,))(inputs1)  # (batch, 1, 2048): a one-step sequence

feat_l1 = Dropout(0.5)(in1)
feat_l2 = Dense(200, activation = 'relu')(feat_l1)  # Dense acts on the last axis -> (batch, 1, 200)
# NOTE(review): feat_l2 already has the (1, 200) shape, so this Reshape looks
# like a no-op (and its input_shape argument does not match) — confirm.
fe3 = Reshape((1, 200), input_shape=(200,))(feat_l2)


# sequence input -> second path
in2 = Input(shape=(max_length,))
# Embedding initialised from the GloVe matrix and fine-tuned during training.
emb = Embedding(vocab_size, 200, weights=[glove_weights1], trainable=True, mask_zero=False)(in2)
emb = Dense(200, activation = 'relu')(emb)

# Join along the time axis: 1 image step followed by max_length word steps.
comb_l1 = concatenate([fe3, emb], axis = 1)
query = Dense(200, activation = 'relu')(comb_l1)
value = Dense(200, activation = 'relu')(comb_l1)
key = Dense(200, activation = 'relu')(comb_l1)
# Keras Attention takes its inputs in the order [query, value, key].
atte_layer1 = Attention()([query, value, key])

# Three stacked LSTMs; only the last one collapses the sequence to one vector.
seq_l1 = Dropout(0.1)(atte_layer1)
seq_l2 = LSTM(200, return_sequences = True)(seq_l1)

seq_l3 = Dropout(0.1)(seq_l2)
seq_l4 = LSTM(200, return_sequences = True)(seq_l3)

seq_l5 = Dropout(0.1)(seq_l4)
seq_l6 = LSTM(200)(seq_l5)


# Residual-style merge of the flattened image feature with the sequence summary.
comb_l2 = add([Reshape((200, ))(feat_l2), seq_l6])
comb_l3 = Dense(200, activation = 'relu')(comb_l2)

# output: probability distribution over the vocabulary for the next word
output = Dense(vocab_size, activation = 'softmax')(comb_l3)

# build the model (it is compiled in a later cell)
model = Model(inputs = [inputs1, in2], outputs = output)
model.summary()



# # MultiHead Attention
# # Define input layers
# inputs1 = Input(shape=(2048,))
# inputs2 = Input(shape=(max_length,))

# # Define the feature extractor network
# fe1 = Dropout(0.5)(inputs1)
# fe2 = Dense(200, activation='relu')(fe1)
# fe3 = Reshape((1, 200), input_shape=(200,))(fe2)

# # Define the sequence encoder network with attention
# se1 = Embedding(vocab_size, 200, mask_zero=False)(inputs2)
# se2 = LSTM(200, return_sequences=True)(se1)
# attn = MultiHeadAttention(num_heads=8, key_dim=4)(fe3, se2)
# context = concatenate([attn, fe3], axis=-1)
# se3 = LSTM(200)(context)
# se4 = Dropout(0.5)(se3)

# # Define the decoder network
# decoder1 = add([fe2, se4])
# decoder2 = Dense(200, activation='relu')(decoder1)
# outputs = Dense(vocab_size, activation='softmax')(decoder2)

# # Define the model
# model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# model.summary()

# # MultiheadAttention-2
# inputs1 = Input(shape=(2048,))
# inputs2 = Input(shape=(max_length,))

# fe1 = Dropout(0.5)(inputs1)
# fe2 = Dense(200, activation='relu')(fe1)
# fe3 = Reshape((1, 200), input_shape=(200,))(fe2)

# se1 = Embedding(vocab_size, 200, mask_zero=False)(inputs2)
# se2 = LSTM(200, return_sequences=True)(se1)

# # Adding Multi-Head Attention layer
# attn = MultiHeadAttention(num_heads=8, key_dim=64)(se2, se2, se2)
# attn = Dropout(0.5)(attn)
# attn = LayerNormalization(epsilon=1e-6)(attn)

# # Concatenating the context vector and features from CNN
# context = concatenate([attn, fe3], axis=1)
# se3 = LSTM(200)(context)
# se4 = Dropout(0.5)(se3)

# decoder1 = add([fe2, se4])
# decoder2 = Dense(200, activation='relu')(decoder1)
# outputs = Dense(vocab_size, activation='softmax')(decoder2)

# model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 reshape_3 (Reshape)            (None, 1, 2048)      0           ['input_3[0][0]']                
                                                                                                  
 dropout_4 (Dropout)            (None, 1, 2048)      0           ['reshape_3[0][0]']              
                                                                                                  
 input_4 (InputLayer)           [(None, 36)]         0           []                               
                                                                                            

In [None]:
# Load the GloVe-initialised matrix into the embedding layer and keep it trainable.
# The layer is looked up by type: a positional index such as model.layers[5]
# silently points at a different layer as soon as the architecture is edited.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([glove_weights1])
embedding_layer.trainable = True

In [32]:
from tensorflow.keras.optimizers import Adam
# `lr` is a deprecated alias that newer Keras releases reject;
# `learning_rate` is the supported keyword.
optimizer = Adam(learning_rate = 0.0001)

In [33]:
# Next-word prediction is trained as multi-class classification over one-hot targets.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
)

### Data Loader

In [34]:
# We will create batches and pass it for training
def data_loader(descs, imgs, wrd_to_indx, max_len, batch_size, num_classes=None):
    """Endlessly yield training batches of (image, caption-prefix) -> next word.

    Every caption is expanded into one sample per prefix: the words seen so
    far (padded to `max_len`) are the input and the following word, one-hot
    encoded, is the target.

    Parameters
    ----------
    descs : dict
        Maps image name -> list of caption strings.
    imgs : mapping
        Maps image name -> image feature vector.
    wrd_to_indx : dict
        Maps word -> integer index; words not in it are skipped.
    max_len : int
        Length the input prefixes are padded to.
    batch_size : int
        Number of IMAGES (not samples) whose samples form one batch.
    num_classes : int, optional
        Width of the one-hot target. Defaults to len(wrd_to_indx) + 1 (indices
        start at 1, 0 is padding), so the function no longer depends on the
        global `vocab_size1`.

    Yields
    ------
    ([X1, X2], Y)
        X1: image features (one row per sample), X2: padded prefixes,
        Y: one-hot next-word targets.

    Raises
    ------
    ValueError
        If `descs` is empty or `batch_size` < 1; either would otherwise make
        the generator loop forever without yielding.
    """
    if not descs:
        raise ValueError("data_loader needs at least one image description")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if num_classes is None:
        num_classes = len(wrd_to_indx) + 1

    X1, X2, Y = [], [], []
    images_in_batch = 0

    while True:
        for img_name, captions in descs.items():
            img = imgs[img_name]
            for caption in captions:
                # Encode the sentence as word indices.
                seq = [wrd_to_indx[word] for word in caption.split(' ') if word in wrd_to_indx]
                for i in range(1, len(seq)):
                    # Split into an input prefix and the word that follows it.
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_len)[0]
                    out_seq = to_categorical([out_seq], num_classes=num_classes)[0]
                    X1.append(img)
                    X2.append(in_seq)
                    Y.append(out_seq)

            images_in_batch += 1

            if images_in_batch == batch_size:
                yield ([np.array(X1), np.array(X2)], np.array(Y))
                X1, X2, Y = [], [], []
                images_in_batch = 0

In [35]:
# Training configuration. batch_size counts images per batch, so one epoch of
# `steps` batches covers the training images once.
epochs = 15
batch_size = 3
steps = len(train_text) // batch_size

generator = data_loader(
    descs=train_text,
    imgs=trainImg_features,
    wrd_to_indx=wordtoix1,
    max_len=max_length,
    batch_size=batch_size,
)

model.fit(
    generator,
    steps_per_epoch=steps,
    epochs=epochs,
    verbose=1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x79809024d330>

In [None]:
#Removes start and end seq from test captions
def remove_seq(test_captions):
    """Strip the first and last word (the startseq/endseq markers) from each caption.

    The list is modified in place and the same list object is returned, so
    callers that need the original captions must pass a copy.
    """
    for position, caption in enumerate(test_captions):
        inner_words = caption.split()[1:-1]
        test_captions[position] = ' '.join(inner_words)
    return test_captions

In [None]:
def predict_caption(photo):
    """Greedily decode a caption for one image.

    Parameters
    ----------
    photo : numpy.ndarray
        Image feature batch passed to the model as its first input.

    Returns
    -------
    str
        The generated words joined by spaces, without the startseq/endseq
        markers.

    Decoding stops at 'endseq', after `max_length` steps, or when the model
    predicts an index with no word (e.g. the padding index 0, which used to
    raise KeyError).
    """
    words = []
    in_text = 'startseq'
    for _ in range(max_length):
        # Named token_ids rather than `sequence` to avoid shadowing the
        # imported keras `sequence` module.
        token_ids = [wordtoix[w] for w in in_text.split() if w in wordtoix]
        padded = pad_sequences([token_ids], maxlen=max_length)

        yhat = model.predict([photo, padded], verbose=0)

        word = ixtoword.get(int(np.argmax(yhat)))
        if word is None or word == 'endseq':
            break
        words.append(word)
        in_text += ' ' + word

    # Words are collected explicitly instead of slicing in_text[1:-1]: the
    # slice dropped a genuine last word whenever 'endseq' was never produced.
    return ' '.join(words)

In [36]:
# Persist the trained captioning model to an HDF5 file in the working directory.
model.save('keras_attention_true.h5')

In [None]:
# import torch
# torch.save(model, 'model_keras.h5' )

In [None]:
# import pickle as pkl
# filename = 'model_keras1.pickle'
# file = open(filename, 'wb')
# pkl.dump(model,file)

### Inference

In [None]:
# Pick one test image and turn it into a single-image 224x224 RGB batch.
# NOTE(review): this cell converts BGR -> RGB, while img_preprocess() (used for
# evaluation) passes the array from cv2.imread on unchanged — confirm which
# channel order the cached training features were computed with.
image_name = test_image_names[23]
img = cv2.cvtColor(cv2.imread(images_path + image_name), cv2.COLOR_BGR2RGB)
img = np.expand_dims(cv2.resize(img, (224, 224)), axis=0)

In [None]:
# Extract the CNN feature vector for the selected image as a (1, 2048) batch.
with tensorflow.device('gpu'):
    pred = np.reshape(resnet_model.predict(img), (1, 2048))

In [None]:
# Show the selected test image ...
x = plt.imread(f"{images_path}{image_name}")
plt.imshow(x)
plt.show()

# ... and print the caption decoded from its CNN features.
prediction = predict_caption(pred)
print(prediction)

### Evaluation with BLEU scores

In [None]:
def evaluate_model(img_list):
    """Caption every image in `img_list` and score it with sentence-level BLEU.

    Parameters
    ----------
    img_list : list of str
        Image names; each must be a key of the module-level `test_text`.

    Returns
    -------
    (scores, preds)
        scores: one BLEU score per image; preds: the generated caption
        strings, in the same order as `img_list`.
    """
    scores = []
    preds = []
    for image_name in tqdm(img_list):
        img = img_preprocess(image_name)
        # verbose=0 keeps Keras from printing a progress bar per image.
        features = resnet_model.predict(img, verbose=0).reshape(1, 2048)

        prediction = predict_caption(features)
        preds.append(prediction)

        # remove_seq mutates its argument, so work on a copy.
        reference_captions = remove_seq(test_text[image_name].copy())

        # sentence_bleu expects token lists. Passing raw strings makes NLTK
        # iterate over characters and yields a character-level score.
        reference_tokens = [caption.split() for caption in reference_captions]
        scores.append(sentence_bleu(reference_tokens, prediction.split()))
    return scores, preds

In [None]:
# Caption and score the whole test split; g_predictions is reused by the
# corpus-level BLEU cell below.
g_scores, g_predictions = evaluate_model(test_image_names)

In [None]:
# Average sentence-level BLEU over the test images.
np.mean(g_scores)

### Meteor Scores

In [None]:
from nltk.translate.meteor_score import meteor_score

# Score every test image's generated caption against its reference captions
# with METEOR.
scores_list = []
test_images_list = test_image_names.copy()
#test_images_list = random.sample(test_images_list, 100)

for img_name in tqdm(test_images_list):
    img = img_preprocess(img_name)
    features = resnet_model.predict(img, verbose = 0).reshape(1,2048)

    # The original called `algo(...)`, a name that is never defined in this
    # notebook (NameError on a fresh kernel); predict_caption is the decoder
    # defined above.
    pred = predict_caption(features)

    # remove_seq mutates its argument, so work on a copy.
    reference = remove_seq(test_text[img_name].copy())

    # meteor_score takes pre-tokenised references and hypothesis.
    # The per-image debug print and two unused locals were dropped.
    score = meteor_score([x.split() for x in reference], pred.split())
    scores_list.append(score)

In [None]:
# Average METEOR score over the test images.
np.mean(scores_list)

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Corpus-level BLEU-1..4 over the test split.
# `g_predictions` comes from evaluate_model(test_image_names), so it is in the
# same order as test_image_names.
references = []
for image_name in tqdm(test_image_names):
    # remove_seq mutates its argument, so work on a copy.
    captions = remove_seq(test_text[image_name].copy())
    references.append([caption.split() for caption in captions])

predictions_words = [prediction.split() for prediction in g_predictions]

bleu_1 = corpus_bleu(references, predictions_words, weights=(1.0, 0, 0, 0))
bleu_2 = corpus_bleu(references, predictions_words, weights=(0.5, 0.5, 0, 0))
# Exact thirds: (0.33, 0.33, 0.33) sums to 0.99 and slightly inflates BLEU-3.
bleu_3 = corpus_bleu(references, predictions_words, weights=(1 / 3, 1 / 3, 1 / 3, 0))
bleu_4 = corpus_bleu(references, predictions_words, weights=(0.25, 0.25, 0.25, 0.25))

print("BLEU-1: {:.4f}".format(bleu_1))
print("BLEU-2: {:.4f}".format(bleu_2))
print("BLEU-3: {:.4f}".format(bleu_3))
print("BLEU-4: {:.4f}".format(bleu_4))

In [None]:
# def predict_beam_search(image, beam_index = 3):
#     start = [wordtoix["startseq"]]
#     start_word = [[start, 0.0]]
#     while len(start_word[0][0]) < max_length:
#         temp = []
#         for s in start_word:
#             par_caps = pad_sequences([s[0]], maxlen=max_length, padding='post')
#             preds = model.predict([image,par_caps], verbose=0)
#             word_preds = np.argsort(preds[0])[-beam_index:]
#             # Getting the top <beam_index>(n) predictions and creating a 
#             # new list so as to put them via the model again
#             for w in word_preds:
#                 next_cap, prob = s[0][:], s[1]
#                 next_cap.append(w)
#                 prob += preds[0][w]
#                 temp.append([next_cap, prob])
                    
#         start_word = temp
#         # Sorting according to the probabilities
#         start_word = sorted(start_word, reverse=False, key=lambda l: l[1])
#         # Getting the top words
#         start_word = start_word[-beam_index:]
    
#     start_word = start_word[-1][0]
#     intermediate_caption = [ixtoword[i] for i in start_word]
#     final_caption = []
    
#     for i in intermediate_caption:
#         if i != 'endseq':
#             final_caption.append(i)
#         else:
#             break

#     final_caption = ' '.join(final_caption[1:])
#     return final_caption