In [2]:
import os
import pickle
import re

import numpy as np
import keras
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16  import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add


In [3]:
# Root of the read-only input dataset; later cells read 'captions.txt' and the image folder from here.
Base_Dir = '/kaggle/input/flickr30k'
# Output directory; later cells write 'features.pkl' and 'best_model.h5' here.
Working_Dir = '/kaggle/working'

Extract Image Features


In [4]:
# Load VGG16 with its default constructor arguments (the cell output shows a weights download).
model = VGG16()
# Re-wire the network so it ends at its second-to-last layer: predict() then
# returns that layer's activations instead of the final layer's output.
# NOTE: the name `model` is rebound to the captioning network in a later cell,
# so the feature-extraction cell must be run before that one.
model =  Model(inputs=model.inputs, outputs = model.layers[-2].output)
# summary() prints the table itself; wrapping it in print() also printed its None return value.
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     14758

In [None]:
# Run every file in the image folder through `model` and keep the prediction,
# keyed by the file name up to its first dot.
features = {}
directory = os.path.join(Base_Dir, 'flickr30k_images')
for img_name in tqdm(os.listdir(directory)):
  # load the picture resized to 224x224 and turn it into an array
  pixels = img_to_array(load_img(os.path.join(directory, img_name), target_size = (224, 224)))
  # add a leading batch axis of size 1, then apply the VGG16 input preprocessing
  batch = preprocess_input(pixels.reshape((1,) + pixels.shape))
  # image ID = file name up to the first '.', the same rule the captions cell uses
  features[img_name.split('.')[0]] = model.predict(batch, verbose = 0)

  0%|          | 0/31783 [00:00<?, ?it/s]

In [None]:
#stores features in pickle
# The `with` block closes the file as soon as the dump finishes, so the data is
# fully flushed to disk before the next cell reads it back (the bare open() left
# the handle open until garbage collection).
with open(os.path.join(Working_Dir, 'features.pkl'), 'wb') as f:
    pickle.dump(features, f)


In [None]:
# Read back the feature dictionary written by the previous cell.
# (pickle.load executes whatever the file encodes - only use it on files this notebook produced.)
features_file = os.path.join(Working_Dir, 'features.pkl')
with open(features_file, 'rb') as handle:
    features = pickle.load(handle)

In [None]:
# Read the captions file into one string, minus its first line.
captions_file = os.path.join(Base_Dir, 'captions.txt')
with open(captions_file, 'r') as handle:
    # discard the first line (presumably a column header - confirm against the file)
    next(handle)
    captions_doc = handle.read()

In [None]:
# Build {image_id: [caption, ...]} from the raw captions text.
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # ignore empty / one-character lines
    if len(line) < 2:
        continue
    # first comma-separated field is the file name; everything after it is caption text
    file_name, *caption_parts = line.split(',')
    # image ID = file name up to the first '.'
    image_id = file_name.split('.')[0]
    # re-join the caption pieces with spaces (commas inside a caption become spaces)
    mapping.setdefault(image_id, []).append(" ".join(caption_parts))
    
    
    

In [None]:
# number of distinct image IDs that received at least one caption
len(mapping)

In [None]:
def _clean_caption(caption):
    """Return one caption lower-cased, reduced to letters, and wrapped in tags."""
    text = caption.lower()
    # Drop tags left by an earlier clean() run so re-running the cell does not stack them.
    if text.startswith('<start>'):
        text = text[len('<start>'):]
    if text.endswith('<end>'):
        text = text[:-len('<end>')]
    # delete digits, special chars, etc. str.replace() treats its argument as a
    # literal, so the pattern needs re.sub; whitespace is kept so words stay apart.
    text = re.sub(r'[^a-z\s]', '', text)
    # split() already collapses runs of whitespace; keep words longer than one character
    words = [word for word in text.split() if len(word) > 1]
    # tags are separated by spaces so they are tokens of their own, not glued to a word
    return " ".join(['<start>', *words, '<end>'])


def clean(mapping):
    """Normalise every caption stored in ``mapping``, in place.

    Each caption is lower-cased, stripped of everything except ASCII letters
    and whitespace, reduced to its words longer than one character, and
    wrapped as ``'<start> word word ... <end>'``.

    Args:
        mapping: dict of image ID -> list of caption strings; the lists are
            modified in place.

    Returns:
        None. (Deliberately not the mapping, so calling it as the last line of
        a cell does not echo the whole dictionary.)
    """
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            captions[i] = _clean_caption(caption)
            
            
            
            

In [None]:
# captions of one sample image before clean() has been applied
mapping['1000344755']

In [None]:
# preprocess the text: clean() rewrites the caption lists inside `mapping` in place
clean(mapping)


In [None]:
# captions of a sample image after cleaning (note: not the same image ID as the "before" sample)
mapping['1088385559']

In [None]:
# flatten the per-image caption lists into one list, keeping mapping order
all_captions = [
    caption
    for image_captions in mapping.values()
    for caption in image_captions
]

In [None]:
# total number of captions across all images
len(all_captions)

In [None]:
# peek at the first ten captions only, rather than dumping the whole list
all_captions[:10]

In [None]:
#tokenize the text
# NOTE(review): Tokenizer() is built with its default settings; check in the Keras
# docs whether the default `filters` strip '<' and '>' from the start/end tags.
tokenizer = Tokenizer()
# build the word -> integer index table from every caption
tokenizer.fit_on_texts(all_captions)
# one more than the number of distinct words (presumably because word indices
# start at 1, leaving 0 for padding - confirm)
vocab_size = len(tokenizer.word_index) + 1


In [None]:
# len(tokenizer.word_index) + 1, as computed in the previous cell
vocab_size

Above is the vocabulary size: the number of unique words plus one (index 0 is not assigned to any word).

In [None]:
# Longest caption, measured in tokenizer tokens. data_generator pads and
# truncates the sequences produced by tokenizer.texts_to_sequences() to this
# length, so it must be counted in the same unit. Counting whitespace-separated
# words can give a smaller number whenever the tokenizer splits a "word" into
# several tokens, which would cut the start of the longest captions.
max_length = max(len(seq) for seq in tokenizer.texts_to_sequences(all_captions))
max_length

Train Test Split

In [None]:
# First 90% of the image IDs (in mapping order, no shuffling) train the model; the rest are held out.
image_ids = list(mapping)
split = int(0.90 * len(image_ids))
train, test = image_ids[:split], image_ids[split:]


In [None]:
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches built from ``batch_size`` images each.

    For every caption of every image, each proper prefix of the caption's
    token sequence becomes one sample: the image feature vector and the
    prefix (padded to ``max_length``) are the inputs, and the token that
    follows the prefix, one-hot encoded over ``vocab_size`` classes, is the
    target.

    Yields:
        ``[image_features, padded_prefixes], one_hot_targets`` as numpy
        arrays, once per ``batch_size`` images. The loop over ``data_keys``
        restarts forever; images left over at the end of a pass are carried
        into the first batch of the next pass.
    """
    image_rows, prefix_rows, target_rows = [], [], []
    images_in_batch = 0
    while True:
        for key in data_keys:
            images_in_batch += 1
            for caption in mapping[key]:
                token_ids = tokenizer.texts_to_sequences([caption])[0]
                # cut the caption after 1, 2, ... tokens; the next token is the label
                for cut in range(1, len(token_ids)):
                    prefix = pad_sequences([token_ids[:cut]], maxlen = max_length)[0]
                    target = to_categorical([token_ids[cut]], num_classes = vocab_size)[0]
                    image_rows.append(features[key][0])
                    prefix_rows.append(prefix)
                    target_rows.append(target)
            if images_in_batch == batch_size:
                yield [np.array(image_rows), np.array(prefix_rows)], np.array(target_rows)
                # start collecting the next batch
                image_rows, prefix_rows, target_rows = [], [], []
                images_in_batch = 0
                

Model Creation

In [None]:
#encoder model
#image feature layers
# one 4096-value vector per image (presumably the size of the vectors stored in `features` - confirm)
inputs1 = Input(shape=(4096,))
# dropout on the raw image features, then a 256-unit projection
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
#sequence feature layers
# a caption prefix given as `max_length` token ids
inputs2 = Input(shape=(max_length,))
# 256-dimensional embedding per token id; mask_zero=True marks id 0 as masked for downstream layers
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
# the LSTM reduces the embedded sequence to a single 256-value vector
se3 = LSTM(256)(se2)
#decoder model
# both branches are 256 wide, so they can be summed element-wise
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
# one softmax score per vocabulary index: the predicted next token
outputs = Dense(vocab_size, activation = 'softmax')(decoder2)

# NOTE: this rebinds `model`, which until now held the VGG16 feature extractor;
# re-running the feature-extraction cell after this one would call this model instead.
model = Model(inputs = [inputs1, inputs2], outputs = outputs)
# categorical cross-entropy pairs with the one-hot targets produced by data_generator
model.compile(loss='categorical_crossentropy', optimizer = 'adam')
#plot the model
plot_model(model, show_shapes = True)

Train the model

In [None]:
epochs = 15
batch_size = 2
# data_generator yields one batch per `batch_size` images, so a single pass over
# `train` is len(train) // batch_size batches. Using len(train) here made every
# "epoch" run `batch_size` passes over the data. max(1, ...) keeps at least one step.
steps = max(1, len(train) // batch_size)
for epoch in range(epochs):
    # create a fresh data generator so every epoch starts from the first image
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    # fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch = steps, verbose = 1)
    

In [None]:
# persist the trained captioning model next to the cached features
model.save(os.path.join(Working_Dir, 'best_model.h5'))

Generate Captions for the Image

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the word whose tokenizer index equals ``integer``.

    Args:
        integer: index to look up (anything that compares equal to an int).
        tokenizer: object exposing a ``word_index`` dict of word -> index.

    Returns:
        The matching word, or None when no word has that index.
    """
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    # Only give up after the whole vocabulary has been checked; returning from
    # inside the loop stopped the search after the first entry.
    return None

In [None]:
#generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for one image.

    Starting from ``'<start>'``, the text so far is tokenized, padded to
    ``max_length`` and fed to ``model`` together with ``image``; the
    highest-scoring index is appended as a word. Decoding stops after
    ``max_length`` steps, when the index has no word, or when the end tag is
    predicted.

    Args:
        model: captioning model taking ``[image, sequence]``.
        image: feature array for one image, passed to ``model.predict`` as is.
        tokenizer: fitted tokenizer used for both encoding and decoding.
        max_length: padded sequence length and maximum number of steps.

    Returns:
        The generated text, beginning with ``'<start>'`` and ending with
        ``'<end>'`` when the end tag was predicted.
    """
    # Ask the tokenizer how it encodes the end tag instead of comparing words to
    # the literal '<end>': a tokenizer that strips '<' and '>' stores the tag
    # under another spelling, so the literal comparison never matched and every
    # caption ran to max_length.
    end_ids = set(tokenizer.texts_to_sequences(['<end>'])[0])
    in_text = '<start>'
    for _step in range(max_length):
        #encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence], max_length)
        # predict next word
        yhat = model.predict([image, sequence], verbose = 0)
        #get index with highest probability
        yhat = int(np.argmax(yhat))
        #stop if we reach end tag
        if yhat in end_ids:
            in_text += " " + '<end>'
            break
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        # stop if word not found
        if word is None:
            break
        in_text += " " + word
    return in_text

In [None]:
from nltk.translate.bleu_score import corpus_bleu
# per test image: the list of tokenized reference captions, and the tokenized prediction
actual, predicted = list(), list()

for key in tqdm(test):
    # get actual caption
    captions = mapping[key]
    #predict the caption for image
    y_pred = predict_caption(model, features[key], tokenizer, max_length)
    #split into words
    actual_captions = [caption.split() for caption in captions]
    y_pred = y_pred.split()
    # append to the list
    actual.append(actual_captions)
    predicted.append(y_pred)

# calculate BLEU score once, over the whole test set. Scoring inside the loop
# re-evaluated every earlier image on each iteration and printed two lines per image.
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights = (1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights = (0.5, 0.5, 0, 0)))
    

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    """Print the reference and predicted captions for one image and show it.

    Args:
        image_name: file name of the image (e.g. ``'<id>.jpg'``); the part
            before the first '.' is used as the key into ``mapping`` and
            ``features``.
    """
    image_id = image_name.split('.')[0]
    # Read from the same folder the feature-extraction cell listed
    # ('flickr30k_images'); the previous 'Images' folder name did not match it.
    img_path = os.path.join(Base_Dir, 'flickr30k_images', image_name)
    image = Image.open(img_path)
    captions = mapping[image_id]
    print('----Actual----')
    for caption in captions:
        print(caption)
    # predict the caption
    y_pred = predict_caption(model, features[image_id], tokenizer, max_length)
    print('----Predicted-----')
    print(y_pred)
    plt.imshow(image)