In [1]:
# Importing modules
import os
import pickle  # for storing numpy features
import re

import numpy
from tqdm.notebook import tqdm # gives UI for how much data is stored till now

from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import load_img,to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [2]:
# dataset root: the 'Images' folder and 'captions.txt' are read from here
BASE_DIR = '/kaggle/input/flickr8k'
# output directory: features.pkl and the trained model are written here
WORKING_DIR = '/kaggle/working'

## Extracting Image features


In [3]:
# load vgg16 model
model = VGG16()

# restructuring the model: keep the same inputs but expose the output of the
# second-to-last layer, so the network returns a feature vector
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output
model.summary()

2023-01-27 11:06:43.954581: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-27 11:06:44.061090: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-27 11:06:44.061875: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-27 11:06:44.063000: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
    16384/553467096 [..............................] - ETA: 3s

  4202496/553467096 [..............................] - ETA: 12s

 12533760/553467096 [..............................] - ETA: 6s 

 22945792/553467096 [>.............................] - ETA: 4s

 34963456/553467096 [>.............................] - ETA: 3s

 47079424/553467096 [=>............................] - ETA: 3s

 58728448/553467096 [==>...........................] - ETA: 2s

 70647808/553467096 [==>...........................] - ETA: 2s

 82608128/553467096 [===>..........................] - ETA: 2s

 94593024/553467096 [====>.........................] - ETA: 2s

100671488/553467096 [====>.........................] - ETA: 2s

109977600/553467096 [====>.........................] - ETA: 2s

122003456/553467096 [=====>........................] - ETA: 2s

























































































Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

In [4]:
# EXTRACTING FEATURES FROM IMAGE
# NOTE: `model` must still be the VGG16 feature extractor when this cell runs;
# the captioning model built later in this notebook rebinds the same name.
features = {}
directory = os.path.join(BASE_DIR, 'Images')

for img_name in tqdm(os.listdir(directory)):
    # loading the image from file, resized to the 224x224 input the network expects
    img_path = os.path.join(directory, img_name)
    image = load_img(img_path, target_size=(224, 224))
    # converting image pixels to numpy array
    image = img_to_array(image)
    # adding a leading batch axis of size 1 for the model
    image = numpy.expand_dims(image, axis=0)
    # preprocessing image for vgg
    image = preprocess_input(image)
    # extracting features
    feature = model.predict(image, verbose=0)
    # image ID is the file name up to its first '.', the same rule the
    # caption-mapping cell uses, so the two dictionaries share keys
    image_id = img_name.split('.')[0]
    # storing feature
    features[image_id] = feature

  0%|          | 0/8091 [00:00<?, ?it/s]

2023-01-27 11:06:51.092881: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


2023-01-27 11:06:52.184433: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


In [None]:
# storing features in pickle
# the context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb') as f:
    pickle.dump(features, f)

In [None]:
# loading features from pickle
# NOTE: pickle.load can execute arbitrary code from the file; only load a
# features.pkl that was produced by this notebook
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'rb') as f:
    features = pickle.load(f)

## Loading the Captions Data

In [None]:
# read the whole captions file into a single string
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    # skip the first line (presumably a column header -- confirm against the file)
    next(f)
    captions_doc = f.read()

In [None]:
# build {image_id: [caption, ...]} from the raw captions text
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # ignore blank / one-character lines
    if len(line) < 2:
        continue
    # first comma-separated field is the image file name, the rest is the caption
    image_file, *caption_parts = line.split(',')
    # image ID is the file name up to its first '.'
    image_id = image_file.split('.')[0]
    # commas inside the caption were split too; re-join the pieces with spaces
    mapping.setdefault(image_id, []).append(" ".join(caption_parts))

In [None]:
# number of distinct image IDs that have captions
len(mapping)

In [None]:
# number of images with extracted features (compare with len(mapping))
len(features)

## Preprocessing Text Data

In [None]:
def clean(mapping):
    """Normalise every caption in ``mapping`` in place.

    ``mapping`` is a dict of ``image_id -> list of caption strings``. Each
    caption is lowercased, stripped of every character that is not a letter
    or whitespace, has runs of whitespace collapsed, loses its one-character
    words, and is wrapped in ``startseq`` / ``endseq`` tags.

    The lists inside ``mapping`` are modified directly; nothing is returned.
    Calling this twice on the same mapping wraps the captions in a second
    pair of tags.
    """
    for key, captions in mapping.items():
        for i, caption in enumerate(captions):
            # converting to lowercase
            caption = caption.lower()
            # deleting digits, special chars, etc. This must be a regex
            # substitution (str.replace would look for the literal pattern
            # text), and whitespace is kept so the words stay separated.
            caption = re.sub(r'[^a-z\s]', '', caption)
            # deleting additional spaces
            caption = re.sub(r'\s+', ' ', caption)
            # dropping one-character words, then adding start and end tags
            words = [word for word in caption.split() if len(word) > 1]
            captions[i] = 'startseq ' + " ".join(words) + ' endseq'

In [None]:
# preprocess the text
# NOTE: clean() edits `mapping` in place and is not idempotent -- re-running
# this cell adds another 'startseq' / 'endseq' pair to every caption
clean(mapping)

In [None]:
# flatten the per-image caption lists into a single list, in mapping order
all_captions = [caption for captions in mapping.values() for caption in captions]

In [None]:
# total number of captions across all images
len(all_captions)

In [None]:
# tokenize the text
tokenizer = Tokenizer()
# build the word -> integer index vocabulary from all cleaned captions
tokenizer.fit_on_texts(all_captions)
# +1 leaves room for index 0, which the padded sequences use as filler
# (the embedding layer is created with mask_zero=True)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# vocabulary size, including the extra slot added above
vocab_size

In [None]:
# get maximum length of the caption available
# length is counted in whitespace-separated words, so it includes the
# 'startseq' / 'endseq' tags added by clean()
max_length = max(len(caption.split()) for caption in all_captions)
max_length

In [None]:
# 90% / 10% train/test split over image IDs
# NOTE: there is no shuffling -- IDs are taken in the order they were inserted
# into `mapping`, so the test set is simply the last 10% of that order
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.90)
train = image_ids[:split]
test = image_ids[split:]

In [None]:
# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield ``([image_features, padded_prefixes], next_word_one_hot)``.

    For every image key, each caption is encoded with ``tokenizer`` and expanded
    into one training sample per prefix: the first ``i`` tokens (padded to
    ``max_length``) are the text input and token ``i`` (one-hot over
    ``vocab_size``) is the target. The image input for each sample is
    ``features[key][0]``.

    ``batch_size`` counts images, not samples: a batch is yielded after
    ``batch_size`` keys have been processed and holds all samples of all their
    captions. Keys left over at the end of ``data_keys`` are not yielded on
    their own; they are carried into the next pass over ``data_keys``.
    """
    image_inputs, text_inputs, targets = [], [], []
    images_seen = 0
    while True:
        for key in data_keys:
            images_seen += 1
            for caption in mapping[key]:
                # integer-encode the caption
                token_ids = tokenizer.texts_to_sequences([caption])[0]
                # one (prefix -> next token) sample per position after the first
                for cut in range(1, len(token_ids)):
                    prefix = pad_sequences([token_ids[:cut]], maxlen=max_length)[0]
                    next_token = to_categorical([token_ids[cut]], num_classes=vocab_size)[0]

                    image_inputs.append(features[key][0])
                    text_inputs.append(prefix)
                    targets.append(next_token)
            if images_seen == batch_size:
                yield [numpy.array(image_inputs), numpy.array(text_inputs)], numpy.array(targets)
                # start collecting the next batch from scratch
                image_inputs, text_inputs, targets = [], [], []
                images_seen = 0

## Model Creation

In [None]:
# Encoder Model
# image feature layers
# 4096-wide input: one image feature vector per sample (data_generator feeds features[key][0])
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layers
# one padded caption prefix of max_length token indices per sample
inputs2 = Input(shape=(max_length,))
# mask_zero=True: index 0 (the padding value) is treated as "no token"
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# Decoder Model
# both branches end in 256 units, so they can be summed element-wise
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
# probability distribution over the vocabulary for the next word
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE: this rebinds `model`, which previously held the VGG16 feature extractor;
# the feature-extraction cell must not be re-run after this point without
# re-creating that extractor first
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# plotting the model (the captioning model defined above), with layer shapes shown
plot_model(model, show_shapes=True)

In [None]:
# Training the model
epochs= 30
# batch_size counts images per batch (see data_generator), not training samples
batch_size = 32
# number of generator batches that make up one pass over the training IDs;
# the integer division drops any final partial batch
steps = len(train)// batch_size

for i in range(epochs):
    # creating data generator
    # a fresh generator each epoch restarts iteration from the first training ID
    generator = data_generator(train,mapping,features,tokenizer,max_length,vocab_size,batch_size)
    # fit for one epoch
    model.fit(generator, epochs=1,steps_per_epoch=steps,verbose=1)

In [None]:
# saving the model
# path built with os.path.join, like the other reads/writes in this notebook;
# the file name itself is unchanged
model.save(os.path.join(WORKING_DIR, 'best_mode.h5'))

## Generating Captions for the image

In [None]:
# converting indices to words
def idx_to_word(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    The vocabulary is scanned in its own order and the first match wins.
    Returns None when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

In [None]:
# generating captions for an image
def predict_caption(model, image, tokenizer, max_length):
    """Greedily generate a caption for one image.

    Starting from ``'startseq'``, the current text is encoded with
    ``tokenizer``, padded to ``max_length`` and passed to ``model`` together
    with ``image``; the highest-scoring index is mapped back to a word and
    appended. Generation stops after ``max_length`` steps, when the predicted
    index has no word, or once ``'endseq'`` has been appended.

    Returns the generated text, including the ``startseq`` tag and, if it was
    produced, the ``endseq`` tag.
    """
    caption = 'startseq'
    for _ in range(max_length):
        # encode and pad the text generated so far
        encoded = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([encoded], max_length)
        # score every vocabulary entry and keep the most likely index
        scores = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(numpy.argmax(scores), tokenizer)
        # no word for this index: nothing more to add
        if next_word is None:
            break
        caption = caption + " " + next_word
        # the end tag closes the caption
        if next_word == 'endseq':
            break

    return caption

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# validating with the test data: collect references and predictions per image
actual, predicted = [], []

for key in tqdm(test):
    # reference captions for this image
    references = mapping[key]
    # model's caption for this image, as a list of words
    hypothesis = predict_caption(model, features[key], tokenizer, max_length).split()
    # corpus_bleu takes, per image, a list of tokenised references and one tokenised hypothesis
    actual.append([reference.split() for reference in references])
    predicted.append(hypothesis)

# Calculating BLEU score
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))

## Visualizing the Results

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

def generate_caption(image_name):
    """Print the reference and predicted captions for ``image_name`` and show the image.

    ``image_name`` is a file name inside ``BASE_DIR/Images`` (for example
    ``"1001773457_577c3a7d70.jpg"``); the part before its first '.' is used as
    the key into ``mapping`` and ``features``.
    """
    image_id = image_name.split('.')[0]
    image = Image.open(os.path.join(BASE_DIR, "Images", image_name))
    references = mapping[image_id]
    print('---------------------Actual---------------------')
    for reference in references:
        print(reference)
    # caption produced by the trained model from the stored image features
    y_pred = predict_caption(model, features[image_id], tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(y_pred)
    plt.imshow(image)

In [None]:
# sample 1: print reference vs. predicted captions and show the image
generate_caption("1001773457_577c3a7d70.jpg")

In [None]:
# sample 2: print reference vs. predicted captions and show the image
generate_caption("1002674143_1b742ab4b8.jpg")

In [None]:
# sample 3: print reference vs. predicted captions and show the image
generate_caption("101669240_b2d3e7f17b.jpg")