Dataset link

https://www.kaggle.com/datasets/adityajn105/flickr8k

In [176]:
!kaggle datasets download -d adityajn105/flickr8k

'kaggle' is not recognized as an internal or external command,
operable program or batch file.


In [177]:
import os
import pickle
import numpy as np
import pandas as pd
import re
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical, plot_model, image_dataset_from_directory
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [178]:
## Extracting Image Features using VGG16
vgg_model = VGG16(weights='imagenet')

In [179]:
vgg_model.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [180]:
data = pd.read_csv('D:\image_captioning\captions.txt')

In [181]:
data

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...
...,...,...
40450,997722733_0cb5439472.jpg,A man in a pink shirt climbs a rock face
40451,997722733_0cb5439472.jpg,A man is rock climbing high in the air .
40452,997722733_0cb5439472.jpg,A person in a red shirt climbing up a rock fac...
40453,997722733_0cb5439472.jpg,A rock climber in a red shirt .


------------------

In [182]:
## Load the Captions Data
captions_data_file = open("D:\image_captioning\captions.txt")
captions_data = captions_data_file.readlines()
captions_data_file.close()

In [183]:
captions_data[:10]

['image,caption\n',
 '1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .\n',
 '1000268201_693b08cb0e.jpg,A girl going into a wooden building .\n',
 '1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .\n',
 '1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .\n',
 '1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .\n',
 '1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting\n',
 '1001773457_577c3a7d70.jpg,A black dog and a tri-colored dog playing with each other on the road .\n',
 '1001773457_577c3a7d70.jpg,A black dog and a white dog with brown spots are staring at each other in the street .\n',
 '1001773457_577c3a7d70.jpg,Two dogs of different breeds looking at each other on the road .\n']

In [184]:
file_types = []
for line in captions_data:
    image_formats = ['.jpg','.png','.bmp']
    for img_f in image_formats:
        if img_f in line and img_f not in file_types:
            file_types.append(img_f)

In [185]:
file_types

['.jpg']

In [186]:
image_captions = {}
for line in captions_data:
    line = line.strip()
    for file_type in file_types:
        if file_type in line:
            image_file, caption = line.split(file_type)
            caption = caption.replace(".\n",'').strip()
            caption = re.sub(r"['\",-]", '', caption)
            if caption.startswith(","):
                caption = caption[1:]
            if caption.endswith(" ."):
                caption = caption[:-2]
            if image_file+file_type in image_captions:
                image_captions[image_file+file_type].append("startseq "+caption.lower()+" endseq")
            else:
                image_captions[image_file+file_type] = ["startseq "+caption.lower()+" endseq"]

In [187]:
image_captions

{'1000268201_693b08cb0e.jpg': ['startseq a child in a pink dress is climbing up a set of stairs in an entry way endseq',
  'startseq a girl going into a wooden building endseq',
  'startseq a little girl climbing into a wooden playhouse endseq',
  'startseq a little girl climbing the stairs to her playhouse endseq',
  'startseq a little girl in a pink dress going into a wooden cabin endseq'],
 '1001773457_577c3a7d70.jpg': ['startseq a black dog and a spotted dog are fighting endseq',
  'startseq a black dog and a tricolored dog playing with each other on the road endseq',
  'startseq a black dog and a white dog with brown spots are staring at each other in the street endseq',
  'startseq two dogs of different breeds looking at each other on the road endseq',
  'startseq two dogs on pavement moving toward each other endseq'],
 '1002674143_1b742ab4b8.jpg': ['startseq a little girl covered in paint sits in front of a painted rainbow with her hands in a bowl endseq',
  'startseq a little g

In [188]:
len(image_captions)

8091

In [189]:
len(os.listdir('D:\image_captioning\Images'))

8091

In [190]:
image_captions.values()



In [191]:
total_captions = []
for caption in image_captions.values():
    total_captions.extend(caption)

In [192]:
total_captions

['startseq a child in a pink dress is climbing up a set of stairs in an entry way endseq',
 'startseq a girl going into a wooden building endseq',
 'startseq a little girl climbing into a wooden playhouse endseq',
 'startseq a little girl climbing the stairs to her playhouse endseq',
 'startseq a little girl in a pink dress going into a wooden cabin endseq',
 'startseq a black dog and a spotted dog are fighting endseq',
 'startseq a black dog and a tricolored dog playing with each other on the road endseq',
 'startseq a black dog and a white dog with brown spots are staring at each other in the street endseq',
 'startseq two dogs of different breeds looking at each other on the road endseq',
 'startseq two dogs on pavement moving toward each other endseq',
 'startseq a little girl covered in paint sits in front of a painted rainbow with her hands in a bowl endseq',
 'startseq a little girl is sitting in front of a large painted rainbow endseq',
 'startseq a small girl in the grass play

In [193]:
caption_tokenizer = Tokenizer()
caption_tokenizer.fit_on_texts(total_captions)

In [194]:
vocab_size = len(caption_tokenizer.index_word)
vocab_size

8835

In [195]:
max_words_length = max(len(caption.split()) for caption in total_captions)

In [196]:
max_words_length

38

In [197]:
image_data_training = image_dataset_from_directory("D:\image_captioning\Images/",
                                          validation_split = 0.2,
                                          subset = 'training',
                                          batch_size=32,
                                          seed=77,
                                          label_mode = None,
                                          image_size=(224,224),
                                          shuffle=False
                                         )
image_data_testing = image_dataset_from_directory("D:\image_captioning\Images/",
                                          validation_split = 0.2,
                                          subset = 'validation',
                                          batch_size=32,
                                          seed=77,
                                          label_mode = None,
                                          image_size=(224,224),
                                          shuffle=False
                                         )

Found 8091 files belonging to 1 classes.
Using 6473 files for training.
Found 8091 files belonging to 1 classes.
Using 1618 files for validation.


In [198]:
image_data_training.file_paths

['D:\\image_captioning\\Images/1000268201_693b08cb0e.jpg',
 'D:\\image_captioning\\Images/1001773457_577c3a7d70.jpg',
 'D:\\image_captioning\\Images/1002674143_1b742ab4b8.jpg',
 'D:\\image_captioning\\Images/1003163366_44323f5815.jpg',
 'D:\\image_captioning\\Images/1007129816_e794419615.jpg',
 'D:\\image_captioning\\Images/1007320043_627395c3d8.jpg',
 'D:\\image_captioning\\Images/1009434119_febe49276a.jpg',
 'D:\\image_captioning\\Images/1012212859_01547e3f17.jpg',
 'D:\\image_captioning\\Images/1015118661_980735411b.jpg',
 'D:\\image_captioning\\Images/1015584366_dfcec3c85a.jpg',
 'D:\\image_captioning\\Images/101654506_8eb26cfb60.jpg',
 'D:\\image_captioning\\Images/101669240_b2d3e7f17b.jpg',
 'D:\\image_captioning\\Images/1016887272_03199f49c4.jpg',
 'D:\\image_captioning\\Images/1019077836_6fc9b15408.jpg',
 'D:\\image_captioning\\Images/1019604187_d087bf9a5f.jpg',
 'D:\\image_captioning\\Images/1020651753_06077ec457.jpg',
 'D:\\image_captioning\\Images/1022454332_6af2c1449a.jpg',

In [199]:
image_captions['102455176_5f8ead62d5.jpg']

['startseq a man uses ice picks and crampons to scale ice endseq',
 'startseq an ice climber in a blue jacket and black pants is scaling a frozen ice wall endseq',
 'startseq an ice climber scaling a frozen waterfall endseq',
 'startseq a person in blue and red ice climbing with two picks endseq',
 'startseq climber climbing an ice wall endseq']

In [200]:
training_captions = {}
testing_captions = {}
for file_name in image_data_training.file_paths:
    file_path = file_name.split("/")[-1]
    if file_path in image_captions:
        training_captions[file_path] = image_captions[file_path]
    else:
        print("Warning: a file not found", file_path)
for file_name in image_data_testing.file_paths:
    file_path = file_name.split("/")[-1]
    if file_path in image_captions:
        testing_captions[file_path] = image_captions[file_path] 
    else:
        print("Warning: a file not found", file_path)

In [201]:
training_captions

{'1000268201_693b08cb0e.jpg': ['startseq a child in a pink dress is climbing up a set of stairs in an entry way endseq',
  'startseq a girl going into a wooden building endseq',
  'startseq a little girl climbing into a wooden playhouse endseq',
  'startseq a little girl climbing the stairs to her playhouse endseq',
  'startseq a little girl in a pink dress going into a wooden cabin endseq'],
 '1001773457_577c3a7d70.jpg': ['startseq a black dog and a spotted dog are fighting endseq',
  'startseq a black dog and a tricolored dog playing with each other on the road endseq',
  'startseq a black dog and a white dog with brown spots are staring at each other in the street endseq',
  'startseq two dogs of different breeds looking at each other on the road endseq',
  'startseq two dogs on pavement moving toward each other endseq'],
 '1002674143_1b742ab4b8.jpg': ['startseq a little girl covered in paint sits in front of a painted rainbow with her hands in a bowl endseq',
  'startseq a little g

In [202]:
def tokenize_captions(caption_dict, tokenizer, max_len):
    result_dict = {}
    for key in caption_dict.keys():
        result_dict[key] = tokenizer.texts_to_sequences(caption_dict[key])
    return result_dict

In [203]:
training_seq_caption = tokenize_captions(training_captions, caption_tokenizer, max_words_length)
testing_seq_caption = tokenize_captions(testing_captions, caption_tokenizer, max_words_length)

In [204]:
training_seq_caption

{'1000268201_693b08cb0e.jpg': [[2,
   1,
   42,
   4,
   1,
   90,
   170,
   7,
   119,
   53,
   1,
   395,
   12,
   392,
   4,
   28,
   5225,
   693,
   3],
  [2, 1, 19, 314, 64, 1, 194, 117, 3],
  [2, 1, 40, 19, 119, 64, 1, 194, 2431, 3],
  [2, 1, 40, 19, 119, 5, 392, 20, 60, 2431, 3],
  [2, 1, 40, 19, 4, 1, 90, 170, 314, 64, 1, 194, 2994, 3]],
 '1001773457_577c3a7d70.jpg': [[2, 1, 15, 9, 8, 1, 852, 9, 17, 343, 3],
  [2, 1, 15, 9, 8, 1, 1562, 9, 34, 10, 137, 82, 6, 5, 155, 3],
  [2, 1, 15, 9, 8, 1, 14, 9, 10, 27, 997, 17, 637, 22, 137, 82, 4, 5, 72, 3],
  [2, 13, 31, 12, 738, 2648, 88, 22, 137, 82, 6, 5, 155, 3],
  [2, 13, 31, 6, 720, 796, 319, 137, 82, 3]],
 '1002674143_1b742ab4b8.jpg': [[2,
   1,
   40,
   19,
   186,
   4,
   607,
   106,
   4,
   47,
   12,
   1,
   596,
   1184,
   10,
   60,
   219,
   4,
   1,
   969,
   3],
  [2, 1, 40, 19, 7, 49, 4, 47, 12, 1, 55, 596, 1184, 3],
  [2,
   1,
   52,
   19,
   4,
   5,
   41,
   114,
   10,
   3441,
   4,
   47,
   12,
   1

In [205]:
def get_dict_cap(dictionary, batch_size, max_len, file_path='D:\image_captioning\Images/'):
    n = 0
    X1,X2,Y = list(), list(), list()
    while True:
        for key in dictionary:
            seq_data = dictionary[key]
            img_array = preprocess_input(img_to_array(load_img(file_path+key, target_size=(224,224))))
            for seq in seq_data:
                for i in range(1,len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_len)[0]
                    try:
                        out_seq = to_categorical([out_seq], num_classes=vocab_size+1)[0]
                    except:
                        print(out_seq)
                        print(seq)
                    X1.append(in_seq)
                    X2.append(img_array)
                    Y.append(out_seq)
            n+=1
            if n>=batch_size:
                yield X1,X2,Y
                n=0
                X1.clear()
                X2.clear()
                Y.clear()

In [206]:
batch_size = 32
total_batches_train = len(image_data_training.file_paths)//batch_size
total_batches_test = len(image_data_testing.file_paths)//batch_size
train_caption_gen = get_dict_cap(training_seq_caption, batch_size, max_words_length)
test_caption_gen = get_dict_cap(testing_seq_caption, batch_size, max_words_length)

In [207]:
total_batches_train

202

In [208]:
## MOdel
## LSTM Model
lstm_input  = Input(shape=(max_words_length,))
lstm_e1 = Embedding(vocab_size+1, 50, mask_zero=True)(lstm_input)
lstm_rnn1 = LSTM(256, return_sequences=True)(lstm_e1)
lstm_d1 = Dropout(0.2)(lstm_rnn1)
lstm_rnn2 = LSTM(100)(lstm_d1)

In [209]:
vgg_model.layers[-2].output

<KerasTensor: shape=(None, 4096) dtype=float32 (created by layer 'fc2')>

In [210]:
vgg_input = Sequential()
#vgg_input = Sequential()
for layer in vgg_model.layers[:-1]:
    vgg_input.add(layer)

In [211]:
vgg_input.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 56, 56, 256)      

In [212]:
dense1 = Dense(100, activation='relu')(vgg_model.layers[-2].output)

In [213]:
concat = add([lstm_rnn2, dense1])
dense2 = Dense(50, activation='relu')(concat)
outputs = Dense(vocab_size+1, activation='softmax')(dense2)

In [214]:
caption_model = Model(inputs=[vgg_model.inputs, lstm_input], outputs=outputs)

In [215]:
caption_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 block1_conv1 (Conv2D)          (None, 224, 224, 64  1792        ['input_5[0][0]']                
                                )                                                                 
                                                                                                  
 block1_conv2 (Conv2D)          (None, 224, 224, 64  36928       ['block1_conv1[0][0]']           
                                )                                                           

In [216]:
vgg_input.trainable = False

In [217]:
caption_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 block1_conv1 (Conv2D)          (None, 224, 224, 64  1792        ['input_5[0][0]']                
                                )                                                                 
                                                                                                  
 block1_conv2 (Conv2D)          (None, 224, 224, 64  36928       ['block1_conv1[0][0]']           
                                )                                                           

In [218]:
#plot_model(caption_model, show_shapes=True)

In [219]:
caption_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
epochs = 50

for i in range(epochs):
    # create data generator
    X1, X2, Y = next(train_caption_gen)
    X1, X2, Y = np.array(X1), np.array(X2), np.array(Y)
    caption_model.fit([X2,X1],Y, epochs=i+1, verbose=1, workers=4, initial_epoch=i-1)



In [1]:
caption_model.save("D:/Models/Image_Caption.h5")

NameError: name 'caption_model' is not defined

In [None]:
def predict_caption(model, image, tokenizer, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], max_length)
        #print(image.shape)
        #print(sequence)
        yhat = model.predict([np.expand_dims(image,axis=0), sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word[yhat]
        if word is None:
            break
        in_text += " " + word
        if word == 'endseq':
            break
    return in_text.replace("startseq",'').replace("endseq",'')

In [None]:
from matplotlib import pyplot as plt
def generate_caption(image_name):
    image_original = img_to_array(load_img(image_name, target_size=(224,224)))
    image = preprocess_input(image_original)
    captions = image_captions[image_name.split("/")[-1]]
    print('******Actual*******')
    for caption in captions:
        print(caption)
    # predict the caption
    y_pred = predict_caption(caption_model, image, caption_tokenizer, max_words_length)
    print('*******Predicted***********')
    print(y_pred)
    #print(image_original)
    plt.imshow(plt.imread(image_name))

In [None]:
generate_caption(image_data_training.file_paths[8])

In [None]:
generate_caption(image_data_testing.file_paths[4])

In [None]:
li = [5,5,8,6,6,7]
import numpy as np
np.argmax(li)