In [91]:
import numpy as np
import os
import pickle
from tqdm.notebook import tqdm
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding,Dropout,add

In [92]:
BASE_DIR = "archive"
WORKING_DIR = "Data"

# Image Features

In [93]:
model_img = VGG16()
model_img = Model(inputs = model_img.inputs,outputs = model_img.layers[-2].output)


In [94]:
 print(model_img.summary())

None


In [95]:
# features = {}
# directory = os.path.join(BASE_DIR, "Images")

# for img_name in tqdm(os.listdir(directory)):
#     img_path = os.path.join(directory, img_name)
    
#     # Resize the image to 224x224 as required by the model
#     img = load_img(img_path, target_size=(224, 224))
    
#     # Convert the image to an array
#     img = img_to_array(img)
    
#     # Reshape the image to (1, 224, 224, 3) for batch processing
#     img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
    
#     # Preprocess the image for the model
#     img = preprocess_input(img)
    
#     # Get the feature prediction from the model
#     feature = model_img.predict(img, verbose=0)
    
#     # Extract the image ID from the file name
#     image_id = img_name.split('.')[0]
    
#     # Store the feature in the dictionary
#     features[image_id] = feature


In [124]:
# dumping data
# pickle.dump(features,open(os.path.join(WORKING_DIR,'features_pk1'),'wb'))
with open(os.path.join(WORKING_DIR,'features_pk1'),'rb') as f:
    features = pickle.load(f)

Load the pickled data, no need to run upper parts again


In [125]:
with open("archive/captions.txt",'r') as f:
    next(f)
    caption_doc = f.read()

In [126]:
# mapping ={}
# for line in tqdm(caption_doc.split('\n')):
#     if(len(line)<2):
#         continue
#     tokens = line.split(",")
#     img_id = tokens[0].split(".")[0]
#     caption = " ".join(tokens[1:])
#     if img_id not in mapping:
#         mapping[img_id] = []
#     mapping[img_id].append(caption)

In [127]:
# dumping caption data
# pickle.dump(mapping,open(os.path.join(WORKING_DIR,'captions_pk1'),'wb'))
with open(os.path.join(WORKING_DIR,'captions_pk1'),'rb') as f:
    mapping = pickle.load(f)

In [128]:
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            caption = captions[i]
            caption = caption.lower()
            # delete digits, special chars, etc., 
            caption = caption.replace('[^A-Za-z]', '')
            # delete additional spaces
            caption = caption.replace('\s+', ' ')
#             " ".join([word for word in caption.split() if len(word)>1])
            caption = 'startseq ' + caption + ' endseq'
            captions[i] = caption

In [129]:
clean(mapping)

In [130]:
all_captions = []
for key in mapping:
    for value in mapping[key]:
        all_captions.append(value)

In [131]:
all_captions[:10]

['startseq a child in a pink dress is climbing up a set of stairs in an entry way . endseq',
 'startseq a girl going into a wooden building . endseq',
 'startseq a little girl climbing into a wooden playhouse . endseq',
 'startseq a little girl climbing the stairs to her playhouse . endseq',
 'startseq a little girl in a pink dress going into a wooden cabin . endseq',
 'startseq a black dog and a spotted dog are fighting endseq',
 'startseq a black dog and a tri-colored dog playing with each other on the road . endseq',
 'startseq a black dog and a white dog with brown spots are staring at each other in the street . endseq',
 'startseq two dogs of different breeds looking at each other on the road . endseq',
 'startseq two dogs on pavement moving toward each other . endseq']

### Tokenizing


In [132]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index)+1

In [142]:
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [133]:
vocab_size

8496

In [134]:
max_length = max(len(caption.split()) for caption in all_captions)
max_length


39

# Train Test Split

In [135]:
ids = list(mapping.keys())
split = int(len(ids)*0.9)
train = ids[:split]
test = ids[split:]

In [136]:
# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    # loop over images
    X1, X2, y = list(), list(), list()
    n = 0
    while 1:
        for key in data_keys:
            n += 1
            captions = mapping[key]
            # process each caption
            for caption in captions:
                # encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pairs
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length, padding='post')[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store the sequences
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                yield {"image": X1, "text": X2}, y
                X1, X2, y = list(), list(), list()
                n = 0

In [139]:
inputs1 = Input(shape=(4096,), name="image")
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layers
inputs2 = Input(shape=(max_length,), name="text")
se1 = Embedding(vocab_size, 256)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# plot the model
plot_model(model, show_shapes=True,show_layer_names=False)

You must install pydot (`pip install pydot`) for `plot_model` to work.


In [140]:
epochs = 50
batch_size = 64
steps = len(train) // batch_size

for i in range(epochs):
    # create data generator
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    # fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

KeyboardInterrupt: 

In [144]:
pickle.dump(model,open("best.pkl","wb"))

# Genrating Captions

In [115]:
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [116]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    # add start tag for generation process
    in_text = 'startseq'
    # iterate over the max length of sequence
    for i in range(max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence], max_length, padding='post')
        # predict next word
        yhat = model.predict([image, sequence], verbose=0)
        # get index with high probability
        yhat = np.argmax(yhat)
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        # stop if word not found
        if word is None:
            break
        # append word as input for generating next word
        in_text += " " + word
        # stop if we reach end tag
        if word == 'endseq':
            break
    return in_text

In [117]:
from nltk.translate.bleu_score import corpus_bleu
# validate with test data
actual, predicted = list(), list()

for key in test[:10]:
    # get actual caption
    captions = mapping[key]
    # predict the caption for image
    y_pred = predict_caption(model, features[key], tokenizer, max_length)
    # split into words
    actual_captions = [caption.split() for caption in captions]
    y_pred = y_pred.split()
    # append to the list
    actual.append(actual_captions)
    predicted.append(y_pred)
    # calcuate BLEU score
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))



BLEU-1: 0.050000
BLEU-2: 0.019612


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [118]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(img_path):

    image_name = img_path.split("/")[-1]
    img_path = os.path.join(BASE_DIR, "Images", image_name)
    image = Image.open(img_path)
    captions = mapping[image_name.split('.')[0]]
    print('---------------------Actual---------------------')
    for caption in captions:
        print(caption)
    y_pred = predict_caption(model, features[image_name.split('.')[0]], tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(y_pred)
    # plt.imshow(img_path)

In [119]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_new_caption(img_path):
    
    img = load_img(img_path, target_size=(224, 224))
    
    img = img_to_array(img)
    
    img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
    
    # Preprocess the image for the model
    img = preprocess_input(img)
    
    # Get the feature prediction from the model
    feature = model_img.predict(img, verbose=0)
    
    y_pred = predict_caption(model, feature, tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(y_pred)

In [120]:
generate_new_caption("archive/Images/19212715_20476497a3.jpg")



--------------------Predicted--------------------
startseq in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in
