In [2]:
import os
import tqdm
import pickle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import tensorflow as tf
import keras

from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Dense
from tensorflow.keras.layers import AvgPool2D, GlobalAveragePooling2D, MaxPool2D
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras.applications import InceptionV3, ResNet50, DenseNet121
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [39]:
IMAGE_DIR = 'dataset/flickr_8k/images/'
TRAIN_DIR = 'dataset/flickr_8k/train.txt'
VAL_DIR = 'dataset/flickr_8k/val.txt'
TEST_DIR = 'dataset/flickr_8k/test.txt'
LABEL_DIR = 'dataset/flickr_8k/token.txt'
DESCRIPTION_DIR = 'dataset/flickr_8k/description.txt'

VOCAB_SIZE = 1000
MAX_LENGTH = 10

In [40]:
train_imgs = []
train_captions = []
val_imgs = []
val_captions = []
test_imgs = []
test_captions = []
with open(TRAIN_DIR, 'r') as f:
    for line in f.read().splitlines():
        img, caption = line.split('\t')
        train_imgs.append(IMAGE_DIR + img)
        train_captions.append(caption.lower())
        
with open(VAL_DIR, 'r') as f:
    for line in f.read().splitlines():
        img, caption = line.split('\t')
        val_imgs.append(IMAGE_DIR + img)
        val_captions.append(caption.lower())

with open(TEST_DIR, 'r') as f:
    for line in f.read().splitlines():
        img, caption = line.split('\t')
        test_imgs.append(IMAGE_DIR + img)
        test_captions.append(caption.lower())
        
len(train_imgs), len(train_captions), len(val_imgs), len(val_captions), len(test_imgs), len(test_captions)

(30000, 30000, 5000, 5000, 5000, 5000)

In [41]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_imgs, train_captions))
val_dataset = tf.data.Dataset.from_tensor_slices((train_imgs, train_captions))
test_dataset = tf.data.Dataset.from_tensor_slices((train_imgs, train_captions))

In [42]:
def fit_vectorizer(train_sentences):
    vectorizer = tf.keras.layers.TextVectorization( 
        standardize=None,
        max_tokens=VOCAB_SIZE,
        output_sequence_length=MAX_LENGTH, 
        ragged=False
    ) 
    
    vectorizer.adapt(train_sentences)

    return vectorizer

caption_only = train_dataset.map(lambda img, caption: caption)
vectorizer = fit_vectorizer(caption_only)
vectorizer(train_captions[0])

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([  2,  15,   9,   7,  33, 256,   2,  14,   9,   4], dtype=int64)>

In [43]:
def preprocess_dataset(dataset, is_training=True):
    dataset = dataset.map(lambda img, caption: (img, vectorizer(caption)))

    if is_training:
        dataset = dataset.shuffle(buffer_size=1000)
    
    dataset = dataset.batch(32).prefetch(tf.data.AUTOTUNE)
    
    return dataset

train_dataset_final = preprocess_dataset(train_dataset, is_training=True)
val_dataset_final = preprocess_dataset(train_dataset, is_training=True)
test_dataset_final = preprocess_dataset(train_dataset, is_training=True)

In [44]:
# Define CNN base model (InceptionV3)
# cnn_base_model = InceptionV3(include_top=True, weights='imagenet')
# cnn_model = Model(inputs=cnn_base_model.input, outputs=cnn_base_model.layers[-2].output)

cnn_base_model = DenseNet121(include_top=True, weights = 'imagenet')
cnn_model = Model(inputs=cnn_base_model.input, outputs=cnn_base_model.layers[-2].output)

# Image feature input
image_features_input = Input(shape=(1, 1024))  # Adjust the shape based on CNN output
image_features = Dense(512, activation='relu')(image_features_input)
image_features_reshaped = Reshape((1, 512), input_shape=(512,))(image_features)

def extract_image_features(image_path):
    img = load_img(image_path, target_size=(224, 224, 3))  # Resize for InceptionV3
    img = img_to_array(img)
    img = tf.expand_dims(img, axis=0)  # Add batch dimension
    img = tf.keras.applications.densenet.preprocess_input(img)  # Preprocessing
    features = cnn_model.predict(img, verbose=0)
#     img = img_to_array(img)
#     img = img/255.
#     img = np.expand_dims(img,axis=0)
    feature = cnn_model.predict(img, verbose=0)
    feature = tf.reshape(feature, (1, -1))  # Flatten to (1, -1)
    return feature

In [45]:
# Caption input and embedding layer
caption_input = Input(shape=(MAX_LENGTH,))
caption_embedding = Embedding(VOCAB_SIZE, 256)(caption_input)
caption_lstm = LSTM(512, return_sequences=True)(caption_embedding)

# Combine CNN and LSTM outputs
combined = add([image_features_reshaped, caption_lstm])
output = LSTM(512)(combined)
output = Dense(VOCAB_SIZE, activation='softmax')(output)

# Compile the model
model = Model(inputs=[image_features_input, caption_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [46]:
def data_generator(images, captions, batch_size=32):
    num_samples = len(images)
    while True:
        for i in range(0, num_samples, batch_size):
            batch_images, batch_captions, output_captions = [], [], []
            for j in range(batch_size):
                if i+j >= num_samples:
                    break
                image, caption = images[i+j], captions[i+j]
                features = FEATURES[image]
                seq = vectorizer(caption)
                batch_images.extend([features for s in range(1, len(seq))])
                for k in range(1, len(seq)):
                    in_seq, out_seq = seq[:k], seq[k]
                    in_seq = pad_sequences([in_seq], maxlen=MAX_LENGTH)[0]
                    out_seq = tf.keras.utils.to_categorical([out_seq], num_classes=vectorizer.vocabulary_size())[0]
                    
                    batch_captions.append(in_seq)
                    output_captions.append(out_seq)
            batch_captions, output_captions = tf.convert_to_tensor(np.array(batch_captions)), tf.convert_to_tensor(np.array(output_captions))
            batch_images = tf.convert_to_tensor(np.array(batch_images))
            
            yield (batch_images, batch_captions), output_captions  # Input is a tuple, output is the target caption

In [20]:
with open('features_densenet121.pkl', 'rb') as f:
    FEATURES = pickle.load(f)
    print('Image features have been loaded')

Image features have been loaded


In [47]:
# Train model
BATCH_SIZE = 128
train_generator = data_generator(train_imgs, train_captions, batch_size=BATCH_SIZE)
val_generator = data_generator(val_imgs, val_captions, batch_size=BATCH_SIZE)
model.fit(train_generator, batch_size=BATCH_SIZE, steps_per_epoch=len(train_imgs) // BATCH_SIZE, 
          epochs=2, validation_data=val_generator, validation_steps=len(val_imgs) // BATCH_SIZE, 
          verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x19433502890>

In [48]:
BATCH_SIZE = 32
test_generator = data_generator(test_imgs, test_captions, batch_size=BATCH_SIZE)
loss = model.evaluate(test_generator, steps=len(test_imgs) // BATCH_SIZE, verbose=1)
print(f"Test Loss: {loss}")

Test Loss: 2.993499994277954


In [None]:
def predict_caption(img):
    
#     in_text = "startseq"
    feat = extract_image_features(img)
    for i in range(MAX_LENGTH):
        sequence = [word_to_idx[w] for w in in_text.split() if w in word_to_idx]
        sequence = pad_sequences([sequence],maxlen=MAX_LENGTH,padding='post')
        final_caption = set()
        ypred = model.predict([photo, sequence])
        ypred = ypred.argmax() 
        word = idx_to_word[ypred]
        in_text = ' ' + word
        final_caption.add(in_text)
        
        print(in_text)
        
#         if word == "endseq":
#             break
    
#         final_caption = in_text.split()[1:-1]
        st = ' '.join(final_caption)
        final_caption.clear()
    
    return st

In [75]:
def generate_caption(model, img, vectorizer):
    feat = extract_image_features(img)
    
    caption = '<start>'
    for i in range(MAX_LENGTH):
        seq = vectorizer([caption])[0]
        seq = pad_sequences([seq], maxlen=MAX_LENGTH)
        
        y_pred = cnn_model.predict([image_features, seq], verbose=0)
        y_pred = np.argmax(y_pred)  # Get the index of the highest probability word
        
        # Map index to word
        word = vectorizer.get_vocabulary()[y_pred]
        caption.append(word)

    return ' '.join(caption)  # Skip <start> and <end> tokens

In [91]:
model.input_shape

[(None, 1, 1024), (None, 10)]

In [161]:
test_captions[100]

'a hiker ascends a snowy hill .'

In [160]:
feat = extract_image_features(test_imgs[100])
feat = tf.convert_to_tensor(feat)
feat =  tf.expand_dims(feat, axis=0)
caption = '<UNK>'
final_caption = set()
for i in range(MAX_LENGTH):
    seq = vectorizer([caption])[0]
    seq = pad_sequences([seq], maxlen=MAX_LENGTH)[0]
    in_seq = tf.convert_to_tensor(seq)
    in_seq = tf.expand_dims(in_seq, axis=0)
    y_pred = model.predict([feat, in_seq])
    y_pred = np.argmax(y_pred)  # Get the index of the highest probability word

    word = vectorizer.get_vocabulary()[y_pred]
    caption = caption + ' ' + word
    final_caption.add(word)

caption



'<UNK> man         '

In [None]:
vocabulary = vectorizer.get_vocabulary()
vocabulary[1:5]

In [59]:
vectorizer('UNK a . in')

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([1, 2, 3, 4, 0, 0, 0, 0, 0, 0], dtype=int64)>

In [81]:
vectorizer(['<start>'])

<tf.Tensor: shape=(1, 10), dtype=int64, numpy=array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)>

In [None]:
vectorizer.

In [162]:
model.save('model_densenet121.h5')