In [2]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import img_to_array, array_to_img
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm

In [None]:
# Load only the first 1% of each split so the demo downloads and iterates quickly.
# NOTE(review): the TFDS catalog appears to list the coco_captions splits as
# train / val / test / restval — confirm that "validation" resolves for this dataset.
TRAIN_SPLIT = "train[:1%]"
VALIDATION_SPLIT = "validation[:1%]"

dataset, info = tfds.load(
    "coco_captions",
    split=[TRAIN_SPLIT, VALIDATION_SPLIT],
    with_info=True,
)
train_dataset, test_dataset = dataset

# Report the size of the slices that were actually loaded; indexing info.splits
# with the bare split name would print the size of the full split instead.
print("Train size:", info.splits[TRAIN_SPLIT].num_examples)
print("Validation size:", info.splits[VALIDATION_SPLIT].num_examples)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/coco_captions/2014/1.1.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

In [None]:
# Flatten the first 500 examples into two parallel lists: one entry per caption,
# with the example's image repeated next to each of its captions.
train_captions, train_images = [], []

for record in tfds.as_numpy(train_dataset.take(500)):  # limit to 500 for demo
    image = record["image"]
    for raw_caption in record["captions"]["text"]:
        # Captions arrive as bytes; wrap the decoded text in start/end sentinel tokens.
        decoded = raw_caption.decode("utf-8")
        train_captions.append("startseq " + decoded + " endseq")
        train_images.append(image)

In [None]:
# Build the vocabulary from the training captions.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_captions)
# +1 because word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1

# Measure caption length in tokenizer tokens rather than whitespace-separated words.
# The tokenizer replaces punctuation with spaces, so its token count can differ
# from len(caption.split()); a too-small max_length would make pad_sequences
# silently truncate the longest caption prefixes.
max_length = max(len(seq) for seq in tokenizer.texts_to_sequences(train_captions))

print("Vocab size:", vocab_size)
print("Max caption length:", max_length)

In [None]:
# Load the ImageNet-pretrained VGG16 and re-wire it so that the output is the
# second-to-last layer's activations instead of the final layer's.
vgg16_full = VGG16(weights="imagenet")
cnn_model = Model(
    inputs=vgg16_full.inputs,
    outputs=vgg16_full.layers[-2].output,
)

def preprocess_image(img):
    """Turn a single image into a one-element batch ready for ``cnn_model``.

    The image is resized to 224x224, cast to float32, given a leading batch
    axis, and passed through the VGG16 ``preprocess_input`` function.
    """
    resized = tf.cast(tf.image.resize(img, (224, 224)), tf.float32)
    batch = np.expand_dims(resized, axis=0)  # add the leading batch axis
    return preprocess_input(batch)

def extract_features(images):
    """Run each image through ``cnn_model`` and stack the resulting feature vectors.

    Args:
        images: Iterable of images accepted by ``preprocess_image``. The same
            image object may appear several times in a row (for example once
            per caption of that image).

    Returns:
        ``np.ndarray`` with one feature row per entry of ``images``, in order.
    """
    features = []
    last_img, last_feat = None, None
    for img in tqdm(images):
        # When consecutive entries are the very same object, reuse the feature
        # vector just computed instead of running the CNN again on identical input.
        if last_feat is None or img is not last_img:
            arr = preprocess_image(img)
            last_feat = cnn_model.predict(arr, verbose=0)[0]
            last_img = img
        features.append(last_feat)
    return np.array(features)

# Encode every entry of train_images and report the shape of the stacked features.
image_features = extract_features(train_images)
print("Image features shape:", image_features.shape)

In [None]:
# Expand every caption into (image feature, padded prefix) -> next-token samples.
sequences_X1, sequences_X2, sequences_y = [], [], []

for caption_idx, caption_text in enumerate(train_captions):
    token_ids = tokenizer.texts_to_sequences([caption_text])[0]
    # One sample per position: the tokens before `cut` are the input,
    # the token at `cut` is the target.
    for cut in range(1, len(token_ids)):
        padded_prefix = pad_sequences([token_ids[:cut]], maxlen=max_length)[0]
        sequences_X1.append(image_features[caption_idx])
        sequences_X2.append(padded_prefix)
        sequences_y.append(token_ids[cut])

sequences_X1 = np.array(sequences_X1)
sequences_X2 = np.array(sequences_X2)
# One-hot encode the target token ids over the vocabulary.
sequences_y = tf.keras.utils.to_categorical(sequences_y, num_classes=vocab_size)

print("Training samples:", sequences_X1.shape, sequences_X2.shape, sequences_y.shape)


In [None]:
# Image branch: 4096-d feature vector -> dropout -> 256-d dense projection.
image_input = Input(shape=(4096,))
image_dropped = Dropout(0.5)(image_input)
image_encoding = Dense(256, activation="relu")(image_dropped)

# Text branch: padded token ids -> masked embedding -> dropout -> LSTM state.
caption_input = Input(shape=(max_length,))
caption_embedded = Embedding(vocab_size, 256, mask_zero=True)(caption_input)
caption_dropped = Dropout(0.5)(caption_embedded)
caption_encoding = LSTM(256)(caption_dropped)

# Decoder: sum the two 256-d encodings, then predict a distribution over the vocabulary.
merged = tf.keras.layers.Add()([image_encoding, caption_encoding])
hidden = Dense(256, activation="relu")(merged)
next_word_probs = Dense(vocab_size, activation="softmax")(hidden)

caption_model = Model(
    inputs=[image_input, caption_input],
    outputs=next_word_probs,
)
caption_model.compile(optimizer="adam", loss="categorical_crossentropy")

caption_model.summary()

In [None]:
# Train on the (image feature, caption prefix) -> next-word samples.
caption_model.fit(
    x=[sequences_X1, sequences_X2],
    y=sequences_y,
    batch_size=64,
    epochs=2,
)


In [None]:
def generate_caption(model, photo, tokenizer, max_length):
    """Greedily decode a caption for one image, one word per step.

    Args:
        model: Object whose ``predict([photo, sequence], verbose=0)`` returns
            scores over the tokenizer's word indices.
        photo: Image feature input, passed to the model unchanged on every step.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Length the token sequence is padded to; also the maximum
            number of words that will be generated.

    Returns:
        The generated text. It always begins with ``"startseq"`` and ends with
        ``"endseq"`` if the model produced that token before the step limit.
    """
    # Invert the vocabulary once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = "startseq"
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        scores = model.predict([photo, sequence], verbose=0)
        predicted_index = int(np.argmax(scores))
        word = index_to_word.get(predicted_index)
        if word is None:
            # No word maps to this index (e.g. the padding index 0): stop decoding.
            break
        in_text += " " + word
        if word == "endseq":
            break
    return in_text

In [None]:
# Caption one held-out example. Reading through tfds.as_numpy (as the training
# loop does) yields NumPy arrays and bytes; iterating the tf.data.Dataset directly
# would yield tensors, which have no .decode() method.
example = next(iter(tfds.as_numpy(test_dataset.take(1))))
img = example["image"]
arr = preprocess_image(img)
feat = cnn_model.predict(arr, verbose=0)
caption = generate_caption(caption_model, feat, tokenizer, max_length)

# The raw output still carries the startseq/endseq sentinel tokens; drop them for display.
display_caption = " ".join(
    word for word in caption.split() if word not in ("startseq", "endseq")
)

print("Generated Caption:", display_caption)
print("Original Captions:", [c.decode("utf-8") for c in example["captions"]["text"]])

fig, ax = plt.subplots()
ax.imshow(array_to_img(img))
ax.set_title(display_caption)
ax.axis("off")
plt.show()