In [1]:
import numpy as np


In [2]:
# Load the precomputed artifacts from the Data/ directory.
# NOTE(security): allow_pickle=True unpickles arbitrary Python objects and can
# run code embedded in the file -- only load .npy files from a trusted source.
# `.item()` unwraps the 0-d object array back into the stored Python object.
features = np.load("Data/image_features.npy", allow_pickle=True).item()
captions = np.load("Data/captions_sequences.npy")
word_to_index = np.load("Data/word_to_index.npy", allow_pickle=True).item()
index_to_word = np.load("Data/index_to_word.npy", allow_pickle=True).item()

# Size the vocabulary from the largest index rather than the number of entries,
# so the Embedding and softmax layers stay in range even if the indices are not
# a contiguous 0..N-1 run. `default=-1` keeps the result 0 for an empty mapping.
VOCAB_SIZE = max(word_to_index.values(), default=-1) + 1
# Fixed length that every input token sequence is padded to.
MAX_LENGTH = 38

In [7]:
import csv
import re

def clean_caption(caption):
    """Normalise a raw caption and wrap it in sequence markers.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, runs of whitespace are collapsed
    to a single space, and leading/trailing whitespace is trimmed.

    Returns the cleaned text surrounded by ``<start>`` and ``<end>``,
    each separated from the text by one space.
    """
    lowered = caption.lower()
    # Drop punctuation and other symbols; letters, digits, "_" and whitespace stay.
    without_symbols = re.sub(r'[^\w\s]', '', lowered)
    # Squash any whitespace run into one space, then trim both ends.
    body = re.sub(r'\s+', ' ', without_symbols).strip()
    return "<start> " + body + " <end>"

def load_captions(filename):
    """Read an ``image,caption`` CSV file into a dict of cleaned captions.

    Parameters
    ----------
    filename : str or path-like
        Path to a CSV file whose first row is a header and whose remaining
        rows hold exactly two fields: an image id and a caption.

    Returns
    -------
    dict
        Maps each image id to the list of its captions, in file order, each
        one passed through ``clean_caption``. Rows that do not have exactly
        two fields are skipped. An empty file yields an empty dict.
    """
    captions_dict = {}
    # newline='' lets the csv module do its own newline handling (needed for
    # quoted fields that contain line breaks); the explicit encoding keeps the
    # result independent of the platform's default locale.
    with open(filename, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            if len(row) != 2:
                continue
            image_id, caption = row
            captions_dict.setdefault(image_id, []).append(clean_caption(caption))
    return captions_dict

# Parse the raw caption file into {image_id: [cleaned caption, ...]}.
captions_dict = load_captions("Data/captions.txt")


In [8]:
# Persist the caption mapping. np.save pickles non-array objects such as this
# dict, so reading it back needs np.load(..., allow_pickle=True).item().
np.save("Data/captions_dict.npy", captions_dict)
print("✅ captions_dict saved successfully.")

✅ captions_dict saved successfully.


In [9]:
# Build next-word training samples: every caption prefix (right-padded with 0
# up to MAX_LENGTH) is paired with its image feature and the token that follows.
X1, X2, y = [], [], []

# Resolve the out-of-vocabulary index once instead of once per token.
unk_index = word_to_index['<unk>']

# The loop variable is `image_captions`, not `captions`, so it does not
# overwrite the `captions` array loaded from Data/captions_sequences.npy.
for image_name, image_captions in captions_dict.items():
    feature = features[image_name]

    for caption in image_captions:
        tokens = caption.split()
        seq = [word_to_index.get(word, unk_index) for word in tokens]

        # A prefix longer than MAX_LENGTH cannot be padded (np.pad rejects a
        # negative width), so stop once the prefix would exceed the input size.
        for i in range(1, min(len(seq), MAX_LENGTH + 1)):
            in_seq = seq[:i]
            out_word = seq[i]
            in_seq_padded = np.pad(in_seq, (0, MAX_LENGTH - i), mode='constant')

            X1.append(feature)
            X2.append(in_seq_padded)
            y.append(out_word)


In [10]:
import numpy as np

# Turn the accumulated Python lists into NumPy arrays, one at a time.
X1 = np.array(X1)
X2 = np.array(X2)
y = np.array(y)

# Report the shape of each training array.
for label, array in (
    ("Image features shape: ", X1),
    ("Input sequences shape:", X2),
    ("Target words shape:   ", y),
):
    print(label, array.shape)


Image features shape:  (476960, 2048)
Input sequences shape: (476960, 38)
Target words shape:    (476960,)


In [11]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add


In [12]:
# Layer widths for the captioning model. The image branch ends in
# Dense(EMBED_DIM) and the text branch in LSTM(LSTM_UNITS); the two outputs are
# summed element-wise, so both constants must have the same value.
EMBED_DIM = 256    # Dimension of embedding
LSTM_UNITS = 256   # LSTM hidden units


In [19]:
# Sanity check: the largest token index must be smaller than VOCAB_SIZE,
# otherwise the Embedding and softmax layers would be too small.
print("VOCAB_SIZE:", VOCAB_SIZE)
print("Max index in word_to_index:", max(word_to_index.values()))


VOCAB_SIZE: 2997
Max index in word_to_index: 2996


In [18]:
# Derive the vocabulary size from the largest token index; the +1 turns that
# index into a count (assumes indices start at 0 -- TODO confirm).
VOCAB_SIZE = max(word_to_index.values()) + 1  # ✅ 2996 + 1 = 2997


In [20]:
# Feature extractor (image input)
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(EMBED_DIM, activation='relu')(fe1)

# Sequence processor (caption input)
inputs2 = Input(shape=(MAX_LENGTH,))
se1 = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(LSTM_UNITS)(se2)

# Decoder (merge image + caption paths)
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(VOCAB_SIZE, activation='softmax')(decoder2)

# Final model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.summary()


In [21]:
# Training hyper-parameters.
EPOCHS = 20
BATCH_SIZE = 256

# Fit on all samples. No validation data or callbacks are passed, so only the
# training loss is reported for each epoch.
history = model.fit([X1, X2], y, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)


Epoch 1/20




[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m616s[0m 329ms/step - loss: 4.1488
Epoch 2/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m577s[0m 310ms/step - loss: 2.9315
Epoch 3/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m466s[0m 250ms/step - loss: 2.6848
Epoch 4/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m479s[0m 257ms/step - loss: 2.5418
Epoch 5/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m473s[0m 254ms/step - loss: 2.4448
Epoch 6/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m483s[0m 259ms/step - loss: 2.3600
Epoch 7/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m448s[0m 241ms/step - loss: 2.3011
Epoch 8/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 224ms/step - loss: 2.2463
Epoch 9/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m464s[0m 249ms/step - loss: 2.2073
Epoch 10/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━

In [22]:
# Save the trained model as a single HDF5 file (format chosen by the .h5 extension).
model.save("image_captioning_model.h5")




In [24]:
# Save in the native Keras format, which is inferred from the ".keras"
# extension. The deprecated save_format argument is not passed because
# "tf" (SavedModel) contradicts a ".keras" file name.
model.save("image_captioning_model_tf.keras")


