# In [8]:
import os
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer

# Step 1: Load Preprocessed Data
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted preprocessing run.
with open('pkl/cleaned_captions.pkl', 'rb') as captions_file:
    cleaned_captions = pickle.load(captions_file)

with open('pkl/image_features.pkl', 'rb') as features_file:
    image_features = pickle.load(features_file)

# Step 2: Build Tokenizer & Save
# Flatten every caption of every image into one list, wrapping each caption
# in the 'startseq' / 'endseq' boundary markers.
all_captions = [
    'startseq ' + text + ' endseq'
    for caption_group in cleaned_captions.values()
    for text in caption_group
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# Save tokenizer
with open('pkl/tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# One extra slot on top of the fitted vocabulary (index 0 is the padding
# value used by the sequence input further down).
vocab_size = 1 + len(tokenizer.word_index)
# Longest caption measured in whitespace-separated words, markers included.
max_len = max(len(words) for words in map(str.split, all_captions))

# Step 3: Data Generator
def data_generator(captions_dict, features, tokenizer, max_length, vocab_size, batch_size):
    """Yield training batches for next-word prediction, forever.

    Every caption is wrapped in 'startseq' / 'endseq', tokenized, and expanded
    into one sample per prefix: the image feature plus the first ``i`` tokens
    are the inputs, token ``i`` (one-hot) is the target.

    Args:
        captions_dict: Mapping of image id -> iterable of caption strings.
        features: Mapping of image id -> feature array for that image.
            NOTE(review): each entry is used as-is; presumably it is already
            a flat vector matching the model's image input - confirm.
        tokenizer: Object exposing ``texts_to_sequences(list_of_str)``.
        max_length: Length every input token sequence is padded/truncated to.
        vocab_size: Width of the one-hot target vector.
        batch_size: Number of samples per yielded batch (must be >= 1).

    Yields:
        ``((image_features, token_sequences), targets)`` where the inputs are
        a *tuple* (tf.data treats a list as a single tensor, so a list would
        not match a nested-tuple ``output_signature``), ``token_sequences`` is
        int32 and ``targets`` is float32.

    Raises:
        ValueError: If ``batch_size`` is below 1, or a full pass over
            ``captions_dict`` produces no samples (the generator would
            otherwise spin forever without yielding).
        KeyError: If an image id has no entry in ``features``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Buffers live outside the pass loop so that a partial batch left over at
    # the end of one pass is completed by the next pass instead of being
    # silently discarded every time.
    X1, X2, y = [], [], []
    while True:
        produced_any = False
        for img_id, captions in captions_dict.items():
            for caption in captions:
                seq = tokenizer.texts_to_sequences(['startseq ' + caption + ' endseq'])[0]
                for i in range(1, len(seq)):
                    # Same layout as pad_sequences' defaults: keep the most
                    # recent `max_length` tokens and left-pad with zeros.
                    context = seq[max(0, i - max_length):i]
                    in_seq = np.zeros(max_length, dtype=np.int32)
                    if context:
                        in_seq[max_length - len(context):] = context

                    # One-hot target for the token that follows the prefix.
                    out_seq = np.zeros(vocab_size, dtype=np.float32)
                    out_seq[seq[i]] = 1.0

                    X1.append(features[img_id])
                    X2.append(in_seq)
                    y.append(out_seq)
                    produced_any = True

                    if len(y) == batch_size:
                        yield (np.array(X1), np.array(X2)), np.array(y)
                        X1, X2, y = [], [], []

        if not produced_any:
            raise ValueError(
                "data_generator produced no training samples; "
                "captions_dict is empty or every caption tokenizes to fewer than 2 tokens"
            )

# Step 4: Define the Model
# Two-input network: an image-feature branch and a caption-prefix branch are
# summed and decoded into a probability distribution over the vocabulary.
# NOTE: layers are created in a fixed order; Keras derives auto-generated
# layer names from creation order, so keep the statement order stable.

# Image branch: 2048-wide feature vector -> dropout -> 256-unit projection.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: token ids of length max_len -> 256-dim embeddings -> LSTM.
# mask_zero=True marks index 0 (the padding value) as masked for the LSTM.
inputs2 = Input(shape=(max_len,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder: element-wise sum of both 256-wide branches, then a dense layer and
# a softmax over vocab_size classes (the next-word prediction).
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# categorical_crossentropy expects one-hot targets of width vocab_size.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Step 5: Dataset Setup for Training
batch_size = 32
epochs = 10
# NOTE(review): this counts captions, while the generator emits one sample per
# caption *prefix*, so one "epoch" covers only a fraction of all training
# pairs. The generator is endless, so training still progresses through the
# data across epochs - confirm this epoch length is intended.
# max(1, ...) keeps steps_per_epoch valid when there are fewer captions than
# batch_size.
steps = max(1, len(all_captions) // batch_size)

# Element structure: ((image_features, token_ids), one_hot_targets).
output_signature = (
    (
        tf.TensorSpec(shape=(None, 2048), dtype=tf.float32),
        tf.TensorSpec(shape=(None, max_len), dtype=tf.int32),
    ),
    tf.TensorSpec(shape=(None, vocab_size), dtype=tf.float32)
)


def _tuple_batches():
    """Re-yield generator batches with the two inputs packed as a tuple.

    tf.data treats a Python list as a single tensor rather than a nesting
    level, so a batch whose inputs arrive as ``[X1, X2]`` does not match the
    nested-tuple ``output_signature`` above and from_generator rejects it.
    ``tuple(...)`` is a no-op when the inputs are already a tuple.
    """
    for inputs, targets in data_generator(
        cleaned_captions, image_features, tokenizer, max_len, vocab_size, batch_size
    ):
        yield tuple(inputs), targets


dataset = tf.data.Dataset.from_generator(
    _tuple_batches,
    output_signature=output_signature
)

# Step 6: Train the Model
# Keep only the weights with the lowest training loss seen so far.
checkpoint = ModelCheckpoint('model/image_caption_model.h5', monitor='loss', save_best_only=True, verbose=1)
model.fit(dataset, epochs=epochs, steps_per_epoch=steps, callbacks=[checkpoint])


# Notebook cell output:
# FileNotFoundError: [Errno 2] No such file or directory: 'pkl/cleaned_captions.pkl'