In [188]:
import os
import pandas as pd 
import pickle
import numpy as np
import re
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from PIL import Image
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding,Dropout,add,BatchNormalization,Embedding,LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
import glob as glob
import seaborn as sns

# Tokenization & Splitting

In [206]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(cleaned_data.to_list())
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in cleaned_data)
max_length

32

In [207]:
all_images=os.listdir(img_path)
len(all_images)

8091

In [208]:
train_images, test_images = train_test_split(all_images, test_size=0.15, random_state=42)

In [269]:
train_caption, test_caption =[],[]
for c in caption_IDs:
    img_id,img_cap=c.split('\t')
    if img_id in train_images:
        train_caption.append(c)
    elif img_id in test_images:
        test_caption.append(c)
    else:
        print(c)

# Extracting Image features

In [None]:
inceptionV3_model=InceptionV3(weights='imagenet',include_top=False,input_shape=(299, 299, 3))

In [222]:
# The second-to-last layer typically contains high-level features learned by the model.
inceptionV3_model = Model(inputs=inceptionV3_model.inputs, outputs=inceptionV3_model.layers[-2].output)

In [223]:
features = {}
for image in tqdm(data['image'].unique().tolist()):
    img = load_img(os.path.join(img_path, image), target_size=(299, 299))
    img = img_to_array(img)
    img = img / 255.0
    img = np.expand_dims(img, axis=0)
    feature = inceptionV3_model.predict(img, verbose=0)
    features[image] = feature.flatten()  # Flatten the features

  0%|          | 0/8091 [00:00<?, ?it/s]

In [225]:
train_image_features, test_image_features = {}, {} # A Dictionary to store image features with their corresponding IDs       
for id_ in features:
    if id_ in train_images:
        train_image_features[id_] = features[id_].flatten()  # Flattening the features    
    elif id_ in test_images:
        test_image_features[id_] = features[id_].flatten()  # Flattening the features
    else:
        print('Unknown image ID !')
# get the feature matrix for every image and store them in these dictionaries train_image_features, test_image_features

# Data Setup

In [272]:
def data_generator(captions, image_features, tokenizer, max_caption_len, batch_size):
    num_samples = len(captions)
    image_ids = list(image_features.keys())
    
    while True:
        np.random.shuffle(image_ids)
        
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            x_images, x_captions, y = [], [], []
            for caption in captions[start:end]:
                image_id = caption.split('\t')[0]  # Assuming your DataFrame has 'image' column
                caption_text = caption.split('\t')[1]  # Assuming 'cleaned_caption' is the processed caption text column
                
                seq = tokenizer.texts_to_sequences([caption_text])[0]
                
                for i in range(1, len(seq)):
                    in_seq, out_word = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_caption_len)[0]
                    out_word = to_categorical([out_word], num_classes=vocab_size)[0]
                    
                    x_images.append(image_features[image_id])  # Already flattened
                    x_captions.append(in_seq)
                    y.append(out_word)
            
            yield (np.array(x_images), np.array(x_captions)), np.array(y)

In [273]:
max_captions_len=max(len(line.split()) for line in cleaned_data)+1
max_captions_len

33

In [274]:
cnn_output_dim = inceptionV3_model.output_shape[1] * inceptionV3_model.output_shape[2] * inceptionV3_model.output_shape[3]
cnn_output_dim

12288

In [287]:
batch_train=64
batch_test=64

In [276]:
train_data_generator = data_generator(train_caption, train_image_features, tokenizer, max_captions_len, batch_train)
test_data_generator = data_generator(test_caption, test_image_features, tokenizer, max_captions_len, batch_test)


# Build The Model

In [294]:
def build_model(vocab_size, max_caption_len, cnn_output_dim):
    # Encoder
    Input_image = Input(shape=(cnn_output_dim,), name='Feature_Input')  # Flattened shape
    x = BatchNormalization()(Input_image)
    x = Dense(256, activation='relu')(x)
    x = BatchNormalization()(x)
    
    # Decoder
    Input_cation = Input(shape=(max_caption_len,), name='Caption_Input')
    y = Embedding(input_dim=vocab_size, output_dim=256, mask_zero=True)(Input_cation)
    y = LSTM(256)(y)
    
    # Output
    decoder = add([x, y])
    decoder = Dense(256, activation='relu')(decoder)
    output = Dense(vocab_size, activation='softmax', name='output_layer')(decoder)
    
    # Create the model
    model = Model(inputs=[Input_image, Input_cation], outputs=output)
    return model

In [295]:
model=build_model(vocab_size,max_captions_len,cnn_output_dim)
model.summary()

In [297]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.1, clipnorm=1.0))

# Training

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

def lr_scheduler(epoch, lr):
    return float(lr * tf.math.exp(-0.6))

    
lr_schedul = LearningRateScheduler(lr_scheduler)

steps_per_epoch = len(train) // batch_train
validation_steps = len(test) // batch_test

history = model.fit(
    train_data_generator,
    steps_per_epoch=steps_per_epoch,
    validation_data=test_data_generator,
    validation_steps=validation_steps,
    epochs=15,
    callbacks=[early_stopping, lr_schedul]
)


Epoch 1/15
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m555s[0m 5s/step - loss: 6.4640 - val_loss: 4.9123 - learning_rate: 0.0549
Epoch 2/15
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m536s[0m 5s/step - loss: 4.8345 - val_loss: 4.7899 - learning_rate: 0.0301
Epoch 3/15
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m540s[0m 5s/step - loss: 4.4630 - val_loss: 4.5336 - learning_rate: 0.0165
Epoch 4/15
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m540s[0m 5s/step - loss: 4.3137 - val_loss: 4.4567 - learning_rate: 0.0091
Epoch 5/15
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m543s[0m 5s/step - loss: 4.2220 - val_loss: 4.4484 - learning_rate: 0.0050
Epoch 6/15
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m534s[0m 5s/step - loss: 4.1722 - val_loss: 4.3680 - learning_rate: 0.0027
Epoch 7/15
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m534s[0m 5s/step - loss: 4.1616 - val_loss: 4.3858 - learnin

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(history.history['loss'], color='green', linestyle='-', marker='o', markersize=5, label='Train Loss')
plt.plot(history.history['val_loss'], color='red', linestyle='--', marker='x', markersize=5, label='Validation Loss')

plt.title('Model Loss Over Epochs', fontsize=16)
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Loss', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.grid(True, linestyle='--', alpha=0.7)

plt.legend(loc='upper left', fontsize=12)

plt.xlim(0, len(history.history['loss']) - 1)

plt.tight_layout()
plt.show()
