In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import Model
from keras.layers import *

# Extracting captions from captions.txt

In [2]:
import pandas as pd

with open('captions.txt', 'r') as f:
    captions_str = f.read()

In [3]:
import string

def clean_caption(text):
    caption = [ch for ch in text if ch not in string.punctuation]
    caption = ''.join(caption)
    caption = caption.split(' ')
    caption = [word.lower() for word in caption if len(word) > 1 and word.isalpha()]
    caption = ' '.join(caption)

    return caption

In [4]:
captions_arr = captions_str.split('\n')

In [5]:
rows = []

captions_text = ""

for i in range(1, len(captions_arr), 5):
    row = []

    row.append('Images/' + captions_arr[i].split('.jpg,')[0] + '.jpg')

    # print(captions_arr[i].split('.jpg,'))
    caption_1 = 'startseq ' + clean_caption(captions_arr[i].split('.jpg,')[1]) + ' endseq'
    row.append(caption_1)

    caption_2 = 'startseq ' + clean_caption(captions_arr[i+1].split('.jpg,')[1]) + ' endseq'
    row.append(caption_2)

    caption_3 = 'startseq ' + clean_caption(captions_arr[i+2].split('.jpg,')[1]) + ' endseq'
    row.append(caption_3)

    caption_4 = 'startseq ' + clean_caption(captions_arr[i+3].split('.jpg,')[1]) + ' endseq'
    row.append(caption_4)

    caption_5 = 'startseq ' + clean_caption(captions_arr[i+4].split('.jpg,')[1]) + ' endseq'
    row.append(caption_5)

    rows.append(row)

    captions_text = captions_text + " " + caption_1 + " " + caption_2 + " " + caption_3 + " " + caption_4 + " " + caption_5

captions_df = pd.DataFrame(columns=['image_path', 'caption_1', 'caption_2', 'caption_3', 'caption_4', 'caption_5'] , data=rows[1:])

In [6]:
captions_df.to_csv('captions.csv', index=False)

In [7]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts([captions_text])

In [8]:
vocab_size = 0
for word, index in tokenizer.word_index.items():
    if(vocab_size < index):
        vocab_size = index

vocab_size

8765

In [9]:
sentences = []

for row in captions_df.iloc:
    sentences.extend([row[1], row[2], row[3], row[4], row[5]])

In [10]:
tokenized_sentences = tokenizer.texts_to_sequences(sentences)

max_len = 0

for sentence in tokenized_sentences:
    if(max_len < len(sentence)):
        max_len = len(sentence)

max_len

34

In [11]:
from keras.utils import pad_sequences

def preprocess_caption(sentence):
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    tokenized_sentence = pad_sequences([tokenized_sentence], maxlen=max_len, padding='pre')
    return tokenized_sentence[0]

## Image Preprocessing

In [12]:
from keras.applications import ResNet50
from keras.layers import GlobalMaxPooling2D, Flatten

In [13]:
img_model = ResNet50(include_top=False, weights='imagenet')
img_model.trainable = False

# Add GlobalMaxPooling and Flatten layers on top of the base model
x = GlobalMaxPooling2D()(img_model.output)  # Apply global max pooling
x = Flatten()(x)  # Flatten the pooled features into a 1D vector

# Create a new model that ends after the flattening layer
img_model = Model(inputs=img_model.input, outputs=x)
img_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, None,  0           []                               
                                 3)]                                                              
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, None, None,   0           ['input_1[0][0]']                
                                3)                                                                
                                                                                                  
 conv1_conv (Conv2D)            (None, None, None,   9472        ['conv1_pad[0][0]']              
                                64)                                                           

In [14]:
from keras.utils import load_img, img_to_array
from keras.applications.resnet import preprocess_input
import numpy as np

def extract_img_features(img_path):
    img = load_img(img_path, target_size=(224, 224))
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    features = img_model.predict(img)

    return features

In [None]:
import pickle

for j in range(0, 9):

    print(f'Training for {1000*j} to {1000*(j+1)}')

    features = []
    X_sentences = []
    next_word = []

    if(j != 8):
        shortened_df = captions_df.iloc[1000*j:1000*(j+1), :]
    else:
        shortened_df = captions_df.iloc[8000:, :]

    for row in shortened_df.iloc:
        feature = extract_img_features(row.iloc[0])

        for sentence in row[1:]:
            tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

            for i in range(1,len(tokenized_sentence)):
                features.append(feature)
                X_sentences.append(tokenized_sentence[:i])
                next_word.append(tokenized_sentence[i])

    with open(f'features/features{1000*j}_{1000*(j+1)}.pkl', 'wb') as f:
         pickle.dump(features, f)

    with open(f'features/X_sentences{1000*j}_{1000*(j+1)}.pkl', 'wb') as f:
         pickle.dump(X_sentences, f)

    with open(f'features/next_word{1000*j}_{1000*(j+1)}.pkl', 'wb') as f:
         pickle.dump(next_word, f)

In [15]:
from keras.regularizers import L2

# Input for image features
image_features = Input(shape=(1, 2048))
dense1 = Dense(256, activation='relu', kernel_initializer='he_normal', kernel_regularizer = L2(1e-3))(image_features)
batch_norm1 = BatchNormalization()(dense1)
dropout1 = Dropout(0.4)(batch_norm1)

# Reshape dense1 to add a time-step dimension
dense1_reshaped = Reshape((1, 256))(dropout1)  # Output shape: (batch_size, 1, 256)

# Input for captions
caption_input = Input(shape=(max_len,))
embedding = Embedding(vocab_size, output_dim=256)(caption_input)
# dropout2 = Dropout(0.4)(embedding)

# GRU Encoder
gru_encoder = GRU(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.3, kernel_regularizer = L2(1e-3))(embedding)

# Concatenate dense1_reshaped and gru_encoder along the time-step dimension
combined = concatenate([dense1_reshaped, gru_encoder], axis=1)

# GRU Decoder
gru_decoder = GRU(128, kernel_regularizer = L2(1e-3), dropout=0.3, recurrent_dropout=0.3)(combined)

# Final output layer
output = Dense(vocab_size, activation='softmax')(gru_decoder)

# Define the model
model = Model(inputs=[image_features, caption_input], outputs=output)

# Print model summary
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 1, 2048)]    0           []                               
                                                                                                  
 dense (Dense)                  (None, 1, 256)       524544      ['input_2[0][0]']                
                                                                                                  
 batch_normalization (BatchNorm  (None, 1, 256)      1024        ['dense[0][0]']                  
 alization)                                                                                       
                                                                                                  
 input_3 (InputLayer)           [(None, 34)]         0           []                         

In [16]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
from keras.utils import pad_sequences
from keras.utils import to_categorical
import numpy as np

for j in range(0, 8):

    print(f'Training for {1000*j} to {1000*(j+1)} rows')

    with open(f'/kaggle/working/features/features{1000*j}_{1000*(j+1)}.pkl', 'rb') as f:
        features = pickle.load(f)

    with open(f'/kaggle/working/features/X_sentences{1000*j}_{1000*(j+1)}.pkl', 'rb') as f:
        X_sentences = pickle.load(f)

    with open(f'/kaggle/working/features/next_word{1000*j}_{1000*(j+1)}.pkl', 'rb') as f:
        next_word = pickle.load(f)
    
    next_word = to_categorical(next_word, num_classes=vocab_size)

    X_sentences = pad_sequences(X_sentences, maxlen=max_len, padding='pre')

    features =  np.array(features)
    X_sentences = np.array(X_sentences)

    model.fit([features, X_sentences], next_word, batch_size=512, epochs=40, validation_split=0.2, callbacks=None)

In [21]:
from keras.models import load_model
model = load_model('model_full_1.h5')

ValueError: Unrecognized keyword arguments: ['batch_shape']

In [305]:
def caption_generator(img_path):
    img_features = extract_img_features(img_path)
    img_features = np.reshape(img_features, (1, 1, 2048))
    # print('img_features :\n', img_features)
    caption_input = [tokenizer.word_index['startseq']]
    # print('caption_input :\n', caption_input)

    for i in range(max_len):
        seq = pad_sequences([caption_input], maxlen=max_len, padding='pre')
        # print('seq :\n', seq)
        prediction = model.predict([img_features, seq])
        next_word = np.argmax(prediction[0])
        word = tokenizer.index_word[next_word]

        if(word == 'endseq'):
            break

        caption_input.append(next_word)

    return ' '.join([tokenizer.index_word[index] for index in caption_input[1:]])

In [None]:
from keras.utils import load_img

path = '/kaggle/working/Images/1001773457_577c3a7d70.jpg'

image = load_img(path)

image

In [None]:
caption = caption_generator(path)
print(caption)