In [1]:
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Embedding, Input, RepeatVector, Concatenate, TimeDistributed, Activation
import numpy as np

In [2]:
embedding_size = 128 #dimensionality of the word embedding
max_len = 40
vocab_size=8254

In [3]:
image_input = Input(shape=(2048,))
image_dense = Dense(embedding_size, activation='relu')(image_input)
image_repeat = RepeatVector(max_len)(image_dense)

image_model = Model(inputs=image_input, outputs=image_repeat)

image_model.summary()

In [4]:
language_input = Input(shape=(max_len,))
language_embed = Embedding(input_dim=vocab_size, output_dim=embedding_size,input_shape=(40,))(language_input)
language_lstm = LSTM(256, return_sequences=True)(language_embed)
language_time_distributed = TimeDistributed(Dense(embedding_size))(language_lstm)

language_model = Model(inputs=language_input, outputs=language_time_distributed)

language_model.summary()

  super().__init__(**kwargs)


In [5]:
concatenated = Concatenate()([image_model.output, language_model.output])
x = LSTM(128, return_sequences=True)(concatenated)
x = LSTM(512, return_sequences=False)(x)
x = Dense(vocab_size)(x)
output = Activation('softmax')(x)

In [6]:
model = Model(inputs=[image_input, language_input], outputs=output)

# Compile the model (ensure to use the same loss function and optimizer as during training)
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])

In [7]:
model.load_weights("model_weights.h5")

In [8]:
model.summary()

In [9]:
from keras.preprocessing import image
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [10]:
def preprocessing(img_path):
    im = image.load_img(img_path, target_size=(224, 224, 3))
    im = image.img_to_array(im)
    im = np.expand_dims(im, axis=0)
    return im

In [11]:
def get_encoding(model, img):
    image = preprocessing(img)
    pred = model.predict(image).reshape(2048)
    return pred

In [12]:
from keras.applications import ResNet50
resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')

In [13]:
def predict_captions(image, model, max_len, word_2_indices, indices_2_word):
    start_word = ["<start>"]
    while True:
        par_caps = [word_2_indices[i] for i in start_word]
        par_caps = pad_sequences([par_caps], maxlen=max_len, padding='post')
        preds = model.predict([np.array([image]), np.array(par_caps)])
        word_pred = indices_2_word[np.argmax(preds[0])]
        start_word.append(word_pred)
        
        if word_pred == "<end>" or len(start_word) > max_len:
            break
            
    return ' '.join(start_word[1:-1])

In [15]:
import json
# Load word_2_indices from JSON file
with open('word_2_indices.json', 'r') as f:
    word_2_indices = json.load(f)

# Convert keys from strings to integers
word_2_indices = {key: int(value) for key, value in word_2_indices.items()}

# Load indices_2_word from JSON file
with open('indices_2_word.json', 'r') as f:
    indices_2_word = json.load(f)

# Convert keys from strings to integers
indices_2_word = {int(key): value for key, value in indices_2_word.items()}


In [16]:
img_path = "man.jpg"  # Replace with the path to your image
test_img = get_encoding(resnet, img_path)
predicted_caption = predict_captions(test_img, model, max_len, word_2_indices, indices_2_word)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/s

In [17]:
predicted_caption


'A boy in a blue shirt playing on a field . . .'

In [18]:
model.save('model.h5')



In [20]:
from keras.models import load_model

new_model=load_model('model.h5')



In [23]:
img_path = "dog.jpg"  # Replace with the path to your image
test_img = get_encoding(resnet, img_path)
predicted_caption = predict_captions(test_img,new_model, max_len, word_2_indices, indices_2_word)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4

In [24]:
predicted_caption

'A white dog shakes on the edge of the beach . . .'