In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from keras.applications import ResNet50
from keras.layers import LSTM,Dense,Activation,Dropout,RepeatVector,Embedding,TimeDistributed, Add, Input
from keras.models import Sequential,Model
from keras.optimizers import Adam

Using TensorFlow backend.


# Model

In [2]:
# Load the pre-computed training arrays, in the same order as before.
# NOTE(review): what each array holds is inferred from its file name only;
# confirm against the preprocessing notebook that produced these files.
captions, next_words, images, image_names = (
    np.load(npy_path)
    for npy_path in (
        "./captions.npy",
        "./next_words.npy",
        "./images.npy",
        './image_names.npy',
    )
)

# One shape per line, so a row-count mismatch between the arrays is visible.
print(
    *(loaded.shape for loaded in (captions, next_words, images, image_names)),
    sep="\n",
)

# The vocabulary mapping is stored as a pickle; only load pickles you trust,
# since unpickling can execute arbitrary code.
with open('./word_2_indices.p', 'rb') as vocab_file:
    word_2_indices = pickle.load(vocab_file)

# Number of entries in the mapping; used later as the Embedding input_dim
# and as the width of the softmax output layer.
vocab_size = len(word_2_indices)
print(vocab_size)

(25493, 40)
(25493, 8254)
(25493, 2048)
(25493,)
8254


In [3]:
# Shared hyperparameters for both model branches:
#   embedding_size - width of the image projection and of the word embedding
#   max_len        - number of timesteps fed to the language branch
embedding_size, max_len = 128, 40

In [4]:
# Image branch: take a 2048-wide feature vector, project it down to
# `embedding_size`, then repeat it `max_len` times so it has one copy per
# caption timestep (output shape: (None, max_len, embedding_size)).
image_model_inp = Input(shape=(2048,))
image_model_ly1 = Dense(embedding_size, activation='relu')(image_model_inp)
image_model_ly2 = RepeatVector(max_len)(image_model_ly1)

# Keras 2 names these keyword arguments `inputs`/`outputs`; the singular
# `input`/`output` spellings are the legacy Keras 1 API and trigger a
# deprecation warning (or fail outright on newer releases).
image_model = Model(inputs=image_model_inp, outputs=image_model_ly2)
image_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               262272    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 40, 128)           0         
Total params: 262,272
Trainable params: 262,272
Non-trainable params: 0
_________________________________________________________________


  """


In [5]:
# Language branch: a sequence of `max_len` word indices is embedded, run
# through an LSTM that keeps every timestep, and projected per timestep to
# `embedding_size` so its output shape matches the image branch
# ((None, max_len, embedding_size)).
language_model_inp = Input(shape=(max_len,))
language_model_ly1 = Embedding(input_dim=vocab_size, output_dim=embedding_size)(language_model_inp)
language_model_ly2 = LSTM(256, return_sequences=True)(language_model_ly1)
language_model_ly3 = TimeDistributed(Dense(embedding_size))(language_model_ly2)

# Keras 2 names these keyword arguments `inputs`/`outputs`; the singular
# `input`/`output` spellings are the legacy Keras 1 API and trigger a
# deprecation warning (or fail outright on newer releases).
language_model = Model(inputs=language_model_inp, outputs=language_model_ly3)
language_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 40)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 40, 128)           1056512   
_________________________________________________________________
lstm_1 (LSTM)                (None, 40, 256)           394240    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 40, 128)           32896     
Total params: 1,483,648
Trainable params: 1,483,648
Non-trainable params: 0
_________________________________________________________________


  


In [6]:
# Combined captioning model: element-wise sum of the image and language
# branch outputs, two stacked LSTMs (the second returns only its final
# state), then a softmax over the vocabulary.
model_ly1 = Add()([image_model_ly2, language_model_ly3])
model_ly2 = LSTM(128, return_sequences=True)(model_ly1)
model_ly3 = LSTM(512, return_sequences=False)(model_ly2)
model_ly4 = Dense(vocab_size, activation='softmax')(model_ly3)

# Keras 2 names these keyword arguments `inputs`/`outputs`; the singular
# `input`/`output` spellings are the legacy Keras 1 API and trigger a
# deprecation warning (or fail outright on newer releases).
model = Model(inputs=[image_model_inp, language_model_inp], outputs=model_ly4)



  import sys


In [7]:
# Resume from the checkpoint written by the final cell when it exists.
# On a fresh checkout there is no checkpoint yet, so an unconditional
# load_weights() would abort "Restart & Run All"; train from scratch instead.
weights_path = "./model_weights.h5"
if os.path.isfile(weights_path):
    model.load_weights(weights_path)
else:
    print("No checkpoint found at %s; training from scratch." % weights_path)

# NOTE(review): categorical cross-entropy presumably matches one-hot
# `next_words` targets of width vocab_size - confirm against preprocessing.
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])

# `hist` keeps the Keras History object returned by fit() for later inspection.
hist = model.fit([images, captions], next_words, batch_size=512, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# Persist the trained weights to the same path the training cell loads from,
# so the next run can resume from this checkpoint.
model.save_weights(filepath='./model_weights.h5')