In [None]:
import numpy as np
import pandas as pd
import pickle
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, RepeatVector, TimeDistributed, Merge, Masking
from keras.layers.merge import add, concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD

In [None]:
def load_npy(path):
    """Load an array saved in NumPy ``.npy`` format.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read; it is opened in binary mode.

    Returns
    -------
    The object produced by ``np.load`` for that file.
    """
    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with open(path, "rb") as handle:
        return np.load(handle)

In [None]:
# Read the training inputs, the embedding matrix and the training targets from disk.
# Files are loaded in the listed order and bound to the matching names.
(X_train_photos,
 X_train_captions,
 embedding_matrix,
 y_train) = [
    load_npy(npy_path)
    for npy_path in (
        "../data/preprocessed/X_train_photos.npy",
        "../data/preprocessed/X_train_captions.npy",
        "../data/embedding_matrix/embedding_matrix.npy",
        "../data/preprocessed/y_train.npy",
    )
]

In [None]:
# Sanity check: show the shape of every loaded array, one per line.
print(
    X_train_photos.shape,
    X_train_captions.shape,
    y_train.shape,
    embedding_matrix.shape,
    sep="\n",
)

In [None]:
# Vocabulary size. Used below as the Embedding input dimension, as the width of the
# softmax output layer, and as the class count handed to to_categorical.
# NOTE(review): presumably this must equal embedding_matrix.shape[0], because that
# matrix is passed as the Embedding weights — confirm against the printed shape.
VOCAB_SIZE = 30212

In [None]:
# Model 1: the photo features and the caption tokens are encoded by separate
# branches, concatenated, and mapped to a softmax over the vocabulary.

# Photo branch: 4096-dimensional input -> dropout -> 256-unit ReLU layer.
inputs_photo = Input(shape=(4096,), name="Inputs-photo")
drop1 = Dropout(0.5)(inputs_photo)
dense1 = Dense(256, activation='relu')(drop1)

# Caption branch: 15 token indices -> frozen 300-d embedding initialised from
# embedding_matrix (mask_zero=True, so index 0 is masked as padding)
# -> dropout -> 256-unit LSTM.
inputs_caption = Input(shape=(15,), name="Inputs-caption")
embedding = Embedding(VOCAB_SIZE, 300,
                      mask_zero=True, trainable=False,
                      weights=[embedding_matrix])(inputs_caption)
drop2 = Dropout(0.5)(embedding)
lstm1 = LSTM(256)(drop2)

# Fuse both 256-d encodings and predict a distribution over VOCAB_SIZE classes.
merged = concatenate([dense1, lstm1])
dense2 = Dense(256, activation='relu')(merged)
outputs = Dense(VOCAB_SIZE, activation='softmax')(dense2)

model = Model(inputs=[inputs_photo, inputs_caption], outputs=outputs)
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd)

# summary() prints the table itself and returns None; wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()
plot_model(model, to_file='images/model1.png', show_shapes=True, show_layer_names=False)

![Architecture diagram of model 1](images/model1.png)

In [None]:
# Train for one epoch. The targets are one-hot encoded over VOCAB_SIZE classes
# before being handed to fit().
# NOTE(review): to_categorical builds a dense (num_samples, VOCAB_SIZE) array, which
# can be very large. Compiling with 'sparse_categorical_crossentropy' and passing
# y_train directly would avoid it, but needs the compile call changed as well.
model.fit(
    x=[X_train_photos, X_train_captions],
    y=to_categorical(y_train, VOCAB_SIZE),
    epochs=1,
    verbose=1,
)

In [None]:
# Model 2: the photo is projected to the embedding width and placed in front of the
# caption embeddings as one extra timestep; a single LSTM reads the joint sequence.

# Photo branch: 4096-dimensional input -> dropout -> 300-unit ReLU layer, repeated
# once so it becomes a length-1 sequence.
# NOTE(review): Masking() is presumably applied so that both concatenate inputs
# carry a mask — confirm it is needed.
inputs_photo = Input(shape=(4096,), name="Inputs-photo")
drop1 = Dropout(0.5)(inputs_photo)
dense1 = Dense(300, activation='relu')(drop1)
cnn_feats = Masking()(RepeatVector(1)(dense1))

# Caption branch: 15 token indices -> frozen 300-d embedding initialised from
# embedding_matrix (mask_zero=True, so index 0 is masked as padding).
inputs_caption = Input(shape=(15,), name="Inputs-caption")
embedding = Embedding(VOCAB_SIZE, 300,
                      mask_zero=True, trainable=False,
                      weights=[embedding_matrix])(inputs_caption)

# Join along the time axis: 1 photo step followed by the 15 caption steps.
merged = concatenate([cnn_feats, embedding], axis=1)

# NOTE(review): input_shape is likely redundant because the layer is called
# directly on a tensor — confirm before removing.
lstm_layer = LSTM(units=300,
                  input_shape=(15 + 1, 300),
                  return_sequences=False,
                  dropout=0.5)(merged)

outputs = Dense(units=VOCAB_SIZE, activation='softmax')(lstm_layer)

model = Model(inputs=[inputs_photo, inputs_caption], outputs=outputs)
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# The sparse loss takes the targets as class indices rather than one-hot rows.
model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd)

# summary() prints the table itself and returns None; wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()
plot_model(model, to_file='images/model6.png', show_shapes=True, show_layer_names=False)



![Architecture diagram of the second model (model6.png)](images/model6.png)

In [None]:
# Train for one epoch; y_train is passed to fit() unchanged (no one-hot encoding).
model.fit(
    x=[X_train_photos, X_train_captions],
    y=y_train,
    epochs=1,
    verbose=1,
)