In [1]:
from google.colab import drive
import os

# Mount Google Drive so the project files are reachable from the Colab VM.
drive.mount('/content/gdrive', force_remount=True)

# Use an absolute path: the original relative path only resolved while the
# kernel was still in /content, so re-running this cell after the first chdir
# raised FileNotFoundError. The absolute form makes the cell idempotent.
os.chdir('/content/gdrive/My Drive/Data_science_eng/finalproject/image-captioning-gru')
print("Current working directory: {0}".format(os.getcwd()))

Mounted at /content/gdrive
Current working directory: /content/gdrive/My Drive/Data_science_eng/finalproject/image-captioning-gru


In [2]:
import pickle
import numpy as np
import os
import matplotlib.pyplot as plt

In [3]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GRU, Embedding, TimeDistributed
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.optimizers import RMSprop

In [4]:
import tensorflow as tf
from tensorflow.python.keras import backend as K
# Build a TF1-compat session whose GPU memory grows on demand and which logs
# the device each operation is placed on.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
sess = tf.compat.v1.Session(config=config)

# Register the session through the public compat endpoint instead of the
# private `tensorflow.python.keras` backend. The recorded training output shows
# the standalone `keras` package in use, and the private module presumably keeps
# its own separate session state -- TODO confirm on the installed TF version.
tf.compat.v1.keras.backend.set_session(sess)

# Reports whether this TensorFlow build has CUDA support (displayed as the
# cell's value). NOTE(review): this is a build-time property, not a check that
# a GPU is actually visible at runtime.
tf.test.is_built_with_cuda()

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0



True

In [5]:
def _read_pickle(path):
  """Unpickle and return the object stored in the file at `path`."""
  # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
  with open(path, 'rb') as handle:
    return pickle.load(handle)


def _read_array(path):
  """Return the NumPy array stored in the .npy file at `path`."""
  with open(path, 'rb') as handle:
    return np.load(handle)


# Lookup tables -- presumably index<->word vocabulary mappings (verify contents).
indices_to_words = _read_pickle('./indices_to_words.pickle')
words_to_indices = _read_pickle('./words_to_indices.pickle')

# Arrays later fed to the model's 'Image-Input' input.
images_train = _read_array('./images_train.npy')
images_val = _read_array('./images_val.npy')

# Arrays later fed to the model's 'decoder-input' input.
captions_train = _read_array('./captions_train.npy')
captions_val = _read_array('./captions_val.npy')

# Arrays later used as the 'decoder-output' training/validation targets.
next_words_train = _read_array('./next_words_train.npy')
next_words_val = _read_array('./next_words_val.npy')

In [6]:
# Sanity check: display the shape of the training targets (the array later
# passed to fit as the 'decoder-output' labels).
next_words_train.shape

(30000, 40, 1)

In [7]:
# --- Model hyperparameters -------------------------------------------------
# Sequence / vocabulary dimensions
vocab_size = 8919      # Embedding input_dim and width of the softmax output
maxLen = 40            # Embedding input_length (tokens per caption sequence)

# Layer widths
embedding_size = 128   # Embedding output_dim
img_emb_size = 2048    # length of the image feature vector fed to 'Image-Input'
state_size = 512       # GRU units; also the width of the image projection

In [8]:
# --- Image branch -----------------------------------------------------------
# The image feature vector is projected (tanh) down to the GRU state size so it
# can be used as the recurrent layers' initial state.
img_input = Input(shape=(img_emb_size, ), name='Image-Input')
img_output = Dense(state_size, activation='tanh', name='Image-output')

# --- Caption branch ---------------------------------------------------------
# Sequence length is taken from maxLen (previously a hard-coded 40) so it cannot
# drift away from the Embedding's input_length below.
decoder_input = Input(shape=(maxLen,), name='decoder-input')

decoder_embedding = Embedding(input_dim = vocab_size, output_dim = embedding_size, input_length = maxLen, name='decoder-embedding')

# Three stacked GRU layers; each returns the full sequence so the next layer
# (and the per-timestep output layer) sees every step.
decoder_layer1 = GRU(state_size, name='decoder-layer-1', return_sequences = True)
decoder_layer2 = GRU(state_size, name='decoder-layer-2', return_sequences = True)
decoder_layer3 = GRU(state_size, name='decoder-layer-3', return_sequences = True)

# Softmax over the vocabulary, applied independently at every timestep.
decoder_time_dense = TimeDistributed(Dense(vocab_size, activation='softmax'), name='decoder-output')

# --- Wire the graph ---------------------------------------------------------
# The same image-derived vector seeds the initial state of all three GRU layers.
initial_state = img_output(img_input)
net = decoder_input
net = decoder_embedding(net)
net = decoder_layer1(net, initial_state = initial_state)
net = decoder_layer2(net, initial_state = initial_state)
net = decoder_layer3(net, initial_state = initial_state)

decoder_output = decoder_time_dense(net)

In [17]:
# Assemble the two-input model: (image features, caption tokens) -> per-timestep
# softmax over the vocabulary.
model = Model(
    inputs=[img_input, decoder_input],
    outputs=[decoder_output],
)

In [18]:
# Print the layer-by-layer architecture table (output shapes, parameter counts,
# and how the layers are connected).
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder-input (InputLayer)     [(None, 40)]         0           []                               
                                                                                                  
 Image-Input (InputLayer)       [(None, 2048)]       0           []                               
                                                                                                  
 decoder-embedding (Embedding)  (None, 40, 128)      1141632     ['decoder-input[0][0]']          
                                                                                                  
 Image-output (Dense)           (None, 512)          1049088     ['Image-Input[0][0]']            
                                                                                            

In [20]:
# Configure training. The sparse loss takes integer class ids as targets
# rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='RMSprop',
)

# Start from the weights stored in an earlier run instead of a fresh init.
model.load_weights('./model_weights.h5')

In [21]:
# Training inputs, keyed by the names of the model's two Input layers.
x_data = {'decoder-input': captions_train, 'Image-Input': images_train}

# Training targets, keyed by the name of the model's output layer.
y_data = {'decoder-output': next_words_train}

In [22]:
# Validation inputs, keyed by the names of the model's two Input layers.
x_val_data = {'decoder-input': captions_val, 'Image-Input': images_val}

# Validation targets, keyed by the name of the model's output layer.
y_val_data = {'decoder-output': next_words_val}

In [23]:
# Train for 20 epochs in mini-batches of 256, evaluating on the validation
# split after every epoch. The returned History object is the cell's value.
model.fit(
    x=x_data,
    y=y_data,
    validation_data=(x_val_data, y_val_data),
    epochs=20,
    batch_size=256,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb6fa2bf990>

In [15]:
# Persist the full model and, separately, just its weights.
#
# The suffix was '.h50', which looks like a typo for '.h5': the weights were
# written under a name that the load_weights('./model_weights.h5') call in this
# notebook never reads. With '.h5' the save and the load refer to the same file.
# Note that this overwrites the existing ./model_weights.h5 checkpoint.
#
# NOTE(review): in the recorded run this cell's execution count (15) is lower
# than the fit cell's (23), so it ran before that training finished. Re-run it
# after fit so the saved files contain the trained weights.
model.save('image-cap-model.h5')
model.save_weights("./model_weights.h5")

In [16]:
# Display the model's weights as a list of NumPy arrays.
# NOTE(review): this prints every weight array into the notebook output;
# consider showing only shapes, e.g. [w.shape for w in model.get_weights()].
model.get_weights()

[array([[-0.33307758,  0.33550197,  0.10830905, ..., -0.21449953,
          0.3960982 , -0.07804417],
        [-0.03361038,  0.00674953,  0.02386484, ..., -0.02963126,
         -0.00972021, -0.03473064],
        [-0.0704022 ,  0.01149779,  0.00630879, ...,  0.02804337,
          0.04644089, -0.06249357],
        ...,
        [-0.03645843,  0.02126919, -0.0111524 , ...,  0.00813331,
         -0.01248745,  0.00982001],
        [ 0.03833535,  0.01882918,  0.04525992, ..., -0.02025734,
         -0.04974176,  0.00167461],
        [ 0.01501792, -0.06770608, -0.01098049, ..., -0.02256813,
          0.01003502,  0.0138842 ]], dtype=float32),
 array([[ 0.09261385,  0.03378068,  0.02485102, ...,  0.05264533,
         -0.00798195, -0.01991087],
        [-0.06107935, -0.05279775, -0.04485036, ..., -0.001121  ,
          0.00750277, -0.04821083],
        [-0.04355156, -0.09892548,  0.01465156, ..., -0.01950648,
         -0.02299211, -0.13118951],
        ...,
        [ 0.00987985, -0.08615965,  0.0