In [1]:
import numpy as np
import keras
from keras.preprocessing import text, sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import GRU, Dropout, Dense
from keras.callbacks import TensorBoard, ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Caption file: each non-empty line is parsed as "<image filename>#<caption index>"
# followed by whitespace and the caption text.
fname = 'data/Flickr8k.token.txt'
# Maps image filename -> list of caption strings, in the order they appear in the file.
img_to_caps = dict()

with open(fname, 'r') as f:
    for line in f:
        # Split on the FIRST run of whitespace (space or tab). Splitting on a literal
        # ' ' would, for a tab-separated file, leave the first caption word attached
        # to the "<image>#<n>" key and silently drop it from the caption.
        # NOTE(review): the distributed Flickr8k token file is believed to be
        # tab-separated -- confirm against the local copy.
        fields = line.strip().split(None, 1)
        if not fields:
            continue  # blank line (e.g. trailing newline at end of file)
        # Split on the last '#' so only the caption index is separated off.
        # A key without '#' still raises ValueError, as before.
        img_fname, num = fields[0].rsplit('#', 1)
        caption = fields[1].strip() if len(fields) > 1 else ''
        img_to_caps.setdefault(img_fname, []).append(caption)

In [3]:
# Vocabulary size. Used as the Tokenizer's nb_words, as the number of most-frequent
# words kept in sorted_word_counts, as the width of the one-hot targets, and as the
# number of units in the model's softmax output layer.
n_vocab = 100

In [4]:
# Fit a tokenizer on every caption so that its word counts can be used to pick
# the vocabulary.
tk = text.Tokenizer(nb_words=n_vocab)

# Flatten all per-image caption lists into one list of strings.
texts = []
for img_name in img_to_caps:
    texts.extend(img_to_caps[img_name])

tk.fit_on_texts(texts)

# (word, count) pairs ordered from most to least frequent, truncated to n_vocab.
# The ascending sort is reversed afterwards (rather than sorting descending) so
# that words with equal counts keep the same relative order as before.
sorted_word_counts = sorted(tk.word_counts.items(), key=lambda pair: pair[1])[::-1][:n_vocab]

In [5]:
# Two-way lookup between a word and its rank in sorted_word_counts
# (rank 0 = first entry of sorted_word_counts).
# NOTE(review): ids start at 0, and the generation cell skips id 0 when echoing
# the seed as if it were padding -- confirm whether the top-ranked word is meant
# to share id 0 with the padding value.
word_to_int, int_to_word = {}, {}
for rank, entry in enumerate(sorted_word_counts):
    word_to_int[entry[0]] = rank
    int_to_word[rank] = entry[0]

In [6]:
# Length of every model input window. Passed to pad_sequences as the target length,
# used as the time dimension of the GRU's input_shape, and used to size the
# rolling buffer in the generation cell.
max_seq_len = 16

In [7]:
# Maps image filename -> list of integer sequences, one per caption.
# Words that are not in word_to_int (i.e. outside the kept vocabulary) are dropped.
img_to_seqs = {}
for img_fname, captions in img_to_caps.items():
    seqs = []
    for caption in captions:
        # Tokenize exactly the way the fitted Tokenizer did (same filters, lowercasing
        # and split character). word_to_int's keys come from tk.word_counts, so a plain
        # caption.split() would miss any word whose raw form differs from its
        # normalised form (capitalised words, words with attached punctuation).
        words = text.text_to_word_sequence(
            caption, filters=tk.filters, lower=tk.lower, split=tk.split)
        seqs.append([word_to_int[w] for w in words if w in word_to_int])
    img_to_seqs[img_fname] = seqs

In [8]:
# For every image, turn each caption sequence into (prefix -> next word) training pairs:
#   img_to_padded_seqs[img] : int array, one row per prefix, each row max_seq_len long
#   img_to_next_chars[img]  : bool one-hot array of shape (n_prefixes, n_vocab)
img_to_padded_seqs, img_to_next_chars = {}, {}
for img_fname, seqs in img_to_seqs.items():
    partial_seqs = []
    next_words = []
    for seq in seqs:
        # Every proper, non-empty prefix seq[:i] predicts the word that follows it, seq[i].
        # Sequences with fewer than two words contribute nothing.
        for i in range(1, len(seq)):
            partial_seqs.append(seq[:i])
            next_words.append(seq[i])
    # Bring every prefix to exactly max_seq_len entries (pad_sequences default settings).
    padded_partial_seqs = sequence.pad_sequences(partial_seqs, maxlen=max_seq_len)

    # Builtin bool instead of np.bool: the np.bool alias was deprecated in NumPy 1.20
    # and removed in 1.24, where it raises AttributeError.
    next_words_1hot = np.zeros([len(next_words), n_vocab], dtype=bool)
    # Set one True per row at that row's target word id. The explicit integer dtype
    # keeps the index valid even when next_words is empty.
    next_words_1hot[np.arange(len(next_words)), np.asarray(next_words, dtype=np.intp)] = True

    img_to_padded_seqs[img_fname] = padded_partial_seqs
    img_to_next_chars[img_fname] = next_words_1hot

In [9]:
# Collect the per-image input and target arrays into two parallel lists.
# Both lists follow the key order of img_to_padded_seqs so rows stay aligned.
X_lst = [img_to_padded_seqs[img_fname] for img_fname in img_to_padded_seqs]
y_lst = [img_to_next_chars[img_fname] for img_fname in img_to_padded_seqs]

In [26]:
# Stack the per-image arrays row-wise into one training set.
X = np.concatenate(X_lst, axis=0)
y = np.concatenate(y_lst, axis=0)
# Append a trailing axis of length 1 so X matches the GRU's
# input_shape=(max_seq_len, 1).
X = X[:, :, np.newaxis]

In [None]:
# X = np.zeros([n_seqs, max_seq_len, n_vocab], dtype=np.bool)
# y = np.zeros([n_seqs, n_vocab], dtype=np.bool)

# for i,seq in enumerate(seqs):
#     for j,c in enumerate(seq):
#         X[i,j,char_to_int[c]] = 1
#     y[i,char_to_int[next_chars[i]]] = 1

In [17]:
# Next-word model: a single 256-unit GRU over the (max_seq_len, 1) input window,
# followed by a softmax over the n_vocab word ids.
model = Sequential([
    GRU(256, input_shape=(max_seq_len, 1)),
    Dense(n_vocab, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [18]:
# Training callbacks: save weights whenever the monitored training loss improves,
# and write TensorBoard logs under ./logs.
checkpoint_path = 'weights.{epoch:02d}-{loss:.2f}.hdf5'
model_checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=0,
)
tensorboard = TensorBoard(
    log_dir='./logs',
    histogram_freq=1,
    write_graph=True,
    write_images=False,
)
callbacks_list = [model_checkpoint, tensorboard]

In [27]:
# Uncomment to train

# nb_epoch = 20
# batch_size = 128
# model.fit(X, y, nb_epoch=nb_epoch, batch_size=batch_size, callbacks=callbacks_list)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f3b5d979c88>

In [39]:
# Restore previously saved weights into the model defined above, then compile it
# again with the same loss and optimizer.

weight_fname = 'imgcap_language_weights.hdf5'
model.load_weights(weight_fname)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [40]:
# Seed the generator with one randomly chosen training window, then extend it
# greedily for num_iter words.

num_iter = 100
curr_seq = np.zeros([1, max_seq_len, 1])
seed_row = np.random.randint(X.shape[0])
curr_seq[0, :, 0] = X[seed_row, :, 0]

# Echo the seed window; entries equal to 0 are skipped.
for word_id in curr_seq[0, :, 0]:
    if word_id != 0:
        print(sorted_word_counts[int(word_id)][0], end=' ')
print()

for _ in range(num_iter):
    prediction = model.predict(curr_seq)
    # Greedy decoding: always take the highest-scoring word id.
    idx = np.argmax(prediction)
    next_word = sorted_word_counts[idx][0]
    # Slide the window one step left and append the predicted id at the end.
    curr_seq[0, :-1, 0] = curr_seq[0, 1:, 0]
    curr_seq[0, -1, 0] = idx
    print(next_word, end=' ')


children 
are playing in a water in the snow and to the ball in a red shirt and blue next is a green with a in its mouth in the background her her her the water her the other in the air the ball in the background and a dog is a in the air to another her the water behind to the other in the red and another is wearing a ball in a pool and a dog in the background with a background in the background in front of a man with a woman and a of with a 