In [1]:
from CapGenerator import load_data as ld

data = ld.prepare_dataset('train')

Dataset: 6000
Descriptions: train=6000, test=1000
Photos: train=6000, test=1000


In [2]:
train_features, train_descriptions = data[0]

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
from CapGenerator import generate_model as gen


# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key Pin descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
        return all_desc

    # fit a tokenizer given caption descriptions
    def create_tokenizer(descriptions):
        lines = to_lines(descriptions)
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(lines)
        return tokenizer

    # calculate the length of the description with the most words
    def max_length(descriptions):
        lines = to_lines(descriptions)
        return max(len(d.split()) for d in lines)

    # create sequences of images, input sequences and output words for an image
    def create_sequences(tokenizer, max_length, desc_list, photo):
        X1, X2, y = list(), list(), list()
        # walk through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split one sequence into multiple X,y pairs
            for i in range(1, len(seq)):
             s
            # split into input and output pair
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                # store
                X1.append(photo)
                X2.append(in_seq)
                y.append(out_seq)
                return np.array(X1), np.array(X2), np.array(y)

            # data generator, intended to be used in a call to model.fit_generator()
            def data_generator(descriptions, photos, tokenizer, max_length):
                # loop for ever over images
                while 1:
                    for key, desc_list in descriptions.items():
                        # retrieve the photo feature
                        photo = photos[key][0]
                        print('Photo:')
                        print(photo.shape)
                        in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
                        yield [[in_img, in_seq], out_word]

                        # prepare tokenizer
                        tokenizer = create_tokenizer(train_descriptions)
                        vocab_size = len(tokenizer.word_index) + 1
                        print('Vocabulary Size: %d' % vocab_size)

                        # determine the maximum sequence length
                        max_length = max_length(train_descriptions)
                        print('Description Length: %d' % max_length)

                        # test the data generator
                        generator = gen.data_generator(train_descriptions, train_features, tokenizer, max_length)


SyntaxError: invalid syntax (<ipython-input-3-782ec568ef4b>, line 11)

In [None]:
inputs, outputs = next(generator)
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)

In [4]:
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model, Sequential
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import concatenate
from keras.layers.merge import add

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

EMBEDDING_DIM = 128


def define_model(vocab_size, max_length):
    # feature extractor (encoder)
    inputs1 = Input(shape=(4096, ))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(EMBEDDING_DIM, activation='relu')(fe1)
    fe3 = RepeatVector(max_length)(fe2)

    # embedding
    inputs2 = Input(shape=(max_length, ))
    emb2 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    emb3 = LSTM(256, return_sequences=True)(emb2)
    emb4 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(emb3)

    # merge inputs
    merged = concatenate([fe3, emb4])
    # language model (decoder)
    lm2 = LSTM(1000)(merged)
    #lm3 = Dense(500, activation='relu')(lm2)
    outputs = Dense(vocab_size, activation='softmax')(lm2)

    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    print(model.summary())
    plot_model(model, show_shapes=True, to_file='plot.png')
    return model

Using TensorFlow backend.


In [5]:
model = define_model(200, 34)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 34)           0                                            
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 4096)         0           input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 34, 256)      51200       input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (D

In [None]:
from keras.applications.vgg16 import VGG16

model = VGG16(weights='imagenet', include_top=True, input_shape = (224, 224, 3))

In [None]:
SVG(model_to_dot(model, show_shapes=True, show_layer_names=True).create(prog='dot', format='svg'))

In [None]:
plot_model(model, to_file='model.png', show_shapes=True)

In [None]:
for k, v in train_descriptions.items():
    

In [None]:
from keras.preprocessing.text import Tokenizer

    t = Tokenizer()  # all without .
    text = "<start> Tomorrow will be cold. <end>"
    text = text.replace(".", " .")
    t.fit_on_texts([text])
    print(t.word_index)

In [None]:
# extract features from each photo in the directory
def extract_features(filename):
    # load the model
    model = VGG16()
    # re-structure the model
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    # load the photo
    image = load_img(filename, target_size=(224, 224))
    # convert the image pixels to a numpy array
    image = img_to_array(image)
    # reshape data for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # prepare the image for the VGG model
    image = preprocess_input(image)
    # get features
    feature = model.predict(image, verbose=0)
    return feature