In [3]:
from keras import backend as K
from keras.layers import Layer
import numpy as np
import six
from keras import backend as K
from keras.layers import Convolution2D, Dense, Embedding, GRU, Input
from keras.models import Model, Sequential

In [2]:
class L2Normalize(Layer):
    def __init__(self, axis=-1, **kwargs):
        self.axis = axis
        return super(L2Normalize, self).__init__(**kwargs)

    def call(self, x, mask=None):
        return K.l2_normalize(x, axis=self.axis)

    def get_config(self):
        base_config = super(L2Normalize, self).get_config()
        base_config.update(dict(axis=self.axis))
        return base_config

In [6]:
def encode_sentences(model, vocab_map, X, max_length=None,
        embedding_dim=1024, verbose=False, batch_size=128):
    '''Encode sentences into the joint embedding space.
    This is mostly from the original @ryankiros implementation.
    '''
    n_words = len(vocab_map)
    features = np.zeros((len(X), embedding_dim), dtype='float32')

    captions = [s.split() for s in X]
    if max_length is None:
        max_length = max((len(c) for c in captions))
    else:
        captions = [s[:max_length - 1] for s in captions]
    # quick check if a word is in the dictionary
    d = defaultdict(bool)
    for w in vocab_map.keys():
        d[w] = True

    k = max_length - 1
    if verbose:
        print(k)
    numbatches = len(captions) / batch_size + 1
    for minibatch in range(0, len(captions), batch_size):
        caption = captions[minibatch:minibatch + batch_size]

        seqs = []
        for i, cc in enumerate(caption):
            seqs.append([
                vocab_map[w] if d[w] > 0 and vocab_map[w] < n_words else 1 for w in cc
            ])
        x = np.zeros((k + 1, len(caption))).astype('int64')
        x_mask = np.zeros((k + 1, len(caption))).astype('float32')
        for idx, s in enumerate(seqs):
            x[:len(s), idx] = s
            x_mask[:len(s) + 1, idx] = 1.

        ff = model.predict(x.transpose(1, 0))
        for ind, c in enumerate(range(minibatch, minibatch + len(caption))):
            features[c] = ff[ind]

    return features

In [4]:
def build_image_encoder(weights=None, input_dim=4096, embedding_dim=1024, normalize=True):
    input = Input(shape=(input_dim,))
    x = Dense(
        embedding_dim,
        weights=weights
    )(input)
    if normalize:
        x = L2Normalize()(x)
    model = Model(input=input, output=x)
    return model


def build_sentence_encoder(embedding_weights=None, gru_weights=None, input_length=None, vocab_dim=32198,
        vocab_embedding_dim=300, embedding_dim=1024, normalize=True):
    # NOTE: This gives slightly different results than the original model.
    # I think it's because the original has a different masking scheme.
    model = Sequential([
        Embedding(
            vocab_dim, vocab_embedding_dim, input_length=input_length,
            weights=embedding_weights, mask_zero=True  # TODO: masking isn't quite right
        ),
        GRU(embedding_dim, weights=gru_weights, inner_activation='sigmoid'),
    ])
    if normalize:
        model.add(L2Normalize())
    return model


def build_pretrained_models(model_filename, input_length=None, normalize=True):
    img_enc_weights, embedding_weights, gru_weights, vocab_map = load_pretrained_parameters(model_filename)
    image_encoder = build_image_encoder(weights=img_enc_weights, normalize=normalize)
    sentence_encoder = build_sentence_encoder(
        embedding_weights=embedding_weights,
        gru_weights=gru_weights,
        input_length=input_length, vocab_dim=len(vocab_map),
        normalize=normalize)
    return image_encoder, sentence_encoder, vocab_map


def load_pretrained_parameters(filename):
    '''Load up the pre-trained weights from the @ryankiros implementation.
    '''
    params = np.load(filename)
    vocab_map = np.load('{}.dictionary.pkl'.format(filename))
    # image encoder weights
    if params:
        img_enc_weights = [params['ff_image_W'], params['ff_image_b']]
    else:
        img_enc_weights = None
    # sentence encoder weights
    embedding_weights = [params['Wemb']]
    W_h = params['encoder_Wx']
    U_h = params['encoder_Ux']
    b_h = params['encoder_bx']
    W_r, W_z = np.split(params['encoder_W'], 2, axis=1)
    U_r, U_z = np.split(params['encoder_U'], 2, axis=1)
    b_r, b_z = np.split(params['encoder_b'], 2)
    gru_weights = [
        W_z, U_z, b_z,
        W_r, U_r, b_r,
        W_h, U_h, b_h,
    ]
    return img_enc_weights, embedding_weights, gru_weights, vocab_map

In [29]:
image_encoder, sentence_encoder, vocab_map = build_pretrained_models('vse/coco.npz')



ValueError: You called `set_weights(weights)` on layer "gru_6" with a  weight list of length 9, but the layer was expecting 3 weights. Provided weights: [array([[ 0.12316032,  0.00989882,  0.01626704, .....

In [14]:
img_enc_weights, embedding_weights, gru_weights, vocab_map = load_pretrained_parameters('vse/f8k.npz')
image_encoder = build_image_encoder(weights=img_enc_weights, normalize=True)
sentence_encoder = build_sentence_encoder(
    embedding_weights=embedding_weights,
    gru_weights=gru_weights,
    input_length=None, vocab_dim=len(vocab_map),
    normalize=True)
image_encoder, sentence_encoder, vocab_map



ValueError: You called `set_weights(weights)` on layer "gru_3" with a  weight list of length 9, but the layer was expecting 3 weights. Provided weights: [array([[-0.07494818,  0.08813515, -0.10807338, .....

In [18]:
embedding_weights[0].shape

(8919, 300)

In [20]:
gru_weights[0].shape

(300, 1024)

In [26]:
input_length=None
vocab_dim=len(vocab_map)
vocab_embedding_dim=300
normalize=True
embedding_dim=1024

model = Sequential([
    Embedding(
        vocab_dim, vocab_embedding_dim, input_length=input_length,
        mask_zero=True, name='embedding'  # TODO: masking isn't quite right weights=embedding_weights, 
    ),
    GRU(embedding_dim, inner_activation='sigmoid', name='gru'), # weights=gru_weights, 
])

model.summary()




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 300)         2675700   
_________________________________________________________________
gru (GRU)                    (None, 1024)              4070400   
Total params: 6,746,100.0
Trainable params: 6,746,100
Non-trainable params: 0.0
_________________________________________________________________


In [27]:
model.get_layer('gru').set_weights(gru_weights)

ValueError: You called `set_weights(weights)` on layer "gru" with a  weight list of length 9, but the layer was expecting 3 weights. Provided weights: [array([[-0.07494818,  0.08813515, -0.10807338, .....

In [28]:
len(gru_weights)

9

In [44]:
len(model.get_layer('gru').get_weights())

3

In [41]:
embedding_weights[0].shape

(8919, 300)

In [42]:
gru_weights[0].shape

(300, 1024)