In [1]:
"""
1. generate data
    Loads vocab
    Loads image features
    Provide data for training
    
2. Builds image caption model
3. Training 
4. Evaluation
"""

import os
import sys
import tensorflow as tf
from tensorflow import gfile
from tensorflow import logging
import pprint
# import cPickle # for python 2
import _pickle as cPickle  # for python 3
import numpy as np
import math

input_description_file = "./data/results_20130124.token"
input_img_feature_dir = "./data/feature_extraction_inception_v3"
input_vocab_file = "./data/vocab.txt"
output_dir = "./data/local_run"
checkpoint_dir = "./data/local_run"

if not gfile.Exists(output_dir):
    gfile.MakeDirs(output_dir)

# hyper parameters
def get_default_params():
    return tf.contrib.training.HParams(
        num_vocab_word_threshold=3,
        # LSTM
        num_embedding_nodes=32,
        num_timesteps=10,
        num_lstm_nodes=[64, 64],
        num_lstm_layers=2,
        # fc
        num_fc_nodes=32,
        # train
        batch_size=50,
        cell_type='lstm',
        # clip gradient
        clip_lstm_grads=1.0,
        learning_rate=0.001,
        keep_prob=0.8,
        # save file
        log_frequent=10,
        save_frequent=100,
    )

hps = get_default_params()

In [2]:
class Vocab(object):
    """Loads vocab table"""
    def __init__(self, filename, word_num_threshold):
        self._id_to_word = {}
        self._word_to_id = {}
        self._unk = -1
        self._eos = -1
        self._word_num_threshold = word_num_threshold
        self._read_dict(filename)

    def _read_dict(self, filename):
        with gfile.GFile(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            word, occurence = line.strip('\r\n').split('\t')
            occurence = int(occurence)
            if word != '<UNK>' and occurence < self._word_num_threshold:
                continue
            idx = len(self._id_to_word)
            if word == '<UNK>':
                self._unk = idx
            elif word == '.':
                self._eos = idx
            if idx in self._id_to_word or word in self._word_to_id:
                raise Exception('duplicate words in vocab file')
            self._word_to_id[word] = idx
            self._id_to_word[idx] = word

    @property
    def unk(self):
        return self._unk

    @property
    def eos(self):
        return self._eos

    def word_to_id(self, word):
        return self._word_to_id.get(word, self.unk)

    def id_to_word(self, cur_id):
        return self._id_to_word.get(cur_id, '<UNK>')

    def size(self):
        return len(self._word_to_id)

    def encode(self, sentence):
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split(' ')]
        return word_ids

    def decode(self, sentence_id):
        words = [self.id_to_word(word_id) for word_id in sentence_id]
        return ' '.join(words)
    
# testing
vocab = Vocab(input_vocab_file, hps.num_vocab_word_threshold)
vocab_size = vocab.size()
logging.info("vocab_size: %d" % vocab_size)

pprint.pprint(vocab.encode("I have a dream"))
pprint.pprint(vocab.decode([5,23,6,352]))
    


INFO:tensorflow:vocab_size: 10875
[1494, 389, 1, 0]
'the young on toddler'


In [3]:
def parse_token_file(token_file):
    """Parses image description file into a dict with 
    key: img_names, value: a list of descriptions"""
    img_name_to_tokens = {}
    with gfile.GFile(token_file, 'r') as f:
        lines = f.readlines()
    for line in lines:
        img_id, description = line.strip('\r\n').split('\t')
        img_name, _ = img_id.split('#')
        img_name_to_tokens.setdefault(img_name, [])
        img_name_to_tokens[img_name].append(description)
    return img_name_to_tokens

def convert_token_to_id(img_name_to_tokens, vocab):
    """Converts each description of imgs to a list of id. """
    img_name_to_token_ids = {}
    for img_name in img_name_to_tokens:
        img_name_to_token_ids.setdefault(img_name, [])
        descriptions = img_name_to_tokens[img_name]
        for description in descriptions:
            token_ids = vocab.encode(description)
            img_name_to_token_ids[img_name].append(token_ids)
    return img_name_to_token_ids


    
    
img_name_to_tokens = parse_token_file(input_description_file)
img_name_to_token_ids = convert_token_to_id(img_name_to_tokens, vocab)

logging.info("num of all images: %d" % len(img_name_to_tokens))
pprint.pprint(list(img_name_to_tokens.keys())[0:5])
pprint.pprint(img_name_to_tokens['2778832101.jpg'])

logging.info("num of all images: %d" % len(img_name_to_token_ids))
pprint.pprint(list(img_name_to_token_ids.keys())[0:5])
pprint.pprint(img_name_to_token_ids['2778832101.jpg'])

INFO:tensorflow:num of all images: 31783
['1000092795.jpg',
 '10002456.jpg',
 '1000268201.jpg',
 '1000344755.jpg',
 '1000366164.jpg']
['A man in jeans is reclining on a green metal bench along a busy sidewalk and '
 'crowded street .',
 'A white male with a blue sweater and gray pants laying on a sidewalk bench .',
 'A man in a blue shirt and gray pants is sleeping on a sidewalk bench .',
 'A person is sleeping on a bench , next to cars .',
 'A man sleeping on a bench in a city area .']
INFO:tensorflow:num of all images: 31783
['1000092795.jpg',
 '10002456.jpg',
 '1000268201.jpg',
 '1000344755.jpg',
 '1000366164.jpg']
[[3, 9, 4, 132, 8, 3532, 6, 1, 48, 337, 146, 139, 1, 244, 93, 7, 380, 36, 2],
 [3, 20, 179, 11, 1, 26, 284, 7, 120, 128, 297, 6, 1, 93, 146, 2],
 [3, 9, 4, 1, 26, 21, 7, 120, 128, 8, 340, 6, 1, 93, 146, 2],
 [3, 63, 8, 340, 6, 1, 146, 12, 70, 15, 518, 2],
 [3, 9, 340, 6, 1, 146, 4, 1, 112, 171, 2]]


In [4]:
class ImageCaptionData(object):
    """Provide data for this model"""
    def __init__(self,
                 img_name_to_token_ids,
                 img_feature_dir,
                 num_timesteps,
                 vocab,
                 deterministic = False):
        self._vocab = vocab
        
        self._img_name_to_token_ids = img_name_to_token_ids
        self._num_timesteps = num_timesteps
        self._indicator = 0
        self._deterministic = deterministic
        self._img_feature_filenames = []
        self._img_feature_data = []
        
        self._all_img_feature_filepaths = []
        for filename in gfile.ListDirectory(img_feature_dir):
            self._all_img_feature_filepaths.append(
                os.path.join(img_feature_dir, filename))
        pprint.pprint(self._all_img_feature_filepaths)
        
        self._load_img_feature_pickle()
        
        if not self._deterministic:
            self._random_shuffle()


    def _load_img_feature_pickle(self):
        """Load img features data form pickle files"""
        for filepath in self._all_img_feature_filepaths:
            logging.info("loading %s" % filepath)
            with gfile.GFile(filepath, 'rb') as f:
                filenames, features = cPickle.load(f, encoding='latin1')   # !!!
                self._img_feature_filenames += filenames # merge
                self._img_feature_data.append(features) # append
        
        # [(1000,1,1,2048), (1000,1,1,2048)] -> (2000,1,1,2048)
        self._img_feature_data = np.vstack(self._img_feature_data)
        
        origin_shape = self._img_feature_data.shape
         # (2000,1,1,2048)->(2000,2048)
        self._img_feature_data = np.reshape( 
            self._img_feature_data, (origin_shape[0], origin_shape[3]))
        self._img_feature_filenames = np.asarray(self._img_feature_filenames)
        
        print(self._img_feature_data.shape)
        print(self._img_feature_filenames.shape)
        if not self._deterministic:
            self._random_shuffle()


    def size(self):
        return len(self._img_feature_filenames)

    def img_feature_size(self):
        return self._img_feature_data.shape[1]

    def _random_shuffle(self):
        """Shuffle data randomly"""
        p = np.random.permutation(self.size())
        self._img_feature_filenames = self._img_feature_filenames[p]
        self._img_feature_data = self._img_feature_data[p]

    def _img_desc(self, filenames):
        """Get dexcription for filenames in  batch"""
        batch_sentence_ids = []
        batch_weights = []
        
        for filename in filenames:
            token_ids_set = self._img_name_to_token_ids[filename]
            # chosen_token_ids = random.choice(token_ids_set)
            chosen_token_ids = token_ids_set[0]
            chosen_token_length = len(chosen_token_ids)

            weight = [1 for i in range(chosen_token_length)]
            # 截断
            if chosen_token_length >= self._num_timesteps:
                chosen_token_ids = chosen_token_ids[0:self._num_timesteps]
                weight = weight[0:self._num_timesteps]
            else:# 填充
                remaining_length = self._num_timesteps - chosen_token_length
                chosen_token_ids += [self._vocab.eos for i in range(remaining_length)]
                weight += [0 for i in range(remaining_length)]
            batch_sentence_ids.append(chosen_token_ids)
            batch_weights.append(weight)
        # conver tot numpy arrays
        batch_sentence_ids = np.asarray(batch_sentence_ids)
        batch_weights = np.asarray(batch_weights)
        
        return batch_sentence_ids, batch_weights

    def next_batch(self, batch_size):
        end_indicator = self._indicator + batch_size
        if end_indicator > self.size():
            if not self._deterministic:
                self._random_shuffle()
            self._indicator = 0  # reset
            end_indicator = self._indicator + batch_size # reset
        assert end_indicator <= self.size()

        batch_img_features = self._img_feature_data[self._indicator: end_indicator]
        batch_img_names = self._img_feature_filenames[self._indicator: end_indicator]
        
        # batch_sentence_ids:[100,34,23,1,0,0,0]->batch_weights:[1,1,1,1,0,0,0]
        # '1' represents calculating the wieights, '0' no calculating 
        batch_sentence_ids, batch_weights = self._img_desc(batch_img_names)

        self._indicator = end_indicator
        return batch_img_features, batch_sentence_ids, batch_weights, batch_img_names


caption_data = ImageCaptionData(img_name_to_token_ids, 
                                input_img_feature_dir,
                                hps.num_timesteps, vocab)
img_feature_dim = caption_data.img_feature_size()
caption_data_size = caption_data.size()
logging.info("img_feature_dim: %d" % img_feature_dim)
logging.info("caption_data_size: %d" % caption_data_size)

batch_img_features, batch_sentence_ids, batch_weights, batch_img_names = caption_data.next_batch(5)
pprint.pprint(batch_img_features)
pprint.pprint(batch_sentence_ids)
pprint.pprint(batch_weights)
pprint.pprint(batch_img_names)


['./data/feature_extraction_inception_v3/image_features-22.pickle',
 './data/feature_extraction_inception_v3/image_features-0.pickle',
 './data/feature_extraction_inception_v3/image_features-1.pickle',
 './data/feature_extraction_inception_v3/image_features-10.pickle',
 './data/feature_extraction_inception_v3/image_features-11.pickle',
 './data/feature_extraction_inception_v3/image_features-12.pickle',
 './data/feature_extraction_inception_v3/image_features-13.pickle',
 './data/feature_extraction_inception_v3/image_features-14.pickle',
 './data/feature_extraction_inception_v3/image_features-15.pickle',
 './data/feature_extraction_inception_v3/image_features-16.pickle',
 './data/feature_extraction_inception_v3/image_features-17.pickle',
 './data/feature_extraction_inception_v3/image_features-18.pickle',
 './data/feature_extraction_inception_v3/image_features-19.pickle',
 './data/feature_extraction_inception_v3/image_features-2.pickle',
 './data/feature_extraction_inception_v3/image_feat

In [5]:
"""
Construct the graph
"""
def create_rnn_cell(hidden_dim, cell_type):
    """Choose between lstm & gru"""
    if cell_type == 'lstm':
        return tf.contrib.rnn.BasicLSTMCell(hidden_dim, state_is_tuple=True)
    elif cell_type == 'gru':
        return tf.contrib.rnn.GRUCell(hidden_dim)
    else:
        raise Exception("%s has not been supported" % cell_type)

        
def dropout(cell, keep_prob):
    """drop out operation for RNN layer, not suitable for fc"""
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)


def get_train_model(hps, vocab_size, img_feature_dim):
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    img_feature  = tf.placeholder(tf.float32, (batch_size, img_feature_dim))
    sentence = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    mask = tf.placeholder(tf.float32, (batch_size, num_timesteps))
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step', trainable=False)

    # Sets up the embedding layer.
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embeddings',
            [vocab_size, hps.num_embedding_nodes],
            tf.float32)
        # embed_token_ids: [batch_size, num_timesteps-1, num_embedding_nodes]
        embed_token_ids = tf.nn.embedding_lookup(embeddings, sentence[:, 0:num_timesteps-1])

    img_feature_embed_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('image_feature_embed', initializer=img_feature_embed_init):
        # img_feature: [batch_size, img_feature_dim]
        # after fc:    [batch_size, num_embedding_nodes] as embed_img
        # for concat embed_img and embed_token_ids, 
        # need to make embed_img & embed_token_id the same size
        embed_img = tf.layers.dense(img_feature, hps.num_embedding_nodes)
        # now: embed_img: [batch_size, 1, num_embedding_nodes]
        embed_img = tf.expand_dims(embed_img, 1)  #add one dimention
        # embed_inputs: [batch_size, num_timesteps, num_embedding_nodes]
        embed_inputs = tf.concat([embed_img, embed_token_ids], axis=1)
        
        """Now we have our inputs for LSTM"""

        
        
    # Sets up LSTM network.
    scale = 1.0 / math.sqrt(hps.num_embedding_nodes + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer=lstm_init):
        # 多层的LSTM
        cells = []  
        for i in range(hps.num_lstm_layers):
            cell = create_rnn_cell(hps.num_lstm_nodes[i], hps.cell_type)
            cell = dropout(cell, keep_prob)  #drop out wrapper
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)

        initial_state = cell.zero_state(hps.batch_size, tf.float32)
        # rnn_outputs: [batch_size, num_timesteps, hps.num_lstm_node[-1]] 
        rnn_outputs, _ = tf.nn.dynamic_rnn(cell,
                                           embed_inputs,
                                           initial_state=initial_state)

    # Sets up the fully-connected layer.
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        rnn_outputs_2d = tf.reshape(rnn_outputs, [-1, hps.num_lstm_nodes[-1]])
        fc1 = tf.layers.dense(rnn_outputs_2d, hps.num_fc_nodes, name='fc1')
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        fc1_dropout = tf.nn.relu(fc1_dropout)
        logits = tf.layers.dense(fc1_dropout, vocab_size, name='logits')

        
    # calculate loss
    with tf.variable_scope('loss'):
        # flatten the ground truth, & mask
        sentence_flatten = tf.reshape(sentence, [-1])  
        mask_flatten = tf.reshape(mask, [-1])
        mask_sum = tf.reduce_sum(mask_flatten)
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, 
            labels=sentence_flatten)
        
        #去除weight为0 的元素
        weighted_softmax_loss = tf.multiply(softmax_loss,
                                            tf.cast(mask_flatten, tf.float32))
        
        # calculate accuracy
        prediction = tf.argmax(logits, 1, output_type = tf.int32)
        correct_prediction = tf.equal(prediction, sentence_flatten)
        correct_prediction_with_mask = tf.multiply(
            tf.cast(correct_prediction, tf.float32),
            mask_flatten)
        accuracy = tf.reduce_sum(correct_prediction_with_mask) / mask_sum
        
        loss = tf.reduce_sum(weighted_softmax_loss) / mask_sum
        tf.summary.scalar('loss', loss)

        
    # define train_op
    with tf.variable_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            logging.info("variable name: %s" % (var.name))
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(loss, tvars), hps.clip_lstm_grads)
        for grad, var in zip(grads, tvars):
            tf.summary.histogram('%s_grad' % (var.name), grad)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

    return ((img_feature, sentence, mask, keep_prob),
            (loss, accuracy, train_op),
            global_step)

placeholders, metrics, global_step = get_train_model(hps, vocab_size, img_feature_dim)
img_feature, sentence, mask, keep_prob = placeholders
loss, accuracy, train_op = metrics

summary_op = tf.summary.merge_all()

init_op = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=10)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
INFO:tensorflow:variable name: embedding/embeddings:0
INFO:tensorflow:variable name: image_feature_embed/dense/kernel:0
INFO:tensorflow:variable name: image_feature_embed/dense/bias:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0
INFO:tensorflow:variable name: fc/fc1/kernel:0
INFO:tensorflow:variable name: fc/fc1/bias:0
INFO:tensorflow:variable name: fc/logits/kernel:0
INFO:tensorflow:variable name: fc/logits/bias:0
INFO:tensorflow:Summary name embedding/embeddings:0_grad is illegal; using embedding/embeddings_0_grad instead.
INFO:tensorflow:Summary name ima

In [6]:
"""
training run this graph
"""
training_steps = 2000  # 10000

with tf.Session() as sess:
    sess.run(init_op)
    writer = tf.summary.FileWriter(output_dir, sess.graph)
    for i in range(training_steps):
        (batch_img_features, 
         batch_sentence_ids, 
         batch_weights, _) = caption_data.next_batch(hps.batch_size)
        input_vals = (batch_img_features, 
                      batch_sentence_ids, 
                      batch_weights, 
                      hps.keep_prob)
        
        feed_dict = dict(zip(placeholders, input_vals))
        fetches = [global_step, loss, accuracy, train_op]
        
        should_log = (i + 1) % hps.log_frequent == 0
        should_save = (i + 1) % hps.save_frequent == 0
        if should_log:
            fetches += [summary_op]
        outputs = sess.run(fetches, feed_dict)
        global_step_val, loss_val, accuracy_val = outputs[0:3]
        
        if should_log:
            summary_str = outputs[4]
            writer.add_summary(summary_str, global_step_val)
            logging.info('Step: %5d, loss: %3.3f, accuracy: %3.3f'
                         % (global_step_val, loss_val, accuracy_val))
        if should_save:
            logging.info("Step: %d, image caption model saved" % (global_step_val))
            saver.save(sess, os.path.join(output_dir, "image_caption"), global_step=global_step_val)
        

INFO:tensorflow:Step:    10, loss: 9.243, accuracy: 0.002
INFO:tensorflow:Step:    20, loss: 9.128, accuracy: 0.000
INFO:tensorflow:Step:    30, loss: 8.866, accuracy: 0.002
INFO:tensorflow:Step:    40, loss: 8.410, accuracy: 0.008
INFO:tensorflow:Step:    50, loss: 7.825, accuracy: 0.012
INFO:tensorflow:Step:    60, loss: 7.263, accuracy: 0.028
INFO:tensorflow:Step:    70, loss: 6.833, accuracy: 0.074
INFO:tensorflow:Step:    80, loss: 6.207, accuracy: 0.086
INFO:tensorflow:Step:    90, loss: 6.125, accuracy: 0.110
INFO:tensorflow:Step:   100, loss: 5.882, accuracy: 0.128
INFO:tensorflow:Step: 100, image caption model saved
INFO:tensorflow:Step:   110, loss: 6.174, accuracy: 0.098
INFO:tensorflow:Step:   120, loss: 5.802, accuracy: 0.138
INFO:tensorflow:Step:   130, loss: 5.587, accuracy: 0.134
INFO:tensorflow:Step:   140, loss: 5.563, accuracy: 0.134
INFO:tensorflow:Step:   150, loss: 5.739, accuracy: 0.124
INFO:tensorflow:Step:   160, loss: 5.637, accuracy: 0.136
INFO:tensorflow:Ste

INFO:tensorflow:Step:  1310, loss: 4.521, accuracy: 0.252
INFO:tensorflow:Step:  1320, loss: 4.124, accuracy: 0.282
INFO:tensorflow:Step:  1330, loss: 4.174, accuracy: 0.264
INFO:tensorflow:Step:  1340, loss: 4.279, accuracy: 0.262
INFO:tensorflow:Step:  1350, loss: 4.667, accuracy: 0.244
INFO:tensorflow:Step:  1360, loss: 4.277, accuracy: 0.260
INFO:tensorflow:Step:  1370, loss: 4.249, accuracy: 0.246
INFO:tensorflow:Step:  1380, loss: 4.273, accuracy: 0.280
INFO:tensorflow:Step:  1390, loss: 4.661, accuracy: 0.240
INFO:tensorflow:Step:  1400, loss: 4.558, accuracy: 0.240
INFO:tensorflow:Step: 1400, image caption model saved
INFO:tensorflow:Step:  1410, loss: 4.214, accuracy: 0.270
INFO:tensorflow:Step:  1420, loss: 4.283, accuracy: 0.280
INFO:tensorflow:Step:  1430, loss: 4.368, accuracy: 0.266
INFO:tensorflow:Step:  1440, loss: 4.470, accuracy: 0.254
INFO:tensorflow:Step:  1450, loss: 4.201, accuracy: 0.250
INFO:tensorflow:Step:  1460, loss: 4.298, accuracy: 0.268
INFO:tensorflow:St