In [1]:
"""
1. 生成数据：载入词表，载入图像特征，提供数据
2. 实现模型
3. 训练
4. 评估
"""

import os
import sys
import tensorflow as tf
from tensorflow import gfile
from tensorflow import logging
import pprint
import pickle
import numpy as np
import math
import random

input_description_file = ".\\data\\results_20130124.token"
input_img_feature_dir = ".\\InceptionV3\\feature_extraction_inception_v3"
input_vocab_file = ".\\data\\vocab.txt"
output_dir = ".\\data\\local_run"

if not gfile.Exists(output_dir):
    gfile.MakeDirs(output_dir)

def get_default_params():
    return tf.contrib.training.HParams(#将图像特征embedding
        num_vocab_word_threshold=3,#过滤词表
        num_embedding_nodes=32,#embedding的size,每张图变成32位embedding
        num_timesteps=20,
        num_lstm_nodes=[64, 64],#每层大小
        num_lstm_layers=2,#LSTM层数
        num_fc_nodes=32,#全连接的大小
        batch_size=50,
        cell_type='lstm',
        clip_lstm_grads=1.0,#梯度剪切
        learning_rate=0.001,
        keep_prob=0.8,#Dropout
        log_frequent=100,#每隔100打印一次log
        save_frequent=1000,#每1000次保存一次模型
    )

hps = get_default_params()

  from ._conv import register_converters as _register_converters


In [2]:
#词表载入以及文本描述文件转换成id
class Vocab(object):
    def __init__(self, filename, word_num_threshold):
        self._id_to_word = {}
        self._word_to_id = {}
        self._unk = -1
        self._eos = -1#end of sentence
        self._word_num_threshold = word_num_threshold
        self._read_dict(filename) 

    def _read_dict(self, filename):#3.读取词表，从词表文件中读取到新建的词典里
        with gfile.GFile(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            word, occurence = line.strip('\r\n').split('\t')
            occurence = int(occurence)
            if word != '<UNK>' and occurence < self._word_num_threshold:
                continue
            idx = len(self._id_to_word)
            if word == '<UNK>':
                self._unk = idx
            elif word == '.':
                self._eos = idx
            if idx in self._id_to_word or word in self._word_to_id:
                raise Exception('duplicate words in vocab file')
            self._word_to_id[word] = idx
            self._id_to_word[idx] = word

    @property
    def unk(self):
        return self._unk

    @property
    def eos(self):
        return self._eos

    def word_to_id(self, word):
        return self._word_to_id.get(word, self.unk)#对没有见过的词返回unk的id

    def id_to_word(self, cur_id):
        return self._id_to_word.get(cur_id, '<UNK>')#对于没有见过的id返回unk

    def size(self):
        return len(self._word_to_id)

    def encode(self, sentence):
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split(' ')]#把句子转成id列表
        return word_ids

    def decode(self, sentence_id):
        words = [self.id_to_word(word_id) for word_id in sentence_id]#把id列表转成文本的一句话
        return ' '.join(words)#返回字符串，用空格拼接
    

def parse_token_file(token_file):#将描述文件解析成字典
    """Parses token file."""
    img_name_to_tokens = {}
    with gfile.GFile(token_file, 'r') as f:
        lines = f.readlines()
    for line in lines:
        img_id, description = line.strip('\r\n').split('\t')
        img_name, _ = img_id.split('#')
        img_name_to_tokens.setdefault(img_name, [])
        img_name_to_tokens[img_name].append(description)
    return img_name_to_tokens

def convert_token_to_id(img_name_to_tokens, vocab):#将图片的描述文件转成id
    """Converts tokens of each description of imgs to id. """
    img_name_to_token_ids = {}
    for img_name in img_name_to_tokens:
        img_name_to_token_ids.setdefault(img_name, [])
        descriptions = img_name_to_tokens[img_name]
        for description in descriptions:
            token_ids = vocab.encode(description)
            img_name_to_token_ids[img_name].append(token_ids)
    return img_name_to_token_ids

vocab = Vocab(input_vocab_file, hps.num_vocab_word_threshold)
vocab_size = vocab.size()
logging.info("vocab_size: %d" % vocab_size)
pprint.pprint(vocab.encode("I have a dream."))
pprint.pprint(vocab.decode([5,10,9,20]))
    
    
img_name_to_tokens = parse_token_file(input_description_file)
img_name_to_token_ids = convert_token_to_id(img_name_to_tokens, vocab)

logging.info("num of all images: %d" % len(img_name_to_tokens))
pprint.pprint(list(img_name_to_tokens.keys())[0:10])
pprint.pprint(img_name_to_tokens['2778832101.jpg'])

logging.info("num of all images: %d" % len(img_name_to_token_ids))
pprint.pprint(list(img_name_to_token_ids.keys())[0:10])
pprint.pprint(img_name_to_token_ids['2778832101.jpg'])

INFO:tensorflow:vocab_size: 10875
[1494, 389, 1, 0]
'the of man white'
INFO:tensorflow:num of all images: 31783
['1000092795.jpg',
 '10002456.jpg',
 '1000268201.jpg',
 '1000344755.jpg',
 '1000366164.jpg',
 '1000523639.jpg',
 '1000919630.jpg',
 '10010052.jpg',
 '1001465944.jpg',
 '1001545525.jpg']
['A man in jeans is reclining on a green metal bench along a busy sidewalk and '
 'crowded street .',
 'A white male with a blue sweater and gray pants laying on a sidewalk bench .',
 'A man in a blue shirt and gray pants is sleeping on a sidewalk bench .',
 'A person is sleeping on a bench , next to cars .',
 'A man sleeping on a bench in a city area .']
INFO:tensorflow:num of all images: 31783
['1000092795.jpg',
 '10002456.jpg',
 '1000268201.jpg',
 '1000344755.jpg',
 '1000366164.jpg',
 '1000523639.jpg',
 '1000919630.jpg',
 '10010052.jpg',
 '1001465944.jpg',
 '1001545525.jpg']
[[3, 9, 4, 132, 8, 3532, 6, 1, 48, 337, 146, 139, 1, 244, 93, 7, 380, 36, 2],
 [3, 20, 179, 11, 1, 26, 284, 7, 120, 1

In [3]:
class ImageCaptionData(object):#读取提取的图片特征
    def __init__(self,
                 img_name_to_token_ids,
                 img_feature_dir,
                 num_timesteps,#在batch中让文本对齐
                 vocab,
                 deterministic = False):#是否可以shuffle
        self._vocab = vocab
        self._all_img_feature_filepaths = []#获得文件夹下所有子文件的名字
        for filename in gfile.ListDirectory(img_feature_dir):
            self._all_img_feature_filepaths.append(os.path.join(img_feature_dir, filename))
        pprint.pprint(self._all_img_feature_filepaths)

        self._img_name_to_token_ids = img_name_to_token_ids
        self._num_timesteps = num_timesteps
        self._indicator = 0#指示遍历到数据集的哪个部分了
        self._deterministic = deterministic
        self._img_feature_filenames = []#存储所有图片名字
        self._img_feature_data = []#存储所有图片提取出来的向量
        self._load_img_feature_pickle()#载入所有pickle文件
        if not self._deterministic:#是否可以shuffle
            self._random_shuffle()


    def _load_img_feature_pickle(self):#载入所有pickle文件
        for filepath in self._all_img_feature_filepaths:
            logging.info("loading %s" % filepath)
            with gfile.GFile(filepath, 'rb') as f:
                filenames, features = pickle.load(f)
                self._img_feature_filenames += filenames#此时是两个列表做合并
                self._img_feature_data.append(features)
        self._img_feature_data = np.vstack(self._img_feature_data)#合并_img_feature_data
        #[（1000,1,1,2048），（1000,1,1,2048）] -> [2000,1,1,2048]
        origin_shape = self._img_feature_data.shape
        self._img_feature_data = np.reshape(
            self._img_feature_data, (origin_shape[0], origin_shape[3]))#[2000,1,1,2048] -> [2000,2048]
        self._img_feature_filenames = np.asarray(self._img_feature_filenames)#变成numpy数据格式因为shuffler时用的permutation是numpy的API
        print(self._img_feature_data.shape)
        print(self._img_feature_filenames.shape)
        if not self._deterministic:
            self._random_shuffle()


    def size(self):
        return len(self._img_feature_filenames)

    def img_feature_size(self):
        return self._img_feature_data.shape[1]

    def _random_shuffle(self):
        p = np.random.permutation(self.size())
        self._img_feature_filenames = self._img_feature_filenames[p]
        self._img_feature_data = self._img_feature_data[p]

    def _img_desc(self, filenames):#找到图片描述并对图片描述统一（截断，填充）
        batch_sentence_ids = []
        batch_weights = []
        for filename in filenames:
            token_ids_set = self._img_name_to_token_ids[filename]
            chosen_token_ids = random.choice(token_ids_set)#随机选一个描述
            #chosen_token_ids = token_ids_set[0]
            chosen_token_length = len(chosen_token_ids)

            weight = [1 for i in range(chosen_token_length)]
            if chosen_token_length >= self._num_timesteps:#截断
                chosen_token_ids = chosen_token_ids[0:self._num_timesteps]
                weight = weight[0:self._num_timesteps]
            else:                                         #填充
                remaining_length = self._num_timesteps - chosen_token_length
                chosen_token_ids += [self._vocab.eos for i in range(remaining_length)]
                weight += [0 for i in range(remaining_length)]
            batch_sentence_ids.append(chosen_token_ids)
            batch_weights.append(weight)
        batch_sentence_ids = np.asarray(batch_sentence_ids)
        batch_weights = np.asarray(batch_weights)
        return batch_sentence_ids, batch_weights

    def next(self, batch_size):#进行batch size操作
        end_indicator = self._indicator + batch_size
        if end_indicator > self.size():
            if not self._deterministic:
                self._random_shuffle()
            self._indicator = 0
            end_indicator = self._indicator + batch_size
        assert end_indicator <= self.size()

        batch_img_features = self._img_feature_data[self._indicator: end_indicator]
        batch_img_names = self._img_feature_filenames[self._indicator: end_indicator]
        batch_sentence_ids, batch_weights = self._img_desc(batch_img_names)#通过图片名找到图片描述
        #weights用于统计一个描述里哪些是有用的哪些是填充的
        #如[30,1175,10,3,0,0,0] -> [1,1,1,1,0,0,0],在算梯度时后三个不参与计算，减少计算量，提高准确率
        self._indicator = end_indicator
        return batch_img_features, batch_sentence_ids, batch_weights, batch_img_names


caption_data = ImageCaptionData(img_name_to_token_ids, input_img_feature_dir, hps.num_timesteps, vocab)
img_feature_dim = caption_data.img_feature_size()
caption_data_size = caption_data.size()
logging.info("img_feature_dim: %d" % img_feature_dim)
logging.info("caption_data_size: %d" % caption_data_size)

batch_img_features, batch_sentence_ids, batch_weights, batch_img_names = caption_data.next(5)
pprint.pprint(batch_img_features)
pprint.pprint(batch_sentence_ids)
pprint.pprint(batch_weights)
pprint.pprint(batch_img_names)

['.\\InceptionV3\\feature_extraction_inception_v3\\image_features-0.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-1.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-10.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-11.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-12.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-13.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-14.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-15.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-16.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-17.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-18.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-19.pickle',
 '.\\InceptionV3\\feature_extraction_inception_v3\\image_features-

In [4]:
def create_rnn_cell(hidden_dim, cell_type):#RNN大小，RNN种类
    if cell_type == 'lstm':
        return tf.contrib.rnn.BasicLSTMCell(hidden_dim, state_is_tuple=True)
    elif cell_type == 'gru':
        return tf.contrib.rnn.GRUCell(hidden_dim)
    else:
        raise Exception("%s has not been supported" % cell_type)

def dropout(cell, keep_prob):#封装dropout
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)


def get_train_model(hps, vocab_size, img_feature_dim):
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    img_feature  = tf.placeholder(tf.float32, (batch_size, img_feature_dim))#图像特征
    sentence = tf.placeholder(tf.int32, (batch_size, num_timesteps))#图像描述的句子
    mask = tf.placeholder(tf.float32, (batch_size, num_timesteps))#weight
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step', trainable=False)#训练的次数

    #训练流程 sentence[a,b,c,d,e]
    #         img_feature[0.4,0.3,10,2]
    #         predict1: img_feature -> embedding_img -> lstm -> a
    #         predict2: a -> embedding_word -> lstm -> b
    #         input[img,a,b,c,d]
    
    
    
    
    
    # Sets up the embedding layer.
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embeddings',
            [vocab_size, hps.num_embedding_nodes],#词表大小，每个词应该的大小
            tf.float32)
        embed_token_ids = tf.nn.embedding_lookup(embeddings, sentence[:, 0:num_timesteps-1])#[batch_size,num_timesteps-1,num_embedding_nodes]
        #从embedding表里查询                                                                #-1是因为最后一个字符只作为输出不作为输入
    img_feature_embed_init = tf.uniform_unit_scaling_initializer(factor=1.0)#对feature做全连接让其和embedding_nodes一样大
    with tf.variable_scope('image_feature_embed', initializer=img_feature_embed_init):
        embed_img = tf.layers.dense(img_feature, hps.num_embedding_nodes)#[batch_size,num_embedding_nodes]
        embed_img = tf.expand_dims(embed_img, 1)#[batch_size,1,num_embedding_nodes]
        embed_inputs = tf.concat([embed_img, embed_token_ids], axis=1)#拼接

        
        
        
        
    # Sets up LSTM network.
    scale = 1.0 / math.sqrt(hps.num_embedding_nodes + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer=lstm_init):
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = create_rnn_cell(hps.num_lstm_nodes[i], hps.cell_type)
            cell = dropout(cell, keep_prob)
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)#对多层cell连接

        initial_state = cell.zero_state(hps.batch_size, tf.float32)
        # rnn_outputs: [batch_size, num_timesteps, hps.num_lstm_node[-1]]
        rnn_outputs, _ = tf.nn.dynamic_rnn(cell,#多个输入需要dynamicRNN，返回output以及中间状态
                                           embed_inputs,
                                           initial_state=initial_state)

    # Sets up the fully-connected layer.
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        rnn_outputs_2d = tf.reshape(rnn_outputs, [-1, hps.num_lstm_nodes[-1]])
        fc1 = tf.layers.dense(rnn_outputs_2d, hps.num_fc_nodes, name='fc1')
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        fc1_dropout = tf.nn.relu(fc1_dropout)
        logits = tf.layers.dense(fc1_dropout, vocab_size, name='logits')

    with tf.variable_scope('loss'):
        sentence_flatten = tf.reshape(sentence, [-1])#展平句子
        mask_flatten = tf.reshape(mask, [-1])#展平mask
        mask_sum = tf.reduce_sum(mask_flatten)
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            
            logits=logits, labels=sentence_flatten)
        weighted_softmax_loss = tf.multiply(softmax_loss,
                                            tf.cast(mask_flatten, tf.float32))
        
        prediction = tf.argmax(logits, 1, output_type = tf.int32)#预测值，1是axis
        correct_prediction = tf.equal(prediction, sentence_flatten)#正确预测的值
        correct_prediction_with_mask = tf.multiply(
            tf.cast(correct_prediction, tf.float32),
            mask_flatten)
        accuracy = tf.reduce_sum(correct_prediction_with_mask) / mask_sum
        loss = tf.reduce_sum(weighted_softmax_loss) / mask_sum
        tf.summary.scalar('loss', loss)

    with tf.variable_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            logging.info("variable name: %s" % (var.name))
        grads, _ = tf.clip_by_global_norm(#应用梯度
            tf.gradients(loss, tvars), hps.clip_lstm_grads)
        for grad, var in zip(grads, tvars):
            tf.summary.histogram('%s_grad' % (var.name), grad)#直方图
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

    return ((img_feature, sentence, mask, keep_prob),
            (loss, accuracy, train_op),
            global_step)

placeholders, metrics, global_step = get_train_model(hps, vocab_size, img_feature_dim)
img_feature, sentence, mask, keep_prob = placeholders
loss, accuracy, train_op = metrics

summary_op = tf.summary.merge_all()

init_op = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=10)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
INFO:tensorflow:variable name: embedding/embeddings:0
INFO:tensorflow:variable name: image_feature_embed/dense/kernel:0
INFO:tensorflow:variable name: image_feature_embed/dense/bias:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0
INFO:tensorflow:variable name: fc/fc1/kernel:0
INFO:tensorflow:variable name: fc/fc1/bias:0
INFO:tensorflow:variable name: fc/logits/kernel:0
INFO:tensorflow:variable name: fc/logits/bias:0
INFO:tensorflow:Summary name embedding/embeddings:0_grad is illegal; using embedding/embeddings_0_grad instead.
INFO:tensorflow:Summary name ima

In [None]:
training_steps = 50000
with tf.Session() as sess: 
    sess.run(init_op)
    logging.info("[*] Reading checkpoint ...")
    ckpt = tf.train.get_checkpoint_state(output_dir)
    if ckpt and ckpt.model_checkpoint_path:
        ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
        saver.restore(sess, os.path.join(output_dir, ckpt_name))
        logging.info("[*] Success Read Checkpoint From %s" % (ckpt_name))
    #else:
        #raise Exception("[*] Failed load checkpoint")
    writer = tf.summary.FileWriter(output_dir, sess.graph)



    for i in range(training_steps):
        batch_img_features, batch_sentence_ids, batch_weights, _ = caption_data.next(hps.batch_size)
        input_vals = (batch_img_features, batch_sentence_ids, batch_weights, hps.keep_prob)

        feed_dict = dict(zip(placeholders, input_vals))
        fetches = [global_step, loss, accuracy, train_op]

        should_log = (i + 1) % hps.log_frequent == 0
        should_save = (i + 1) % hps.save_frequent == 0
        if should_log:
            fetches += [summary_op]
        outputs = sess.run(fetches, feed_dict)
        global_step_val, loss_val, accuracy_val = outputs[0:3]
        if should_log:
            summary_str = outputs[4]
            writer.add_summary(summary_str, global_step_val)
            logging.info('Step: %5d, loss: %3.3f, accuracy: %3.3f'
                         % (global_step_val, loss_val, accuracy_val))
        if should_save:
            logging.info("Step: %d, image caption model saved" % (global_step_val))
            saver.save(sess, os.path.join(output_dir, "image_caption"), global_step=global_step_val)

INFO:tensorflow:[*] Reading checkpoint ...
INFO:tensorflow:Restoring parameters from .\data\local_run\image_caption-54000
INFO:tensorflow:[*] Success Read Checkpoint From image_caption-54000
INFO:tensorflow:Step: 54100, loss: 3.379, accuracy: 0.336
INFO:tensorflow:Step: 54200, loss: 3.580, accuracy: 0.336
INFO:tensorflow:Step: 54300, loss: 3.261, accuracy: 0.352
INFO:tensorflow:Step: 54400, loss: 3.613, accuracy: 0.340
INFO:tensorflow:Step: 54500, loss: 3.694, accuracy: 0.306
INFO:tensorflow:Step: 54600, loss: 3.593, accuracy: 0.314
INFO:tensorflow:Step: 54700, loss: 3.327, accuracy: 0.377
INFO:tensorflow:Step: 54800, loss: 3.396, accuracy: 0.385
INFO:tensorflow:Step: 54900, loss: 3.351, accuracy: 0.354
INFO:tensorflow:Step: 55000, loss: 3.355, accuracy: 0.361
INFO:tensorflow:Step: 55000, image caption model saved
INFO:tensorflow:Step: 55100, loss: 3.292, accuracy: 0.381
INFO:tensorflow:Step: 55200, loss: 3.490, accuracy: 0.370
INFO:tensorflow:Step: 55300, loss: 2.586, accuracy: 0.504


INFO:tensorflow:Step: 66800, loss: 2.357, accuracy: 0.544
INFO:tensorflow:Step: 66900, loss: 2.652, accuracy: 0.520
INFO:tensorflow:Step: 67000, loss: 2.184, accuracy: 0.580
INFO:tensorflow:Step: 67000, image caption model saved
INFO:tensorflow:Step: 67100, loss: 2.310, accuracy: 0.545
INFO:tensorflow:Step: 67200, loss: 2.192, accuracy: 0.580
INFO:tensorflow:Step: 67300, loss: 2.247, accuracy: 0.553
INFO:tensorflow:Step: 67400, loss: 2.105, accuracy: 0.587
INFO:tensorflow:Step: 67500, loss: 2.491, accuracy: 0.541
INFO:tensorflow:Step: 67600, loss: 2.258, accuracy: 0.576
INFO:tensorflow:Step: 67700, loss: 2.267, accuracy: 0.568
INFO:tensorflow:Step: 67800, loss: 2.180, accuracy: 0.600
INFO:tensorflow:Step: 67900, loss: 2.392, accuracy: 0.537
INFO:tensorflow:Step: 68000, loss: 2.315, accuracy: 0.562
INFO:tensorflow:Step: 68000, image caption model saved
INFO:tensorflow:Step: 68100, loss: 2.434, accuracy: 0.545
INFO:tensorflow:Step: 68200, loss: 2.058, accuracy: 0.607
INFO:tensorflow:Step