In [1]:
import tensorflow as tf
from tensorflow import gfile,logging
import numpy as np
import math
import os
os.sys.path.append(os.path.dirname(os.path.abspath('..')))

  from ._conv import register_converters as _register_converters


In [2]:
# 预设参数
def get_default_params():
    return tf.contrib.training.HParams(
        emb_size=64,    # 嵌入维度
        t_size=10,    # LSTM的时间维度
        lstm_size=[32, 32],
        lstm_layers=2,
        fc_size=32,
        dropout_rate=0.2,
        batch_size=64,
        grad_thresh=1.0,    # 梯度阈值
        lr=0.001,    # 学习率
        cnt_thresh=3,    # 词的频率阈值
    )


params = get_default_params()

# 路径
cap_file = '../dataset/flickr30k/results_20130124.token'    # cap文件
voc_file = '../dataset/flickr30k/vocab.txt'    # 词典文件
feature_dir = '../dataset/flickr30k/features'    # 图片特征路径


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



打包一个词典数据类，用于字符的编码与解码。这里与之前做TextClf时所用类是一样的。

In [3]:
class Vocab:
    def __init__(self, voc_file, cnt_thresh):
        self._word2id = dict()
        self._id2word = dict()
        self._unk = 0
        self._cnt_thresh = cnt_thresh
        self._load_tabel(voc_file)

    @property
    def unk(self):
        return self._unk

    @property
    def size(self):
        return len(self._word2id)

    def _load_tabel(self, filename):
        with gfile.GFile(filename, 'r') as fd:
            data = fd.readlines()

        for line in data:
            idx, word, cnt = line.strip().split('\t')
            idx=int(idx)
            cnt = int(cnt)

            if cnt < self._cnt_thresh:
                continue

            self._word2id[word] = idx
            self._id2word[idx] = word

    def word2id(self, word):
        return self._word2id.get(word, self._unk)

    def id2word(self, idx):
        return self._id2word.get(idx, '<UNK>')

    def s2id(self, s):
        return [self.word2id(word) for word in s.split(' ')]

    def id2s(self, idxs):
        return ' '.join([self.id2word(idx) for idx in idxs])
    
# test
# voc=Vocab(voc_file,params.cnt_thresh)
# voc.s2id('I have a dream')    # 应该返回[1495,389,1,0]
# voc.id2s([1495,389,1,0])    # 应该返回'I have a <UNK>'

然后解析cap文件，并写一个将img_name与编码后的cap对应起来的函数：

In [4]:
# 该函数与feature_extraction.py中的一样
def parse_cap(cap_path):
    '''
    解析cap文件
    :param cap_path: 描述文件
    :return: img与cap的映射表
    '''
    with gfile.GFile(cap_path, 'r') as fd:
        text = fd.readlines()

    img2cap = dict()

    for line in text:
        img_name, cap = line.strip().split('\t')
        img_name = img_name.split('#')[0]
        cap = cap.strip()

        img2cap.setdefault(img_name, list())
        img2cap[img_name].append(cap)

    return img2cap


def img2capid(img2cap, voc_cls):
    '''
    img2cap: img_name到cap的映射字典
    voc_cls: 编码器
    '''
    imgname2capid = dict()

    for img_name in img2cap.keys():
        imgname2capid.setdefault(img_name, list())
        caps = img2cap[img_name]

        for cap in caps:
            cap_id = voc_cls.s2id(cap)    # 使用编码器编码
            imgname2capid[img_name].append(cap_id)

    return imgname2capid

# test
# voc=Vocab(voc_file,params.cnt_thresh)
# img2cap=parse_cap(cap_file)
# imgname2capid=img2capid(img2cap,voc)
# imgname2capid['2778832101.jpg']

接下来是任务专用数据类，用于给模型提供数据。这里提供的数据$(X,Y)$分别是图片特征与文字描述。

In [5]:
import pickle
import numpy as np
import random


class ImgCapData:
    def __init__(self, imgname2capid, voc_cls, feature_dir, t_size=20, batch_size=32, shuffle=True):
        self._voc_cls = voc_cls

        img_feature_files = list()    # 图片特征pickle文件列表
        for file in gfile.ListDirectory(feature_dir):
            img_feature_files.append(os.path.join(feature_dir, file))

        self._imgname2capid = imgname2capid
        self._t_size = t_size
        self._idx = 0
        self._batch_size = batch_size
        self._n_samples = 0
        self._n_features = 0

        self._img_names = list()
        self._img_features = list()
        self._load_img_feature(img_feature_files)

        if shuffle:
            self._shuffle_data()

    @property
    def img_names(self):
        return self._img_names

    @property
    def img_features(self):
        return self._img_features

    @property
    def n_samples(self):
        return self._n_samples

    @property
    def n_features(self):
        return self._n_features

    def _load_img_feature(self, img_feature_files):
        '''
        载入图片特征
        img_feature_files: pickle文件路径列表
        '''
        for file in img_feature_files:
            with gfile.GFile(file, 'rb') as fd:
                img_name_batch, img_featur_batch = pickle.load(fd)
                self._img_names += img_name_batch
                self._img_features.append(img_featur_batch)

        self._img_features = np.vstack(self._img_features)
        shape_org = self._img_features.shape    # 原始形状(n_samples,1,1,2048)
        self._img_features = np.reshape(self._img_features,
                                        (shape_org[0], shape_org[-1]))

        self._img_names = np.asarray(self._img_names)
        self._n_samples = self._img_names.shape[0]
        self._n_features=shape_org[-1]

    def _shuffle_data(self):
        idxs = np.random.permutation(self._n_samples)
        self._img_names = self._img_names[idxs]
        self._img_features = self._img_features[idxs]

    def _get_cap(self, img_batch):
        '''
        由一个batch的img_name得到一个batch的capid
        '''
        s_id_batch = list()
        w_batch = list()

        for img in img_batch:
            caps_id = self._imgname2capid[img]    # 图片对应的5个cap
            chosen_cap_id = random.choice(caps_id)    # 随机挑选一个cap

            chosen_cap_id = chosen_cap_id[:self._t_size]    # 截断
            cap_len = len(chosen_cap_id)
            loss_weight = [1 for _ in range(cap_len)]    # 计算loss时的weight掩码

            n_pad = self._t_size-cap_len    # pad数，小于等于0时不起作用
            chosen_cap_id += [self._voc_cls.unk for _ in range(n_pad)]
            loss_weight += [0 for _ in range(n_pad)]

            s_id_batch.append(chosen_cap_id)
            w_batch.append(loss_weight)

        return np.asarray(s_id_batch), np.asarray(w_batch)

    def next_batch(self):
        while self._idx+self._batch_size < self._n_samples:
            # 图片特征，即X
            img_feature_batch = self._img_features[self._idx:
                                                   (self._idx+self._batch_size)]
            # img_name，用于提取cap
            img_name_batch = self._img_names[self._idx:
                                             (self._idx+self._batch_size)]
            # cap_id即Y，cap_w即损失权重
            cap_id_batch, cap_w_batch = self._get_cap(img_name_batch)
            yield img_feature_batch, cap_id_batch, cap_w_batch

        self._idx = 0
        self._shuffle_data()


# test
voc = Vocab(voc_file, params.cnt_thresh)
img2cap = parse_cap(cap_file)
imgname2capid = img2capid(img2cap, voc)
data = ImgCapData(imgname2capid, voc, feature_dir, t_size=params.t_size)

## 网络设计

In [6]:
unit_I = data.n_features

voc_size = voc.size    # 词典大小

emb_size = params.emb_size
t_size = params.t_size

unit_fc = 32

unit_O = voc.size

## 网络搭建
输入处理。Google的show and tell模型是一个seq2seq模型，其输入序列为```img -> word1 -> word2 -> ...```。假设```t_size=10```，那么准确来说$X_{seq}$为```img -> word1 -> ... -> word9```，需要预测的序列为```word1 -> ... -> word10```。

In [7]:
X = tf.placeholder(tf.float32, [None, unit_I])
Y = tf.placeholder(tf.int32, [None, t_size])
w = tf.placeholder(tf.float32, [None, t_size])    # 计算损失时的权重
is_training = tf.placeholder(tf.bool)

global_step = tf.Variable(tf.zeros([], tf.int32),
                          name='global_step', trainable=False)

# 对图片特征做嵌入
img_emb_init = tf.uniform_unit_scaling_initializer(factor=1)
with tf.variable_scope('img_emb', initializer=img_emb_init):
    img_emb = tf.layers.dense(X, emb_size)
    # 在中间插入一个时间维度，得到(batch_size,1,emb_size)
    img_emb = tf.expand_dims(img_emb, 1)

# 将单词的id做嵌入
s_emb_init = tf.random_uniform_initializer(-1, 1)
with tf.variable_scope('s_emb', initializer=s_emb_init):
    s_emb_lookup = tf.get_variable('embedding', [voc_size, emb_size],
                                   tf.float32)
    # (batch_size,t_size-1,emb_size)
    s_emb = tf.nn.embedding_lookup(s_emb_lookup, Y[:, :params.t_size-1])

# 在时间维度上堆叠，图片嵌入是初始状态
# (batch_size,t_size,emb_size)
inputs = tf.concat([img_emb, s_emb], axis=1)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
Instructions for updating:
Use keras.layers.dense instead.


网络的搭建。

In [8]:
scale = 1.0 / math.sqrt(params.emb_size + params.lstm_size[-1]) / 3.0
lstm_init = tf.random_uniform_initializer(-scale, scale)
with tf.variable_scope('lstm', initializer=lstm_init):
    lstm_layers = list()
    for i in range(params.lstm_layers):
        layer = tf.nn.rnn_cell.LSTMCell(params.lstm_size[i])

        # DropoutWrapper没有training参数，只能使用tf.cond来实现
        keep_prob = tf.cond(is_training,
                            lambda: 1-params.dropout_rate,
                            lambda: tf.constant(1.0))
        layer = tf.nn.rnn_cell.DropoutWrapper(layer,
                                              output_keep_prob=keep_prob)

        lstm_layers.append(layer)

    lstm_layers = tf.nn.rnn_cell.MultiRNNCell(lstm_layers)
    lstm_outputs, _ = tf.nn.dynamic_rnn(lstm_layers, inputs=inputs,
                                        dtype=tf.float32)    # (batch_size,t_size,lstm_size[-1])
    # 把所有bathch的所有状态压成向量，便于全连接
    # (batch_size*t_size,lstm_size[-1])
    lstm_outputs = tf.reshape(lstm_outputs, [-1, params.lstm_size[-1]])

# FC layer
fc_init = tf.uniform_unit_scaling_initializer(factor=1)
with tf.variable_scope('fc', initializer=fc_init):
    # (batch_size*t_size,unit_fc)
    fc = tf.layers.dense(lstm_outputs, unit_fc, activation=tf.nn.relu)
    fc = tf.layers.dropout(fc, rate=params.dropout_rate, training=is_training)

logits = tf.layers.dense(fc, unit_O)    # (batch_size*t_size,voc_size)

with tf.name_scope('Eval'):
    Y_flatten = tf.reshape(Y, [-1])    # (batch_size*t_size)
    w_flatten = tf.reshape(w, [-1])    # (batch_size*t_size)
    w_sum = tf.reduce_sum(w_flatten)

    # 损失向量，与labels同形状
    loss_vec = tf.losses.sparse_softmax_cross_entropy(labels=Y_flatten,
                                                      logits=logits, reduction=tf.losses.Reduction.NONE)
    loss_vec = tf.multiply(loss_vec, w_flatten)    # 对应位相乘
    loss = tf.reduce_sum(loss_vec)/w_sum

    pred = tf.math.argmax(logits, axis=1, output_type=tf.int32)
    correct_pred = tf.equal(pred, Y_flatten)
    correct_pred = tf.multiply(tf.cast(correct_pred, tf.float32), w_flatten)
    acc = tf.reduce_sum(correct_pred)/w_sum

with tf.name_scope('train_op'):
    lr = 1e-3
    t_vars = tf.trainable_variables()    # 可训练变量
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, t_vars),
                                      params.grad_thresh)
    optimizer = tf.train.AdamOptimizer(params.lr)
    train_op = optimizer.apply_gradients(zip(grads, t_vars),
                                         global_step=global_step)

init = tf.global_variables_initializer()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True    # 按需使用显存

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


## 训练模型

In [9]:
with tf.Session(config=config) as sess:
    sess.run(init)
    epochs = 5

    batch_cnt = 0
    for epoch in range(epochs):
        for img_feature_batch, cap_id_batch, cap_w_batch in data.next_batch():
            batch_cnt += 1

            loss_val, acc_val, _ = sess.run([loss, acc, train_op],
                                            feed_dict={X: img_feature_batch, 
                                                       Y: cap_id_batch, 
                                                       w: cap_w_batch,
                                                      is_training:True})
            
            # 每1000batch输出一次信息
            if (batch_cnt+1) % 1000 == 0:
                print('epoch: {}, batch_loss: {}, batch_acc: {}'.format(
                    epoch, loss_val, acc_val))

epoch: 0, batch_loss: 3.9964351654052734, batch_acc: 0.21221864223480225
epoch: 0, batch_loss: 3.160750150680542, batch_acc: 0.28145694732666016
epoch: 0, batch_loss: 2.6492908000946045, batch_acc: 0.3439490497112274
epoch: 0, batch_loss: 2.235731601715088, batch_acc: 0.40378549695014954
epoch: 0, batch_loss: 1.9558547735214233, batch_acc: 0.43589743971824646
epoch: 0, batch_loss: 1.8399429321289062, batch_acc: 0.5111111402511597
epoch: 0, batch_loss: 1.8181525468826294, batch_acc: 0.5113268494606018
epoch: 0, batch_loss: 1.6528805494308472, batch_acc: 0.5416666865348816
epoch: 0, batch_loss: 1.579603672027588, batch_acc: 0.5495207905769348
epoch: 0, batch_loss: 1.381394863128662, batch_acc: 0.6070287823677063
epoch: 0, batch_loss: 1.3057690858840942, batch_acc: 0.6282894611358643
epoch: 0, batch_loss: 1.3348941802978516, batch_acc: 0.6416938304901123
epoch: 0, batch_loss: 1.1924693584442139, batch_acc: 0.6656050682067871
epoch: 0, batch_loss: 1.1415369510650635, batch_acc: 0.633757948

KeyboardInterrupt: 