In [1]:
import os
import sys
import numpy as np
import math

# tf.logging.set_verbosity(tf.logging.INFO)    # 代替print()函数

In [2]:
import tensorflow as tf


def get_default_params():
    '''
    Build the default hyper-parameter set.

    Returns a ``tf.contrib.training.HParams`` object whose attributes are
    the keys listed below.
    '''
    defaults = {
        'emb_size': 64,
        't_size': 50,
        'lstm_size': [32, 32],
        'lstm_layers': 2,
        'fc_size': 32,
        'dropout_rate': 0.2,
        'batch_size': 64,
        'grad_thresh': 1.0,    # gradient threshold
        'lr': 0.001,
        'cnt_thresh': 10,    # word-frequency threshold
    }
    return tf.contrib.training.HParams(**defaults)


params = get_default_params()

# Pre-segmented (tokenised) corpus files
seg_train_file = 'cnews.seg_train.txt'
seg_val_file = 'cnews.seg_val.txt'
seg_test_file = 'cnews.seg_test.txt'
# Vocabulary table
vocal_table = 'cnews.vocal.txt'
# Category table
cat_file = 'cnews.cat.txt'
# Output directory
out_path = './out'

# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also creates missing parent directories.
# It raises FileExistsError if out_path exists but is not a directory.
os.makedirs(out_path, exist_ok=True)

  from ._conv import register_converters as _register_converters



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



## 数据封装
对于非结构化数据，数据部分的处理才是最麻烦的。为了便于管理，定义数据类是很有必要的。这里分为两块，第一块是用于编码的数据类，第二块是用于神经网络的数据类，类似于之前的```CifarData```，主要API为```next_batch```。

第一块数据类，用于encoding：

In [3]:
class Vocal:
    '''
    Vocabulary that maps a word to its integer id.
    '''

    def __init__(self, voc_file, cnt_thresh):
        '''
        voc_file: path of the vocabulary file; every line holds three
                  tab-separated fields: id, word, count
        cnt_thresh: words whose count is below this value are left out
        '''
        self._unk = 0    # id returned for any word that is not in the table
        self._cnt_thresh = cnt_thresh
        self._word2id = {}
        self._load_table(voc_file)

    @property
    def unk(self):
        '''
        Id of the unknown word; callers also use it as the padding id.
        '''
        return self._unk

    @property
    def size(self):
        '''
        Number of words kept in the table.
        '''
        return len(self._word2id)

    def _load_table(self, filename):
        '''
        Fill the word -> id table from ``filename``, skipping rare words.
        '''
        with open(filename, 'r', encoding='utf-8') as fd:
            for row in fd:
                idx, word, cnt = row.strip().split('\t')
                # The count is parsed first; the id is only parsed for kept words.
                if int(cnt) >= self._cnt_thresh:
                    self._word2id[word] = int(idx)

    def word2id(self, word):
        '''
        Return the id of ``word``, or the unknown id when it is not in the table.
        '''
        return self._word2id.get(word, self._unk)

    def sentence2id(self, s):
        '''
        Split ``s`` on whitespace and return the list of word ids.
        '''
        return [self.word2id(token) for token in s.split()]

# test
# voc_cls = Vocal(vocal_table, params.cnt_thresh)
# print(voc_cls.size)
# print(voc_cls.word2id('的'))    # 该句应该返回2
# print(voc_cls.sentence2id('的 在 了 是'))    # 该句应该返回[2,4,6,7]


class CatDict:
    '''
    Lookup table from category name to integer id.
    '''

    def __init__(self, cat_file):
        '''
        cat_file: path of the category file; every line holds three
                  tab-separated fields: id, category, and a third field
                  that is ignored
        '''
        self._cat2id = {}
        self._load_table(cat_file)

    def _load_table(self, filename):
        '''
        Fill the category -> id table from ``filename``.
        '''
        with open(filename, 'r', encoding='utf-8') as fd:
            for row in fd:
                idx, cat, _ = row.split('\t')
                self._cat2id[cat] = int(idx)

    def cat2id(self, cat):
        '''
        Return the id of ``cat``; raise ``Exception`` for an unknown category.
        '''
        if cat in self._cat2id:
            return self._cat2id[cat]
        raise Exception('{} is not in cat'.format(cat))

# test
# cat_dict=CatDict(cat_file)
# print(cat_dict.cat2id('时尚'))    # 此句应该输出1

第二块数据类，用于产生格式化的batch数据：

In [4]:
class TextData:
    '''
    Fixed-length, id-encoded text dataset that serves mini-batches.
    '''

    def __init__(self, filename, vocal, cat_dict, t_size=30, batch_size=32, shuffle=True):
        '''
        filename: source file; every line is "<label>\\t<space-separated words>"
        vocal: vocabulary object providing ``sentence2id`` and ``unk``
        cat_dict: category object providing ``cat2id``
        t_size: fixed sequence length; longer sentences are truncated and
                shorter ones are padded with ``vocal.unk``
        batch_size: number of samples per mini-batch (must be positive)
        shuffle: shuffle once after loading and again after every full pass

        Raises ValueError when ``batch_size`` is not positive.
        '''
        # A non-positive batch size would make next_batch() loop forever.
        if batch_size <= 0:
            raise ValueError('batch_size must be positive, got {}'.format(batch_size))

        self._data = list()
        self._target = list()
        self._n_samples = 0

        self._idx = 0  # cursor of the current mini-batch
        self._batch_size = batch_size
        self._shuffle = shuffle

        self._vocal = vocal
        self._cat_dict = cat_dict
        self._t_size = t_size

        self._load_data(filename)

        if self._shuffle:
            self._shuffle_data()

        print(self._data.shape, self._target.shape)

    def _load_data(self, filename):
        '''
        Read ``filename`` and encode it into the data / target arrays.
        '''
        with open(filename, 'r', encoding='utf-8') as fd:
            text = fd.readlines()

        for line in text:
            label, content = line.strip().split('\t')
            y = self._cat_dict.cat2id(label)
            x = self._vocal.sentence2id(content)

            x = x[:self._t_size]
            n_pad = self._t_size - len(x)    # number of positions to pad
            x = x + [self._vocal.unk] * n_pad

            self._data.append(x)
            self._target.append(y)

        self._data = np.array(self._data)
        self._target = np.array(self._target)
        self._n_samples = self._data.shape[0]

    def _shuffle_data(self):
        '''
        Apply one random permutation to data and targets together.
        '''
        idxs = np.random.permutation(self._n_samples)
        self._data = self._data[idxs]
        self._target = self._target[idxs]

    def next_batch(self):
        '''
        Generator over one pass of full-size mini-batches.

        Yields (data, target) slices of exactly ``batch_size`` rows; a
        trailing partial batch is skipped. When the pass is exhausted the
        cursor is reset, and the data is reshuffled only if the dataset was
        created with ``shuffle=True``.
        '''
        # "<=" so that the last batch is kept when the remaining samples
        # exactly fill it.
        while self._idx + self._batch_size <= self._n_samples:
            stop = self._idx + self._batch_size
            yield self._data[self._idx: stop], self._target[self._idx: stop]
            self._idx += self._batch_size

        self._idx = 0
        if self._shuffle:
            self._shuffle_data()


# test
# Shared encoders: vocabulary (word -> id) and category table (label -> id)
voc_cls = Vocal(vocal_table, params.cnt_thresh)
cat_dict = CatDict(cat_file)

# One dataset per split, built in train / val / test order with identical settings
train_data, val_data, test_data = (
    TextData(split_file, voc_cls, cat_dict, params.t_size,
             batch_size=params.batch_size)
    for split_file in (seg_train_file, seg_val_file, seg_test_file)
)

(50000, 50) (50000,)
(5000, 50) (5000,)
(10000, 50) (10000,)


# 网络搭建

In [5]:
import tensorflow as tf

# voc_cls = Vocal(vocal_table, params.cnt_thresh)    # 词典类
# Number of output units, i.e. the number of classes
unit_O = 10

# Number of words kept in the vocabulary
vocal_size = voc_cls.size

In [6]:
def gen_lstm_layer(inputs, unit_I, unit_O, t_size=5, batch_size=32,
                   init=tf.random_uniform_initializer(-1, 1), name='LSTM_layer'):
    '''
    Build one LSTM layer unrolled over ``t_size`` steps and return the
    hidden state of the last step.

    inputs: sequence tensor sliced as ``inputs[:, t, :]``, so it must be
            rank 3; every time slice is reshaped to (batch_size, unit_I)
    unit_I: number of input features per time step
    unit_O: number of hidden units
    t_size: number of time steps to unroll
    batch_size: fixed batch size baked into the graph; every batch fed to
                this layer must have exactly this many rows
    init: default initializer for the weight matrices (biases are zeros)
    name: variable scope of the layer; use a different name for each layer
          built in the same graph, otherwise variable creation collides

    Returns the last hidden state, a (batch_size, unit_O) tensor.
    '''
    def gen_params(unit_I, unit_O):
        '''
        Create the input weight, recurrent weight and bias of one gate.
        '''
        w_x = tf.get_variable('w_x', shape=[unit_I, unit_O])
        w_h = tf.get_variable('w_h', shape=[unit_O, unit_O])
        b = tf.get_variable('bias', shape=[1, unit_O],
                            initializer=tf.constant_initializer(0.0))
        return w_x, w_h, b

    with tf.variable_scope(name, initializer=init):
        with tf.variable_scope('i'):
            w_ix, w_ih, b_i = gen_params(unit_I, unit_O)
        with tf.variable_scope('f'):
            w_fx, w_fh, b_f = gen_params(unit_I, unit_O)
        with tf.variable_scope('g'):
            w_gx, w_gh, b_g = gen_params(unit_I, unit_O)
        with tf.variable_scope('o'):
            w_ox, w_oh, b_o = gen_params(unit_I, unit_O)

        # Zero initial cell and hidden state. They are never assigned to,
        # so plain tensors are enough; no graph variables are needed.
        c = tf.zeros([batch_size, unit_O])
        h = tf.zeros([batch_size, unit_O])

        for t in range(t_size):
            input_t = inputs[:, t, :]    # slice out time step t
            input_t = tf.reshape(input_t, [batch_size, unit_I])

            f = tf.sigmoid(tf.matmul(input_t, w_fx) + tf.matmul(h, w_fh) + b_f)
            i = tf.sigmoid(tf.matmul(input_t, w_ix) + tf.matmul(h, w_ih) + b_i)
            g = tf.tanh(tf.matmul(input_t, w_gx) + tf.matmul(h, w_gh) + b_g)
            o = tf.sigmoid(tf.matmul(input_t, w_ox) + tf.matmul(h, w_oh) + b_o)

            c = c * f + g * i
            h = o * tf.tanh(c)

        return h

In [7]:
# Graph inputs: X holds int32 word ids with t_size columns, Y holds int64 class ids.
X = tf.placeholder(tf.int32, [None, params.t_size])
Y = tf.placeholder(tf.int64, [None])
is_training = tf.placeholder(tf.bool)    # training-mode flag

# Step counter; it is handed to apply_gradients when the train op is built
global_step = tf.Variable(tf.zeros([], tf.int32),
                          name='global_step', trainable=False)

# Embedding table with one row per vocabulary entry, looked up by the ids in X
with tf.variable_scope('emb', initializer=tf.random_uniform_initializer(-1.0, 1.0)):
    emb_lookup = tf.get_variable('embedding', [vocal_size, params.emb_size],
                                 dtype=tf.float32)
    emb = tf.nn.embedding_lookup(emb_lookup, X)

# Run the recurrent network on top of the embeddings.
# Weights are drawn uniformly from [-xavier_scale, xavier_scale].
# NOTE(review): the scale uses lstm_size[-1] while the layer below is built with
# lstm_size[0]; the two are equal under the default params.
xavier_scale = 1 / math.sqrt(params.emb_size + params.lstm_size[-1]) / 3
initializer = tf.random_uniform_initializer(-xavier_scale, xavier_scale)

#####################
# Hand-written LSTM layer. gen_lstm_layer reshapes every time slice to
# (batch_size, unit_I), so every batch fed through X must hold exactly
# params.batch_size rows.
lstm_outputs=gen_lstm_layer(emb,unit_I=params.emb_size,unit_O=params.lstm_size[0],
                            t_size=params.t_size,batch_size=params.batch_size,init=initializer)
#####################

# with tf.variable_scope('LSTM', initializer=initializer):
#     lstm_layers = list()
#     for i in range(params.lstm_layers):
#         layer = tf.nn.rnn_cell.LSTMCell(params.lstm_size[i])

#         # DropoutWrapper没有training参数，只能使用tf.cond来实现
#         keep_prob = tf.cond(is_training,
#                             lambda: 1-params.dropout_rate,
#                             lambda: tf.constant(1.0))
#         layer = tf.nn.rnn_cell.DropoutWrapper(layer,
#                                               output_keep_prob=keep_prob)

#         lstm_layers.append(layer)

#     lstm_layers = tf.nn.rnn_cell.MultiRNNCell(lstm_layers)

#     lstm_outputs, _ = tf.nn.dynamic_rnn(lstm_layers,
#                                         inputs=emb, dtype=tf.float32)
#     lstm_outputs = lstm_outputs[:, -1, :]

# Fully connected head on the LSTM output: dense + ReLU, then dropout
# that is switched by the is_training placeholder.
with tf.name_scope('FC'):
    fc = tf.layers.dense(lstm_outputs, params.fc_size, activation=tf.nn.relu)
    fc = tf.layers.dropout(fc, rate=params.dropout_rate, training=is_training)

# Output layer: one unit per class
logits = tf.layers.dense(fc, unit_O,
                         activation=None)    # final FC layer, no activation

# Loss and metrics: sparse softmax cross-entropy against the labels in Y,
# arg-max prediction, and the fraction of predictions equal to Y.
with tf.name_scope('Eval'):
    loss = tf.losses.sparse_softmax_cross_entropy(labels=Y, logits=logits)
    predict = tf.argmax(logits, 1)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predict, Y), tf.float32))

with tf.name_scope('train_op'):
    # Take the learning rate from the hyper-parameters instead of a second
    # hard-coded literal, so that changing params.lr actually takes effect.
    lr = params.lr
    t_vars = tf.trainable_variables()    # trainable variables
    # Clip the gradients by their global norm before applying them
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, t_vars),
                                      params.grad_thresh)
    optimizer = tf.train.AdamOptimizer(lr)
    # apply_gradients also increments global_step
    train_op = optimizer.apply_gradients(zip(grads, t_vars),
                                         global_step=global_step)

init = tf.global_variables_initializer()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True    # allocate GPU memory on demand

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


# 训练网络

In [8]:
import numpy as np

with tf.Session(config=config) as sess:
    sess.run(init)
    epochs = 20
    log_every = 1000    # training batches between two log lines
    val_every = 5000    # training batches between two validation runs

    # batch_cnt is the number of training batches processed so far. It is
    # incremented before the checks below, so the checks use it directly;
    # adding 1 again would fire them one batch early.
    batch_cnt = 0
    for epoch in range(epochs):
        for batch_data, batch_labels in train_data.next_batch():
            batch_cnt += 1
            loss_val, acc_val, _ = sess.run(
                [loss, accuracy, train_op], feed_dict={
                    X: batch_data,
                    Y: batch_labels,
                    is_training: True
                })

            # Report the loss / accuracy of the current training batch
            if batch_cnt % log_every == 0:
                print('epoch: {}, batch_loss: {}, batch_acc: {}'.format(
                    epoch, loss_val, acc_val))

            # Evaluate on the validation split. The "test_*" names and the
            # "test_acc" label are kept from the original, but the batches
            # come from val_data.
            if batch_cnt % val_every == 0:
                all_test_acc_val = list()
                for test_batch_data, test_batch_labels in val_data.next_batch():
                    test_acc_val = sess.run(accuracy, feed_dict={
                        X: test_batch_data,
                        Y: test_batch_labels,
                        is_training: False
                    })
                    all_test_acc_val.append(test_acc_val)
                # Unweighted mean of the per-batch accuracies
                test_acc = np.mean(all_test_acc_val)
                print('epoch: {}, test_acc: {}'.format(epoch, test_acc))

epoch: 1, batch_loss: 1.1625792980194092, batch_acc: 0.5
epoch: 2, batch_loss: 0.9158648252487183, batch_acc: 0.6875
epoch: 3, batch_loss: 0.6839292049407959, batch_acc: 0.78125
epoch: 5, batch_loss: 0.31747156381607056, batch_acc: 0.90625
epoch: 6, batch_loss: 0.24026355147361755, batch_acc: 0.921875
epoch: 6, test_acc: 0.8167067170143127
epoch: 7, batch_loss: 0.1479153037071228, batch_acc: 0.921875
epoch: 8, batch_loss: 0.12282991409301758, batch_acc: 0.953125
epoch: 10, batch_loss: 0.07064007222652435, batch_acc: 0.984375
epoch: 11, batch_loss: 0.05656968057155609, batch_acc: 0.984375
epoch: 12, batch_loss: 0.03091406263411045, batch_acc: 0.984375
epoch: 12, test_acc: 0.8397436141967773
epoch: 14, batch_loss: 0.11046174168586731, batch_acc: 0.953125
epoch: 15, batch_loss: 0.025200583040714264, batch_acc: 0.984375
epoch: 16, batch_loss: 0.010227980092167854, batch_acc: 1.0
epoch: 17, batch_loss: 0.04418171942234039, batch_acc: 0.984375
epoch: 19, batch_loss: 0.004938417114317417, bat