In [1]:
# 构建计算图——LSTM模型
#      embedding
#      LSTM
#      fc
#      train_op
# 训练流程代码
# 数据集封装
#      api: next_batch(batch_size)
# 词表封装
#      api: sentence2id(text_sentence): 句子转换id
# 类别封装
#      api: category2id(text_category)

import tensorflow as tf
import os
import sys
import numpy as np
import math

tf.logging.set_verbosity(tf.logging.INFO)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def get_default_params():
    return tf.contrib.training.HParams(
        num_embedding_size = 16,  
        num_timesteps = 50, # minibatch长度对齐
        num_lstm_nodes = [32, 32],
        num_lstm_layers = 2,
        num_fc_nodes = 32,
        batch_size = 100,
        clip_lstm_grads = 1.0, # lstm梯度
        learning_rate = 0.001,
        num_word_threshold = 10  # 词频阈值
#         num_embedding_size = 32,  
#         num_timesteps = 600, # minibatch长度对齐
#         num_lstm_nodes = [64, 64],
#         num_lstm_layers = 2,
#         num_fc_nodes = 64,
#         batch_size = 100,
#         clip_lstm_grads = 1.0, # lstm梯度
#         learning_rate = 0.001,
#         num_word_threshold = 10  # 词频阈值
    )

hps = get_default_params()

# input file
train_file = './data/cnews.train.seg.txt'
val_file = './data/cnews.val.seg.txt'
test_file = './data/cnews.test.seg.txt'
vocab_file = './data/cnews.vocab.txt'
category_file = './data/cnews.category.txt'
output_file = './data/run_text_rnn'

if not os.path.exists(output_file):
    os.mkdir(output_file)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [3]:
print(hps.num_word_threshold)

10


In [4]:
class Vocab:
    def __init__(self, filename, num_word_threshold):
        self._word_to_id = {}
        self._unk = -1
        self._num_word_threshold = num_word_threshold
        self._read_dict(filename)
    
    def _read_dict(self, filename):
        with open(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            word, frequency = line.strip('\r\n').split('\t')
            frequency = int(frequency)
            if frequency < self._num_word_threshold:
                continue
            idx = len(self._word_to_id) # id递增
            if word == '<UNK>':
                self._unk = idx
            self._word_to_id[word] = idx
    
    def word_to_id(self, word):
        return self._word_to_id.get(word, self._unk)
    
    @property
    def unk(self):
        return self._unk
    
    def size(self):
        return len(self._word_to_id)
        
    def sentence_to_id(self, sentence):
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split()]
        return word_ids
    
    
class CategoryDict:
    def __init__(self, filename):
        self._category_to_id = {}
        with open(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            category = line.strip('\r\n')
            idx = len(self._category_to_id)
            self._category_to_id[category] = idx
    
    def size(self):
        return len(self._category_to_id)
        
    def category_to_id(self, category):
        if not category in self._category_to_id:
            raise Execption("%s is not in our category list" % category_name);
        return self._category_to_id[category]
        
        
vocab = Vocab(vocab_file, hps.num_word_threshold)
vocab_size = vocab.size()
tf.logging.info('vocab_size: %d' % vocab.size())

# test_str = '的 在 了 是'
# print(vocab.sentence_to_id(test_str))

category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()
tf.logging.info('num_classes: %d' % num_classes)
test_str = '时尚'
tf.logging.info('label: %s, id: %d' % (test_str, category_vocab.category_to_id(test_str)))

INFO:tensorflow:vocab_size: 77323
INFO:tensorflow:num_classes: 10
INFO:tensorflow:label: 时尚, id: 5


In [5]:
class TextDataset:
    def __init__(self, filename, vocab, category_vocab, num_timesteps):
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        # matrix
        self._inputs = []
        # vector
        self._outputs = []
        self._indicator = 0
        self._parse_file(filename)
        
    def _parse_file(self, filename):
        tf.logging.info('Loading data from %s', filename)
        with open(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            label, content = line.strip('\r\n').split('\t')
            id_label = self._category_vocab.category_to_id(label)
            id_words = self._vocab.sentence_to_id(content)
            id_words = id_words[0: self._num_timesteps] # 长的部分做截断
            padding_num = self._num_timesteps - len(id_words)
            id_words = id_words + [self._vocab.unk for i in range(padding_num)]
            self._inputs.append(id_words)
            self._outputs.append(id_label)
        self._inputs = np.asarray(self._inputs, dtype=np.int32)
        self._outputs = np.asarray(self._outputs, dtype=np.int32)
        self._random_shuffle()
        
    def _random_shuffle(self):
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]
        
    def next_batch(self, batch_size):
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        if end_indicator > len(self._inputs):
            raise Execption("batch_size: %d is too large" % batch_size)
        
        batch_inputs = self._inputs[self._indicator: end_indicator]
        batch_outputs = self._outputs[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_inputs, batch_outputs
    
train_dataset = TextDataset(train_file, vocab, category_vocab, hps.num_timesteps)
val_dataset = TextDataset(val_file, vocab, category_vocab, hps.num_timesteps)
test_dataset = TextDataset(test_file, vocab, category_vocab, hps.num_timesteps)

print(train_dataset.next_batch(2))
print(val_dataset.next_batch(2))
print(test_dataset.next_batch(2))

INFO:tensorflow:Loading data from ./data/cnews.train.seg.txt
INFO:tensorflow:Loading data from ./data/cnews.val.seg.txt
INFO:tensorflow:Loading data from ./data/cnews.test.seg.txt
(array([[  287,  8097,    11,    47,  1231,   287,   232,   908,  9260,
          321,    50,  2490,    11,    47,  1231,   287,   232,   908,
         9260,   321,    50,   940,    11, 12337,    32,  2764,     1,
         3085,  4831,     2,  2846, 13096,  2515,  6169,     2,  1292,
            1,  7702,  2779,     2, 12337,   695,     6,  8396,  2143,
            2,   785,     1,  5868,   908],
       [   71,  1481, 39467, 10072,  6250, 10660,  2519,   250,    17,
          250,    16,   156,   599,   456,    82,    78,   130,    23,
          148,   269,     1,   160,  2121,  1935,   270,     1,  9027,
        10072,  6250,   112,   113,   130,    23,    75,   319,     1,
           22,   537,   130,    69,    23,     3,   388,    27,    94,
           23,   941,    44,     1,    71]], dtype=int32), array(

In [6]:
def create_model(hps, vocab_size, num_classes):
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size
    
    inputs = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    outputs = tf.placeholder(tf.int32, (batch_size, ))
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step', trainable=False)
    
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embedding',
            [vocab_size, hps.num_embedding_size],
            tf.float32)
        # [1, 10, 7] -> [embeddings[1], embeddings[10], embeddings[7]]
        embed_inputs = tf.nn.embedding_lookup(embeddings, inputs)
        
    scale = 1.0 / math.sqrt(hps.num_embedding_size + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    
    def _generate_params_for_lstm_cell(x_size, h_size, bias_size):
        """generates parameters for pure lstm implementation."""
        x_w = tf.get_variable('x_weights', x_size)
        h_w = tf.get_variable('h_weights', h_size)
        b = tf.get_variable('biases', bias_size, initializer=tf.constant_initializer(0.0))
        return x_w, h_w, b
    
    with tf.variable_scope('lstm_nn', initializer = lstm_init):
        """
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = tf.contrib.rnn.BasicLSTMCell(
                hps.num_lstm_nodes[i],
                state_is_tuple = True)
            cell = tf.contrib.rnn.DropoutWrapper(
                cell,
                output_keep_prob = keep_prob)
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)
        
        initial_state = cell.zero_state(batch_size, tf.float32)
        # rnn_outputs: [batch_size, num_timesteps, lstm_outputs[-1]]
        rnn_outputs, _ = tf.nn.dynamic_rnn(cell, embed_inputs, initial_state = initial_state)
        last = rnn_outputs[:, -1, :]
        """
        # 输入门
        with tf.variable_scope('inputs'):
            ix, ih, ib = _generate_params_for_lstm_cell(
                x_size = [hps.num_embedding_size, hps.num_lstm_nodes[0]],
                h_size = [hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
                bias_size = [1, hps.num_lstm_nodes[0]]
            )
            
        # 输出门
        with tf.variable_scope('outputs'):
            ox, oh, ob = _generate_params_for_lstm_cell(
                x_size = [hps.num_embedding_size, hps.num_lstm_nodes[0]],
                h_size = [hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
                bias_size = [1, hps.num_lstm_nodes[0]]
            )
        
        # 遗忘门
        with tf.variable_scope('forget'):
            fx, fh, fb = _generate_params_for_lstm_cell(
                x_size = [hps.num_embedding_size, hps.num_lstm_nodes[0]],
                h_size = [hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
                bias_size = [1, hps.num_lstm_nodes[0]]
            )
        
        # 中间层
        with tf.variable_scope('memory'):
            cx, ch, cb = _generate_params_for_lstm_cell(
                x_size = [hps.num_embedding_size, hps.num_lstm_nodes[0]],
                h_size = [hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
                bias_size = [1, hps.num_lstm_nodes[0]]
            )
        state = tf.Variable(
            tf.zeros([batch_size, hps.num_lstm_nodes[0]]),
            trainable = False
        )
        h = tf.Variable(
            tf.zeros([batch_size, hps.num_lstm_nodes[0]]),
            trainable = False
        )
        for i in range(num_timesteps):
            # [batch_size, 1, embed_size]
            embed_input = embed_inputs[:, i, :]
            embed_input = tf.reshape(embed_input, [batch_size, hps.num_embedding_size])
            forget_gate = tf.sigmoid(tf.matmul(embed_input, fx) + tf.matmul(h, fh) + fb)
            input_gate = tf.sigmoid(tf.matmul(embed_input, ix) + tf.matmul(h, ih) + ib)
            output_gate = tf.sigmoid(tf.matmul(embed_input, ox) + tf.matmul(h, oh) + ob)
            mid_state = tf.tanh(tf.matmul(embed_input, cx) + tf.matmul(h, ch) + cb)
            state = mid_state * input_gate + state * forget_gate
            h = output_gate * tf.tanh(state)
        last = h
        
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        fc1 = tf.layers.dense(last, hps.num_fc_nodes, activation=tf.nn.relu, name='fc1')
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        logits = tf.layers.dense(fc1_dropout,num_classes,name='fc2')
        
    with tf.name_scope('metrics'):
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=outputs)
        loss = tf.reduce_mean(softmax_loss)
        # [0, 1, 5, 4, 2] -> argmax: 2
        y_pred = tf.argmax(tf.nn.softmax(logits), 1, output_type=tf.int32)
        correct_pred = tf.equal(outputs, y_pred)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
    with tf.name_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            tf.logging.info('variable name: %s' %(var.name))
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), hps.clip_lstm_grads)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step = global_step)
        
    return ((inputs, outputs, keep_prob), (loss, accuracy), (train_op, global_step))

placeholders, metrics, others = create_model(hps, vocab_size, num_classes)
inputs, outputs, keep_prob = placeholders
loss, accuracy = metrics
train_op, global_step = others

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
INFO:tensorflow:variable name: embedding/embedding:0
INFO:tensorflow:variable name: lstm_nn/inputs/x_weights:0
INFO:tensorflow:variable name: lstm_nn/inputs/h_weights:0
INFO:tensorflow:variable name: lstm_nn/inputs/biases:0
INFO:tensorflow:variable name: lstm_nn/outputs/x_weights:0
INFO:tensorflow:variable name: lstm_nn/outputs/h_weights:0
INFO:tensorflow:variable name: lstm_nn/outputs/biases:0
INFO:tensorflow:variable name: lstm_nn/forget/x_weights:0
INFO:tensorflow:variable name: lstm_nn/forget/h_weights:0
INFO:tensorflow:variable name: lstm_nn/forget/biases:0
INFO:tensorflow:variable name: lstm_nn/memory/x_weights:0
I

In [7]:
init_op = tf.global_variables_initializer()
train_keep_prob_value = 0.8
test_keep_prob_value = 1.0

num_train_steps = 10000

# Train: 99.7%
# Valid: 92.7%
# Test:  93.2%
with tf.Session() as sess:
    sess.run(init_op)
    for i in range(num_train_steps):
        if i % 150 ==0:
            batch_inputs,batch_labels = test_dataset.next_batch(hps.batch_size)
        else:
            batch_inputs, batch_labels = train_dataset.next_batch(hps.batch_size)
        outputs_val = sess.run([loss, accuracy, train_op, global_step], 
        feed_dict={
            inputs: batch_inputs,
            outputs: batch_labels,
            keep_prob: train_keep_prob_value,
        })
        loss_val, accuracy_val, _, global_step_val = outputs_val
        if global_step_val % 150 == 0:
            tf.logging.info("teststep: %5d,loss: %3.3f,accuracy:%3.5f" % (global_step_val,loss_val,accuracy_val))
        if global_step_val % 20 == 0:
            tf.logging.info("trainstep: %5d,loss: %3.3f,accuracy:%3.5f" % (global_step_val,loss_val,accuracy_val))

INFO:tensorflow:trainstep:    20,loss: 2.302,accuracy:0.13000
INFO:tensorflow:trainstep:    40,loss: 2.300,accuracy:0.09000
INFO:tensorflow:trainstep:    60,loss: 2.293,accuracy:0.15000
INFO:tensorflow:trainstep:    80,loss: 2.294,accuracy:0.13000
INFO:tensorflow:trainstep:   100,loss: 2.294,accuracy:0.16000
INFO:tensorflow:trainstep:   120,loss: 2.270,accuracy:0.15000
INFO:tensorflow:trainstep:   140,loss: 2.221,accuracy:0.15000
INFO:tensorflow:teststep:   150,loss: 2.228,accuracy:0.21000
INFO:tensorflow:trainstep:   160,loss: 2.293,accuracy:0.16000
INFO:tensorflow:trainstep:   180,loss: 2.197,accuracy:0.15000
INFO:tensorflow:trainstep:   200,loss: 2.118,accuracy:0.20000
INFO:tensorflow:trainstep:   220,loss: 2.063,accuracy:0.24000
INFO:tensorflow:trainstep:   240,loss: 1.966,accuracy:0.26000
INFO:tensorflow:trainstep:   260,loss: 1.826,accuracy:0.23000
INFO:tensorflow:trainstep:   280,loss: 1.873,accuracy:0.32000
INFO:tensorflow:teststep:   300,loss: 1.876,accuracy:0.31000
INFO:tenso

INFO:tensorflow:trainstep:  2380,loss: 0.593,accuracy:0.78000
INFO:tensorflow:teststep:  2400,loss: 0.479,accuracy:0.83000
INFO:tensorflow:trainstep:  2400,loss: 0.479,accuracy:0.83000
INFO:tensorflow:trainstep:  2420,loss: 0.780,accuracy:0.71000
INFO:tensorflow:trainstep:  2440,loss: 0.375,accuracy:0.91000
INFO:tensorflow:trainstep:  2460,loss: 0.436,accuracy:0.88000
INFO:tensorflow:trainstep:  2480,loss: 0.562,accuracy:0.85000
INFO:tensorflow:trainstep:  2500,loss: 0.814,accuracy:0.75000
INFO:tensorflow:trainstep:  2520,loss: 0.426,accuracy:0.88000
INFO:tensorflow:trainstep:  2540,loss: 0.511,accuracy:0.85000
INFO:tensorflow:teststep:  2550,loss: 0.540,accuracy:0.87000
INFO:tensorflow:trainstep:  2560,loss: 0.540,accuracy:0.84000
INFO:tensorflow:trainstep:  2580,loss: 0.445,accuracy:0.85000
INFO:tensorflow:trainstep:  2600,loss: 0.571,accuracy:0.85000
INFO:tensorflow:trainstep:  2620,loss: 0.530,accuracy:0.86000
INFO:tensorflow:trainstep:  2640,loss: 0.698,accuracy:0.83000
INFO:tenso

INFO:tensorflow:trainstep:  4720,loss: 0.069,accuracy:0.99000
INFO:tensorflow:trainstep:  4740,loss: 0.118,accuracy:0.96000
INFO:tensorflow:trainstep:  4760,loss: 0.277,accuracy:0.93000
INFO:tensorflow:trainstep:  4780,loss: 0.223,accuracy:0.96000
INFO:tensorflow:teststep:  4800,loss: 0.143,accuracy:0.97000
INFO:tensorflow:trainstep:  4800,loss: 0.143,accuracy:0.97000
INFO:tensorflow:trainstep:  4820,loss: 0.128,accuracy:0.96000
INFO:tensorflow:trainstep:  4840,loss: 0.079,accuracy:0.97000
INFO:tensorflow:trainstep:  4860,loss: 0.322,accuracy:0.93000
INFO:tensorflow:trainstep:  4880,loss: 0.243,accuracy:0.91000
INFO:tensorflow:trainstep:  4900,loss: 0.199,accuracy:0.97000
INFO:tensorflow:trainstep:  4920,loss: 0.189,accuracy:0.95000
INFO:tensorflow:trainstep:  4940,loss: 0.391,accuracy:0.91000
INFO:tensorflow:teststep:  4950,loss: 0.323,accuracy:0.91000
INFO:tensorflow:trainstep:  4960,loss: 0.337,accuracy:0.93000
INFO:tensorflow:trainstep:  4980,loss: 0.220,accuracy:0.94000
INFO:tenso

INFO:tensorflow:trainstep:  7060,loss: 0.092,accuracy:0.98000
INFO:tensorflow:trainstep:  7080,loss: 0.041,accuracy:0.99000
INFO:tensorflow:trainstep:  7100,loss: 0.109,accuracy:0.97000
INFO:tensorflow:trainstep:  7120,loss: 0.296,accuracy:0.96000
INFO:tensorflow:trainstep:  7140,loss: 0.073,accuracy:0.99000
INFO:tensorflow:trainstep:  7160,loss: 0.067,accuracy:0.99000
INFO:tensorflow:trainstep:  7180,loss: 0.028,accuracy:0.98000
INFO:tensorflow:teststep:  7200,loss: 0.088,accuracy:0.98000
INFO:tensorflow:trainstep:  7200,loss: 0.088,accuracy:0.98000
INFO:tensorflow:trainstep:  7220,loss: 0.064,accuracy:0.99000
INFO:tensorflow:trainstep:  7240,loss: 0.026,accuracy:0.99000
INFO:tensorflow:trainstep:  7260,loss: 0.052,accuracy:0.98000
INFO:tensorflow:trainstep:  7280,loss: 0.052,accuracy:0.98000
INFO:tensorflow:trainstep:  7300,loss: 0.083,accuracy:0.98000
INFO:tensorflow:trainstep:  7320,loss: 0.140,accuracy:0.96000
INFO:tensorflow:trainstep:  7340,loss: 0.090,accuracy:0.97000
INFO:tens

INFO:tensorflow:trainstep:  9420,loss: 0.012,accuracy:1.00000
INFO:tensorflow:trainstep:  9440,loss: 0.125,accuracy:0.96000
INFO:tensorflow:teststep:  9450,loss: 0.025,accuracy:0.99000
INFO:tensorflow:trainstep:  9460,loss: 0.070,accuracy:0.99000
INFO:tensorflow:trainstep:  9480,loss: 0.008,accuracy:1.00000
INFO:tensorflow:trainstep:  9500,loss: 0.077,accuracy:0.99000
INFO:tensorflow:trainstep:  9520,loss: 0.018,accuracy:0.99000
INFO:tensorflow:trainstep:  9540,loss: 0.004,accuracy:1.00000
INFO:tensorflow:trainstep:  9560,loss: 0.031,accuracy:0.98000
INFO:tensorflow:trainstep:  9580,loss: 0.007,accuracy:1.00000
INFO:tensorflow:teststep:  9600,loss: 0.037,accuracy:0.98000
INFO:tensorflow:trainstep:  9600,loss: 0.037,accuracy:0.98000
INFO:tensorflow:trainstep:  9620,loss: 0.106,accuracy:0.97000
INFO:tensorflow:trainstep:  9640,loss: 0.131,accuracy:0.98000
INFO:tensorflow:trainstep:  9660,loss: 0.009,accuracy:1.00000
INFO:tensorflow:trainstep:  9680,loss: 0.004,accuracy:1.00000
INFO:tenso