In [1]:
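# Build the computation graph -- CNN text classifier
#     embedding
#     conv1d + global max pooling
#     fc
#     train_op
# Training loop code
# Dataset wrapper:
#     api: next_batch(batch_size)
# Vocabulary wrapper:
#     api: sentence_to_id(text_sentence): convert a sentence into word ids
# Category wrapper:
#     api: category_to_id(text_category).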

import tensorflow as tf
import os
import sys
import numpy as np
import math

tf.logging.set_verbosity(tf.logging.INFO) # set the TensorFlow logger verbosity to INFO

In [2]:
def get_default_params():
    """Return the default hyper-parameters as a ``tf.contrib.training.HParams``."""
    defaults = dict(
        # Embedding size of each word, i.e. the number of input channels.
        num_embedding_size=32,
        # Fixed number of word ids kept per input example.
        num_timesteps=600,
        num_filters=128,
        num_kernel_size=3,
        # Output dimension of the fully connected layer.
        num_fc_nodes=64,
        batch_size=100,
        # Learning rate handed to the optimizer.
        learning_rate=0.001,
        # Minimum word frequency; words seen less often are ignored.
        num_word_threshold=10,
    )
    return tf.contrib.training.HParams(**defaults)

# Hyper-parameters used by the cells below.
hps = get_default_params()

# Locations of the dataset splits, the vocabulary and the category list.
train_file = 'cnews_data/cnews.train.seg.txt'
val_file = 'cnews_data/cnews.val.seg.txt'
test_file = 'cnews_data/cnews.test.seg.txt'
vocab_file = 'cnews_data/cnews.vocab.txt'
category_file = 'cnews_data/cnews.category.txt'
output_folder = 'cnews_data/run_text_run'

# makedirs (unlike mkdir) also creates missing parent directories, and
# exist_ok=True removes the check-then-create race of the exists()/mkdir pair.
os.makedirs(output_folder, exist_ok=True)

In [3]:
class Vocab:
    """Word-to-id table loaded from a tab-separated vocabulary file.

    Every non-blank line of the file is ``word<TAB>frequency``. Words whose
    frequency is below ``num_word_threshold`` are skipped; the remaining
    words receive consecutive ids in file order. Unknown words map to the id
    of the ``<UNK>`` entry, or to ``-1`` when the file has no such entry.
    """

    def __init__(self, filename, num_word_threshold):
        """Load the vocabulary.

        Args:
            filename: path of the UTF-8 vocabulary file.
            num_word_threshold: minimum frequency a word needs to be kept.
        """
        self._word_to_id = {}
        self._unk = -1
        self._num_word_threshold = num_word_threshold
        self._read_dict(filename)

    def _read_dict(self, filename):
        """Fill the word table from ``filename``.

        Raises:
            ValueError: if a non-blank line is not ``word<TAB>frequency``.
        """
        # The file holds non-ASCII words, so the encoding is stated explicitly
        # instead of relying on the platform default.
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip('\r\n')
                if not line:
                    # Tolerate blank (e.g. trailing) lines.
                    continue
                word, frequency = line.split('\t')
                if int(frequency) < self._num_word_threshold:
                    continue
                if word in self._word_to_id:
                    # Re-assigning a duplicate would hand out an id equal to
                    # size(), i.e. outside the valid id range.
                    continue
                idx = len(self._word_to_id)
                if word == '<UNK>':
                    self._unk = idx
                self._word_to_id[word] = idx

    def word_to_id(self, word):
        """Return the id of ``word``, falling back to the unknown-word id."""
        return self._word_to_id.get(word, self._unk)

    @property
    def unk(self):
        """Id used for unknown words (``-1`` if ``<UNK>`` was never loaded)."""
        return self._unk

    def size(self):
        """Return the number of words in the table."""
        return len(self._word_to_id)

    def sentence_to_id(self, sentence):
        """Convert a whitespace-separated sentence into a list of word ids."""
        return [self.word_to_id(cur_word) for cur_word in sentence.split()]
    
    
class CategoryDict:
    """Category-name-to-id table loaded from a file with one name per line.

    Ids are assigned consecutively in file order. Blank lines and repeated
    names are ignored so that ``size()`` always equals the number of distinct
    categories and every id is smaller than ``size()``.
    """

    def __init__(self, filename):
        """Load the category list from the UTF-8 file ``filename``."""
        self._category_to_id = {}
        # The category names are non-ASCII, so the encoding is stated
        # explicitly instead of relying on the platform default.
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                category = line.strip('\r\n')
                if not category or category in self._category_to_id:
                    # A blank line would otherwise become a bogus '' class,
                    # and a duplicate would receive an out-of-range id.
                    continue
                self._category_to_id[category] = len(self._category_to_id)

    def size(self):
        """Return the number of categories."""
        return len(self._category_to_id)

    def category_to_id(self, category):
        """Return the id of ``category``.

        Raises:
            Exception: if ``category`` is not in the table.
        """
        if category not in self._category_to_id:
            raise Exception(
                '%s is not in our category lsit' % category)
        return self._category_to_id[category]
        
# Load the word vocabulary and report how many words survived the threshold.
vocab = Vocab(vocab_file, hps.num_word_threshold)
vocab_size = vocab.size()
tf.logging.info('vocab_size: %d', vocab_size)

# Load the label set and report the number of classes.
category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()
tf.logging.info('num_classes: %s', num_classes)

# Sanity check: look up one known label.
test_str = '时尚'
tf.logging.info(
    'label: %s, id: %d',
    test_str,
    category_vocab.category_to_id(test_str))
        
    

INFO:tensorflow:vocab_size: 77325
INFO:tensorflow:num_classes: 10
INFO:tensorflow:label: 时尚, id: 4


In [4]:
class TextDataSet:
    """In-memory labelled text dataset served as shuffled mini-batches.

    Every non-blank line of the input file is ``label<TAB>content``. The
    content is converted to word ids with ``vocab``, truncated to
    ``num_timesteps`` ids and right-padded with ``vocab.unk`` up to that
    length; the label is converted with ``category_vocab``.
    """

    def __init__(self, filename, vocab, category_vocab, num_timesteps):
        """Load and shuffle the dataset.

        Args:
            filename: path of the UTF-8 data file.
            vocab: object providing ``sentence_to_id(text)`` and ``unk``.
            category_vocab: object providing ``category_to_id(label)``.
            num_timesteps: fixed number of word ids per example.
        """
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        # Matrix of word ids, one row per example (after parsing).
        self._inputs = []
        # Vector of label ids (after parsing).
        self._outputs = []
        # Start offset of the next batch.
        self._indicator = 0
        self._parse_file(filename)

    def _parse_file(self, filename):
        """Read ``filename`` into the id arrays and shuffle them.

        Raises:
            ValueError: if a non-blank line contains no tab separator.
        """
        tf.logging.info('Loading data from %s' , filename)
        # The file holds non-ASCII text, so the encoding is stated explicitly
        # instead of relying on the platform default.
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip('\r\n')
                if not line:
                    # Tolerate blank (e.g. trailing) lines.
                    continue
                # Split on the first tab only: a tab inside the content must
                # not break the label/content unpacking.
                label, content = line.split('\t', 1)
                id_label = self._category_vocab.category_to_id(label)
                id_words = self._vocab.sentence_to_id(content)
                id_words = id_words[0:self._num_timesteps]
                padding_num = self._num_timesteps - len(id_words)
                id_words = id_words + [self._vocab.unk] * padding_num
                self._inputs.append(id_words)
                self._outputs.append(id_label)
        self._inputs = np.asarray(self._inputs, dtype = np.int32)
        self._outputs = np.asarray(self._outputs, dtype = np.int32)
        self._random_shuffle()
        self._num_examples = len(self._inputs)

    def _random_shuffle(self):
        """Apply one random permutation to inputs and labels together."""
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]

    def num_examples(self):
        """Return the number of examples loaded from the file."""
        return self._num_examples

    def next_batch(self, batch_size):
        """Return the next ``(inputs, labels)`` batch of ``batch_size`` rows.

        When fewer than ``batch_size`` rows remain, the data is reshuffled
        and the batch is taken from the start; the unread tail is dropped.

        Raises:
            Exception: if ``batch_size`` exceeds the number of examples.
        """
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        if end_indicator > len(self._inputs):
            raise Exception('Batch_size : %d is too large' % batch_size)
        batch_inputs = self._inputs[self._indicator:end_indicator]
        batch_outputs = self._outputs[self._indicator:end_indicator]
        self._indicator = end_indicator
        return batch_inputs,batch_outputs
    
# Load the three splits with the shared vocabulary and label table.
train_dataset = TextDataSet(
    filename=train_file,
    vocab=vocab,
    category_vocab=category_vocab,
    num_timesteps=hps.num_timesteps)
val_dataset = TextDataSet(
    filename=val_file,
    vocab=vocab,
    category_vocab=category_vocab,
    num_timesteps=hps.num_timesteps)
test_dataset = TextDataSet(
    filename=test_file,
    vocab=vocab,
    category_vocab=category_vocab,
    num_timesteps=hps.num_timesteps)

# Show the size of each split.
for split_dataset in (train_dataset, val_dataset, test_dataset):
    print(split_dataset.num_examples())

# Peek at a tiny batch from each split.
for split_dataset in (train_dataset, val_dataset, test_dataset):
    print(split_dataset.next_batch(2))

INFO:tensorflow:Loading data from cnews_data/cnews.train.seg.txt
INFO:tensorflow:Loading data from cnews_data/cnews.val.seg.txt
INFO:tensorflow:Loading data from cnews_data/cnews.test.seg.txt
50000
5000
10000
(array([[ 6751, 12824,   410, ...,    12,   428, 15385],
       [35979,   325,  6792, ...,     0,     0,     0]], dtype=int32), array([8, 5], dtype=int32))
(array([[ 2473,   225,    88, ...,     0,     0,     0],
       [13208,    44,    11, ...,  1777,    22,     1]], dtype=int32), array([0, 8], dtype=int32))
(array([[ 239, 2341, 1028, ...,    1,  129,   33],
       [6987, 4035, 3629, ...,    1,  172,  836]], dtype=int32), array([9, 9], dtype=int32))


In [5]:
def create_model(hps, vocab_size, num_classes):
    """Build the TF1 graph of a 1-D convolutional text classifier.

    The graph is: embedding lookup -> conv1d + ReLU -> max over axis 1 ->
    dense + ReLU -> dropout -> dense logits. It is trained with Adam on the
    mean sparse softmax cross-entropy.

    Args:
        hps: hyper-parameter object; this function reads num_timesteps,
            batch_size, num_embedding_size, num_filters, num_kernel_size,
            num_fc_nodes and learning_rate from it.
        vocab_size: number of rows of the embedding matrix.
        num_classes: number of units of the final dense layer.

    Returns:
        The nested tuple ``((inputs, outputs, keep_prob), (loss, accuracy),
        (train_op, global_step))``.
    """
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size
    
    # Word ids of shape [batch_size, num_timesteps] and label ids of shape
    # [batch_size]. The batch dimension is fixed, so every feed must contain
    # exactly batch_size examples.
    inputs = tf.placeholder(tf.int32, [batch_size, num_timesteps])
    outputs = tf.placeholder(tf.int32, [batch_size,])
    # Dropout keep probability, fed at run time.
    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')
    
    # Counts the training steps taken so far (incremented by train_op).
    global_step = tf.Variable(
        tf.zeros([], tf.int64), name = 'global_step', trainable = False)
    
    # Embedding matrix of shape [vocab_size, num_embedding_size], initialised
    # uniformly in [-1, 1]; embed_inputs has shape
    # [batch_size, num_timesteps, num_embedding_size].
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope(
        'embedding', initializer = embedding_initializer):
        embeddings = tf.get_variable(
            'embedding',
            [vocab_size, hps.num_embedding_size],
            tf.float32)
        # [1, 10, 7] -> [embeddings[1], embeddings[10], embeddings[7]]
        embed_inputs = tf.nn.embedding_lookup(embeddings, inputs)
    
    # Convolution over the sequence with num_filters output channels, followed
    # by a max over axis 1, giving [batch_size, num_filters].
    # NOTE(review): the length of conv1 along axis 1 depends on conv1d's
    # default padding -- TODO confirm.
    scale = 1.0 / math.sqrt(hps.num_embedding_size + hps.num_filters) / 3.0
    cnn_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('cnn',initializer=cnn_init):
        conv1 = tf.layers.conv1d(embed_inputs,
                                 hps.num_filters,
                                 hps.num_kernel_size,
                                 activation = tf.nn.relu,
                                 name = 'conv1')
        global_maxpooling = tf.reduce_max(conv1, axis = 1)
        
        
    fc_init = tf.initializers.variance_scaling(scale = 1.0, distribution = 'uniform')
    with tf.variable_scope('fc', initializer = fc_init):
        # [batch_size, num_fc_nodes]
        fc1 = tf.layers.dense(global_maxpooling,
                              hps.num_fc_nodes,
                              activation = tf.nn.relu,
                              name = 'fc1')
        # Debug print of the static shape.
        print(fc1.shape)
        # [batch_size, num_fc_nodes]
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        print(fc1_dropout.shape)
        # [batch_size, num_classes]
        logits = tf.layers.dense(fc1_dropout,
                                 num_classes,
                                 name = 'fc2')
        print(logits.shape)
        
    with tf.name_scope('metrics'):
        # Per-example loss of shape [batch_size]; averaged into a scalar below.
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = logits, labels = outputs)
        print("softmax_loss: ",softmax_loss.shape)
        loss = tf.reduce_mean(softmax_loss)
        # Predicted class = index of the largest score,
        # e.g. [0, 1, 5, 4, 2] -> argmax: 2.
        # NOTE(review): softmax is monotonic, so argmax over the raw logits
        # would select the same class.
        y_pred = tf.argmax(tf.nn.softmax(logits),
                           1,
                           output_type = tf.int32)
        correct_pred = tf.equal(outputs, y_pred)
        # Fraction of the batch that was classified correctly.
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
    with tf.name_scope('train_op'):
        # One Adam step on the loss; also advances global_step.
        train_op = tf.train.AdamOptimizer(hps.learning_rate).minimize(
            loss, global_step = global_step)
        
    return ((inputs, outputs, keep_prob),
            (loss, accuracy),
            (train_op, global_step))
        
# Build the graph once and keep module-level handles to its tensors and ops;
# the evaluation helper and the training loop read these names directly.
placeholders, metrics, others = create_model(
    hps=hps, vocab_size=vocab_size, num_classes=num_classes)

(inputs, outputs, keep_prob) = placeholders
(loss, accuracy) = metrics
(train_op, global_step) = others
    

(100, 64)
(100, 64)
(100, 10)
softmax_loss:  (100,)


In [7]:
def eval_holdout(sess, accuracy, dataset_for_test, batch_size):
    """Return the mean per-batch accuracy over a held-out dataset.

    Runs ``num_examples() // batch_size`` batches with dropout disabled
    (``keep_prob`` fed as 1.0) and averages the per-batch accuracies.

    Relies on the module-level placeholders ``inputs``, ``outputs`` and
    ``keep_prob``.

    NOTE: batches are drawn with ``next_batch`` from wherever the dataset's
    cursor currently is; this function does not reset it.

    Args:
        sess: session used to run ``accuracy``.
        accuracy: tensor evaluated for every batch.
        dataset_for_test: object providing ``num_examples()`` and
            ``next_batch(batch_size)``.
        batch_size: number of examples per batch.

    Returns:
        ``np.mean`` of the per-batch accuracy values.
    """
    total_examples = dataset_for_test.num_examples()
    tf.logging.info('Eval holdout: num_examples = %d, batch_size = %d',
                    total_examples, batch_size)

    def _batch_accuracy():
        # Pull one batch and score it with dropout switched off.
        batch_inputs, batch_labels = dataset_for_test.next_batch(batch_size)
        feed = {
            inputs: batch_inputs,
            outputs: batch_labels,
            keep_prob: 1.0,
        }
        return sess.run(accuracy, feed_dict = feed)

    per_batch = [_batch_accuracy()
                 for _ in range(total_examples // batch_size)]
    return np.mean(per_batch)
    
# Op that initialises every variable of the graph.
init_op = tf.global_variables_initializer()
# Dropout keep probability fed during training.
train_keep_prob_value = 0.8

num_train_steps = 10000

# Accuracies recorded in the original notebook:
# Train: 99.7%
# Valid: 92.7%
# Test: 93.2%
with tf.Session() as sess:
    sess.run(init_op)
    for i in range(num_train_steps):
        batch_inputs, batch_labels = train_dataset.next_batch(hps.batch_size)
        train_feed = {
            inputs: batch_inputs,
            outputs: batch_labels,
            keep_prob: train_keep_prob_value,
        }
        # One optimisation step; also fetch the metrics of this batch.
        loss_val, accuracy_val, _, global_step_val = sess.run(
            [loss, accuracy, train_op, global_step],
            feed_dict = train_feed)

        # Report the training metrics every 200 steps.
        if global_step_val % 200 == 0:
            tf.logging.info("Step: %5d, loss: %3.3f, accuracy: %3.3f",
                            global_step_val, loss_val, accuracy_val)

        # Score the validation and test splits every 1000 steps.
        if global_step_val % 1000 == 0:
            accuracy_eval = eval_holdout(
                sess, accuracy, val_dataset, hps.batch_size)
            accuracy_test = eval_holdout(
                sess, accuracy, test_dataset, hps.batch_size)
            tf.logging.info(
                "Step: %5d, val_accuracy: %3.3f, test_accuracy: %3.3f",
                global_step_val, accuracy_eval, accuracy_test)
        

INFO:tensorflow:Step:   200, loss: 0.625, accuracy: 0.810
INFO:tensorflow:Step:   400, loss: 0.258, accuracy: 0.940
INFO:tensorflow:Step:   600, loss: 0.217, accuracy: 0.950
INFO:tensorflow:Step:   800, loss: 0.260, accuracy: 0.930
INFO:tensorflow:Step:  1000, loss: 0.124, accuracy: 0.970
INFO:tensorflow:Eval holdout: num_examples = 5000, batch_size = 100
INFO:tensorflow:Eval holdout: num_examples = 10000, batch_size = 100
INFO:tensorflow:Step:  1000, val_accuracy: 0.918, test_accuracy: 0.925
INFO:tensorflow:Step:  1200, loss: 0.154, accuracy: 0.930
INFO:tensorflow:Step:  1400, loss: 0.087, accuracy: 0.980
INFO:tensorflow:Step:  1600, loss: 0.139, accuracy: 0.950
INFO:tensorflow:Step:  1800, loss: 0.066, accuracy: 0.980
INFO:tensorflow:Step:  2000, loss: 0.079, accuracy: 0.980
INFO:tensorflow:Eval holdout: num_examples = 5000, batch_size = 100
INFO:tensorflow:Eval holdout: num_examples = 10000, batch_size = 100
INFO:tensorflow:Step:  2000, val_accuracy: 0.914, test_accuracy: 0.938
INFO