In [64]:
import os, sys
import time
import codecs
import numpy as np
import tensorflow as tf
from datetime import timedelta
from collections import defaultdict


## 数据处理

### 1. 生成字典

In [3]:
# Locations of the cnews corpus files. `vocab_dir` is only defined here; this
# cell does not read it.
base_dir = 'data/cnews'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')

# Checkpoint prefix under which the best validation result is saved.
save_dir = 'checkpoints/textrnn'
save_path = os.path.join(save_dir, 'best_validation')

# Count how often each character occurs in every .txt file of the corpus
# directory. The directory is taken from `base_dir` instead of repeating the
# path literal, so the listing and the opened files cannot drift apart.
files = os.listdir(base_dir)
words = defaultdict(int)

for f in files:
    if not f.endswith(".txt"):
        continue
    file_path = os.path.join(base_dir, f)
    with codecs.open(file_path, 'r', 'utf8') as rf:
        for line in rf:
            for w in line.strip():
                words[w] += 1

# Most frequent characters first. Ties are broken by the character itself:
# sorting on the count alone leaves equal-count characters in dict iteration
# order, which makes the ids derived from this file differ between runs.
sorted_words = sorted(words.items(), key=lambda x: (-x[1], x[0]))
newlines = ["\t".join([w, str(c)]) for w, c in sorted_words]

# One "<character><TAB><count>" entry per line.
with codecs.open("vocab.txt", 'w', 'utf8') as wf:
    wf.write('\n'.join(newlines))

In [4]:
# Build the character <-> id lookup tables from the frequency-sorted
# vocabulary file. Id 0 is reserved for unknown characters ("UNK").
word2index = {"UNK": 0}
index2word = {0: "UNK"}
with codecs.open('./vocab.txt', 'r', 'utf8') as rf:
    for i, line in enumerate(rf):
        # Keep the first 4999 entries (ids 1..4999); with UNK that is 5000 ids.
        if i >= 4999:
            break
        # Every line is "<character><TAB><count>". Remove only the line
        # terminator and split on the LAST tab: a plain strip()/split() eats
        # whitespace characters (tab, space, U+3000, ...) that are themselves
        # vocabulary entries and would register their count as the "word".
        word = line.rstrip('\r\n').rsplit('\t', 1)[0]
        word2index[word] = i + 1
        index2word[i + 1] = word

In [5]:
def generate_index(query):
    """Map each element of `query` to its id in `word2index` (0 when absent)."""
    ids = []
    for token in query:
        ids.append(word2index.get(token, 0))
    return ids

def generate_word(indexs):
    """Map each id in `indexs` back through `index2word` ("UNK" when absent)."""
    tokens = []
    for idx in indexs:
        tokens.append(index2word.get(idx, "UNK"))
    return tokens

def read_category():
    """Return the fixed category list and a category -> id mapping.

    The id of a category is its position in the returned list.
    """
    categories = [u'体育', u'财经', u'房产', u'家居', u'教育', u'科技', u'时尚', u'时政', u'游戏', u'娱乐']
    cat_to_id = {name: position for position, name in enumerate(categories)}
    return categories, cat_to_id

def read_file(filename):
    """Read a UTF-8 file whose lines hold a label and a text separated by a tab.

    Args:
        filename: path of the file to read.

    Returns:
        (contents, labels): `contents` is a list with one list of characters
        per accepted line, `labels` the matching list of label strings.

    Lines that do not split into exactly two tab-separated fields after
    stripping, or whose text is empty, are skipped.
    """
    contents, labels = [], []
    with codecs.open(filename, 'r', 'utf8') as rf:
        for line in rf:
            fields = line.strip().split('\t')
            # Skip malformed lines with an explicit check rather than a blanket
            # `except Exception`, which would also hide unrelated errors.
            if len(fields) != 2:
                continue
            label, content = fields
            if content:
                contents.append(list(content))
                labels.append(label)
    return contents, labels

# Optional sanity check (uncomment to inspect the first training sample):
# contents, labels = read_file("./data/cnews/cnews.train.txt")
# print(contents[0], labels[0])

# Module-level category list and category -> id mapping; `cat2id` is what
# train() passes to generate_data() to encode labels.
categories, cat2id = read_category()

In [62]:
# 生成训练集和预测集

def generate_data(filename, word2id, cat2id, maxlen=500):
    """Load a labelled text file and encode it as model inputs.

    Args:
        filename: file passed to `read_file`.
        word2id: character -> id mapping; characters missing from it become 0.
        cat2id: label -> class id mapping; its size is the number of classes.
        maxlen: sequence length handed to `pad_sequences`.

    Returns:
        (x_pad, y_pad): the id sequences after `pad_sequences(..., maxlen)`
        (library default padding/truncation settings) and the labels after
        `to_categorical` (one-hot encoding).

    Raises:
        ValueError: if the file contains a label that is not in `cat2id`.
    """
    contents, labels = read_file(filename)

    # Fail with a clear message instead of passing None ids to to_categorical.
    unknown_labels = sorted(set(labels) - set(cat2id))
    if unknown_labels:
        raise ValueError("labels missing from cat2id: %r" % (unknown_labels,))

    # Encode with the mapping that was passed in. The previous code ignored
    # `word2id` and always read the module-level `word2index`.
    data_id = [[word2id.get(w, 0) for w in content] for content in contents]
    label_id = [cat2id[label] for label in labels]

    x_pad = tf.keras.preprocessing.sequence.pad_sequences(data_id, maxlen)
    # Convert class ids to one-hot vectors.
    y_pad = tf.keras.utils.to_categorical(label_id, num_classes=len(cat2id))
    return x_pad, y_pad

def get_embedding(vocab_size, embedding_dim, input_x):
    """Look up the rows of the 'embedding' variable selected by `input_x`.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: number of columns of the embedding table.
        input_x: ids handed to `tf.nn.embedding_lookup`.

    Returns:
        The result of the embedding lookup.
    """
    with tf.device("/cpu:0"):
        # `tf.get_variable` has no `reuse` argument; sharing is decided by the
        # enclosing variable scope. Re-enter the current scope with AUTO_REUSE
        # so an existing 'embedding' variable is reused, or created if absent.
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            embedding_table = tf.get_variable('embedding', [vocab_size, embedding_dim])
            embedding = tf.nn.embedding_lookup(embedding_table, input_x)
    return embedding
    
def get_batch(x, y, batch_size=64):
    """Yield shuffled mini-batches of (x, y).

    Args:
        x: samples; indexed along the first axis.
        y: labels with the same first-axis length as `x`.
        batch_size: maximum number of samples per batch.

    Yields:
        (x_batch, y_batch) pairs drawn from one shared random permutation, so
        rows of x and y stay aligned. The last batch may be smaller. Empty
        input yields nothing.
    """
    # Accept plain lists as well as arrays; integer-array indexing below needs arrays.
    x = np.asarray(x)
    y = np.asarray(y)
    data_len = len(x)

    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    # Stepping through the start offsets avoids computing the batch count with
    # `/`, whose result depends on the Python version: true division produced
    # one spurious empty batch for empty input.
    for start_id in range(0, data_len, batch_size):
        end_id = start_id + batch_size  # slicing clamps to data_len
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
        
def feed_data(model, x_batch, y_batch, keep_prob):
    """Build the feed dict that binds one batch to the model's placeholders.

    Keys are `model.input_x`, `model.input_y` and `model.keep_prob`; values are
    the corresponding arguments.
    """
    feed = {}
    feed[model.input_x] = x_batch
    feed[model.input_y] = y_batch
    feed[model.keep_prob] = keep_prob
    return feed

def evaluate(sess, x_, y_, model):
    """Compute the average loss and accuracy of `model` over (x_, y_).

    The data is processed in batches of 128 from `get_batch`, fed with a keep
    probability of 1.0. Per-batch results are weighted by batch size, so the
    returned (loss, accuracy) pair is the mean over all samples.
    """
    sample_count = len(x_)
    loss_sum, acc_sum = 0.0, 0.0
    for batch_x, batch_y in get_batch(x_, y_, 128):
        weight = len(batch_x)
        batch_loss, batch_acc = sess.run(
            [model.loss, model.acc],
            feed_dict=feed_data(model, batch_x, batch_y, 1.0),
        )
        loss_sum += batch_loss * weight
        acc_sum += batch_acc * weight

    return loss_sum / sample_count, acc_sum / sample_count

def get_time_dif(start_time):
    """Return the time elapsed since `start_time` as a whole-second timedelta.

    `start_time` is a `time.time()` timestamp; the difference to the current
    time is rounded to the nearest second.
    """
    elapsed_seconds = int(round(time.time() - start_time))
    return timedelta(seconds=elapsed_seconds)


In [7]:
# Config配置
class Config(object):
    """Hyper-parameters for the text RNN: model, optimisation and logging."""

    # --- model ---
    embedding_dim = 64        # dimensionality of the embedding vectors
    seq_length = 600          # sequence length
    num_classes = 10          # number of categories
    vocab_size = 5000         # vocabulary size

    num_layers = 2            # number of hidden (RNN) layers
    hidden_dim = 128          # units per hidden layer
    rnn = 'gru'               # cell type: 'lstm' or 'gru'

    # --- optimisation ---
    dropout_keep_prob = 0.8   # dropout keep probability
    learning_rate = 1e-3      # learning rate

    batch_size = 128          # samples per training batch
    num_epochs = 10           # total number of epochs

    # --- logging ---
    print_per_batch = 100     # print results every this many batches
    save_per_batch = 10       # write to tensorboard every this many batches

In [48]:
# TextRNN模型
class TextRNN(object):
    """Multi-layer GRU/LSTM text classifier built as a TensorFlow graph.

    Constructing the object creates the placeholders and the whole graph.

    Attributes:
        input_x: int32 placeholder, shape [None, config.seq_length].
        input_y: int32 placeholder, shape [None, config.num_classes].
        keep_prob: float32 placeholder with the dropout keep probability.
        logits: output of the final dense layer (config.num_classes units).
        y_pred_cls: argmax over the softmax of `logits`.
        loss: mean softmax cross-entropy between `logits` and `input_y`.
        optim: Adam minimisation op for `loss`.
        acc: mean of (argmax(input_y) == y_pred_cls).
    """

    def __init__(self, config):
        """Create the input placeholders and build the graph from `config`."""
        self.config = config
        self.input_x = tf.placeholder(tf.int32, shape=[None, self.config.seq_length], name="input_x")
        self.input_y = tf.placeholder(tf.int32, shape=[None, self.config.num_classes], name="input_y")
        self.keep_prob = tf.placeholder(tf.float32, name="keep_prob")

        self.rnn()

    def rnn(self):
        """Build embedding, stacked RNN, classifier head, loss, optimiser and accuracy."""
        with tf.variable_scope("rnn", reuse=tf.AUTO_REUSE):
            def lstm_cell():
                return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True)

            def gru_cell():
                return tf.contrib.rnn.GRUCell(self.config.hidden_dim)

            def dropout():
                # One recurrent cell (type chosen by config.rnn) with output dropout.
                if self.config.rnn == "lstm":
                    cell = lstm_cell()
                else:
                    cell = gru_cell()
                # Use the keep_prob placeholder, not the constant from the
                # config: with the constant, dropout stayed active while
                # evaluating, even though 1.0 is fed for keep_prob there.
                return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

            # Embedding lookup is pinned to the CPU.
            with tf.device('/cpu:0'):
                embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
                embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

            with tf.name_scope("rnn"):
                # Stack config.num_layers recurrent layers.
                cells = [dropout() for _ in range(self.config.num_layers)]
                rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

                _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
                last = _outputs[:, -1, :]  # output of the last time step

            with tf.name_scope("score"):
                # Fully connected layer followed by dropout and ReLU.
                fc = tf.layers.dense(last, self.config.hidden_dim, name="fc1")
                fc = tf.contrib.layers.dropout(fc, self.keep_prob)
                fc = tf.nn.relu(fc)

                # Classifier.
                self.logits = tf.layers.dense(fc, self.config.num_classes, name="fc2")
                self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)

            with tf.name_scope('optimize'):
                # Loss: softmax cross-entropy averaged over the batch.
                cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
                self.loss = tf.reduce_mean(cross_entropy)

                self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

            with tf.name_scope("accuracy"):
                # Fraction of samples whose predicted class matches the label.
                correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
                self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            
            
            

In [9]:

class TRNNConfig(object):
    """Hyper-parameters instantiated by train(); same values as `Config`."""

    # --- model ---
    embedding_dim = 64        # dimensionality of the embedding vectors
    seq_length = 600          # sequence length
    num_classes = 10          # number of categories
    vocab_size = 5000         # vocabulary size

    num_layers = 2            # number of hidden (RNN) layers
    hidden_dim = 128          # units per hidden layer
    rnn = 'gru'               # 'lstm' selects LSTM cells, anything else GRU

    # --- optimisation ---
    dropout_keep_prob = 0.8   # keep probability fed while training
    learning_rate = 1e-3      # learning rate passed to the optimiser

    batch_size = 128          # samples per training batch
    num_epochs = 10           # number of passes over the training data

    # --- logging ---
    print_per_batch = 100     # evaluate and print every this many batches
    save_per_batch = 10       # write a tensorboard summary every this many batches



In [66]:
# train module
def train():
    """Train a TextRNN on the training split and track validation accuracy.

    Uses the module-level `train_dir`, `val_dir`, `word2index`, `cat2id` and
    `save_path`. Every `save_per_batch` batches a summary is written to
    "tensorboard/textrnn". Every `print_per_batch` batches the model is
    evaluated on the current batch and on the validation set, the metrics are
    printed, and a checkpoint is saved to `save_path` when validation accuracy
    improves. Training stops early once more than 1000 batches pass without
    an improvement.
    """
    config = TRNNConfig()
    model = TextRNN(config)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("config tensorboard and saver")
    tensorboard_dir = "tensorboard/textrnn"
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()

    saver = tf.train.Saver()
    # `save_path` is a checkpoint prefix, not a directory: only its parent
    # directory has to exist.
    checkpoint_dir = os.path.dirname(save_path)
    if checkpoint_dir and not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Load and encode the training and validation data.
    x_train, y_train = generate_data(train_dir, word2index, cat2id, config.seq_length)
    x_val, y_val = generate_data(val_dir, word2index, cat2id, config.seq_length)

    writer = tf.summary.FileWriter(tensorboard_dir)
    try:
        # The context manager closes the session even when training is
        # interrupted (e.g. KeyboardInterrupt from the notebook).
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            writer.add_graph(sess.graph)

            print('Training and evaluating...')
            start_time = time.time()
            total_batch = 0             # batches processed so far
            best_acc_val = 0.0          # best validation accuracy seen
            last_improved = 0           # batch index of the last improvement
            require_improvement = 1000  # stop after this many batches without improvement

            stop_early = False
            for epoch in range(config.num_epochs):
                # Format into one string: print('Epoch:', n) shows a tuple on Python 2.
                print('Epoch: {}'.format(epoch + 1))
                for x_batch, y_batch in get_batch(x_train, y_train, config.batch_size):
                    feed_dict = feed_data(model, x_batch, y_batch, config.dropout_keep_prob)

                    if total_batch % config.save_per_batch == 0:
                        # Periodically write the training scalars to tensorboard.
                        s = sess.run(merged_summary, feed_dict=feed_dict)
                        writer.add_summary(s, total_batch)

                    if total_batch % config.print_per_batch == 0:
                        # Periodically report performance on the current
                        # training batch and on the validation set.
                        feed_dict[model.keep_prob] = 1.0
                        loss_train, acc_train = sess.run([model.loss, model.acc], feed_dict=feed_dict)
                        loss_val, acc_val = evaluate(sess, x_val, y_val, model)

                        if acc_val > best_acc_val:
                            # Keep the best model seen so far.
                            best_acc_val = acc_val
                            last_improved = total_batch
                            saver.save(sess=sess, save_path=save_path)
                            improved_str = '*'
                        else:
                            improved_str = ''

                        time_dif = get_time_dif(start_time)
                        msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                              + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                        print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

                    # Restore the training keep probability before the update step.
                    feed_dict[model.keep_prob] = config.dropout_keep_prob
                    sess.run(model.optim, feed_dict=feed_dict)
                    total_batch += 1

                    if total_batch - last_improved > require_improvement:
                        # Validation accuracy has not improved for a long time: stop.
                        print("No optimization for a long time, auto-stopping...")
                        stop_early = True
                        break
                if stop_early:
                    break
    finally:
        # Flush and release the summary writer on every exit path.
        writer.close()

In [67]:
# Clear the default graph first so that re-running this cell builds the model
# in a fresh graph, then run training.
tf.reset_default_graph()
train()

config tensorboard and saver
Training and evaluating...
('Epoch:', 1)
Iter:      0, Train Loss:    2.3, Train Acc:  12.50%, Val Loss:    2.3, Val Acc:   9.14%, Time: 0:00:25 *
Iter:    100, Train Loss:    1.0, Train Acc:  73.44%, Val Loss:    1.2, Val Acc:  57.76%, Time: 0:03:58 *
Iter:    200, Train Loss:   0.48, Train Acc:  82.81%, Val Loss:   0.76, Val Acc:  74.78%, Time: 0:07:31 *
Iter:    300, Train Loss:   0.46, Train Acc:  85.94%, Val Loss:   0.61, Val Acc:  81.90%, Time: 0:11:12 *


KeyboardInterrupt: 