# data_process

In [19]:
# -*- coding:utf-8 -*-
from collections import Counter
import tensorflow as tf
from tensorflow import keras
import numpy as np
import codecs
import jieba
import re


def read_file(filename):
    """
    Read a tab-separated news corpus and tokenise every article with jieba.

    Each line is expected to hold ``label<TAB>content``. Only the first tab
    separates the label from the body, so tabs inside the body are kept as
    part of the content instead of invalidating the whole line.

    Lines that cannot be parsed (for example lines without any tab) are
    reported and skipped. A label is only recorded together with its
    tokenised content, so both returned lists always have the same length
    and stay aligned index by index.

    :param filename: path of the UTF-8 encoded corpus file
    :return: ``(labels, contents)`` where ``labels`` is a list of label
             strings and ``contents`` is a list of token lists
    """
    # Runs of CJK ideographs, ASCII letters/digits and a few symbols are kept;
    # everything else (punctuation, whitespace, ...) acts as a separator.
    re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&._%]+)")
    contents, labels = [], []
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            try:
                line = line.rstrip()
                # maxsplit=1: a tab inside the article must not break unpacking
                label, content = line.split('\t', 1)
                word = []
                for block in re_han.split(content):
                    # split() with a capturing group also returns the
                    # separators; only segment the blocks that matched
                    if re_han.match(block):
                        word.extend(jieba.lcut(block))
            except Exception as err:
                # Best effort: report the offending line and carry on.
                # ``Exception`` (not a bare except) lets Ctrl-C still abort.
                print("数据读取出现错误！！!", "line", line_no, repr(err))
                continue

            labels.append(label)
            contents.append(word)
            if len(contents) % 1000 == 0:
                print("已经完成切割的news篇数为：", len(contents))
    return labels, contents


def build_vocab_vector(filenames, voc_size=10000):
    """
    Build the vocabulary and the matching pre-trained embedding matrix.

    Steps:
      1. tokenise every file in ``filenames`` with ``read_file`` and drop the
         stop words listed in ``./data/stopwords.txt``;
      2. keep the ``voc_size - 1`` most frequent remaining words;
      3. for each of those words that has a vector in
         ``./data/vector_word.txt`` write the word to
         ``./data/vocab_word.txt`` and its 100-dimensional vector to the
         next free row of the embedding matrix;
      4. save the matrix to ``./data/vector_word.npz`` under the key
         ``embeddings``.

    Row 0 of the matrix is left all-zero: ``get_word_id`` assigns id 0 to
    ``<PAD>`` and numbers the vocabulary file from 1, which is the same
    order the rows are filled in here. The saved matrix is trimmed to the
    rows actually used, so its row count equals the number of ids.

    :param filenames: iterable of corpus files to count words in
    :param voc_size: maximum vocabulary size including the padding id
    :return: None (results are written to disk)
    """
    # Every word vector is expected to have 100 components
    embedding_dim = 100

    with codecs.open('./data/stopwords.txt', 'r', encoding='utf-8') as stop_words:
        # a set gives O(1) membership tests in the filtering loop below
        stop = {word.rstrip() for word in stop_words}

    all_data = []
    for filename in filenames:
        print("读取" + filename + "的内容")
        _, contents = read_file(filename)
        for each_line in contents:
            # drop stop words
            all_data.extend(w for w in each_line if str(w) not in stop)
    print("已经对全部文件完成读取并且完成切词以及去停用词.....")

    counter = Counter(all_data)
    # Words still waiting for a vector; also copes with an empty corpus
    wanted = {w for w, _ in counter.most_common(voc_size - 1)}

    embeddings = np.zeros([voc_size, embedding_dim])
    j = 1  # next free row; row 0 stays zero for <PAD>

    with codecs.open('./data/vector_word.txt', 'r', encoding='utf-8') as f, \
            codecs.open('./data/vocab_word.txt', 'w', encoding='utf-8') as vocab_word:
        for each_line in f:
            item = each_line.rstrip().split(' ')
            key = item[0]
            if key not in wanted:
                continue
            vec = np.array(item[1:], dtype='float32')
            if vec.shape != (embedding_dim,):
                # e.g. a "<count> <dim>" header line: not a usable vector
                continue
            # a word listed twice in the vector file must not take two rows
            wanted.discard(key)
            embeddings[j] = vec
            vocab_word.write(key.strip('\r') + '\n')
            j += 1

    # Keep only <PAD> plus the rows that were filled so that the matrix has
    # exactly one row per id produced from vocab_word.txt
    np.savez_compressed('./data/vector_word.npz', embeddings=embeddings[:j])


def get_word_id(filename):
    """
    Load the vocabulary file and assign an integer id to every word.

    Id 0 is reserved for ``<PAD>``; the words are numbered from 1 in the
    order they appear in the file (one word per line). Line endings are
    removed whether the file uses ``\\n`` or ``\\r\\n``.

    :param filename: vocabulary file, ``vocab_word.txt`` in this notebook
    :return: dict mapping each word (and ``<PAD>``) to its id
    """
    word_id = {'<PAD>': 0}
    # ``with`` guarantees the handle is released even if decoding fails
    with codecs.open(filename, 'r', encoding='utf-8') as vocab_file:
        for w_index, w in enumerate(vocab_file, start=1):
            word_id[w.rstrip('\r\n')] = w_index
    return word_id


def read_category():
    """
    Return the fixed list of news categories and a lookup from each
    category name to its position in that list.

    :return: ``(categories, cat_to_id)``
    """
    categories = [
        '体育', '财经', '房产', '家居', '教育',
        '科技', '时尚', '时政', '游戏', '娱乐',
    ]
    cat_to_id = {name: idx for idx, name in enumerate(categories)}
    return categories, cat_to_id


def process(filename, word_to_id, cat_to_id, max_length=300):
    """
    Turn a corpus file into padded model inputs and one-hot labels.

    The file is read with ``read_file``; every token that is present in
    ``word_to_id`` is replaced by its id (unknown tokens are dropped) and
    every label is replaced by its id from ``cat_to_id``. Sequences are
    padded / truncated at the end to ``max_length``.

    The one-hot width is derived from ``cat_to_id`` rather than from the
    labels that happen to occur in this file, so every split produces a
    label matrix with the same number of columns.

    :param filename: corpus file to read
    :param word_to_id: dict mapping a word to its integer id
    :param cat_to_id: dict mapping a category label to its integer id
    :param max_length: length of every padded sequence
    :return: ``(x_pad, y_pad)`` — padded id sequences and one-hot labels
    """
    labels, contents = read_file(filename)

    data_id, label_id = [], []
    for label, words in zip(labels, contents):
        data_id.append([word_to_id[x] for x in words if x in word_to_id])
        label_id.append(cat_to_id[label])

    x_pad = keras.preprocessing.sequence.pad_sequences(data_id, max_length,
                                                       padding='post', truncating='post')
    # Highest id + 1 columns, independent of which classes appear in the file
    num_classes = max(cat_to_id.values()) + 1 if cat_to_id else None
    y_pad = keras.utils.to_categorical(label_id, num_classes=num_classes)

    return x_pad, y_pad


def get_word2vec(filename):
    """
    Load the array stored under the key ``embeddings`` in a ``.npz`` file.

    :param filename: path of the ``.npz`` archive
    :return: the ``embeddings`` array
    """
    archive = np.load(filename)
    try:
        return archive['embeddings']
    finally:
        # release the underlying file, as the context manager form would
        archive.close()


def batch_iter(x, y, batch_size=64):
    """
    Yield shuffled mini-batches of ``(x, y)``.

    Both arrays are reordered with the same random permutation, then cut
    into consecutive slices of at most ``batch_size`` rows; the last slice
    may be shorter.

    :param x: array of inputs, indexable with an index array
    :param y: array of targets with the same length as ``x``
    :param batch_size: maximum number of rows per batch
    :return: generator of ``(x_batch, y_batch)`` pairs
    """
    total = len(x)
    batch_count = int((total - 1) / batch_size) + 1
    order = np.random.permutation(np.arange(total))
    shuffled_x, shuffled_y = x[order], y[order]

    batch_no = 0
    while batch_no < batch_count:
        lo = batch_no * batch_size
        hi = min(lo + batch_size, total)
        yield shuffled_x[lo:hi], shuffled_y[lo:hi]
        batch_no += 1


def seq_length(x_batch):
    """
    Compute, for every row of ``x_batch``, the sum of the signs of its
    entries. For rows of non-negative ids this is the number of non-zero
    entries.

    :param x_batch: iterable of numeric rows
    :return: list with one value per row
    """
    return [np.sum(np.sign(row)) for row in x_batch]


# parameters

In [2]:
class Parameters(object):
    """Hyper-parameters and data locations shared by the whole notebook."""

    # ---- embedding ----
    embedding_dim = 100          # size of one word vector
    vocab_size = 10000           # number of word ids
    pre_training = None          # placeholder for the pre-trained embedding matrix

    # ---- network ----
    seq_length = 300             # length of every padded input sequence
    num_classes = 10             # number of target categories
    hidden_dim = 128             # LSTM units
    filter_size = [2, 3, 4]      # convolution window heights
    num_filters = 128            # filters per window height

    # ---- optimisation ----
    keep_prob = 0.5              # dropout keep probability
    learning_rate = 0.001
    lr_decay = 0.9               # learning rate decay
    clip = 9.0                   # gradient clipping threshold

    num_epochs = 3
    batch_size = 64

    # ---- files ----
    train_filename = './data/cnews.train.txt'        # train data
    test_filename = './data/cnews.test.txt'          # test data
    val_filename = './data/cnews.val.txt'            # val data
    vocab_name = './data/vocab_word.txt'             # vocabulary
    vector_word_filename = './data/vector_word.txt'  # vector_word trained by word2vec
    vector_word_npz = './data/vector_word.npz'       # save vector_word to numpy file

# model

In [3]:
class LstmCnn(object):
    """
    Text classifier: embedding -> LSTM -> parallel convolutions with
    max-over-time pooling -> dropout -> fully connected layer.

    All sizes are read from the module-level ``pm`` parameter object at
    construction time. The constructor builds the complete graph, including
    loss, optimizer and accuracy.
    """

    def __init__(self):
        """Create the input placeholders and build the graph."""
        self.input_x = tf.placeholder(tf.int32, shape=[None, pm.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, shape=[None, pm.num_classes], name='input_y')
        # number of real (non-padding) tokens of every sequence in the batch
        self.length = tf.placeholder(tf.int32, shape=[None], name='rnn_length')
        # dropout keep probability, fed per run (1.0 disables dropout)
        self.keep_pro = tf.placeholder(tf.float32, name='dropout')
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.lstm_cnn()

    def lstm_cnn(self):
        """Build embedding, LSTM, CNN, output, loss, optimizer and accuracy ops."""

        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            # initialised from the pre-trained matrix stored in pm.pre_training
            self.embedding = tf.get_variable('embedding', shape=[pm.vocab_size, pm.embedding_dim],
                                             initializer=tf.constant_initializer(pm.pre_training))
            embedding_input = tf.nn.embedding_lookup(self.embedding, self.input_x)

        with tf.name_scope('LSTM'):
            cell = tf.nn.rnn_cell.LSTMCell(pm.hidden_dim, state_is_tuple=True)
            # Use the placeholder, not the constant pm.keep_prob, so that the
            # value fed by the caller (1.0 in test()) really switches the
            # recurrent dropout off during evaluation.
            Cell = tf.contrib.rnn.DropoutWrapper(cell, self.keep_pro)
            output, _ = tf.nn.dynamic_rnn(cell=Cell, inputs=embedding_input,
                                          sequence_length=self.length, dtype=tf.float32)
            print("LSTM的输出层：", output)

        with tf.name_scope('CNN'):
            outputs = tf.expand_dims(output, -1)  # [batch_size, seq_length, hidden_dim, 1]
            print("测试经过expand的dim为：", outputs)
            pooled_outputs = []
            for i, filter_size in enumerate(pm.filter_size):
                # each filter spans `filter_size` time steps and the full hidden width
                filter_shape = [filter_size, pm.hidden_dim, 1, pm.num_filters]
                w = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='w')
                # one bias per filter, added after the convolution
                b = tf.Variable(tf.constant(0.1, shape=[pm.num_filters]), name='b')
                conv = tf.nn.conv2d(outputs, w, strides=[1, 1, 1, 1], padding='VALID', name='conv')
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')

                # pooling over every position the VALID convolution produced
                pooled = tf.nn.max_pool(h, ksize=[1, pm.seq_length-filter_size+1, 1, 1],
                                        strides=[1, 1, 1, 1], padding='VALID', name='pool')

                pooled_outputs.append(pooled)
            output_ = tf.concat(pooled_outputs, 3)
            print("CNN层的输出1为:", output_)
            # Flatten the pooled features for the fully connected layer; the
            # width follows pm.filter_size instead of assuming three sizes.
            num_filters_total = pm.num_filters * len(pm.filter_size)
            self.output = tf.reshape(output_, shape=[-1, num_filters_total])
            print("CNN层的输出2为:", self.output)

        with tf.name_scope('output'):
            out_final = tf.nn.dropout(self.output, keep_prob=self.keep_pro)
            # fully connected layer
            o_w = tf.Variable(tf.truncated_normal([num_filters_total, pm.num_classes], stddev=0.1), name='o_w')
            o_b = tf.Variable(tf.constant(0.1, shape=[pm.num_classes]), name='o_b')
            self.logits = tf.matmul(out_final, o_w) + o_b
            self.predict = tf.argmax(tf.nn.softmax(self.logits), 1, name='score')

        with tf.name_scope('loss'):
            # The labels are one-hot and the prediction is an argmax over a
            # softmax, i.e. exactly one class per sample, so the matching loss
            # is the softmax cross-entropy (the sigmoid variant treats every
            # class as an independent yes/no decision).
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)

        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(pm.learning_rate)
            gradients, variables = zip(*optimizer.compute_gradients(self.loss))
            # keep the weight updates within a reasonable range
            gradients, _ = tf.clip_by_global_norm(gradients, pm.clip)
            self.optimizer = optimizer.apply_gradients(zip(gradients, variables), global_step=self.global_step)

        with tf.name_scope('accuracy'):
            correct = tf.equal(self.predict, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name='accuracy')

    def feed_data(self, x_batch, y_batch, real_seq_len, keep_pro):
        """
        Assemble the feed dict for one run.

        :param x_batch: batch of padded id sequences
        :param y_batch: batch of one-hot labels
        :param real_seq_len: per-sequence lengths for the LSTM
        :param keep_pro: dropout keep probability for this run
        :return: dict mapping the model placeholders to the given values
        """
        feed_dict = {self.input_x: x_batch,
                     self.input_y: y_batch,
                     self.length: real_seq_len,
                     self.keep_pro: keep_pro}
        return feed_dict

    def test(self, sess, x, y):
        """
        Evaluate the model on ``(x, y)`` with dropout disabled.

        The loss and accuracy of every batch are weighted by the batch size
        and averaged, so the result describes the whole data set rather than
        whichever shuffled batch happened to come last.

        :param sess: session holding the model variables
        :param x: padded id sequences
        :param y: one-hot labels
        :return: ``(mean_loss, mean_accuracy)``; ``(0.0, 0.0)`` for empty input
        """
        loss_sum, accuracy_sum, seen = 0.0, 0.0, 0
        for x_batch, y_batch in batch_iter(x, y, batch_size=pm.batch_size):
            batch_len = len(x_batch)
            if batch_len == 0:
                # batch_iter yields one empty batch for empty input
                continue
            real_seq_len = seq_length(x_batch)
            feed_dict = self.feed_data(x_batch, y_batch, real_seq_len, 1.0)
            batch_loss, batch_accuracy = sess.run([self.loss, self.accuracy], feed_dict=feed_dict)
            loss_sum += batch_loss * batch_len
            accuracy_sum += batch_accuracy * batch_len
            seen += batch_len

        if seen == 0:
            return 0.0, 0.0
        return loss_sum / seen, accuracy_sum / seen

# main function

In [4]:
import os
pm = Parameters()
filenames = [pm.train_filename, pm.test_filename, pm.val_filename]
categories, cat_to_id = read_category()
# The vocabulary file and the embedding archive are produced together; rebuild
# both when either one is missing, otherwise loading the archive below fails.
if not (os.path.exists(pm.vocab_name) and os.path.exists(pm.vector_word_npz)):
    build_vocab_vector(filenames, pm.vocab_size)
word_ids = get_word_id(pm.vocab_name)
# The model sizes its embedding variable from the number of ids actually in use
pm.vocab_size = len(word_ids)
# Keep exactly one embedding row per id; rows past the vocabulary are unused
pm.pre_training = get_word2vec(pm.vector_word_npz)[:pm.vocab_size]

# build the model

In [5]:
# Instantiate the network. The constructor builds the whole graph and prints
# the intermediate LSTM / CNN tensors, which is the output shown below.
model = LstmCnn()

LSTM的输出层： Tensor("LSTM/rnn/transpose_1:0", shape=(?, 300, 128), dtype=float32)
测试经过expand的dim为： Tensor("CNN/ExpandDims:0", shape=(?, 300, 128, 1), dtype=float32)
CNN层的输出1为: Tensor("CNN/concat:0", shape=(?, 1, 1, 384), dtype=float32)
CNN层的输出2为: Tensor("CNN/Reshape:0", shape=(?, 384), dtype=float32)


# train the model

In [6]:
tensorboard_dir = './tensorboard/lstm-cnn'
save_dir = './checkpoints/lstm-cnn'
# Create both output directories when missing. exist_ok keeps the cell safe to
# re-run. (The checkpoint directory used to be created only if it already
# existed, so it was never created at all.)
os.makedirs(tensorboard_dir, exist_ok=True)
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, 'best_validation')

# Scalars recorded for TensorBoard
tf.summary.scalar('loss', model.loss)
tf.summary.scalar('accuracy', model.accuracy)
merged_summary = tf.summary.merge_all()
writer = tf.summary.FileWriter(tensorboard_dir)
saver = tf.train.Saver()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
writer.add_graph(sess.graph)

# Pad to the same length the model's input placeholder was built with
print("preparing the training data...")
x_train, y_train = process(pm.train_filename, word_ids, cat_to_id, max_length=pm.seq_length)
print("preparing the testing data...")
x_test, y_test = process(pm.test_filename, word_ids, cat_to_id, max_length=pm.seq_length)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\sunc\AppData\Local\Temp\jieba.cache


preparing the training data...


Loading model cost 0.691 seconds.
Prefix dict has been built successfully.


已经完成切割的news篇数为： 1000
已经完成切割的news篇数为： 2000
已经完成切割的news篇数为： 3000
已经完成切割的news篇数为： 4000
已经完成切割的news篇数为： 5000
已经完成切割的news篇数为： 6000
已经完成切割的news篇数为： 7000
已经完成切割的news篇数为： 8000
已经完成切割的news篇数为： 9000
已经完成切割的news篇数为： 10000
已经完成切割的news篇数为： 11000
已经完成切割的news篇数为： 12000
已经完成切割的news篇数为： 13000
已经完成切割的news篇数为： 14000
已经完成切割的news篇数为： 15000
已经完成切割的news篇数为： 16000
已经完成切割的news篇数为： 17000
已经完成切割的news篇数为： 18000
已经完成切割的news篇数为： 19000
已经完成切割的news篇数为： 20000
已经完成切割的news篇数为： 21000
已经完成切割的news篇数为： 22000
已经完成切割的news篇数为： 23000
已经完成切割的news篇数为： 24000
已经完成切割的news篇数为： 25000
已经完成切割的news篇数为： 26000
已经完成切割的news篇数为： 27000
已经完成切割的news篇数为： 28000
已经完成切割的news篇数为： 29000
已经完成切割的news篇数为： 30000
已经完成切割的news篇数为： 31000
已经完成切割的news篇数为： 32000
已经完成切割的news篇数为： 33000
已经完成切割的news篇数为： 34000
已经完成切割的news篇数为： 35000
数据读取出现错误！！!
已经完成切割的news篇数为： 36000
已经完成切割的news篇数为： 37000
已经完成切割的news篇数为： 38000
已经完成切割的news篇数为： 39000
已经完成切割的news篇数为： 40000
已经完成切割的news篇数为： 41000
已经完成切割的news篇数为： 42000
已经完成切割的news篇数为： 43000
已经完成切割的news篇数为： 44000
已经完成切割的news篇数为： 45000
已经完成切割的

In [20]:
def train():
    """
    Train ``model`` for ``pm.num_epochs`` epochs on ``x_train`` / ``y_train``.

    Uses the notebook-level ``sess``, ``saver``, ``writer``, ``merged_summary``
    and ``save_path``. After every training step the merged summary is written
    to TensorBoard; every 100 steps the model is evaluated on the test data
    and the metrics are printed; a checkpoint is saved whenever the global
    step is a multiple of the number of batches per epoch.
    """
    for epoch in range(pm.num_epochs):
        print('Epoch:', epoch+1)
        num_batchs = int((len(x_train) - 1) / pm.batch_size) + 1
        batch_train = batch_iter(x_train, y_train, batch_size=pm.batch_size)
        for x_batch, y_batch in batch_train:
            real_seq_len = seq_length(x_batch)
            feed_dict = model.feed_data(x_batch, y_batch, real_seq_len, pm.keep_prob)
            _, global_step, _summary, train_loss, train_accuracy = sess.run([model.optimizer, model.global_step, merged_summary,
                                                                                model.loss, model.accuracy], feed_dict=feed_dict)
            # the summary was evaluated above but never reached the event file
            writer.add_summary(_summary, global_step)

            if global_step % 100 == 0:
                # `sess` is the session created with the model; the name
                # `session` used here before is not defined anywhere
                test_loss, test_accuracy = model.test(sess, x_test, y_test)
                print('global_step:', global_step, 'train_loss:', train_loss, 'train_accuracy:', train_accuracy,
                      'test_loss:', test_loss, 'test_accuracy:', test_accuracy)

            if global_step % num_batchs == 0:
                print('Saving Model...')
                saver.save(sess, save_path, global_step=global_step)

        # NOTE(review): the optimizer was constructed with the float value
        # pm.learning_rate had at graph-build time, so this update presumably
        # does not reach the existing optimizer — confirm and wire the decay
        # into the graph if it is meant to take effect.
        pm.learning_rate *= pm.lr_decay

In [None]:
# Run the training loop defined above; it relies on the session, data and model
# objects created by the earlier cells.
train()

Epoch: 1
