In [122]:
import tensorflow as tf
import numpy as np

In [None]:
import sys
from collections import Counter

import numpy as np
import keras as kr
from gensim import models

class Datahelper(object):
    def __init__(self, config):
        self._dataSource = config.dataSource
        self._stopWordSource = config.stopWordSource  
        
        self._sequenceLength = config.sequenceLength  # 每条输入的序列处理为定长
        self._embeddingSize = config.model.embeddingSize
        self._batchSize = config.batchSize
        self._rate = config.rate
        
        self._stopWordDict = {}
        
        self.trainReviews = []
        self.trainLabels = []
        
        self.evalReviews = []
        self.evalLabels = []
        
        self.wordEmbedding =None
        
        self._wordToIndex = {}
        self._indexToWord = {}
        
    def _readData(self, filename):
        contents, labels = [], []
        with open(filename) as f:
            for line in f:
                try:
                    label, content = line.strip().split('\t')
                    if content:
                        contents.append(list((content)))
                        labels.append((label))
                except Exception as e:
                    print(e)
        return contents, labels

In [None]:
# data helper
import sys
from collections import Counter

import numpy as np
import keras as kr
from gensim import models

def read_file(filename):
    """读取文件数据"""
    contents, labels = [], []
    with open(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list((content)))
                    labels.append((label))
            except:
                pass
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """根据训练集构建词汇表，存储"""
    data_train, _ = read_file(train_dir)

    all_data = []
    for content in data_train:
        all_data.extend(content)

    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # 添加一个 <PAD> 来将所有文本pad为同一长度
    words = ['<PAD>'] + list(words)
    open_file(vocab_dir, mode='w').write('\n'.join(words) + '\n')


def read_vocab(vocab_dir):
    """读取词汇表"""
    # words = open_file(vocab_dir).read().strip().split('\n')
    with open_file(vocab_dir) as fp:
        # 如果是py2 则每个值都转化为unicode
        words = [(_.strip()) for _ in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    # word: id 把所有的汉字都id化了
    return words, word_to_id


def read_category():
    """读取分类目录，固定"""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']

    categories = [(x) for x in categories]

    cat_to_id = dict(zip(categories, range(len(categories))))

    return categories, cat_to_id


def to_words(content, words):
    """将id表示的内容转换为文字"""
    return ''.join(words[x] for x in content)


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """将文件转换为id表示"""
    contents, labels = read_file(filename)

    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])

    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # 将标签转换为one-hot表示

    return x_pad, y_pad

def loadWord2Vec(wv_filename, words):
    wordVec = models.KeyedVectors.load_word2vec_format(wv_filename, binary=False)
    vocab = []
    embd = []
    cnt = 0
    tmp = []
    
    # lookup the embedding
    for w in words:
        try:
            vocab.append(w)
            embd.append(wordVec.wv[w])   
        except Exception as e:
            print('error',e)
            print(w + "is not in pretained vector")
            tmp.append(w)
    
            
    # 获取wordvector的纬度，添加unk并赋值指定维度的0
    word_dim = len(embd[-1])
    vocab.append("UNK")
    embd.append(np.random.randn(word_dim))   
    vocab.append("<PAD>")
    embd.append(np.zeros(word_dim)) 
    for w in tmp:
        vocab.append(w)
        embd.append((np.random.randn(word_dim)))
    
    print ("loaded word2vec")
    print('total vocab', len(vocab))
    print(len(embd), 'word vectors found')
    return vocab,embd

def batch_iter(x, y, batch_size=64):
    """生成批次数据"""
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1

    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]


In [123]:
# tcnnconfig
class TCNNConfig(object):
    """CNN配置参数"""
    embedding_size = 64  # 词向量维度
    seq_length = 600  # 序列长度
    num_classes = 10  # 类别数
    num_filters = 256  # 卷积核数目
    kernel_size = 5  # 卷积核尺寸
    vocab_size = 5000  # 词汇表大小

    hidden_dim = 128  # 全连接层神经元

    dropout_keep_prob = 0.5  # dropout保留比例
    learning_rate = 1e-3  # 学习率

    batch_size = 64  # 每批训练大小
    num_epochs = 10  # 总迭代轮次

    print_per_batch = 100  # 每多少轮输出一次结果
    save_per_batch = 10  # 每多少轮存入tensorboard

    l2_reg_lambda = 0.1

In [1]:
# textcnn model define
# 实现思路：
# 将对x的标签化放在模型中完成，只将对y的label化放在外面。个人倾向于都放在模型中，后续进行修改
# embedding - cnn - maxpooling - fc - dropout - classify - result
class TextCNN(object):
    def __init__(self, config, embedding):
        self.config = config
        
        # placrholders for three inputs
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.l2_loss = tf.constant(0.0)
        
        # init cnn model different the original form, i prefer this one
        self.cnn()
    def cnn(self, ):
        # embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            # w is the embedding matrix thar we learn during training and use randomuniform distribution to initialize it
            # I use a pretrained vector to init the embedding weight
            W = tf.Variable(tf.cast(embedding, dtype=tf.float32, name = 'word2vec'), name = 'W')
            embedded_ouput = tf.nn.embedding_lookup(W, self.input_x)
            print('embedded_ouput', embedded_ouput)



            
        # define the cnn layer with max pooling
        # use 2d input instead of 4d so we could jsut conv1d, simplify the calculation of dimentsions
        with tf.name_scope('cnn'):
            # CNN layer
            conv = tf.layers.conv1d(embedded_ouput, self.config.num_filters, self.config.kernel_size, name='cnn_conv')
            # global max pooling layer
            # just get the max one on dimension 1
            gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')

        # add a fully-connect with a dropout
        
        with tf.name_scope("dropout", ):
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)
            
        with tf.name_scope("score", ):
            W = tf.get_variable(
                "W",
                shape=[self.config.hidden_dim, self.config.num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name="b")
            self.l2_loss += tf.nn.l2_loss(W)
            self.l2_loss += tf.nn.l2_loss(b)
            
            self.logits = tf.nn.xw_plus_b(fc, W, b, name="scores")
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # 预测类别
        
        with tf.name_scope("loss",):
            # loss function cross_entropy
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy) + self.config.l2_reg_lambda * self.l2_loss
            
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
        
        with tf.name_scope("accuracy", ):
            # 准确率
            correct_predictions = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"), name = "accuracy")

            
        

In [125]:
## train in jupyter
from __future__ import print_function
import os
import sys
import time
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics
from data.cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab


# 这个代码拆解了很多，最后才进行了训练
base_dir = 'data/subcnews'
train_dir = os.path.join(base_dir, 'subcnews.train.txt')
test_dir = os.path.join(base_dir, 'subcnews.test.txt')
val_dir = os.path.join(base_dir, 'subcnews.val.txt')
vocab_dir = os.path.join(base_dir, 'subcnews.vocab.txt')

save_dir = 'checkpoints/textcnn'
save_path = os.path.join(save_dir, 'best_validation')  # 最佳验证结果保存路径


def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


def feed_data(x_batch, y_batch, keep_prob):
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob,
    }
    return feed_dict


def evaluate(sess, x_, y_):
    """评估在某一数据上的准确率和损失"""
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len

    return total_loss / data_len, total_acc / data_len


def train():
    print("Configuring TensorBoard and Saver...")
    # 配置 Tensorboard，重新训练时，请将tensorboard文件夹删除，不然图会覆盖
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # 配置 Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # 载入训练集与验证集
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # 创建session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0  # 总批次
    best_acc_val = 0.0  # 最佳验证集准确率
    last_improved = 0  # 记录上一次提升批次
    require_improvement = 1000  # 如果超过1000轮未提升，提前结束训练

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # 每多少轮次将训练结果写入tensorboard scalar
                print('write reasult to tensorboard scaler')
                s = session.run(merged_summary, feed_dict=feed_dict)
#                     sess.run(embedding_init, feed_dict={embedding_placeholder: embedding})
#                 s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # 每多少轮次输出在训练集和验证集上的性能
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                if acc_val > best_acc_val:
                    # 保存最好结果
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            feed_dict[model.keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)  # 运行优化
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # 验证集正确率长期不提升，提前结束训练
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # 跳出循环
        if flag:  # 同上
            break

def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    

    
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # 读取保存的模型

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # 保存预测结果
    for i in range(num_batch):  # 逐批次处理
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # 评估
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # 混淆矩阵
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)




In [137]:
vocab[0]

'<PAD>'

In [139]:
config = TCNNConfig()
if not os.path.exists(vocab_dir):  # 如果不存在词汇表，重建
    build_vocab(train_dir, vocab_dir, config.vocab_size)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)

# read wordvec from pretrained one
filename = "wiki_100.utf8.txt"
vocab,embd = loadWord2Vec(filename, words)
embedding_dim = len(embd[0])
embedding = np.asarray(embd)
# 
config.vocab_size = len(vocab) # len(words)
config.embedding_size = len(embd[0])

error "word '<PAD>' not in vocabulary"
<PAD>is not in pretained vector
error "word '，' not in vocabulary"
，is not in pretained vector
error "word '' not in vocabulary"
is not in pretained vector
error "word '：' not in vocabulary"
：is not in pretained vector
error "word '' not in vocabulary"
is not in pretained vector
error "word '？' not in vocabulary"
？is not in pretained vector
error "word '；' not in vocabulary"
；is not in pretained vector
error "word '—' not in vocabulary"
—is not in pretained vector
error "word '）' not in vocabulary"
）is not in pretained vector
error "word '（' not in vocabulary"
（is not in pretained vector
error "word '' not in vocabulary"
is not in pretained vector
error "word '』' not in vocabulary"
』is not in pretained vector
error "word '『' not in vocabulary"
『is not in pretained vector
error "word '後' not in vocabulary"
後is not in pretained vector
error "word '摺' not in vocabulary"
摺is not in pretained vector
error "word '」' not in vocabulary"
」is not in pretain

  


In [None]:
# main func
tf.reset_default_graph() # 添加这句的目的是在jupyter notebook里面clean缓存，不然会出现变量重用的问题
print('Configuring CNN model...')

model = TextCNN(config, embedding)
train()

Configuring CNN model...
embedded_ouput Tensor("embedding/embedding_lookup:0", shape=(?, 600, 100), dtype=float32, device=/device:CPU:0)
Configuring TensorBoard and Saver...
Loading training and validation data...
Time usage: 0:00:13
Training and evaluating...
Epoch: 1
write reasult to tensorboard scaler
Iter:      0, Train Loss:    3.3, Train Acc:  12.50%, Val Loss:    3.3, Val Acc:   8.76%, Time: 0:00:11 *
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
Iter:    100, Train Loss:    1.6, Train Acc:  68.75%, Val Loss:    1.7, Val Acc:  64.24%, Time: 0:01:22 *
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult 

write reasult to tensorboard scaler
Iter:   1700, Train Loss:   0.16, Train Acc:  96.88%, Val Loss:   0.32, Val Acc:  91.88%, Time: 0:19:49 
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
Iter:   1800, Train Loss:   0.24, Train Acc:  93.75%, Val Loss:    0.3, Val Acc:  92.44%, Time: 0:20:57 
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
Iter:   1900, Train Loss:   0.16, 

In [None]:
def _getWordEmbedding(self, words):
        """
        按照我们的数据集中的单词取出预训练好的word2vec中的词向量
        """
        
        wordVec = gensim.models.KeyedVectors.load_word2vec_format("../word2vec/word2Vec.bin", binary=True)
        vocab = []
        wordEmbedding = []
        
        # 添加 "pad" 和 "UNK", 
        vocab.append("pad")
        vocab.append("UNK")
        wordEmbedding.append(np.zeros(self._embeddingSize))
        wordEmbedding.append(np.random.randn(self._embeddingSize))
        
        for word in words:
            try:
                vector = wordVec.wv[word]
                vocab.append(word)
                wordEmbedding.append(vector)
            except:
                print(word + "不存在于词向量中")
                
        return vocab, np.array(wordEmbedding)