In [122]:
import tensorflow as tf
import numpy as np

In [123]:
class TCNNConfig(object):
    """CNN配置参数"""
    embedding_size = 64  # 词向量维度
    seq_length = 600  # 序列长度
    num_classes = 10  # 类别数
    num_filters = 256  # 卷积核数目
    kernel_size = 5  # 卷积核尺寸
    vocab_size = 5000  # 词汇表大小

    hidden_dim = 128  # 全连接层神经元

    dropout_keep_prob = 0.5  # dropout保留比例
    learning_rate = 1e-3  # 学习率

    batch_size = 64  # 每批训练大小
    num_epochs = 10  # 总迭代轮次

    print_per_batch = 100  # 每多少轮输出一次结果
    save_per_batch = 10  # 每多少轮存入tensorboard

    l2_reg_lambda = 0.1

In [124]:
# 实现思路：
# 将对x的标签化放在模型中完成，只将对y的label化放在外面。个人倾向于都放在模型中，后续进行修改
# embedding - cnn - maxpooling - fc - dropout - classify - result
class TextCNN(object):
    def __init__(self, config, embedding):
        self.config = config
        
        # placrholders for three inputs
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.l2_loss = tf.constant(0.0)
        
        # init cnn model different the original form, i prefer this one
        self.cnn()
    def cnn(self, ):
        # embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            # w is the embedding matrix thar we learn during training and use randomuniform distribution to initialize it
            # I use a pretrained vector to init the embedding weight
            W = tf.Variable(tf.cast(embedding, dtype=tf.float32, name = 'word2vec'), name = 'W')
            embedded_ouput = tf.nn.embedding_lookup(W, self.input_x)
            print('embedded_ouput', embedded_ouput)



            
        # define the cnn layer with max pooling
        # use 2d input instead of 4d so we could jsut conv1d, simplify the calculation of dimentsions
        with tf.name_scope('cnn'):
            # CNN layer
            conv = tf.layers.conv1d(embedded_ouput, self.config.num_filters, self.config.kernel_size, name='cnn_conv')
            # global max pooling layer
            # just get the max one on dimension 1
            gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')

        # add a fully-connect with a dropout
        
        with tf.name_scope("dropout", ):
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)
            
        with tf.name_scope("score", ):
            W = tf.get_variable(
                "W",
                shape=[self.config.hidden_dim, self.config.num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name="b")
            self.l2_loss += tf.nn.l2_loss(W)
            self.l2_loss += tf.nn.l2_loss(b)
            
            self.logits = tf.nn.xw_plus_b(fc, W, b, name="scores")
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # 预测类别
        
        with tf.name_scope("loss",):
            # loss function cross_entropy
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy) + self.config.l2_reg_lambda * self.l2_loss
            
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
        
        with tf.name_scope("accuracy", ):
            # 准确率
            correct_predictions = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"), name = "accuracy")

            
        

In [131]:
# main func
tf.reset_default_graph() # 添加这句的目的是在jupyter notebook里面clean缓存，不然会出现变量重用的问题
print('Configuring CNN model...')

model = TextCNN(config, embedding)
train()

Configuring CNN model...


TypeError: Expected binary or unicode string, got array([ 8.71970e-02, -8.34350e-02,  5.79560e-02,  1.43120e-01,
       -6.80000e-05,  1.23272e-01,  2.24390e-02, -3.23317e-01,
       -2.31756e-01, -9.26200e-03, -2.11264e-01, -2.16980e-02,
        2.46367e-01,  1.79090e-01,  5.41170e-02,  7.76380e-02,
       -5.45550e-02, -5.06300e-02,  7.23610e-02,  1.03788e-01,
        2.40412e-01, -3.86661e-01,  9.60130e-02, -1.92248e-01,
       -1.17727e-01,  2.29500e-03,  4.58870e-02,  3.01900e-03,
        1.68361e-01,  9.55230e-02, -1.97418e-01,  1.15341e-01,
        2.14799e-01, -2.12581e-01,  9.59420e-02, -6.13190e-02,
        8.91710e-02, -8.79190e-02,  1.13341e-01,  6.62700e-03,
       -5.92500e-03, -2.00036e-01, -1.63850e-02,  9.37650e-02,
        3.74700e-02, -9.29840e-02, -2.07072e-01, -3.88760e-02,
       -8.49360e-02, -2.07676e-01, -1.01227e-01, -1.27119e-01,
       -5.60400e-02, -9.51780e-02,  1.58290e-01,  7.62910e-02,
        2.62668e-01,  1.46720e-01,  1.41091e-01, -1.34479e-01,
       -2.13100e-01, -5.11230e-02,  1.43333e-01, -9.53900e-03,
       -4.88910e-02,  1.54130e-02,  9.26400e-03,  1.97350e-02,
       -1.22203e-01, -2.62570e-02,  5.64420e-02,  1.26553e-01,
        9.76150e-02,  3.88840e-02, -1.81495e-01,  1.75468e-01,
       -4.12880e-02,  3.29550e-02,  6.53140e-02,  1.49118e-01,
       -1.73668e-01,  1.53916e-01, -1.18243e-01,  3.89850e-02,
       -9.44480e-02,  6.70180e-02, -1.04510e-01,  3.18760e-02,
       -1.90216e-01, -1.35015e-01,  9.43600e-03, -5.22100e-03,
        3.38130e-02,  5.82360e-02, -1.36879e-01, -4.01890e-02,
        1.12950e-01,  4.51140e-02, -8.46500e-02,  1.11534e-01],
      dtype=float32)

In [125]:
## train in jupyter
from __future__ import print_function
import os
import sys
import time
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics
from data.cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab


# 这个代码拆解了很多，最后才进行了训练
base_dir = 'data/cnews'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')

save_dir = 'checkpoints/textcnn'
save_path = os.path.join(save_dir, 'best_validation')  # 最佳验证结果保存路径


def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


def feed_data(x_batch, y_batch, keep_prob):
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob,
    }
    return feed_dict


def evaluate(sess, x_, y_):
    """评估在某一数据上的准确率和损失"""
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len

    return total_loss / data_len, total_acc / data_len


def train():
    print("Configuring TensorBoard and Saver...")
    # 配置 Tensorboard，重新训练时，请将tensorboard文件夹删除，不然图会覆盖
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # 配置 Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # 载入训练集与验证集
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # 创建session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0  # 总批次
    best_acc_val = 0.0  # 最佳验证集准确率
    last_improved = 0  # 记录上一次提升批次
    require_improvement = 1000  # 如果超过1000轮未提升，提前结束训练

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # 每多少轮次将训练结果写入tensorboard scalar
                print('write reasult to tensorboard scaler')
                s = session.run(merged_summary, feed_dict=feed_dict)
#                     sess.run(embedding_init, feed_dict={embedding_placeholder: embedding})
#                 s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # 每多少轮次输出在训练集和验证集上的性能
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                if acc_val > best_acc_val:
                    # 保存最好结果
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            feed_dict[model.keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)  # 运行优化
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # 验证集正确率长期不提升，提前结束训练
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # 跳出循环
        if flag:  # 同上
            break


def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    

    
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # 读取保存的模型

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # 保存预测结果
    for i in range(num_batch):  # 逐批次处理
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # 评估
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # 混淆矩阵
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)




In [136]:
from gensim import models

def loadWord2Vec(wv_filename, words):
    wordVec = models.KeyedVectors.load_word2vec_format(wv_filename, binary=False)
    vocab = []
    embd = []
    cnt = 0
    tmp = []
    
    # lookup the embedding
    for w in words:
        try:
            vocab.append(w)
            embd.append(wordVec.wv[w])   
        except Exception as e:
            print('error',e)
            print(w + "is not in pretained vector")
            tmp.append(w)
    
            
    # 获取wordvector的纬度，添加unk并赋值指定维度的0
    word_dim = len(embd[-1])
    vocab.append("UNK")
    embd.append(np.random.randn(word_dim))   
    vocab.append("<PAD>")
    embd.append(np.zeros(word_dim)) 
    for w in tmp:
        vocab.append(w)
        embd.append((np.random.randn(word_dim)))
    
    print ("loaded word2vec")
    print('total vocab', len(vocab))
    print(len(embd), 'word vectors found')
    return vocab,embd

In [137]:
vocab[0]

'<PAD>'

In [139]:
config = TCNNConfig()
if not os.path.exists(vocab_dir):  # 如果不存在词汇表，重建
    build_vocab(train_dir, vocab_dir, config.vocab_size)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)

# read wordvec from pretrained one
filename = "wiki_100.utf8.txt"
vocab,embd = loadWord2Vec(filename, words)
embedding_dim = len(embd[0])
embedding = np.asarray(embd)
# 
config.vocab_size = len(vocab) # len(words)
config.embedding_size = len(embd[0])

error "word '<PAD>' not in vocabulary"
<PAD>is not in pretained vector
error "word '，' not in vocabulary"
，is not in pretained vector
error "word '' not in vocabulary"
is not in pretained vector
error "word '：' not in vocabulary"
：is not in pretained vector
error "word '' not in vocabulary"
is not in pretained vector
error "word '？' not in vocabulary"
？is not in pretained vector
error "word '；' not in vocabulary"
；is not in pretained vector
error "word '—' not in vocabulary"
—is not in pretained vector
error "word '）' not in vocabulary"
）is not in pretained vector
error "word '（' not in vocabulary"
（is not in pretained vector
error "word '' not in vocabulary"
is not in pretained vector
error "word '』' not in vocabulary"
』is not in pretained vector
error "word '『' not in vocabulary"
『is not in pretained vector
error "word '後' not in vocabulary"
後is not in pretained vector
error "word '摺' not in vocabulary"
摺is not in pretained vector
error "word '」' not in vocabulary"
」is not in pretain

  


In [140]:
# main func
tf.reset_default_graph() # 添加这句的目的是在jupyter notebook里面clean缓存，不然会出现变量重用的问题
print('Configuring CNN model...')

model = TextCNN(config, embedding)
train()

Configuring CNN model...
embedded_ouput Tensor("embedding/embedding_lookup:0", shape=(?, 600, 100), dtype=float32, device=/device:CPU:0)
Configuring TensorBoard and Saver...
Loading training and validation data...
Time usage: 0:00:13
Training and evaluating...
Epoch: 1
write reasult to tensorboard scaler
Iter:      0, Train Loss:    3.3, Train Acc:  12.50%, Val Loss:    3.3, Val Acc:   8.76%, Time: 0:00:11 *
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
Iter:    100, Train Loss:    1.6, Train Acc:  68.75%, Val Loss:    1.7, Val Acc:  64.24%, Time: 0:01:22 *
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult 

write reasult to tensorboard scaler
Iter:   1700, Train Loss:   0.16, Train Acc:  96.88%, Val Loss:   0.32, Val Acc:  91.88%, Time: 0:19:49 
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
Iter:   1800, Train Loss:   0.24, Train Acc:  93.75%, Val Loss:    0.3, Val Acc:  92.44%, Time: 0:20:57 
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
Iter:   1900, Train Loss:   0.16, 

write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
Iter:   3500, Train Loss:  0.054, Train Acc: 100.00%, Val Loss:   0.25, Val Acc:  93.60%, Time: 0:41:36 
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
Iter:   3600, Train Loss:   0.06, Train Acc: 100.00%, Val Loss:   0.25, Val Acc:  93.68%, Time: 0:42:50 
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scaler
write reasult to tensorboard scale

In [141]:
test()

Loading test data...
INFO:tensorflow:Restoring parameters from checkpoints/textcnn/best_validation
Testing...
Test Loss:   0.16, Test Acc:  96.20%
Precision, Recall and F1-Score...
             precision    recall  f1-score   support

         体育       1.00      0.99      0.99      1000
         财经       0.97      0.99      0.98      1000
         房产       1.00      1.00      1.00      1000
         家居       0.98      0.88      0.92      1000
         教育       0.87      0.96      0.91      1000
         科技       0.92      0.99      0.95      1000
         时尚       0.97      0.96      0.97      1000
         时政       0.96      0.92      0.94      1000
         游戏       0.99      0.96      0.98      1000
         娱乐       0.98      0.97      0.97      1000

avg / total       0.96      0.96      0.96     10000

Confusion Matrix...
[[992   0   0   0   4   1   0   3   0   0]
 [  0 992   1   0   2   1   0   4   0   0]
 [  0   0 996   0   2   2   0   0   0   0]
 [  1  13   0 875  42  27  10  

In [None]:
def _getWordEmbedding(self, words):
        """
        按照我们的数据集中的单词取出预训练好的word2vec中的词向量
        """
        
        wordVec = gensim.models.KeyedVectors.load_word2vec_format("../word2vec/word2Vec.bin", binary=True)
        vocab = []
        wordEmbedding = []
        
        # 添加 "pad" 和 "UNK", 
        vocab.append("pad")
        vocab.append("UNK")
        wordEmbedding.append(np.zeros(self._embeddingSize))
        wordEmbedding.append(np.random.randn(self._embeddingSize))
        
        for word in words:
            try:
                vector = wordVec.wv[word]
                vocab.append(word)
                wordEmbedding.append(vector)
            except:
                print(word + "不存在于词向量中")
                
        return vocab, np.array(wordEmbedding)