# CNN Base分类器

## 库导入

In [1]:
import sys
import time
import os
import re
import numpy as np
import pandas as pd
import pickle
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import keras as kr


# Live-updating progress line
def print_flush(print_string):
    """Write ``print_string`` followed by a carriage return, then flush stdout.

    Ending with a carriage return instead of a newline lets the next call
    overwrite the same console line, which is how the progress displays in
    this notebook are drawn.

    Args:
        print_string: Object to display; it is converted with ``str``.
    """
    sys.stdout.write(str(print_string) + '\r')
    sys.stdout.flush()

# Session whose GPU memory allocation grows on demand
def get_session():
    """Create a TensorFlow session with ``gpu_options.allow_growth`` enabled.

    Returns:
        A new ``tf.Session`` built from a ``tf.ConfigProto`` whose
        ``allow_growth`` GPU option is set to True.
    """
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    return tf.Session(config=session_config)

# Dataset names this notebook can be pointed at; the chosen name is used as a
# directory component in every data/model/result path built below.
predictor_set = ["THUCNews", "TouTiao"]
# Dataset used for this run (index 1 selects "TouTiao").
name_of_predictor = predictor_set[1]

Instructions for updating:
non-resource variables are not supported in the long term


Using TensorFlow backend.


## 导入数据

### 路径导入

In [2]:
def getFilePath(rootDir):
    """Recursively collect the paths of all files below ``rootDir``.

    Files named ``.DS_Store`` are skipped. Paths are returned in the order
    produced by ``os.walk``.

    Args:
        rootDir: Directory to walk.

    Returns:
        List of file paths, each joined from the walked directory and the
        file name. Empty when ``os.walk`` yields nothing.
    """
    collected = []
    for parent, _subdirs, names in os.walk(rootDir):
        for name in names:
            if name == ".DS_Store":
                continue
            collected.append(os.path.join(parent, name))
    return collected

# Collect every sample file of the selected dataset under ./DataSets/<dataset name>.
filePath_list = getFilePath("./DataSets/" + name_of_predictor)
# Cell output: number of files found.
len(filePath_list)

382688

### 文件内容读取

In [12]:
def getFile(filePath):
    """Return the full text of the UTF-8 encoded file at ``filePath``.

    Args:
        filePath: Path of the file to read.

    Returns:
        The file content as a single string.
    """
    with open(filePath, mode='r', encoding='utf8') as handle:
        return handle.read()

# Load every file, normalise whitespace and keep the first `sequence_length`
# characters of each document.
# NOTE: this cell needs `filePath_list` from the path-import cell; the output
# recorded below it is a NameError from running it in a kernel where that cell
# had not been executed.
sequence_length = 600
sample_quantity = len(filePath_list)
startTime = time.time()
content_list = []
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
whitespace_pattern = re.compile(r'\s+')
for index, filePath in enumerate(filePath_list, 1):
    # Read the text
    fileContent = getFile(filePath)
    # Collapse every run of whitespace into a single space
    fileContent_1 = whitespace_pattern.sub(' ', fileContent)
    # Truncate to the maximum length
    fileContent_2 = fileContent_1[:sequence_length]
    content_list.append(fileContent_2)

    # Progress bar: refreshed every 100 files and on the last file
    if index % 100 == 0 or index == sample_quantity:
        percent = index / sample_quantity * 100
        percent_int = int(percent)
        half_percent_int = int(percent_int / 2)
        usedTime = time.time() - startTime
        # Guard against a zero elapsed time on clocks with coarse resolution.
        speed = index / usedTime if usedTime > 0 else float('inf')
        string_0 = "{0} / {1} ".format(index, sample_quantity)
        string_1 = '>' * half_percent_int + ' ' * (50 - half_percent_int)
        string_2 = " Percentage: {:.2f}%".format(percent)
        string_3 = " Process speed: {:.2f} files/sec".format(speed)
        string_4 = " Total time: {:.2f} Seconds.".format(usedTime)
        print_string = string_0 + string_1 + string_2 + string_3 + string_4
        print_flush(print_string)

NameError: name 'filePath_list' is not defined

### 文件内容序列化保存

In [7]:
# Make sure the content_list folder exists
dir_name = "./" + name_of_predictor + "/content_list"
os.makedirs(dir_name, exist_ok=True)

# Serialise the truncated documents so later sessions can skip the raw file scan
pickleFilePath = dir_name + "/content_list.pickle"
with open(pickleFilePath, mode='wb') as file:
    pickle.dump(content_list, file)
print("Content serialization finished.")

Content serialization finished.


### 文件内容加载

In [2]:
# Reload the document list written by the serialization cell above.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
pickleFilePath = "./" + name_of_predictor + "/content_list/content_list.pickle"
with open(pickleFilePath, 'rb') as file:
    content_list = pickle.load(file)
print("Content load finished.")

Content load finished.


### 样本标签列表获取

In [8]:
def get_label_list(rootDir, file_paths=None):
    """Derive one label per file from its first directory level below ``rootDir``.

    The previous version ignored ``rootDir`` and took ``path.split('/')[3]``,
    which only worked for roots exactly two directories deep written with
    ``/`` separators. The label is now the first component of the file path
    relative to ``rootDir``, which gives the same result for that layout and
    also works for other root depths and path separators.

    Args:
        rootDir: Dataset root whose immediate sub-directories name the classes.
        file_paths: Paths to label. Defaults to the module-level
            ``filePath_list`` so existing one-argument calls keep working.

    Returns:
        List of label strings, in the same order as the paths.
    """
    if file_paths is None:
        file_paths = filePath_list
    label_list = []
    for filePath in file_paths:
        relative_path = os.path.relpath(filePath, rootDir)
        label_list.append(relative_path.split(os.sep)[0])
    return label_list

label_list = get_label_list("./DataSets/" + name_of_predictor)
print('Label length:', len(label_list))
# Per-class sample counts. Series.value_counts replaces the deprecated
# top-level pd.value_counts helper.
pd.Series(label_list).value_counts()

Label length: 382688


科技    41543
娱乐    39396
体育    37568
汽车    35785
游戏    29300
文化    28031
金融    27085
教育    27058
世界    26909
军事    24984
旅游    21422
农业    19322
房产    17672
民生     6273
股票      340
dtype: int64

### 样本标签列表序列化保存

In [9]:
# Persist the label list next to the other per-dataset artefacts.
pickleFilePath = './' + name_of_predictor + '/label_list.pickle'

with open(pickleFilePath, 'wb') as file:
    pickle.dump(label_list, file)
print("Label list serialization finished.")

Label list serialization finished.


### 样本标签列表读取

In [3]:
# Reload the label list written by the serialization cell above.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./' + name_of_predictor + '/label_list.pickle', 'rb') as file:
    label_list = pickle.load(file)
print("Label list load finished.")

Label list load finished.


## 词汇表建立

### 词汇表构建和存储

In [10]:
from collections import Counter

def get_word_list(content_list, size):
    """Build a character vocabulary from the most frequent characters.

    Every document is counted character by character. The vocabulary starts
    with the ``'PAD'`` entry, followed by the ``size - 1`` most common
    characters in descending frequency. Progress is reported through
    ``print_flush`` every 1000 documents and on the last one.

    Args:
        content_list: Iterable of document strings (must support ``len``).
        size: Target vocabulary size, including the ``'PAD'`` entry.

    Returns:
        List of vocabulary entries, ``'PAD'`` first.
    """
    began_at = time.time()
    char_counter = Counter()
    total = len(content_list)
    done = 0
    for text in content_list:
        done += 1
        char_counter.update(text)
        if done % 1000 != 0 and done != total:
            continue
        progress = '{:d}/ {:d}'.format(done, total)
        progress += ' Processed: {:.2f}%'.format(done / total * 100)
        progress += ' Time cost: {:.2f}秒'.format(time.time() - began_at)
        print_flush(progress)
    frequent_chars = [char for char, _count in char_counter.most_common(size - 1)]
    return ['PAD'] + frequent_chars


# Vocabulary size, counting the 'PAD' entry that get_word_list puts first.
vocabulary_size = 7000
word_list = get_word_list(content_list, vocabulary_size)

382688/ 382688 Processed: 100.00% Time cost: 3.97秒

In [11]:
# Persist the vocabulary so later sessions can skip rebuilding it.
with open('./' + name_of_predictor + '/word_list.pickle', 'wb') as file:
    pickle.dump(word_list, file)
print("Words list saved.")

Words list saved.


### 词汇表读取

In [4]:
import pickle

# Reload the vocabulary written by the cell above.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./' + name_of_predictor + '/word_list.pickle', 'rb') as file:
    word_list = pickle.load(file)
print("Words list loaded.")

Words list loaded.


## 训练模型准备

### 输入X文字序列转id序列

In [5]:
# Maximum number of characters converted per document.
sequence_length = 600
# Lookup table from vocabulary entry to its integer id (its position in word_list)
word2id_dict = dict([(b, a) for a, b in enumerate(word_list)])

# Single document -> list of ids
def get_id_list(index):
    """Convert one document into a list of vocabulary ids.

    Args:
        index: Either the document text itself (``str``) or a position in
            the module-level ``content_list``.

    Returns:
        One id per character for at most the first ``sequence_length``
        characters; characters missing from ``word2id_dict`` map to 0.
    """
    if isinstance(index, str):
        text = index
    else:
        text = content_list[index]
    return [word2id_dict.get(char, 0) for char in text[:sequence_length]]

# Several documents -> padded id matrix
def get_X(indexes):
    """Convert several documents into one id matrix.

    Args:
        indexes: Iterable of items accepted by ``get_id_list``.

    Returns:
        The result of Keras ``pad_sequences`` applied to the per-document id
        lists with a target length of ``sequence_length``.
    """
    id_lists = list(map(get_id_list, indexes))
    return kr.preprocessing.sequence.pad_sequences(id_lists, sequence_length)

### 输出Y标签One-Hot编码

In [6]:
from sklearn.preprocessing import LabelEncoder

# Fit the label <-> integer id mapping on every label of the dataset.
labelEncoder = LabelEncoder()
labelEncoder.fit(label_list)
# Number of distinct label categories
category_quantity = labelEncoder.classes_.shape[0]

# One-hot encode labels
def get_Y(indexes):
    """Return the one-hot encoded labels of the selected samples.

    Args:
        indexes: Iterable of positions into the module-level ``label_list``.

    Returns:
        The output of ``kr.utils.to_categorical`` for the encoded class ids,
        with ``category_quantity`` classes.
    """
    chosen_labels = []
    for i in indexes:
        chosen_labels.append(label_list[i])
    # LabelEncoder maps the label strings to integer class ids.
    class_ids = labelEncoder.transform(chosen_labels)
    return kr.utils.to_categorical(class_ids, category_quantity)

### 对不同标签类别分权重
让每个类别被抽中的总概率相同：样本少的类别，其单个样本的抽样权重更高；样本多的类别，其单个样本的抽样权重更低。这样小类别也能得到充分训练。

In [7]:
def get_probability_list(label_list):
    """Compute per-sample sampling probabilities that balance the classes.

    Every class receives the same total probability ``1 / number_of_classes``,
    split evenly among its samples, so the returned values sum to 1 for a
    non-empty input.

    Args:
        label_list: Sequence of labels, one per sample.

    Returns:
        List with one probability per entry of ``label_list`` (same order).
        An empty input yields an empty list instead of a ZeroDivisionError.
    """
    if len(label_list) == 0:
        return []
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    count_series = pd.Series(list(label_list)).value_counts()
    category_quantity = len(count_series)
    category_weights = 1 / category_quantity
    # Share of one sample = class share / number of samples in that class
    per_sample_weight = category_weights / count_series
    label2weights_dict = dict(zip(per_sample_weight.index, per_sample_weight))
    return [label2weights_dict[label] for label in label_list]

### 批量数据生成器线程

In [8]:
import threading
from sklearn.model_selection import train_test_split

# Split sample positions (not the data itself) into train and test sets;
# random_state is fixed so the same split is produced on every run.
sample_quantity = len(label_list)
index_1d_array = np.arange(sample_quantity)
train_index_1d_array, test_index_1d_array = train_test_split(index_1d_array, random_state=2019)
# Class-balanced sampling probabilities, computed on the training labels only.
train_label_list = [label_list[k] for k in train_index_1d_array]
train_probability_list = get_probability_list(train_label_list)
# Number of samples drawn per training batch.
batch_size = 128


class BatchDataThread(threading.Thread):
    """Background worker that keeps ``queue`` supplied with training batches.

    Each batch is drawn from ``train_index_1d_array`` with the class-balanced
    probabilities in ``train_probability_list`` and put on the queue as an
    ``(int32 X, float32 Y)`` tuple. The thread starts itself on construction.

    The earlier loop condition read the private ``Thread._is_stopped``
    attribute, which is only set after ``run`` has returned, so the loop
    could never end, and the non-daemon thread kept the interpreter alive.
    The worker is now a daemon thread with an explicit ``stop()``.

    Args:
        queue: Queue-like object providing ``qsize()`` and ``put()``.
    """

    def __init__(self, queue):
        # daemon=True: a still-running worker must not block interpreter exit.
        super(BatchDataThread, self).__init__(daemon=True)
        self.queue = queue
        self._stop_event = threading.Event()
        self.start()

    def stop(self):
        """Ask the worker loop to exit after its current iteration."""
        self._stop_event.set()

    def run(self):
        """Produce batches until ``stop()`` is called."""
        while not self._stop_event.is_set():
            # Only produce while fewer than 4 batches are waiting.
            if self.queue.qsize() < 4:
                selected_indexes = np.random.choice(
                    train_index_1d_array, size=batch_size, p=train_probability_list)
                batch_X = get_X(selected_indexes)
                batch_Y = get_Y(selected_indexes)
                put_tuple = batch_X.astype('int32'), batch_Y.astype('float32')
                self.queue.put(put_tuple)
            # Same short pause as before, but it ends early when stop() is called.
            self._stop_event.wait(0.0001)

### 批量数据生成器类

In [9]:
import queue

class BatchDataGenerator(object):
    """Endless iterator over batches produced by background worker threads.

    Construction creates a shared queue and ``worker_quantity``
    ``BatchDataThread`` workers feeding it; ``next()`` blocks until a batch
    is available and returns it.
    """

    def __init__(self, worker_quantity=4):
        self.queue = queue.Queue()
        for _ in range(worker_quantity):
            # Each worker is handed the shared queue.
            BatchDataThread(self.queue)

    def __iter__(self):
        return self

    def __next__(self):
        return self.queue.get()
    
    
# Creating the generator immediately starts the default 4 worker threads.
batchData_generator = BatchDataGenerator()

## 搭建神经网络

In [10]:
# Build the TF1 graph: embedding -> three parallel 1-D convolutions
# (kernel sizes 3, 5, 7) -> concat -> max over the sequence axis -> dense ->
# ReLU -> dense logits -> softmax. N below denotes the batch dimension.
tf.reset_default_graph()
sequence_length = 600
# Inputs: id sequences and one-hot labels
X_holder = tf.placeholder(tf.int32, [None, sequence_length])
Y_holder = tf.placeholder(tf.float32, [None, category_quantity])
data_0 = X_holder # N * 600
vocabulary_size = 7000
embedding_size = 100
# Trainable embedding table: vocabulary_size * embedding_size
layer_1 = tf.get_variable('embedding', [vocabulary_size, embedding_size])
data_1 = tf.nn.embedding_lookup(layer_1, data_0) # N * 600 * 100
# Number of filters per convolution branch
filter_quantiy = 128 
layer_2 = tf.layers.conv1d # kernel size 3, 128 filters
data_2 = layer_2(data_1, filter_quantiy, 3, padding='SAME') # N * 600 * 128
layer_3 = tf.layers.conv1d # kernel size 5, 128 filters
data_3 = layer_3(data_1, filter_quantiy, 5, padding='SAME') # N * 600 * 128
layer_4 = tf.layers.conv1d # kernel size 7, 128 filters
data_4 = layer_4(data_1, filter_quantiy, 7, padding='SAME') # N * 600 * 128
# Concatenate the three branches along the channel axis
layer_5 = tf.concat
data_5 = layer_5([data_2, data_3, data_4], axis=2) # N * 600 * 384
# Max over the sequence axis (axis 1)
layer_6 = tf.reduce_max
data_6 = layer_6(data_5, [1]) # N * 384
layer_7 = tf.layers.dense # 384 -> 128
fc1_units = 128
data_7 = layer_7(data_6, fc1_units) # N * 128
layer_8 = tf.nn.relu
data_8 = layer_8(data_7) # N * 128
# Output layer: one logit per label category
layer_9 = tf.layers.dense
data_9 = layer_9(data_8, category_quantity) # N * category_quantity
# Class probabilities, used for prediction
layer_10 = tf.nn.softmax
data_10 = layer_10(data_9) # N * category_quantity
# Cross-entropy is computed from the logits (data_9), not from the softmax output
layer_11 = tf.nn.softmax_cross_entropy_with_logits_v2
data_11 = layer_11(labels=Y_holder, logits=data_9) # N
loss = tf.reduce_mean(data_11) # scalar
learning_rate = 5e-4
optimizer = tf.train.AdamOptimizer(learning_rate)
train = optimizer.minimize(loss)
# Fraction of samples whose arg-max prediction equals the arg-max of the one-hot label
isCorrect = tf.equal(tf.argmax(Y_holder, 1), tf.argmax(data_10, 1))
accuracy = tf.reduce_mean(tf.cast(isCorrect, tf.float32))

Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.Dense instead.


## 模型训练

### 参数初始化

In [18]:
# Create a session and initialise all graph variables; any variable values
# previously held in `session` are replaced by the new session's fresh ones.
init = tf.global_variables_initializer()
session = get_session()
session.run(init)

### 迭代训练

In [19]:
# Number of optimisation steps (one batch from the generator per step).
train_steps = 5000
startTime = time.time()
for step in range(1, train_steps+1):
    batch_X, batch_Y = next(batchData_generator)
    session.run(train, {X_holder:batch_X, Y_holder:batch_Y})
    # Every second step, report loss/accuracy on the batch just trained on.
    # This is a second session.run on the same batch, executed after the update.
    if step % 2 == 0 :
        loss_value, accuracy_value = session.run([loss, accuracy], {X_holder:batch_X, Y_holder:batch_Y})
        usedTime = time.time() - startTime
        speed = step / usedTime
        print_string = 'Step:{0} Loss:{1:.4f} Acc:{2:.4f} Train Speed:{3:.2f} steps/sec'.format(step, loss_value, accuracy_value, speed)
        print_flush(print_string)

Step:5000 Loss:0.1789 Acc:0.9609 Train Speed:0.55 steps/sec

### 随机测试

In [13]:
import warnings
# NOTE(review): this silences every Python warning for the rest of the kernel
# session, not just for this cell.
warnings.filterwarnings("ignore")

def predict(input_content):
    """Classify one text and return its best and second-best labels.

    Args:
        input_content: Text to classify (anything accepted by ``get_id_list``).

    Returns:
        Tuple ``(label, label_latent)``: the label with the highest softmax
        probability and the label with the second-highest one.
    """
    # Encode the text and pad/cut it to the network input length
    padded_ids = kr.preprocessing.sequence.pad_sequences(
        [get_id_list(input_content)], sequence_length)
    # Evaluate the softmax output (data_10) to get the class probabilities
    probabilities = session.run(data_10, {X_holder: padded_ids})
    # Index of the largest probability per row
    best_ids = np.argmax(probabilities, axis=1)
    # Second-largest probability: second from last in ascending sort order
    runner_up_id = np.argsort(np.max(probabilities, axis=0))[-2]
    # Map the class ids back to label strings
    best_label = labelEncoder.inverse_transform(best_ids)[0]
    runner_up_label = labelEncoder.inverse_transform([runner_up_id])[0]
    return best_label, runner_up_label

# Spot check: classify one randomly chosen sample from the test split
# (the choice is unseeded, so a different sample is picked on each run).
selected_index = np.random.choice(test_index_1d_array, 1)[0]
selected_content = content_list[selected_index]
true_label = label_list[selected_index]
# predict returns (best label, second-best label)
predict_label = predict(selected_content)

print("Selected text content: " + selected_content)
print("True label: " + true_label)
print("Predict label: " + predict_label[0])
print("Predict latent label: " + predict_label[1])

Selected text content: 张曼玉向汶川地震灾区捐款200万元，是个人捐款最多的女明星 
True label: 娱乐
Predict label: 世界
Predict latent label: 娱乐


## 评估分析

### 混淆矩阵

In [14]:
from sklearn.metrics import confusion_matrix

def predict_test():
    """Predict a label for every sample of the test split.

    The test indices in ``test_index_1d_array`` are processed in batches of
    100. Only the arg-max class id of each batch is kept, which gives the same
    result as taking the arg-max over all stored probability rows but uses
    less memory. An empty test split no longer raises a NameError on the
    elapsed-time variable.

    Returns:
        The result of ``labelEncoder.inverse_transform`` on the predicted
        class ids, in the order of ``test_index_1d_array``.
    """
    startTime = time.time()
    test_sample_quantity = len(test_index_1d_array)
    # Local evaluation batch size (independent of the training batch_size)
    eval_batch_size = 100
    usedTime = 0.0
    predicted_id_batches = []

    for start in range(0, test_sample_quantity, eval_batch_size):
        part_index_1d_array = test_index_1d_array[start: start + eval_batch_size]
        batch_X = get_X(part_index_1d_array)
        predict_Y = session.run(data_10, {X_holder: batch_X})
        predicted_id_batches.append(np.argmax(predict_Y, axis=1))
        usedTime = time.time() - startTime
        # Report the number of samples finished so far, not the batch offset.
        processed = min(start + eval_batch_size, test_sample_quantity)
        print_string = "{0} / {1} Cost time: {2:.2f} seconds.".format(processed, test_sample_quantity, usedTime)
        print_flush(print_string)

    if predicted_id_batches:
        y = np.concatenate(predicted_id_batches)
    else:
        # Nothing to predict: still emit one status line and return no labels.
        y = np.array([], dtype=np.int64)
        print_string = "{0} / {1} Cost time: {2:.2f} seconds.".format(0, test_sample_quantity, usedTime)
        print_flush(print_string)

    predict_label_list = labelEncoder.inverse_transform(y)

    return predict_label_list

test_label_list = [label_list[i] for i in test_index_1d_array]
predict_label_list = predict_test()

# labels=... pins the rows and columns to the encoder's class order, so the
# matrix always matches the DataFrame axes even if a class never appears in
# the test labels or the predictions.
cm = pd.DataFrame(
    confusion_matrix(test_label_list, predict_label_list, labels=labelEncoder.classes_),
    columns=labelEncoder.classes_, index=labelEncoder.classes_)

# Save the matrix under ./results/<dataset name>/
results_dir = "./results/" + name_of_predictor
os.makedirs(results_dir, exist_ok=True)
cm.to_csv(results_dir + '/' + name_of_predictor + '_CNNTensor_CM.csv')

cm

209019 / 209019 Cost time: 548.24 seconds.

Unnamed: 0,体育,娱乐,家居,彩票,情感,房产,教育,时尚,时政,游戏,社会,科技,股票,财经
体育,32790,111,20,76,2,2,43,13,40,4,40,22,13,3
娱乐,142,22169,123,2,9,16,127,117,71,36,188,124,17,22
家居,11,34,7860,0,5,24,26,23,4,5,26,59,23,10
彩票,20,4,2,1852,0,0,0,0,2,0,12,1,0,1
情感,1,0,2,1,898,0,3,2,0,0,0,0,0,0
房产,4,10,76,1,2,4675,16,4,16,2,53,22,70,48
教育,10,23,23,2,2,4,10261,10,55,7,84,38,9,11
时尚,2,15,43,0,0,0,13,3180,2,4,4,16,1,2
时政,56,53,36,6,0,34,268,22,14537,11,261,175,135,69
游戏,9,17,11,0,0,1,13,8,5,5836,10,196,5,2


### 报告表

In [15]:
from sklearn.metrics import precision_recall_fscore_support

def eval_model(y_true, y_pred, labels):
    """Build a per-class and overall precision/recall/F1/support report.

    The report is also written to
    ``./results/<dataset>/<dataset>_CNNTensor_PRFS.csv``.

    ``labels`` is now passed to ``precision_recall_fscore_support`` so the
    metric arrays always have one entry per label, in the given order.
    Previously a class missing from both ``y_true`` and ``y_pred`` produced
    arrays shorter than ``labels`` and the DataFrame construction failed.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        labels: Class labels to report, in output row order.

    Returns:
        DataFrame with columns Label, Precision, Recall, F1 and Support: one
        row per class, then a support-weighted overall row at index 999.
    """
    # Per-class precision, recall, F1 and support
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, labels=labels)
    # Overall values: support-weighted averages, and the total support
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)
    res1 = pd.DataFrame({
        u'Label': labels,
        u'Precision': p,
        u'Recall': r,
        u'F1': f1,
        u'Support': s
    })
    res2 = pd.DataFrame({
        u'Label': ['总体'],
        u'Precision': [tot_p],
        u'Recall': [tot_r],
        u'F1': [tot_f1],
        u'Support': [tot_s]
    })
    res2.index = [999]
    res = pd.concat([res1, res2])
    # Make sure the output folder exists even if no earlier cell created it.
    results_dir = "./results/" + name_of_predictor
    os.makedirs(results_dir, exist_ok=True)
    res.to_csv(results_dir + '/' + name_of_predictor + '_CNNTensor_PRFS.csv')
    return res[['Label', 'Precision', 'Recall', 'F1', 'Support']]

# Report on the test split, using the labels and predictions from the confusion-matrix cell.
eval_model(test_label_list, predict_label_list, labelEncoder.classes_)

Unnamed: 0,Label,Precision,Recall,F1,Support
0,体育,0.98914,0.988276,0.988708,33179
1,娱乐,0.974376,0.957087,0.965654,23163
2,家居,0.915124,0.969174,0.941374,8110
3,彩票,0.916832,0.977825,0.946346,1894
4,情感,0.972914,0.990077,0.981421,907
5,房产,0.937062,0.935187,0.936123,4999
6,教育,0.889939,0.973622,0.929902,10539
7,时尚,0.925764,0.968921,0.946851,3282
8,时政,0.92193,0.928111,0.92501,15663
9,游戏,0.903266,0.954687,0.928265,6113


## 复用

### 模型保存

注意：运行下面的单元会覆盖已保存的模型文件，请确认模型训练完成后再执行，不要随意点击！

In [47]:
# Save the current session's variables as a checkpoint; running this again
# writes to the same checkpoint path.
# NOTE(review): presumably the ./<dataset>/trained_model directory must already exist — confirm.
saver = tf.train.Saver()
modelFilePath = "./" + name_of_predictor + "/trained_model/CNN_Tensorflow.ckpt"
saver.save(session, modelFilePath)

'./TouTiao/trained_model/CNN_Tensorflow.ckpt'

### 模型加载

In [11]:
# Restore the variables from the checkpoint into a fresh session.
# The graph-building cell must have been run first so the Saver has variables to restore.
saver = tf.train.Saver()
session = get_session()
modelFilePath = "./" + name_of_predictor + "/trained_model/CNN_Tensorflow.ckpt"
saver.restore(session, modelFilePath)

INFO:tensorflow:Restoring parameters from ./TouTiao/trained_model/CNN_Tensorflow.ckpt


## 应用预测

### 即时输入预测

In [14]:
input_content = """
这轮改变，是从它关停周播剧场“青春进行时”开始的。
许多观众没有注意到的是，这个历时2015年到2021年六年时间
曾播出过《旋风少女》、《漂亮的李慧珍》、《流星花园》、《楚乔传》、《择天记》等爆款青春偶像剧的电视剧播放剧场，已经从湖南卫视悄然消失。
"""

# Run the model once and reuse both returned labels instead of predicting twice.
predicted_label, latent_label = predict(input_content)
print("Predict: " + predicted_label)
print("Latent predict: " + latent_label)

Predict: 娱乐
Latent predict: 文化


### 新浪热搜即时预测

In [15]:
import requests

# Browser-like User-Agent sent with the hot-search page request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_0) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/86.0.4240.198 Safari/537.36'}
# A timeout stops the cell from hanging forever; raise_for_status surfaces
# HTTP errors instead of silently parsing an error page.
html = requests.get('https://s.weibo.com/top/summary?cate=realtimehot', headers=headers, timeout=10)
html.raise_for_status()
page = html.content.decode()

# Each match is a (rank, link path, title) tuple; only the title is used below.
res = re.findall(
    r'<tr class="">\s*<td class="td-01 ranktop">(\d+)<\/td>\s*<td class="td-02">\s*<a href="\/('
    r'.*?)&.*?target="_blank">(.*?)<\/a>', page, re.S)

title = [i[2] for i in res]
# Classify each title once and split the (best, second-best) pairs afterwards.
predictions = [predict(t) for t in title]
pred = [p[0] for p in predictions]
pred_latent = [p[1] for p in predictions]


dataFrame = pd.DataFrame({
        u'热搜标题': title,
        u'预测结果': pred,
        u'备用结果': pred_latent
    })
# 1-based ranks sized to the number of matches; a fixed range(50) raised
# whenever the page did not yield exactly 50 rows.
dataFrame.index = range(1, len(title) + 1)

dataFrame[['热搜标题', '预测结果', '备用结果']]
    

Unnamed: 0,热搜标题,预测结果,备用结果
1,新垣结衣星野源结婚,娱乐,军事
2,互联网人一分钟之内痛失两个老婆,科技,娱乐
3,中国疫苗接种剂次全球第一,农业,金融
4,石原里美或将退圈从政,旅游,文化
5,关晓彤 植物肉,娱乐,农业
6,三句话让男人为我花18万,民生,娱乐
7,印度一男子在树上隔离11天,世界,旅游
8,兵马俑也忍不住出雪糕了,旅游,军事
9,工信部点名要求下架APP仍可下载,科技,金融
10,南昌杀妻抛尸案将择期宣判,娱乐,世界
