## 0.导入库

### 0.1 常用的库

In [1]:
import os
import numpy as np
import time

### 0.2 需要使用的库

In [2]:
import pandas as pd
import pickle
import re

### 0.3 基本方法

In [3]:
import sys
# 实时更新进度条
def print_flush(print_string):
    print(print_string, end='\r')
    sys.stdout.flush()

# 导入深度学习库tensorflow    
import tensorflow as tf    
#  获取显存动态增长的会话 
def get_session():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    return session

## 1. 文件内容的列表

### 1.1 文本文件路径的列表

In [24]:
def get_txtFilePath_list(root_dirPath):
    txtFilePath_list = []
    sub_dirName_list = next(os.walk(root_dirPath))[1]
    for sub_dirName in sub_dirName_list:
        sub_dirPath = os.path.join(root_dirPath, sub_dirName)
        fileName_list = next(os.walk(sub_dirPath))[2]
        filePath_list = [os.path.join(sub_dirPath, k) for k in fileName_list]
        txtFilePath_list.extend(filePath_list)
    return txtFilePath_list


root_dirPath = '../resources/THUCNews/'
txtFilePath_list = get_txtFilePath_list(root_dirPath)
print('文本文件路径的列表长度:', len(txtFilePath_list))

文本文件路径的列表长度: 836075


### 1.2 读取所有文本文件

In [5]:
def get_fileContent(txtFilePath):
    with open(txtFilePath, 'r', encoding='utf8') as file:
        fileContent = file.read()
    return fileContent


sequence_length = 600
sample_quantity = len(txtFilePath_list)
startTime = time.time()
content_list = []
for i in range(sample_quantity):
    txtFilePath = txtFilePath_list[i]
    fileContent = get_fileContent(txtFilePath)
    fileContent_1 = re.sub('\s+', ' ', fileContent)
    fileContent_2 = fileContent_1[:sequence_length]
    content_list.append(fileContent_2)
    # 打印提示信息，动态刷新进度条
    index = i + 1
    if index % 100 == 0 or index==sample_quantity:
        percent = index / sample_quantity * 100
        percent_int = int(percent)
        half_percent_int = int(percent_int / 2)
        string_0 = '%d/ %d ' %(index, sample_quantity)
        string_1 = '>' * half_percent_int + ' ' * (50-half_percent_int)
        string_2 = ' 进度百分比:%.2f%%' %percent
        usedTime = time.time() - startTime
        string_3 = ' 读取速度:%.2f文件/秒' %(index/usedTime)
        string_4 = ' 总共花费时间:%.2f秒' %(usedTime)
        print_string = string_0 + string_1 + string_2 + string_3 + string_4
        print_flush(print_string)

836075/ 836075 >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 进度百分比:100.00% 读取速度:1430.98文件/秒 总共花费时间:584.27秒

### 1.3 把文件内容列表保存为pickle文件

In [6]:
pickleFilePath = '../resources/content_list.pickle'
with open(pickleFilePath, 'wb') as file:
    pickle.dump(content_list, file)

### 1.4 从pickle文件加载文件内容列表

In [7]:
pickleFilePath = '../resources/content_list.pickle'
with open(pickleFilePath, 'rb') as file:
    content_list = pickle.load(file)

## 2. 样本标签的列表

### 2.1 获取样本标签的列表

In [8]:
def get_label_list(root_dirPath):
    label_list = []
    sub_dirName_list = next(os.walk(root_dirPath))[1]
    for sub_dirName in sub_dirName_list:
        sub_dirPath = os.path.join(root_dirPath, sub_dirName)
        fileName_list = next(os.walk(sub_dirPath))[2]
        part_label_list = [sub_dirName] * len(fileName_list)
        label_list.extend(part_label_list)
    return label_list


root_dirPath = '../resources/THUCNews/'
label_list = get_label_list(root_dirPath)
print('样本标签的列表长度:', len(label_list)) 
pd.value_counts(label_list)

样本标签的列表长度: 836075


科技    162929
股票    154398
体育    131604
娱乐     92632
时政     63086
社会     50849
教育     41936
财经     37098
家居     32586
游戏     24373
房产     20050
时尚     13368
彩票      7588
星座      3578
dtype: int64

## 3. 字列表

### 3.1 根据文件内容列表，统计计数获得出现次数排名前6999的字
#### 排名7000以后的字统一用'PAD'表示

In [9]:
from collections import Counter

def get_word_list(content_list, size):
    startTime = time.time()
    counter = Counter()
    sample_quantity = len(content_list)
    for i, content in enumerate(content_list, 1):
        counter.update(content)
        if i%1000==0 or i==sample_quantity:
            string_0 = '%d/ %d' %(i, sample_quantity)
            string_1 = ' 进度百分比: %.2f%%' %(i/sample_quantity*100)
            usedTime = time.time() - startTime
            string_2 = ' 花费时间: %.2f秒' %usedTime
            print_string = string_0 + string_1 + string_2
            print_flush(print_string)
    word_list_1 = [k[0] for k in counter.most_common(size-1)]
    word_list = ['PAD'] + word_list_1
    return word_list


vocabulary_size = 7000
word_list = get_word_list(content_list, vocabulary_size)

836075/ 836075 进度百分比: 100.00% 花费时间: 77.56秒

### 3.2 把字列表保存为pickle文件

In [10]:
pickleFilePath = '../resources/word_list.pickle'
with open(pickleFilePath, 'wb') as file:
    pickle.dump(word_list, file)

### 3.3 从pickle文件加载字列表

In [11]:
pickleFilePath = '../resources/word_list.pickle'
with open(pickleFilePath, 'rb') as file:
    word_list = pickle.load(file)

## 4.数据准备

### 4.1 get_X

In [12]:
import keras

# sequence_length中文叫做序列长度，根据自己经验设置
# sequence_length设置为600，即根据文章的前600字则可判断文章类型
sequence_length = 600
word2index_dict = dict([(word, index) for index, word in enumerate(word_list)])


def get_index_list(content):
    index_list = []
    for word in content[:sequence_length]:
        if word in word2index_dict:
            index = word2index_dict[word]
            index_list.append(index)
        else:
            index_list.append(0)
    return index_list       

def get_X(part_content_list):
    indexList_list = [get_index_list(k) for k in part_content_list]
    X = keras.preprocessing.sequence.pad_sequences(indexList_list, sequence_length)        
    return X

Using TensorFlow backend.


### 4.2 get_Y

In [13]:
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
labelEncoder.fit(label_list)
category_quantity = labelEncoder.classes_.shape[0]

def get_Y(part_label_list):
    oneHot_2d_array = labelEncoder.transform(part_label_list)
    Y = keras.utils.to_categorical(oneHot_2d_array, category_quantity)
    return Y

### 4.3 使用带权重的抽样策略，计算每个样本的权重

In [14]:
def get_probability_list(label_list):
    count_series = pd.value_counts(label_list)
    category_quantity = len(count_series)
    category_weights = 1 / category_quantity
    label2weights_dict = dict([(a, b) for a, b in zip(count_series.index, category_weights/count_series)])
    probability_list = [label2weights_dict[k] for k in label_list]
    return probability_list

### 4.4 批量数据生产者线程

In [15]:
import threading
from sklearn.model_selection import train_test_split

sample_quantity = len(label_list)
index_1d_array = np.arange(sample_quantity)
train_index_1d_array, test_index_1d_array = train_test_split(index_1d_array, random_state=2019)
train_label_list = [label_list[k] for k in train_index_1d_array]
train_probability_list = get_probability_list(train_label_list)
batch_size = 128


class BatchDataThread(threading.Thread):
    def __init__(self, queue):
        super(BatchDataThread, self).__init__()
        self.queue = queue
        self.start()
    
    def run(self):
        while not self._is_stopped:
            if self.queue.qsize() < 3:
                selected_indexes = np.random.choice(
                    train_index_1d_array, size=batch_size, p=train_probability_list)
                part_content_list = [content_list[k] for k in selected_indexes]
                part_label_list = [label_list[k] for k in selected_indexes]
                batch_X = get_X(part_content_list)
                batch_Y = get_Y(part_label_list)
                put_tuple = batch_X.astype('int32'), batch_Y.astype('float32')
                self.queue.put(put_tuple)
            time.sleep(0.0001)            

### 4.5 批量数据生成器类

In [16]:
import queue

class BatchDataGenerator(object):
    def __init__(self, worker_quantity=3):
        self.queue = queue.Queue()
        for i in range(worker_quantity):
            BatchDataThread(self.queue)
            
    def __iter__(self):
        return self
    
    def __next__(self):
        batch_data = self.queue.get()
        return batch_data
    
    
batchData_generator = BatchDataGenerator()    

## 5.搭建神经网络

In [17]:
tf.reset_default_graph()
X_holder = tf.placeholder(tf.int32, [None, sequence_length])
Y_holder = tf.placeholder(tf.float32, [None, category_quantity])
data_0 = X_holder
vocabulary_size = 7000
embedding_size = 300
layer_1 = tf.get_variable('embedding', [vocabulary_size, embedding_size])
data_1 = tf.nn.embedding_lookup(layer_1, data_0)
filter_quantiy = 256
filter_size = 3
layer_2 = tf.layers.conv1d
data_2 = layer_2(data_1, filter_quantiy, filter_size, padding='SAME')
layer_3 = tf.reduce_max
data_3 = layer_3(data_2, [1])
layer_4 = tf.layers.dense
fc1_units = 128
data_4 = layer_4(data_3, fc1_units)
layer_5 = tf.nn.relu
data_5 = layer_5(data_4)
layer_6 = tf.layers.dense
data_6 = layer_6(data_5, category_quantity)
layer_7 = tf.nn.softmax
data_7 = layer_7(data_6)
layer_8 = tf.nn.softmax_cross_entropy_with_logits_v2
data_8 = layer_8(labels=Y_holder, logits=data_6)
loss = tf.reduce_mean(data_8)
learning_rate = 5e-4
optimizer = tf.train.AdamOptimizer(learning_rate)
train = optimizer.minimize(loss)
isCorrect = tf.equal(tf.argmax(Y_holder, 1), tf.argmax(data_7, 1))
accuracy = tf.reduce_mean(tf.cast(isCorrect, tf.float32))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.conv1d instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


## 6.参数初始化

In [18]:
init = tf.global_variables_initializer()
session = get_session()
session.run(init)

## 7.模型训练

In [19]:
train_steps = 10000
for step in range(1, train_steps+1):
    batch_X, batch_Y = next(batchData_generator)
    session.run(train, {X_holder:batch_X, Y_holder:batch_Y})
    if step % 2 == 0 :
        loss_value, accuracy_value = session.run([loss, accuracy], {X_holder:batch_X, Y_holder:batch_Y})
        print_string = 'step:%d loss:%.4f accuracy:%.4f' %(step, loss_value, accuracy_value)
        print_flush(print_string)

step:10000 loss:0.1018 accuracy:0.9531

## 8.模型测试

In [20]:
import warnings
warnings.filterwarnings("ignore")
def predict(input_content):
    part_content_list = [input_content]
    X = get_X(part_content_list)
    Y = session.run(data_7, {X_holder:X})
    y = np.argmax(Y, axis=1)
    label = labelEncoder.inverse_transform(y)[0]
    return label

selected_index = np.random.choice(test_index_1d_array, 1)[0]
selected_content = content_list[selected_index]
true_label = label_list[selected_index]
predict_label = predict(selected_content)
print('选出文本内容为: ', selected_content)
print('真实标签: ', true_label)
print('预测标签: ', predict_label, '\n')
print('对于任意文本做分类预测，例如:')
input_content = "足球篮球"
print('predict("%s") :' %input_content, predict(input_content))

选出文本内容为:  韩国通过灰姑娘法 凌晨后青少年网游将强制断网 首尔市某精神病院院长金铉秀在接受韩国媒体采访时表示：“少年儿童游戏中毒的后果非常严重，其发展顺序一般为：丧失学习兴趣→价值观与父母冲突→沉迷游戏→同父母矛盾激化→(中学·高中时期)向父母施暴、离家出走、盗窃等，这样的事例非常之多。可以说，90%以上的青少年问题都与不良网络游戏有关。” 本报驻首尔记者王刚 近日，旨在限制青少年深夜进行网络游戏的《青少年保护法》修正案在韩国国会会议高票获得通过，并有望最早在今年11月份开始正式施行，此举一出，不但在韩国国内引发大规模讨论，而且也引来各国媒体的围观和热议。那么，韩国政府此番为何会“痛下杀手”？在发展网游产业与保护青少年之间，韩政府又是如何艰难却坚定地寻找平衡点的？ 国会通过“灰姑娘法” 禁止青少年深夜网游 4月29日下午，韩国国会对限制青少年深夜上网打游戏的《青少年保护法》(修正案因在子夜12点后将对未成年人网游强制断网，故俗称“灰姑娘法”)进行审议。摆在各位议员面前的有两个提议案：一个是禁止未满16岁的青少年在午夜12点至凌晨6点在线游戏；另一个是大国家党国会议员辛智浩提出的将年龄提高到19岁。最后经过激烈辩论，最终韩国国会法制委员会通过了16周岁这个修正案。参与投票的210名议员中，赞成117票，反对63票，弃权30票。而辛智浩提出的修正案则遭到否决。 按照规定，修正案将在通过后6个月(即
真实标签:  科技
预测标签:  时政 

对于任意文本做分类预测，例如:
predict("足球篮球") : 家居


## 9.混淆矩阵

In [21]:
from sklearn.metrics import confusion_matrix

def predict_test():
    startTime = time.time()
    test_sample_quantity = len(test_index_1d_array)
    batch_size  = 100
    predict_Y_list = []
    for i in range(0, test_sample_quantity, batch_size):
        part_index_1d_array = test_index_1d_array[i: i + batch_size]
        part_content_list = [content_list[k] for k in part_index_1d_array]
        batch_X = get_X(part_content_list)
        predict_Y = session.run(data_7, {X_holder:batch_X})
        predict_Y_list.extend(predict_Y)
        usedTime = time.time() - startTime
        print_string = '%d/ %d 花费时间:%.2f秒' %(i, test_sample_quantity, usedTime)
        print_flush(print_string)
    print_string = '%d/ %d 花费时间:%.2f秒' %(test_sample_quantity, test_sample_quantity, usedTime)
    print_flush(print_string)    
    Y = np.array(predict_Y_list)   
    y = np.argmax(Y, axis=1)
    predict_label_list = labelEncoder.inverse_transform(y)
    return predict_label_list


test_label_list = [label_list[k] for k in test_index_1d_array]
predict_label_list = predict_test()
pd.DataFrame(confusion_matrix(test_label_list, predict_label_list), 
             columns=labelEncoder.classes_,
             index=labelEncoder.classes_ )

209019/ 209019 花费时间:1012.82秒

Unnamed: 0,体育,娱乐,家居,彩票,房产,教育,时尚,时政,星座,游戏,社会,科技,股票,财经
体育,32100,313,32,135,2,39,12,162,0,50,76,52,9,6
娱乐,34,22529,57,1,9,39,64,120,9,27,121,89,4,9
家居,8,87,7944,0,18,22,45,36,1,13,22,79,27,23
彩票,19,4,1,1866,0,0,0,3,1,2,36,4,0,1
房产,0,16,59,2,4624,19,6,61,1,3,47,26,81,33
教育,3,51,19,4,1,9908,11,145,1,12,250,44,10,15
时尚,0,43,35,1,2,3,3174,4,1,6,6,9,0,2
时政,33,57,15,2,19,113,23,14986,0,10,213,146,79,51
星座,0,1,1,0,0,2,3,1,898,0,2,1,0,0
游戏,3,12,6,0,0,6,7,5,0,5821,12,172,4,1


## 10.报告表

In [22]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def eval_model(y_true, y_pred, labels):
    # 计算每个分类的Precision, Recall, f1, support
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)
    # 计算总体的平均Precision, Recall, f1, support
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)
    res1 = pd.DataFrame({
        u'Label': labels,
        u'Precision': p,
        u'Recall': r,
        u'F1': f1,
        u'Support': s
    })
    res2 = pd.DataFrame({
        u'Label': ['总体'],
        u'Precision': [tot_p],
        u'Recall': [tot_r],
        u'F1': [tot_f1],
        u'Support': [tot_s]
    })
    res2.index = [999]
    res = pd.concat([res1, res2])
    return res[['Label', 'Precision', 'Recall', 'F1', 'Support']]

eval_model(test_label_list, predict_label_list, labelEncoder.classes_)

Unnamed: 0,Label,Precision,Recall,F1,Support
0,体育,0.995318,0.973081,0.984074,32988
1,娱乐,0.951675,0.974775,0.963086,23112
2,家居,0.930974,0.954234,0.942461,8325
3,彩票,0.901014,0.963345,0.931138,1937
4,房产,0.951049,0.928887,0.939837,4978
5,教育,0.939949,0.945961,0.942946,10474
6,时尚,0.933255,0.965916,0.949305,3286
7,时政,0.864095,0.951673,0.905772,15747
8,星座,0.984649,0.987899,0.986271,909
9,游戏,0.866607,0.962308,0.911954,6049


In [23]:
话题预测  新闻分类

SyntaxError: invalid syntax (<ipython-input-23-664e4f2656d0>, line 1)