# Bayes分类器

## 库导入

主要使用keras，TensorFlow(1版)

In [1]:
import sys
import time
import os
import re
import numpy as np
import pandas as pd
import pickle
import jieba

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# 实时更新进度条
def print_flush(print_string):
    print(print_string, end='\r')
    sys.stdout.flush()

predictor_set = ["THUCNews", "TouTiao"]
name_of_predictor = predictor_set[1]

## 文件导入（CNN内有保存）

In [16]:
pickleFilePath = "./" + name_of_predictor + '/content_list/content_list.pickle'
with open(pickleFilePath, 'rb') as file:
    content_list = pickle.load(file)
    file.close()
pickleFilePath = "./" + name_of_predictor + '/label_list.pickle'
with open(pickleFilePath, 'rb') as file:
    label_list = pickle.load(file)
    file.close()
print("Load complete, {0} contents loaded, {1} labels loaded".format(len(content_list), len(label_list)))

Load complete, 382688 contents loaded, 382688 labels loaded


In [35]:
dataFrame=pd.DataFrame({
        'label':label_list,
        'content':content_list
        }) 
pd.value_counts(label_list)

科技    41543
娱乐    39396
体育    37568
汽车    35785
游戏    29300
文化    28031
金融    27085
教育    27058
世界    26909
军事    24984
旅游    21422
农业    19322
房产    17672
民生     6273
股票      340
dtype: int64

## 分词

### 逐内容处理

In [4]:
zhPattern = re.compile('[\u4e00-\u9fa5]+') 

def segment_reshape(dataFrame):
    startTime = time.time()
    sample_quantity = dataFrame.shape[0]
    for i, row in dataFrame.iterrows():
        segments = []
        fileContent = row['content']
        segs = jieba.cut(fileContent)
        for seg in segs:
            if zhPattern.search(seg):
                segments.append(seg)
        row['content'] = ' '.join(segments)
        if (i + 1)%1000==0 or i==sample_quantity - 1:
            string_0 = 'Segment process: %d/ %d' %(i + 1, sample_quantity)
            string_1 = ' Processed: %.2f%%' %(i/sample_quantity*100)
            usedTime = time.time() - startTime
            string_2 = ' Time cost: %.2f秒' %usedTime
            print_string = string_0 + string_1 + string_2
            print_flush(print_string)
        
segment_reshape(dataFrame)    

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/2b/swj1bjf912l41nfy725k11ph0000gn/T/jieba.cache
Loading model cost 0.649 seconds.
Prefix dict has been built successfully.


Segment process: 836075/ 836075 Processed: 100.00% Time cost: 3550.83秒

### 停止词列表导入 Github: goto456/stopwords

In [6]:
stopWordsPath = "./stopwords-master/cn_stopwords.txt"
stopWords = pd.read_csv(stopWordsPath,
                        encoding='utf8', 
                        names = ["stopword"],
                        quoting=3,
                        sep="\t")

### 移除停止词

In [7]:
# r' 防转义
countVectorizer = CountVectorizer(stop_words=stopWords["stopword"].values.tolist(),
                                  min_df=0, token_pattern=r'\b\w+\b')

text_vector = countVectorizer.fit_transform(dataFrame['content'])

### 全现有文件序列化

In [8]:
pickleFilePath = './' + name_of_predictor + '/label_content_dataFrame.pickle'
with open(pickleFilePath, 'wb') as file:
    pickle.dump(dataFrame, file)
print("Label and content dataframe serialization finished.")

pickleFilePath = './' + name_of_predictor + '/content_vector.pickle'
with open(pickleFilePath, 'wb') as file:
    pickle.dump(text_vector, file)
print("Content vector serialization finished.")

Label and content dataframe serialization finished.
Content vector serialization finished.


## 模型建立

### 全现有文件导入

In [2]:
pickleFilePath = './' + name_of_predictor + '/label_content_dataFrame.pickle'
with open(pickleFilePath, 'rb') as file:
    dataFrame = pickle.load(file)
print("Label and content dataframe load finished.")

pickleFilePath = './' + name_of_predictor + '/content_vector.pickle'
with open(pickleFilePath, 'rb') as file:
    text_vector = pickle.load(file)
print("Content vector load finished.")

Label and content dataframe load finished.
Content vector load finished.


### 结果标签集

In [3]:
le = LabelEncoder()
label_encode = le.fit_transform(dataFrame['label'])

### 训练集测试集划分

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(text_vector, label_encode, random_state=114514)

### 朴素多项式贝叶斯 (Scikit Learn)

In [10]:
bayesModel = MultinomialNB()
# 对标签label进行训练
bayesModel.fit(X_train, y_train)
score = bayesModel.score(X_train, y_train)
print("Bayes fit complete!")
print("Score on mean accuracy on the given test data and labels: {0:.6f} / {1:.6f}".format(score, 1))

Bayes fit complete!
Score on mean accuracy on the given test data and labels: 0.915193 / 1.000000


### 模型保存

In [11]:
import joblib
modelFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB.pkl"
leFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB_LE.pkl"
cvFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB_CV.pkl"

joblib.dump(bayesModel, modelFilePath) 
joblib.dump(le, leFilePath) 
joblib.dump(countVectorizer, cvFilePath) 

print("MNB model saved.")

MNB model saved.


### 模型加载

In [62]:
import joblib

modelFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB.pkl"
leFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB_LE.pkl"
cvFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB_CV.pkl"

bayesModel = joblib.load(modelFilePath)
le = joblib.load(leFilePath)
countVectorizer = joblib.load(cvFilePath)

## 预测和检验

In [12]:
y_pred = bayesModel.predict(X_test)
pred_label = le.inverse_transform(y_pred)
test_label = le.inverse_transform(y_test)

from sklearn.metrics import confusion_matrix

cm = pd.DataFrame(confusion_matrix(test_label, pred_label),
            columns=le.classes_, index=le.classes_)

if not os.path.isdir("./results/" + name_of_predictor):
    os.makedirs("./results/" + name_of_predictor)
cm.to_csv("./results/" + name_of_predictor + '/' + name_of_predictor + '_MNB_CM.csv')

cm

Unnamed: 0,世界,体育,军事,农业,娱乐,房产,教育,文化,旅游,民生,汽车,游戏,科技,股票,金融
世界,5617,34,370,45,145,17,62,38,121,21,51,19,101,0,85
体育,50,8859,19,12,161,6,76,26,29,3,34,100,39,0,26
军事,555,18,5379,15,55,5,55,36,36,3,42,76,56,0,24
农业,32,4,8,4267,29,43,49,47,115,31,30,11,31,0,94
娱乐,69,70,53,29,9020,11,43,123,54,50,20,122,62,0,12
房产,18,3,2,57,18,4085,27,6,59,5,18,2,42,0,78
教育,47,14,10,43,94,16,6185,103,45,13,21,33,155,0,45
文化,60,14,40,95,194,19,158,5950,225,14,24,57,68,0,34
旅游,143,20,17,96,71,60,50,134,4590,8,101,37,69,0,35
民生,27,5,7,29,164,12,29,46,12,1200,12,20,9,0,3


### 报告表

In [13]:
from sklearn.metrics import precision_recall_fscore_support

def eval_model(y_true, y_pred, labels):
    # 计算每个分类的Precision, Recall, f1, support
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)
    # 计算总体的平均Precision, Recall, f1, support
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)
    res1 = pd.DataFrame({
        u'Label': labels,
        u'Precision': p,
        u'Recall': r,
        u'F1': f1,
        u'Support': s
    })
    res2 = pd.DataFrame({
        u'Label': ['总体'],
        u'Precision': [tot_p],
        u'Recall': [tot_r],
        u'F1': [tot_f1],
        u'Support': [tot_s]
    })
    res2.index = [999]
    res = pd.concat([res1, res2])
    res.to_csv("./results/" + name_of_predictor + '/' + name_of_predictor + '_MNB_PRFS.csv')
    return res[['Label', 'Precision', 'Recall', 'F1', 'Support']]

eval_model(test_label, pred_label, le.classes_)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Label,Precision,Recall,F1,Support
0,世界,0.817256,0.835117,0.82609,6726
1,体育,0.962621,0.938453,0.950384,9440
2,军事,0.884849,0.84642,0.865208,6355
3,农业,0.87224,0.890628,0.881338,4791
4,娱乐,0.879314,0.926268,0.90218,9738
5,房产,0.913053,0.924208,0.918597,4420
6,教育,0.886103,0.90636,0.896117,6824
7,文化,0.906736,0.855869,0.880568,6952
8,旅游,0.835761,0.845148,0.840428,5431
9,民生,0.883002,0.761905,0.817996,1575


### 人工输入预测

In [14]:
def predict(input_content):
    for i in range(len(input_content)):
        input_content[i] = ' '.join(jieba.cut(input_content[i]))
    input_seg_vec = countVectorizer.transform(input_content)
    
    return le.inverse_transform(bayesModel.predict(input_seg_vec))

def predict_latent(input_content):
    for i in range(len(input_content)):
        input_content[i] = ' '.join(jieba.cut(input_content[i]))
    input_seg_vec = countVectorizer.transform(input_content)
    predict_proba = bayesModel.predict_proba(input_seg_vec)
    latent = np.argsort(np.max(predict_proba, axis=0))[-2]
    return le.inverse_transform([latent])
    

In [16]:
input_content = ["""
在《向往的生活》里挑战自我，要去掏鸡蛋啦！为了鼓励张艺兴，网友们各出奇招，纷纷为他打call，这一届网友真的太行了
"""]

print("Predict: " + predict(input_content)[0])
print("Latent predict: " + predict_latent(input_content)[0])

Predict: 娱乐
Latent predict: 世界


### 新浪微博预测

In [18]:
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_0) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/86.0.4240.198 Safari/537.36'}
html = requests.get('https://s.weibo.com/top/summary?cate=realtimehot', headers=headers)
page = html.content.decode()

res = re.findall(
    r'<tr class="">\s*<td class="td-01 ranktop">(\d+)<\/td>\s*<td class="td-02">\s*<a href="\/('
    r'.*?)&.*?target="_blank">(.*?)<\/a>', page, re.S)

# ind = [i[0] for i in res]
# link = ['https://s.weibo.com/' + i[1] for i in res]
title = [i[2] for i in res]
pred = [predict([t])[0] for t in title]
pred_latent = [predict_latent([t])[0] for t in title]


dataFrame = pd.DataFrame({
        u'热搜标题': title,
        u'预测结果': pred,
        u'备用结果': pred_latent
    })
dataFrame.index = [i + 1 for i in range(50)]

dataFrame[['热搜标题', '预测结果', '备用结果']]

Unnamed: 0,热搜标题,预测结果,备用结果
1,易烊千玺 无聊一天出门走走,娱乐,旅游
2,钟南山说未来可能要定期打新冠疫苗,金融,农业
3,县长想让曹县持续火下去,农业,科技
4,乔欣发怼脸视频否认整容,娱乐,科技
5,原来武林外传是反内卷前辈,娱乐,游戏
6,赛格大厦 阻尼器,房产,文化
7,微信拍一拍新增炸一炸功能,科技,游戏
8,关晓彤 植物肉,娱乐,农业
9,清落,科技,娱乐
10,祝融号拍到的火星,娱乐,军事
