# Bayes分类器

## 库导入

主要使用keras，TensorFlow(1版)

In [1]:
import sys
import time
import os
import re
import numpy as np
import pandas as pd
import pickle
import jieba

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# 实时更新进度条
def print_flush(print_string):
    print(print_string, end='\r')
    sys.stdout.flush()

predictor_set = ["THUCNews", "TouTiao"]
name_of_predictor = predictor_set[1]

## 文件导入（CNN内有保存）

In [2]:
pickleFilePath = "./" + name_of_predictor + '/content_list/content_list.pickle'
with open(pickleFilePath, 'rb') as file:
    content_list = pickle.load(file)
    file.close()
pickleFilePath = "./" + name_of_predictor + '/label_list.pickle'
with open(pickleFilePath, 'rb') as file:
    label_list = pickle.load(file)
    file.close()
print("Load complete, {0} contents loaded, {1} labels loaded".format(len(content_list), len(label_list)))

Load complete, 386093 contents loaded, 386093 labels loaded


In [3]:
dataFrame=pd.DataFrame({
        'label':label_list,
        'content':content_list
        }) 
pd.value_counts(label_list)

科技    41543
娱乐    39396
体育    37568
汽车    35785
游戏    29300
文化    28031
金融    27085
教育    27058
世界    26909
军事    24984
旅游    21422
农业    19322
房产    17672
民生     6273
疫情     3405
股票      340
dtype: int64

## 分词

### 逐内容处理

In [4]:
zhPattern = re.compile('[\u4e00-\u9fa5]+') 

def segment_reshape(dataFrame):
    startTime = time.time()
    sample_quantity = dataFrame.shape[0]
    for i, row in dataFrame.iterrows():
        segments = []
        fileContent = row['content']
        segs = jieba.cut(fileContent)
        for seg in segs:
            if zhPattern.search(seg):
                segments.append(seg)
        row['content'] = ' '.join(segments)
        if (i + 1)%1000==0 or i==sample_quantity - 1:
            string_0 = 'Segment process: %d/ %d' %(i + 1, sample_quantity)
            string_1 = ' Processed: %.2f%%' %(i/sample_quantity*100)
            usedTime = time.time() - startTime
            string_2 = ' Time cost: %.2f秒' %usedTime
            print_string = string_0 + string_1 + string_2
            print_flush(print_string)
        

In [5]:
segment_reshape(dataFrame) 

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/2b/swj1bjf912l41nfy725k11ph0000gn/T/jieba.cache
Loading model cost 0.662 seconds.
Prefix dict has been built successfully.


Segment process: 386093/ 386093 Processed: 100.00% Time cost: 144.77秒

### 停止词列表导入 Github: goto456/stopwords

In [6]:
stopWordsPath = "./stopwords-master/cn_stopwords.txt"
stopWords = pd.read_csv(stopWordsPath,
                        encoding='utf8', 
                        names = ["stopword"],
                        quoting=3,
                        sep="\t")

### 移除停止词

In [7]:
# r' 防转义
countVectorizer = CountVectorizer(stop_words=stopWords["stopword"].values.tolist(),
                                  min_df=0, token_pattern=r'\b\w+\b')

text_vector = countVectorizer.fit_transform(dataFrame['content'])

### 全现有文件序列化

In [8]:
pickleFilePath = './' + name_of_predictor + '/label_content_dataFrame.pickle'
with open(pickleFilePath, 'wb') as file:
    pickle.dump(dataFrame, file)
print("Label and content dataframe serialization finished.")

pickleFilePath = './' + name_of_predictor + '/content_vector.pickle'
with open(pickleFilePath, 'wb') as file:
    pickle.dump(text_vector, file)
print("Content vector serialization finished.")

Label and content dataframe serialization finished.
Content vector serialization finished.


## 模型建立

### 全现有文件导入

In [5]:
pickleFilePath = './' + name_of_predictor + '/label_content_dataFrame.pickle'
with open(pickleFilePath, 'rb') as file:
    dataFrame = pickle.load(file)
print("Label and content dataframe load finished.")

pickleFilePath = './' + name_of_predictor + '/content_vector.pickle'
with open(pickleFilePath, 'rb') as file:
    text_vector = pickle.load(file)
print("Content vector load finished.")

Label and content dataframe load finished.
Content vector load finished.


### 结果标签集

In [9]:
le = LabelEncoder()
label_encode = le.fit_transform(dataFrame['label'])

### 训练集测试集划分

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(text_vector, label_encode, random_state=114514)

### 朴素多项式贝叶斯 (Scikit Learn)

In [11]:
bayesModel = MultinomialNB()
# 对标签label进行训练
bayesModel.fit(X_train, y_train)
score = bayesModel.score(X_train, y_train)
print("Bayes fit complete!")
print("Score on mean accuracy on the given test data and labels: {0:.6f} / {1:.6f}".format(score, 1))

Bayes fit complete!
Score on mean accuracy on the given test data and labels: 0.915799 / 1.000000


### 模型保存

In [21]:
import joblib
modelFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB.pkl"
leFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB_LE.pkl"
cvFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB_CV.pkl"

joblib.dump(bayesModel, modelFilePath) 
joblib.dump(le, leFilePath) 
joblib.dump(countVectorizer, cvFilePath) 

print("MNB model saved.")

MNB model saved.


### 模型加载

In [24]:
import joblib

modelFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB.pkl"
leFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB_LE.pkl"
cvFilePath = "./" + name_of_predictor + "/trained_model/MultinomialNB_CV.pkl"

bayesModel = joblib.load(modelFilePath)
le = joblib.load(leFilePath)
countVectorizer = joblib.load(cvFilePath)

## 预测和检验

In [13]:
y_pred = bayesModel.predict(X_test)
pred_label = le.inverse_transform(y_pred)
test_label = le.inverse_transform(y_test)

from sklearn.metrics import confusion_matrix

cm = pd.DataFrame(confusion_matrix(test_label, pred_label),
            columns=le.classes_, index=le.classes_)

if not os.path.isdir("./results/" + name_of_predictor):
    os.makedirs("./results/" + name_of_predictor)
cm.to_csv("./results/" + name_of_predictor + '/' + name_of_predictor + '_MNB_CM.csv')

cm

Unnamed: 0,世界,体育,军事,农业,娱乐,房产,教育,文化,旅游,民生,汽车,游戏,疫情,科技,股票,金融
世界,5578,27,392,31,155,16,59,39,130,23,53,21,27,106,0,80
体育,48,8854,18,15,158,5,60,25,50,5,43,83,1,39,0,22
军事,577,18,5369,10,45,2,58,35,27,4,41,86,5,51,0,14
农业,24,3,11,4238,31,47,53,51,133,32,31,6,6,45,0,85
娱乐,65,63,49,34,9053,10,43,119,49,52,26,125,0,58,0,15
房产,10,4,1,59,13,4171,35,8,68,5,24,2,2,36,0,95
教育,47,12,17,35,92,18,6163,97,46,15,20,35,8,144,0,41
文化,61,18,42,89,192,21,151,5921,228,15,22,57,3,67,0,35
旅游,159,21,20,114,83,50,48,113,4570,5,96,26,5,72,0,25
民生,25,4,10,24,177,11,29,42,10,1199,11,18,0,13,0,4


### 报告表

In [14]:
from sklearn.metrics import precision_recall_fscore_support

def eval_model(y_true, y_pred, labels):
    # 计算每个分类的Precision, Recall, f1, support
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)
    # 计算总体的平均Precision, Recall, f1, support
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)
    res1 = pd.DataFrame({
        u'Label': labels,
        u'Precision': p,
        u'Recall': r,
        u'F1': f1,
        u'Support': s
    })
    res2 = pd.DataFrame({
        u'Label': ['总体'],
        u'Precision': [tot_p],
        u'Recall': [tot_r],
        u'F1': [tot_f1],
        u'Support': [tot_s]
    })
    res2.index = [999]
    res = pd.concat([res1, res2])
    res.to_csv("./results/" + name_of_predictor + '/' + name_of_predictor + '_MNB_PRFS.csv')
    return res[['Label', 'Precision', 'Recall', 'F1', 'Support']]

eval_model(test_label, pred_label, le.classes_)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Label,Precision,Recall,F1,Support
0,世界,0.819571,0.827965,0.823747,6737
1,体育,0.961451,0.939317,0.950255,9426
2,军事,0.883495,0.846578,0.864643,6342
3,农业,0.870047,0.883653,0.876797,4796
4,娱乐,0.877229,0.927466,0.901648,9761
5,房产,0.917308,0.920141,0.918722,4533
6,教育,0.884726,0.907658,0.896045,6790
7,文化,0.910083,0.855389,0.881889,6922
8,旅游,0.829401,0.845201,0.837226,5407
9,民生,0.877745,0.760304,0.814815,1577


### 人工输入预测

In [15]:
def predict(input_content):
    for i in range(len(input_content)):
        input_content[i] = ' '.join(jieba.cut(input_content[i]))
    input_seg_vec = countVectorizer.transform(input_content)
    
    return le.inverse_transform(bayesModel.predict(input_seg_vec))

def predict_latent(input_content):
    for i in range(len(input_content)):
        input_content[i] = ' '.join(jieba.cut(input_content[i]))
    input_seg_vec = countVectorizer.transform(input_content)
    predict_proba = bayesModel.predict_proba(input_seg_vec)
    latent = np.argsort(np.max(predict_proba, axis=0))[-2]
    return le.inverse_transform([latent])
    

In [19]:
input_content = ["""
吴尊友介绍，在社会层面不会长期存在无症状感染者。
一般来说，无症状感染者的传染性比较小，无症状感染者通常是有长时间频繁接触，才可以造成传播，通常是家庭的共同生活，或者一起共同劳动。无症状感染者也不会连续传播一代、二代。
如果即使传播一代二代，没有临床病例的话，它就会自动终止了。如果出现临床的病例，可能引发局部的疫情。
"""]

print("Predict: " + predict(input_content)[0])
print("Latent predict: " + predict_latent(input_content)[0])

Predict: 疫情
Latent predict: 世界


### 新浪微博预测

In [22]:
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_0) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/86.0.4240.198 Safari/537.36'}
html = requests.get('https://s.weibo.com/top/summary?cate=realtimehot', headers=headers)
page = html.content.decode()

res = re.findall(
    r'<tr class="">\s*<td class="td-01 ranktop">(\d+)<\/td>\s*<td class="td-02">\s*<a href="\/('
    r'.*?)&.*?target="_blank">(.*?)<\/a>', page, re.S)

# ind = [i[0] for i in res]
# link = ['https://s.weibo.com/' + i[1] for i in res]
title = [i[2] for i in res]
pred = [predict([t])[0] for t in title]
pred_latent = [predict_latent([t])[0] for t in title]


dataFrame = pd.DataFrame({
        u'热搜标题': title,
        u'预测结果': pred,
        u'备用结果': pred_latent
    })
dataFrame.index = [i + 1 for i in range(50)]

dataFrame[['热搜标题', '预测结果', '备用结果']]

Unnamed: 0,热搜标题,预测结果,备用结果
1,精灵耳 整容,娱乐,游戏
2,金晨李斯丹妮挽手散步,娱乐,文化
3,广州荔湾发现1例核酸疑似阳性,疫情,科技
4,赵奕欢 领证之前都不能叫老公,娱乐,民生
5,广州疫情,疫情,体育
6,张桐没入围 离谱,娱乐,游戏
7,央视曝光听话水等新型毒品,农业,教育
8,白玉兰入围名单,娱乐,教育
9,女大学生打耳洞差点导致瘫痪,娱乐,世界
10,西藏和平解放与繁荣发展白皮书发表,疫情,旅游
