In [1]:
import pickle
import time

import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from utils import load_reviews, data_suffle

# Optional jieba resources -- currently disabled.
# stopwordPath = './data/stopword.txt'
# userDictPath = './data/user_dict.txt'
csvFilePath = '../../corpus/100k/allTrimed.csv'
modelPath = './data/bayes.model'

# Load the custom jieba dictionary (disabled).
# jieba.load_userdict(userDictPath)

time_start = time.time()

# Read (labels, reviews) from the CSV, then pass both through the project's
# data_suffle helper (presumably shuffles them in unison -- confirm in utils).
labels, reviews = load_reviews(csvFilePath)
labels, reviews = data_suffle(labels, reviews)
# NOTE: CountVectorizer needs str documents; no conversion is done in this
# cell -- each review is cast with np.str_ where the vectorizer is called.

# 1:4 split -- the first fifth is held out for testing, the rest is for training.
n = len(labels) // 5
labels_test = labels[:n]
reviews_test = reviews[:n]
labels_train = labels[n:]
reviews_train = reviews[n:]

elapsed = time.time() - time_start
print(f'Load Corpus Cost {elapsed:.4f} Sec')

Load Corpus Cost 0.2823 Sec


In [2]:
time_start = time.time()

# Feature extractors for the Bayes classifier.
# Count-based vectorization: per scikit-learn's CountVectorizer semantics,
# max_df=0.8 (float) drops terms found in more than 80% of documents and
# min_df=5 (int) drops terms found in fewer than 5 documents.
vectorizer = CountVectorizer(max_df=0.8, min_df=5)
tfidftransformer = TfidfTransformer()

# Stage 1: term-count matrix (each review is cast to str on the fly).
counts_train = vectorizer.fit_transform(
    np.str_(review) for review in reviews_train
)
# Stage 2: re-weight the counts into TF-IDF values.
tfidf = tfidftransformer.fit_transform(counts_train)

# Multinomial Naive Bayes classifier fitted on the TF-IDF features.
clf = MultinomialNB()
clf.fit(tfidf, labels_train)

elapsed = time.time() - time_start
print(f'Train Model Cost {elapsed:.4f} Sec')

Train Model Cost 0.6679 Sec


In [3]:
# Build the TF-IDF matrix for the test reviews with the already-fitted
# vectorizer and transformer (transform only -- nothing is re-fitted here).
counts_test = vectorizer.transform(np.str_(review) for review in reviews_test)
tfidf_test = tfidftransformer.transform(counts_test)

# Predicted label for every held-out review.
result = clf.predict(tfidf_test)

In [4]:
# Evaluate the predictions against the held-out labels: first the per-class
# precision / recall / F1 report, then the overall accuracy.
# `metrics` (sklearn.metrics) is imported with the other dependencies in the
# notebook's first cell, so every import lives in one place.
print(metrics.classification_report(labels_test, result))
print("准确率:", metrics.accuracy_score(labels_test, result))

              precision    recall  f1-score   support

           0       0.84      0.80      0.82     11910
           1       0.81      0.85      0.83     12087

    accuracy                           0.83     23997
   macro avg       0.83      0.83      0.83     23997
weighted avg       0.83      0.83      0.83     23997

准确率: 0.8257282160270034
