# 利用朴素贝叶斯实现垃圾邮件分类

#### 定义文件读取函数

In [3]:
import os
import re

ham_path = r'email/ham/'
spam_path = r'email/spam/'

def read_email(file_path, encoding=None):
    """Read a text file and return its content lower-cased on a single line.

    Args:
        file_path: Path of the file to read.
        encoding: Optional text encoding forwarded to ``open``. The default
            ``None`` keeps the platform's default encoding, which is what
            the function used before this parameter existed.

    Returns:
        The whole file content as one lower-case string in which every
        newline character is replaced by a space, so the text can be split
        into words afterwards. An empty file yields ``""``.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        # One read() call replaces the line-by-line string concatenation.
        content = f.read()
    # Newlines become spaces so that words on adjacent lines stay separated.
    return content.lower().replace("\n", " ")

#### 读取非垃圾邮件

In [5]:
# Splitter pattern: one or more consecutive non-word characters (anything
# other than letters, digits and the underscore) act as a separator.
reg = re.compile('\\W+')
# One token list per non-spam (ham) email found in ham_path.
hams = []
for name in os.listdir(ham_path):
    text = read_email(ham_path + name)
    # Splitting can produce empty strings at the edges; drop them.
    tokens = [tok for tok in reg.split(text) if tok]
    hams.append(tokens)
len(hams)

23

#### 读取垃圾邮件

In [6]:
# One token list per spam email found in spam_path, tokenised with the same
# compiled pattern `reg` that is used for the ham emails.
spams = []
for name in os.listdir(spam_path):
    pieces = reg.split(read_email(spam_path + name))
    # Keep only the non-empty pieces produced by the split.
    spams.append([piece for piece in pieces if piece])
len(spams)

24

#### 生成词集和标签

In [8]:
# Token lists of every email: ham documents first, then spam documents.
word_list = hams + spams
# Labels in the same order as word_list: 0 for ham, 1 for spam.
labels = [0] * len(hams) + [1] * len(spams)
# Distinct words over all emails, collected with a single union call
# (an empty word_list simply yields an empty set).
all_words = set().union(*word_list)
# Map each word to its column index. The vocabulary is sorted first because
# the iteration order of a set of strings can differ between interpreter
# runs, which would otherwise shuffle the feature columns from run to run.
all_words = {word: index for index, word in enumerate(sorted(all_words))}
len(all_words)

635

#### 统计词频，转换为词向量
如果使用伯努利分布，那么当某个词在邮件中出现时，只将词向量中对应位置置为 1 而不累加（即取值只能是 0 或 1），这样模型只关注词是否出现，而不关注出现的次数；下面的代码采用的就是这种方式。
如果使用多项式分布，那么词向量中对应位置可以随词的每次出现而递增，这样词的出现次数对结果的影响将增大。

In [10]:
import numpy as np

# Build one binary presence vector per email: position all_words[token] is
# set to 1 when the token occurs in the email and is never incremented.
vocab_size = len(all_words)
word_vec = []
for tokens in word_list:
    row = [0] * vocab_size
    for token in tokens:
        row[all_words[token]] = 1
    word_vec.append(row)

# Convert features and labels to NumPy arrays for the following cells.
word_vec = np.asarray(word_vec)
labels = np.asarray(labels)

print(word_vec.shape)
print(labels.shape)
word_vec

(47, 635)
(47,)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0]])

#### 划分训练集和测试集

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the emails for testing. `stratify=labels` asks for the
# ham/spam proportions to be preserved in both parts, and `random_state`
# fixes the shuffle seed so the same input arrays always give the same split.
X_train, X_test, y_train, y_test = train_test_split(
    word_vec, labels, test_size=0.2, stratify=labels, random_state=42
)
y_test

array([0, 1, 1, 0, 0, 1, 1, 1, 0, 0])

#### 训练模型并预测

In [15]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli naive Bayes on the 0/1 word-presence features. fit() returns the
# estimator itself, so construction and training are chained.
# NOTE: the variable is called `gnb`, but the model is BernoulliNB.
gnb = BernoulliNB().fit(X_train, y_train)
# Predict the held-out emails and show predictions next to the true labels.
pred = gnb.predict(X_test)
print(pred)
y_test

[0 1 1 0 0 0 1 1 0 0]


array([0, 1, 1, 0, 0, 1, 1, 1, 0, 0])

#### 模型评估

In [17]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the test predictions; the display
# names '0' and '1' correspond to the ham and spam labels respectively.
class_names = ['0', '1']
t = classification_report(y_true=y_test, y_pred=pred, target_names=class_names)
print(t)

              precision    recall  f1-score   support

           0       0.83      1.00      0.91         5
           1       1.00      0.80      0.89         5

    accuracy                           0.90        10
   macro avg       0.92      0.90      0.90        10
weighted avg       0.92      0.90      0.90        10

