# 垃圾邮件分类学习目标：
- 了解朴素贝叶斯算法常见算法API


--- 


In [4]:
# 导入贝叶斯定理的分类算法
from sklearn.naive_bayes import MultinomialNB
# 导入pandas包
import pandas as pd
# 导入字符编码和解码
import codecs

# 导入正则表达式模块
import re
# 导入中文文本分词库
import jieba
# 导入事件模块
import time
# 导入特征提取模块
from sklearn.feature_extraction.text import CountVectorizer
# 导入numpy
import numpy as np
# 导入数据集划分模块
from sklearn.model_selection import train_test_split
# 模型保存和加载模块
import joblib
# 导入集合模块
from collections import Counter
# 导入随机模块
import random


In [7]:
def reading_email_data():
    '''
    读取邮件数据
    :return: 
    '''
    email_label_arr = []
    email_content_arr = []
    sample_number = 500
    
    # 1、获取所有邮件
    for line in open('../file/email/full/index', errors = 'ignore'):
        
        # 1、获取邮件标签（strip：用于移除字符串首尾的空白字符，包括空格、换行符（\n）、制表符（\t））
        label, data = line.strip().split()

        print('标签：', label)
        print('数据：', data)
        
        # 2、读取邮件内容
        file_name = '../file/email' + data[2:]
        file_data = codecs.open(file_name, 'r', 'gbk', errors = 'ignore').read()
        
        # 3、存储标签和数据
        email_label_arr.append(label)
        email_content_arr.append(file_data)

    # print(email_label_arr)
    # print(email_content_arr)

    # print('运行到-1')
        
    # 2、所有邮件中随机选择垃圾邮件和正常邮件1000封
    email_data = pd.DataFrame({'content': email_content_arr, 'label': email_label_arr})
    spam_email_data = email_data[email_data['label'] == 'spam'].sample(sample_number)
    ham_email_data = email_data[email_data['label'] == 'ham'].sample(sample_number)

    # print('运行到-2')
    # 3、将邮件拼接在一起
    email_data = pd.concat([spam_email_data, ham_email_data])
    # print('运行到-3')
    email_data.to_csv('../file/data_reduction.csv')

In [10]:
def handle_email():
    '''
    邮件数据处理
    :return: 
    '''
    
    # 1、读取数据
    email_data = pd.read_csv('../file/data_reduction.csv')
    
    # 2、数据预处理
    content_arr = []
    for index, email in enumerate(email_data['content'], 1):
        # 1、除去换行符   
        email = email.replace('\n', ' ')
        
        # 2、除去非中文内容
        email = re.sub('[^\u4e00-\u9fff]', '', email)
        
        # 3、去除多余空白内容
        email = ' '.join(email.split())
        
        # 4、分词
        email = ' '.join(jieba.lcut(email))

        print('处理完成之后的 email：',email)
        content_arr.append(email)
        
        if index % 100 == 0: print('已经预处理%5d 封邮件' % index)
            
        # 5、将数据写入文件
    data = pd.DataFrame({'content': content_arr, 'label': email_data['label']}).to_csv('../file/data_handle.csv')


In [5]:
def handle_email_data_vector():
    '''
    数据向量化
    :return:
    '''
    
    # 1、读入数据集
    email = pd.read_csv('../file/data_handle.csv')
    
    # 2、构建词频向量
    stop_word_arr = []
    for word in open('../file/email/stoplist.txt', 'r', encoding = 'gbk'):
        stop_word_arr.append(word.strip())

    email = email.dropna()
    # print('停止词频：', stop_word_arr)
    transformer  = CountVectorizer(stop_words = stop_word_arr)
    x = transformer.fit_transform(email['content']).toarray()
    print(x.shape)

    y = np.where(email['label'].values == 'ham', 0, 1)

    print('运行到-1')

    # 3、将训练数据存储
    data = pd.DataFrame(x)
    print('运行到-2')
    data[x.shape[1]] = y
    data.to_csv('../file/data_result.csv')

In [9]:
def email_model_training():
    # 1、读取数据
    data = pd.read_csv('../file/data_result.csv')
    x = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # 2、数据集分割
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

    # 3、模型训练
    estimator = MultinomialNB()
    estimator.fit(x_train, y_train)

    # 4、保存模型
    joblib.dump(estimator, '../file/multinomial_nb.pth')
    
    # 5、模型评估
    accuracy = estimator.score(x_test, y_test)
    print('预测准确率', accuracy)


In [10]:
email_model_training()

预测准确率 0.9119170984455959
