# 使用朴素贝叶斯过滤垃圾邮件

**说明:**

将 `email` 文件夹放在当前目录下。

In [122]:
import re
import os
import numpy as np

def get_data(path):
    files=os.listdir(path)
    DocList=[]
    for file in files:
        filename=path+"/"+file
        with open(filename,'r+',errors='ignore') as f:
            DocTmp=str.lower(f.read())
            DocTmp=re.split(r'\W*',DocTmp)
            DocTmp=[doc for doc in DocTmp if len(doc)>2]
            DocList.append(DocTmp)
    return(DocList)

def get_vocab_list(docs):
    vocabList=[]
    for doc in docs:
        vocabList+=doc
        vocabList=list(set(vocabList))
    
    return vocabList

def get_docs_word2vec(docs,vocab_list):
    
    vocab_len=len(vocab_list)
    docs_word2vec=[]
    
    for doc in docs:
        doc_word2vec=[0]*vocab_len
        for i in range(0,vocab_len):
            if vocab_list[i] in doc:
                doc_word2vec[i]=1
      
        docs_word2vec.append(doc_word2vec)
     
    return docs_word2vec

def bayes_model(docs_word2vec,y_label):
    
    assert len(docs_word2vec)==len(y_label),'docs_word2vec和y_label数量要求保存一致'
    sample_cnt=len(docs_word2vec)
    P1_y=sum(y_label)/len(y_label)
    x1_cnt=np.ones(len(docs_word2vec[0]))
    x0_cnt=np.ones(len(docs_word2vec[0]))
    x1_all=2
    x0_all=2
    
    for i in range(0,sample_cnt):
        
        if y_label[i]==1:
            
            x1_cnt+=docs_word2vec[i]
            x1_all+=sum(docs_word2vec[i])
            
        else:
            
            x0_cnt+=docs_word2vec[i]
            x0_all+=sum(docs_word2vec[i])
            
    P1_x_log=np.log(x1_cnt/x1_all)
    P0_x_log=np.log(x0_cnt/x0_all)
    
    return (P1_x_log,P0_x_log,P1_y)


def predict(model,docs_word2vec):
    
    P1_x_log=model[0]
    P0_x_log=model[1]
    P1_y=model[2]
    y_pre_label=[]
    for doc in docs_word2vec:
        P1=sum(doc*P1_x_log)+np.log(P1_y)
        P0=sum(doc*P0_x_log)+np.log(1-P1_y)
        
        pre_label=1 if P1>P0 else 0
        y_pre_label.append(pre_label)

    return y_pre_label

###  加载数据

In [135]:
DocList_ham=get_data('data/email/ham')
y_label_ham=[0]*len(DocList_ham)

DocList_spam=get_data('data/email/spam')
y_label_spam=[1]*len(DocList_spam)

DocList=DocList_ham+DocList_spam
y_label=y_label_ham+y_label_spam

print('第一封邮件分词结果',DocList[0])
print('第一封邮件y值标签:',y_label[0])

第一封邮件分词结果 ['peter', 'with', 'jose', 'out', 'town', 'you', 'want', 'meet', 'once', 'while', 'keep', 'things', 'going', 'and', 'some', 'interesting', 'stuff', 'let', 'know', 'eugene']
第一封邮件y值标签: 0


  return _compile(pattern, flags).split(string, maxsplit)


### 构建词库，将邮件转换为词向量表示形式

In [141]:
vocabList=get_vocab_list(DocList)
docs_word2vec=get_docs_word2vec(DocList,vocabList)

print('词库的前20个词条:',vocabList[0:20])
print('第一封邮件的词向量表示形式：',docs_word2vec[0])

词库的前20个词条: ['been', 'troy', 'issues', 'note', 'tesla', 'does', 'attaching', 'focusing', 'generates', 'get', 'when', 'support', '1924', 'sent', 'pharmacy', 'ideas', 'opportunity', 'experts', 'supplement', 'turd']
第一封邮件的词向量表示形式： [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0

### 贝叶斯模型训练

In [145]:
model=bayes_model(docs_word2vec,y_label)
print('P1_x_log:',model[0])

P1_x_log: [-6.51174533 -6.51174533 -6.51174533 -6.51174533 -6.51174533 -6.51174533
 -6.51174533 -6.51174533 -6.51174533 -5.12545097 -6.51174533 -6.51174533
 -6.51174533 -6.51174533 -5.12545097 -6.51174533 -5.81859815 -5.81859815
 -5.81859815 -6.51174533 -5.41313304 -6.51174533 -6.51174533 -5.12545097
 -6.51174533 -5.12545097 -5.41313304 -4.71998586 -5.81859815 -6.51174533
 -6.51174533 -3.94679597 -4.20916024 -4.02683868 -5.81859815 -6.51174533
 -6.51174533 -6.51174533 -6.51174533 -6.51174533 -6.51174533 -6.51174533
 -5.12545097 -6.51174533 -6.51174533 -6.51174533 -5.12545097 -6.51174533
 -5.81859815 -6.51174533 -6.51174533 -6.51174533 -5.41313304 -4.43230379
 -6.51174533 -5.81859815 -4.43230379 -5.81859815 -6.51174533 -5.81859815
 -5.81859815 -6.51174533 -6.51174533 -5.81859815 -5.12545097 -5.12545097
 -6.51174533 -6.51174533 -5.12545097 -6.51174533 -6.51174533 -6.51174533
 -6.51174533 -6.51174533 -6.51174533 -6.51174533 -6.51174533 -6.51174533
 -5.81859815 -5.81859815 -5.12545097 -6.5

In [149]:
print('P0_x_log:',model[1])

P0_x_log: [-5.48755937 -5.89302447 -5.89302447 -5.89302447 -5.89302447 -5.89302447
 -5.89302447 -5.89302447 -5.89302447 -4.97673374 -5.89302447 -5.89302447
 -5.89302447 -5.89302447 -6.58617165 -5.48755937 -6.58617165 -6.58617165
 -6.58617165 -5.89302447 -4.50673011 -5.89302447 -5.89302447 -6.58617165
 -5.89302447 -6.58617165 -6.58617165 -4.97673374 -6.58617165 -5.89302447
 -5.89302447 -6.58617165 -6.58617165 -3.81358293 -6.58617165 -5.89302447
 -5.89302447 -5.89302447 -5.48755937 -5.89302447 -5.89302447 -5.89302447
 -6.58617165 -5.89302447 -5.89302447 -5.89302447 -6.58617165 -5.89302447
 -6.58617165 -5.89302447 -5.89302447 -5.89302447 -6.58617165 -6.58617165
 -5.89302447 -6.58617165 -6.58617165 -5.89302447 -5.89302447 -6.58617165
 -6.58617165 -5.89302447 -5.48755937 -5.89302447 -6.58617165 -6.58617165
 -5.89302447 -5.89302447 -5.48755937 -5.89302447 -5.89302447 -5.89302447
 -5.89302447 -5.89302447 -5.89302447 -4.50673011 -5.89302447 -5.89302447
 -6.58617165 -5.48755937 -6.58617165 -5.8

In [148]:
print('P1_y:',model[2])

P1_y: 0.5


### 用模型预测

In [144]:
y_pre_label=predict(model,docs_word2vec)
print(y_pre_label)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### 从数据集中抽样80%作为训练集，20%作为验证集，迭代20次

In [213]:
for i in range(0,20):
    
    idx=np.random.permutation(sample_cnt)
    train_idx=idx[0:int(sample_cnt*0.8)]
    test_idx=idx[int(sample_cnt*0.8):]

    train_docs_word2vec=np.array(docs_word2vec)[train_idx,:].tolist()
    train_y_label=np.array(y_label)[train_idx].tolist()

    test_docs_word2vec=np.array(docs_word2vec)[test_idx,:].tolist()
    test_y_label=np.array(y_label)[test_idx].tolist()

    model=bayes_model(train_docs_word2vec,train_y_label)
    y_pre_label=predict(model,test_docs_word2vec)

    error=1-np.sum(np.array(y_pre_label)==np.array(test_y_label))/len(test_y_label)
    print('error:',round(error,3))

error: 0.0
error: 0.2
error: 0.1
error: 0.1
error: 0.0
error: 0.1
error: 0.0
error: 0.1
error: 0.0
error: 0.1
error: 0.0
error: 0.0
error: 0.0
error: 0.0
error: 0.0
error: 0.0
error: 0.1
error: 0.1
error: 0.1
error: 0.1


error: 0.1


In [206]:
np.array(y_pre_label)==np.array(test_y_label)  

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True])

In [204]:
test_y_label

[0, 1, 0, 1, 1, 1, 1, 0, 1, 0]