In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
from collections import Counter
from numpy import log, ones,zeros

In [2]:
def textParse(bigString):  # 输入big string，输出word list
    import re
    listOfTokens = re.split('\W', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 0]

In [3]:
def load_data_and_split(data_path:str, val_rate:float):
    """
    加载数据集并划分训练集和验证集
    :param data_path: 数据集路径
    :param val_rate: 验证集比例
    :return: 训练集列表，验证集列表，训练集标签列表，验证集标签列表
    """
    sms = pd.read_csv(data_path, sep='\t', header=None)
    data_lines = sms.loc[:,1].values.tolist()
    label_list = sms.loc[:,0].values.tolist()
    data_list = []
    for data_line in data_lines:
        data_line = textParse(data_line)
        data_list.append(data_line)
    train_data, val_data, train_label, val_label = train_test_split(data_list, label_list, test_size=val_rate, 
                                                                        random_state=random.randint(0,1000),shuffle=True)
    return train_data,val_data, train_label,val_label

In [4]:
train_data, val_data, train_label, val_label = load_data_and_split('./SMSSpamCollection',0.2)

In [5]:
train_data

[['sad',
  'story',
  'of',
  'a',
  'man',
  'last',
  'week',
  'was',
  'my',
  'b',
  'day',
  'my',
  'wife',
  'did',
  'nt',
  'wish',
  'me',
  'my',
  'parents',
  'forgot',
  'n',
  'so',
  'did',
  'my',
  'kids',
  'i',
  'went',
  'to',
  'work',
  'even',
  'my',
  'colleagues',
  'did',
  'not',
  'wish'],
 ['she',
  'just',
  'broke',
  'down',
  'a',
  'list',
  'of',
  'reasons',
  'why',
  'nobody',
  's',
  'in',
  'town',
  'and',
  'i',
  'can',
  't',
  'tell',
  'if',
  'she',
  's',
  'being',
  'sarcastic',
  'or',
  'just',
  'faggy'],
 ['got',
  'ur',
  'mail',
  'dileep',
  'thank',
  'you',
  'so',
  'muchand',
  'look',
  'forward',
  'to',
  'lots',
  'of',
  'support',
  'very',
  'less',
  'contacts',
  'here',
  'remember',
  'one',
  'venugopal',
  'you',
  'mentioned',
  'tomorrow',
  'if',
  'not',
  'late',
  'i',
  'shall',
  'try',
  'to',
  'come',
  'up',
  'till',
  'there',
  'goodnight',
  'dear'],
 ['k', 'go', 'and', 'sleep', 'well', 'take

In [2]:
def creatVocabList(Data):
    List = set([])  # 创建一个空集合
    for document in Data:
        List = List | set(document)  # 对两个集合求并集
    return list(List)

In [47]:
def dataToVec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

In [46]:
def naiveBayesTrain(trainData, trainLabel):
    N = len(trainData)
    NWords = len(trainData[0])
    pAbusive = Counter(trainLabel)['spam']/N
    p0Num = ones(NWords)
    p1Num = ones(NWords)  # Laplace平滑
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(N):
        if trainLabel[i] == 'spam':
            p1Num += trainData[i]
            p1Denom += sum(trainData[i])
        else:
            p0Num += trainData[i]
            p0Denom += sum(trainData[i])
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)

    return p0Vect, p1Vect, pAbusive

In [45]:
def naiveBayes(trainData, trainLabel):
    N = len(trainData)
    NWords = len(trainData[0])
    pAbusive = Counter(trainLabel)['spam']/N
    p0Num = ones(NWords)
    p1Num = ones(NWords)  # Laplace平滑
    p0Denom = 2.0
    p1Denom = 2.0`
    for i in range(N):
        if trainLabel[i] == 'spam':
            p1Num += trainData[i]
            p1Denom += sum(trainData[i])
        else:
            p0Num += trainData[i]
            p0Denom += sum(trainData[i])
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    
    return p0Vect, p1Vect, pAbusive

In [41]:
def naiveBayesClf(testVec, p0Vec, p1Vec, pClass1):
    p1 = sum(testVec * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(testVec * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 'spam'
    else:
        return 'ham'

In [48]:
def getAcc(train_data, val_data, train_label, val_label):
    data_list = creatVocabList(train_data)
    train_matrix = []
    for data in train_data:
        train_matrix.append(dataToVec(data_list,data))
    p0Vect, p1Vect, pAbusive = naiveBayesTrain(train_matrix,train_label)
    acc = 0.0
    for i in range(len(val_data)):
        val_list = dataToVec(data_list, val_data[i])
        if naiveBayesClf(val_list,p0Vect, p1Vect, pAbusive) == val_label[i]:
            acc += 1
    return round((acc/len(val_data)*100),4)

In [51]:
train_data, val_data, train_label, val_label = load_data_and_split('./SMSSpamCollection',0.2)
print(getAcc(train_data, val_data, train_label, val_label))

98.7444
