In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
from collections import Counter
from numpy import log, ones,zeros

In [139]:
def textParse(bigString):
    """
    Split a string into lowercase word tokens.

    The text is split on runs of non-word characters (regex ``\\W``), empty
    fragments are dropped, and every remaining token is lower-cased.

    :param bigString: the text to tokenize
    :return: list of lowercase tokens, in order of appearance
    """
    import re
    # Raw string: '\W' in a normal string literal is an invalid escape sequence
    # and triggers a warning on modern Python versions.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 0]

In [138]:
def load_data_and_split(data_path:str, val_rate:float, random_state=None):
    """
    Load a dataset, tokenize its text column and split it into training and validation sets.

    The file is read as tab-separated with no header row; column 0 is used as
    the label and column 1 as the text, which is tokenized with ``textParse``.

    :param data_path: path to the dataset file
    :param val_rate: proportion of samples assigned to the validation set
    :param random_state: seed for the shuffled split; pass an int for a
        reproducible split. When None (default) a seed is drawn with
        ``random.randint(0, 1000)``, so the split differs between runs.
    :return: training documents, validation documents, training labels, validation labels
    """
    sms = pd.read_csv(data_path, sep='\t', header=None)
    label_list = sms.loc[:, 0].values.tolist()
    data_list = [textParse(data_line) for data_line in sms.loc[:, 1].values.tolist()]
    if random_state is None:
        # Keep the previous behaviour when the caller does not fix a seed.
        random_state = random.randint(0, 1000)
    train_data, val_data, train_label, val_label = train_test_split(
        data_list, label_list, test_size=val_rate,
        random_state=random_state, shuffle=True)
    return train_data, val_data, train_label, val_label

In [165]:
# Load './SMSSpamCollection' and hold out 30% of the samples for validation.
# The split seed is drawn at random inside load_data_and_split, so the exact
# partition changes from run to run.
train_data, val_data, train_label, val_label = load_data_and_split('./SMSSpamCollection',0.3)

In [141]:
def creatVocabList(Data):
    """
    Build the vocabulary of all distinct tokens that appear in a collection of documents.

    :param Data: iterable of documents, each an iterable of tokens
    :return: sorted list of unique tokens (tokens must be mutually comparable,
        e.g. all strings), so the column order is stable across runs
    """
    vocab = set()
    for document in Data:
        # Update in place instead of rebuilding the whole set for every document.
        vocab.update(document)
    return sorted(vocab)

In [170]:
def word2Vec(vocabList, inputSet):
    """
    Encode a document as a word-presence vector over the vocabulary.

    Position i of the result is 1 if ``vocabList[i]`` occurs in ``inputSet``
    and 0 otherwise; words that are not in the vocabulary are ignored.

    :param vocabList: list of vocabulary words (must be hashable)
    :param inputSet: iterable of words making up the document
    :return: numpy array with one entry per vocabulary word
    """
    # Map each word to the index of its first occurrence (same position that
    # list.index() would return) so every lookup is O(1) instead of a list scan.
    wordIndex = {}
    for idx, word in enumerate(vocabList):
        wordIndex.setdefault(word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = wordIndex.get(word)
        if idx is not None:
            returnVec[idx] = 1
    return np.array(returnVec)

In [143]:
def words2Matrix(dataSet, vocabList=None):
    """
    Encode every document of a data set as a word-presence vector.

    :param dataSet: iterable of documents, each an iterable of tokens
    :param vocabList: optional vocabulary to encode against; when None
        (default) the vocabulary is built from ``dataSet`` itself with
        ``creatVocabList``. Pass an existing vocabulary to encode other data
        with the same column layout.
    :return: numpy array with one row per document, each row produced by ``word2Vec``
    """
    if vocabList is None:
        vocabList = creatVocabList(dataSet)
    return np.array([word2Vec(vocabList, document) for document in dataSet])

In [76]:
# Encode the training documents as a matrix of word-presence vectors
# (one row per document, one column per vocabulary word).
train_matrix = words2Matrix(train_data)

In [176]:
def naiveBayesTrain(trainMatrix, trainLabel):
    """
    Estimate per-class log word probabilities and log class priors.

    For every distinct label, the word columns of the rows carrying that label
    are summed, 1 is added to every count, the result is divided by
    (sum of the smoothed counts + 2) and the logarithm is taken.

    :param trainMatrix: 2-D array-like of shape (n_documents, n_words)
    :param trainLabel: 1-D array-like with one label per document
    :return: tuple ``(p, categoryPro)`` where ``p`` has one row of log word
        probabilities per class and ``categoryPro`` maps each label to its log
        prior; the rows of ``p`` follow the key order of ``categoryPro``
    :raises ValueError: if the matrix is empty or not 2-D, or if the number of
        labels does not match the number of rows
    """
    # Accept plain lists as well as arrays; the boolean-mask indexing below
    # only works on ndarrays.
    trainMatrix = np.asarray(trainMatrix)
    trainLabel = np.array(trainLabel)
    if trainMatrix.ndim != 2 or trainMatrix.shape[0] == 0:
        raise ValueError("trainMatrix must be a non-empty 2-D array of shape (n_documents, n_words)")
    if trainLabel.ndim != 1 or len(trainLabel) != trainMatrix.shape[0]:
        raise ValueError("trainLabel must be 1-D with one label per row of trainMatrix")

    categoryPro = Counter(trainLabel)
    p = ones((len(categoryPro), trainMatrix.shape[1]))  # start every count at 1
    for i, key in enumerate(categoryPro.keys()):
        filertrainData = trainMatrix[trainLabel == key, :]
        p[i] += np.sum(filertrainData, axis=0)
        # NOTE(review): the denominator is (smoothed total + 2); kept as in the
        # original so trained values do not change.
        p[i] /= (np.sum(p[i]) + 2)
        p[i] = log(p[i])
        # Replace the raw class count with the log prior of the class.
        categoryPro[key] = log(categoryPro[key] / len(trainLabel))
    return p, categoryPro

In [160]:
# Sanity check: the object returned by np.array(1) is an np.ndarray instance.
isinstance(np.array(1),np.ndarray)

True

In [180]:
def naiveBayesClf(testVec, p, categoryPro):
    """
    Predict the label of one encoded document.

    The document vector is multiplied with the per-class log word
    probabilities, the per-class log priors are added, and the label with the
    highest resulting score is returned.

    :param testVec: numpy array encoding the document (reshaped to a single row)
    :param p: array of log word probabilities, one row per class
    :param categoryPro: mapping from label to log prior, in the same order as the rows of ``p``
    :return: the label with the highest score
    """
    row = testVec.reshape(1, -1)
    classes = list(categoryPro.keys())
    log_priors = np.array(list(categoryPro.values()))
    scores = row @ p.T + log_priors
    best = np.argmax(scores)
    return classes[best]

In [182]:
# Encode the training documents as word-presence vectors.
# NOTE(review): getAcc is called with train_data and builds its own matrix
# internally, so this variable is not what the accuracy cell consumes.
train_matrix = words2Matrix(train_data)

In [183]:
def getAcc(train_data, val_data, train_label, val_label):
    """
    Train the naive Bayes model on the training split and score it on the validation split.

    :param train_data: list of tokenized training documents
    :param val_data: list of tokenized validation documents
    :param train_label: labels of the training documents
    :param val_label: labels of the validation documents
    :return: validation accuracy as a percentage, rounded to 4 decimals
    :raises ValueError: if the validation set is empty or its documents and
        labels differ in length
    """
    if len(val_data) == 0:
        raise ValueError("val_data is empty; accuracy is undefined")
    if len(val_data) != len(val_label):
        raise ValueError("val_data and val_label must have the same length")

    # Build the vocabulary once and use it for both splits so the training
    # and validation vectors are guaranteed to share one column layout.
    vocab = creatVocabList(train_data)
    train_matrix = np.array([word2Vec(vocab, document) for document in train_data])
    p, categoryPro = naiveBayesTrain(train_matrix, train_label)

    correct = 0
    for document, label in zip(val_data, val_label):
        predict = naiveBayesClf(word2Vec(vocab, document), p, categoryPro)
        if predict == label:
            correct += 1
    return round((correct / len(val_data) * 100), 4)

In [184]:
# Train on the training split and print the validation accuracy (percentage, 4 decimals).
print(getAcc(train_data, val_data, train_label, val_label))

98.6244
