In [224]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
from collections import Counter
from math import log2,log10

In [10]:
sms = pd.read_csv('./SMSSpamCollection', sep='\t', header=None)
data = sms.loc[:,1]
target = sms.loc[:,0]
data = data.values.tolist()
target = target.values.tolist()

In [2]:
def textParse(bigString):  # 输入big string，输出word list
    import re
    listOfTokens = re.split('\W', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 0]

In [3]:
def load_data_and_split(data_path:str, val_rate:float):
    """
    加载数据集并划分训练集和验证集
    :param data_path: 数据集路径
    :param val_rate: 验证集比例
    :return: 训练集列表，验证集列表，训练集标签列表，验证集标签列表
    """
    sms = pd.read_csv(data_path, sep='\t', header=None)
    data_lines = sms.loc[:,1].values.tolist()
    label_list = sms.loc[:,0].values.tolist()
    data_list = []
    for data_line in data_lines:
        data_line = textParse(data_line)
        data_list.append(data_line)
    train_data, val_data, train_label, val_label = train_test_split(data_list, label_list, test_size=val_rate, 
                                                                        random_state=random.randint(0,1000),shuffle=True)
    return train_data,val_data, train_label,val_label

In [4]:
train_data, val_data, train_label, val_label = load_data_and_split('./SMSSpamCollection',0.2)

In [219]:
def wrods2Vec(dataSet):
    """
    计算数据集中各个词出现的概率
    """
    words2VecList = {}
    for data_line in dataSet:
        for word in data_line:
            if word in words2VecList:
                words2VecList[word] += 1
            else:
                words2VecList[word] = 1
    wordsSum = np.sum(list(words2VecList.values()))
    for key, value in words2VecList.items():
#         words2VecList[key] = value / wordsSum
        words2VecList[key] = (value+1) / (wordsSum+5)
    return words2VecList

In [159]:
def word2VecWithLable(dataSet, labels):
    """
    计算每个标签类别下各个词出现的概率
    """
    lableSet = set(labels)
    word2VecWithLableList = {}
    labelCount = Counter(labels)
    for lable in lableSet:
        subSet = []
        for i in range(len(labels)):
            if lable == labels[i]:
                subSet.append(dataSet[i])
        word2VecWithLableList[lable] = wrods2Vec(subSet)
#     for key, value in word2VecWithLableList.items():
#         print(key, np.sum(list(value.values())))
    return labelCount,word2VecWithLableList

In [122]:
def getAccuracy(train_data,train_label,val_data, val_label):
    labelCount,word2VecWithLableList = word2VecWithLable(train_data,train_label)
    acc = 0
    for i in range(len(val_data)):
#         print(classifyNB(labelCount,word2VecWithLableList,val_data[i]), val_label[i])
        if classifyNB(labelCount,word2VecWithLableList,val_data[i]) == val_label[i]:
            acc += 1
    return round((acc / len(val_data) * 100), 4)

In [128]:
train_data, val_data, train_label, val_label = load_data_and_split('./SMSSpamCollection',0.2)

In [239]:
def classifyNB(labelCount,word2VecWithLableList,words):
    """
    根据训练好的模型，预测输入向量的类型
    输出为预测类别
    """
    words = set(words)
#     print(words)
    lableList = list(word2VecWithLableList.keys())
    probaList = []
    for lable in lableList:
        proba = 0.0
        word2VecList = word2VecWithLableList[lable]
#         print(lable)
        for word in words:
            if word in word2VecList:
#                 print([word,word2VecList[word]])
                proba += log2(word2VecList[word])
            else:
#                 print(['****',1/(labelCount[lable]+5)])
                proba += log2(1/(labelCount[lable]+5))
        lablePro = labelCount[lable] / np.sum(list(labelCount.values()))
        probaList.append(proba + log2(lablePro))
#     print(probaList)
    return lableList[np.argmax(probaList)]

In [235]:
labelCount,word2VecWithLableList = word2VecWithLable(train_data,train_label)
# print(labelCount)
# print(word2VecWithLableList)
print(classifyNB(labelCount,word2VecWithLableList,val_data[1]))
print(val_label[1])

{'he', 'hundred', 'score', 'will', 'big', 'batsman', 'is', 'sac', 'set'}
ham
['he', 0.00336009620485976]
['hundred', 3.5369433735365894e-05]
['score', 3.5369433735365894e-05]
['will', 0.004704134686803664]
['big', 0.0004244332048243908]
['****', 0.0002599428125812321]
['is', 0.010699253704948184]
['sac', 3.5369433735365894e-05]
['set', 0.0002475860361475613]
spam
['****', 0.0016129032258064516]
['****', 0.0016129032258064516]
['****', 0.0016129032258064516]
['will', 0.0025375880225845335]
['big', 0.00019031910169384]
['****', 0.0016129032258064516]
['is', 0.00837404047452896]
['****', 0.0016129032258064516]
['set', 0.00012687940112922667]
[-102.16258557955642, -90.06377000838805]
spam
ham


In [202]:
labelCount,word2VecWithLableList = word2VecWithLable(train_data,train_label)
for i in range(len(val_data)):
    print(classifyNB(labelCount,word2VecWithLableList,val_data[i]), val_label[i])

ham ham
spam ham
spam ham
spam ham
spam ham
ham ham
spam ham
spam ham
ham ham
spam ham
spam ham
spam ham
spam ham
spam ham
ham ham
spam ham
spam ham
spam ham
spam ham
ham ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
ham spam
ham ham
ham ham
spam ham
spam ham
spam ham
spam ham
spam ham
ham ham
spam ham
spam ham
spam ham
spam ham
spam ham
ham ham
spam ham
spam ham
spam ham
spam ham
spam ham
ham spam
ham ham
spam ham
ham ham
ham ham
ham spam
ham spam
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
ham ham
spam ham
spam ham
spam ham
ham ham
spam ham
spam ham
spam spam
spam ham
ham ham
spam ham
spam ham
spam ham
ham spam
spam ham
ham ham
ham spam
spam ham
spam ham
spam ham
ham ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam ham
spam spam
spam ham
ham spam
spam ham
spam ham
spam spam
ham spam
spam ham
spam ham
ham ham
spam ham
spam ham
spam ham
spam ham
spam ham
ham ham
spam ham

In [240]:
print(getAccuracy(train_data,train_label,val_data, val_label))

63.2287
