# 简单的朴素贝叶斯分类样例

In [1]:
import numpy as np
import re
import operator

In [2]:
def loadDataSet():
    """Return the toy training corpus together with its class labels.

    Returns:
        A ``(postingList, classVec)`` pair: a list of six tokenized posts
        and a parallel list of labels, where 1 marks an abusive post and
        0 marks a non-abusive one.
    """
    # Each record keeps a post next to its label so the two cannot drift apart.
    labeledPosts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    posts = [words for words, _ in labeledPosts]
    labels = [label for _, label in labeledPosts]
    return posts, labels

In [3]:
def createVocabList(dataSet):
    """Build the vocabulary: every distinct word in ``dataSet``, listed once.

    Words are returned in the order they are first seen. A plain ``set``
    would yield string words in an order that changes between interpreter
    runs (hash randomization), which makes the column order of the feature
    vectors built from this vocabulary non-reproducible.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing each distinct word exactly once.
    """
    # dict keys are unique and keep insertion order (Python 3.7+).
    firstSeen = dict.fromkeys(
        word for document in dataSet for word in document
    )
    return list(firstSeen)

In [4]:
def words2Vec(vocabList, inputSet):
    """Convert a list of words into a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: List of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of words to encode.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary word
        occurs in ``inputSet`` and 0 elsewhere. Each word that is not in
        the vocabulary is reported on stdout and otherwise ignored.
    """
    # Build the word -> index map once instead of scanning the vocabulary
    # list twice (``in`` + ``index``) for every input word. ``setdefault``
    # keeps the first position of a repeated word, like ``list.index``.
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec

In [5]:
def trainNB0(trainMatrix, trainClass):
    """Estimate the naive Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: Sequence of per-document word vectors; the length of
            the first row is taken as the vocabulary size.
        trainClass: Sequence of labels parallel to ``trainMatrix``; a label
            equal to 1 counts as the abusive class, anything else as the
            non-abusive class.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log-probabilities for
        class 0 and class 1, and the fraction ``sum(trainClass) / number of
        documents`` used as the prior of class 1.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    # Prior of the abusive class: label sum over the number of documents.
    pAbusive = sum(trainClass) / float(docCount)

    # Index 0 -> non-abusive, index 1 -> abusive. Counts start at one and
    # denominators at two (smoothing) so no word gets a zero probability.
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    wordTotals = [2.0, 2.0]
    for i, row in enumerate(trainMatrix):
        slot = 1 if trainClass[i] == 1 else 0
        wordCounts[slot] += row
        wordTotals[slot] += sum(row)

    # Work in log space so that multiplying many small probabilities later
    # becomes a sum and does not underflow.
    p0Vect = np.log(wordCounts[0] / wordTotals[0])
    p1Vect = np.log(wordCounts[1] / wordTotals[1])
    return p0Vect, p1Vect, pAbusive

In [6]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for one word vector.

    Args:
        vec2Classify: Word vector of the document to classify; multiplied
            element-wise with the log-probability vectors.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    # Log score = sum of the log-likelihoods of the present words + log prior.
    logScoreAbusive = sum(vec2Classify * p1Vec) + np.log(pClass1)
    logScoreNormal = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if logScoreAbusive > logScoreNormal else 0

In [7]:
def testingNB():
    """Train on the toy corpus and print the predicted class of sample posts.

    Builds the vocabulary and the training vectors from ``loadDataSet()``,
    fits the model with ``trainNB0`` and then classifies three hard-coded
    entries, printing each entry with its predicted label.
    """
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    # One presence vector per training post.
    trainMat = [words2Vec(vocab, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(labels))

    # Sample entries; the last one contains only out-of-vocabulary words.
    samples = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'garbage'],
        ['Think', 'He', 'Knows'],
    )
    for entry in samples:
        thisDoc = np.array(words2Vec(vocab, entry))
        predicted = classifyNB(thisDoc, p0V, p1V, pAb)
        print(entry, 'classified as: ', predicted)

In [8]:
# Run the toy-corpus demo: trains the classifier on the sample posts and
# prints the predicted class for each hard-coded test entry.
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1
the word: Think is not in my Vocabulary!
the word: He is not in my Vocabulary!
the word: Knows is not in my Vocabulary!
['Think', 'He', 'Knows'] classified as:  0


# 过滤垃圾邮件的简单实现

In [9]:
def textParse(string):
    """Split a line of text into lower-cased word tokens.

    Args:
        string: The text to tokenize.

    Returns:
        The list of lower-cased tokens, or ``None`` when the text contains
        no tokens at all (e.g. an empty or punctuation-only line).
    """
    # \W+ matches one or more characters that are not letters, digits or
    # underscores; splitting on it can leave empty strings at the ends.
    tokens = [tok.lower() for tok in re.split(r'\W+', string) if tok != '']
    if not tokens:
        return None
    return tokens

In [10]:
def _parseEmailLines(path):
    """Return the token list of every non-empty line of the file at ``path``."""
    with open(path) as emailFile:
        parsedLines = (textParse(line) for line in emailFile)
        return [tokens for tokens in parsedLines if tokens is not None]


def spamTest():
    """Hold-out test of the naive Bayes classifier on the e-mail corpus.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and the
    matching ``email/ham/*.txt`` files (label 0). Every non-empty line is
    one sample. Ten samples are drawn at random as the test set, the model
    is trained on the rest, and each misclassified test sample is printed
    followed by the error rate.

    Raises:
        ValueError: If the corpus has too few samples to hold out the test
            set and still leave training data.
    """
    numTest = 10
    docList = []
    classList = []

    # Keep the original interleaving: spam i, then ham i, for each i.
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
    for i in range(1, 26):
        for pattern, label in sources:
            lines = _parseEmailLines(pattern % i)
            docList.extend(lines)
            classList.extend([label] * len(lines))

    if len(docList) <= numTest:
        raise ValueError(
            'need more than %d samples, found %d' % (numTest, len(docList)))

    # Build the vocabulary from all samples.
    vocabList = createVocabList(docList)

    # Samples are per line, not per file, so index the whole corpus: a fixed
    # range(50) would only ever touch the first 50 lines and would raise an
    # IndexError on a corpus with fewer than 50 lines.
    trainingSet = list(range(len(docList)))
    testSet = []
    # Move randomly chosen indices from the training pool to the test set.
    for _ in range(numTest):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Vectorize the remaining samples and fit the model.
    trainMat = [words2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = words2Vec(vocabList, docList[docIndex])
        res = classifyNB(np.array(wordVector), p0V, p1V, pSpam)
        if res != classList[docIndex]:
            errorCount += 1
            # Show the misclassified sample next to its true label.
            print('ground truth:', classList[docIndex],
                  "| wrongly classified #", docIndex, docList[docIndex], 'as', res)
    # Error rate over the held-out samples.
    print('Error Rate: {} Error Number: {}'.format(
        errorCount / float(len(testSet)), errorCount))

In [11]:
# Run the spam-filter experiment: prints each misclassified test sample
# and the error rate over the randomly held-out samples.
spamTest()

ground truth: 0 | wrongly classified # 3 ['hi', 'peter'] as 1
Error Rate: 0.1 Error Number: 1
