# Case 1：文本分类

In [16]:
import numpy as np

## prepare data

In [66]:
# 实验样本：词条切割后的集合
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worhless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1 , 0, 1] #1有侮辱性文字 0正常言论
    return postingList, classVec

# 创建词汇集合列表
def createVocabList(dataSet):
    vocabSet = set([]) #集合
    for doc in dataSet:
        vocabSet = vocabSet | set(doc) #并集
    return list(vocabSet)

# 词集模型set()：将给定词组转换成 在词汇表出现的标记1或0
# 只在乎出现/不出现
def Words2Vec(vocabList, inputSet):
    res = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            res[vocabList.index(word)] = 1 #出现该文字的位置标记为1
        else: print('The word "%s" is not in my Vocabulary.'%word) #没有出现的单词打印出来
    return res

# 词袋模型
# care词频
def bagWords2Vec(vocabList, inputSet):
    res = [0] * len(vocabList)
    for word in inputSet:
        res[vocabList.index(word)] += 1
    return res

In [3]:
listPost, listClass = loadDataSet()
print(listPost), print(listClass)

[['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worhless', 'dog', 'food', 'stupid']]
[0, 1, 0, 1, 0, 1]


(None, None)

In [4]:
myVocabList = createVocabList(listPost)
print(myVocabList)

['please', 'to', 'ate', 'not', 'how', 'him', 'flea', 'problem', 'licks', 'park', 'posting', 'buying', 'steak', 'garbage', 'stupid', 'take', 'I', 'worthless', 'help', 'maybe', 'so', 'love', 'mr', 'dalmation', 'worhless', 'is', 'stop', 'food', 'quit', 'cute', 'dog', 'has', 'my']


## train NB

In [57]:
def trainNB0(trainMat, trainLabel):
    numTraindoc = len(trainMat)
    numWord = len(trainMat[0])
    p_Abusive = sum(trainLabel) / float(numTraindoc)
#     p0_num = np.zeros(numWord); p1_num = np.zeros(numWord) #分子
#     p0_denom = 0.; p1_denom = 0. #分母
# 为避免出现prob=0时导致NB结果为0，初始化变为
    p0_num = np.ones(numWord); p1_num = np.ones(numWord)
    p0_denom = 2.; p1_denom=2.
#     print('p0_num, p1_num, p0_denom, p1_denom:',p0_num, p1_num, p0_denom, p1_denom)

    for i in range(numTraindoc):
        if trainLabel[i] == 1:
            p1_num += trainMat[i]
            p1_denom += sum(trainMat[i])
#             print('p1_num, p1_denom:',p1_num, p1_denom)
        else:
            p0_num += trainMat[i]
            p0_denom += sum(trainMat[i])
#             print('p0_num, p0_denom:',p0_num, p0_denom)
#     p0 = p0_num / p0_denom; p1 = p1_num / p1_denom
# 为避免下溢，对乘积取对数 ln(a*b)=lna+lnb
    p0 = np.log(p0_num / p0_denom); p1 = np.log(p1_num / p1_denom)

    return p0, p1, p_Abusive

In [31]:
trainMat = []
for post in listPost:
    trainMat.append(Words2Vec(myVocabList, post))
print(len(trainMat),'posts,',len(trainMat[0]))

6 posts, 33


In [38]:
p0, p1, pAb = trainNB0(trainMat, listClass)
print(p0)
print('prob_absive:', pAb)

[0.04166667 0.04166667 0.04166667 0.         0.04166667 0.08333333
 0.04166667 0.04166667 0.04166667 0.         0.         0.
 0.04166667 0.         0.         0.         0.04166667 0.
 0.04166667 0.         0.04166667 0.04166667 0.04166667 0.04166667
 0.         0.04166667 0.04166667 0.         0.         0.04166667
 0.04166667 0.04166667 0.125     ]
prob_absive: 0.5


In [54]:
index = list(p1).index(max(p1))
print('No. %dth word (prob=%s) can mostly represent for Category 1.' % (index, max(p1)))
print('This word is "%s"' % myVocabList[index])

No. 14th word (prob=0.15789473684210525) can mostly represent for Category 1.
This word is "stupid"


## classify NB and test NB

In [58]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass)
    p0 = sum(vec2Classify * p0Vec) + np.log(1. - pClass)
    if p1 > p0: return 1
    else: return 0

In [72]:
def testNB(testEntry, myVocabList=myVocabList):
    testDoc = Words2Vec(myVocabList, testEntry)
    
    listPost, listClass = loadDataSet()
    trainMat = []
    for post in listPost: trainMat.append(Words2Vec(myVocabList, post))
    p0V, p1V, pAb = trainNB0(trainMat, listClass)
    
    print(testEntry,'is classified as', classifyNB(testDoc, p0V, p1V, pAb))

# ⬇️写的不对吧？
def testNB_bag(testEntry, myVocabList=myVocabList):
    testDoc = bagWords2Vec(myVocabList, testEntry)
    
    listPost, listClass = loadDataSet()
    trainMat = []
    for post in listPost: trainMat.append(bagWords2Vec(myVocabList, post))
    p0V, p1V, pAb = trainNB0(trainMat, listClass)
    
    print(testEntry,'is classified as', classifyNB(testDoc, p0V, p1V, pAb))

In [68]:
testEntry = ['love', 'my', 'dalmation']
testNB(testEntry)
testNB_bag(testEntry)

['love', 'my', 'dalmation'] is classified as 0
['love', 'my', 'dalmation'] is classified as 0


In [71]:
testE = ['stupid', 'garbage', 'stupid']#,'darling']
testNB(testE)
testNB_bag(testE)

['stupid', 'garbage', 'stupid'] is classified as 1
['stupid', 'garbage', 'stupid'] is classified as 1


# NB过滤垃圾邮件

## data prepare: slice the text

In [73]:
# 将text分割成词汇列表
def textParse(bigString):
    import re #正则表达式包
    listTokens = re.split(r'\W*', bigString) #reg=re.compile('\\W*') lis=reg.split(bigString)
    # 过滤掉长度<3的字符串(因为里面包含url地址)，自首字母大写的，全部改成小写词汇
    return [tok.lower() for tok in listTokens if len(tok)>2]

## Test

In [188]:
def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('./MLiAc4_spam/%d.txt'%i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1) #垃圾邮件
        
        wordList = textParse(open('./MLiAc4_ham/%d.txt'%i, encoding='windows-1252').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0) #正常邮件
    
    vocabList = createVocabList(docList) #单词表
    
    # 划分训练集和测试集
    trainSet = list(range(50)) #因为数字就是文档名称
    testSet = []
#     print(len(trainSet), trainSet)
    
    # 随机选取10个用于测试→重复运行取平均既是交叉验证
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainSet)))
        testSet.append(trainSet[randIndex])
        del(trainSet[randIndex])
    
    # train NB
    trainMat = []; trainClass = []
    for docIndex in trainSet:
        trainMat.append(Words2Vec(vocabList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClass))
    
    # test NB: error rate
    errorCount = 0
    for docIndex in testSet:
        wordVec = Words2Vec(vocabList, docList[docIndex])
        predictClass = classifyNB(np.array(wordVec), p0V, p1V, pSpam)
        if predictClass != classList[docIndex]:
            errorCount += 1
            print("File %d class is %d, misclassified as %d."%(docIndex, classList[docIndex], predictClass))
    
    err_rate = float(errorCount/len(testSet))
    print("\terror rate is %f \n"%err_rate)
    
    return err_rate

In [186]:
l = [1,2,3]
print(np.average(l))

2.0


In [189]:
# test 10 times to get an average error rate
def CrossValid(times=10):
    errorlist = []
    for i in range(times):
        print("Cross Validation Test %d/%d"%(i+1,times))
        errorlist.append(spamTest())
#     print(errorlist)
#     print(len(errorlist))
    err_avg = np.average(errorlist)
    print("The average error rate of %d times corss validation is %.2f%%"%(times, float(err_avg)*100))
    return err_avg

CrossValid();

Cross Validation Test 1/10
File 21 class is 0, misclassified as 1.
File 15 class is 0, misclassified as 1.
File 37 class is 0, misclassified as 1.
File 19 class is 0, misclassified as 1.
File 5 class is 0, misclassified as 1.
File 11 class is 0, misclassified as 1.
File 23 class is 0, misclassified as 1.
	error rate is 0.700000 

Cross Validation Test 2/10
File 12 class is 1, misclassified as 0.
File 0 class is 1, misclassified as 0.
File 8 class is 1, misclassified as 0.
File 30 class is 1, misclassified as 0.
File 36 class is 1, misclassified as 0.
	error rate is 0.500000 

Cross Validation Test 3/10
File 28 class is 1, misclassified as 0.
File 36 class is 1, misclassified as 0.
File 26 class is 1, misclassified as 0.
File 44 class is 1, misclassified as 0.
File 34 class is 1, misclassified as 0.
	error rate is 0.500000 

Cross Validation Test 4/10
File 11 class is 0, misclassified as 1.
File 49 class is 0, misclassified as 1.
File 9 class is 0, misclassified as 1.
File 39 class is 0