### 基于概率论的分类方法：朴素贝叶斯
朴素贝叶斯的优缺点：<br>
优点：在数据较少的情况下仍然有效，可以处理多类别问题。<br>
缺点：对于输入数据的准备方式较为敏感。<br>
适用数据类型：标称型数据。<br>

### 使用朴素贝叶斯进行文档分类
#### 朴素贝叶斯的一般过程
（1）收集数据：可以使用任何方法。这次使用RSS源。<br>
（2）准备数据：需要数值型或者布尔型数据。<br>
（3）分析数据：有大量特征时，绘制特征作用不大，此时使用直方图效果更好。<br>
（4）训练算法：计算不同的独立特征的条件概率。<br>
（5）测试算法：计算错误率。<br>
（6）使用算法：一个常见的朴素贝叶斯应用是文档分类。可以在任意的分类场景中使用朴素贝叶斯分类器，不一定非要是文本。<br>
#### 朴素贝叶斯的两个假设：
（1）特征是独立分布的；<br>
（2）每个特征同等重要。

In [2]:
from numpy import *

In [3]:
# 词表到向量的转换函数
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec

def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [4]:
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print(myVocabList)
vec1 = setOfWords2Vec(myVocabList, listOPosts[0])
print(vec1)
vec2 = setOfWords2Vec(myVocabList, listOPosts[3])
print(vec2)

['licks', 'worthless', 'buying', 'please', 'stop', 'help', 'not', 'posting', 'food', 'how', 'so', 'park', 'quit', 'love', 'mr', 'dalmation', 'take', 'steak', 'my', 'flea', 'I', 'has', 'maybe', 'cute', 'is', 'to', 'stupid', 'ate', 'him', 'problems', 'dog', 'garbage']
[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]
[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]


In [5]:
# 朴素贝叶斯分类器训练函数
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)    # 先验概率
    p0Num = ones(numWords)
    p1Num = ones(numWords)    # 防止乘积为0
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:     
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vec = log(p1Num / p1Denom)    # 1类中每个词在所有词中的概率
    p0Vec = log(p0Num / p0Denom)    # log防止乘完数字太小
    return p0Vec, p1Vec, pAbusive

In [6]:
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
print(pAb)
print(p0V)
print(p1V)

0.5
[-2.56494936 -3.25809654 -3.25809654 -2.56494936 -2.56494936 -2.56494936
 -3.25809654 -3.25809654 -3.25809654 -2.56494936 -2.56494936 -3.25809654
 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -1.87180218 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -2.15948425 -2.56494936
 -2.56494936 -3.25809654]
[-3.04452244 -1.94591015 -2.35137526 -3.04452244 -2.35137526 -3.04452244
 -2.35137526 -2.35137526 -2.35137526 -3.04452244 -3.04452244 -2.35137526
 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -2.35137526 -3.04452244
 -3.04452244 -3.04452244 -3.04452244 -3.04452244 -2.35137526 -3.04452244
 -3.04452244 -2.35137526 -1.65822808 -3.04452244 -2.35137526 -3.04452244
 -1.94591015 -2.35137526]


In [7]:
# 朴素贝叶斯分类函数
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

In [8]:
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


In [9]:
# 朴素贝叶斯词袋模型
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec

### 使用朴素贝叶斯过滤垃圾邮件

In [10]:
def textParse(bigString):    #input is big string, #output is word list
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2] 
    
def spamTest():
    docList=[]; classList = []; fullText =[]
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)#create vocabulary
    trainingSet = list(range(50)); testSet=[]           #create test set
    for i in range(10):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error",docList[docIndex])
    print('the error rate is: ',float(errorCount)/len(testSet))

In [11]:
spamTest()

classification error ['home', 'based', 'business', 'opportunity', 'knocking', 'your', 'door', 'don抰', 'rude', 'and', 'let', 'this', 'chance', 'you', 'can', 'earn', 'great', 'income', 'and', 'find', 'your', 'financial', 'life', 'transformed', 'learn', 'more', 'here', 'your', 'success', 'work', 'from', 'home', 'finder', 'experts']
the error rate is:  0.1


  return _compile(pattern, flags).split(string, maxsplit)


### 使用朴素贝叶斯分类器从个人广告中获取区域倾向

In [12]:
def calcMostFreq(vocabList,fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token]=fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True) 
    return sortedFreq[:30]       

def localWords(feed1,feed0):
    import feedparser
    docList=[]; classList = []; fullText =[]
    print(len(feed1['entries']))
    print(len(feed0['entries']))
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1) #NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)#create vocabulary
    top30Words = calcMostFreq(vocabList,fullText)   #remove top 30 words
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = list(range(2*minLen)); testSet=[]           #create test set
    for i in range(20):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ',float(errorCount)/len(testSet))
    return vocabList,p0V,p1V

In [13]:
import feedparser
# ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
# sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
ny = feedparser.parse('http://www.nasa.gov/rss/dyn/image_of_the_day.rss')
sf = feedparser.parse('http://rss.yule.sohu.com/rss/yuletoutiao.xml')
vocabList, pSF, pNY = localWords(ny, sf)

60
30
the error rate is:  0.5


  return _compile(pattern, flags).split(string, maxsplit)


### 分析数据：显式地域相关的用词

In [14]:
def getTopWords(ny,sf):
    import operator
    vocabList,p0V,p1V=localWords(ny,sf)
    topNY=[]; topSF=[]
    for i in range(len(p0V)):
        if p0V[i] > -6.0 : topSF.append((vocabList[i],p0V[i]))
        if p1V[i] > -6.0 : topNY.append((vocabList[i],p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])

In [15]:
getTopWords(ny, sf)

60
30
the error rate is:  0.5
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
据台湾媒体报道
据香港媒体报道
张译让剧组女生下跪
我们每
又是作为粉丝量巨大的年轻偶像鹿晗首度出演的电视剧
奔跑吧兄弟
韩国偶像组合bigbang成员top今天被爆料违法吸食大麻
曾舜晞
那预算呢
送给80岁的谢贤
孝顺儿女都会花心思送礼报恩
于年初为老公黄晓明诞下
英国telegrap
外媒评价周杰伦为
但是周董最近都火到国际上了
向西里_cili喊话
他笑言
和她们合照压力好大
秦舒培晒照毫无孕相
媒体接着追问打算买什么礼物给贾静雯
是代表香港的女神
孔令辉的欠债风波又有新进展
艺人张智霖
被送到医院抢救仍宣告不治
港星陈冠希1月被爆与女友秦舒培已经有爱的结晶
艺人杨颖
4月19日
而对top进行的毛发检测结果也
情侣合照
4月18日晚
对此top正在服兵役的首尔地方警察厅宣传部门负责人表示将根据检察院调查结果处罚top
土地注
唐禹哲发文否认恋网红
华人之光
中国女乒队前主教练
择天记
他回说
就该脚踏实地的努
潘金莲是不正经的女
购入浅水湾道低密度住宅怡峰一个望海景单位
中吴老师的扮演者张凯丽和张志坚扮演的高育良的对手戏也被网友称赞为
修杰楷
他倒下的瞬间影片在25日曝光
疑似自行ps合成
全剧最期待的对手戏
修杰楷出席活动
38万港币
妖精会
25日
为两房两厅连一个套房间格
而且预产期
约合人民币7430万元
想红想博得关注
只要贾静雯喜欢都可以
他接受访问时自曝二人都是他的女神
未料唱歌唱到一半突然昏倒
不过都是见到小宝宝的头和手
媒体问他
于将于今晚
由范冰冰主演
让现场所有观众吓呆了
享年56岁
但女方随后遭到网友起底
最近英国一家媒体对周董是大加赞赏
文章来源
张译曝光骗子以他名义冒充副导演和网友聊天截
angelababy
我不是潘金莲
出席活动
约合人民币3
唐禹哲20日也亲自出面
中因称
向来孝顺的谢霆锋今年的父亲节礼物特别大份
周杰伦的音乐才华我们已经不用多说
黄晓明baby和小海绵
的开播可谓是声势浩大
艺人唐禹哲被爆热恋大陆网红
有没有要
联合早报
top涉嫌去年10月在首尔自己家与一名女性友人吸食大麻一共三次
乒乓王子
刚刚出门的时候她
与同场

  return _compile(pattern, flags).split(string, maxsplit)
