### 1、创建实验样本，postingList返回进行词条切分后的文档集合，classVec是文档集合中各个文档所对应的类别标签的集合

In [64]:
def loadDataSet():
    """Return the toy training corpus used throughout this notebook.

    Returns:
        A ``(postingList, classVec)`` tuple where ``postingList`` is a list
        of tokenised posts (each a list of words) and ``classVec`` holds the
        label of the post at the same position: 1 marks abusive text and
        0 marks normal speech.
    """
    # Keep each post next to its label so the pairing is visible at a glance.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    documents = [words for _, words in labelled_posts]
    labels = [label for label, _ in labelled_posts]
    return documents, labels

### 2、创建词汇表（在所有文档中出现的不重复词的列表）

In [65]:
def createVocabList(dataSet):
    """Build the vocabulary: every distinct word found in ``dataSet``.

    Args:
        dataSet: An iterable of documents, each an iterable of hashable
            tokens (words).

    Returns:
        A list containing each distinct word exactly once, ordered by first
        appearance in ``dataSet``.
    """
    # dict preserves insertion order, so this de-duplicates while keeping a
    # stable, reproducible column order for the document vectors.  A plain
    # set would yield an order that can change from one interpreter run to
    # the next because string hashing is randomised.
    return list(dict.fromkeys(word for document in dataSet for word in document))

### 3、根据词汇表和输入的文档，输出对应的文档向量--词集模型

In [66]:
def setOfWords2Vec(vocabList,inputSet):
    """Convert a document into a set-of-words (presence/absence) vector.

    Args:
        vocabList: The vocabulary, a list of words; position ``i`` of the
            result corresponds to ``vocabList[i]``.
        inputSet: The words of one document.

    Returns:
        A list of ``len(vocabList)`` integers: 1 where the vocabulary word
        occurs in the document, 0 otherwise.  A message is printed for every
        document word that is not in the vocabulary.
    """
    # Build the word -> index map once instead of scanning the vocabulary
    # list twice (``in`` + ``.index``) for every word of the document.
    # setdefault keeps the first position if a word is listed more than
    # once, matching list.index().
    positions = {}
    for idx, vocabWord in enumerate(vocabList):
        positions.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = positions.get(word)
        if idx is None:
            print ("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec

### 4、根据词汇表和输入的文档，输出对应的文档向量--词袋模型（与词集模型不同，记录每个词出现的次数）

In [89]:
def bagOfWords2Vec(vocabList,inputSet):
    """Convert a document into a bag-of-words (occurrence count) vector.

    Args:
        vocabList: The vocabulary, a list of words; position ``i`` of the
            result corresponds to ``vocabList[i]``.
        inputSet: The words of one document.

    Returns:
        A list of ``len(vocabList)`` integers giving how many times each
        vocabulary word occurs in the document.  A message is printed for
        every document word that is not in the vocabulary.
    """
    # Build the word -> index map once instead of scanning the vocabulary
    # list twice (``in`` + ``.index``) for every word of the document.
    # setdefault keeps the first position if a word is listed more than
    # once, matching list.index().
    positions = {}
    for idx, vocabWord in enumerate(vocabList):
        positions.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = positions.get(word)
        if idx is None:
            print ("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[idx] += 1
    return returnVec

#### 测试createVocabList方法

In [67]:
#listOPosts,listClasses=loadDataSet()
#myVocabList=createVocabList(listOPosts)
#myVocabList

#### 测试setOfWords2Vec方法

In [68]:
#setOfWords2Vec(myVocabList,listOPosts[0])

In [69]:
import numpy as np
def trainNB0(trainMatrix,trainCategory):
    """Estimate naive Bayes parameters from vectorised training documents.

    Args:
        trainMatrix: Sequence of document vectors, one per training
            document, all of the same length (one entry per vocabulary word).
        trainCategory: Sequence of labels aligned with ``trainMatrix``;
            1 marks an abusive document, any other value is counted as
            class 0.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple: the per-word log
        probabilities for class 0 and class 1 (numpy arrays) and the
        fraction of training documents labelled 1.

    Side effects:
        Prints intermediate values (sizes, label sum, prior, class-1 counts)
        to stdout.
    """
    numTrainDocs=len(trainMatrix) # number of training documents
    numWords=len(trainMatrix[0]) # length of each document vector
    print (numTrainDocs,numWords,trainCategory,numTrainDocs)
    pAbusive=sum(trainCategory)/float(numTrainDocs) # share of documents labelled 1
    print (sum(trainCategory))
    print (pAbusive)

    # Smoothing: start every word count at 1 and each denominator at 2 so
    # that a word never seen in a class does not get probability 0 (which
    # would wipe out the whole product of probabilities).
    p0Num=np.ones(numWords) # per-word counts for class 0 (normal)
    p1Num=np.ones(numWords) # per-word counts for class 1 (abusive)
    p0Denom=2.0
    p1Denom=2.0
    for i,docVec in enumerate(trainMatrix):
        if trainCategory[i]==1:
            p1Num+=docVec
            p1Denom+=sum(docVec)
        else:
            p0Num+=docVec
            p0Denom+=sum(docVec)

    # Work in log space to avoid underflow when many small probabilities
    # are combined.  np.log is used because no bare ``log`` name is
    # imported in this file and it operates element-wise on the arrays.
    p1Vect=np.log(p1Num/p1Denom)
    print (p1Num,p1Denom,p1Vect)
    p0Vect=np.log(p0Num/p0Denom)
    return p0Vect,p1Vect,pAbusive

In [70]:
# Train the classifier on the sample corpus and show the learned parameters.
#   listOpsts   - the tokenised documents
#   listClasses - the label of each document
#   myVocabList - the list of unique words across all documents
#   trainMat    - one set-of-words vector per document
listOpsts, listClasses = loadDataSet()
myVocabList = createVocabList(listOpsts)
# Vectorise every document against the vocabulary.
trainMat = [setOfWords2Vec(myVocabList, doc) for doc in listOpsts]
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
print (p0V,p1V,pAb)

6 32 [0, 1, 0, 1, 0, 1] 6
3
0.5
[0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 1. 3. 0. 1. 2. 0. 1. 2. 0. 1. 0. 1.
 0. 0. 0. 1. 0. 0. 0. 1.] 19.0 [0.         0.         0.         0.         0.05263158 0.05263158
 0.         0.05263158 0.         0.05263158 0.05263158 0.
 0.05263158 0.15789474 0.         0.05263158 0.10526316 0.
 0.05263158 0.10526316 0.         0.05263158 0.         0.05263158
 0.         0.         0.         0.05263158 0.         0.
 0.         0.05263158]
[0.04166667 0.04166667 0.04166667 0.04166667 0.         0.04166667
 0.04166667 0.         0.04166667 0.         0.         0.04166667
 0.         0.         0.04166667 0.         0.         0.04166667
 0.         0.04166667 0.04166667 0.04166667 0.04166667 0.
 0.04166667 0.125      0.04166667 0.08333333 0.04166667 0.04166667
 0.04166667 0.        ] [0.         0.         0.         0.         0.05263158 0.05263158
 0.         0.05263158 0.         0.05263158 0.05263158 0.
 0.05263158 0.15789474 0.         0.05263158 0.10526

### 构建完整分类器

In [87]:
import math
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    """Pick the more likely class for one document vector.

    Each class score is the sum of the element-wise product of the document
    vector with that class's weight vector, plus the log of the class prior.

    Args:
        vec2Classify: The document vector; must support element-wise ``*``
            with ``p0Vec`` / ``p1Vec`` (e.g. a numpy array).
        p0Vec: Per-word weights for class 0.
        p1Vec: Per-word weights for class 1.
        pClass1: Prior probability of class 1; class 0 uses ``1 - pClass1``.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0.
    """
    score_class1 = sum(vec2Classify * p1Vec) + math.log(pClass1)
    score_class0 = sum(vec2Classify * p0Vec) + math.log(1.0-pClass1)
    # Ties go to class 0.
    return 1 if score_class1 > score_class0 else 0

In [85]:
def testingNB():
    """Train on the sample corpus and print the predicted class of two posts.

    Loads the toy data set, builds the vocabulary, vectorises every training
    post with the set-of-words model, trains the classifier and then prints
    the classification of two hard-coded test entries.
    """
    listOPosts,listClasses=loadDataSet()
    myVocabList=createVocabList(listOPosts)
    # Vectorise the posts loaded above; the function must not depend on a
    # module-level variable that happens to exist from another notebook cell.
    trainMat=[setOfWords2Vec(myVocabList,postinDoc) for postinDoc in listOPosts]
    p0v,p1v,pAb=trainNB0(np.array(trainMat),np.array(listClasses))
    for testEntry in (['love','my','dalmation'],['stupid','garbage']):
        thisDoc=np.array(setOfWords2Vec(myVocabList,testEntry))
        print (testEntry,'classified as: ',classifyNB(thisDoc,p0v,p1v,pAb))

In [88]:
# Run the end-to-end demo: train on the sample posts and print the predicted
# class of the two test entries.
testingNB()

6 32 [0 1 0 1 0 1] 6
3
0.5
[0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 1. 3. 0. 1. 2. 0. 1. 2. 0. 1. 0. 1.
 0. 0. 0. 1. 0. 0. 0. 1.] 19.0 [0.         0.         0.         0.         0.05263158 0.05263158
 0.         0.05263158 0.         0.05263158 0.05263158 0.
 0.05263158 0.15789474 0.         0.05263158 0.10526316 0.
 0.05263158 0.10526316 0.         0.05263158 0.         0.05263158
 0.         0.         0.         0.05263158 0.         0.
 0.         0.05263158]
['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1
