## Classifying with probability theory: naive Bayes

- Pros: Works with a small amount of data, handles multiple classes
- Cons: Sensitive to how the input data is prepared
- Works with: Nominal values

In [1]:
import pickle
import numpy as np

### Implement Naive Bayse

In [2]:
# Naive Bayes classifier training function
# Count the number of documents in each class
# for every training document:
#     for each class:
#         if a token appears in the document
#           -> increment the count for that token
#         increment the count for tokens
# for each class:
#     for each token:
#         divide the token count by the total token count
#           to get conditional probabilities
# return conditional probabilities for each class
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / numTrainDocs
    # Initialize probabilities
    # p0Num, p1Num = np.zeros(numWords), np.zeros(numWords)
    p0Num, p1Num = np.ones(numWords), np.ones(numWords)
    # p0Denom, p1Denom = 0.0, 0.0
    p0Denom, p1Denom = 2.0, 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # Vector addition
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Element-wise division
    # p1Vect = p1Num / p1Denom
    # p0Vect = p0Num / p0Denom
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

In [3]:
# Naive Bayes classify function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # Element-wise multiplication
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

### Experiment 1: Toy dataset

In [4]:
# Word list to vector function
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park',
                    'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop',
                    'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec


def createVocabList(dataSet):
    # Create an empty set
    vocabSet = set([])
    for document in dataSet:
        # Create the union of two sets
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    # Create a vector of all 0s
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [5]:
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)

In [6]:
print(myVocabList)

['my', 'licks', 'to', 'flea', 'dalmation', 'help', 'dog', 'maybe', 'has', 'how', 'problems', 'take', 'stop', 'food', 'love', 'cute', 'is', 'garbage', 'worthless', 'posting', 'mr', 'park', 'please', 'steak', 'buying', 'ate', 'him', 'not', 'I', 'stupid', 'quit', 'so']


In [7]:
print(setOfWords2Vec(myVocabList, listOPosts[0]))

[1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [8]:
print(setOfWords2Vec(myVocabList, listOPosts[3]))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]


In [9]:
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))

In [10]:
print(pAb)

0.5


In [11]:
print(p0V)

[-1.87180218 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -2.56494936
 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -3.25809654
 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -3.25809654
 -3.25809654 -3.25809654 -2.56494936 -3.25809654 -2.56494936 -2.56494936
 -3.25809654 -2.56494936 -2.15948425 -3.25809654 -2.56494936 -3.25809654
 -3.25809654 -2.56494936]


In [12]:
print(p1V)

[-3.04452244 -3.04452244 -2.35137526 -3.04452244 -3.04452244 -3.04452244
 -1.94591015 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -2.35137526
 -2.35137526 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -2.35137526
 -1.94591015 -2.35137526 -3.04452244 -2.35137526 -3.04452244 -3.04452244
 -2.35137526 -3.04452244 -2.35137526 -2.35137526 -3.04452244 -1.65822808
 -2.35137526 -3.04452244]


In [13]:
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

In [14]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


### Experiment 2: Email dataset

In [15]:
# Naive Bayes bag-of-words model
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

In [16]:
# File parsing and full spam test functions
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def spamTest():
    docList, classList = [], []
    # Load and parse text files
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i,
                                  errors='ignore').read())
        docList.append(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i,
                                  errors='ignore').read())
        docList.append(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    # Randomly create the training set
    trainingSet, testSet = list(range(50)), []
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat, trainClasses = [], []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    # Classify the test set
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector),
                      p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', errorCount / len(testSet))

In [17]:
spamTest()

classification error ['yay', 'you', 'both', 'doing', 'fine', 'working', 'mba', 'design', 'strategy', 'cca', 'top', 'art', 'school', 'new', 'program', 'focusing', 'more', 'right', 'brained', 'creative', 'and', 'strategic', 'approach', 'management', 'the', 'way', 'done', 'today']
the error rate is:  0.1


### Experiment 3: RSS dataset

In [18]:
# Original websites are no longer accessible

In [19]:
# import feedparser
# ny = []
# for s in [0, 25, 50, 75, 100, 125]:
#     ny.extend(feedparser.parse( \
#         'https://newyork.craigslist.org/search/ccc?format=rss&s=' \
#         + str(s))['entries'])
# ny_summary = []
# for i in range(len(ny)):
#     ny_summary.append(ny[i]['summary'])
# ny_summary = list(set(ny_summary))
# sf = []
# for s in [0, 25, 50, 75, 100, 125]:
#     sf.extend(feedparser.parse( \
#         'https://sfbay.craigslist.org/search/ccc?format=rss&s=' \
#         + str(s))['entries'])
# sf_summary = []
# for i in range(len(sf)):
#     sf_summary.append(sf[i]['summary'])
# sf_summary = list(set(sf_summary))

In [20]:
# output_file = open("ny_summary.pkl", "wb")
# pickle.dump(ny_summary, output_file)
# output_file.close()
# output_file = open("sf_summary.pkl", "wb")
# pickle.dump(sf_summary, output_file)
# output_file.close()

In [21]:
# RSS feed classifier and frequent word removal functions
def calcMostFreq(vocabList, fullText):
    # Calculates frequency of occurrence
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=lambda x: x[1], reverse=True)
    return sortedFreq[:30]


def spamTest(feed1, feed0):
    docList, classList, fullText = [], [], []
    minLen = min(len(feed1), len(feed0))
    for i in range(minLen):
        wordList = textParse(feed1[i])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0[i])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    # Removes most frequently occurring words
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet, testSet = list(range(2 * minLen)), []
    for i in range(20):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat, trainClasses = [], []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    # Classify the test set
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector),
                      p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            # print("classification error", docList[docIndex])
    print('the error rate is: ', errorCount / len(testSet))
    return vocabList, p0V, p1V

In [22]:
input_file = open("ny_summary.pkl", "rb")
ny = pickle.load(input_file)
input_file.close()
input_file = open("sf_summary.pkl", "rb")
sf = pickle.load(input_file)
input_file.close()
vocabList, p0V, p1V = spamTest(ny, sf)

the error rate is:  0.25


In [23]:
# Most descriptive word display function
def getTopWords(ny, sf):
    vocabList, p0V, p1V = spamTest(ny, sf)
    topNY, topSF = [], []
    for i in range(len(p0V)):
        if p0V[i] > -6.0:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda x: x[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    print(list(map(lambda x: x[0], sortedSF)))
    sortedNY = sorted(topNY, key=lambda x: x[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    print(list(map(lambda x: x[0], sortedNY)))

In [24]:
getTopWords(ny, sf)

the error rate is:  0.3
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
['was', 'play', 'want', 'his', 'san', 'out', 'hello', 'available', 'text', 'one', 'name', 'years', 'were', 'puppy', 'experience', 'dogs', 'free', 'know', 'band', 'year', 'date', 'com', 'also', 'two', 'love', 'shots', 'trained', 'good', 'there', 'help', 'children', 'other', 'studio', 'she', 'next', 'any', 'has', 'about', 'right', 'playful', 'don', 'weeks', 'only', 'let', 'loves', 'experienced', 'see', 'jazz', 'had', 'great', 'breed', 'come', 'cat', 'potty', 'forever', 'got', 'puppies', 'guitar', 'now', 'professional', 'just', 'spayed', 'fee', 'pet']
NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**
['carpet', 'available', 'studio', 'more', 'new', 'out', 'cargo', 'other', 'one', 'com', 'time', 'work', 'she', 'open', 'well', 'needed', 'here', 'would', 'van', 'low', 'www', 'fun', 'player', 'day', 'family', 'needs', 'year', 'singer', 'years', 'has', 'curb', 'cat', 'rates', 'first', 'seeki