# 4   Classifying with probability theory: naive Bayes

## 4.5   Classifying text with Python

### 4.5.1   Prepare: making word vectors from text

In [10]:
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all'

***Listing 4.1*   Word list to vector function: `loadDataSet()` & `createVocabList()` & `setOfWords2Vec()`**

In [1]:
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 
                  ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 
                  ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 
                  ['stop', 'posting', 'stupid', 'worthless', 'garbage'], 
                  ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 
                  ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]   # 1 is abusive, 0 not
    return postingList, classVec

In [2]:
def createVocabList(dataSet):
    vocabSet = set([])   # Create an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)   # Create the union of two sets
    return list(vocabSet)

In [3]:
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [4]:
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print(myVocabList)

['take', 'food', 'quit', 'park', 'to', 'help', 'has', 'dog', 'so', 'love', 'is', 'dalmation', 'buying', 'not', 'stupid', 'worthless', 'my', 'ate', 'stop', 'problems', 'steak', 'garbage', 'licks', 'mr', 'how', 'flea', 'posting', 'cute', 'maybe', 'him', 'please', 'I']


In [5]:
setOfWords2Vec(myVocabList, listOPosts[0])

[0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0]

### 4.5.2   Train: calculating probabilities from word vectors

***Listing 4.2*   Naive Bayes classifier training function: `trainNB0()`**

In [6]:
from numpy import *

In [7]:
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = zeros(numWords)
    p1Num = zeros(numWords)   
    p0Denom = 0.0
    p1Denom = 0.0   # Initialize probabilities
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])   # Vector addition
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num / p1Denom   # Change to log()
    p0Vect = p0Num / p0Denom   # Change to log()
    # Element-wise division
    return p0Vect, p1Vect, pAbusive

In [8]:
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

In [9]:
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

In [13]:
pAb
p0V
p1V

0.5

array([0.        , 0.        , 0.        , 0.        , 0.04166667,
       0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.04166667,
       0.04166667, 0.04166667, 0.        , 0.        , 0.        ,
       0.        , 0.125     , 0.04166667, 0.04166667, 0.04166667,
       0.04166667, 0.        , 0.04166667, 0.04166667, 0.04166667,
       0.04166667, 0.        , 0.04166667, 0.        , 0.08333333,
       0.04166667, 0.04166667])

array([0.05263158, 0.05263158, 0.05263158, 0.05263158, 0.05263158,
       0.        , 0.        , 0.10526316, 0.        , 0.        ,
       0.        , 0.        , 0.05263158, 0.05263158, 0.15789474,
       0.10526316, 0.        , 0.        , 0.05263158, 0.        ,
       0.        , 0.05263158, 0.        , 0.        , 0.        ,
       0.        , 0.05263158, 0.        , 0.05263158, 0.05263158,
       0.        , 0.        ])

The largest probability in vectors above is 0.15789474 for the word *stupid*, which means this word is most indicative of a class `1` (abusive).

### 4.5.3   Test: modifying the classifier for real-world conditions

In [14]:
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = ones(numWords)   # modified
    p1Num = ones(numWords)   # modified 
    p0Denom = 2.0
    p1Denom = 2.0   # Initialize probabilities
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])   # Vector addition
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)   # modified
    p0Vect = log(p0Num/p0Denom)   # modified
    # Element-wise division
    return p0Vect, p1Vect, pAbusive

In [17]:
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

***Listing 4.3* Naive Bayes classify function: `classifyNB()` & `testingNB()`**

In [18]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    # Element-wise multiplication
    if p1 > p0:
        return 1
    else:
        return 0

In [19]:
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

In [20]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


### 4.5.4   Prepare: the bag-of-words document model

***Listing 4.4*   Naive Bayes bag-of-words model: `bagOfWords2VecMN()`**

In [21]:
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

## 4.6   Example: classifying spam email with naive Bayes

### 4.6.1   Prepare: tokenizing text

If you have a text string, you can split it using the Python string `.split()` method.

In [22]:
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
mySent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

You can use regular expressions to split up the sentence on anything that isn't a word or number.

In [25]:
import re
regEx = re.compile('\W')   # The original code might be wrong
listOfTokens = regEx.split(mySent)
listOfTokens

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 '',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon',
 '']

To get rid of those empty strings:

In [26]:
[tok for tok in listOfTokens if len(tok) > 0]

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

To make every word lowercase:

In [27]:
[tok.lower() for tok in listOfTokens if len(tok) > 0]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [30]:
emailText = open('/Users/duoduo/Desktop/Data/Machine Learning/Machine Learning in Action/MLA_SourceCode/Ch04/email/ham/6.txt', 
                encoding='ISO-8859-1').read()   # Add 'encoding' to compile a UTF-8 encoded file
listOfTokens = regEx.split(emailText)
listOfTokens

['Hello',
 '',
 '',
 'Since',
 'you',
 'are',
 'an',
 'owner',
 'of',
 'at',
 'least',
 'one',
 'Google',
 'Groups',
 'group',
 'that',
 'uses',
 'the',
 'customized',
 'welcome',
 'message',
 '',
 'pages',
 'or',
 'files',
 '',
 'we',
 'are',
 'writing',
 'to',
 'inform',
 'you',
 'that',
 'we',
 'will',
 'no',
 'longer',
 'be',
 'supporting',
 'these',
 'features',
 'starting',
 'February',
 '2011',
 '',
 'We',
 'made',
 'this',
 'decision',
 'so',
 'that',
 'we',
 'can',
 'focus',
 'on',
 'improving',
 'the',
 'core',
 'functionalities',
 'of',
 'Google',
 'Groups',
 '',
 '',
 '',
 'mailing',
 'lists',
 'and',
 'forum',
 'discussions',
 '',
 '',
 'Instead',
 'of',
 'these',
 'features',
 '',
 'we',
 'encourage',
 'you',
 'to',
 'use',
 'products',
 'that',
 'are',
 'designed',
 'specifically',
 'for',
 'file',
 'storage',
 'and',
 'page',
 'creation',
 '',
 'such',
 'as',
 'Google',
 'Docs',
 'and',
 'Google',
 'Sites',
 '',
 '',
 'For',
 'example',
 '',
 'you',
 'can',
 'easily',
 

### 4.6.2   Test: cross validation with naive Bayes

***Listing 4.5*   File parsing and full spam test functions: `textParse()` & `spamTest()`**

In [33]:
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W', bigString)   # The original code might be wrong (same problem as above)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

In [34]:
textParse(mySent)

['this',
 'book',
 'the',
 'best',
 'book',
 'python',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [37]:
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1,26):
        wordList = textParse(open('/Users/duoduo/Desktop/Data/Machine Learning/Machine Learning in Action/MLA_SourceCode/Ch04/email/spam/%d.txt' % i, 
                                 encoding='ISO-8859-1').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('/Users/duoduo/Desktop/Data/Machine Learning/Machine Learning in Action/MLA_SourceCode/Ch04/email/ham/%d.txt' % i, 
                                 encoding='ISO-8859-1').read())
        docList.append(wordList)
        fullText.extend(wordList)   # Load and parse text files
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))   # Convert to a list since 'range' object doesn't support item deletion
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])   # Randomly create the training set
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:   # Classify the test set
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))

In [38]:
spamTest()

the error rate is:  0.2


In [39]:
spamTest()

the error rate is:  0.0


## 4.7   Example: using naive Bayes to reveal local attitudes from personal ads

### 4.7.1   Collect: importing RSS feeds

In [41]:
import feedparser

In [45]:
nasa = feedparser.parse('http://www.nasa.gov/rss/dyn/image_of_the_day.rss')   # The website is replaced by another available one
nasa

{'feed': {'language': 'en-us',
  'title': 'NASA Image of the Day',
  'title_detail': {'type': 'text/plain',
   'language': 'en',
   'base': 'http://www.nasa.gov/',
   'value': 'NASA Image of the Day'},
  'subtitle': 'The latest NASA "Image of the Day" image.',
  'subtitle_detail': {'type': 'text/html',
   'language': 'en',
   'base': 'http://www.nasa.gov/',
   'value': 'The latest NASA "Image of the Day" image.'},
  'links': [{'rel': 'alternate',
    'type': 'text/html',
    'href': 'http://www.nasa.gov/'},
   {'rel': 'self',
    'href': 'http://www.nasa.gov/rss/dyn/image_of_the_day.rss',
    'type': 'application/atom+xml'}],
  'link': 'http://www.nasa.gov/',
  'authors': [{'email': 'yvette.smith-1@nasa.gov'}],
  'author': 'yvette.smith-1@nasa.gov',
  'author_detail': {'email': 'yvette.smith-1@nasa.gov'},
  'publisher': 'brian.dunbar@nasa.gov',
  'publisher_detail': {'email': 'brian.dunbar@nasa.gov'},
  'docs': 'http://blogs.law.harvard.edu/tech/rss'},
 'entries': [{'title': 'Hubble Ho

In [47]:
len(nasa['entries'])

60

***Listing 4.6*   RSS feed classifier and frequent word removal functions: `calcMostFreq()` & `localWords()`**

In [53]:
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)   # Calculates frequency of occurrence
    sortedFreq = sorted(freqDict.items(), key = operator.itemgetter(1), reverse = True)
    return sortedFreq[:30]

In [54]:
def localWords(feed1, feed0):
    import feedparser
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])   # Accesses one feed at a time
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])   # Removes most frequently occurring words
    trainingSet = list(range(2 * minLen))   # Convert to a list since 'range' object doesn't support item deletion
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V

In [55]:
nasa = feedparser.parse('http://www.nasa.gov/rss/dyn/image_of_the_day.rss')
chelsea = feedparser.parse('https://sports.yahoo.com/soccer/teams/chelsea/rss/?shangrila=1')

If the classifier runs correctly, news about NASA will be labeled as 1 while news about Chelsea will be labeled as 0.

In [62]:
vocabList, pChe, pNasa = localWords(nasa, chelsea)

the error rate is:  0.4


In [63]:
vocabList, pChe, pNasa = localWords(nasa, chelsea)

the error rate is:  0.3


### 4.7.2   Analyze: displaying locally used words

***Listing 4.7*   Most descriptive word display function: `getTopWords()`**

In [82]:
def getTopWords(nasa, chelsea):
    import operator
    vocabList, p0V, p1V = localWords(nasa, chelsea)
    topNasa = []
    topChe = []
    for i in range(len(p0V)):
        if p0V[i] > -5.0:   # Modified the threshold to limit the list of words
            topChe.append((vocabList[i], p0V[i]))
        if p1V[i] > -5.0:
            topNasa.append((vocabList[i], p1V[i]))
    sortedChe = sorted(topChe, key = lambda pair: pair[1], reverse = True)
    print("CHELSEA:")
    for item in sortedChe:
        print(item[0])
    sortedNasa = sorted(topNasa, key = lambda pair: pair[1], reverse = True)
    print("\n")
    print("NASA:")
    for item in sortedNasa:
        print(item[0])

In [83]:
getTopWords(nasa, chelsea)

the error rate is:  0.45
CHELSEA:
old
his
mbuyamba
terms
broja
sevilla
barcelona
top
europa
made
spell
into
senior
loan
teams
even
evening
confirmed
official
mvv
ready
outlet
see
personal
that
breakout
having
trophy
brighton
squad
may
expected
record
maastricht
cup
formalise
holding
report
agreed
tonight
reported
switch
armando
sport
pursuit
make
claim
move
collection
build
vitesse
which
decade
eighth
inter
three
spanish
signed
albion
enjoyed
spending
14th
jadon
successful
saw
bargain
price
competition
firm
lampard
hoping
join
reportedly
back
bid
version
reguilon
agent
frank
complete
joining
interest
not
milan
completed
development
today
per
division
youth
partnership
tag
dutch
where
dunk
lifted
hove
continues
wrap
fresh
previously
debut
eleven
sancho
teamtalk
defender
once
end
says
linked
paper
flourish
relationship
xavier
winning
appearances
website
comes
final
won
gunners
fee
certainly
contract
hopes
him
deal
lewis
europe
signing
centre
25m
now
hopefuls

NASA:
national
mars
rover
ou

## 4.8   Summary