In [39]:
import re
from nltk.corpus import stopwords
import numpy as np

def textParse(textString):
    tokenList = re.split(r'[^a-zA-Z]', textString)
    wordList = [token.lower() for token in tokenList if len(token) > 2 and token.lower() not in set(stopwords.words('english'))]
    return wordList
    

In [41]:
message = "Hello, My name is Edward Zhang and I'm really happy to be here.666."
parsedMessage = textParse(message)
print(parsedMessage)

['hello', 'name', 'edward', 'zhang', 'really', 'happy']


In [3]:
def vocabList(data):
    vocabSet = set()
    for message in data:
        vocabSet = vocabSet | set(message)
    return list(vocabSet)


In [4]:
message2 = "Today I will be talking about how we can apply AI to our daily jobs here at Cisco."
parsedMessage2 = textParse(message2)
print(parsedMessage2)

['today', 'talking', 'apply', 'ai', 'daily', 'jobs', 'cisco']


In [15]:
vocabulary = vocabList([parsedMessage, parsedMessage2])
print(vocabulary)

['jobs', 'name', 'zhang', 'ai', 'daily', 'talking', 'cisco', 'edward', 'apply', 'really', 'hello', 'today', 'happy']


In [69]:
def bagOfWords2Vec(vocabulary, inputData):
    returnVec = [0] * len(vocabulary)
    for word in inputData:
        if word in vocabulary:
            returnVec[vocabulary.index(word)] += 1
    return returnVec
            

In [18]:
message3 = "Although AI is not widly used in Cisco today, I believe there is really a big room for improvememt and AI will be a major focus."
parsedMessage3 = textParse(message3)
print(bagOfWords2Vec(vocabulary, parsedMessage3))

[0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0, 1, 0]


In [70]:
import operator

def naiveBayesClassifier(trainingData, trainingLabel):
    # p0: Normal/Positive; p1: Spam/Abusive
    numOfDocs = len(trainingData)
    numOfVocab = len(trainingData[0])
    pAbusive = np.sum(trainingLabel)/(numOfDocs * 1.0)
    
    p0Num = np.ones(numOfVocab)
    p1Num = np.ones(numOfVocab)
    p0Denom = 0.0
    p1Denom = 0.0
    validationNum = 10
    
    for i in range(numOfDocs-validationNum):
        if trainingLabel[i] == 1:
            p1Num += trainingData[i]
            p1Denom += np.sum(trainingData[i])
            
        else:
            p0Num += trainingData[i]
            p0Denom += np.sum(trainingData[i])
    
    print(np.max(p1Num))
    print(p1Denom)
    print(p0Denom)
    p1Vec = np.log(p1Num/p1Denom)
    p0Vec = np.log(p0Num/p0Denom)
    
    print(pAbusive)
    print(p1Vec)
    print(p0Vec)
    
    validationData = trainingData[-10:]
    resultList = []  
    groundTrueList = trainingLabel[-10:]
    for test in validationData:
        p1 = sum(test * p1Vec) + np.log(pAbusive)
        p0 = sum(test * p0Vec) + np.log(1-pAbusive)
        
        if p1 > p0:
            resultList.append(1)
        else:
            resultList.append(0)
         
    return (list(map(operator.sub, resultList, groundTrueList))).count(0)/(validationNum*1.0)


In [71]:
docList = []
bagOfWordsDocList = []
classList = []
for i in range(1,26):
    wordList = textParse(open('email/spam/%d.txt' %i).read())
    docList.append(wordList)
    classList.append(1)
    wordList = textParse(open('email/ham/%d.txt' %i).read())
    docList.append(wordList)
    classList.append(0)

vocabulary = vocabList(docList)
bagOfWordsDocList = [bagOfWords2Vec(vocabulary, doc) for doc in docList]


In [72]:
print(naiveBayesClassifier(bagOfWordsDocList, classList))

11.0
459.0
518.0
0.5
[-6.12905021 -6.12905021 -6.12905021 -6.12905021 -4.74275585 -6.12905021
 -6.12905021 -6.12905021 -6.12905021 -3.73115494 -6.12905021 -5.03043792
 -6.12905021 -6.12905021 -6.12905021 -6.12905021 -6.12905021 -6.12905021
 -6.12905021 -5.03043792 -6.12905021 -6.12905021 -6.12905021 -6.12905021
 -6.12905021 -5.03043792 -6.12905021 -6.12905021 -6.12905021 -6.12905021
 -6.12905021 -6.12905021 -6.12905021 -6.12905021 -5.03043792 -6.12905021
 -6.12905021 -6.12905021 -6.12905021 -5.43590303 -6.12905021 -5.43590303
 -6.12905021 -6.12905021 -6.12905021 -6.12905021 -4.33729074 -6.12905021
 -6.12905021 -5.43590303 -5.43590303 -6.12905021 -6.12905021 -5.03043792
 -6.12905021 -6.12905021 -5.43590303 -6.12905021 -6.12905021 -5.43590303
 -5.43590303 -6.12905021 -6.12905021 -4.74275585 -6.12905021 -6.12905021
 -6.12905021 -4.33729074 -4.74275585 -6.12905021 -5.43590303 -6.12905021
 -6.12905021 -5.03043792 -5.43590303 -6.12905021 -5.03043792 -6.12905021
 -6.12905021 -6.12905021 -6.12