In [1]:
from numpy import *
import re

In [2]:
def createVocabList(dataset):
    """Build the vocabulary: every distinct token that occurs in ``dataset``.

    Args:
        dataset: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: The unique tokens in order of first appearance. The order is
        reproducible for a given dataset, unlike iteration over a set of
        strings, which can change between interpreter sessions.
    """
    # dict keys are unique and keep insertion order, so this de-duplicates
    # without inheriting the arbitrary ordering of a set.
    return list(dict.fromkeys(word for document in dataset for word in document))

In [3]:
def wordsToVec(vocab, inputset):
    """Convert one document into a set-of-words (presence/absence) vector.

    Args:
        vocab: List of vocabulary tokens; position ``i`` of the result
            corresponds to ``vocab[i]``.
        inputset: Iterable of tokens from a single document.

    Returns:
        list[int]: A list of ``len(vocab)`` entries holding 1 where the
        corresponding vocabulary token occurs in ``inputset`` and 0 elsewhere.
        Tokens missing from ``vocab`` are reported with ``print`` and skipped.
    """
    # Build the token -> index map once rather than scanning the vocabulary
    # list twice (membership test + index lookup) for every input token.
    # setdefault keeps the first position of a repeated token, which is what
    # list.index would return.
    position = {}
    for idx, token in enumerate(vocab):
        position.setdefault(token, idx)

    vector = [0] * len(vocab)
    for word in inputset:
        idx = position.get(word)
        if idx is None:
            print("Word not found in vocabulary: ", word)
        else:
            vector[idx] = 1
    return vector

In [4]:
def trainNaiveBayes(training_mat, labels):
    """Estimate per-class log word probabilities and the class-1 prior.

    Args:
        training_mat: 2-D array-like with one row per document and one column
            per vocabulary word.
        labels: Sequence of class labels aligned with the rows. A row is
            counted towards class 1 when its label equals 1 and towards
            class 0 otherwise.

    Returns:
        tuple: ``(p0vect, p1vect, p_spam)`` where ``p0vect`` / ``p1vect`` are
        arrays of log(word count / total count) for class 0 / class 1, and
        ``p_spam`` is ``sum(labels)`` divided by the number of documents.
    """
    docs = array(training_mat)
    doc_count = len(docs)
    vocab_size = len(docs[0])
    prior_class1 = sum(labels) / float(doc_count)

    # Row 0 accumulates class-0 word counts, row 1 class-1 word counts.
    # Counts start at one and totals at two so that a word never seen in a
    # class does not produce log(0).
    word_counts = ones((2, vocab_size))
    totals = [2.0, 2.0]

    for i, row in enumerate(docs):
        cls = 1 if labels[i] == 1 else 0
        word_counts[cls] += row
        totals[cls] += sum(row)

    log_p1 = log(word_counts[1] / totals[1])
    log_p0 = log(word_counts[0] / totals[0])
    return log_p0, log_p1, prior_class1


In [5]:
def classify(vec_to_classify, p0vec, p1vec, pclass1):
    """Pick the more likely class for a document vector.

    Args:
        vec_to_classify: Array of word indicators/counts for the document.
        p0vec: Array of per-word log probabilities for class 0.
        p1vec: Array of per-word log probabilities for class 1.
        pclass1: Prior probability of class 1.

    Returns:
        int: 1 when the class-1 log score is strictly greater than the
        class-0 log score, otherwise 0 (ties go to class 0).
    """
    # Work in log space: the score is the sum of the selected word
    # log-probabilities plus the log prior of the class.
    score_class1 = sum(vec_to_classify * p1vec) + log(pclass1)
    score_class0 = sum(vec_to_classify * p0vec) + log(1.0 - pclass1)
    return 1 if score_class1 > score_class0 else 0

In [6]:
def textParse(given_string):
    """Tokenise text into lowercase words longer than two characters.

    The string is split on runs of non-word characters; pieces of length
    two or less (including the empty strings the split can leave at either
    end) are dropped.

    Args:
        given_string: Text to tokenise.

    Returns:
        list[str]: The surviving tokens, lowercased, in their original order.
    """
    kept = []
    for piece in re.split(r'\W+', given_string):
        if len(piece) <= 2:
            continue
        kept.append(piece.lower())
    return kept

In [7]:
def spamTest(data_dir="email", num_per_class=25, num_test=10):
    """Run one random hold-out evaluation of the naive Bayes spam classifier.

    Reads ``{data_dir}/spam/{i}.txt`` and ``{data_dir}/ham/{i}.txt`` for
    ``i = 1 .. num_per_class``, holds out ``num_test`` randomly chosen
    documents, trains on the rest, then prints every misclassified held-out
    document followed by the error rate on the held-out set.

    Args:
        data_dir: Directory containing the ``spam`` and ``ham`` sub-folders.
        num_per_class: Number of numbered ``.txt`` files to read per class.
        num_test: Number of documents to hold out for testing.

    Raises:
        ValueError: If ``num_test`` would leave the test set or the training
            set empty.
    """
    total_docs = 2 * num_per_class
    if not 0 < num_test < total_docs:
        raise ValueError(
            f"num_test must be between 1 and {total_docs - 1}, got {num_test}"
        )

    docList = []
    classList = []
    for i in range(1, num_per_class + 1):
        # Spam is appended before ham for each i, so even document indices
        # are spam (label 1) and odd indices are ham (label 0).
        for folder, label in (("spam", 1), ("ham", 0)):
            # The context manager closes the file even if parsing raises.
            with open(f"{data_dir}/{folder}/{i}.txt") as email_file:
                docList.append(textParse(email_file.read()))
            classList.append(label)

    vocab_list = createVocabList(docList)

    # Move num_test randomly chosen document indices from training to test.
    trainingSet = list(range(total_docs))
    testSet = []
    for _ in range(num_test):
        random_ind = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(random_ind))

    training_matrix = [wordsToVec(vocab_list, docList[i]) for i in trainingSet]
    training_class = [classList[i] for i in trainingSet]
    p0V, p1V, pSpam = trainNaiveBayes(array(training_matrix), array(training_class))

    error = 0
    for i in testSet:
        word_vector = wordsToVec(vocab_list, docList[i])
        pred = classify(array(word_vector), p0V, p1V, pSpam)
        if pred != classList[i]:
            print(f"Doc {i} misclassified: predicted {pred}, actual {classList[i]}")
            error += 1
    print(f"The error rate is {float(error)/len(testSet)}")
        

In [8]:
# spamTest() holds out documents at random, so the reported error rate can differ between runs.
spamTest()

Doc 32 misclassified: predicted 0, actual 1
The error rate is 0.1


In [11]:
# Repeated run: a new random hold-out split is drawn on every call.
spamTest()

Doc 48 misclassified: predicted 0, actual 1
The error rate is 0.1


In [10]:
# Another run on a fresh random hold-out split.
spamTest()

The error rate is 0.0
