<font size = "10"> Spam Detection in emails

<font size = "3"> Authors: M. Hooghiemstra and F. Nouwens

<font size="5"> Reading in the data </font>

In [1]:
import os
import re
import nltk
import random
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# -------------------TRAINING DATA----------------------------
#Reading in the spam and ham files
def readTrain():
    pathSpam = './trainAnnotated/spam/'

    spamFiles = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(pathSpam):
        for file in f:
            if '.txt' in file:
                spamFiles.append(os.path.join(r, file))

    spam = [0 for j in range(0,len(spamFiles))]
    for index,f in enumerate(spamFiles):
        #print(f)
        spam[index] = open(f, encoding='latin-1').read().splitlines()

    #print(spam)
    pathHam = './trainAnnotated/ham/'

    hamFiles = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(pathHam):
        for file in f:
            if '.txt' in file:
                hamFiles.append(os.path.join(r, file))

    ham = [0 for j in range(0,len(hamFiles))]
    for index,f in enumerate(hamFiles):
        ham[index] = open(f,encoding='latin-1').read().splitlines()
    
    spamList, hamList = optimalDataFormat(spam, ham)
    
    return (spamList, hamList)


In [3]:
#-----------------------------TEST DATA----------------------------------
#Reading in the spam and ham files
def readTest():
    pathSpam = './testAnnotated/spam/'

    spamFiles = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(pathSpam):
        for file in f:
            if '.txt' in file:
                spamFiles.append(os.path.join(r, file))

    spam = [0 for j in range(0,len(spamFiles))]
    for index,f in enumerate(spamFiles):
        #print(f)
        spam[index] = open(f, encoding='latin-1').read().splitlines()

    #print(spam)
    pathHam = './testAnnotated/ham/'

    hamFiles = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(pathHam):
        for file in f:
            if '.txt' in file:
                hamFiles.append(os.path.join(r, file))

    ham = [0 for j in range(0,len(hamFiles))]
    for index,f in enumerate(hamFiles):
        ham[index] = open(f,encoding='latin-1').read().splitlines()
    
    spamList, hamList = optimalDataFormat(spam, ham)
    
    return (spamList, hamList)


<font size="5"> Pre-processing the data </font>

In [4]:
def createDict(text):
    dictionary = dict([(word,True) for word in text]) 
    return dictionary

def removeStopWords(text):
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english') 
    #print(stop_words)
    text = [word for word in text if word not in stop_words]
    return text

def removePunc(text):
    punc = r"\\|\/|\%|\$|\-|\^|\[|\]|\<|\>|\&|\(|\)"        
    text = [word for word in text if not(re.match(punc,word))]
    return text

In [5]:
def optimalDataFormat(spam, ham):
    spamList= []
    hamList = []

    for i,file in enumerate(spam):
        tempSent = []
        for j,sentence in enumerate(file):
            sents = []
            #print(spam[i][j])
            sentences = sent_tokenize(spam[i][j])
    #         print('the sentences:',sentences)
            for sent in sentences:
    #             print('sentence:',sent)
                words = word_tokenize(sent)
    #             print('words tokenized:',words)
                words = removeStopWords(words)
                words = removePunc(words)
                sents += words
    #             print('preprocessed words:',sents)
    #             print("end of for loop sent in sentences")
            tempSent += sents
    #         print('sentences in file',tempSent)
        spamList.append(tempSent)
    #     print('spamlist:',spamList)

    for i,file in enumerate(ham):
        tempSent = []
        for j,sentence in enumerate(file):
            sents = []
            #print(ham[i][j])
            sentences = sent_tokenize(ham[i][j])
    #         print('the sentences:',sentences)
            for sent in sentences:
    #             print('sentence:',sent)
                words = word_tokenize(sent)
    #             print('words tokenized:',words)
                words = removeStopWords(words)
                words = removePunc(words)
                sents += words
    #             print('preprocessed words:',sents)
    #             print("end of for loop sent in sentences")
            tempSent += sents
    #         print('sentences in file',tempSent)
        hamList.append(tempSent)
    #     print('hamlist:',hamList)
    return (spamList, hamList)

In [6]:
def createTotalSet(spamList, hamList):
    hamList2 = []
    spamList2 = []
    for mail in hamList:
        hamList2.append((createDict(mail),"ham"))

    for mail in spamList:
        spamList2.append((createDict(mail),"spam"))


    #We combine the two dataset lists to eventually train the classifiers
    totalTrain = hamList2 + spamList2

    #To get unbiased results, we shuffle the dataset
    random.shuffle(totalTrain)
    return totalTrain

In [26]:
def createTotalSetNoDict(spamList, hamList):
    hamList2 = []
    spamList2 = []
    for i,message in enumerate(hamList):
        hamList2.append((hamList[i],0))

    for i,message in enumerate(spamList):
        hamList2.append((spamList[i],1))
        

    #print('hamList', hamList2[1])
    #We combine the two dataset lists to eventually train the classifiers
    totalTrain = hamList2 + spamList2

    #To get unbiased results, we shuffle the dataset
    random.shuffle(totalTrain)
    return totalTrain

In [22]:
#Running the whole code to obtain the total train data:
spamList, hamList = readTrain()
totalTrain = createTotalSet(spamList,hamList)

In [23]:
spamList, hamList = readTest()
totalTest = createTotalSet(spamList, hamList)

In [27]:
print('create train set')
spamList, hamList = readTrain()
totalTrainNoDict = createTotalSetNoDict(spamList,hamList)
print('create test set')
spamList, hamList = readTest()
totalTestNoDict = createTotalSetNoDict(spamList, hamList)

create train set
create test set


<font size="5"> Naïve Bayes Classifier </font>

In [28]:
#NAIVE BAYES CLASSIFIER
def NaiveBayes(trainX,testX):
    from sklearn.naive_bayes import MultinomialNB  
    from sklearn.metrics import precision_recall_fscore_support
    from sklearn.metrics import accuracy_score
    from nltk.classify import NaiveBayesClassifier
    
    labelsTest = []
    testSet = []
    for item in testX:
       # print(item[1])
        testSet.append(item[0])
        labelsTest.append(item[1]) 
        
    labelsTrain = []
    trainSet = []
    for item in trainX:
       # print(item[1])
        trainSet.append((item[0],item[1]))
        labelsTrain.append(item[1]) 
    
#     print(trainSet[0])

    NBmodel1 = MultinomialNB()
#     for i, message in enumerate(trainSet):
#         NBmodel1.fit(trainSet[i],labelsTrain)
    
    #NBmodel1.fit(trainSet,labelsTrain)
#     print('trainX',trainX[1])
    NBmodel2 = NaiveBayesClassifier.train(trainX)
    

    classifications = []
    for i,message in enumerate(testSet):
#         print('message is:', message)
#         classification = NBmodel1.predict(message)
        classification = NBmodel2.classify(message)
#         print('Message ', i, 'is:', classification)
        classifications.append(classification)
#         print('classification',classifications[i])
#         print('Actual value:', testX[i][1])

        
#     accuracy = nltk.classify.util.accuracy(NBmodel2, testSet)
#     print('accuracy:', accuracy*100,'%')
#     prediction = NBmodel1.predict(testSet)
    accuracy = accuracy_score(labelsTest, classifications) 
    print("accuracy is ", accuracy*100, "%")
    print("              Precision          Recall            F-score")
    #Code taken from https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    print("Macro", precision_recall_fscore_support(labelsTest, classifications, average='macro'))
    print("Micro",precision_recall_fscore_support(labelsTest, classifications, average='micro'))
    
NaiveBayes(totalTrain,totalTest)

accuracy is  97.46666666666667 %
              Precision          Recall            F-score
Macro (0.9756441682818379, 0.9746666666666666, 0.9746536442723461, None)
Micro (0.9746666666666667, 0.9746666666666667, 0.9746666666666667, None)


<font size="5"> Logistic Regression Classifier </font>

In [29]:
def LogisticRegression(trainX, testX):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import precision_recall_fscore_support
    from sklearn.metrics import accuracy_score

    labelsTrainLR = []
    trainSetLR = []
    for item in trainX:
       # print(item[1])
        trainSetLR.append((item[0]))
        labelsTrainLR.append(item[1]) 
    
    print(trainSetLR[0])
    labelsTestLR = []
    testSetLR = []
    for item in testX:
       # print(item[1])
        testSetLR.append(item[0])
        labelsTestLR.append(item[1]) 
    
    LR = LogisticRegression(solver='liblinear', penalty='l1',class_weight='balanced')
    LRmodel = LR.fit(trainSetLR,labelsTrainLR)
    predictions = LR.predict(testX)
        
    classificationsLR = []
    for i,message in enumerate(testSetLR):
#         print('message is:', message)
#         classification = NBmodel1.predict(message)
        classificationLR = LR.classify(message)
#         print('Message ', i, 'is:', classification)
        classificationsLR.append(classificationLR)
#         print('classification',classifications[i])
#         print('Actual value:', testX[i][1])

        
#     accuracy = nltk.classify.util.accuracy(NBmodel2, testSet)
#     print('accuracy:', accuracy*100,'%')
#     prediction = NBmodel1.predict(testSet)
    accuracy = accuracy_score(labelsTestLR, classificationsLR) 
    print("accuracy is ", accuracy*100, "%")
    print("              Precision          Recall            F-score")
    #Code taken from https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    print("Macro", precision_recall_fscore_support(labelsTest, classifications, average='macro'))
    print("Micro",precision_recall_fscore_support(labelsTest, classifications, average='micro'))
    

# print(totalTrainNoDict[1])
LogisticRegression(totalTrainNoDict,totalTestNoDict)

    

['Subject', ':', 'new', 'hire', 'dinner', 'rsvps', "'", 'forget', 'let', 'know', 'interested', 'attending', 'new', 'hire', 'dinner', '6', 'pm', 'thursday', ',', 'april', '26', 'th', '!', 'held', 'oritalia', 'westin', 'hotel', '.', 'please', 'indicate', 'meal', 'selection', 'meat', 'beef', ',', 'veggie', 'pasta', '.', 'need', 'submit', 'final', 'entree', 'numbers', 'thursday', 'morning', '.', 'thanks', ',', 'grace', 'x', '8321']


ValueError: setting an array element with a sequence.

<font size="5"> Custom Word Filtering Classifier </font>

<font size="5"> Evaluation </font>