In [6]:
import math
import re
import sys
import matplotlib
import os

In [7]:
# Directory of training files handed to word_count_directory(), which labels
# each file by the 'spam' / 'ham' substring found in its path.
directory = "train/"

In [8]:
# Per-class word counts, filled in place by word_count_directory().
# Every vocabulary word is present in BOTH dictionaries (count 0 if unseen in
# that class) so later lookups never miss.
hamDictionary = {}
spamDictionary = {}
# Every distinct word seen in any labelled training file.
vocabulary = set()


def _label_from_path(file_path):
    """Return 'spam', 'ham' or '' (unlabelled) for a training file path.

    The file name is consulted first so that a parent directory whose name
    happens to contain 'spam' or 'ham' cannot override the file's own label.
    The directory part is only used when the file name carries no label.
    Within one component 'spam' is tested before 'ham'.
    """
    directory_part, file_name = os.path.split(file_path)
    for candidate in (file_name, directory_part):
        if 'spam' in candidate:
            return 'spam'
        if 'ham' in candidate:
            return 'ham'
    return ''


def word_count_directory(directory):
    """Accumulate per-class word counts from every labelled file in `directory`.

    Words are maximal runs of ASCII letters, lower-cased. Counts are added to
    the module-level `hamDictionary` / `spamDictionary` and new words to
    `vocabulary`; calling the function again adds to the existing counts.

    Args:
        directory: path of a flat directory containing the training files.

    Returns:
        (noOfSpamFiles, noOfHamFiles): number of files counted for each class.

    Raises:
        OSError: if `directory` cannot be listed or a file cannot be opened.
    """
    noOfSpamFiles = 0
    noOfHamFiles = 0
    # Sorted so dictionary insertion order does not depend on os.listdir().
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as text.
        if not os.path.isfile(file_path):
            continue

        file_type = _label_from_path(file_path)
        if file_type == 'spam':
            noOfSpamFiles += 1
            own_counts, other_counts = spamDictionary, hamDictionary
        elif file_type == 'ham':
            noOfHamFiles += 1
            own_counts, other_counts = hamDictionary, spamDictionary
        else:
            # Unlabelled files contribute nothing, so do not even read them.
            continue

        # latin-1 maps every byte to a character, so decoding never fails.
        with open(file_path, encoding='latin-1') as infile:
            for line in infile:
                for word in re.findall('[a-zA-Z]+', line.lower()):
                    own_counts[word] = own_counts.get(word, 0) + 1
                    # Keep both dictionaries covering the whole vocabulary.
                    other_counts.setdefault(word, 0)
                    vocabulary.add(word)
    return noOfSpamFiles, noOfHamFiles

In [9]:
# Count the training files per class and derive the class priors
# P(spam) and P(ham) as the fraction of files in each class.
spam_total, ham_total = word_count_directory(directory)
total = spam_total + ham_total
if total == 0:
    # Without this guard the divisions below fail with a bare ZeroDivisionError.
    raise ValueError("No spam/ham training files found in {!r}".format(directory))
priorProbOfSpam = spam_total / total
priorProbOfHam = ham_total / total

In [10]:
def create_model(vocabulary, hamDictionary, spamDictionary, delta=0.5, model_path="model.txt"):
    """Turn raw word counts into smoothed conditional probabilities and save them.

    For every word (in sorted order) the additive-smoothed probability
    (count + delta) / (total_count + delta * |vocabulary|) is computed for each
    class and one line is written to `model_path`:
    index, word, ham count, P(word|ham), spam count, P(word|spam),
    separated by two spaces, probabilities with 8 decimals.

    Side effect: `hamDictionary` and `spamDictionary` are overwritten IN PLACE,
    each vocabulary word's count being replaced by its probability. Calling
    this twice on the same dictionaries therefore gives wrong results; rebuild
    the counts first.

    Args:
        vocabulary: iterable of distinct words.
        hamDictionary: word -> count in ham; a missing word counts as 0.
        spamDictionary: word -> count in spam; a missing word counts as 0.
        delta: smoothing value added to every count.
        model_path: file that receives the model (overwritten).

    Returns:
        None.

    Raises:
        ZeroDivisionError: if a class has no counts at all and delta is 0.
        OSError: if `model_path` cannot be written.
    """
    # Sorted so the line numbering in the model file is stable.
    ordered_words = sorted(vocabulary)
    smoothed_N = delta * len(ordered_words)
    # Denominators must be taken before any count is overwritten below.
    ham_denominator = sum(hamDictionary.values()) + smoothed_N
    spam_denominator = sum(spamDictionary.values()) + smoothed_N

    # `with` guarantees the file is closed even if a write or lookup fails.
    with open(model_path, "w") as model_file:
        for index, word in enumerate(ordered_words, start=1):
            freq_in_ham = hamDictionary.get(word, 0)
            freq_in_spam = spamDictionary.get(word, 0)
            c_p_in_ham = (freq_in_ham + delta) / ham_denominator
            c_p_in_spam = (freq_in_spam + delta) / spam_denominator

            # Later cells read the probabilities back from these dictionaries.
            hamDictionary[word] = c_p_in_ham
            spamDictionary[word] = c_p_in_spam

            model_file.write(
                "{}  {}  {}  {:.8f}  {}  {:.8f}\n".format(
                    index, word, freq_in_ham, c_p_in_ham, freq_in_spam, c_p_in_spam
                )
            )
    
# Writes model.txt and, as a side effect, replaces the counts stored in
# hamDictionary / spamDictionary with conditional probabilities. Re-running
# this line without first rebuilding the counts would feed probabilities back
# in as if they were counts.
create_model(vocabulary,hamDictionary,spamDictionary)

In [11]:
# Classify every file in test/ with the trained model, tally the confusion
# matrix and write one line per file to result.txt:
#   <counter> <file name> <predicted> <ham score> <spam score> <actual> <right|wrong>
# hamDictionary / spamDictionary must already hold the probabilities stored by
# create_model(); log10 of a raw zero count would raise ValueError.
vocab_test = []
logOfHam = math.log10(priorProbOfHam)
logOfSpam = math.log10(priorProbOfSpam)

truePositive = 0   # correct Ham -> result Ham
trueNegative = 0    # correct Spam -> result Spam
falsePositive = 0    # correct Spam -> result Ham
falseNegative = 0    # correct Ham -> result Spam

# Sorted so the numbering in result.txt is reproducible; sub-directories are
# skipped because they cannot be opened as text files.
filelist = sorted(
    path
    for path in (os.path.join("test/", name) for name in os.listdir("test/"))
    if os.path.isfile(path)
)
tempCounter = 0

# Mode "w" already truncates; `with` closes the file even if scoring fails.
with open("result.txt", "w") as f:
    for tempCounter, file_path in enumerate(filelist, start=1):
        fileName = os.path.basename(file_path)
        scoreLogHam = logOfHam     # score for ham, starts at the log prior
        scoreLogSpam = logOfSpam    # score for spam, starts at the log prior

        # Any file whose path lacks "test-ham" is treated as spam.
        if "test-ham" in file_path:
            correctClassification = "ham"
        else:
            correctClassification = "spam"

        # Same tokenisation as training: lower-cased runs of ASCII letters.
        vocab_test = []
        with open(file_path, encoding='latin-1') as infile:
            for line in infile:
                vocab_test.extend(re.findall('[a-zA-Z]+', line.lower()))

        # Words never seen in training are ignored.
        for word in vocab_test:
            if word in vocabulary:
                scoreLogHam += math.log10(hamDictionary[word])
                scoreLogSpam += math.log10(spamDictionary[word])

        # Ties go to spam.
        if scoreLogHam > scoreLogSpam:
            predictedClassification = "ham"
        else:
            predictedClassification = "spam"

        if correctClassification == predictedClassification:
            lable = "right"
        else:
            lable = "wrong"

        if correctClassification == "ham" and predictedClassification == "ham":
            truePositive += 1
        elif correctClassification == "spam" and predictedClassification == "spam":
            trueNegative += 1
        elif correctClassification == "spam" and predictedClassification == "ham":
            falsePositive += 1
        else:
            falseNegative += 1

        f.write(
            "{} {} {} {:.8f} {:.8f} {} {}\n".format(
                tempCounter,
                fileName,
                predictedClassification,
                scoreLogHam,
                scoreLogSpam,
                correctClassification,
                lable,
            )
        )

In [12]:
# Confusion-matrix counts, one per line:
# ham->ham, spam->spam, spam->ham, ham->spam.
print(truePositive, trueNegative, falsePositive, falseNegative, sep="\n")

394
336
64
6
