In [1]:
import re
import csv
import numpy as np

In [2]:
def Pre_Process_Data(file_name):
    """Load the label index and turn every referenced email into a word list.

    Each non-blank line of ``file_name`` must look like ``<label> ../<path>``:
    the text before the first `` ../`` is the label, the text after it is the
    path of the email file (opened relative to the current working directory).

    Parameters
    ----------
    file_name : str
        Path of the index file.

    Returns
    -------
    D : list of list of str
        One entry per email that produced at least one word: its distinct,
        lowercased, purely alphabetic words in sorted order.
    omega : list of str
        The label of each entry in ``D``, in the same order.

    Raises
    ------
    ValueError
        If a non-blank index line does not contain the `` ../`` separator.
    """
    separator = ' ../'

    # Read the index: (label, path) per non-blank line.
    entries = []
    with open(file_name, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline at end of file)
            label, found, path = line.partition(separator)
            if not found:
                raise ValueError(
                    f"{file_name}:{line_number}: expected '<label> ../<path>', got {line!r}"
                )
            entries.append((label, path))

    # Compile once instead of once per email.
    delimiters = re.compile(r'[ ,.\n\s]+')  # RegEx to split words
    alphabetic = re.compile('^[a-zA-Z]+$')  # keeps tokens made of a-zA-Z only

    D = []  # List of words in list
    omega = []  # labels
    for label, path in entries:
        with open(path, errors='ignore') as file:  # Data that cant be decoded are ignored
            tokens = delimiters.split(file.read())
        # Sorting makes the result reproducible; plain set iteration order
        # for strings can change from one interpreter run to the next.
        words = sorted({token.lower() for token in tokens if alphabetic.match(token)})
        if words:  # emails without any usable word are dropped together with their label
            D.append(words)
            omega.append(label)
    return D, omega

In [3]:
def Train_Test_Split(D, omega, train_fraction=0.7, seed=None):
    """Shuffle the examples and split them into a training and a testing part.

    Parameters
    ----------
    D : sequence
        The examples (one word list per email).
    omega : sequence
        The label of each example, aligned with ``D``.
    train_fraction : float, optional
        Share of the examples placed in the training part (default 0.7,
        i.e. a 70-30 split). Must lie in [0, 1].
    seed : int or None, optional
        When given, the shuffle is drawn from a dedicated generator seeded
        with this value, so the split is reproducible. When ``None`` the
        shuffle uses NumPy's global random state.

    Returns
    -------
    D_train, omega_train, D_test, omega_test : list
        The shuffled examples and their labels for each part.

    Raises
    ------
    ValueError
        If ``D`` and ``omega`` differ in length or ``train_fraction`` is
        outside [0, 1].
    """
    if len(D) != len(omega):
        raise ValueError(f'D has {len(D)} examples but omega has {len(omega)} labels')
    if not 0 <= train_fraction <= 1:
        raise ValueError(f'train_fraction must be within [0, 1], got {train_fraction}')

    total = len(omega)
    if seed is None:
        permute = np.random.permutation(total)  # Random index from the global state
    else:
        permute = np.random.RandomState(seed).permutation(total)
    split_index = round(total * train_fraction)

    train_ids = permute[:split_index]
    test_ids = permute[split_index:]
    D_train = [D[i] for i in train_ids]
    omega_train = [omega[i] for i in train_ids]
    D_test = [D[i] for i in test_ids]
    omega_test = [omega[i] for i in test_ids]
    return D_train, omega_train, D_test, omega_test

In [4]:
def count_vocab(vocab, X):
    """Count how often each vocabulary word appears in ``X``.

    Parameters
    ----------
    vocab : array-like of str
        The vocabulary; the result follows its order.
    X : array-like of str
        Flat collection of words to count.

    Returns
    -------
    numpy.ndarray of int
        ``result[i]`` is the number of occurrences of ``vocab[i]`` in ``X``
        (0 when the word is absent). Words of ``X`` that are not in the
        vocabulary are ignored.
    """
    vocab = np.asarray(vocab)
    word_counts = np.zeros(len(vocab), dtype=int)
    words, counts = np.unique(X, return_counts=True)  # Unique words in data (sorted) and their counts
    if words.size == 0 or vocab.size == 0:
        return word_counts

    # `words` is sorted, so every vocabulary entry can be located with one
    # binary search instead of scanning the whole vocabulary once per word.
    slots = np.searchsorted(words, vocab)
    slots = np.minimum(slots, words.size - 1)  # entries sorting after the last word would index out of range
    found = words[slots] == vocab  # searchsorted returns an insertion point, so confirm the exact match
    word_counts[found] = counts[slots[found]]
    return word_counts

In [5]:
def Train_Naive_Bayes(D_train, omega_train, vocab=None):
    """Collect the statistics the classifier needs from the training data.

    The likelihoods themselves are computed in the predict function, so the
    smoothing value can be changed without repeating this step.

    Parameters
    ----------
    D_train : sequence of sequence of str
        Training documents, each given as a list of words.
    omega_train : sequence of str
        Label of each document; the labels counted are 'spam' and 'ham'.
    vocab : array-like of str, optional
        Vocabulary to count against. When omitted, the sorted set of all
        words in ``D_train`` is used.

    Returns
    -------
    tuple
        ``(prior_spam, prior_ham, words_spam, words_ham, n_spam, n_ham, vocab)``
        where the priors are the class shares among the training documents,
        ``words_*`` hold the number of occurrences of each vocabulary word in
        the documents of that class (a document count when every document
        lists a word at most once), and ``n_*`` are the document counts per
        class.
    """
    omega = np.array(omega_train, dtype=str)  # Training labels
    N = omega.shape[0]  # Number of training examples

    # Get the unique words or vocabulary of the training set
    if vocab is None:
        vocab = np.array(sorted({word for doc in D_train for word in doc}), dtype=str)
    elif not isinstance(vocab, np.ndarray):
        # count_vocab relies on element-wise comparison, which needs an array
        vocab = np.array(vocab, dtype=str)
    V = len(vocab)

    # Compute prior probabilities
    is_spam = omega == 'spam'
    is_ham = omega == 'ham'
    n_spam = np.sum(is_spam)
    n_ham = np.sum(is_ham)
    prior_spam = n_spam/N
    prior_ham = n_ham/N

    # Flatten the words of each class into one 1D array. Building the arrays
    # from plain lists also works when a class has no training document,
    # where concatenating an empty selection would raise.
    X_spam = np.array([word for doc, hit in zip(D_train, is_spam) if hit for word in doc], dtype=str)
    X_ham = np.array([word for doc, hit in zip(D_train, is_ham) if hit for word in doc], dtype=str)
    words_spam = count_vocab(vocab, X_spam)
    words_ham = count_vocab(vocab, X_ham)
    print('V:', V)
    print('Prior Probability Spam:', prior_spam)
    print('Prior Probability Ham:', prior_ham)
    return prior_spam, prior_ham, words_spam, words_ham, n_spam, n_ham, vocab


In [6]:
def Test_Vocab(D_test, vocab):
    """Mark, for every test document, which vocabulary words it contains.

    Returns a list with one boolean array per document in ``D_test``; each
    array is aligned with ``vocab`` and is True where that vocabulary word
    appears in the document.
    """
    return [np.isin(vocab, document) for document in D_test]

In [7]:
def Predict_Naive_Bayes(D_test_bool, omega_test, prior_spam, prior_ham, words_spam, words_ham, n_spam, n_ham, vocab, smoothing=1):
    """Predict 'spam' or 'ham' for every test document.

    Parameters
    ----------
    D_test_bool : sequence of boolean arrays
        One array per document, aligned with ``vocab``; True where the
        vocabulary word is present in the document.
    omega_test : sequence of str
        True labels; returned as an array for convenience.
    prior_spam, prior_ham : float
        Class prior probabilities.
    words_spam, words_ham : numpy.ndarray
        Per-vocabulary-word counts for each class.
    n_spam, n_ham : int
        Number of training documents of each class.
    vocab : sequence
        The vocabulary; only its length is used here.
    smoothing : float, optional
        Additive (lambda) smoothing applied to the word counts.

    Returns
    -------
    actual : numpy.ndarray of str
        ``omega_test`` as an array.
    predictions : numpy.ndarray of str
        Predicted label per document; 'spam' only when its log-probability is
        strictly larger than the one for 'ham'.
    """
    V = len(vocab)
    # NOTE(review): the denominators add smoothing*V. If the word counts are
    # per-document presence counts, the conventional add-lambda denominator
    # for a present/absent feature is n + 2*smoothing - confirm this is the
    # intended model.
    likelihood_spam = (words_spam + smoothing)/(n_spam + smoothing*V)
    likelihood_ham = (words_ham + smoothing)/(n_ham + smoothing*V)
    # Log likelihood of a word being present (_1) or absent (_0)
    log_spam_1 = np.log(likelihood_spam)
    log_spam_0 = np.log(1 - likelihood_spam)
    log_ham_1 = np.log(likelihood_ham)
    log_ham_0 = np.log(1 - likelihood_ham)
    # The priors do not depend on the document, so take their logs once.
    log_prior_spam = np.log(prior_spam)
    log_prior_ham = np.log(prior_ham)

    actual = np.array(omega_test, dtype=str)
    # Collect the labels in a list and convert once at the end; growing an
    # array with np.concatenate inside the loop copies it on every iteration.
    labels = []
    for words in D_test_bool:
        # np.where acts as vectorized 'if' statement where elements in log_1 or log_0 is used depending on the words present in the example
        log_prob_spam = np.sum(np.where(words == 1, log_spam_1, log_spam_0)) + log_prior_spam
        log_prob_ham = np.sum(np.where(words == 1, log_ham_1, log_ham_0)) + log_prior_ham
        labels.append('spam' if log_prob_spam > log_prob_ham else 'ham')  # Gets the maximum log likelihood

    if labels:
        # Fixed width of 4 keeps the dtype independent of which labels occur.
        predictions = np.array(labels, dtype='<U4')
    else:
        predictions = np.empty((0,), dtype=str)
    return actual, predictions

In [8]:
def Compute_Metrics(actual, predictions):
    """Print and return accuracy, precision and recall, with 'spam' as the positive class.

    ``actual`` and ``predictions`` are label arrays of equal shape. The
    three metrics are printed and returned as ``(Accuracy, Precision, Recall)``.
    """
    def tally(truth_mask, outcome_mask):
        # Number of positions where both conditions hold.
        return np.sum(np.all((truth_mask, outcome_mask), axis=0))

    is_spam = actual == 'spam'
    is_ham = actual == 'ham'
    agree = actual == predictions
    differ = actual != predictions

    TP = tally(is_spam, agree)   # Spam, classified as Spam
    TN = tally(is_ham, agree)    # Ham, classified as Ham
    FP = tally(is_ham, differ)   # Ham, classified as Spam
    FN = tally(is_spam, differ)  # Spam, classified as Ham

    Accuracy = (TP + TN) / (TP + TN + FP + FN)
    Precision = TP / (TP + FP)
    Recall = TP / (TP + FN)

    print('Accuracy:', Accuracy)
    print('Precision:', Precision)
    print('Recall:', Recall)
    return Accuracy, Precision, Recall

In [9]:
# Build the per-email word lists and labels from the index file 'labels',
# then shuffle and split them into training and testing parts.
# NOTE(review): the shuffle uses NumPy's global random state and no seed is
# set in the visible cells, so the split (and every number below) changes
# from run to run.
D, omega = Pre_Process_Data('labels')
D_train, omega_train, D_test, omega_test = Train_Test_Split(D, omega)

In [10]:
# Gather priors and per-class word counts from the training part. No `vocab`
# is passed, so the vocabulary is built from D_train; it is the last element
# of the returned tuple.
training_output = Train_Naive_Bayes(D_train, omega_train)

V: 82977
Prior Probability Spam: 0.6581303116147309
Prior Probability Ham: 0.3418696883852691


In [11]:
# Precompute once which vocabulary words each test email contains
# (training_output[-1] is the vocabulary). The same boolean list is reused
# for every smoothing value below, which speeds up those predictions.
D_test_bool = Test_Vocab(D_test, training_output[-1])

In [12]:
# One row per smoothing value evaluated in the cells below (rows 0-4).
Metrics = np.zeros((5,3)) # [Accuracy, Precision, Recall]

---
$\lambda = 2$

In [13]:
# Smoothing (lambda) = 2; metrics stored in row 0.
actual_0, predictions_0 = Predict_Naive_Bayes(D_test_bool, omega_test, *training_output, smoothing=2)
Metrics[0] = Compute_Metrics(actual_0, predictions_0)

Accuracy: 0.9899532916189301
Precision: 0.991207034372502
Recall: 0.9935897435897436


---
$\lambda = 1$

In [14]:
# Smoothing (lambda) = 1; metrics stored in row 1.
actual_1, predictions_1 = Predict_Naive_Bayes(D_test_bool, omega_test, *training_output, smoothing=1)
Metrics[1] = Compute_Metrics(actual_1, predictions_1)

Accuracy: 0.9907464528069093
Precision: 0.9938461538461538
Recall: 0.9921207264957265


---
$\lambda = 0.5$

In [15]:
# Smoothing (lambda) = 0.5; metrics stored in row 2.
actual_2, predictions_2 = Predict_Naive_Bayes(D_test_bool, omega_test, *training_output, smoothing=0.5)
Metrics[2] = Compute_Metrics(actual_2, predictions_2)

Accuracy: 0.9893363884727241
Precision: 0.9944959054906699
Recall: 0.9893162393162394


---
$\lambda = 0.1$

In [16]:
# Smoothing (lambda) = 0.1; metrics stored in row 3.
actual_3, predictions_3 = Predict_Naive_Bayes(D_test_bool, omega_test, *training_output, smoothing=0.1)
Metrics[3] = Compute_Metrics(actual_3, predictions_3)

Accuracy: 0.9805234863840663
Precision: 0.9896240398868077
Recall: 0.9807692307692307


---
$\lambda = 0.005$

In [17]:
# Smoothing (lambda) = 0.005; metrics stored in row 4.
actual_4, predictions_4 = Predict_Naive_Bayes(D_test_bool, omega_test, *training_output, smoothing=0.005)
Metrics[4] = Compute_Metrics(actual_4, predictions_4)

Accuracy: 0.9694192297523575
Precision: 0.97035963641154
Recall: 0.9837072649572649


In [18]:
# Report, for each metric, the smoothing value whose row of Metrics scored highest.
x_labels = np.array(['2', '1', '0.5', '0.1', '0.005'])  # lambda of rows 0-4, in order
best_rows = np.argmax(Metrics, axis=0)  # winning row per column: accuracy, precision, recall
print('Max Accuracy, lambda=' + x_labels[best_rows[0]])
print('Max Precision, lambda=' + x_labels[best_rows[1]])
print('Max Recall, lambda=' + x_labels[best_rows[2]])

Max Accuracy, lambda=1
Max Precision, lambda=0.5
Max Recall, lambda=2


In [19]:
def Mutual_Information(D, omega, num_words, reduce_words=False, min_occur=3, max_occur=200):
    """Pick the ``num_words`` highest-scoring vocabulary words.

    The statistics come from ``Train_Naive_Bayes(D, omega)`` (which also
    prints its summary), and every vocabulary word receives a
    mutual-information-style score.

    Parameters
    ----------
    D : sequence of sequence of str
        Documents as word lists.
    omega : sequence of str
        Label of each document.
    num_words : int
        Number of words to return; values <= 0 give an empty result.
    reduce_words : bool, optional
        When True, only words whose total count lies within
        ``[min_occur, max_occur]`` are candidates, which removes the
        infrequent and the frequent words.
    min_occur, max_occur : int, optional
        Bounds used by ``reduce_words`` (defaults 3 and 200).

    Returns
    -------
    numpy.ndarray of str
        The selected words in ascending score order (best word last). Fewer
        than ``num_words`` words are returned when not enough candidates
        exist.
    """
    prior_spam, prior_ham, words_spam, words_ham, n_spam, n_ham, vocab = Train_Naive_Bayes(D, omega)
    if num_words <= 0:
        # A slice of [-0:] would select the whole vocabulary instead of nothing.
        return vocab[:0]

    word_occur = words_spam + words_ham
    V = len(vocab)
    # NOTE(review): the word "prior" below divides by the vocabulary size V; a
    # share of documents containing the word would divide by the number of
    # documents (n_spam + n_ham). The score terms are also weighted by the
    # class-conditional likelihood rather than a joint probability. Confirm
    # the intended formula before relying on the ranking.
    prior_D_1 = word_occur / V # Total count of the word, scaled by V
    prior_D_0 = 1 - prior_D_1 # Complement of the value above
    smoothing = 1
    likelihood_spam = (words_spam + smoothing)/(n_spam + smoothing*V)
    likelihood_ham = (words_ham + smoothing)/(n_ham + smoothing*V)

    # Log Mutual Information formula
    log_spam_1 = likelihood_spam * (np.log(likelihood_spam) - np.log(prior_D_1) - np.log(prior_spam))
    log_spam_0 = (1 - likelihood_spam) * (np.log(1 - likelihood_spam) - np.log(prior_D_0) - np.log(prior_spam))
    log_ham_1 = likelihood_ham * (np.log(likelihood_ham) - np.log(prior_D_1) - np.log(prior_ham))
    log_ham_0 = (1 - likelihood_ham) * (np.log(1 - likelihood_ham) - np.log(prior_D_0) - np.log(prior_ham))
    MI = log_spam_1 + log_spam_0 + log_ham_1 + log_ham_0

    # Option to remove frequent and infrequent words. Excluded words are
    # removed from the candidate set rather than given a score of 0, so they
    # can never outrank a kept word whose score happens to be negative.
    if reduce_words:
        candidates = np.flatnonzero((word_occur >= min_occur) & (word_occur <= max_occur))
    else:
        candidates = np.arange(V)

    # Get the top words
    ranked = candidates[np.argsort(MI[candidates])]
    info_words = vocab[ranked[-num_words:]]
    return info_words

# Getting the top 200 words using Mutual Information

In [20]:
# Rank the vocabulary and keep the 200 best-scoring words.
# NOTE(review): the ranking is computed on the full corpus (D, omega), which
# includes the emails later evaluated as D_test - confirm this is intended,
# since the word selection then sees the test data.
vocab = Mutual_Information(D, omega, 200)
print('Top 200 informative words\n',vocab)

V: 98984
Prior Probability Spam: 0.6586642694727937
Prior Probability Ham: 0.3413357305272064
Top 200 informative words
 ['foxmail' 'first' 'send' 'week' 'zproxy' 'many' 'office' 'dmdx' 'contact'
 'file' 'cs' 'into' 'them' 'information' 'oct' 'back' 'two' 'over' 'visit'
 'dsl' 'way' 'nov' 'ra' 'its' 'fast' 'down' 'telesp' 'than' 'stock'
 'offer' 'very' 'board' 'used' 'make' 'these' 'version' 'work' 'devmail'
 'day' 'best' 'see' 'handyboard' 'here' 'should' 'problem' 'then' 'after'
 'anyone' 'x' 'could' 'uid' 'need' 'want' 'invoked' 'g' 'email'
 'university' 'jan' 'does' 'hi' 'good' 'other' 'help' 'time' 'they' 'news'
 'z' 'br' 'know' 'mar' 'body' 'jp' 'feb' 'apr' 'effects' 'product' 'just'
 'how' 'which' 'm' 'b' 'us' 'u' 'me' 'am' 'o' 'also' 'set' 'company' 'l'
 'p' 'n' 'thanks' 'what' 'r' 'up' 'only' 's' 'new' 'was' 'e' 'like' 't'
 'some' 'c' 'about' 'v' 'when' 'been' 'please' 'get' 'edua' 'campaign'
 'sender' 'allowed' 'would' 'so' 'now' 'opt' 'more' 'there' 'apache' 'use'
 'localhos

In [21]:
# Recount the training statistics against the 200 selected words only.
training_output_200 = Train_Naive_Bayes(D_train, omega_train, vocab=vocab)

V: 200
Prior Probability Spam: 0.6581303116147309
Prior Probability Ham: 0.3418696883852691


In [22]:
# The boolean list (one mask per test email, aligned with the reduced
# vocabulary) is used to speed up prediction when changing lambda smoothing
D_test_bool_200 = Test_Vocab(D_test, training_output_200[-1])

In [23]:
# Evaluate the reduced-vocabulary model with smoothing (lambda) = 0.5.
actual_200, predictions_200 = Predict_Naive_Bayes(D_test_bool_200, omega_test, *training_output_200, smoothing=0.5)
Metrics_200 = Compute_Metrics(actual_200, predictions_200)

Accuracy: 0.859257953644135
Precision: 0.9128241065171688
Recall: 0.8697916666666666


# Removing frequent and infrequent words

In [24]:
# Same selection, but with the infrequent and frequent words excluded
# (reduce_words=True). This rebinds `vocab`, replacing the earlier selection.
# NOTE(review): the ranking again uses the full corpus (D, omega), including
# the emails in D_test - confirm this is intended.
vocab = Mutual_Information(D, omega, 200, reduce_words=True)
print('Top 200 informative words\n',vocab)

V: 98984
Prior Probability Spam: 0.6586642694727937
Prior Probability Ham: 0.3413357305272064
Top 200 informative words
 ['corresponding' 'tables' 'unclaimed' 'vs' 'chris' 'bulletins' 'checking'
 'rules' 'premium' 'evaluation' 'chuck' 'linear' 'porn' 'supported'
 'gains' 'thread' 'accepted' 'procedure' 'ny' 'becomes' 'resolution'
 'ericson' 'advisory' 'necessarily' 'lomas' 'corporations' 'workshop'
 'sends' 'ject' 'ext' 'ballot' 'speedy' 'comp' 'counter' 'tatoosh'
 'intervals' 'measured' 'frequency' 'malicious' 'chngmeds' 'baggins'
 'winner' 'infrared' 'theoretical' 'wendy' 'encoders' 'receiver' 'convert'
 'hu' 'wrongfully' 'caused' 'que' 'ay' 'sequence' 'maya' 'mellon' 'tom'
 'hopefully' 'variables' 'od' 'essentially' 'vulnerabilities' 'kb'
 'proposed' 'indigo' 'overhead' 'frontpage' 'dwarves' 'nick' 'presents'
 'slightly' 'scale' 'electrical' 'banking' 'jcf' 'dhcp' 'occurs' 'weeek'
 'finphame' 'univers' 'instance' 'fails' 'reasonable' 'nut' 'pres'
 'brasiltelecom' 'laboratory' 'pio' 

In [25]:
# Recount the training statistics against the new selection; this overwrites
# the `training_output_200` produced for the first selection.
training_output_200 = Train_Naive_Bayes(D_train, omega_train, vocab=vocab)

V: 200
Prior Probability Spam: 0.6581303116147309
Prior Probability Ham: 0.3418696883852691


In [26]:
# The boolean list (one mask per test email, aligned with the reduced
# vocabulary) is used to speed up prediction when changing lambda smoothing
D_test_bool_200 = Test_Vocab(D_test, training_output_200[-1])

In [27]:
# Evaluate the second reduced-vocabulary model with smoothing (lambda) = 0.5;
# the result names are reused, so the earlier values are overwritten.
actual_200, predictions_200 = Predict_Naive_Bayes(D_test_bool_200, omega_test, *training_output_200, smoothing=0.5)
Metrics_200 = Compute_Metrics(actual_200, predictions_200)

Accuracy: 0.8817308539702123
Precision: 0.8560834298957126
Recall: 0.9866452991452992


In [28]:
def Save_File(file_name, variable):
    """Write ``variable`` to ``file_name`` as one CSV row.

    Every element of ``variable`` becomes one field of that single row;
    elements that are not strings are written using their ``str()`` form.
    An existing file is overwritten. Returns None.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(file_name, 'w', newline='') as handle:
        csv.writer(handle).writerow(variable)

In [29]:
# Persist the split so it can be reused outside this notebook. Each file
# holds a single CSV row; for D_train/D_test every field is the str() form of
# one email's word list.
Save_File('D_train.csv', D_train)
Save_File('omega_train.csv', omega_train)
Save_File('D_test.csv', D_test)
Save_File('omega_test.csv', omega_test)