Import necessary packages

In [2]:
import numpy as np
import sklearn as sl
import pandas as pd
import matplotlib.pyplot as plt
import email
from collections import Counter as count
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

Google colab dependency

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import dataset

In [3]:
with open("labels",'r') as f:
    # Add files with label to
    files = pd.DataFrame([line.strip().replace("../","").split(' ') for line in f.readlines()], columns=['label','file'])
    files["label"] = np.where(files["label"] == "ham", 0, 1)

with open("stop_words.txt", 'r') as f:
    stop_words = [line.strip() for line in f.readlines()]
# stop_words
files.head()

Unnamed: 0,label,file
0,0,data/000/000
1,1,data/000/001
2,1,data/000/002
3,0,data/000/003
4,1,data/000/004


Create function to clean the email message. Remove all non alphabet characters.

In [None]:
CLEANR = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
LINKCLEANR = re.compile(r'https?://\S+')
HTMLCLEANR = re.compile(r'<[^>]*>|&([a-z0-9]+);')

def clean_email(mail):
    if len(mail) == 0:
        return ""
    # Ignore case
    mail = mail.lower()
    # Remove tags
    mail = re.sub(HTMLCLEANR,"",mail)
    # Split to words
    words = mail.split()

    for idx, word in enumerate(words):
        # Skip cleaning if all characters are in alphabet
        # if any(True for letter in word if letter in CLEANR else False):
        if not word.isalpha():
            # Remove links
            word = re.sub(LINKCLEANR,"",word)
            # Remove special characters and numbers
            word = ''.join(letter for letter in word if letter in CLEANR)

        # Replace stop words with blank
        if word in stop_words:
            words[idx] = ''
        else:
            words[idx] = word
    # Rejoin message
    cleaned_message = ' '.join([word for word in words if word != ''])
    return cleaned_message

On some files, replies are appended after the body separated by a timestamp. This function splits the body using the timestamps and reads it as an email to remove the headers. All email messages are then appended into a single message.

In [None]:
TIMESTAMP = re.compile(r'\d{1,2}-[a-z]{3}-\d{4}.*\b')

def split_replies(file):
    # Skip function if the email does not contain replies
    if re.findall(TIMESTAMP, file.lower()) == []:
        return file

    body = ''
    # Split by timestamp
    reply = re.split(TIMESTAMP, file.lower())
    for string in reply:
        # Read reply
        raw = email.message_from_string(string.lstrip())
        # Add reply to the body
        body += read_email(raw) + ' '
    # Return all replies
    return body

Reads the email.message data provided. Only decodes text and decodes non utf-8 format into the ISO 8859-1 format.
For multipart emails, it reads all parts and ignores non-text messages.

In [None]:
def read_email(raw):
    body = ""
    # If email is not multipart
    if not raw.is_multipart():
        # Only extract text
        if raw.get_content_maintype() == 'text':
            body = raw.get_payload(None,True)
            try:
                # Try decoding if format results in bytes array
                body = body.decode('ISO-8859-1')
            except AttributeError:
                pass
        return body
    # If email is multipart, read all parts
    for part in raw.walk():
        # Ignore non text parts
        if part == None or part.get_content_maintype() != 'text':
            continue

        message = part.get_payload(None, True)
        # Skip blank messages
        if message == None or message == '':
            continue
        try:
            # Try decoding if format results in bytes array
            message = message.decode('ISO-8859-1')
        finally:
            body += message + ' '
    return body

Reads the email file and passes the resulting body to the be split if possible.

In [None]:
def read_email_file(file):
    body = ""
    try:
        # Get email.message and pass to read_email
        raw = email.message_from_file(file)
        body = read_email(raw)
        # Split replies
        body = split_replies(body)

    except UnicodeDecodeError:
        print("bruh")
    return body

Reads all files in the dataframe and cleans them. The cleaned messages are then saved to csv to avoid rerunning the cleaning process during testing.

In [None]:
def clean_and_tokenize(files):
    contents = []
    for loc in files['file']:
        print("Cleaning Email - ", loc)
        # Read email file
        with open(f'{loc}', 'r', encoding = 'ISO-8859-1') as f:
            read = read_email_file(f)
            try:
                # Final pass
                read = read.decode('ISO-8859-1')
            except:
                pass
            finally:
                # Clean email
                content = clean_email(read)
            contents.append(content)
    files['email_content'] = contents
    # Save to csv to avoid rerunning
    files.to_csv('data.csv', encoding='utf-8')
    print('Cleaning data complete.')

Builds the vocabulary. Cleaned messages are split and all instances of each word are counted before getting the most common ones as part of the vocabulary.

In [4]:
def build_vocabulary(emails, vocabulary=1000):
    all_words = []
    for content in emails:
        all_words.extend(str(content).split())
    counts = count(all_words)
    vocab = counts.most_common(vocabulary)
    # print(vocab)
    return [pair[0] for pair in vocab]

Generates a feature matrix. A 1 in any cell means that message(row) contains the word(column) and 0 otherwise.

In [5]:
def generate_feature_matrix(emails, vocabulary):
    feature_matrix = np.zeros((len(emails), len(vocabulary)))
    for i, content in enumerate(emails):
        words = str(content).split()
        # print(words)
        for word in words:
            if word in vocabulary:
                feature_matrix[i, vocabulary.index(word)] = 1
    # print(feature_matrix)
    return feature_matrix

Computes priors

In [6]:
def compute_priors(label):
    spam = sum(label)
    return sum(label) / len(label), len(label)-spam / len(label)

Computes the chances of the word being spam or ham with smoothing

In [7]:
def compute_likelihoods(x, y, vocabulary, smoothing=1.0):
    word_count= len(vocabulary)

    # Count occurrences for spam and ham
    spam = np.zeros(word_count)
    ham = np.zeros(word_count)

    for i in range(len(y)):
        if y[i] == 1:
            spam += x[i]
        else:
            ham += x[i]

    # Total counts of words in spam and ham
    total_spam = np.sum(spam)
    total_ham = np.sum(ham)

    # Compute likelihoods with Laplace smoothing
    spam_chance = (spam + smoothing) / (total_spam + smoothing * word_count)
    ham_chance = (ham + smoothing) / (total_ham + smoothing * word_count)

    return spam_chance, ham_chance

Computes the log probabilities of the email being spam or ham

In [8]:
def classify_email(email, spam_chance, ham_chance, spam_priori, ham_priori, underflow = 1e-10):
    log_spam = np.log(spam_priori) + np.sum(email * np.log(spam_chance))
    log_ham = np.log(ham_priori + underflow) + np.sum(email * np.log(ham_chance + underflow))

    # Classify as spam if log_prob_spam is greater, otherwise ham
    return 1 if log_spam > log_ham else 0

Classifies test emails based on the training data

In [9]:
def classify_test(x, spam_chance, ham_chance, spam_priori, ham_priori):
    classes = []
    for email in x:
        email_class = classify_email(email, spam_chance, ham_chance, spam_priori, ham_priori)
        classes.append(email_class)
    return classes

Implement Naive Bayes Spam Filter

In [10]:
def naive_bayes_spam_filter():
    # Clean data
    # clean_and_tokenize(files)
    
    # Load cleaned dataset
    data = pd.read_csv("data.csv",index_col=0)

    # Split label and email contents
    label = data.loc[:, ['label']]
    content = data.loc[:, ['email_content']]

    # Split training data
    train_label = label.iloc[:21300,:].to_numpy().flatten()
    test_label = label.iloc[21300:,:].to_numpy().flatten()
    train_content = content.iloc[:21300,:].to_numpy().flatten()
    test_content = content.iloc[21300:,:].to_numpy().flatten()

    # Build vocabulary from the cleaned training emails
    vocab = build_vocabulary(train_content)

    # Create feature matrices
    train_data = generate_feature_matrix(train_content, vocab)
    test_data = generate_feature_matrix(test_content, vocab)

    # Compute priors
    spam_priori, ham_priori = compute_priors(train_label)

    # Compute likelihoods
    spam_chance, ham_chance = compute_likelihoods(train_data, train_label, vocab)

    # Classify the test set
    y = classify_test(test_data, spam_chance, ham_chance, spam_priori, ham_priori)

    # Evaluate performance
    accuracy = accuracy_score(test_label, y)
    precision = precision_score(test_label, y)
    recall = recall_score(test_label, y)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

naive_bayes_spam_filter()

Accuracy: 0.6042246701367873
Precision: 0.9914456800684346
Recall: 0.4163448585541087
