In [21]:
import glob
import os
import math
import re
from collections import defaultdict


In [22]:
def tokenize(text):
    # keep lowercase alphabetic words
    return re.findall(r'\b[a-z]+\b', text.lower())


In [23]:
def load_data(directory):
    x = []
    y = []
    for f in glob.glob(os.path.join(directory, "HAM.*.txt")):
        with open(f, 'r', encoding='latin1') as file:
            x.append(file.read())
            y.append(0)
    for f in glob.glob(os.path.join(directory, "SPAM.*.txt")):
        with open(f, 'r', encoding='latin1') as file:
            x.append(file.read())
            y.append(1)
    return x, y


In [24]:
def nb_train(x, y):
    model = {
        'ham_count': 0,
        'spam_count': 0,
        'ham_fd': defaultdict(int),
        'spam_fd': defaultdict(int)
    }

    for doc, label in zip(x, y):
        words = tokenize(doc)
        if label == 0:
            model['ham_count'] += 1
            for word in words:
                model['ham_fd'][word] += 1
        else:
            model['spam_count'] += 1
            for word in words:
                model['spam_fd'][word] += 1

    return model


In [25]:
def nb_test(docs, model, use_log=False, smoothing=False):
    predictions = []
    all_words = set(model['ham_fd'].keys()) | set(model['spam_fd'].keys())
    vocab_size = len(all_words)
    ham_total = sum(model['ham_fd'].values())
    spam_total = sum(model['spam_fd'].values())

    for doc in docs:
        words = tokenize(doc)

        if use_log:
            ham_prob = math.log(model['ham_count'] / (model['ham_count'] + model['spam_count']))
            spam_prob = math.log(model['spam_count'] / (model['ham_count'] + model['spam_count']))
        else:
            ham_prob = model['ham_count'] / (model['ham_count'] + model['spam_count'])
            spam_prob = model['spam_count'] / (model['ham_count'] + model['spam_count'])

        for word in words:
            ham_freq = model['ham_fd'][word]
            spam_freq = model['spam_fd'][word]

            if smoothing:
                ham_word_prob = (ham_freq + 1) / (ham_total + vocab_size)
                spam_word_prob = (spam_freq + 1) / (spam_total + vocab_size)
            else:
                ham_word_prob = ham_freq / ham_total if ham_freq > 0 else 1e-10
                spam_word_prob = spam_freq / spam_total if spam_freq > 0 else 1e-10

            if use_log:
                ham_prob += math.log(ham_word_prob)
                spam_prob += math.log(spam_word_prob)
            else:
                ham_prob *= ham_word_prob
                spam_prob *= spam_word_prob

        predictions.append(1 if spam_prob > ham_prob else 0)

    return predictions


In [26]:
def f_score(y_true, y_pred):
    tp = sum((yt == 1 and yp == 1) for yt, yp in zip(y_true, y_pred))
    fp = sum((yt == 0 and yp == 1) for yt, yp in zip(y_true, y_pred))
    fn = sum((yt == 1 and yp == 0) for yt, yp in zip(y_true, y_pred))

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    if precision + recall == 0:
        return 0.0

    return 2 * precision * recall / (precision + recall)


In [None]:
training_dir = "./project2/SPAM_training_set"
test_dir = "./project2/SPAM_test_set"

x_train, y_train = load_data(training_dir)
x_test, y_test = load_data(test_dir)

print("Loaded", len(x_train), "training emails.")
print("Loaded", len(x_test), "testing emails.")


Loaded 18457 training emails.
Loaded 800 testing emails.


In [28]:
model = nb_train(x_train, y_train)
print("Model trained successfully!")


Model trained successfully!


In [29]:
configs = [
    (False, False),  # No log, No smoothing
    (False, True),   # No log, With smoothing
    (True, False),   # With log, No smoothing
    (True, True)     # With log, With smoothing
]

results = {}

for use_log, smoothing in configs:
    y_pred = nb_test(x_test, model, use_log=use_log, smoothing=smoothing)
    score = f_score(y_test, y_pred)
    results[(use_log, smoothing)] = score


In [30]:
for config, score in results.items():
    log_str = "Log" if config[0] else "No Log"
    smoothing_str = "Smoothing" if config[1] else "No Smoothing"
    print(f"{log_str} + {smoothing_str}: F1-Score = {score:.4f}")


No Log + No Smoothing: F1-Score = 0.6318
No Log + Smoothing: F1-Score = 0.6532
Log + No Smoothing: F1-Score = 0.9780
Log + Smoothing: F1-Score = 0.9699
