In [2]:
# Convert all files into .txt and name them with their 
# correct folder name since they are all just numbers.

import os

# Set A
#paths = ["ProjectTwo_Data/easy_ham",
#         "ProjectTwo_Data/hard_ham",
#         "ProjectTwo_Data/spam"
#         ]

# Set B
#paths = ["ProjectTwo_Data_BIG/easy_ham",
#         "ProjectTwo_Data_BIG/hard_ham",
#         "ProjectTwo_Data_BIG/spam"
#         ]

# Set C
paths = ["ProjectTwo_Data_BIG/easy_ham",
         "ProjectTwo_Data_BIG/easy_ham_2",
         "ProjectTwo_Data_BIG/hard_ham",
         "ProjectTwo_Data_BIG/spam",
         "ProjectTwo_Data_BIG/spam_2"
         ]

for path in paths:
    for filename in os.listdir(path):
        src = os.path.join(path, filename)
        dst = src + ".txt"
        os.rename(src, dst)
        
for path in paths:
    for filename in os.listdir(path):
        if filename.endswith(".txt"):
            parts = filename.split('.')
            new_name = parts[0] + ".txt"
            src = os.path.join(path, filename)
            dst = os.path.join(path, new_name)
            os.rename(src, dst)

for path in paths:
    folder_name = os.path.basename(path)  # This will get 'easy_ham', 'hard_ham', or 'spam'

    for filename in os.listdir(path):
        new_name = f"{folder_name}_{filename}"
        src = os.path.join(path, filename)
        dst = os.path.join(path, new_name)

        os.rename(src, dst)

In [3]:
# Move every 4th file into a "testing" folder
import shutil

for path in paths:
    test_path = os.path.join(path, "testing")
    if not os.path.exists(test_path):
        os.makedirs(test_path)


for path in paths:
    files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
    for index, filename in enumerate(files, start=1):
        if index % 4 == 0:
            src = os.path.join(path, filename)
            dst = os.path.join(path, "testing", filename)
            shutil.move(src, dst)

In [4]:
import os
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re

nltk.download('wordnet')

# Initialize lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Define a function to clean and preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    tokens = text.split()

    # Lemmatization and Stemming
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cfernandezmq\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Lists to store subject lines and their labels
subject_lines = []
labels = []

count = 0

for path in paths:
    for filename in os.listdir(path):
        full_path = os.path.join(path, filename)
        if os.path.isfile(full_path):
            with open(full_path, 'r', encoding='utf-8', errors='ignore') as file:
                lines = file.readlines()
                for line in lines:
                    count += 1
                    if line.startswith("Subject:"):
                        # Apply preprocessing here
                        cleaned_subject = preprocess_text(line[len("Subject:"):].strip())
                        subject_lines.append(cleaned_subject)
                        labels.append('spam' if "spam" in path else 'ham')

print(count)
# Vectorize the subject lines
vectorizer = CountVectorizer(token_pattern=r'\b[a-zA-Z]+\b', stop_words='english')
X = vectorizer.fit_transform(subject_lines)

# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB(alpha=1)  # Setting smoothing parameter alpha = 1
clf.fit(X, labels)

# To predict the probability P(spam|X=a) for a new subject line
subject_line_a = [
    preprocess_text("free rate money adv mortgag"),
    preprocess_text("free rate money adv ilug"),
    preprocess_text("free rate money razorus ilug"),
    preprocess_text("free rate wa razorus ilug"),
    preprocess_text("free spambay wa razorus ilug")
]

for subject in subject_line_a:
    X_a = vectorizer.transform([subject])  # Note that the input should be in list form
    probabilities = clf.predict_proba(X_a)
    p_spam_given_a = probabilities[0][clf.classes_ == 'spam'][0]

    print(f"P(spam|X={subject}): {p_spam_given_a}")

391310
P(spam|X=free rate money adv mortgag): 0.999999386867987
P(spam|X=free rate money adv ilug): 0.9997380074183535
P(spam|X=free rate money razoru ilug): 0.9803554328781104
P(spam|X=free rate wa razoru ilug): 0.1751626651956688
P(spam|X=free spambay wa razoru ilug): 0.0007402432146838626


In [101]:
import numpy as np

# Extract log probabilities of features given a class
log_prob_wk_spam = clf.feature_log_prob_[clf.classes_ == 'spam'][0]
log_prob_wk_ham = clf.feature_log_prob_[clf.classes_ == 'ham'][0]

# Convert log probabilities back to probabilities
prob_wk_spam = np.exp(log_prob_wk_spam)
prob_wk_ham = np.exp(log_prob_wk_ham)

# Get top 5 indices for spam and ham
top5_spam_indices = prob_wk_spam.argsort()[-5:][::-1]
top5_ham_indices = prob_wk_ham.argsort()[-5:][::-1]


In [102]:

# Output top 5 words with highest probabilities for spam and ham
print("Top 5 words with highest P(spam|wk):")
for i in top5_spam_indices:
    print(vectorizer.get_feature_names_out()[i])

Top 5 words with highest P(spam|wk):
free
adv
rate
money
mortgag


In [103]:
print("\nTop 5 words with highest P(ham|wk):")
for i in top5_ham_indices:
    print(vectorizer.get_feature_names_out()[i])


Top 5 words with highest P(ham|wk):
ilug
razorus
satalk
wa
spambay


In [104]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# List of paths for test data
paths_test = [os.path.join(p, 'testing') for p in paths]

# Prepare the test data
test_ham_subject_lines = []
test_spam_subject_lines = []

# Assuming paths_test is a list of testing paths
for path in paths_test:
    for filename in os.listdir(path):
        full_path = os.path.join(path, filename)
        if os.path.isfile(full_path):
            with open(full_path, 'r', encoding='utf-8', errors='ignore') as file:
                lines = file.readlines()
                for line in lines:
                    if line.startswith("Subject:"):
                        if "spam" in path:
                            test_spam_subject_lines.append(line[len("Subject:"):].strip())
                        else:
                            test_ham_subject_lines.append(line[len("Subject:"):].strip())

# Create a combined list and true labels
test_subject_lines = test_ham_subject_lines + test_spam_subject_lines
true_labels = ['ham'] * len(test_ham_subject_lines) + ['spam'] * len(test_spam_subject_lines)

# Vectorize the test data
X_test = vectorizer.transform(test_subject_lines)

# Predict using the trained classifier
predicted_labels = clf.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, pos_label='spam')
recall = recall_score(true_labels, predicted_labels, pos_label='spam')

# Output results
print(f"Accuracy Rate: {accuracy}")
print(f"Precision Rate: {precision}")
print(f"Recall Rate: {recall}")


Accuracy Rate: 0.844205193160228
Precision Rate: 0.7970297029702971
Recall Rate: 0.6625514403292181
