In [11]:
import os
import email_read_util
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import joblib


In [3]:
# Directory holding the individual email files (looked up as 'inmail.<n>').
DATA_DIR = '../datasets/trec07p/data/'
# Index file with one whitespace-separated "<label> <path>" pair per line.
LABELS_FILE = '../datasets/trec07p/full/index'
# Fraction of the samples used for training; the rest is held out for testing.
TRAINING_SET_RATIO = 0.7

In [4]:
# Map each email file name (the last path component of the index entry,
# e.g. 'inmail.1') to a binary label: 1 for ham, 0 for everything else.
labels = {}
with open(LABELS_FILE) as f:
    for line in f:
        # split() with no arguments already ignores leading/trailing
        # whitespace, so no separate strip() is required.
        fields = line.split()
        # Skip blank lines (e.g. a trailing empty line at the end of the
        # index) instead of failing on the tuple unpacking below.
        if not fields:
            continue
        # A non-empty line that is not exactly "<label> <path>" still raises
        # ValueError here, so a malformed index is not silently accepted.
        label, key = fields
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

In [5]:
def read_email_files(label_map=None, data_dir=None):
    """Load the extracted text and the label of every labelled email.

    Emails are looked up as ``inmail.1`` .. ``inmail.N`` inside the data
    directory, where ``N`` is the number of entries in the label mapping.
    Files that are missing on disk are reported on stdout and skipped.

    Args:
        label_map: Mapping from email file name (e.g. ``'inmail.1'``) to its
            label. Defaults to the module-level ``labels`` dict.
        data_dir: Directory containing the email files. Defaults to the
            module-level ``DATA_DIR``.

    Returns:
        A ``(X, y)`` tuple of parallel lists: ``X`` holds the values returned
        by ``email_read_util.extract_email_text`` and ``y`` the matching
        labels taken from the mapping.

    Raises:
        KeyError: If an email file exists on disk but has no entry in the
            label mapping.
    """
    # Fall back to the notebook-level state only when the caller does not
    # supply its own inputs, so existing ``read_email_files()`` calls keep
    # working while the function can also be used without hidden globals.
    if label_map is None:
        label_map = labels
    if data_dir is None:
        data_dir = DATA_DIR

    X, y = [], []
    # File names are 1-based: inmail.1, inmail.2, ...
    for number in range(1, len(label_map) + 1):
        filename = f'inmail.{number}'
        file_path = os.path.join(data_dir, filename)

        # Check if the file exists
        if not os.path.isfile(file_path):
            print(f"File {file_path} does not exist. Skipping.")
            continue

        X.append(email_read_util.extract_email_text(file_path))
        y.append(label_map[filename])
    return X, y

In [6]:
# Load the extracted text of every available email (X) and its 0/1 label (y);
# files missing from the data directory are reported on stdout and skipped.
X, y = read_email_files()

File ../datasets/trec07p/data/inmail.128 does not exist. Skipping.


In [12]:
# Split the emails, their labels and their positional indices with the same
# shuffle, so every held-out sample can be traced back to its position in X / y.
sample_indices = range(len(y))
(X_train, X_test,
 y_train, y_test,
 idx_train, idx_test) = train_test_split(
    X, y, sample_indices,
    train_size=TRAINING_SET_RATIO,
    random_state=2,
)

# Learn the vocabulary from the training emails only, then reuse it to encode
# the test emails as token-count vectors.
vectorizer = CountVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier (fit() returns the estimator
# itself) and predict the labels of the held-out emails.
mnb = MultinomialNB().fit(X_train_vector, y_train)
y_pred = mnb.predict(X_test_vector)

# Report per-class metrics (label 0 -> 'Spam', label 1 -> 'Ham') and accuracy
class_names = ['Spam', 'Ham']
report = classification_report(y_test, y_pred, target_names=class_names)
accuracy = accuracy_score(y_test, y_pred)
print(report)
print('Classification accuracy {:.1%}'.format(accuracy))

# Persist the fitted classifier together with its vectorizer; both are needed
# to score new emails later.
joblib.dump(mnb, './spam_classifier_model.pkl')
joblib.dump(vectorizer, './vectorizer.pkl')

              precision    recall  f1-score   support

        Spam       0.99      0.94      0.96     15053
         Ham       0.89      0.98      0.93      7573

    accuracy                           0.95     22626
   macro avg       0.94      0.96      0.95     22626
weighted avg       0.96      0.95      0.95     22626

Classification accuracy 95.3%


['./vectorizer.pkl']