In [22]:
import pandas as pd
import numpy as np
from os import makedirs, path, remove, rename, rmdir
from tarfile import open as open_tar
from shutil import rmtree
from urllib import request, parse
from glob import glob
from os import path
from re import sub
from email import message_from_file
from glob import glob
from sklearn.model_selection import StratifiedShuffleSplit
from collections import defaultdict
from functools import partial
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score)
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
import gc

### Downloading the emails to train the model

In [23]:
def download_corpus(dataset_dir: str = 'data'):
    base_url = 'https://spamassassin.apache.org'
    corpus_path = 'old/publiccorpus'
    files = {
        '20021010_easy_ham.tar.bz2': 'ham',
        '20021010_hard_ham.tar.bz2': 'ham',
        '20021010_spam.tar.bz2': 'spam',
        '20030228_easy_ham.tar.bz2': 'ham',
        '20030228_easy_ham_2.tar.bz2': 'ham',
        '20030228_hard_ham.tar.bz2': 'ham',
        '20030228_spam.tar.bz2': 'spam',
        '20030228_spam_2.tar.bz2': 'spam',
        '20050311_spam_2.tar.bz2': 'spam' }

    #creates the folders: downloads, ham and spam
    downloads_dir = path.join(dataset_dir, 'downloads')
    ham_dir = path.join(dataset_dir, 'ham')
    spam_dir = path.join(dataset_dir, 'spam')

    makedirs(downloads_dir, exist_ok=True)
    makedirs(ham_dir, exist_ok=True)
    makedirs(spam_dir, exist_ok=True)


    for file, spam_or_ham in files.items():
        # download files from URL of each specific .bz2 file
        url = parse.urljoin(base_url, f'{corpus_path}/{file}')
        tar_filename = path.join(downloads_dir, file)
        request.urlretrieve(url, tar_filename)

        #list e-mails in the compressed .bz2 file
        emails = []
        with open_tar(tar_filename) as tar:
            tar.extractall(path=downloads_dir)
            for tarinfo in tar:
                if len(tarinfo.name.split('/')) > 1:
                    emails.append(tarinfo.name)

        # move e-mails to ham or spam directory
        for email in emails:
            directory, filename = email.split('/')
            directory = path.join(downloads_dir, directory)

            if not path.exists(path.join(dataset_dir, spam_or_ham, filename)):
                rename(path.join(directory, filename),
                   path.join(dataset_dir, spam_or_ham, filename))

        rmtree(directory)

download_corpus()

### How many e-mails are classified in our dataset as either Spam or not Spam?

In [24]:
#How many e-mails are classified in our dataset as either Spam or not Spam?
ham_dir = path.join('data', 'ham')
spam_dir = path.join('data', 'spam')

print('Number of Non-Spam E-mails:', len(glob(f'{ham_dir}/*')))
print('\nNumber of Spam E-mails:', len(glob(f'{spam_dir}/*')))

Number of Non-Spam E-mails: 6952

Number of Spam E-mails: 2399


### Classifier based on Naive Bayes

In [25]:
# Function to read emails from files
def read_emails(directory):
    emails = []
    for file in listdir(directory):
        with open(path.join(directory, file), 'r', encoding='latin1') as f:
            try:
                email = message_from_file(f)
                body = ''
                if email.is_multipart():
                    for part in email.walk():
                        if part.get_content_type() == 'text/plain':
                            body += part.get_payload()
                else:
                    body = email.get_payload()
                emails.append(body)
            except Exception as e:
                print(f"Error reading file {file}: {e}")
    return emails

# Directories
dataset_dir = 'data'
download_corpus(dataset_dir)

# Read emails

ham_dir = path.join('data', 'ham')
spam_dir = path.join('data', 'spam')

ham_emails = read_emails(ham_dir)
spam_emails = read_emails(spam_dir)

# Create labels
ham_labels = np.zeros(len(ham_emails))
spam_labels = np.ones(len(spam_emails))

# Concatenate ham and spam emails
all_emails = ham_emails + spam_emails
all_labels = np.concatenate([ham_labels, spam_labels])

# Vectorize emails
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(all_emails)

# Train-test split
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(stratified_split.split(X, all_labels))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = all_labels[train_index], all_labels[test_index]

# Train Naive Bayes classifier
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

# Predictions
y_pred = naive_bayes.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


NameError: name 'listdir' is not defined

In [None]:
from email import message_from_string
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Given email
email_text = """
Subject: Get Rich Quick!

Dear Friend,

Congratulations! You've been selected to participate in an exclusive opportunity to make thousands of dollars from the comfort of your own home. Our revolutionary system guarantees quick and easy cash with minimal effort.

No more struggling to pay bills or worrying about financial security. With our proven method, you can start earning massive amounts of money in no time.

Here's what some of our satisfied customers have to say:
- "I was skeptical at first, but I'm now living my dream life thanks to this incredible system!" - John S.
- "I never thought making money online could be this simple. It's changed my life!" - Sarah L.

Don't miss out on this limited-time offer. Act now to secure your spot and start enjoying a life of financial freedom.

Click the link below to get started:
www.getrichquick.com

Remember, this opportunity is exclusive and won't last long. Take control of your financial future today!

Best regards,
The Get Rich Quick Team
"""

vectorizer = CountVectorizer() # Vectorize the email
X_email = vectorizer.fit_transform([email_text])

y_email_pred = naive_bayes.predict(X_email) # Predict using the trained Naive Bayes classifier

if y_email_pred[0] == 0:
    print("The email is classified as HAM (not spam).")
else:
    print("The email is classified as SPAM.")
