In [1]:
import pandas as pd
import os

In [2]:
def read_spam():
    category = 'spam'
    directory = r"D:\Email_Classification\enron1\enron1\spam"
    return read_category(category , directory)


def read_ham():
    category = 'ham'
    directory = r"D:\Email_Classification\enron1\enron1\ham"
    return read_category(category , directory)


def read_category(category,directory):
    emails = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory,filename),'r') as fp:
            try:
                content = fp.read()
                emails.append({'name' : filename , 'content':content, 'category' : category})
            except:
                print(f'skipped {filename}')
    return emails

ham = read_ham()
spam = read_spam()

ham_df = pd.DataFrame.from_records(ham)
spam_df = pd.DataFrame.from_records(spam)

df = pd.concat([ham_df, spam_df], ignore_index=True)


skipped 2248.2004-09-23.GP.spam.txt
skipped 2526.2004-10-17.GP.spam.txt
skipped 2698.2004-10-31.GP.spam.txt
skipped 4566.2005-05-24.GP.spam.txt


In [3]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DEAVANATHAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DEAVANATHAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def preprocessor(e):
    e = re.sub(r'<.*?>','',e)
    e = re.sub(r'[^a-zA-Z\s]','',e)
    e = e.lower()
    words = word_tokenize(e)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    return ' '.join(words)

df['clean_content'] = df['content'].apply(preprocessor)

In [5]:
df.head()

Unnamed: 0,name,content,category,clean_content
0,0001.1999-12-10.farmer.ham.txt,Subject: christmas tree farm pictures\n,ham,subject christmas tree farm pictures
1,0002.1999-12-13.farmer.ham.txt,"Subject: vastar resources , inc .\ngary , prod...",ham,subject vastar resources inc gary production h...
2,0003.1999-12-14.farmer.ham.txt,Subject: calpine daily gas nomination\n- calpi...,ham,subject calpine daily gas nomination calpine d...
3,0004.1999-12-14.farmer.ham.txt,Subject: re : issue\nfyi - see note below - al...,ham,subject issue fyi see note already done stella...
4,0005.1999-12-14.farmer.ham.txt,Subject: meter 7268 nov allocation\nfyi .\n- -...,ham,subject meter nov allocation fyi forwarded lau...


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score , precision_score,recall_score,f1_score

In [8]:
vectorizer = TfidfVectorizer(max_features = 3000)

X = vectorizer.fit_transform(df['clean_content']).toarray()

y = df['category'].apply(lambda x:1 if x == 'spam' else 0)

X_train , X_test , y_train , y_test  = train_test_split(X, y ,test_size = 0.2,random_state=42)

nb_model = MultinomialNB()

nb_model.fit(X_train , y_train)

y_pred = nb_model.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test , y_pred)
f1 =f1_score(y_test,y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Accuracy: 0.95
Precision: 0.89
Recall: 0.94
F1-Score: 0.92
