# Comparing email spam detection models using Naive Bayes versus SVM

In [13]:
import pandas as pd
from typing import Optional
import numpy as np
import re
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Improvements:
1) Improve NLP preprocessing with stemming and/or lemmatization.

In [2]:
def prep_sw() -> set:
    """
    Build the stopword set used when filtering email tokens.

    Starts from NLTK's English stopword list and adds a handful of extra
    tokens: the bare apostrophe plus 'ect', 'hou', 'enron' and 'subject'.
    """

    extra_tokens = {"'", 'ect', 'hou', 'enron', 'subject'}

    return set(stopwords.words('english')) | extra_tokens

In [3]:
def prep_df(path: str = '../datasets/spam_ham_dataset.csv') -> pd.DataFrame:
    """
    Function to prepare working dataframe.

    Reads the CSV at `path` and removes the 'label' and 'Unnamed: 0' columns.
    Every other column is returned untouched.

    Args:
        path: Location of the dataset CSV. Defaults to the location this
            notebook has always read from, so existing `prep_df()` calls
            behave exactly as before.

    Returns:
        The loaded dataframe without the two dropped columns.

    Raises:
        KeyError: If 'label' or 'Unnamed: 0' is not present in the file.
    """

    df = pd.read_csv(path)

    # Return a new frame rather than dropping in place; the freshly read
    # frame is not shared with anyone, so the result is the same.
    return df.drop(columns=['label', 'Unnamed: 0'])

In [4]:
def proc_df(df) -> None:
    """
    Function to process working dataframe.

    Modifies df inplace:
      * adds a 'feature_vector' column holding, for every row, a list of
        0/1 flags (one flag per vocabulary word, 1 when the word occurs in
        the row's processed text);
      * drops the original 'text' column.

    The vocabulary is the first 25000 entries of the NLTK `words` corpus,
    lower-cased and then shuffled with NumPy's global RNG. The position of
    each word in the vector therefore differs between runs unless the
    caller seeds `np.random` beforehand.

    Args:
        df: Dataframe with a 'text' column of raw email strings.

    Returns:
        None. The dataframe is changed in place.
    """

    # Create feature vector column (object dtype so each cell can hold a list)
    df['feature_vector'] = None

    # Generate and randomize word list
    word_list = [word.lower() for word in words.words()[:25000]]
    np.random.shuffle(word_list)

    # Walk the actual index labels so frames whose index is not 0..n-1
    # (e.g. after filtering rows) are handled as well.
    for idx in df.index:

        # A set makes each of the 25000 membership checks O(1) instead of
        # scanning the token list every time; the resulting flags are identical.
        tokens = set(proc_str(df.at[idx, 'text']))
        df.at[idx, 'feature_vector'] = [1 if word in tokens else 0 for word in word_list]

    # Drop original text column
    df.drop(columns='text', inplace=True)

In [20]:
def get_model_data(df: pd.DataFrame, cv: Optional[bool] = True) -> tuple:
    """
    Wrapper function to generate model data.

    Builds a feature matrix from the emails in `df` and splits it, together
    with the 'label_num' column, into train and test parts (80% / 20%,
    fixed `random_state=0`).

    Args:
        df: Dataframe with a 'text' column and a 'label_num' column.
        cv: Truthy selects scikit-learn's CountVectorizer with `proc_str`
            as analyzer. Falsy selects the self-defined 'vectorizer',
            `proc_df`, which modifies `df` in place (it adds
            'feature_vector' and drops 'text'), so the same dataframe
            cannot be vectorized again afterwards.

    Returns:
        The tuple produced by `train_test_split`, unpacked by callers as
        (X_train, X_test, y_train, y_test).
    """

    if cv:
        fv = CountVectorizer(analyzer=proc_str).fit_transform(df['text'])
    else:
        proc_df(df)
        # 'feature_vector' is a column of Python lists. Stack it into a 2-D
        # (rows x vocabulary) array, because estimators cannot convert a
        # Series of lists into a numeric matrix.
        fv = np.array(df['feature_vector'].tolist())

    return train_test_split(fv, df['label_num'], test_size=0.20, random_state=0)

In [21]:
def proc_str(raw) -> list:
    """
    Turn a raw string into a list of lower-cased tokens without stopwords.

    Reads the module-level `stop_words` set, which must be defined before
    this function is called.
    """

    # Replace every run of characters other than letters, apostrophes and
    # spaces with a single space
    cleaned = re.sub("[^a-zA-Z' ]+", ' ', raw)

    # Tokenize and skip the first token
    tokens = word_tokenize(cleaned)[1:]

    # Keep the lower-cased form of every token that is not a stopword
    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)

    return kept

In [22]:
def model_report(ref, pred) -> None:
    """
    Print the classification report, confusion matrix and accuracy score
    obtained by comparing the predictions `pred` with the reference
    labels `ref`.
    """

    report = classification_report(ref, pred)
    matrix = confusion_matrix(ref, pred)
    accuracy = accuracy_score(ref, pred)

    print(f"""
    Classification Report:\n{report}\n
    Confusion Matrix:\n{matrix}\n
    Accuracy Score:\n{accuracy}
    """)

# Label_num
0: Not spam
1: Spam

In [41]:
# Load the dataset from CSV; the 'label' and 'Unnamed: 0' columns are dropped on load
df = prep_df()

In [42]:
# Global stopword set; proc_str reads this name, so it must exist before vectorizing
stop_words = prep_sw()

In [43]:
# Vectorize with CountVectorizer (cv=True) and split 80/20 into train and test sets
X_train, X_test, y_train, y_test = get_model_data(df, cv=True)

In [45]:
# Fit a multinomial Naive Bayes classifier on the training split
classifier = MultinomialNB().fit(X_train, y_train)

In [46]:
# Predict on the same data the model was fitted on
train_pred = classifier.predict(X_train)

In [47]:
# Print metrics for the training split
model_report(y_train, train_pred)


    Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2940
           1       0.99      0.97      0.98      1196

    accuracy                           0.99      4136
   macro avg       0.99      0.98      0.98      4136
weighted avg       0.99      0.99      0.99      4136


    Confusion Matrix:
[[2924   16]
 [  36 1160]]

    Accuracy Score:
0.9874274661508704
    


In [48]:
# Predict on the held-out test split
test_pred = classifier.predict(X_test)

In [49]:
# Print metrics for the test split
model_report(y_test, test_pred)


    Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       732
           1       0.97      0.95      0.96       303

    accuracy                           0.98      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035


    Confusion Matrix:
[[722  10]
 [ 15 288]]

    Accuracy Score:
0.9758454106280193
    
