In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

The model we present is based on english text only.

In [2]:
X = []
Y = []

import os

for i in range(1,7):
    spam_path = f'./enron/enron{i}/spam'
    # List all files in the folder
    cnt_spam = 0
    files = os.listdir(spam_path)
    for file_name in files:
        # Construct the full path to the file
        file_path = os.path.join(spam_path, file_name)
        if os.path.isfile(file_path):
            # Open and read the file
            with open(file_path, 'r', encoding='latin-1') as file:
                content = file.read()
                X.append(content)
                Y.append(1)
                cnt_spam += 1
        
    ham_path = f'./enron/enron{i}/ham'       
    # List all files in the folder
    cnt_ham = 0
    files = os.listdir(ham_path)
    for file_name in files:
        # Construct the full path to the file
        file_path = os.path.join(ham_path, file_name)
        if os.path.isfile(file_path):
            # Open and read the file
            with open(file_path, 'r', encoding='latin-1') as file:
                content = file.read()
                X.append(content)
                Y.append(0)
                cnt_ham += 1


# split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### FEATURE EXTRACTION

We remove all punctuation and convert all characters to lowercase, then tokenize the string.

In [3]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

We also replace any number with "NUMBER", an email with "EMAIL" and a hyper link with "URL" before removing punctuations. We match the text using regex

In [4]:
import re

def parse_text(text):
    text = text.lower()
    text = re.sub(r'\$[\s]*[0-9]+', ' MONEY ', text)
    text = re.sub(r'(dollar|dollars|usd|rupee|rupees|pounds|pound|inr|money)', ' MONEY ', text)
    text = re.sub(r'[0-9]+', ' NUMBER ', text)
    text = re.sub(r'[^\s]+(\s?)\@(\s?)[^\s]+', ' EMAIL ', text)
    text = re.sub(r'[^\s]*(\.com|\.in|\.co\.in|\.co\.uk)', ' LINK ', text)
    text = re.sub(r'(https|http)(\s?):(\s?)/(\s?)/(\s?)[^\s]*', ' LINK ', text)
    for punct in string.punctuation:
        text = text.replace(punct, ' ')
    return text

def tokenize(text):
    lst = text.split()
    if len(lst) == 0: return []

    fin_lst = []
    if lst[0] != "subject":  # particular to enron dataset since it begins with subject in all emails
        fin_lst.append(lst[0])
    for i in range(1, len(lst)):
        if len(lst[i]) <= 1: continue
        fin_lst.append(lst[i])
    
    return fin_lst

We remove common words in english such as "I", "Me","Myself" etc

In [5]:
import nltk
nltk.download('stopwords')
for i in range(10):
    print(nltk.corpus.stopwords.words('english')[i])

i
me
my
myself
we
our
ours
ourselves
you
you're


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def rem_common_words(text):
    common_words = nltk.corpus.stopwords.words('english')
    return [word for word in text if word not in common_words]

We then lemmatize the word, which groups words that have the same core word. An example is _going, goes, gone_ which all have core meaning _go_.

In [7]:
from nltk import WordNetLemmatizer as wnl

def lemmatize_text(text):
    lemmatizer = wnl()
    return [lemmatizer.lemmatize(word) for word in text]

Final function which converts string to vector of words

In [8]:
def process_text(text):
    text = parse_text(text)
    text_list = tokenize(text)
    text_list = rem_common_words(text_list)
    text_list = lemmatize_text(text_list)

    #remove duplicates
    text_list = list(set(text_list))
    return text_list


We will now create a vocabulary array for a given text, which describes the presence of words within the text. 
We reduce the number of features by removing words that have occurred in negligible number of emails:
(Each word in a mail contributes only once)

In [9]:
def get_vocab(processed_emails):
    vocab = dict()
    for mail in processed_emails:
        for word in mail:
            if word in vocab.keys():
                vocab[word] += 1
            else:
                vocab[word] = 1
    vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    # Now we remove words which are too rare 
    # less than 1% of the emails
    # which may carry no significance
    rem_words = []
    for word in vocab:
        if word[1] <= 0.01*len(processed_emails) :
            rem_words.append(word)
    for word in rem_words:
        vocab.remove(word)
    print(len(vocab))
    return vocab

def get_indices(vocab):
    indices = dict()
    for i, word in enumerate(vocab):
        indices[word[0]] = i
    return indices

In [10]:
def get_features_map(emails):
    processed_emails = [process_text(email) for email in emails]
    print("done processing")
    vocab = get_vocab(processed_emails)
    indices = get_indices(vocab)
    features = []
    for mail in processed_emails:
        feature = np.zeros(len(vocab), dtype=int)
        for word in mail:
            if word in indices.keys():
                feature[indices[word]] = 1
        features.append(feature)
    return features, indices

In [11]:
X_train, indices = get_features_map(X_train)

done processing
1604


In [12]:
def train(X, y):
    # to laplace smooth
    X.append(np.ones(len(X[0]), dtype=int))
    y.append(1)
    X.append(np.ones(len(X[0]), dtype=int))
    y.append(0)

    p = sum(y) / len(y)
    p_spam = np.zeros(len(X[0]), dtype = float)
    for i in range(len(X)):
        if y[i]:
            p_spam += X[i]
    p_spam = (p_spam) / (sum(y))
    p_ham = np.zeros(len(X[0]), dtype = float)
    for i in range(len(X)):
        if not y[i]:
            p_ham += X[i]
    p_ham = (p_ham) / (len(y) - sum(y))

    return p, p_spam, p_ham

p, p_spam, p_ham = train(X_train, Y_train)


In [13]:
def convert_to_feature(text, indices):
    text_list = process_text(text)
    feature = np.zeros(len(indices), dtype=int)
    for word in text_list:
        if word in indices.keys():
            feature[indices[word]] = 1
    return feature

def bernoulli(p_d,x_d):
    D = len(x_d) # number of features
    P = 1 # probability of the data point
    for d in range(D):
        if x_d[d] == 1:
            P *= p_d[d]
        else:
            P *= 1-p_d[d]
    return P

def predict(X_feature, p, p_spam, p_ham):
    y_pred = []
    for i in range(len(X_feature)):
        p_x_spam = bernoulli(p_spam,X_feature[i]) * p
        p_x_ham = bernoulli(p_ham,X_feature[i]) * (1-p)
        if p_x_spam > p_x_ham:
            y_pred.append(1)
        else:
            y_pred.append(0)
    return y_pred

def accuracy(y_pred, y):
    correct = 0
    for i in range(len(y)):
        if y_pred[i] == y[i]:
            correct += 1
    return correct / len(y)


## NAIVE BAYES:

In [14]:
X_test = [convert_to_feature(text, indices) for text in X_test]

y_pred = predict(X_test, p, p_spam, p_ham)
accuracy(y_pred, Y_test)

0.9384638196915777

## SVM: 

In [15]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, Y_train)

In [16]:
y_pred = model.predict(X_test)
model.score(X_test, Y_test)

0.9802787663107948

In [17]:
from sklearn.metrics import f1_score
f1_score(Y_test, y_pred)

0.9805013927576601

We export the trained model as a binary file

In [19]:
import pickle

# save the trained classification model as a pickle file
model_pkl_file = "spam_classifier_model.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model, file)
