## Library imports
Import all the library's required for this notebook.

In [4]:
import os
import os.path
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from IPython.core.display import display
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier


## Check Python Version
Check Your Python version before running this notebook.
- Python 3.6.X is required to run this notebook.

In [5]:
import re
import sys


class PythonVersionException(Exception):
    print('Please use Python version 3.6.x')
    pass;


if re.match('3.6*', sys.version.split('(')[0]) is None:
    raise PythonVersionException


Please use Python version 3.6.x


## NLTK Corpus Sets
Run this section to check if the following corpus datasets have been downloaded, if they are missing this will download
them for you.

In [6]:
import nltk

nltkDataDir = '../data/nltk_data'

nltk.data.path.append(os.path.abspath(nltkDataDir))

try:
    stopwords.words('english')
except(LookupError, OSError) as e:
    nltk.download('stopwords', nltkDataDir)

try:
    WordNetLemmatizer().lemmatize("testing")
except(LookupError, OSError) as e:
    nltk.download('punkt', nltkDataDir)
    nltk.download('wordnet', nltkDataDir)


In [11]:
stoplist = stopwords.words('english')
remove_common_artifacts = False
verbose_logs = False

dictionary = []
formatted_dictionary = []
selected_set = None

train_features = []
train_labels = []

train_dev_features = []
train_dev_labels = []

test_features = []
test_labels = []

all_features = []
all_labels = []

all_features_scaled = 0
train_features_scaled = 0
train_dev_features_scaled = 0
test_features_scaled = 0

max_iterations = 10000

kn_classifier = KNeighborsClassifier(algorithm='brute')
mpnn_classifier = MLPClassifier(hidden_layer_sizes=50, solver='lbfgs', max_iter=5000)
lr_classifier = LogisticRegression(solver='lbfgs', max_iter=max_iterations)
rf_classifier = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
xgboost = XGBClassifier()
mnb_classifier = MultinomialNB()
gnb_classifier = GaussianNB()
bnb_classifier = BernoulliNB()
linear_svc = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', shrinking=True, probability=True,
                     tol=0.001, cache_size=1000, max_iter=max_iterations, decision_function_shape='ovr')

models = [kn_classifier,
          mpnn_classifier,
          lr_classifier,
          rf_classifier,
          xgboost,
          mnb_classifier,
          gnb_classifier,
          bnb_classifier,
          linear_svc,
          ]

cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

scores = []
scoring = ['precision_macro', 'recall_macro', 'f1']
scoring_parse_labels = ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro', 'test_f1']
report_labels = ['model_name', 'fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro', 'test_f1']


def enron_selector(enable_preprocess, save_files):  # function to identify platform and selected dataset to be applied.
    print('Loading enron corpus')

    def init_lists(folder_collection, label):  # function to retrieve and apply email content to array.
        a_list = []
        doc_id = 0
        file_list = []
        label = "Loading " + label + "..."
        for entry in folder_collection:
            b_list = os.listdir(entry)
            for item in b_list:
                file_list.append(entry + item)
        for a_file in file_list:
            f = open(a_file, 'r')
            if verbose_logs:
                process_status(doc_id, file_list, label)
            try:
                a_list.append(f.read())
            except UnicodeDecodeError:
                pass
            doc_id += 1
        f.close()
        return a_list

    enron_ = ['Enron1/', 'Enron2/', 'Enron3/', 'Enron4/', 'Enron5/', 'Enron6/']
    spam = []
    ham = []

    for i, sub in enumerate(enron_):
        spam.append('../data/enron_dataset/Enron/Processed/' + enron_[i] + 'spam/')
        ham.append('../data/enron_dataset/Enron/Processed/' + enron_[i] + 'ham/')

    spam = init_lists(spam, "spam")
    ham = init_lists(ham, "ham")

    if enable_preprocess:
        all_emails = [(email, 'spam') for email in spam]
        all_emails += [(email, 'ham') for email in ham]
        ham_emails, spam_emails = preprocess(all_emails)
    else:
        ham_emails = [(email, 'ham') for email in ham]
        spam_emails = [(email, 'spam') for email in spam]

    if save_files:
        if remove_common_artifacts:
            ham_file, spam_file = "../data/processed_ham_remove_common_artifacts.txt", "../data/processed_spam_remove_common_artifacts.txt"
        else:
            ham_file, spam_file = "../data/processed_ham.txt", "../data/processed_spam.txt"

        print("Writing ham file...")
        with open(ham_file, 'w') as fp:
            fp.write('\n'.join('{} {};'.format(x[0], x[1]) for x in ham_emails))
        print("Writing spam file...")
        with open(spam_file, 'w') as fp:
            fp.write('\n'.join('{} {};'.format(x[0], x[1]) for x in spam_emails))
    return ham_emails, spam_emails


def html_list():
    html_tag_list = []
    location = '../data/html_tag_list.txt'
    f = open(location, 'r')
    for i in f:
        html_tag_list.append(i.strip())
    f.close()
    return html_tag_list


def preprocess(collection):  # function to apply pre-processing: stop words, lemmatise.
    lemmatizer = WordNetLemmatizer()
    print('Pre-processing emails...')
    entry_id = 0
    doc_id = 0
    processed = []
    html_tags = [html_list()]
    common_artifacts = ['enron', 'subject', 'mail', 'cc', '``', 'email', '\n', 'www', 'com', '\nsubject']
    for entry in collection:
        if verbose_logs:
            process_status(doc_id, collection, 'Pre-processing emails...')
        for i, line in enumerate(entry):
            emails = ''
            if i == 0:
                for word in word_tokenize(line):
                    item = lemmatizer.lemmatize(word.lower())
                    if not item in stoplist:
                        if word.isalnum() == False or word in html_tags:
                            pass
                        elif remove_common_artifacts == True and item in common_artifacts:
                            pass
                        else:
                            emails = emails + item + ','
                processed.append(tuple((emails, entry[1])))
                del emails
                entry_id += 1
        doc_id += 1
    email_list = []
    ham = []
    spam = []
    for entry in processed:
        if entry[1] == 'ham':
            email_list.append(entry)
            ham.append(entry)
    for entry in processed:
        if entry[1] == 'spam':
            email_list.append(entry)
            spam.append(entry)
    return ham, spam


def dictionary_build(all_emails, preprocess):
    print("Building Dictionary...")

    html_tags = [html_list()]
    common_artifacts = ['enron', 'subject', 'mail', 'cc', '``', 'email', '\n', 'www', 'com', '\nsubject']
    all_words = []
    for entry in all_emails:
        for sentence in entry:
            if not sentence == 'ham' or sentence == 'spam':
                words = str(sentence).split(',')
                for word in words:
                    all_words.append(word)
    dict = Counter(all_words)
    if preprocess:
        list_to_remove = list(dict)
        for item in list_to_remove:
            if len(item) <= 1:
                del dict[item]
            elif item in html_tags:
                del dict[item]
            elif str(item).isdigit():
                del dict[item]
            elif remove_common_artifacts == True and item in common_artifacts:
                del dict[item]

    return dict


# function to extract features to matrix based on calculating occurrence of words based
def extract_features(data, label):
    # on dictionary.
    features_matrix = np.zeros((len(data), len(dictionary)))
    print('Feature extraction \'' + label + '\':')
    doc_id = 0
    all_words = []
    for entry in data:
        if verbose_logs:
            process_status(doc_id, data, label)
        for i, line in enumerate(entry):
            if i == 0:
                # print('[' + str(doc_id) + '] ', entry)
                words = line.split(',')
                for word in words:
                    all_words.append(words)
                    for j, d in enumerate(dictionary):
                        if d[0] == word:
                            word_id = j
                            features_matrix[doc_id, word_id] = words.count(word)
        doc_id = doc_id + 1
    return features_matrix


def calculate(ham, spam):
    main_proportion = 0.8
    ham_size = int(len(ham) * main_proportion)
    ham_train, ham_test = ham[:ham_size], ham[ham_size:]
    spam_size = int(len(spam) * main_proportion)
    spam_train, spam_test = spam[:spam_size], spam[spam_size:]
    ham_train_size, spam_train_size = int(len(ham_train) * main_proportion), int(len(spam_train) * main_proportion)
    ham_train, ham_train_dev = ham_train[:ham_train_size], ham_train[ham_train_size:]
    spam_train, spam_train_dev = spam_train[:spam_train_size], spam_train[spam_train_size:]
    train_set, train_dev_set, test_set = ham_train + spam_train, ham_train_dev + spam_train_dev, ham_test + spam_test
    train_labels = np.zeros(len(train_set))
    train_labels[(int((len(train_set)) - len(spam_train))):len(train_set)] = 1
    train_dev_labels = np.zeros(len(train_dev_set))
    train_dev_labels[(int((len(train_dev_set)) - len(spam_train_dev))):len(train_dev_set)] = 1
    test_labels = np.zeros(len(test_set))
    test_labels[(int((len(test_set)) - len(spam_test))):len(test_set)] = 1

    all_set = ham_train + ham_train_dev + ham_test + spam_train + spam_train_dev + spam_test
    all_labels = np.zeros(len(all_set))
    all_labels[(int((len(all_labels)) - len(spam_train + spam_train_dev + spam_test))): len(all_set)] = 1

    print("Train set:\n", "Ham: ", str(len(ham_train)), "\n", "Spam: ", str(len(spam_train)),
          "\nTrain_Dev:\n Ham:", str(len(ham_train_dev)), "\n Spam:", str(len(spam_train_dev)),
          "\nTest set:\n", "Ham: ", str(len(ham_test)), "\n", "Spam: ", str(len(spam_test)),
          "\nAll set:\n", "Ham: ", str(len(ham_train + ham_train_dev + ham_test)), "\n", "Spam: ",
          str(len(spam_train + spam_train_dev + spam_test)))
    return train_set, test_set, train_labels, test_labels, train_dev_set, train_dev_labels, all_set, all_labels


def load_(file, label):
    if verbose_logs == False:
        print('Loading ' + label + ' dataset')
    with open(file, 'r') as fp:
        values = []
        doc_id = 0
        size_file = fp.read().split(";")
        for item in size_file:
            if verbose_logs:
                process_status(doc_id, size_file, "loading " + label + " file...")
            values.append(item.split(", "))
            doc_id += 1
        return values


def process_status(id, data, label):
    if id + 1 < int(len(data)):
        end_atp = "\r"
    elif id + 1 <= int(len(data)):
        end_atp = "\n"
    return print(label, '%0.0i out of %0.0i: %0.0i' %
                 (id + 1, len(data), int((id + 1) * (100 / len(data)))), '%', end='\r', flush=True)


def determine_model_name(model):
    model_name = type(model).__name__
    if model_name.lower() == 'svc':
        return model_name + '_' + model.kernel
    else:
        return type(model).__name__


def handle_scores_to_csv(file_name, save):
    processed_scores = []
    for i, d in enumerate(scores):
        processed_score = [d[0]]
        print(d)
        for c, e in enumerate(scoring_parse_labels):
            item = d[1][e]
            item = item.astype(np.float)
            if scoring_parse_labels[c] == 'fit_time' or scoring_parse_labels[c] == 'score_time':
                processed_score.append('%0.6f' % (np.mean(item)))
            else:
                processed_score.append('%0.2f' % (float(np.mean(item))))
        processed_scores.append(processed_score)

    df = pd.DataFrame(processed_scores, columns=report_labels)
    if save:
        df.to_csv(file_name)

    return df


def roc_curve_report(X, y):
    for a in range(len(models)):
        model = models[a]
        model_name = determine_model_name(model)
        try:
            aucs = []
            i = 0;
            for train, test in cv.split(X, y):
                probas_ = model.fit(X[train], y[train]).predict_proba(X[test])
                fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
                roc_auc = auc(fpr, tpr)
                aucs.append(roc_auc)
                plt.plot(fpr, tpr, lw=1, alpha=0.5, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
                i += 1
            plt.xlim([-0.05, 1.05])
            plt.ylim([-0.05, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('ROC: ' + model_name)
            plt.legend(loc="lower right")
            plt.savefig(
                "../results/plots/test/devROC_%s_%0i_features_%0i_test.png" % (
                    model_name, feature_size, len(X)),
                dpi=100,
                facecolor='w', edgecolor='b', linewidth=1, orientation='portrait', papertype=None,
                format="png", transparent=False, bbox_inches=None, pad_inches=0.1, frameon=None)
            print("Created %s ROC figure" % model_name)
            plt.close()
        except (AttributeError, OverflowError) as detail:
            print(model_name + " Failed due to ", detail)


def shap_report(X, y, model):
    X = pd.DataFrame(X)
    y = pd.DataFrame(y)

    train_X, val_X, train_y, val_y = train_test_split(X, y.values.ravel(), random_state=1)

    model.fit(train_X, train_y)
    med = train_X.mean().values.reshape((1, train_X.shape[1]))

    # Create object that can calculate shap values
    explainer = shap.KernelExplainer(model.predict_proba, med)
    # Calculate Shap values
    shap_values = explainer.shap_values(val_X.iloc[0:len(train_X), :], nsamples=len(train_X))

    shap.summary_plot(shap_values[1], val_X.iloc[0:len(train_X), :], formatted_dictionary, show=False, color_bar=True,
                      max_display=10,
                      plot_size=(11, 8))
    report_file_name = str(
        len(dictionary)) + '_features_' + determine_model_name(
        model) + '_shap_plot_beeswarm_' + datetime.now().strftime(
        '%Y-%m-%dT%H-%M-%S%z') + '_.pdf'

    plt.savefig('../results/shap/' + report_file_name)
    plt.close()


def models_report(X, y, save):  # function to test and record via csv, all algorithms selected.
    print('Processing model tests...')
    for a in range(len(models)):
        model = models[a]
        model_name = determine_model_name(model)
        print('{} testing model...'.format(model_name))
        scores.append([model_name, cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)])

    report_file_name = str(len(dictionary)) + '_features' '_models_test_report_' + datetime.now().strftime(
        '%Y-%m-%dT%H-%M-%S%z') + '.csv'

    df = handle_scores_to_csv('../results/scores/{}'.format(report_file_name), save)
    print('Models test finished.')
    return df


def lr_test(X, y):
    if len(scores) > 0:
        scores.clear()

    scores.append(
        [determine_model_name(lr_classifier),
         cross_validate(lr_classifier, X, y, cv=cv, scoring=scoring, return_train_score=False)])
    handle_scores_to_csv('../results/scores/logistic_regression_test.csv', True)


df = pd.DataFrame(scores, columns=['model_name', 'f1_score'])
df.to_csv('../results/scores/svm_test_scores.csv')


# User Input
Please select the following options to for the notebook to process, please run the cell below to render the widgets.

In [12]:
from ipywidgets import widgets, GridspecLayout

defined_widgets = []

preprocess_label = widgets.Label('Preprocessing and Feature Extraction Stage:')
defined_widgets.append(preprocess_label)

feature_size_widget = widgets.IntSlider(
    value=200,
    min=50,
    max=1000,
    description='Feature size'
)
defined_widgets.append(feature_size_widget)

show_logs = widgets.Checkbox(
    value=False,
    description='Show logs during process',
)
defined_widgets.append(show_logs)

enable_preprocess_widget = widgets.Checkbox(
    value=True,
    description='Enable preprocessing data.',
)
defined_widgets.append(enable_preprocess_widget)

exclude_common_artifacts = widgets.Checkbox(
    value=True,
    description='Exclude common artifacts',
)
defined_widgets.append(exclude_common_artifacts)

reprocess_saved_feature_set = widgets.Checkbox(
    value=True,
    description='Use processed feature dataset',
)
defined_widgets.append(reprocess_saved_feature_set)

test_label_widget = widgets.Label('Set selection')
defined_widgets.append(test_label_widget)

test_with_all = widgets.Checkbox(
    value=False,
    description='Full Dataset',
)
defined_widgets.append(test_with_all)

test_with_train = widgets.Checkbox(
    value=False,
    description='Train Dataset',
)
defined_widgets.append(test_with_train)

test_with_dev = widgets.Checkbox(
    value=False,
    description='Development Dataset',
)
defined_widgets.append(test_with_dev)

test_with_test = widgets.Checkbox(
    value=True,
    description='Test Dataset',
)
defined_widgets.append(test_with_test)

grid = GridspecLayout(len(defined_widgets), 1)

for i in range(len(defined_widgets)):
    grid[i, 0] = defined_widgets[i]

display(grid)


GridspecLayout(children=(Label(value='Preprocessing and Feature Extraction Stage:', layout=Layout(grid_area='w…

In [13]:
if __name__ == "__main__":
    feature_size = feature_size_widget.value
    verbose_logs = show_logs.value
    remove_common_artifacts = exclude_common_artifacts.value
    scores = []
    if remove_common_artifacts:
        a_files = ["../data/processed_ham_remove_common_artifacts.txt",
                   "../data/processed_spam_remove_common_artifacts.txt"]
    else:
        a_files = ["../data/processed_ham.txt", "../data/processed_spam.txt"]

    if reprocess_saved_feature_set.value:
        ham_collection = load_(a_files[0], "ham")
        spam_collection = load_(a_files[1], "spam")
    else:
        ham_collection, spam_collection = enron_selector(enable_preprocess_widget.value, True)
    dictionary = dictionary_build((ham_collection + spam_collection), enable_preprocess_widget.value)
    dictionary = dictionary.most_common(feature_size)
    formatted_dictionary = []
    for item in dictionary:
        formatted_dictionary.append(item[0])

    print(dictionary)
    train_set, test_set, train_labels, test_labels, train_dev_set, train_dev_labels, all_set, all_labels = calculate(
        ham_collection, spam_collection)

    if test_with_all.value:
        all_features = extract_features(all_set, 'all_set')
    elif test_with_train.value:
        train_features = extract_features(train_set, "train")
        selected_set = 'train'
    elif test_with_dev.value:
        train_dev_features = extract_features(train_dev_set, "train_dev")
        selected_set = 'dev'
    elif test_with_test.value:
        test_features = extract_features(test_set, "test")
        selected_set = 'test'

    if isinstance(all_features, np.ndarray):
        all_features_scaled = MinMaxScaler().fit_transform(all_features, all_labels)

    if isinstance(train_features, np.ndarray):
        train_features_scaled = MinMaxScaler().fit_transform(train_features, train_labels)

    if isinstance(train_dev_features, np.ndarray):
        train_dev_features_scaled = MinMaxScaler().fit_transform(train_dev_features, train_dev_labels)

    if isinstance(test_features, np.ndarray):
        test_features_scaled = MinMaxScaler().fit_transform(test_features, test_labels)

    print('Finished feature extraction')


Loading ham dataset
Loading spam dataset
Building Dictionary...
[('ect', 35666), ('company', 28711), ('please', 20234), ('ha', 20085), ('spam', 17847), ('wa', 17802), ('hou', 17264), ('would', 15499), ('time', 14803), ('new', 14776), ('price', 14215), ('business', 13526), ('may', 13110), ('information', 13076), ('one', 12298), ('gas', 11919), ('said', 11887), ('market', 11652), ('energy', 11411), ('year', 11365), ('get', 11176), ('http', 11136), ('day', 10835), ('message', 10689), ('need', 10656), ('stock', 10416), ('deal', 9996), ('know', 9682), ('pm', 9676), ('service', 9634), ('also', 9229), ('report', 8988), ('power', 8709), ('security', 8623), ('vince', 8621), ('thanks', 8389), ('week', 8363), ('like', 8254), ('statement', 7960), ('corp', 7950), ('make', 7856), ('number', 7838), ('million', 7755), ('inc', 7395), ('group', 7387), ('could', 7330), ('sent', 7173), ('share', 7157), ('risk', 7129), ('product', 7073), ('trading', 6956), ('investment', 6926), ('money', 6839), ('see', 675

# Test Selection
Please make sure the last cell have completed the run successfully.

Run this cell to determine which feature set you've selected for reporting, refer to the user input cell on which set
was selected.

In [None]:
if test_with_train.value:
    selected_features = train_features_scaled
    selected_labels = train_labels
    print('Train set selected')
elif test_with_dev.value:
    selected_features = train_dev_features_scaled
    selected_labels = train_dev_labels
    print('Development set selected')
elif test_with_test.value:
    selected_features = test_features_scaled
    selected_labels = test_labels
    print('Test set selected')
else:
    selected_features = all_features_scaled
    selected_labels = all_labels
    print('Full dataset selected, size: {}'.format(len(selected_features)))
    print('WARNING - ANY REPORT WILL TAKE A LONG TIME TO COMPLETE DUE TO THE SIZE OF THE DATASET SELECTED')

df = pd.DataFrame(selected_features, columns=formatted_dictionary)


## Model Report
Run this cell to create the model report.

In [None]:
models_report(selected_features, selected_labels, True)

## ROC Plots
Run this cell to create the ROC plots for all models.

In [None]:
roc_curve_report(selected_features, selected_labels)

## SHAP Plots
**---WARNING---**

This report takes a considerable amount of time to run!

Run this cell to create the SHAP Plots for the top four models.

In [None]:
shap_report(selected_features, selected_labels, lr_classifier)

In [None]:
shap_report(selected_features, selected_labels, mpnn_classifier)

In [None]:
shap_report(selected_features, selected_labels, xgboost)

In [None]:
shap_report(selected_features, selected_labels, kn_classifier)


## T_test figures report
The following cell creates the report for all the figures to be used in the T_test implementation in R.

In [None]:
def t_test_results(X, y, iterations):  # function to test and record via csv, all algorithms selected.
    cv_collection = []

    # Ensure all cv collections are the same for each iteration of each model since shuffle is set to true.
    for i in range(iterations):
        cv_t = StratifiedKFold(n_splits=5, shuffle=True)
        cv_collection.append(cv_t)

    print('Processing features test...')

    for a in range(len(models)):
        model = models[a]
        model_name = type(model).__name__
        if model_name.lower() == 'svc':
            model_name_field = model_name + '_' + model.kernel
        else:
            model_name_field = type(model).__name__

        model_score = []
        for count in range(iterations):
            print('batch: {} | model: {}, processing...'.format(count + 1, model_name_field))
            score = cross_validate(models[a], X, y, cv=cv_collection[count], scoring='f1', return_train_score=False)
            if len(model_score) != 0:
                model_score = np.append(model_score, score['test_score'])
            else:
                model_score.append(score['test_score'])

        scores.append([model_name_field, ','.join(str(v) for v in model_score)])

    df = pd.DataFrame(scores, columns=['model_name', 'f1_score'])
    df.to_csv('../results/scores/t_test_values.csv')

    return df


t_test_results(selected_features, selected_labels, 4)

## Feature selection test

In [16]:
def feature_size_test():
    feature_sizes = [10, 25, 50, 75, 100, 125, 150, 200]
    df_results = []

    for size in feature_sizes:
        if len(scores) > 0:
            scores.clear()

        dictionary = dictionary_build((ham_collection + spam_collection), enable_preprocess_widget.value)
        dictionary = dictionary.most_common(size)
        formatted_dictionary = []
        for item in dictionary:
            formatted_dictionary.append(item[0])

        print(size)
        print(dictionary)

        train_set, test_set, train_labels, test_labels, train_dev_set, train_dev_labels, all_set, all_labels = calculate(
            ham_collection, spam_collection)

        test_features = extract_features(test_set, "test")

        test_features_scaled = MinMaxScaler().fit_transform(test_features, test_labels)

        df_results.append(models_report(test_features_scaled, test_labels, False))

    pd.DataFrame().append([df_results[0],
                           df_results[1],
                           df_results[2],
                           df_results[3],
                           df_results[4],
                           df_results[5],
                           df_results[6],
                           df_results[7]]).to_csv('../results/scores/feature_selection_test_report.csv')


feature_size_test()

Building Dictionary...
10
[('ect', 35666), ('company', 28711), ('please', 20234), ('ha', 20085), ('spam', 17847), ('wa', 17802), ('hou', 17264), ('would', 15499), ('time', 14803), ('new', 14776)]
Train set:
 Ham:  10588 
 Spam:  10978 
Train_Dev:
 Ham: 2648 
 Spam: 2745 
Test set:
 Ham:  3310 
 Spam:  3431 
All set:
 Ham:  16546 
 Spam:  17154
Feature extraction 'test':
Processing model tests...
KNeighborsClassifier testing model...
MLPClassifier testing model...
LogisticRegression testing model...
RandomForestClassifier testing model...
XGBClassifier testing model...
















MultinomialNB testing model...
GaussianNB testing model...
BernoulliNB testing model...
SVC_linear testing model...
['KNeighborsClassifier', {'fit_time': array([0.00499773, 0.007478  , 0.00599742, 0.0065012 , 0.00849986]), 'score_time': array([0.22300339, 0.19750738, 0.21149993, 0.25449729, 0.22699928]), 'test_precision_macro': array([0.83618452, 0.84612531, 0.81942448, 0.7632433 , 0.785552  ]), 'test_recall_macro': array([0.83496484, 0.82976535, 0.80534294, 0.74792571, 0.76789568]), 'test_f1': array([0.84188034, 0.84936961, 0.82689747, 0.7787262 , 0.79712042])}]
['MLPClassifier', {'fit_time': array([19.08197665, 16.04750013, 20.47950053, 29.36449981, 28.21449828]), 'score_time': array([0.00649595, 0.0059998 , 0.00750065, 0.00649834, 0.00599909]), 'test_precision_macro': array([0.94162501, 0.93844634, 0.92236785, 0.86750802, 0.87778601]), 'test_recall_macro': array([0.94170657, 0.93836814, 0.92008271, 0.86694397, 0.87738587]), 'test_f1': array([0.94161123, 0.93963636, 0.92416726, 0.871















MultinomialNB testing model...
GaussianNB testing model...
BernoulliNB testing model...
SVC_linear testing model...
['KNeighborsClassifier', {'fit_time': array([0.00549865, 0.00699782, 0.00599861, 0.00650096, 0.00599957]), 'score_time': array([0.22750282, 0.22850013, 0.21850038, 0.21749926, 0.24950051]), 'test_precision_macro': array([0.83618452, 0.84612531, 0.81942448, 0.7632433 , 0.785552  ]), 'test_recall_macro': array([0.83496484, 0.82976535, 0.80534294, 0.74792571, 0.76789568]), 'test_f1': array([0.84188034, 0.84936961, 0.82689747, 0.7787262 , 0.79712042])}]
['MLPClassifier', {'fit_time': array([17.44099927, 28.4109993 , 29.43099809, 32.99699903, 12.97249961]), 'score_time': array([0.00699782, 0.00750113, 0.00700164, 0.00900102, 0.00750184]), 'test_precision_macro': array([0.9478631 , 0.93248177, 0.91225615, 0.87087895, 0.90057009]), 'test_recall_macro': array([0.94774887, 0.93245796, 0.90964081, 0.87087895, 0.89948958]), 'test_f1': array([0.9472119 , 0.93372178, 0.91448763, 0.873















MultinomialNB testing model...
GaussianNB testing model...
BernoulliNB testing model...
SVC_linear testing model...
['KNeighborsClassifier', {'fit_time': array([0.00649834, 0.00600076, 0.00701714, 0.00700092, 0.00549841]), 'score_time': array([0.22550249, 0.22049594, 0.23597407, 0.24650025, 0.22650194]), 'test_precision_macro': array([0.83618452, 0.84612531, 0.81942448, 0.7632433 , 0.785552  ]), 'test_recall_macro': array([0.83496484, 0.82976535, 0.80534294, 0.74792571, 0.76789568]), 'test_f1': array([0.84188034, 0.84936961, 0.82689747, 0.7787262 , 0.79712042])}]
['MLPClassifier', {'fit_time': array([18.61999917, 23.9425025 , 18.61150217, 19.65699983, 22.97499967]), 'score_time': array([0.00649786, 0.00699997, 0.00749969, 0.00700068, 0.00600123]), 'test_precision_macro': array([0.95206996, 0.94580632, 0.8971298 , 0.87246737, 0.89850802]), 'test_recall_macro': array([0.95211568, 0.94586816, 0.89571314, 0.87225741, 0.89547532]), 'test_f1': array([0.95188749, 0.94667641, 0.90014265, 0.875















MultinomialNB testing model...
GaussianNB testing model...
BernoulliNB testing model...
SVC_linear testing model...
['KNeighborsClassifier', {'fit_time': array([0.00699568, 0.00547767, 0.00749779, 0.00650096, 0.00650048]), 'score_time': array([0.2300036 , 0.21800089, 0.23050046, 0.22499681, 0.24700236]), 'test_precision_macro': array([0.83618452, 0.84612531, 0.81942448, 0.7632433 , 0.785552  ]), 'test_recall_macro': array([0.83496484, 0.82976535, 0.80534294, 0.74792571, 0.76789568]), 'test_f1': array([0.84188034, 0.84936961, 0.82689747, 0.7787262 , 0.79712042])}]
['MLPClassifier', {'fit_time': array([24.85300112, 23.04149914, 20.87050104, 42.79499888, 16.50600123]), 'score_time': array([0.00650048, 0.00599885, 0.00600123, 0.00650048, 0.00750089]), 'test_precision_macro': array([0.94910408, 0.92808965, 0.90514406, 0.87727946, 0.88471755]), 'test_recall_macro': array([0.94914951, 0.92821691, 0.9039156 , 0.87388909, 0.88397206]), 'test_f1': array([0.94892672, 0.92851879, 0.90779128, 0.881















MultinomialNB testing model...
GaussianNB testing model...
BernoulliNB testing model...
SVC_linear testing model...
['KNeighborsClassifier', {'fit_time': array([0.00499892, 0.00600147, 0.00649953, 0.00699711, 0.00600076]), 'score_time': array([0.23047805, 0.22749996, 0.2545023 , 0.20749998, 0.23550081]), 'test_precision_macro': array([0.83618452, 0.84612531, 0.81942448, 0.7632433 , 0.785552  ]), 'test_recall_macro': array([0.83496484, 0.82976535, 0.80534294, 0.74792571, 0.76789568]), 'test_f1': array([0.84188034, 0.84936961, 0.82689747, 0.7787262 , 0.79712042])}]
['MLPClassifier', {'fit_time': array([19.99097776, 19.43000269, 29.76999784, 23.71550083, 19.9455061 ]), 'score_time': array([0.00699925, 0.0094974 , 0.01050186, 0.00649881, 0.00849509]), 'test_precision_macro': array([0.94568498, 0.94136693, 0.92154076, 0.88342683, 0.89085232]), 'test_recall_macro': array([0.94553798, 0.94138929, 0.91935384, 0.88152123, 0.888081  ]), 'test_f1': array([0.94494048, 0.94237783, 0.92340426, 0.887















MultinomialNB testing model...
GaussianNB testing model...
BernoulliNB testing model...
SVC_linear testing model...
['KNeighborsClassifier', {'fit_time': array([0.00602007, 0.00699782, 0.0095017 , 0.00600052, 0.00549936]), 'score_time': array([0.25347972, 0.24800038, 0.2475009 , 0.23949909, 0.24150252]), 'test_precision_macro': array([0.83618452, 0.84612531, 0.81942448, 0.7632433 , 0.785552  ]), 'test_recall_macro': array([0.83496484, 0.82976535, 0.80534294, 0.74792571, 0.76789568]), 'test_f1': array([0.84188034, 0.84936961, 0.82689747, 0.7787262 , 0.79712042])}]
['MLPClassifier', {'fit_time': array([37.02650118, 25.61750031, 20.03200364, 34.33050084, 20.37450027]), 'score_time': array([0.00649786, 0.00700092, 0.00649714, 0.01199937, 0.00750041]), 'test_precision_macro': array([0.94233712, 0.94209229, 0.91730268, 0.82931727, 0.89038572]), 'test_recall_macro': array([0.94243438, 0.94219742, 0.91573595, 0.82934918, 0.89001436]), 'test_f1': array([0.94239291, 0.94289898, 0.9194583 , 0.832















MultinomialNB testing model...
GaussianNB testing model...
BernoulliNB testing model...
SVC_linear testing model...
['KNeighborsClassifier', {'fit_time': array([0.00550032, 0.00597882, 0.00750351, 0.00750113, 0.008003  ]), 'score_time': array([0.23150206, 0.21600389, 0.22249794, 0.24949932, 0.26599693]), 'test_precision_macro': array([0.83618452, 0.84612531, 0.81942448, 0.7632433 , 0.785552  ]), 'test_recall_macro': array([0.83496484, 0.82976535, 0.80534294, 0.74792571, 0.76789568]), 'test_f1': array([0.84188034, 0.84936961, 0.82689747, 0.7787262 , 0.79712042])}]
['MLPClassifier', {'fit_time': array([31.67250037, 25.72799873, 28.92400146, 23.61000061, 32.7135005 ]), 'score_time': array([0.00999951, 0.00599933, 0.00699925, 0.00999951, 0.00999928]), 'test_precision_macro': array([0.9476939 , 0.94136378, 0.90960565, 0.88818607, 0.88199672]), 'test_recall_macro': array([0.9476939 , 0.94149498, 0.90748064, 0.88777492, 0.88204971]), 'test_f1': array([0.94736842, 0.94203962, 0.91205674, 0.890















MultinomialNB testing model...
GaussianNB testing model...
BernoulliNB testing model...
SVC_linear testing model...
['KNeighborsClassifier', {'fit_time': array([0.00949645, 0.00750303, 0.00700092, 0.0074985 , 0.00600076]), 'score_time': array([0.27849817, 0.23550081, 0.22649813, 0.23200107, 0.23700047]), 'test_precision_macro': array([0.83618452, 0.84612531, 0.81942448, 0.7632433 , 0.785552  ]), 'test_recall_macro': array([0.83496484, 0.82976535, 0.80534294, 0.74792571, 0.76789568]), 'test_f1': array([0.84188034, 0.84936961, 0.82689747, 0.7787262 , 0.79712042])}]
['MLPClassifier', {'fit_time': array([18.18650079, 42.91750097, 24.95550609, 30.0874989 , 16.99850154]), 'score_time': array([0.00650001, 0.00599957, 0.01149607, 0.00750113, 0.00599885]), 'test_precision_macro': array([0.94156841, 0.94365887, 0.92093035, 0.87359644, 0.89084728]), 'test_recall_macro': array([0.94167909, 0.94354945, 0.91690301, 0.86933094, 0.88985581]), 'test_f1': array([0.94169742, 0.94476744, 0.92221444, 0.877

## Hyperparameter Optimisation Tests
The following cells below contain the tests used to optimise some models used.

### K-Nearest Neighbours

In [None]:
def kn_test(X, y):
    if len(scores) > 0:
        scores.clear()

    algorithm_val = ['ball_tree', 'kd_tree', 'brute']
    model_scores = []
    processed_scores = []
    for a in range(len(algorithm_val)):
        model = KNeighborsClassifier(algorithm=algorithm_val[a])
        model_name = type(model).__name__ + '_' + model.algorithm
        model_scores.append([model_name, cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)])

    for i, d in enumerate(model_scores):
        print(d)
        for b in range(5):
            processed_score = [d[0]]
            for c, e in enumerate(scoring_parse_labels):
                item = d[1][e][b]
                item = item.astype(np.float)
                processed_score.append(item)
            processed_scores.append(processed_score)

    df = pd.DataFrame(processed_scores, columns=report_labels)
    df.to_csv('../results/scores/k_neighbour_classifier_test_score.csv')


kn_test(selected_features, selected_labels)

### Multilayer Perceptron Neural Network

In [None]:
# Multi-layer perceptron neural network test.
def mpnn_test(X, y):
    feature_sizes = [10, 25, 50, 75, 100, 125, 150, 200]
    nu_val = [25, 50, 75, 100, 200]
    h_layers = [1, 2, 3]
    iter_val = [2000, 5000, 10000]
    i = 1

    for a in range(len(nu_val)):
        for b in range(len(h_layers)):
            if b > 0:
                if b == 1:
                    nu_layer_val = nu_val[a], nu_val[a]
                elif b == 2:
                    nu_layer_val = nu_val[a], nu_val[a], nu_val[a]
            else:
                nu_layer_val = nu_val[a]
            for c in range(len(iter_val)):
                print('{} of total {}'.format(i, len(feature_sizes) * len(nu_val) * len(h_layers) * len(iter_val)))
                model = MLPClassifier(hidden_layer_sizes=(nu_layer_val), solver='lbfgs', max_iter=iter_val[c])
                scores.append(['{}_{}_{}_{}'.format(determine_model_name(model), nu_val[a], h_layers[b], iter_val[c]),
                               cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)])
                i += 1

    handle_scores_to_csv('../results/scores/mpnn_parameter_test_scores.csv', True)


mpnn_test(selected_features, selected_labels)

### Support Vector Machine

In [None]:
def svm_tests(X, y):
    if len(scores) > 0:
        scores.clear()
    kernel_val = ['linear', 'poly', 'rbf', 'sigmoid']
    model_scores = []
    processed_scores = []
    for a in range(len(kernel_val)):
        model = svm.SVC(C=1.0, kernel=kernel_val[a], degree=3, gamma='auto', coef0=0.0, shrinking=True,
                        probability=True, tol=0.001, cache_size=10000, class_weight=None, verbose=False,
                        max_iter=max_iterations, decision_function_shape='ovr', random_state=None)
        model_name = type(model).__name__ + '_' + model.kernel
        model_scores.append([model_name, cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)])

    for i, d in enumerate(model_scores):
        print(d)
        for b in range(5):
            processed_score = [d[0]]
            for c, e in enumerate(scoring_parse_labels):
                item = d[1][e][b]
                item = item.astype(np.float)
                processed_score.append(item)
            processed_scores.append(processed_score)

    df = pd.DataFrame(processed_scores, columns=report_labels)
    df.to_csv('../results/scores/svm_kernel_test_scores.csv')


svm_tests(selected_features, selected_labels)

## Dataset Preprocessing Comparison Test
The following test creates the comparison plot between preprocessed vs raw datsets.

In [None]:
def plot_test(processed_score, non_processed_score, model_labels):
    n_groups = len(model_labels)

    # create plot
    fig, ax = plt.subplots()
    index = np.arange(n_groups)
    bar_width = 0.35
    opacity = 0.8

    rects1 = plt.bar(index, processed_score, bar_width,
                     alpha=opacity,
                     color='m',
                     label='Processed')

    rects2 = plt.bar(index + bar_width, non_processed_score, bar_width,
                     alpha=opacity,
                     color='b',
                     label='Raw')

    plt.xlabel('Models')
    plt.ylabel('F1 Score')
    plt.title('Score difference between preprocessed vs raw dataset')
    plt.xticks(index + bar_width, model_labels)
    plt.legend()

    plt.tight_layout()
    plt.savefig('../results/preprocessing_score_difference.pdf', dpi=160)
    plt.show()


remove_common_artifacts = True
ham_collection_preprocessed, spam_collection_preprocessed = enron_selector(True, False)
remove_common_artifacts = False
ham_collection_no_processing, spam_collection_no_processing = enron_selector(False, False)
remove_common_artifacts = exclude_common_artifacts.value

ham_list = [ham_collection_preprocessed, ham_collection_no_processing]
spam_list = [spam_collection_preprocessed, spam_collection_no_processing]
dfs = []

for i in range(len(ham_list)):
    scores = []
    dictionary = dictionary_build((ham_list[i] + spam_list[i]), enable_preprocess_widget.value)
    dictionary = dictionary.most_common(feature_size)
    formatted_dictionary = []
    for item in dictionary:
        formatted_dictionary.append(item[0])

    print(dictionary)

    train_set, test_set, train_labels, test_labels, train_dev_set, train_dev_labels, all_set, all_labels = calculate(
        ham_list[i], spam_list[i])

    test_features = extract_features(test_set, "test")
    test_features_scaled = MinMaxScaler().fit_transform(test_features, test_labels)

    print('Finished feature extraction')

    dfs.append(models_report(test_features_scaled, test_labels))

dfs[0]['test_f1'].to_csv('../results/scores/comparison_processed_results.csv')
dfs[1]['test_f1'].to_csv('../results/scores/comparison_raw_results.csv')

# Get only top 4 models for plot.
model_select = [1, 4, 7, 8]
selected_processed_results = []
selected_non_processed_results = []
selected_model_names = []

for i, num in enumerate(model_select):
    if num == 1:
        selected_model_names.append('MPNN')
    elif num == 4:
        selected_model_names.append('XGBOOST')
    elif num == 7:
        selected_model_names.append('BernoulliNB')
    else:
        selected_model_names.append('LinearSVC')

    selected_processed_results.append(float(dfs[0]['test_f1'].iloc[num]))
    selected_non_processed_results.append(float(dfs[1]['test_f1'].iloc[num]))

plot_test(selected_processed_results, selected_non_processed_results, selected_model_names)