## Check Python Version
Check Your Python version before running this notebook.
- Python 3.6.X is required to run this notebook.

In [13]:
import re
import sys


class PythonVersionException(Exception):
    print('Please use Python verison 3.6.x')
    pass;


if re.match('3.6*', sys.version.split('(')[0]) is None:
    raise PythonVersionException

Please use Python verison 3.6.x


## Library imports
Import all the library's required for this notebook.

In [14]:
import os
import os.path
import numpy as np
from collections import Counter
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_validate, StratifiedKFold
from time import time
import matplotlib.pyplot as plt
import csv
from sklearn.preprocessing import StandardScaler, MinMaxScaler


## NLTK Corpus Sets
Run this section to check if the following corpus datasets have been downloaded, if they are missing this will download
them for you.

In [15]:
import nltk

nltkDataDir = '../data/nltk_data'

nltk.data.path.append(os.path.abspath(nltkDataDir))

try:
    stopwords.words('english')
except(LookupError, OSError) as e:
    nltk.download('stopwords', nltkDataDir)

try:
    WordNetLemmatizer().lemmatize("testing")
except(LookupError, OSError) as e:
    nltk.download('punkt', nltkDataDir)
    nltk.download('wordnet', nltkDataDir)


In [20]:
stoplist = stopwords.words('english')


def enron_selector():  # function to identify platform and selected dataset to be applied.
    def init_lists(folder_collection, label):  # function to retrieve and apply email content to array.
        a_list = []
        doc_id = 0
        file_list = []
        label = "Loading " + label + "..."
        for entry in folder_collection:
            b_list = os.listdir(entry)
            for item in b_list:
                file_list.append(entry + item)
        for a_file in file_list:
            f = open(a_file, 'r')
            if verbose_logs:
                process_status(doc_id, file_list, label)
            try:
                a_list.append(f.read())
            except UnicodeDecodeError:
                pass
            doc_id += 1
        f.close()
        return a_list

    enron_ = ['Enron1/', 'Enron2/', 'Enron3/', 'Enron4/', 'Enron5/', 'Enron6/']
    spam = []
    ham = []

    for i, sub in enumerate(enron_):
        spam.append('../data/enron_dataset/Enron/Processed/' + enron_[i] + 'spam/')
        ham.append('../data/enron_dataset/Enron/Processed/' + enron_[i] + 'ham/')

    spam = init_lists(spam, "spam")
    ham = init_lists(ham, "ham")
    all_emails = [(email, 'spam') for email in spam]
    all_emails += [(email, 'ham') for email in ham]
    ham_emails, spam_emails = preprocess(all_emails)
    ham_file, spam_file = "../data/processed_ham.txt", "../data/processed_spam.txt"
    print("Writing ham file...")
    with open(ham_file, 'w') as fp:
        fp.write('\n'.join('{} {};'.format(x[0], x[1]) for x in ham_emails))
    print("Writing spam file...")
    with open(spam_file, 'w') as fp:
        fp.write('\n'.join('{} {};'.format(x[0], x[1]) for x in spam_emails))
    return ham_emails, spam_emails


def test_collection(test_select):  # function that outlines all tests to be carried out.
    if test_select == 1:
        data_size = 1000 / 2
    elif test_select == 2:
        data_size = 2000 / 2
    elif test_select == 3:
        data_size = 3000 / 2
    elif test_select == 4:
        data_size = 1500 / 2
    return data_size


def preprocess(collection):  # function to apply pre-processing: stop words, lemmatise.
    lemmatizer = WordNetLemmatizer()
    label = 'Pre-processing emails...'
    entry_id = 0
    doc_id = 0
    processed = []
    for entry in collection:
        if verbose_logs:
            process_status(doc_id, collection, label)
        for i, line in enumerate(entry):
            emails = ''
            if i == 0:
                words = []
                for word in word_tokenize(line):
                    item = lemmatizer.lemmatize(word.lower())
                    if not item in stoplist:
                        if word.isalnum() == False:
                            pass
                        else:
                            emails = emails + item + ','
                processed.append(tuple((emails, entry[1])))
                del emails
                entry_id += 1
        doc_id += 1
    email_list = []
    ham = []
    spam = []
    for entry in processed:
        if entry[1] == 'ham':
            email_list.append(entry)
            ham.append(entry)
    for entry in processed:
        if entry[1] == 'spam':
            email_list.append(entry)
            spam.append(entry)
    return ham, spam


def dictionary_build(all_emails):
    print("Building Dictionary...")

    def html_list():
        html_tag_list = []
        location = '../data/html_tag_list.txt'
        f = open(location, 'r')
        for i in f:
            html_tag_list.append(i.strip())
        f.close()
        return html_tag_list

    html_tags = [html_list()]
    common_email_words = ['subject', 'mail', 'cc', '``', 'email', '\n', 'www', 'com', '\nsubject']
    processed_emails = all_emails
    all_words = []
    for entry in all_emails:
        for sentence in entry:
            if not sentence == 'ham' or sentence == 'spam':
                words = str(sentence).split(',')
                for word in words:
                    all_words.append(word)
    dictionary = Counter(all_words)
    list_to_remove = list(dictionary)
    for item in list_to_remove:
        if len(item) <= 1:
            del dictionary[item]
        elif item in html_tags:
            del dictionary[item]
        elif str(item).isdigit():
            del dictionary[item]
        elif item in common_email_words:
            del dictionary[item]

    return dictionary


def extract_features(data,
                     label):  # function to extract features to matrix based on calculating occurrence of words based
    # on dictionary.
    features_matrix = np.zeros((len(data), len(dictionary)))
    label = 'Feature extraction \'' + label + '\':'
    doc_id = 0
    all_words = []
    for entry in data:
        if verbose_logs:
            process_status(doc_id, data, label)
        for i, line in enumerate(entry):
            if i == 0:
                # print('[' + str(doc_id) + '] ', entry)
                words = line.split(',')
                for word in words:
                    all_words.append(words)
                    for j, d in enumerate(dictionary):
                        if d[0] == word:
                            word_id = j
                            features_matrix[doc_id, word_id] = words.count(word)
        doc_id = doc_id + 1
    return features_matrix


def calculate(ham, spam):
    main_proportion = 0.8
    ham_size = int(len(ham) * main_proportion)
    ham_train, ham_test = ham[:ham_size], ham[ham_size:]
    spam_size = int(len(spam) * main_proportion)
    spam_train, spam_test = spam[:spam_size], spam[spam_size:]
    ham_train_size, spam_train_size = int(len(ham_train) * main_proportion), int(len(spam_train) * main_proportion)
    ham_train, ham_train_dev = ham_train[:ham_train_size], ham_train[ham_train_size:]
    spam_train, spam_train_dev = spam_train[:spam_train_size], spam_train[spam_train_size:]
    train_set, train_dev_set, test_set = ham_train + spam_train, ham_train_dev + spam_train_dev, ham_test + spam_test
    train_labels = np.zeros(len(train_set))
    train_labels[(int((len(train_set)) - len(spam_train))):len(train_set)] = 1
    train_dev_labels = np.zeros(len(train_dev_set))
    train_dev_labels[(int((len(train_dev_set)) - len(spam_train_dev))):len(train_dev_set)] = 1
    test_labels = np.zeros(len(test_set))
    test_labels[(int((len(test_set)) - len(spam_test))):len(test_set)] = 1
    print("Train set:\n", "Ham: ", str(len(ham_train)), "\n", "Spam: ", str(len(spam_train)),
          "\nTrain_Dev:\n Ham:", str(len(ham_train_dev)), "\n Spam:", str(len(spam_train_dev)),
          "\nTest set:\n", "Ham: ", str(len(ham_test)), "\n", "Spam: ", str(len(spam_test)))
    return train_set, test_set, train_labels, test_labels, train_dev_set, train_dev_labels


def load_(file, label):
    if verbose_logs == False:
        print('Loading ' + label + ' dataset')
    with open(file, 'r') as fp:
        values = []
        doc_id = 0
        size_file = fp.read().split(";")
        for item in size_file:
            if verbose_logs:
                process_status(doc_id, size_file, "loading " + label + " file...")
            values.append(item.split(", "))
            doc_id += 1
        return values


In [21]:
def process_status(id, data, label):
    if id + 1 < int(len(data)):
        end_atp = "\r"
    elif id + 1 <= int(len(data)):
        end_atp = "\n"
    return print(label, '%0.0i out of %0.0i: %0.0i' %
                 (id + 1, len(data), int((id + 1) * (100 / len(data)))), '%', end='\r', flush=True)

In [22]:
def roc_curve(X, y, model, cv, algorithm_name, i):
    try:
        # tprs = []
        aucs = []
        # mean_fpr = np.linspace(0, 1, 100)
        for train, test in cv.split(X, y):
            probas_ = model.fit(X[train], y[train]).predict_proba(X[test])
            # Compute ROC curve and area the curve
            fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
            # tprs.append(interp1d(mean_fpr, fpr, tpr))
            # tprs[-1][0] = 0.0
            roc_auc = auc(fpr, tpr)
            aucs.append(roc_auc)
            plt.plot(fpr, tpr, lw=1, alpha=0.5, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
            i += 1
        # mean_tpr = np.mean(tprs, axis=0)
        # mean_tpr[-1] = 1.0
        # mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        # plt.plot(mean_fpr, mean_tpr, color='b',
        #          label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        #          lw=2, alpha=.8)
        # std_tpr = np.std(tprs, axis=0)
        # tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        # tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        # plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
        #                  label=r'$\pm$ 1 std. dev.')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Development - ROC: ' + algorithm_name)
        plt.legend(loc="lower right")
        plt.savefig(
            "../results/plots/dev/devROC_%s_%0i_features_%0i_test.png" % (
                algorithm_name, feature_size, len(test_set)),
            dpi=100,
            facecolor='w', edgecolor='b', linewidth=1, orientation='portrait', papertype=None,
            format="png", transparent=False, bbox_inches=None, pad_inches=0.1, frameon=None)
        print("Created %s ROC figure" % (algorithm_name))
        plt.close()
    except (AttributeError, OverflowError) as detail:
        print(algorithm_name + " Failed due to ", detail)

    return 1


verbose_logs = False
train_dev_features = []
train_dev_labels = []


In [48]:
if __name__ == "__main__":
    feature_size = int(input("Type amount of features to use: "))
    model_process = str(input("Extract features or Test models? [Extract features = y, Test models = n")) == 'y'
    verbose_logs = str(input("Enable verbose logs? [y/n]")) == 'y'
    a_files = ["../data/processed_ham.txt", "../data/processed_spam.txt"]
    a_exist = [f for f in a_files if os.path.isfile(f)]
    usePreprocessedDatasets = False
    if a_exist:
        usePreprocessedDatasets = str(
            input("Preprocessed datasets for ham and spam found, would you like to use them? [y/n]")) == 'y'

    if usePreprocessedDatasets:
        ham_collection = load_(a_files[0], "ham")
        spam_collection = load_(a_files[1], "spam")
    else:
        ham_collection, spam_collection = enron_selector()
    dictionary = dictionary_build((ham_collection + spam_collection))
    dictionary = dictionary.most_common(feature_size)
    print(dictionary, "\n")
    train_set, test_set, train_labels, test_labels, train_dev_set, train_dev_labels = calculate(ham_collection,
                                                                                                spam_collection)
    if model_process:
        train_features = extract_features(train_set, "train")
        train_dev_features = extract_features(train_dev_set, "train_dev")
        test_features, test_labels = 0, 0
    else:
        test_features = extract_features(test_set, "test")
        train_features, train_labels = 0, 0
        train_dev_features, train_dev_labels = 0, 0

    train_features_scaled = 0
    train_dev_features_scaled = 0
    test_features_scaled = 0

    if isinstance(train_features, np.ndarray):
        train_features_scaled = MinMaxScaler().fit_transform(train_features, train_labels)

    if isinstance(train_dev_features, np.ndarray):
        train_dev_features_scaled = MinMaxScaler().fit_transform(train_dev_features, train_dev_labels)

    if isinstance(test_features, np.ndarray):
        test_features_scaled = MinMaxScaler().fit_transform(test_features, test_labels)

    print('Finished feature extraction')

Loading ham dataset
Loading spam dataset
Building Dictionary...
[('enron', 60909), ('ect', 35672), ('company', 28725), ('please', 20344), ('ha', 20101), ('spam', 17907), ('wa', 17822), ('hou', 17264), ('would', 15531), ('new', 15268), ('time', 14848), ('price', 14224), ('business', 13582), ('may', 13139), ('information', 13117), ('one', 12342), ('gas', 11954), ('said', 11889), ('market', 11671), ('get', 11498), ('energy', 11463), ('year', 11415), ('http', 11175), ('day', 10853), ('need', 10847), ('message', 10769), ('stock', 10472), ('deal', 10058), ('know', 9782), ('pm', 9676), ('service', 9642), ('also', 9232), ('report', 9002), ('power', 8777), ('vince', 8655), ('security', 8651), ('thanks', 8432), ('week', 8372), ('like', 8289), ('statement', 7962), ('corp', 7954), ('make', 7938), ('number', 7841), ('million', 7762), ('inc', 7398), ('group', 7390), ('could', 7342), ('risk', 7182), ('sent', 7175), ('share', 7168)] 

Train set:
 Ham:  10588 
 Spam:  10978 
Train_Dev:
 Ham: 2648 
 Spa

In [51]:
from sklearn.model_selection import train_test_split
import pandas as pd
import shap

def shap_report(x, y, dictionary):

    formatted_dictionary = []
    for item in dictionary:
        formatted_dictionary.append(item[0])

    X = pd.DataFrame(x, columns=formatted_dictionary)
    y = pd.DataFrame(y)
    
    train_X, val_X, train_y, val_y = train_test_split(X, y.values.ravel(), random_state=1)

    model = MLPClassifier(hidden_layer_sizes=5, solver='lbfgs', max_iter=10000).fit(train_X, train_y)
    # Create object that can calculate shap values
    explainer = shap.KernelExplainer(model.predict_proba, shap.sample(train_X, 100))

    # Calculate Shap values
    shap_values = explainer.shap_values(val_X)

    shap.summary_plot(shap_values[1], None, formatted_dictionary, show=False)
    plt.savefig('shap_plot.svg')
    plt.close()


shap_report(train_dev_features_scaled, train_dev_labels, dictionary)

  0%|          | 0/1349 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def models(train_features, train_dev_features, test_features, train_labels, train_dev_labels, test_labels, model_process
           , feature_size, test_set):
    # function to hold classifiers, fit and prediction and finally report the performance based on return method.
    algorithm_names = ['k-Neighbors Classifier',
                       'MLP Neural Network 1',
                       'MLP Neural Network 2',
                       'Logistic Regression',
                       'Random Forest',
                       'xgBoost',
                       'Multinomial Naive Bayes',
                       'Gaussian NB',
                       'Bernoulli NB',
                       'Rbf SVC',
                       'Linear SVC',
                       'Poly SVC',
                       'Sigmoid SVC']
    scoring = ['precision_macro', 'recall_macro', 'f1']
    scoring_parse_labels = ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro', 'test_f1']

    print('Processing models...')

    model1 = KNeighborsClassifier(algorithm='brute')
    model2 = MLPClassifier(hidden_layer_sizes=75, solver='lbfgs', max_iter=25)
    model3 = MLPClassifier(hidden_layer_sizes=(150, 150), solver='lbfgs', max_iter=25)
    model4 = LogisticRegression(solver='lbfgs', max_iter=25)
    model5 = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
    model6 = XGBClassifier()
    model7 = MultinomialNB()
    model8 = GaussianNB()
    model9 = BernoulliNB()
    model10 = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True,
                      tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=200,
                      decision_function_shape='ovr', random_state=None)
    model11 = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True,
                      tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=100,
                      decision_function_shape='ovr', random_state=None)
    model12 = svm.SVC(C=1.0, kernel='poly', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True,
                      tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=25,
                      decision_function_shape='ovr', random_state=None)
    model13 = svm.SVC(C=1.0, kernel='sigmoid', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True,
                      tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=100,
                      decision_function_shape='ovr', random_state=None)
    models = [model1, model2, model3, model4, model5, model6, model7, model8, model9, model10, model11, model12,
              model13]

    if model_process:
        process_id = int(input("Test or development? (0/1)"))
        if process_id == 0:
            print("x-val train_set")
            score0 = cross_validate(model1, train_features, train_labels, scoring=scoring, cv=5,
                                    return_train_score=False)
            score1 = cross_validate(model2, train_features, train_labels, scoring=scoring, cv=5,
                                    return_train_score=False)
            score2 = cross_validate(model3, train_features, train_labels, scoring=scoring, cv=5,
                                    return_train_score=False)
            score3 = cross_validate(model4, train_features, train_labels, scoring=scoring, cv=5,
                                    return_train_score=False)
            score4 = cross_validate(model5, train_features, train_labels, scoring=scoring, cv=5,
                                    return_train_score=False)
            score5 = cross_validate(model6, train_features, train_labels, scoring=scoring, cv=5,
                                    return_train_score=False)
            score6 = cross_validate(model7, train_features, train_labels, scoring=scoring, cv=5,
                                    return_train_score=False)
            score7 = cross_validate(model8, train_features, train_labels, scoring=scoring, cv=5,
                                    return_train_score=False)
            score8 = cross_validate(model9, train_features, train_labels, scoring=scoring, cv=5,
                                    return_train_score=False)
            score9 = cross_validate(model10, train_features, train_labels, scoring=scoring, cv=5,
                                    return_train_score=False)
            score10 = cross_validate(model11, train_features, train_labels, scoring=scoring, cv=5,
                                     return_train_score=False)
            score11 = cross_validate(model12, train_features, train_labels, scoring=scoring, cv=5,
                                     return_train_score=False)
            scores = [score0, score1, score2, score3, score4, score5, score6, score7, score8, score9, score10, score11]
            print('Train set model output...\n')
            for i, d in enumerate(scores):
                print(algorithm_names[i])
                for c, e in enumerate(scoring_parse_labels):
                    item = d.pop(e)
                    item = item.astype(np.float)
                    if scoring_parse_labels[c] == 'fit_time' or scoring_parse_labels[c] == 'score_time':
                        print(scoring_parse_labels[c], ':', '%0.6f' % (np.mean(item)))
                    else:
                        print(scoring_parse_labels[c], ':', '%0.0f' % (float(np.mean(item) * 100)) + '%')
                        if scoring_parse_labels[c] == 'test_f1':
                            print('\n')
        elif process_id == 1:
            print("ROC curve development")
            cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
            X, y = train_dev_features, train_dev_labels
            k = 0
            save_output = str(input('model record output (y/n): ')) == 'y'
            scores = []
            for model in models:
                i = 0
                if save_output:
                    k != roc_curve(X, y, model, cv, algorithm_names[k], i)
                else:
                    scores.append(cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False))

            for i, d in enumerate(scores):
                print(algorithm_names[i])
                for c, e in enumerate(scoring_parse_labels):
                    item = d.pop(e)
                    item = item.astype(np.float)
                    if scoring_parse_labels[c] == 'fit_time' or scoring_parse_labels[c] == 'score_time':
                        print(scoring_parse_labels[c], ':', '%0.6f' % (np.mean(item)))
                    else:
                        print(scoring_parse_labels[c], ':', '%0.0f' % (float(np.mean(item) * 100)) + '%')
                        if scoring_parse_labels[c] == 'test_f1':
                            print('\n')

    else:
        print("ROC Curve output")
        cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
        X, y = test_features, test_labels
        k = 0
        for model in models:
            i = 0
            k != roc_curve(X, y, model, cv, algorithm_names[k], i)


models(train_features, train_dev_features, test_features, train_labels, train_dev_labels, test_labels, model_process,
       feature_size, test_set)

In [None]:
def feature_test(train, test):  # function to test and record via csv, all algorithms selected.
    scores = []
    scoring = ['precision_macro', 'recall_macro', 'f1']
    scoring_parse_labels = ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro', 'test_f1']
    cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
    model1 = KNeighborsClassifier(algorithm='brute')
    model2 = MLPClassifier(hidden_layer_sizes=75, solver='lbfgs', max_iter=25)
    model3 = MLPClassifier(hidden_layer_sizes=(150, 150), solver='lbfgs', max_iter=25)
    model4 = LogisticRegression(solver='lbfgs', max_iter=25)
    model5 = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)
    model6 = XGBClassifier()
    model7 = MultinomialNB()
    model8 = GaussianNB()
    model9 = BernoulliNB()
    model10 = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True,
                      tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=200,
                      decision_function_shape='ovr', random_state=None)
    model11 = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True,
                      tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=100,
                      decision_function_shape='ovr', random_state=None)
    model12 = svm.SVC(C=1.0, kernel='poly', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True,
                      tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=25,
                      decision_function_shape='ovr', random_state=None)
    model13 = svm.SVC(C=1.0, kernel='sigmoid', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True,
                      tol=0.001, cache_size=1000, class_weight=None, verbose=False, max_iter=100,
                      decision_function_shape='ovr', random_state=None)
    models = [model1, model2, model3, model4, model5, model6, model7, model8, model9, model10, model11, model12,
              model13]
    for a in range(len(models)):
        scores.append(cross_validate(models[a], train, test, cv=cv, scoring=scoring, return_train_score=False))
    with open('../results/feature_test_1.csv', 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(scoring_parse_labels)
        for i, d in enumerate(scores):
            processed_scores = []
            for c, e in enumerate(scoring_parse_labels):
                item = d.pop(e)
                item = item.astype(np.float)
                if scoring_parse_labels[c] == 'fit_time' or scoring_parse_labels[c] == 'score_time':
                    processed_scores.append('%0.6f' % (np.mean(item)))
                else:
                    processed_scores.append('%0.2f' % (float(np.mean(item))))

            filewriter.writerow(processed_scores)


feature_test(train_dev_features, train_dev_labels)


In [None]:
def svm_test(X, y):  # function test for SVM models specified.
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(X, y)
    scores = []
    scoring = ['precision_macro', 'recall_macro', 'f1']
    scoring_parse_labels = ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro', 'test_f1']
    cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
    max_iter = [10, 25, 50, 75, 100, 150, 200, 2000, 5000, 8000, 10000, 15000, 20000, 50000, 100000]
    for i in range(len(max_iter)):
        print("iterations: " + str(max_iter[i]))
        model = svm.LinearSVC(max_iter=max_iter[i])
        scores.append(cross_validate(model, x_scaled, y, cv=cv, scoring=scoring, return_train_score=False))

    with open('../results/svm_svc_test.csv', 'w', newline='') as csvfile:  # function to save output to csv file.
        filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(scoring_parse_labels)
        for i, d in enumerate(scores):
            processed_scores = []
            for c, e in enumerate(scoring_parse_labels):
                item = d.pop(e)
                item = item.astype(np.float)
                if scoring_parse_labels[c] == 'fit_time' or scoring_parse_labels[c] == 'score_time':
                    processed_scores.append('%0.6f' % (np.mean(item)))
                else:
                    processed_scores.append('%0.2f' % (float(np.mean(item))))

            filewriter.writerow(processed_scores)


svm_test(train_dev_features, train_dev_labels)


In [None]:
def mpnn_test(train, test):  # Multi-layer perceptron neural network test.
    nu_val = [10, 25, 50, 75, 100, 150, 200]
    h_layers = [1, 2, 3, 4, 5]
    iter_val = [10, 25, 50, 100, 200]
    scores = []
    scoring = ['precision_macro', 'recall_macro', 'f1']
    scoring_parse_labels = ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro', 'test_f1']
    cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
    i = 0
    for a in range(len(nu_val)):
        for b in range(len(h_layers)):
            if b > 0:
                if b == 1:
                    nu_layer_val = nu_val[a], nu_val[a]
                elif b == 2:
                    nu_layer_val = nu_val[a], nu_val[a], nu_val[a]
                elif b == 3:
                    nu_layer_val = nu_val[a], nu_val[a], nu_val[a], nu_val[a]
                elif b == 4:
                    nu_layer_val = nu_val[a], nu_val[a], nu_val[a], nu_val[a], nu_val[a]
            else:
                nu_layer_val = nu_val[a]
            for c in range(len(iter_val)):
                print('%0.0i out of %0.0i/ %0.0i' %
                      (i, (int(len(nu_val) * len(h_layers) * len(iter_val))),
                       int(i * (100 / (int(len(nu_val) * len(h_layers) * len(iter_val)))))) + '%',
                      end='\r', flush=True)
                model = MLPClassifier(hidden_layer_sizes=(nu_layer_val), solver='lbfgs', max_iter=iter_val[c])
                scores.append(cross_validate(model, train, test, cv=cv, scoring=scoring, return_train_score=False))
                i += 1
    with open('../results/mpnn_test.csv', 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(scoring_parse_labels)
        for i, d in enumerate(scores):
            processed_scores = []
            for c, e in enumerate(scoring_parse_labels):
                item = d.pop(e)
                item = item.astype(np.float)
                if scoring_parse_labels[c] == 'fit_time' or scoring_parse_labels[c] == 'score_time':
                    processed_scores.append('%0.6f' % (np.mean(item)))
                else:
                    processed_scores.append('%0.2f' % (float(np.mean(item))))
            filewriter.writerow(processed_scores)


mpnn_test(train_dev_features, train_dev_labels)


In [None]:
def test_2(train, test):
    scores = []
    scoring = ['precision_macro', 'recall_macro', 'f1']
    scoring_parse_labels = ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro', 'test_f1']
    cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(train)
    i = 0
    kernal_val = ['linear', 'poly', 'rbf', 'sigmoid']
    iter_val = [10, 25, 50, 100, 200]
    for a in range(len(kernal_val)):
        for b in range(len(iter_val)):
            print('%0.0i out of %0.0i/ %0.0i' %
                  (i, (int(len(kernal_val) * len(iter_val))), int(i * (100 / (int(len(kernal_val) * len(iter_val))))))
                  + '%', end='\r', flush=True)
            model = svm.SVC(C=1.0, kernel=kernal_val[a], degree=3, gamma='auto', coef0=0.0, shrinking=True,
                            probability=True, tol=0.001, cache_size=10000, class_weight=None, verbose=False,
                            max_iter=iter_val[b], decision_function_shape='ovr', random_state=None)
            scores.append(cross_validate(model, scaled_data, test, cv=cv, scoring=scoring, return_train_score=False))
            i += 1
    with open('../results/test_2.csv', 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(scoring_parse_labels)
        for i, d in enumerate(scores):
            processed_scores = []
            for c, e in enumerate(scoring_parse_labels):
                item = d.pop(e)
                item = item.astype(np.float)
                if scoring_parse_labels[c] == 'fit_time' or scoring_parse_labels[c] == 'score_time':
                    processed_scores.append('%0.6f' % (np.mean(item)))
                else:
                    processed_scores.append('%0.2f' % (float(np.mean(item))))
            filewriter.writerow(processed_scores)


test_2(train_dev_features, train_dev_labels)


In [None]:
def test_3(train, test):  # todo Knn classifier test.
    scores = []
    scoring = ['precision_macro', 'recall_macro', 'f1']
    scoring_parse_labels = ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro', 'test_f1']
    cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
    i = 0
    algorithm_val = ['ball_tree', 'kd_tree', 'brute']
    for a in range(len(algorithm_val)):
        model = KNeighborsClassifier(algorithm=algorithm_val[a])
        scores.append(cross_validate(model, train, test, cv=cv, scoring=scoring, return_train_score=False))
        i += 1
    with open('../results/test_3.csv', 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(scoring_parse_labels)
        for i, d in enumerate(scores):
            processed_scores = []
            for c, e in enumerate(scoring_parse_labels):
                item = d.pop(e)
                item = item.astype(np.float)
                if scoring_parse_labels[c] == 'fit_time' or scoring_parse_labels[c] == 'score_time':
                    processed_scores.append('%0.6f' % (np.mean(item)))
                else:
                    processed_scores.append('%0.2f' % (float(np.mean(item))))
            filewriter.writerow(processed_scores)


test_3(train_dev_features, train_dev_labels)


In [None]:
def test_4(train, test):
    scores = []
    scoring = ['precision_macro', 'recall_macro', 'f1']
    scoring_parse_labels = ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro', 'test_f1']
    cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
    i = 0
    model = LogisticRegression()
    scores.append(cross_validate(model, train, test, cv=cv, scoring=scoring, return_train_score=False))
    i += 1
    with open('../results/test_4.csv', 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(scoring_parse_labels)
        for i, d in enumerate(scores):
            processed_scores = []
            for c, e in enumerate(scoring_parse_labels):
                item = d.pop(e)
                item = item.astype(np.float)
                if scoring_parse_labels[c] == 'fit_time' or scoring_parse_labels[c] == 'score_time':
                    processed_scores.append('%0.6f' % (np.mean(item)))
                else:
                    processed_scores.append('%0.2f' % (float(np.mean(item))))
            filewriter.writerow(processed_scores)


test_4(train_dev_features, train_dev_labels)

In [None]:
# todo: implement t-test with Bonferroni correction, if not can this be done in excel with current results?


In [None]:
# todo: take best models and remove some of the pre-processing stages e.g. lemmatize, stop work and html tag removal,
# provide scores and potentially roc curves too.

In [None]:
# todo: implement confusion matrix.