In [6]:
import csv
import numpy as np
import pandas as pd

In [15]:
# Read the messages CSV stored under ./input/enron; missing values are
# replaced with a single space.
enron = pd.read_csv('./input/enron/messages.csv')
enron = enron.fillna(' ')
# The 'message' column is used as model input, the 'label' column as target.
X_enron = np.array(enron['message'])
y_enron = np.array(enron['label'])

In [16]:
# Read the messages CSV stored under ./input/ling_spam_copy; missing values
# are replaced with a single space.
ling_spam = pd.read_csv('./input/ling_spam_copy/messages.csv')
ling_spam = ling_spam.fillna(' ')
# The 'message' column is used as model input, the 'label' column as target.
X_ling_spam = np.array(ling_spam['message'])
y_ling_spam = np.array(ling_spam['label'])

In [17]:
# Read the messages CSV stored under ./input/spam_assasin_copy; missing values
# are replaced with a single space.
spam_assasin = pd.read_csv('./input/spam_assasin_copy/messages.csv')
spam_assasin = spam_assasin.fillna(' ')
# The 'message' column is used as model input, the 'label' column as target.
X_spam_assasin = np.array(spam_assasin['message'])
y_spam_assasin = np.array(spam_assasin['label'])

In [18]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from matplotlib import pyplot

In [19]:
from mealpy.swarm_based import AO, HGS, SSA, MRFO, HHO

def resolve_dataset(name):
    """Return the ``[X, y]`` pair of module-level arrays for a dataset name.

    Recognised names are ``'enron'``, ``'ling_spam'`` and ``'spam_assasin'``;
    each maps to the matching ``X_<name>`` / ``y_<name>`` globals. Any other
    name yields ``None``.
    """
    if name == 'enron':
        return [X_enron, y_enron]
    if name == 'ling_spam':
        return [X_ling_spam, y_ling_spam]
    if name == 'spam_assasin':
        return [X_spam_assasin, y_spam_assasin]
    # Unknown dataset name: nothing to return.
    return None
    
def resolve_clf(alg):
    """Map an algorithm abbreviation to the corresponding mealpy class.

    Supported abbreviations are ``'AO'``, ``'HGS'``, ``'SSA'``, ``'MRFO'`` and
    ``'HHO'``. Any other value yields ``None``.
    """
    if alg == 'AO':
        optimizer_cls = AO.OriginalAO
    elif alg == 'HGS':
        optimizer_cls = HGS.OriginalHGS
    elif alg == 'SSA':
        optimizer_cls = SSA.OriginalSSA
    elif alg == 'MRFO':
        optimizer_cls = MRFO.BaseMRFO
    elif alg == 'HHO':
        optimizer_cls = HHO.BaseHHO
    else:
        # Unrecognised abbreviation.
        optimizer_cls = None
    return optimizer_cls


def test_bio_alg(clf, obj_function):
    problem = {
        'obj_func': obj_function,
        'lb': [0.0001, 0.0001, 0.0001],
        'ub': [1000, 1000, 1000],
        'minmax': 'max',
        'verbose': True,
    }
    model = clf(problem, epoch=10, pop_size=40)
    model.solve()
    return model.g_best

In [20]:
def get_best(alg, dataset):
    """Pick SGDClassifier hyper-parameters for ``dataset`` using strategy ``alg``.

    Parameters
    ----------
    alg : str
        ``'DEFAULT'`` returns fixed values, ``'RSCV'`` runs a randomized
        cross-validated search, and any other value is handed to
        ``resolve_clf`` to select a mealpy optimizer.
    dataset : str
        Dataset name understood by ``resolve_dataset``.

    Returns
    -------
    A sequence ordered as ``[alpha, epsilon, tol]``. For ``'DEFAULT'`` and
    ``'RSCV'`` it is a list; for the optimizers it is element 0 of the value
    returned by ``test_bio_alg`` (presumably the best position vector --
    TODO confirm against the installed mealpy version).
    """
    [X, y] = resolve_dataset(dataset)

    if alg == "DEFAULT":
        # Fixed values; no data split or search is needed for them.
        return [0.0001, 0.1, 1e-3]

    # NOTE(review): the split is unseeded, so tuning results differ per run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # Loaded once and shared by every vectorizer built below.
    english_stop_words = stopwords.words('english')

    if alg == 'RSCV':
        grid = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
        # Parameters of a Pipeline step must be addressed as
        # '<step name>__<parameter>' for the search to set them.
        tuned_parameters = {
            'classificator__epsilon': list(grid),
            'classificator__alpha': list(grid),
            'classificator__tol': list(grid),
        }

        # The searched parameters are injected by RandomizedSearchCV, so the
        # classifier is created without them (they are not defined here).
        clf = Pipeline([
            ('tfidf_vectorizer', TfidfVectorizer(stop_words=english_stop_words)),
            ('classificator', SGDClassifier(random_state=0))])

        model = RandomizedSearchCV(clf, tuned_parameters, scoring='accuracy', random_state=0)
        model.fit(X_train, y_train)
        params = model.best_params_
        return [
            params['classificator__alpha'],
            params['classificator__epsilon'],
            params['classificator__tol'],
        ]

    clf = resolve_clf(alg)

    # The TF-IDF features do not depend on the candidate hyper-parameters, so
    # they are computed once instead of once per objective evaluation.
    vectorizer = TfidfVectorizer(stop_words=english_stop_words)
    train_features = vectorizer.fit_transform(X_train)
    test_features = vectorizer.transform(X_test)

    def obj_function(solution):
        """Accuracy on the held-out split for one (alpha, epsilon, tol) candidate."""
        # NOTE(review): candidates are scored on X_test, so the held-out split
        # effectively acts as a validation set for the search.
        alpha, epsilon, tol = solution
        classifier = SGDClassifier(random_state=0, alpha=alpha, epsilon=epsilon, tol=tol)
        classifier.fit(train_features, y_train)
        return accuracy_score(y_test, classifier.predict(test_features))

    best_params_ = test_bio_alg(clf, obj_function)

    return best_params_[0]

In [21]:
import statistics

def test(train, test, alg):
    """Tune, fit and evaluate a TF-IDF + SGDClassifier pipeline.

    Hyper-parameters are obtained from ``get_best(alg, train)``. The pipeline
    is then fitted on 75% of the ``train`` dataset and scored twice: on the
    remaining 25% of ``train`` and on the complete ``test`` dataset.

    Parameters
    ----------
    train : str
        Name of the dataset used for tuning and fitting (see ``resolve_dataset``).
    test : str
        Name of the dataset used for the cross-dataset evaluation.
    alg : str
        Tuning strategy forwarded to ``get_best``.

    Returns
    -------
    None. Accuracy, precision, recall and F1 for both evaluations are printed.
    """
    [X, y] = resolve_dataset(train)
    [X2, y2] = resolve_dataset(test)

    params = get_best(alg, train)

    # NOTE(review): this split is unseeded and independent of the one made
    # inside get_best, so the two held-out parts generally differ.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25)

    clf = Pipeline([
        ('tfidf_vectorizer', TfidfVectorizer(
            stop_words=stopwords.words('english'))),
        ('classificator', SGDClassifier(random_state=0, alpha=params[0], epsilon=params[1], tol=params[2]))])

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Cross-dataset evaluation on every message of the second dataset
    # (the original referenced the undefined name ``X2_`` here).
    y2_pred = clf.predict(X2)

    test_acc = accuracy_score(y2, y2_pred)
    test_prec = precision_score(y2, y2_pred)
    test_recall = recall_score(y2, y2_pred)
    test_f1 = f1_score(y2, y2_pred)

    print(f'Acc: {acc} Prec: {prec} Recall: {recall} F1: {f1}')
    print(
        f'AccTest: {test_acc} PrecTest: {test_prec} RecallTest: {test_recall} F1Test: {test_f1}')

In [22]:
# Tune with MRFO and fit on 'spam_assasin', then also score on 'ling_spam'.
test(
    train='spam_assasin',
    test='ling_spam',
    alg='MRFO',
)

KeyboardInterrupt: 