In [1]:
from contextlib import redirect_stdout
from sklearn.model_selection import StratifiedKFold
from mealpy.swarm_based import AO, HGS, SSA, MRFO, HHO
from matplotlib import pyplot
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV
import numpy as np
import pandas as pd
from numpy import mean
from numpy import std

In [2]:
# Each corpus is a CSV with 'message' and 'label' columns; missing cells are
# replaced by a single space before the columns are pulled out as arrays.
enron = pd.read_csv('./input/enron/messages.csv').fillna(' ')
X_enron, y_enron = (np.array(enron[col]) for col in ('message', 'label'))

ling_spam = pd.read_csv('./input/ling_spam_copy/messages.csv').fillna(' ')
X_ling_spam, y_ling_spam = (
    np.array(ling_spam[col]) for col in ('message', 'label'))

spam_assasin = pd.read_csv('./input/spam_assasin_copy/messages.csv').fillna(' ')
X_spam_assasin, y_spam_assasin = (
    np.array(spam_assasin[col]) for col in ('message', 'label'))

In [14]:
# Keys of the swarm optimizers accepted by resolve_alg().
BIO_ALGS = ['MRFO', 'HGS', 'AO', 'HHO']
# Every tuning strategy compared in this notebook: the two baselines first,
# then the bio-inspired optimizers.
ALGS = ['RSCV', 'DEFAULT', *BIO_ALGS]

def resolve_dataset(name):
    """Return ``[X, y]`` copies of the named corpus, or ``None`` if unknown.

    Recognised names are ``'enron'``, ``'ling_spam'`` and ``'spam_assasin'``.
    Copies are returned so callers cannot modify the module-level arrays.
    """
    # Lambdas defer the global lookups so only the requested corpus is read.
    sources = (
        ('enron', lambda: (X_enron, y_enron)),
        ('ling_spam', lambda: (X_ling_spam, y_ling_spam)),
        ('spam_assasin', lambda: (X_spam_assasin, y_spam_assasin)),
    )
    for key, fetch in sources:
        if name == key:
            return [arr.copy() for arr in fetch()]
    return None


def resolve_alg(alg):
    """Map an algorithm key to its mealpy optimizer class.

    Known keys are ``'AO'``, ``'HGS'``, ``'SSA'``, ``'MRFO'`` and ``'HHO'``.
    Any other key yields ``None``.
    """
    # Lambdas keep the attribute lookups lazy: only the requested class is
    # resolved, and unknown keys never touch the mealpy modules.
    table = (
        ('AO', lambda: AO.OriginalAO),
        ('HGS', lambda: HGS.OriginalHGS),
        ('SSA', lambda: SSA.OriginalSSA),
        ('MRFO', lambda: MRFO.BaseMRFO),
        ('HHO', lambda: HHO.BaseHHO),
    )
    for key, lookup in table:
        if alg == key:
            return lookup()
    return None


def bio(alg, X, y, n_splits=10, epoch=10, pop_size=40, verbose=True):
    """Search SGDClassifier hyper-parameters with a swarm optimizer, fold by fold.

    For every stratified fold the texts are TF-IDF vectorised (vocabulary
    fitted on the training part only). The optimizer class returned by
    ``resolve_alg(alg)`` then maximises the accuracy of an ``SGDClassifier``
    on the held-out part of that fold over ``(alpha, epsilon, tol)``, each
    bounded to ``[0.0001, 1000]``. The per-fold winners are averaged.

    Args:
        alg: Algorithm key understood by ``resolve_alg`` (e.g. ``'AO'``).
        X: Array of raw texts; must support integer-array indexing.
        y: Array of labels aligned with ``X``.
        n_splits: Number of stratified folds (default 10).
        epoch: Optimizer epochs per fold (default 10).
        pop_size: Optimizer population size (default 40).
        verbose: Value of the problem's ``'verbose'`` entry (default True).

    Returns:
        ``[mean_alpha, mean_epsilon, mean_tol]`` averaged over the folds.

    Raises:
        ValueError: If ``alg`` does not resolve to an optimizer class.
    """
    optimizer_cls = resolve_alg(alg)
    if optimizer_cls is None:
        # Fail before any vectorisation instead of hitting
        # "'NoneType' object is not callable" inside the fold loop.
        raise ValueError(f'Unknown bio-inspired algorithm: {alg!r}')

    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    skf = StratifiedKFold(n_splits=n_splits, random_state=None, shuffle=False)

    best_alphas, best_epsilons, best_tols = [], [], []

    for train_index, test_index in skf.split(X, y):
        y_train, y_test = y[train_index], y[test_index]

        # Refit the vocabulary on each training split so the held-out texts
        # never influence the features.
        X_train = vectorizer.fit_transform(X[train_index])
        X_test = vectorizer.transform(X[test_index])

        def obj_function(solution):
            # NOTE(review): candidates are scored on the same held-out split
            # whose score is later treated as the result, so the objective is
            # optimistic (the logs show 1.0 from the first epoch) - confirm
            # whether an inner validation split was intended.
            # NOTE(review): per the scikit-learn docs, epsilon only affects
            # the huber / epsilon-insensitive losses; the default loss is
            # used here - confirm that tuning epsilon is intended.
            cand_alpha, cand_epsilon, cand_tol = solution
            clf = SGDClassifier(random_state=0, alpha=cand_alpha,
                                epsilon=cand_epsilon, tol=cand_tol, n_jobs=-1)
            clf.fit(X_train, y_train)
            return accuracy_score(y_test, clf.predict(X_test))

        problem = {
            'obj_func': obj_function,
            'lb': [0.0001, 0.0001, 0.0001],
            'ub': [1000, 1000, 1000],
            'minmax': 'max',
            'verbose': verbose,
        }

        model = optimizer_cls(problem, epoch=epoch, pop_size=pop_size)
        model.solve()
        # g_best[0] is the position vector of the best solution found.
        fold_alpha, fold_epsilon, fold_tol = model.g_best[0]
        best_alphas.append(fold_alpha)
        best_epsilons.append(fold_epsilon)
        best_tols.append(fold_tol)

    return [mean(best_alphas), mean(best_epsilons), mean(best_tols)]


def get_best(alg, X, y):
    """Return ``[alpha, epsilon, tol]`` for the requested tuning strategy.

    ``'DEFAULT'`` yields a fixed triple and ignores ``X``/``y``; ``'RSCV'``
    runs a 10-fold randomized search over a TF-IDF + SGDClassifier pipeline;
    any other key is delegated to ``bio``.
    """
    if alg == 'DEFAULT':
        return [0.0001, 0.1, 1e-3]

    if alg != 'RSCV':
        return bio(alg, X, y)

    # The same candidate values are offered for each of the three parameters.
    grid = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
    distributions = {
        'clf__epsilon': list(grid),
        'clf__alpha': list(grid),
        'clf__tol': list(grid),
    }

    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    classifier = SGDClassifier(random_state=0, n_jobs=-1)
    pipeline = Pipeline([
        ('tfidf_vectorizer', vectorizer),
        ('clf', classifier),
    ])

    search = RandomizedSearchCV(
        pipeline, distributions, scoring='accuracy', cv=10, random_state=0)
    search.fit(X, y)

    winner = search.best_params_
    return [winner[key] for key in ('clf__alpha', 'clf__epsilon', 'clf__tol')]

In [20]:
# Randomized-search baseline for ling_spam; result is [alpha, epsilon, tol].
best_ling_spam_RSCV = get_best(alg='RSCV', X=X_ling_spam, y=y_ling_spam)
print(best_ling_spam_RSCV)

[0.001, 1, 1]


In [21]:
# Fixed 'DEFAULT' triple; get_best does not look at the data for this key.
best_ling_spam_DEFAULT = get_best(alg='DEFAULT', X=X_ling_spam, y=y_ling_spam)
print(best_ling_spam_DEFAULT)

[0.0001, 0.1, 0.001]


In [22]:
# MRFO swarm search on ling_spam; result is [alpha, epsilon, tol].
best_ling_spam_MRFO = get_best(alg='MRFO', X=X_ling_spam, y=y_ling_spam)
print(best_ling_spam_MRFO)

> Epoch: 1, Current best: 1.0, Global best: 1.0, Runtime: 0.28299 seconds
> Epoch: 2, Current best: 1.0, Global best: 1.0, Runtime: 0.31906 seconds
> Epoch: 3, Current best: 1.0, Global best: 1.0, Runtime: 0.33140 seconds
> Epoch: 4, Current best: 1.0, Global best: 1.0, Runtime: 0.37168 seconds
> Epoch: 5, Current best: 1.0, Global best: 1.0, Runtime: 0.33196 seconds
> Epoch: 6, Current best: 1.0, Global best: 1.0, Runtime: 0.32717 seconds
> Epoch: 7, Current best: 1.0, Global best: 1.0, Runtime: 0.32375 seconds
> Epoch: 8, Current best: 1.0, Global best: 1.0, Runtime: 0.32865 seconds
> Epoch: 9, Current best: 1.0, Global best: 1.0, Runtime: 0.32464 seconds
> Epoch: 10, Current best: 1.0, Global best: 1.0, Runtime: 0.33800 seconds
> Epoch: 1, Current best: 1.0, Global best: 1.0, Runtime: 0.28181 seconds
> Epoch: 2, Current best: 1.0, Global best: 1.0, Runtime: 0.31773 seconds
> Epoch: 3, Current best: 1.0, Global best: 1.0, Runtime: 0.31921 seconds
> Epoch: 4, Current best: 1.0, Global

In [23]:
# HGS swarm search on ling_spam; result is [alpha, epsilon, tol].
best_ling_spam_HGS = get_best(alg='HGS', X=X_ling_spam, y=y_ling_spam)
print(best_ling_spam_HGS)

> Epoch: 1, Current best: 1.0, Global best: 1.0, Runtime: 0.11977 seconds
> Epoch: 2, Current best: 1.0, Global best: 1.0, Runtime: 0.13802 seconds
> Epoch: 3, Current best: 1.0, Global best: 1.0, Runtime: 0.16347 seconds
> Epoch: 4, Current best: 1.0, Global best: 1.0, Runtime: 0.17009 seconds
> Epoch: 5, Current best: 1.0, Global best: 1.0, Runtime: 0.17634 seconds
> Epoch: 6, Current best: 1.0, Global best: 1.0, Runtime: 0.18750 seconds
> Epoch: 7, Current best: 1.0, Global best: 1.0, Runtime: 0.19571 seconds
> Epoch: 8, Current best: 1.0, Global best: 1.0, Runtime: 0.17831 seconds
> Epoch: 9, Current best: 1.0, Global best: 1.0, Runtime: 0.18121 seconds
> Epoch: 10, Current best: 1.0, Global best: 1.0, Runtime: 0.18446 seconds
> Epoch: 1, Current best: 1.0, Global best: 1.0, Runtime: 0.13961 seconds
> Epoch: 2, Current best: 1.0, Global best: 1.0, Runtime: 0.14546 seconds
> Epoch: 3, Current best: 1.0, Global best: 1.0, Runtime: 0.17126 seconds
> Epoch: 4, Current best: 1.0, Global

In [24]:
# AO swarm search on ling_spam; result is [alpha, epsilon, tol].
best_ling_spam_AO = get_best(alg='AO', X=X_ling_spam, y=y_ling_spam)
print(best_ling_spam_AO)

> Epoch: 1, Current best: 1.0, Global best: 1.0, Runtime: 0.13079 seconds
> Epoch: 2, Current best: 1.0, Global best: 1.0, Runtime: 0.13889 seconds
> Epoch: 3, Current best: 1.0, Global best: 1.0, Runtime: 0.13988 seconds
> Epoch: 4, Current best: 1.0, Global best: 1.0, Runtime: 0.14474 seconds
> Epoch: 5, Current best: 1.0, Global best: 1.0, Runtime: 0.14731 seconds
> Epoch: 6, Current best: 1.0, Global best: 1.0, Runtime: 0.16614 seconds
> Epoch: 7, Current best: 1.0, Global best: 1.0, Runtime: 0.15120 seconds
> Epoch: 8, Current best: 1.0, Global best: 1.0, Runtime: 0.14729 seconds
> Epoch: 9, Current best: 1.0, Global best: 1.0, Runtime: 0.14439 seconds
> Epoch: 10, Current best: 1.0, Global best: 1.0, Runtime: 0.13303 seconds
> Epoch: 1, Current best: 1.0, Global best: 1.0, Runtime: 0.13148 seconds
> Epoch: 2, Current best: 1.0, Global best: 1.0, Runtime: 0.14295 seconds
> Epoch: 3, Current best: 1.0, Global best: 1.0, Runtime: 0.14090 seconds
> Epoch: 4, Current best: 1.0, Global

> Epoch: 9, Current best: 0.9895833333333334, Global best: 0.9895833333333334, Runtime: 0.18350 seconds
> Epoch: 10, Current best: 0.9895833333333334, Global best: 0.9895833333333334, Runtime: 0.17156 seconds
[0.22747603536905014, 67.24830980708012, 65.14629801277684]


In [25]:
# HHO swarm search on ling_spam; result is [alpha, epsilon, tol].
best_ling_spam_HHO = get_best(alg='HHO', X=X_ling_spam, y=y_ling_spam)
print(best_ling_spam_HHO)

> Epoch: 1, Current best: 1.0, Global best: 1.0, Runtime: 0.24081 seconds
> Epoch: 2, Current best: 1.0, Global best: 1.0, Runtime: 0.23971 seconds
> Epoch: 3, Current best: 1.0, Global best: 1.0, Runtime: 0.28976 seconds
> Epoch: 4, Current best: 1.0, Global best: 1.0, Runtime: 0.37653 seconds
> Epoch: 5, Current best: 1.0, Global best: 1.0, Runtime: 0.30408 seconds
> Epoch: 6, Current best: 1.0, Global best: 1.0, Runtime: 0.31922 seconds
> Epoch: 7, Current best: 1.0, Global best: 1.0, Runtime: 0.33480 seconds
> Epoch: 8, Current best: 1.0, Global best: 1.0, Runtime: 0.35954 seconds
> Epoch: 9, Current best: 1.0, Global best: 1.0, Runtime: 0.34978 seconds
> Epoch: 10, Current best: 1.0, Global best: 1.0, Runtime: 0.38185 seconds
> Epoch: 1, Current best: 1.0, Global best: 1.0, Runtime: 0.17051 seconds
> Epoch: 2, Current best: 1.0, Global best: 1.0, Runtime: 0.24336 seconds
> Epoch: 3, Current best: 1.0, Global best: 1.0, Runtime: 0.30237 seconds
> Epoch: 4, Current best: 1.0, Global

In [27]:
# Randomized-search baseline for spam_assasin; result is [alpha, epsilon, tol].
best_spam_assasin_RSCV = get_best(
    alg='RSCV', X=X_spam_assasin, y=y_spam_assasin)
print(best_spam_assasin_RSCV)

[0.0001, 1, 10]


In [31]:
# Fixed 'DEFAULT' triple; get_best does not look at the data for this key.
best_spam_assasin_DEFAULT = get_best(
    alg='DEFAULT', X=X_spam_assasin, y=y_spam_assasin)
print(best_spam_assasin_DEFAULT)

[0.0001, 0.1, 0.001]


In [32]:
# MRFO swarm search on spam_assasin; result is [alpha, epsilon, tol].
best_spam_assasin_MRFO = get_best(
    alg='MRFO', X=X_spam_assasin, y=y_spam_assasin)
print(best_spam_assasin_MRFO)

> Epoch: 1, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.79899 seconds
> Epoch: 2, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.63930 seconds
> Epoch: 3, Current best: 1.0, Global best: 1.0, Runtime: 0.57336 seconds
> Epoch: 4, Current best: 1.0, Global best: 1.0, Runtime: 0.62200 seconds
> Epoch: 5, Current best: 1.0, Global best: 1.0, Runtime: 0.59596 seconds
> Epoch: 6, Current best: 1.0, Global best: 1.0, Runtime: 0.62867 seconds
> Epoch: 7, Current best: 1.0, Global best: 1.0, Runtime: 0.60176 seconds
> Epoch: 8, Current best: 1.0, Global best: 1.0, Runtime: 0.58463 seconds
> Epoch: 9, Current best: 1.0, Global best: 1.0, Runtime: 0.56670 seconds
> Epoch: 10, Current best: 1.0, Global best: 1.0, Runtime: 0.60854 seconds
> Epoch: 1, Current best: 1.0, Global best: 1.0, Runtime: 0.83391 seconds
> Epoch: 2, Current best: 1.0, Global best: 1.0, Runtime: 0.96664 seconds
> Epoch: 3, Current best: 1.0, Global best: 1.0, Ru

> Epoch: 1, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.58415 seconds
> Epoch: 2, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.54641 seconds
> Epoch: 3, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.63936 seconds
> Epoch: 4, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.64021 seconds
> Epoch: 5, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.64040 seconds
> Epoch: 6, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.61393 seconds
> Epoch: 7, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.62500 seconds
> Epoch: 8, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.64825 seconds
> Epoch: 9, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.65929 seconds
> Epoch: 10, Current best: 0.9978070175438597, Global best: 0.99

In [42]:
# HGS swarm search on spam_assasin; result is [alpha, epsilon, tol].
best_spam_assasin_HGS = get_best(
    alg='HGS', X=X_spam_assasin, y=y_spam_assasin)
print(best_spam_assasin_HGS)

> Epoch: 1, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.35580 seconds
> Epoch: 2, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.57777 seconds
> Epoch: 3, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.63617 seconds
> Epoch: 4, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.70560 seconds
> Epoch: 5, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.76225 seconds
> Epoch: 6, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.75352 seconds
> Epoch: 7, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.76901 seconds
> Epoch: 8, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.79425 seconds
> Epoch: 9, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.78895 seconds
> Epoch: 10, Current best: 0.9978118161925602, Global best: 0.99

> Epoch: 8, Current best: 1.0, Global best: 1.0, Runtime: 0.77960 seconds
> Epoch: 9, Current best: 1.0, Global best: 1.0, Runtime: 0.77395 seconds
> Epoch: 10, Current best: 1.0, Global best: 1.0, Runtime: 0.79473 seconds
> Epoch: 1, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.33310 seconds
> Epoch: 2, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.46265 seconds
> Epoch: 3, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.55912 seconds
> Epoch: 4, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.60368 seconds
> Epoch: 5, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.62123 seconds
> Epoch: 6, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.65413 seconds
> Epoch: 7, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.65169 seconds
> Epoch: 8, Current best: 0.9978070175438597, Glo

In [50]:
# AO swarm search on spam_assasin; result is [alpha, epsilon, tol].
best_spam_assasin_AO = get_best(
    alg='AO', X=X_spam_assasin, y=y_spam_assasin)
print(best_spam_assasin_AO)

> Epoch: 1, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.31165 seconds
> Epoch: 2, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.32541 seconds
> Epoch: 3, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.32842 seconds
> Epoch: 4, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.32491 seconds
> Epoch: 5, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.31107 seconds
> Epoch: 6, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.32083 seconds
> Epoch: 7, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.22331 seconds
> Epoch: 8, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.42012 seconds
> Epoch: 9, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.48124 seconds
> Epoch: 10, Current best: 0.9978118161925602, Global best: 0.99

> Epoch: 5, Current best: 1.0, Global best: 1.0, Runtime: 0.31553 seconds
> Epoch: 6, Current best: 1.0, Global best: 1.0, Runtime: 0.31746 seconds
> Epoch: 7, Current best: 1.0, Global best: 1.0, Runtime: 0.58110 seconds
> Epoch: 8, Current best: 1.0, Global best: 1.0, Runtime: 0.52852 seconds
> Epoch: 9, Current best: 1.0, Global best: 1.0, Runtime: 0.54967 seconds
> Epoch: 10, Current best: 1.0, Global best: 1.0, Runtime: 0.53465 seconds
> Epoch: 1, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.30355 seconds
> Epoch: 2, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.30495 seconds
> Epoch: 3, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.30073 seconds
> Epoch: 4, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.30847 seconds
> Epoch: 5, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.31646 seconds
> Epoch: 6, Current best: 0.9978070

In [53]:
# HHO swarm search on spam_assasin; result is [alpha, epsilon, tol].
best_spam_assasin_HHO = get_best(
    alg='HHO', X=X_spam_assasin, y=y_spam_assasin)
print(best_spam_assasin_HHO)

> Epoch: 1, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.56152 seconds
> Epoch: 2, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 1.07363 seconds
> Epoch: 3, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 1.07252 seconds
> Epoch: 4, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 0.96875 seconds
> Epoch: 5, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 1.24654 seconds
> Epoch: 6, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 1.25014 seconds
> Epoch: 7, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 1.36351 seconds
> Epoch: 8, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 1.37462 seconds
> Epoch: 9, Current best: 0.9978118161925602, Global best: 0.9978118161925602, Runtime: 1.39140 seconds
> Epoch: 10, Current best: 0.9978118161925602, Global best: 0.99

> Epoch: 8, Current best: 1.0, Global best: 1.0, Runtime: 1.16888 seconds
> Epoch: 9, Current best: 1.0, Global best: 1.0, Runtime: 1.20130 seconds
> Epoch: 10, Current best: 1.0, Global best: 1.0, Runtime: 1.32392 seconds
> Epoch: 1, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.69098 seconds
> Epoch: 2, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.75394 seconds
> Epoch: 3, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 0.93653 seconds
> Epoch: 4, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 1.24113 seconds
> Epoch: 5, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 1.20747 seconds
> Epoch: 6, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 1.21644 seconds
> Epoch: 7, Current best: 0.9978070175438597, Global best: 0.9978070175438597, Runtime: 1.31787 seconds
> Epoch: 8, Current best: 0.9978070175438597, Glo

In [34]:
# Randomized-search baseline for enron; result is [alpha, epsilon, tol].
best_enron_RSCV = get_best(alg='RSCV', X=X_enron, y=y_enron)
print(best_enron_RSCV)

[0.0001, 1, 10]


In [35]:
# Fixed 'DEFAULT' triple; get_best does not look at the data for this key.
best_enron_DEFAULT = get_best(alg='DEFAULT', X=X_enron, y=y_enron)
print(best_enron_DEFAULT)

[0.0001, 0.1, 0.001]


In [None]:
# MRFO swarm search on enron; result is [alpha, epsilon, tol].
best_enron_MRFO = get_best(alg='MRFO', X=X_enron, y=y_enron)
print(best_enron_MRFO)

In [None]:
# HGS swarm search on enron; result is [alpha, epsilon, tol].
best_enron_HGS = get_best(alg='HGS', X=X_enron, y=y_enron)
print(best_enron_HGS)

In [None]:
# AO swarm search on enron; result is [alpha, epsilon, tol].
best_enron_AO = get_best(alg='AO', X=X_enron, y=y_enron)
print(best_enron_AO)

In [None]:
# HHO swarm search on enron; result is [alpha, epsilon, tol].
best_enron_HHO = get_best(alg='HHO', X=X_enron, y=y_enron)
print(best_enron_HHO)

In [44]:
def create_clf(params):
    """Build an unfitted TF-IDF + SGDClassifier pipeline.

    Args:
        params: Three-item sequence ``(alpha, epsilon, tol)`` passed to
            ``SGDClassifier``.

    Returns:
        A ``Pipeline`` with steps ``'tfidf_vectorizer'`` and ``'clf'``.
    """
    alpha, epsilon, tol = params
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    classifier = SGDClassifier(
        random_state=0, alpha=alpha, epsilon=epsilon, tol=tol, n_jobs=-1)
    steps = [('tfidf_vectorizer', vectorizer), ('clf', classifier)]
    return Pipeline(steps)

In [46]:
# One unfitted pipeline per tuning strategy, built from the ling_spam results.
(
    ling_spam_RSCV,
    ling_spam_DEFAULT,
    ling_spam_MRFO,
    ling_spam_HGS,
    ling_spam_AO,
    ling_spam_HHO,
) = [
    create_clf(best)
    for best in (
        best_ling_spam_RSCV,
        best_ling_spam_DEFAULT,
        best_ling_spam_MRFO,
        best_ling_spam_HGS,
        best_ling_spam_AO,
        best_ling_spam_HHO,
    )
]

In [51]:
# Mean 10-fold cross-validated score of each ling_spam pipeline on ling_spam.
print('Train cross-val accuracy scores:')
for label, model in (
    ('ling_spam_RSCV', ling_spam_RSCV),
    ('ling_spam_DEFAULT', ling_spam_DEFAULT),
    ('ling_spam_MRFO', ling_spam_MRFO),
    ('ling_spam_HGS', ling_spam_HGS),
    ('ling_spam_AO', ling_spam_AO),
    ('ling_spam_HHO', ling_spam_HHO),
):
    fold_scores = cross_val_score(model, X_ling_spam, y_ling_spam, cv=10)
    print(f'{label} {mean(fold_scores)}')

Train cross-val accuracy scores:
ling_spam_RSCV 0.9979166666666668
ling_spam_DEFAULT 0.9968750000000002
ling_spam_MRFO 0.9958333333333333
ling_spam_HGS 0.9968750000000002
ling_spam_AO 0.9302190721649485
ling_spam_HHO 0.9958333333333332


In [54]:
# One unfitted pipeline per tuning strategy, built from the spam_assasin results.
(
    spam_assasin_RSCV,
    spam_assasin_DEFAULT,
    spam_assasin_MRFO,
    spam_assasin_HGS,
    spam_assasin_AO,
    spam_assasin_HHO,
) = [
    create_clf(best)
    for best in (
        best_spam_assasin_RSCV,
        best_spam_assasin_DEFAULT,
        best_spam_assasin_MRFO,
        best_spam_assasin_HGS,
        best_spam_assasin_AO,
        best_spam_assasin_HHO,
    )
]

In [55]:
# Mean 10-fold cross-validated score of each spam_assasin pipeline on spam_assasin.
print('Train cross-val accuracy scores:')
for label, model in (
    ('spam_assasin_RSCV', spam_assasin_RSCV),
    ('spam_assasin_DEFAULT', spam_assasin_DEFAULT),
    ('spam_assasin_MRFO', spam_assasin_MRFO),
    ('spam_assasin_HGS', spam_assasin_HGS),
    ('spam_assasin_AO', spam_assasin_AO),
    ('spam_assasin_HHO', spam_assasin_HHO),
):
    fold_scores = cross_val_score(model, X_spam_assasin, y_spam_assasin, cv=10)
    print(f'{label} {mean(fold_scores)}')

Train cross-val accuracy scores:
spam_assasin_RSCV 0.9967110061806596
spam_assasin_DEFAULT 0.9971496026718876
spam_assasin_MRFO 0.9967110061806596
spam_assasin_HGS 0.9969303044262736
spam_assasin_AO 0.9967110061806596
spam_assasin_HHO 0.9967110061806596


In [None]:
# One unfitted pipeline per tuning strategy, built from the enron results.
(
    enron_RSCV,
    enron_DEFAULT,
    enron_MRFO,
    enron_HGS,
    enron_AO,
    enron_HHO,
) = [
    create_clf(best)
    for best in (
        best_enron_RSCV,
        best_enron_DEFAULT,
        best_enron_MRFO,
        best_enron_HGS,
        best_enron_AO,
        best_enron_HHO,
    )
]

In [None]:
# Mean 10-fold cross-validated score of each enron pipeline on enron.
print('Train cross-val accuracy scores:')
for label, model in (
    ('enron_RSCV', enron_RSCV),
    ('enron_DEFAULT', enron_DEFAULT),
    ('enron_MRFO', enron_MRFO),
    ('enron_HGS', enron_HGS),
    ('enron_AO', enron_AO),
    ('enron_HHO', enron_HHO),
):
    fold_scores = cross_val_score(model, X_enron, y_enron, cv=10)
    print(f'{label} {mean(fold_scores)}')

In [57]:
# Default figure size for every plot below: 10 wide by 5 tall.
pyplot.rcParams.update({'figure.figsize': [10, 5]})

In [None]:
# Fit every ling_spam pipeline on the full ling_spam corpus. This must run
# before any cell that calls predict / decision_function on these pipelines.
for clf in (
    ling_spam_RSCV,
    ling_spam_DEFAULT,
    ling_spam_MRFO,
    ling_spam_HGS,
    ling_spam_AO,
    ling_spam_HHO,
):
    clf.fit(X_ling_spam, y_ling_spam)

In [60]:
# test ling_spam on spam_assasin
#
# Cross-dataset check: pipelines trained on ling_spam are scored on the whole
# spam_assasin corpus. They must already be fitted (run the fitting cell
# first), otherwise scikit-learn raises NotFittedError.

X, y = X_spam_assasin, y_spam_assasin

# Pair every pipeline with a display name so the printed metrics and the
# legend identify the model (the previous `label=alg` used an undefined name).
ling_spam_models = [
    ('RSCV', ling_spam_RSCV),
    ('DEFAULT', ling_spam_DEFAULT),
    ('MRFO', ling_spam_MRFO),
    ('HGS', ling_spam_HGS),
    ('AO', ling_spam_AO),
    ('HHO', ling_spam_HHO),
]

for alg_name, clf in ling_spam_models:
    y_score = clf.decision_function(X)
    y_pred = clf.predict(X)

    print(alg_name)
    print(f'Accuracy: {accuracy_score(y, y_pred)}')
    print(f'Confusion matrix {confusion_matrix(y, y_pred)}')
    # ROC AUC is computed from the continuous decision scores, the same
    # values the plotted curve uses, not from the hard 0/1 predictions.
    print(f'ROC: {roc_auc_score(y, y_score)}\n')

    y_fpr, y_tpr, _ = roc_curve(y, y_score)
    pyplot.plot(y_fpr, y_tpr, marker='.', label=alg_name)

# Constant-score "no skill" reference line.
ns_probs = [0] * len(y)
ns_fpr, ns_tpr, _ = roc_curve(y, ns_probs)

pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='Без навыков')
pyplot.xlabel('Ошибка первого рода')
pyplot.ylabel('Чувствительность')
pyplot.legend()
pyplot.show()

NotFittedError: The TF-IDF vectorizer is not fitted