In [4]:
import csv
import numpy as np
import pandas as pd

# Load the Enron messages table; every missing cell is replaced by a single space.
df = pd.read_csv('./input/enron/messages.csv').fillna(' ')

# X holds the 'message' column and y the 'label' column, both as NumPy arrays.
X, y = (np.array(df[column]) for column in ('message', 'label'))

In [5]:
from mealpy.swarm_based import AO, HGS, SSA, MRFO

def resolve_clf(alg):
    """Return the mealpy optimizer class registered under a short name.

    Parameters
    ----------
    alg : str
        One of 'AO', 'HGS', 'SSA' or 'MRFO'.

    Returns
    -------
    The optimizer class (not an instance) matching ``alg``.

    Raises
    ------
    ValueError
        If ``alg`` is not one of the supported names. Without this check the
        function would silently return ``None`` and the failure would only
        surface later, when the caller tries to instantiate the result.
    """
    if alg == 'AO':
        return AO.OriginalAO
    if alg == 'HGS':
        return HGS.OriginalHGS
    if alg == 'SSA':
        return SSA.OriginalSSA
    if alg == 'MRFO':
        return MRFO.BaseMRFO
    raise ValueError(
        f"unknown algorithm {alg!r}; expected one of 'AO', 'HGS', 'SSA', 'MRFO'"
    )


def test_bio_alg(clf, obj_function):
    problem = {
        'obj_func': obj_function,
        'lb': [0.0001, 0.0001, 0.0001],
        'ub': [1000, 1000, 1000],
        'minmax': 'max',
        'verbose': True,
    }
    model = clf(problem, epoch=10, pop_size=60)
    model.solve()
    return model.g_best

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Fraction of the dataset held out as the test split; passed as `test_size`
# to train_test_split.
SIZE = 0.25

In [14]:
# split dataset; the fixed seed keeps the train/test partition identical
# across kernel restarts, so scores from different runs are comparable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=SIZE, random_state=0
)

# tokenization: the TF-IDF vocabulary is fitted on the training texts only and
# then reused unchanged for the test texts
# NOTE(review): stopwords.words('english') presumably needs the NLTK
# 'stopwords' corpus to be available locally - confirm on a fresh environment
cv = TfidfVectorizer(stop_words=stopwords.words('english'))
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [16]:
# Short names of the optimizers compared in this notebook; each one is a key
# handled by resolve_clf.
BIO_ALGS = ['AO', 'HGS', 'SSA', 'MRFO']

In [17]:
def obj_function(solution, random_state=0):
    """Score one SGDClassifier hyper-parameter candidate by ROC AUC.

    Parameters
    ----------
    solution : sequence of three floats
        Unpacked as ``(alpha, epsilon, tol)`` and forwarded to SGDClassifier.
    random_state : int, default 0
        Seed for SGDClassifier. Fixing it makes the objective deterministic:
        the same ``solution`` always yields the same score, so the optimizer
        compares candidates rather than random training noise.

    Returns
    -------
    float
        ``roc_auc_score`` of the second probability column predicted for
        ``X_test`` against ``y_test``.
    """
    alpha, epsilon, tol = solution
    # NOTE(review): per the scikit-learn docs `epsilon` only matters for the
    # huber / epsilon-insensitive losses; with the default loss this search
    # dimension is presumably inert - confirm.
    clf = SGDClassifier(alpha=alpha, epsilon=epsilon, tol=tol, random_state=random_state)
    clf.fit(X_train, y_train)

    # Wrap the already-fitted classifier in a calibrator to obtain
    # predict_proba; the calibrator is fitted on the same training split.
    calibrator = CalibratedClassifierCV(clf, cv='prefit')
    model = calibrator.fit(X_train, y_train)
    y_score = model.predict_proba(X_test)

    # NOTE(review): candidates are scored on X_test, so the tuned parameters
    # are selected on the test split; a separate validation split would avoid
    # an optimistic estimate.
    return roc_auc_score(y_test, y_score[:, 1])

In [25]:
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score

In [28]:
# Tune the SGDClassifier hyper-parameters with the 'AO' optimizer.
alg = 'AO'
clf = resolve_clf(alg)
best_params_ = test_bio_alg(clf, obj_function)

# The first element of the optimizer result is unpacked as the tuned triple.
alpha, epsilon, tol = best_params_[0]

# Refit a classifier on the training split using the tuned values.
model = SGDClassifier(alpha=alpha, epsilon=epsilon, tol=tol)
model.fit(X_train, y_train)

# Report the full optimizer result, right-aligned to 29 characters.
print(f'{alg} with enron is best with: ', str(best_params_).rjust(29))

> Epoch: 1, Current best: 0.99906499056149, Global best: 0.99906499056149, Runtime: 8.63985 seconds
> Epoch: 2, Current best: 0.99906499056149, Global best: 0.99906499056149, Runtime: 7.44782 seconds
> Epoch: 3, Current best: 0.99906499056149, Global best: 0.99906499056149, Runtime: 6.85458 seconds
> Epoch: 4, Current best: 0.99906499056149, Global best: 0.99906499056149, Runtime: 6.30865 seconds
> Epoch: 5, Current best: 0.99906499056149, Global best: 0.99906499056149, Runtime: 6.25252 seconds
> Epoch: 6, Current best: 0.99906499056149, Global best: 0.99906499056149, Runtime: 6.22552 seconds
> Epoch: 7, Current best: 0.999065441087028, Global best: 0.999065441087028, Runtime: 15.93744 seconds
> Epoch: 8, Current best: 0.999065441087028, Global best: 0.999065441087028, Runtime: 12.87079 seconds
> Epoch: 9, Current best: 0.999065441087028, Global best: 0.999065441087028, Runtime: 10.42839 seconds
> Epoch: 10, Current best: 0.999065441087028, Global best: 0.999065441087028, Runtime: 12.7

In [29]:
# Tune the SGDClassifier hyper-parameters with the 'HGS' optimizer.
alg = 'HGS'
clf = resolve_clf(alg)
best_params_ = test_bio_alg(clf, obj_function)

# The first element of the optimizer result is unpacked as the tuned triple.
alpha, epsilon, tol = best_params_[0]

# Refit a classifier on the training split using the tuned values.
model = SGDClassifier(alpha=alpha, epsilon=epsilon, tol=tol)
model.fit(X_train, y_train)

# Report the full optimizer result, right-aligned to 29 characters.
print(f'{alg} with enron is best with: ', str(best_params_).rjust(29))

> Epoch: 1, Current best: 0.9990660042439505, Global best: 0.9990660042439505, Runtime: 7.22585 seconds
> Epoch: 2, Current best: 0.9990665110851807, Global best: 0.9990665110851807, Runtime: 10.32216 seconds
> Epoch: 3, Current best: 0.9990657226654892, Global best: 0.9990665110851807, Runtime: 12.81471 seconds
> Epoch: 4, Current best: 0.9990631884593377, Global best: 0.9990665110851807, Runtime: 15.08706 seconds
> Epoch: 5, Current best: 0.9990662858224117, Global best: 0.9990665110851807, Runtime: 14.46896 seconds
> Epoch: 6, Current best: 0.9990646526673365, Global best: 0.9990665110851807, Runtime: 13.22796 seconds
> Epoch: 7, Current best: 0.9990656100341048, Global best: 0.9990665110851807, Runtime: 13.68029 seconds
> Epoch: 8, Current best: 0.9990650468771823, Global best: 0.9990665110851807, Runtime: 17.21571 seconds
> Epoch: 9, Current best: 0.9990641458261061, Global best: 0.9990665110851807, Runtime: 15.37429 seconds
> Epoch: 10, Current best: 0.9990646526673365, Global be

In [30]:
# Tune the SGDClassifier hyper-parameters with the 'SSA' optimizer.
alg = 'SSA'
clf = resolve_clf(alg)
best_params_ = test_bio_alg(clf, obj_function)

# The first element of the optimizer result is unpacked as the tuned triple.
alpha, epsilon, tol = best_params_[0]

# Refit a classifier on the training split using the tuned values.
model = SGDClassifier(alpha=alpha, epsilon=epsilon, tol=tol)
model.fit(X_train, y_train)

# Report the full optimizer result, right-aligned to 29 characters.
print(f'{alg} with enron is best with: ', str(best_params_).rjust(29))

> Epoch: 1, Current best: 0.9941685100669931, Global best: 0.9941685100669931, Runtime: 15.86160 seconds
> Epoch: 2, Current best: 0.9967776160891679, Global best: 0.9967776160891679, Runtime: 13.54730 seconds
> Epoch: 3, Current best: 0.9967776160891679, Global best: 0.9967776160891679, Runtime: 13.09876 seconds
> Epoch: 4, Current best: 0.9967776160891679, Global best: 0.9967776160891679, Runtime: 14.29289 seconds
> Epoch: 5, Current best: 0.9967776160891679, Global best: 0.9967776160891679, Runtime: 19.16693 seconds
> Epoch: 6, Current best: 0.9967776160891679, Global best: 0.9967776160891679, Runtime: 16.62466 seconds
> Epoch: 7, Current best: 0.9967776160891679, Global best: 0.9967776160891679, Runtime: 15.82414 seconds
> Epoch: 8, Current best: 0.9967776160891679, Global best: 0.9967776160891679, Runtime: 18.57276 seconds
> Epoch: 9, Current best: 0.9967776160891679, Global best: 0.9967776160891679, Runtime: 17.89888 seconds
> Epoch: 10, Current best: 0.9967776160891679, Global b

In [31]:
# Tune the SGDClassifier hyper-parameters with the 'MRFO' optimizer.
alg = 'MRFO'
clf = resolve_clf(alg)
best_params_ = test_bio_alg(clf, obj_function)

# The first element of the optimizer result is unpacked as the tuned triple.
alpha, epsilon, tol = best_params_[0]

# Refit a classifier on the training split using the tuned values.
model = SGDClassifier(alpha=alpha, epsilon=epsilon, tol=tol)
model.fit(X_train, y_train)

# Report the full optimizer result, right-aligned to 29 characters.
print(f'{alg} with enron is best with: ', str(best_params_).rjust(29))

> Epoch: 1, Current best: 0.9990847010537792, Global best: 0.9990847010537792, Runtime: 17.79025 seconds
> Epoch: 2, Current best: 0.9991026094439163, Global best: 0.9991026094439163, Runtime: 16.46893 seconds
> Epoch: 3, Current best: 0.9991026094439163, Global best: 0.9991026094439163, Runtime: 17.41599 seconds
> Epoch: 4, Current best: 0.9991026094439163, Global best: 0.9991026094439163, Runtime: 18.10333 seconds
> Epoch: 5, Current best: 0.9991026094439163, Global best: 0.9991026094439163, Runtime: 15.52336 seconds
> Epoch: 6, Current best: 0.9991026094439163, Global best: 0.9991026094439163, Runtime: 17.94680 seconds
> Epoch: 7, Current best: 0.9991026094439163, Global best: 0.9991026094439163, Runtime: 22.49260 seconds
> Epoch: 8, Current best: 0.9991026094439163, Global best: 0.9991026094439163, Runtime: 15.40007 seconds
> Epoch: 9, Current best: 0.9991026094439163, Global best: 0.9991026094439163, Runtime: 15.90241 seconds
> Epoch: 10, Current best: 0.9991026094439163, Global b

In [45]:
# RandomizedSearchCV is not imported anywhere else in the notebook; importing
# it here keeps this cell runnable after a kernel restart.
from sklearn.model_selection import RandomizedSearchCV

# Baseline: randomized search over a logarithmic grid covering the same three
# SGDClassifier hyper-parameters and the same range as the optimizer bounds.
tuned_parameters = {
    'epsilon': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'tol': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# Both the estimator and the search are seeded so the sampled candidates and
# the reported best parameters are repeatable.
clf = SGDClassifier(random_state=0)
model = RandomizedSearchCV(clf, tuned_parameters, scoring="roc_auc", random_state=0)
model.fit(X_train, y_train)
print('RSCV', model.best_params_)

RSCV {'tol': 10, 'epsilon': 1, 'alpha': 0.0001}


NameError: name 'X_enron' is not defined