In [4]:
from sklearn.naive_bayes import MultinomialNB
from audioop import cross
from contextlib import redirect_stdout
from sklearn.model_selection import StratifiedKFold
from mealpy.swarm_based import AO, HGS, SSA, MRFO, HHO
from matplotlib import pyplot
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, auc
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV
import numpy as np
import pandas as pd
from numpy import mean
from numpy import std

# Baseline hyper-parameter triple.
# NOTE(review): the three values look like (alpha, epsilon, tol), i.e. the
# same order get_best() returns its result in -- confirm against the cell
# that consumes DEFAULT_PARAMS.
DEFAULT_PARAMS = [0.0001, 0.1, 1e-3]

In [5]:
def _load_corpus(csv_path):
    """Read one corpus CSV and split it into message and label arrays.

    Missing cells are replaced with a single space before the 'message' and
    'label' columns are extracted.

    Returns a (frame, messages, labels) tuple.
    """
    frame = pd.read_csv(csv_path).fillna(' ')
    messages = np.array(frame['message'])
    labels = np.array(frame['label'])
    return frame, messages, labels


# One (DataFrame, X, y) triple per corpus; later cells read these names directly.
enron, X_enron, y_enron = _load_corpus('./input/enron/messages.csv')

ling_spam, X_ling_spam, y_ling_spam = _load_corpus(
    './input/ling_spam/messages.csv')

spam_assasin, X_spam_assasin, y_spam_assasin = _load_corpus(
    './input/spam_assasin/messages.csv')

In [6]:
def resolve_dataset(name):
    """Return copies of the message and label arrays for a named corpus.

    Parameters
    ----------
    name : str
        One of 'Enron', 'Ling Spam' or 'Spam Assasin'.

    Returns
    -------
    list or None
        ``[X, y]`` holding copies of the module-level arrays for that corpus,
        or ``None`` when ``name`` is not one of the recognised values.
    """
    # Pick the source arrays first; the module-level arrays are only touched
    # in the branch that matches, and the copies are made in one place below.
    if name == 'Enron':
        messages, labels = X_enron, y_enron
    elif name == 'Ling Spam':
        messages, labels = X_ling_spam, y_ling_spam
    elif name == 'Spam Assasin':
        messages, labels = X_spam_assasin, y_spam_assasin
    else:
        return None

    return [messages.copy(), labels.copy()]


def resolve_alg(alg):
    """Translate an optimizer abbreviation into the matching mealpy class.

    Parameters
    ----------
    alg : str
        'AO', 'HGS', 'SSA' or 'MRFO'.

    Returns
    -------
    The optimizer class for the abbreviation, or ``None`` when the
    abbreviation is not recognised.
    """
    # Each lookup is deferred behind a lambda so that only the attribute for
    # the requested abbreviation is ever accessed.
    known_optimizers = (
        ('AO', lambda: AO.OriginalAO),
        ('HGS', lambda: HGS.OriginalHGS),
        ('SSA', lambda: SSA.OriginalSSA),
        ('MRFO', lambda: MRFO.BaseMRFO),
    )

    for abbreviation, lookup in known_optimizers:
        if alg == abbreviation:
            return lookup()

    return None

In [7]:
def get_best(alg, X, y):
    """Search for SGDClassifier hyper-parameters (alpha, epsilon, tol).

    Parameters
    ----------
    alg : str
        'RSCV' runs a RandomizedSearchCV over a TF-IDF + SGDClassifier
        pipeline. Any other value is handed to ``resolve_alg`` to obtain a
        mealpy optimizer class, which is then run once per CV fold.
    X : array of raw message strings
        Must support indexing with the index arrays produced by
        ``StratifiedKFold.split`` (``X[train_index]``).
    y : array of labels aligned with ``X``.

    Returns
    -------
    list
        ``[alpha, epsilon, tol]``. For 'RSCV' these are the best sampled
        values; for the optimizers they are the per-fold best values averaged
        over the 10 folds.

    Raises
    ------
    ValueError
        If ``alg`` is not 'RSCV' and ``resolve_alg`` does not recognise it.
    """
    if alg == 'RSCV':
        # Only numeric hyper-parameters are searched. The previous
        # 'clf__class_weight' entry sampled plain floats, which SGDClassifier
        # rejects (class_weight must be a dict, 'balanced' or None), so every
        # candidate failed to fit. The pipeline's class_weight='balanced'
        # is therefore left in effect for all candidates.
        candidate_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
        distributions = {
            'clf__epsilon': list(candidate_values),
            'clf__alpha': list(candidate_values),
            'clf__tol': list(candidate_values),
        }
        skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
        clf = Pipeline([
            ('tfidf_vectorizer', TfidfVectorizer(
                stop_words=stopwords.words('english'))),
            ('clf', SGDClassifier(random_state=0, n_jobs=-1, class_weight='balanced'))])

        clf_random = RandomizedSearchCV(
            clf, distributions, scoring='accuracy', cv=skf, random_state=0)
        clf_random.fit(X, y)
        best = clf_random.best_params_

        return [best['clf__alpha'], best['clf__epsilon'], best['clf__tol']]

    optimizer_cls = resolve_alg(alg)
    if optimizer_cls is None:
        # Fail before any vectorizing work instead of hitting
        # "'NoneType' object is not callable" inside the first fold.
        raise ValueError(f"Unknown optimisation algorithm: {alg!r}")

    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

    # Best value found in each fold; kept under distinct names so they are not
    # confused with the per-solution values inside the objective function.
    fold_alpha, fold_epsilon, fold_tol = [], [], []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # The vectorizer is re-fitted on each fold's training split only.
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)

        def obj_function(solution):
            """Accuracy on this fold's held-out split for one candidate."""
            alpha, epsilon, tol = solution
            clf = SGDClassifier(random_state=0, class_weight='balanced', alpha=alpha,
                                epsilon=epsilon, tol=tol, n_jobs=-1)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            return accuracy_score(y_test, y_pred)

        # Three-dimensional search box, one dimension per hyper-parameter,
        # maximising the objective.
        problem = {
            'fit_func': obj_function,
            'lb': [0.0001, 0.0001, 0.0001],
            'ub': [1000, 1000, 1000],
            'minmax': 'max',
            'verbose': True,
        }

        model = optimizer_cls(problem, epoch=10, pop_size=40)
        model.solve()
        # g_best[0] is unpacked as the (alpha, epsilon, tol) triple of the
        # best solution -- presumably its position vector; verify against
        # the installed mealpy version.
        best_alpha, best_epsilon, best_tol = model.g_best[0]
        fold_alpha.append(best_alpha)
        fold_epsilon.append(best_epsilon)
        fold_tol.append(best_tol)

    return [mean(fold_alpha), mean(fold_epsilon), mean(fold_tol)]

In [41]:
# Train a TF-IDF + SGD pipeline on Ling Spam and score it on all three
# corpora. The Ling Spam scores are in-sample: the model is evaluated on the
# same messages it was fitted on.
clf = Pipeline(steps=[
    ('tfidf_vectorizer',
     TfidfVectorizer(stop_words=stopwords.words('english'))),
    # Hand-set class weights: label 0 counts 5x as much as label 1.
    ('clf',
     SGDClassifier(random_state=0, n_jobs=-1,
                   class_weight={1.0: 1, 0.0: 5})),
])

X, y = resolve_dataset("Ling Spam")
clf.fit(X, y)

# (F1 label, accuracy label, messages, labels) for each evaluation corpus,
# in the order the scores are printed.
evaluation_sets = (
    ('LS F1: ', 'LS ACC: ', X_ling_spam, y_ling_spam),
    ('EN F1: ', 'EN ACC: ', X_enron, y_enron),
    ('SA F1: ', 'SA ACC: ', X_spam_assasin, y_spam_assasin),
)

for f1_label, acc_label, X_eval, y_eval in evaluation_sets:
    y_pred = clf.predict(X_eval)
    print(f1_label, f1_score(y_eval, y_pred))
    print(acc_label, accuracy_score(y_eval, y_pred))

LS F1:  1.0
LS ACC:  1.0
EN F1:  0.6319881525360977
EN ACC:  0.7051757378021652
SA F1:  0.798914178325329
SA ACC:  0.8956663055254604


In [84]:
# Train a TF-IDF + SGD pipeline on Spam Assasin and score it on Enron only.
# NOTE(review): the output recorded under this cell shows LS/SA scores rather
# than the EN scores printed here -- it looks like it was captured from a
# different version of the cell; re-run to refresh it.
clf = Pipeline(steps=[
    ('tfidf_vectorizer',
     TfidfVectorizer(stop_words=stopwords.words('english'))),
    # Hand-set class weights: label 0 counts 12x as much as label 1.
    ('clf',
     SGDClassifier(random_state=0, n_jobs=-1,
                   class_weight={1.0: 1, 0.0: 12})),
])

X, y = resolve_dataset("Spam Assasin")
clf.fit(X, y)

# Cross-corpus check: predictions for every Enron message.
y_pred = clf.predict(X_enron)

enron_f1 = f1_score(y_enron, y_pred)
print('EN F1: ', enron_f1)

enron_accuracy = accuracy_score(y_enron, y_pred)
print('EN ACC: ', enron_accuracy)

LS F1:  0.8391167192429021
LS ACC:  0.947113722779122
SA F1:  0.9980250164581962
SA ACC:  0.9990249187432286
