In [None]:
import pandas as pd
from scipy.io import arff
import os
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearnex import patch_sklearn

# Worker count handed to joblib-backed calls (n_jobs in cross-validation).
# Three cores are left free for the rest of the system, but the value never
# drops below one: os.cpu_count() may return None, a result of 0 is rejected
# by joblib, and negative values change meaning (-1 == "use every core").
n_cpu = max(1, (os.cpu_count() or 1) - 3)

# Enable the sklearnex (Intel Extension for Scikit-learn) patching.
# NOTE(review): the scikit-learn imports above run before this call, while the
# sklearnex documentation asks for patch_sklearn() to be called before
# scikit-learn is imported - confirm the patch actually takes effect for the
# estimators used in this notebook.
patch_sklearn()

In [2]:
def processNSLKDD(train_path='./Data/nsl-kdd/KDDTrain+.arff',
                  test_path='./Data/nsl-kdd/KDDTest+.arff'):
    """Load the NSL-KDD ARFF files and return min-max scaled numeric features.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the training and test ARFF files. The defaults are the
        paths this notebook has always used, so ``processNSLKDD()`` keeps
        working unchanged.

    Returns
    -------
    X_train, Y_train, X_test, Y_test
        ``X_*`` are DataFrames with the numeric columns only (the four binary
        flags included; ``class`` and ``num_outbound_cmds`` dropped), scaled
        with the per-column minimum and maximum of the *training* set.
        ``X_test`` is additionally clipped to [0, 1]. ``Y_*`` are Series in
        which 'normal' is encoded as 1 and 'anomaly' as 0.
    """
    train_data = pd.DataFrame(arff.loadarff(train_path)[0])
    test_data = pd.DataFrame(arff.loadarff(test_path)[0])

    # loadarff yields nominal attributes as bytes; decode them to str in both sets.
    for f in test_data.select_dtypes(include='O').columns:
        train_data[f] = train_data[f].str.decode(encoding='utf-8')
        test_data[f] = test_data[f].str.decode(encoding='utf-8')

    # Binary flags arrive as the strings '0'/'1'; make them numeric so they
    # survive the select_dtypes(include='number') filter below.
    for f in ['land', 'logged_in', 'is_host_login', 'is_guest_login']:
        train_data[f] = train_data[f].map({'0': 0, '1': 1})
        test_data[f] = test_data[f].map({'0': 0, '1': 1})

    dropped = ['class', 'num_outbound_cmds']
    X_train = train_data.drop(dropped, axis=1).select_dtypes(include='number')
    X_test = test_data.drop(dropped, axis=1).select_dtypes(include='number')

    # Scaling statistics come from the training set only.
    X_max, X_min = X_train.max(axis=0), X_train.min(axis=0)
    # A column that is constant in the training set has zero range; divide by 1
    # there so it scales to 0 instead of NaN/inf.
    X_range = (X_max - X_min).replace(0, 1)

    X_train = (X_train - X_min) / X_range
    # Test values outside the training range would leave [0, 1]; clip them.
    # DataFrame.clip returns a new frame, so the result has to be kept.
    X_test = ((X_test - X_min) / X_range).clip(0, 1)

    Y_train = train_data['class'].map({'normal': 1, 'anomaly': 0})
    Y_test = test_data['class'].map({'normal': 1, 'anomaly': 0})

    return X_train, Y_train, X_test, Y_test

In [None]:
# Build the train/test matrices once and print their shapes as a quick sanity check.
X_train, Y_train, X_test, Y_test = processNSLKDD()
print(*(part.shape for part in (X_train, Y_train, X_test, Y_test)))

# Evaluation

In [None]:
# Feature-selection output, not produced in the visible cells. The code below
# indexes it by method name ('UFS', 'SBS', 'RFE', 'Importance', 'Greedy',
# 'Union', 'Intersection', 'Quorum').
# NOTE(review): read_pickle can execute arbitrary code while unpickling - only
# load this file if it comes from a trusted run.
columns_result = pd.read_pickle('./Result/columns_result.pkl')
columns_result.head()

In [37]:
def validateAndTest(model, X_train, Y_train, X_test, Y_test):
    """Return ``(cv_score, test_score)`` for ``model`` on the given data.

    ``cv_score`` is the mean F1 over a shuffled ``StratifiedKFold`` (seed 0,
    default number of splits) on the training data, run with ``n_cpu`` jobs.
    ``test_score`` is the F1 on the test data after fitting the very same
    ``model`` instance on the complete training data, so the estimator passed
    in is left fitted when the function returns.
    """
    folds = StratifiedKFold(shuffle=True, random_state=0)
    fold_scores = cross_val_score(
        model,
        X_train,
        Y_train,
        scoring='f1',
        cv=folds,
        n_jobs=n_cpu,
    )

    # The final fit and prediction work on plain arrays rather than DataFrames.
    model.fit(X_train.values, Y_train.values)
    test_score = f1_score(Y_test.values, model.predict(X_test.values))

    return fold_scores.mean(), test_score

In [35]:
def getEvaluateResult(X_train, Y_train, X_test, Y_test):
    """Score every feature subset listed in the global ``columns_result``.

    Two fixed estimators are used (logistic regression, 'LR', and gradient
    boosting, 'GB'). For each of them and each selection method two columns
    are added to the returned DataFrame, ``cv_<method>_<model>`` and
    ``test_<method>_<model>``, with one entry per row of ``columns_result``.

    * 'UFS', 'SBS', 'RFE', 'Importance', 'Greedy': each row contributes one
      more column name; row k is scored on the first k+1 names.
    * 'Union', 'Intersection', 'Quorum': each row already holds a complete
      collection of column names. A collection identical to the previous
      row's reuses the previous scores instead of being re-evaluated.
    """
    estimators = {
        'LR': LogisticRegression(C=100, solver='liblinear', random_state=0),
        'GB': GradientBoostingClassifier(n_estimators=200, random_state=0),
    }
    scores = pd.DataFrame()

    def evaluate(estimator, subset):
        # Restrict both splits to the same columns before scoring.
        return validateAndTest(estimator, X_train[subset], Y_train, X_test[subset], Y_test)

    for model_name, model in estimators.items():
        # Ranking-style methods: grow the subset by one column per row.
        for method in ('UFS', 'SBS', 'RFE', 'Importance', 'Greedy'):
            growing_subset = []
            cv_history, test_history = [], []
            for column in columns_result[method]:
                growing_subset.append(column)
                cv_value, test_value = evaluate(model, growing_subset)
                cv_history.append(cv_value)
                test_history.append(test_value)

            scores[f'cv_{method}_{model_name}'] = cv_history
            scores[f'test_{method}_{model_name}'] = test_history

        # Set-combination methods: every row is a ready-made subset.
        for method in ('Union', 'Intersection', 'Quorum'):
            cv_history, test_history = [], []
            previous_subset = None
            for subset in columns_result[method]:
                if len(subset) <= 1:
                    # NOTE(review): subsets with a single column are scored 0
                    # just like empty ones, although the ranking methods above
                    # do evaluate single-column subsets - confirm this is intended.
                    cv_value, test_value = 0, 0
                elif subset != previous_subset:
                    cv_value, test_value = evaluate(model, subset)
                else:
                    # Same subset as the previous row: reuse its scores.
                    cv_value, test_value = cv_history[-1], test_history[-1]

                cv_history.append(cv_value)
                test_history.append(test_value)
                previous_subset = subset

            scores[f'cv_{method}_{model_name}'] = cv_history
            scores[f'test_{method}_{model_name}'] = test_history

    return scores

In [None]:
# Number of leading rows of every split handed to the evaluation.
# NOTE(review): 100 rows looks like a quick-run setting, yet the resulting
# `scores` table is what the following cells persist and analyse. Set this to
# None to evaluate on the complete train/test sets.
N_EVAL_ROWS = 100

scores = getEvaluateResult(X_train.iloc[:N_EVAL_ROWS], Y_train.iloc[:N_EVAL_ROWS],
                           X_test.iloc[:N_EVAL_ROWS], Y_test.iloc[:N_EVAL_ROWS])
scores

In [52]:
# Persist the evaluation table produced by getEvaluateResult.
scores.to_pickle('./Result/scores_result.pkl')

# Stopping Point

In [44]:
def MaxDelta(score):
    """Return the index just before the largest positive jump in ``score``.

    For every neighbouring pair the increase ``score[i+1] - score[i]`` is
    computed; the ``i`` of the largest strictly positive increase is returned
    (the first one on ties). If no increase is positive, the last index
    ``len(score) - 1`` is returned.
    """
    best_gain = 0
    best_position = len(score) - 1
    for position, (current, following) in enumerate(zip(score, score[1:])):
        gain = following - current
        if gain > best_gain:
            best_gain, best_position = gain, position

    return best_position

def MinPerfReq(score, tolerence):
    """Return the earliest index from which every score meets the requirement.

    The requirement is ``score[-1] * (1 - tolerence)``, i.e. the final score
    reduced by the given relative tolerance. The scores are scanned from the
    end towards the start; at the first one that falls below the requirement
    the index right after it is returned.

    Parameters
    ----------
    score : sequence of numbers
        Scores ordered by position; must not be empty (``score[-1]`` raises
        ``IndexError`` otherwise).
    tolerence : float
        Relative drop from the final score that is still acceptable (the
        parameter name keeps its original spelling for compatibility).

    Returns
    -------
    int
        A valid index into ``score``. 0 when every score, including the first
        one, meets the requirement.
    """
    last = len(score) - 1
    score_req = score[-1] * (1 - tolerence)

    # Include index 0 in the scan: previously it was never inspected and the
    # "nothing fell short" case returned the last index instead of the first.
    for i in range(last, -1, -1):
        if score[i] < score_req:
            # Clamp so the result stays a valid index even if the final score
            # itself is below the requirement (e.g. negative tolerance).
            return min(i + 1, last)

    return 0

def MaxScore(score, size, factor):
    """Return the index with the highest size-penalised score.

    The penalised value at position ``i`` is ``score[i] - factor * size[i]``.
    The first position with the largest strictly positive penalised value is
    returned; when none is positive the last index ``len(score) - 1`` is
    returned.
    """
    chosen = len(score) - 1
    top_value = 0
    for position, raw in enumerate(score):
        penalised = raw - (factor * size[position])
        if penalised > top_value:
            top_value, chosen = penalised, position

    return chosen

In [None]:
# Settings for the stopping rules: `tolerence` is passed to MinPerfReq and
# `factor` to MaxScore.
tolerence = 0.03
factor = 0.03

# Flat mapping '<rule>_<quantity>_<method>_<model>' -> value; converted to a
# Series once all combinations are filled in.
stopping_points = {}

for model_name in ['LR', 'GB']:
    for method in ['UFS', 'SBS', 'RFE', 'Importance', 'Union', 'Intersection', 'Quorum', 'Greedy']:
        cv_scores = scores[f'cv_{method}_{model_name}'].to_list()
        test_scores = scores[f'test_{method}_{model_name}'].to_list()

        # Subset size per row: the set-combination methods store a whole
        # collection of columns in each row, whereas the other methods are
        # scored on one more column per row (see getEvaluateResult), so their
        # size is simply the row position plus one.
        if method in ['Union', 'Intersection', 'Quorum']:
            subset_size = [len(i) for i in columns_result[method]]
        else:
            subset_size = [i+1 for i in range(scores.shape[0])]

        # All three rules choose their index from the cross-validation scores
        # only; the test score is merely looked up at the chosen index below.
        stop1 = MaxDelta(cv_scores)
        stop2 = MinPerfReq(cv_scores, tolerence)
        stop3 = MaxScore(cv_scores, subset_size, factor)

        for stop_name, index in zip(['MaxDelta', 'MinPerfReq', 'MaxScore'], [stop1, stop2, stop3]):
            stopping_points[f'{stop_name}_size_{method}_{model_name}'] = subset_size[index]
            stopping_points[f'{stop_name}_cv_score_{method}_{model_name}'] = cv_scores[index]
            stopping_points[f'{stop_name}_test_score_{method}_{model_name}'] = test_scores[index]

# Rebind the name from the dict to a Series for display and saving.
stopping_points = pd.Series(stopping_points)
stopping_points

In [51]:
# Persist the stopping-point summary Series.
stopping_points.to_pickle('./Result/stopping_points.pkl')