In [50]:
import pandas as pd
import numpy as np
from scipy.io import arff
import os

# Display every row/column when a DataFrame is rendered in this notebook.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Worker count passed as n_jobs below: leave three cores free, but never go below one.
# os.cpu_count() can return None, and a plain "count - 3" becomes 0 (rejected by joblib)
# or negative (which joblib interprets as "all cores minus k") on small machines.
n_cpu = max(1, (os.cpu_count() or 1) - 3)

In [None]:
# Patch scikit-learn BEFORE importing anything from it: sklearnex only accelerates
# estimators that are imported after patch_sklearn() has run.
from sklearnex import patch_sklearn

patch_sklearn()

from sklearn.base import clone
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, SequentialFeatureSelector, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [6]:
def processNSLKDD(train_path='./Data/nsl-kdd/KDDTrain+.arff', test_path='./Data/nsl-kdd/KDDTest+.arff'):
    """Load the NSL-KDD ARFF splits and return scaled numeric features with binary labels.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the training and test ARFF files. The defaults are the
        paths this notebook has always used.

    Returns
    -------
    X_train, Y_train, X_test, Y_test
        ``X_*`` are DataFrames holding only the numeric columns (``class`` and
        ``num_outbound_cmds`` are dropped), min-max scaled with the TRAINING
        minimum/maximum. ``X_test`` is additionally clipped to ``[0, 1]``.
        ``Y_*`` are Series with ``normal`` mapped to 1 and ``anomaly`` mapped to 0.
    """
    arff_train = arff.loadarff(train_path)
    arff_test = arff.loadarff(test_path)
    train_data = pd.DataFrame(arff_train[0])
    test_data = pd.DataFrame(arff_test[0])

    # Object columns come out of loadarff as bytes; decode them to str in both splits.
    for f in test_data.select_dtypes(include='O').columns:
        train_data[f] = train_data[f].str.decode(encoding='utf-8')
        test_data[f] = test_data[f].str.decode(encoding='utf-8')

    # Binary flags are stored as the strings '0'/'1'; turn them into integers so
    # they survive the numeric-column selection below.
    for f in ['land', 'logged_in', 'is_host_login', 'is_guest_login']:
        train_data[f] = train_data[f].map({'0': 0, '1': 1}).astype(int)
        test_data[f] = test_data[f].map({'0': 0, '1': 1}).astype(int)

    X_train = train_data.drop(['class', 'num_outbound_cmds'], axis=1).select_dtypes(include='number')

    X_max, X_min = X_train.max(axis=0), X_train.min(axis=0)
    # A column that is constant in the training split has a zero range; divide by 1
    # instead so it scales to 0 rather than NaN.
    X_range = (X_max - X_min).replace(0, 1)
    X_train = (X_train - X_min) / X_range

    Y_train = train_data['class'].map({'normal': 1, 'anomaly': 0})

    X_test = test_data.drop(['class', 'num_outbound_cmds'], axis=1).select_dtypes(include='number')

    # Scale with the training statistics, then clip values that fall outside the
    # training range. clip() returns a new frame, so the result must be assigned.
    X_test = (X_test - X_min) / X_range
    X_test = X_test.clip(0, 1)

    Y_test = test_data['class'].map({'normal': 1, 'anomaly': 0})

    return X_train, Y_train, X_test, Y_test

In [None]:
# Load the scaled splits once and report the shape of each of the four pieces.
X_train, Y_train, X_test, Y_test = processNSLKDD()
print(*(part.shape for part in (X_train, Y_train, X_test, Y_test)))

# Feature Selection

## Correlation-Based

In [5]:
def selectByCFS(X_train, Y_train):
    """Select features by backward elimination on the CFS merit.

    The merit of a subset of ``k`` features is
    ``k * rcf / sqrt(k + k * (k - 1) * rff)`` where ``rcf`` is the mean absolute
    feature-target correlation and ``rff`` is the mean absolute correlation
    between distinct feature pairs. Starting from all columns, the feature whose
    removal gives the highest merit is dropped as long as that merit is strictly
    better than the current one.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Numeric feature matrix.
    Y_train : pandas.Series
        Target, aligned with ``X_train`` by index.

    Returns
    -------
    list
        Names of the retained columns, in their original order. The retained
        columns and their merit are also printed.
    """
    # Correlations are pairwise, so they can be computed once and sliced per subset.
    # Undefined correlations (e.g. constant columns) are treated as "no correlation".
    corr_target = X_train.corrwith(Y_train).abs().fillna(0.0)
    corr_features = X_train.corr().abs().fillna(0.0)

    def getMerit(subset):
        k = len(subset)
        rcf = corr_target.loc[subset].mean()

        if k > 1:
            corr_cur = corr_features.loc[subset, subset].to_numpy()
            # Strictly-lower triangle only: the diagonal is each feature's correlation
            # with itself and must not count as inter-feature redundancy.
            rff = corr_cur[np.tril_indices(k, k=-1)].mean()
        else:
            rff = 0.0

        return (k * rcf) / np.sqrt(k + k * (k - 1) * rff)

    columns_to_select = X_train.columns.tolist()
    cur_merit = getMerit(columns_to_select)

    while len(columns_to_select) > 1:
        # Find the single removal that yields the highest merit.
        best_feature, best_merit = None, -np.inf
        for f in columns_to_select:
            merit = getMerit([ff for ff in columns_to_select if ff != f])
            if merit > best_merit:
                best_feature, best_merit = f, merit

        # Stop as soon as no removal strictly improves on the current subset.
        if best_feature is None or not best_merit > cur_merit:
            break

        cur_merit = best_merit
        columns_to_select.remove(best_feature)

    print(columns_to_select, cur_merit)

    return columns_to_select

In [None]:
# Run the correlation-based selection on the training split; the bare expression
# on the last line displays the returned column list as the cell output.
columns_to_select = selectByCFS(X_train, Y_train)
columns_to_select

## UFS

In [30]:
def selectByUFS(X_train, Y_train):
    """Rank features by mutual information with the target.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix.
    Y_train : array-like
        Class labels.

    Returns
    -------
    list
        Column names ordered from the highest to the lowest mutual-information score.
    """
    # mutual_info_classif is randomized; pin the seed (0, as for the other selectors
    # in this notebook) so the ranking is reproducible between runs.
    scores = mutual_info_classif(X_train, Y_train, random_state=0)

    return X_train.columns[np.argsort(-scores)].to_list()

In [None]:
# Compute the univariate ranking and display it.
# NOTE(review): only the first 100 training rows are passed in — presumably a quick
# smoke-test subset; confirm before relying on the resulting ranking.
columns_by_UFS = selectByUFS(X_train.iloc[:100], Y_train.iloc[:100])
columns_by_UFS

## SBS

In [18]:
def selectBySBS(X_train, Y_train):
    """Rank features by repeated one-step sequential backward selection.

    Each round fits a backward SequentialFeatureSelector (random forest, F1,
    stratified CV) on the columns that are still in play and asks it to drop
    exactly one of them. The dropped column is placed at the front of the
    ranking, so the final list runs from the last surviving feature to the
    first one that was eliminated.
    """
    estimator = RandomForestClassifier(random_state=0, oob_score=True)
    folds = StratifiedKFold(shuffle=True, random_state=0)
    eliminated = []  # most recently eliminated feature first

    for _ in range(X_train.shape[1] - 1):
        X_remaining = X_train.drop(eliminated, axis=1)
        sbs = SequentialFeatureSelector(
            estimator,
            n_features_to_select=X_remaining.shape[1] - 1,
            direction='backward',
            scoring='f1',
            cv=folds,
            n_jobs=n_cpu,
        )
        sbs.fit(X_remaining, Y_train)
        # The only column outside the support mask is the one removed this round.
        dropped = X_remaining.columns[~sbs.get_support()][0]
        eliminated.insert(0, dropped)

    # Whatever was never eliminated heads the ranking.
    survivor = X_train.columns.drop(eliminated)[0]

    return [survivor] + eliminated

In [None]:
# Compute the sequential-backward-selection ranking and display it.
# NOTE(review): only the first 100 training rows are passed in — presumably to keep
# the repeated cross-validation cheap; confirm before relying on the ranking.
columns_by_SBS = selectBySBS(X_train.iloc[:100], Y_train.iloc[:100])
columns_by_SBS

## RFE

In [20]:
def selectByRFE(X_train, Y_train):
    """Rank features with recursive feature elimination on a random forest.

    RFE is run down to a single feature, and the columns are returned ordered
    by the selector's ``ranking_`` in ascending order.
    """
    forest = RandomForestClassifier(random_state=0, oob_score=True, n_jobs=n_cpu)
    rfe = RFE(forest, n_features_to_select=1)
    rfe.fit(X_train, Y_train)

    order = np.argsort(rfe.ranking_)
    return X_train.columns[order].tolist()

In [None]:
# Compute the RFE ranking and display it.
# NOTE(review): only the first 100 training rows are passed in — presumably a quick
# smoke-test subset; confirm before relying on the ranking.
columns_by_RFE = selectByRFE(X_train.iloc[:100], Y_train.iloc[:100])
columns_by_RFE

## Importance

In [22]:
def selectByImportance(X_train, Y_train):
    """Rank features by the impurity-based importances of a fitted random forest.

    Returns the column names ordered from the largest to the smallest
    ``feature_importances_`` value.
    """
    forest = RandomForestClassifier(random_state=0, oob_score=True, n_jobs=n_cpu)
    forest.fit(X_train, Y_train)

    # Negate so that argsort's ascending order puts the most important feature first.
    order = np.argsort(-forest.feature_importances_)
    return X_train.columns[order].tolist()

In [None]:
# Compute the importance-based ranking and display it.
# NOTE(review): only the first 100 training rows are passed in — presumably a quick
# smoke-test subset; confirm before relying on the ranking.
columns_by_imp = selectByImportance(X_train.iloc[:100], Y_train.iloc[:100])
columns_by_imp

## Set

In [None]:
# Number of ranked features; selectBySet uses this length as the number of prefix sizes.
len(columns_by_UFS)

In [71]:
def selectBySet(columns_by_UFS, columns_by_SBS, columns_by_RFE, columns_by_imp, min_votes=3):
    """Combine the top-i prefixes of four feature rankings for every prefix size i.

    For ``i = 1 .. len(columns_by_UFS)`` the first ``i`` entries of each ranking
    are combined in three ways:

    * union        - features present in at least one prefix,
    * intersection - features present in all four prefixes,
    * quorum       - features occurring at least ``min_votes`` times across the prefixes.

    Parameters
    ----------
    columns_by_UFS, columns_by_SBS, columns_by_RFE, columns_by_imp : list
        Feature rankings.
    min_votes : int, default 3
        Minimum number of occurrences required for the quorum set. The default
        matches the previous hard-coded "more than 2" rule.

    Returns
    -------
    (columns_by_union, columns_by_intersection, columns_by_quorum)
        Three lists with one feature list per prefix size. Features are listed in
        order of first appearance (UFS prefix first, then SBS, RFE, importance),
        so the output is identical from run to run instead of depending on set
        iteration order.
    """
    n_features = len(columns_by_UFS)

    columns_by_union = []
    columns_by_intersection = []
    columns_by_quorum = []

    for i in range(1, n_features + 1):
        top_ufs = columns_by_UFS[:i]
        other_tops = [set(columns_by_SBS[:i]), set(columns_by_RFE[:i]), set(columns_by_imp[:i])]

        # Count occurrences in one pass; dict keys keep first-appearance order.
        votes = {}
        for f in top_ufs + columns_by_SBS[:i] + columns_by_RFE[:i] + columns_by_imp[:i]:
            votes[f] = votes.get(f, 0) + 1

        columns_by_union.append(list(votes))
        columns_by_intersection.append(
            [f for f in dict.fromkeys(top_ufs) if all(f in top for top in other_tops)]
        )
        columns_by_quorum.append([f for f, count in votes.items() if count >= min_votes])

    return columns_by_union, columns_by_intersection, columns_by_quorum

In [72]:
# Build the union / intersection / quorum feature sets for every prefix size of the four rankings.
columns_by_union, columns_by_intersection, columns_by_quorum = selectBySet(columns_by_UFS, columns_by_SBS, columns_by_RFE, columns_by_imp)

In [None]:
# Show the union for prefix size 1, then the size of every union, intersection
# and quorum set (one line per family).
print(columns_by_union[0])
print(
    *([len(subset) for subset in family]
      for family in (columns_by_union, columns_by_intersection, columns_by_quorum)),
    sep='\n',
)

## Greedy

In [47]:
def selectByGreedy(X_train, Y_train, columns_by_UFS):
    """Rank features by greedy backward elimination over four removal proposals.

    In every round each of four criteria proposes one feature to remove from the
    columns still in play:

    * the weakest remaining feature of the univariate ranking,
    * the feature dropped by a one-step backward SequentialFeatureSelector,
    * the feature dropped by a one-step RFE,
    * the feature with the smallest random-forest importance.

    The proposal whose removal gives the best mean cross-validated F1 is removed
    (and printed). The loop runs until a single feature is left.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix.
    Y_train : array-like
        Binary class labels.
    columns_by_UFS : list
        Univariate ranking of the columns of ``X_train``, most informative first
        (the order produced by ``selectByUFS``). It is not modified.

    Returns
    -------
    list
        Column names ordered from the last surviving feature to the first removed one.
    """
    model = RandomForestClassifier(random_state=0, oob_score=True)
    cv = StratifiedKFold(shuffle=True, random_state=0)

    n_features = X_train.shape[1]

    # Private copy: the ranking is consumed while features are removed, and the
    # caller's list must stay intact.
    ufs_remaining = list(columns_by_UFS)
    removed_features = []

    for _ in range(n_features - 1):
        X_train_sub = X_train.drop(removed_features, axis=1)
        n_keep = X_train_sub.shape[1] - 1

        # The ranking is best-first, so the removal candidate is its LAST entry.
        candidates = [ufs_remaining[-1]]

        selector = SequentialFeatureSelector(model, n_features_to_select=n_keep, direction='backward', scoring='f1', cv=cv, n_jobs=n_cpu)
        selector.fit(X_train_sub, Y_train)
        candidates.append(X_train_sub.columns[~selector.get_support()][0])

        selector = RFE(model, n_features_to_select=n_keep)
        selector.fit(X_train_sub, Y_train)
        candidates.append(X_train_sub.columns[~selector.get_support()][0])

        model_imp = clone(model)
        model_imp.fit(X_train_sub, Y_train)
        candidates.append(X_train_sub.columns[np.argsort(model_imp.feature_importances_)][0])

        # Several criteria often agree; score each distinct proposal only once
        # (first-occurrence order is kept, so tie-breaking is unchanged).
        candidates = list(dict.fromkeys(candidates))

        scores = [
            cross_val_score(model, X_train_sub.drop(f, axis=1), Y_train, scoring='f1', cv=cv, n_jobs=n_cpu).mean()
            for f in candidates
        ]

        feature_to_remove = candidates[int(np.argmax(scores))]
        removed_features.append(feature_to_remove)
        ufs_remaining.remove(feature_to_remove)

        print(feature_to_remove)

    # The one feature never removed ranks first; earlier removals rank lower.
    greedy_features = removed_features + [X_train.columns.drop(removed_features)[0]]
    greedy_features.reverse()

    return greedy_features

In [None]:
# Run the greedy elimination; a copy of the UFS ranking is passed because
# selectByGreedy removes entries from the list it receives.
# NOTE(review): only the first 100 training rows are passed in — presumably to keep
# the repeated cross-validation cheap; confirm before relying on the ranking.
columns_by_greedy = selectByGreedy(X_train.iloc[:100], Y_train.iloc[:100], columns_by_UFS.copy())

In [74]:
# One column per selection method; row i holds that method's entry for position i
# (a single feature name for the rankings, a feature list for the set-based methods).
columns_result = pd.DataFrame({
    'UFS': columns_by_UFS,
    'SBS': columns_by_SBS,
    'RFE': columns_by_RFE,
    'Importance': columns_by_imp,
    'Union': columns_by_union,
    'Intersection': columns_by_intersection,
    'Quorum': columns_by_quorum,
    'Greedy': columns_by_greedy,
})

In [None]:
# Preview the first rows of the combined result table.
columns_result.head()

In [77]:
# Persist the result table for later use.
# NOTE(review): assumes the ./Result directory already exists — confirm, or create it beforehand.
columns_result.to_pickle('./Result/columns_result.pkl')