In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell
# is executed, so edits to the local `util` package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Seed value shared by every random number generator seeded in this cell.
# Different seed values may be used at each stage if desired.
seed_value= 42

# 1. Set the `PYTHONHASHSEED` environment variable to a fixed value.
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so assigning it
# here only reaches child processes; it does not change hashing in this running kernel.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Seed Python's built-in pseudo-random generator.
import random
random.seed(seed_value)

# 3. Seed NumPy's global pseudo-random generator.
import numpy as np
np.random.seed(seed_value)

# 4. Seed TensorFlow's global pseudo-random generator (TensorFlow 2.x API).
import tensorflow as tf
tf.random.set_seed(seed_value)
# TF1-compatibility equivalent (not needed together with the call above):
# tf.compat.v1.set_random_seed(seed_value)

In [3]:
import pandas as pd
import util.common as util
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
import pathlib
import pickle
import matplotlib.pyplot as plt

from keras.models import Model
from keras.layers import Dense, Input
from keras.regularizers import l2
from util.AUROCEarlyStoppingPruneCallback import AUROCEarlyStoppingPruneCallback

import neptune.new as neptune
from neptune.new.types import File

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import ThresholdPruner
import optuna.visualization as vis

In [5]:
# Load the cleaned data splits from disk.
# Note that `n_benign_val` is handed to `load_data` as its `train_size` argument.
clean_dir = "/project/data/cicids2017/clean/"
n_benign_val = 1500
(
    x_benign_train, y_benign_train,
    x_benign_val, y_benign_val,
    x_benign_test, y_benign_test,
    x_malicious_train, y_malicious_train,
    x_malicious_test, y_malicious_test,
    attack_type_train, attack_type_test, attack_type,
) = util.load_data(
    clean_dir,
    sample_size=1948,
    train_size=n_benign_val,
    val_size=6815,
    test_size=56468,
)

                                         #Original  #Sampled  #Train  \
Class        Impl                                                      
Benign       Benign                        2071822     64783    1500   
(D)DOS       DDoS                           128014       775       0   
             DoS GoldenEye                   10286        63       0   
             DoS Hulk                       172726      1046       0   
             DoS Slowhttptest                 5228        31       0   
             DoS slowloris                    5383        33       0   
             ALL                            321637      1948       0   
Botnet       Bot                              1948      1948       0   
             ALL                              1948      1948       0   
Brute Force  FTP-Patator                      5931      1263       0   
             SSH-Patator                      3219       685       0   
             ALL                              9150      1948    

In [5]:
# Experiment configuration. `algorithm` and `dataset` become the Neptune run tags,
# `neptune_project` selects the Neptune project, and `stage`/`algorithm` form part of
# the local results directory.
algorithm = "nn" # alternative value noted by the author: "rf"
neptune_project = "Stage2-Multi"
dataset = "cic-ids-2017"
stage = "stage2" # used as a path component of the results directory

In [13]:
# Hold out 1500 malicious samples for validation. The split is stratified on the
# fine-grained attack type so both parts keep similar attack-type proportions.
x_train, x_val, y_train, y_val, at_train, at_val = train_test_split(
    x_malicious_train,
    y_malicious_train,
    attack_type_train,
    stratify=attack_type_train,
    test_size=1500,
    random_state=seed_value,  # reuse the notebook-wide seed instead of a second hard-coded 42
    shuffle=True,
)

In [11]:
# Number of samples per attack type in the malicious training split
at_train.value_counts()

Bot                           1064
PortScan                      1064
Web Attack  Brute Force       730
FTP-Patator                    686
DoS Hulk                       577
DDoS                           422
SSH-Patator                    378
Web Attack  XSS               322
DoS GoldenEye                   31
DoS Slowhttptest                17
DoS slowloris                   17
Web Attack  Sql Injection      12
Name: Label, dtype: int64

In [12]:
# Number of samples per attack type in the malicious validation split
at_val.value_counts()

Bot                           300
PortScan                      300
Web Attack  Brute Force      206
FTP-Patator                   198
DoS Hulk                      155
DDoS                          121
SSH-Patator                   102
Web Attack  XSS               93
DoS GoldenEye                  13
DoS slowloris                   6
DoS Slowhttptest                5
Web Attack  Sql Injection      1
Name: Label, dtype: int64

In [7]:
# One-hot encode the class labels; the column order of `y_train_n` defines the
# meaning of each network output.
y_train_n = pd.get_dummies(y_train)
# Force the validation targets onto the training columns. Encoding them independently
# would silently shift or drop a column whenever a class is missing from the split.
y_val_n = pd.get_dummies(y_val).reindex(columns=y_train_n.columns, fill_value=0)

In [8]:
# Open-set validation data: the malicious validation samples followed by the benign
# samples, which are labelled "Unknown" because the classifier is not trained on them.
x_val_unk = np.concatenate((x_val, x_benign_train))
# Size the label vector from the appended data itself so features and labels cannot
# get out of step if the loader returns a different number of benign rows.
y_val_unk = np.concatenate((y_val, np.full(len(x_benign_train), "Unknown")))
y_val_unk_n = pd.get_dummies(y_val_unk)

In [9]:
# Number of samples per attack class in the training labels
np.unique(y_train, return_counts=True)

(array(['(D)DOS', 'Botnet', 'Brute Force', 'Port Scan', 'Web Attack'],
       dtype=object),
 array([1063, 1063, 1063, 1063, 1063]))

In [10]:
# Number of samples per class in the open-set validation labels (attack classes plus "Unknown")
np.unique(y_val_unk, return_counts=True)

(array(['(D)DOS', 'Botnet', 'Brute Force', 'Port Scan', 'Unknown',
        'Web Attack'], dtype=object),
 array([ 300,  300,  300,  300, 1500,  300]))

In [11]:
# Quantile scaling with a uniform (rather than normal) output distribution, range [0, 1].
# The scaler is fitted on the training split only and then applied to every split.
scaler = QuantileTransformer(output_distribution='uniform')
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_val_s = scaler.transform(x_val)
x_val_unk_s = scaler.transform(x_val_unk)

In [12]:
# Start the Neptune run. The API token is read from the NEPTUNE_API_TOKEN environment
# variable so that no secret is stored in the notebook.
# NOTE(review): any token that was ever committed in this notebook should be revoked.
run = neptune.init(
    project=f'verkerken/{neptune_project}',
    tags=[dataset, algorithm],
    api_token=os.environ.get('NEPTUNE_API_TOKEN'),
)

Info (NVML): NVML Shared Library Not Found. GPU usage metrics may not be reported. For more information, see https://docs-legacy.neptune.ai/logging-and-managing-experiment-results/logging-experiment-data.html#hardware-consumption 


https://ui.neptune.ai/verkerken/Stage2-Multi/e/STAG-6


In [13]:
# Artefacts of this run are stored under results/<stage>/<algorithm>/<Neptune run id>.
run_id = run['sys/id'].fetch()
save_dir = "results/{}/{}/{}".format(stage, algorithm, run_id)
pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True)

In [14]:
def create_nn(params):
    """Build and compile a fully connected Keras classifier from a parameter dict.

    Keys read from ``params``:
        n_neurons: list of layer widths; the first entry is the input dimension,
            the last entry is the output width, everything in between is a hidden layer.
        hidden_activation / output_activation: activations of the hidden and output layers.
        l2_reg: factor of the L2 activity regulariser attached to every Dense layer.
        optimizer / loss: passed straight to ``Model.compile``.

    Returns the compiled model, which tracks categorical accuracy as its metric.
    """
    layer_sizes = params["n_neurons"]
    input_dim = layer_sizes[0]
    hidden_sizes = layer_sizes[1:-1]
    output_dim = layer_sizes[-1]

    inputs = Input(shape=(input_dim,))
    tensor = inputs

    # Stack the hidden layers in the order they are listed
    for width in hidden_sizes:
        hidden_layer = Dense(
            width,
            activation=params['hidden_activation'],
            activity_regularizer=l2(params["l2_reg"]),
        )
        tensor = hidden_layer(tensor)

    output_layer = Dense(
        output_dim,
        activation=params['output_activation'],
        activity_regularizer=l2(params['l2_reg']),
    )
    outputs = output_layer(tensor)

    network = Model(inputs=inputs, outputs=outputs)
    network.compile(
        optimizer=params['optimizer'],
        loss=params['loss'],
        metrics=[tf.keras.metrics.CategoricalAccuracy()],
    )
    return network

In [15]:
# Baseline hyperparameters for the neural network. `objective_nn` works on a copy of
# this dict and replaces `n_neurons` and `l2_reg` for every trial.
params_nn = dict(
    scaler="quantile",
    output_activation="softmax",
    hidden_activation="relu",
    optimizer="adam",
    loss="categorical_crossentropy",
    n_neurons=[67, 50, 5],
    l2_reg=0.1,
)

In [23]:
def objective_nn(trial):
    """Optuna objective: train one candidate network and score it on the open-set validation data.

    The trial samples the number of hidden layers (1 to 6), the width of each hidden
    layer (at least 5 and at most the width of the preceding layer) and the L2
    activity-regularisation factor (log-uniform between 1e-10 and 1e-1). The network is
    trained on the scaled training data with early stopping on the validation loss and
    then used to predict the validation set that also contains "Unknown" samples.

    Side effects: prints the trial parameters, logs metrics, plots, training history and
    the saved model to the Neptune ``run``, stores both F1 scores as trial user
    attributes and writes the model to ``<save_dir>/models/model_<trial number>.h5``.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        The ``f1_weighted`` entry of the result reported by ``util.optimal_fscore_multi``.
    """
    # The one-hot training columns define both the output width and the label of each output
    class_names = y_train_n.columns

    params = params_nn.copy()
    params['trial_id'] = trial.number
    n_layers = trial.suggest_int("n_layers", 1, 6)
    params['n_neurons'] = [x_train_s.shape[1]]
    for i in range(n_layers):
        # Each hidden layer may be at most as wide as the previous layer, but never below 5
        params['n_neurons'].append(trial.suggest_int(f'n_layer_{i}', 5, max(5, params['n_neurons'][-1])))
    params['n_neurons'].append(len(class_names))
    # suggest_float(..., log=True) is the non-deprecated form of suggest_loguniform
    params["l2_reg"] = trial.suggest_float('l2', 1e-10, 1e-1, log=True)
    print(params)
    model = create_nn(params)
    history = model.fit(
        x_train_s,
        y_train_n,
        validation_data=(x_val_s, y_val_n),
        epochs=50,
        shuffle=True,
        verbose=0,
        callbacks=[tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            min_delta=0.01,
            mode='min',
            restore_best_weights=True,
            verbose=1
        )]
    )

    y_pred = model.predict(x_val_unk_s)

    # Find optimal threshold for unknown class with F1 score (macro & weighted)
    fmacro, fweight, thresholds, f_best = util.optimal_fscore_multi(y_val_unk, y_pred, class_names)
    fig = util.plot_f_multi(fmacro, fweight, thresholds, f_best)

    # A sample keeps its most probable class only when that probability exceeds the
    # threshold; otherwise it is reported as 'Unknown'.
    top_probability = np.max(y_pred, axis=1)
    top_class = class_names[np.argmax(y_pred, axis=1)]
    y_pred_weight = np.where(top_probability > f_best["f1_weighted_threshold"], top_class, 'Unknown')
    y_pred_macro = np.where(top_probability > f_best["f1_macro_threshold"], top_class, 'Unknown')

    # Plot confusion matrix for optimal threshold
    classes = [*class_names, 'Unknown']
    fig_weight = util.plot_confusion_matrix(y_val_unk, y_pred_weight, values=classes, labels=classes)
    fig_macro = util.plot_confusion_matrix(y_val_unk, y_pred_macro, values=classes, labels=classes)

    # Log metrics to neptune
    run["metrics"].log(f_best)
    run["params"].log(params)
    run[f"trials/{trial.number}"] = f_best
    run[f"trials/{trial.number}/fscore_plot"].upload(fig)
    run[f"trials/{trial.number}/confusion_weight"].upload(fig_weight)
    run[f"trials/{trial.number}/confusion_macro"].upload(fig_macro)
    run[f"trials/{trial.number}/history"] = history.history
    run["f1_macro"].log(f_best['f1_macro'])
    run["f1_weight"].log(f_best['f1_weighted'])

    trial.set_user_attr("f1_macro", f_best['f1_macro'])
    trial.set_user_attr("f1_weight", f_best['f1_weighted'])

    # Save Keras model locally and upload to neptune. The directory is created
    # explicitly instead of relying on the saver to do so.
    model_path = f'{save_dir}/models/model_{trial.number}.h5'
    pathlib.Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    model.save(model_path)
    run[f"trials/{trial.number}/model"].upload(model_path)
    # Release the figures created for this trial so they do not accumulate across trials
    plt.close('all')

    return f_best['f1_weighted']

In [26]:
# Create the Optuna study named after the Neptune run, or resume it if it already exists.
# The storage URL can be overridden through the OPTUNA_STORAGE environment variable so
# that real database credentials need not be committed; the fallback is the original
# local development database.
# NOTE(review): n_startup_trials (200) far exceeds n_trials (4), so the sampler stays in
# its random start-up phase unless the resumed study already holds enough trials.
study = optuna.create_study(
    study_name=run_id,
    direction='maximize',
    sampler=TPESampler(n_startup_trials=200, n_ei_candidates=24, multivariate=True),
    storage=os.environ.get('OPTUNA_STORAGE', 'mysql://optuna:optuna@localhost/optuna_db'),
    load_if_exists=True
)
study.optimize(objective_nn, n_trials=4)

[32m[I 2021-05-03 13:52:14,190][0m Using an existing study with name 'STAG-6' instead of creating a new one.[0m


{'scaler': 'quantile', 'output_activation': 'softmax', 'hidden_activation': 'relu', 'optimizer': 'adam', 'loss': 'categorical_crossentropy', 'n_neurons': [67, 57, 57, 45, 22, 5], 'l2_reg': 1.2178063803439584e-07, 'trial_id': 4}
Restoring model weights from the end of the best epoch.
Epoch 00006: early stopping


[32m[I 2021-05-03 13:52:33,648][0m Trial 4 finished with value: 0.8847055064314988 and parameters: {'n_layers': 4, 'n_layer_0': 57, 'n_layer_1': 57, 'n_layer_2': 45, 'n_layer_3': 22, 'l2': 1.2178063803439584e-07}. Best is trial 4 with value: 0.884705.[0m


{'scaler': 'quantile', 'output_activation': 'softmax', 'hidden_activation': 'relu', 'optimizer': 'adam', 'loss': 'categorical_crossentropy', 'n_neurons': [67, 6, 6, 5, 5], 'l2_reg': 9.225971635985735e-05, 'trial_id': 5}
Restoring model weights from the end of the best epoch.
Epoch 00021: early stopping


[32m[I 2021-05-03 13:52:53,935][0m Trial 5 finished with value: 0.7768350359098047 and parameters: {'n_layers': 3, 'n_layer_0': 6, 'n_layer_1': 6, 'n_layer_2': 5, 'l2': 9.225971635985735e-05}. Best is trial 4 with value: 0.884705.[0m


{'scaler': 'quantile', 'output_activation': 'softmax', 'hidden_activation': 'relu', 'optimizer': 'adam', 'loss': 'categorical_crossentropy', 'n_neurons': [67, 41, 33, 29, 16, 5, 5], 'l2_reg': 9.726086000265545e-05, 'trial_id': 6}
Restoring model weights from the end of the best epoch.
Epoch 00027: early stopping


[32m[I 2021-05-03 13:53:17,785][0m Trial 6 finished with value: 0.776762429048587 and parameters: {'n_layers': 5, 'n_layer_0': 41, 'n_layer_1': 33, 'n_layer_2': 29, 'n_layer_3': 16, 'n_layer_4': 5, 'l2': 9.726086000265545e-05}. Best is trial 4 with value: 0.884705.[0m


{'scaler': 'quantile', 'output_activation': 'softmax', 'hidden_activation': 'relu', 'optimizer': 'adam', 'loss': 'categorical_crossentropy', 'n_neurons': [67, 48, 46, 27, 22, 19, 5], 'l2_reg': 0.004772204156371458, 'trial_id': 7}
Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping


[32m[I 2021-05-03 13:53:38,163][0m Trial 7 finished with value: 0.8558060400766178 and parameters: {'n_layers': 5, 'n_layer_0': 48, 'n_layer_1': 46, 'n_layer_2': 27, 'n_layer_3': 22, 'n_layer_4': 19, 'l2': 0.004772204156371458}. Best is trial 4 with value: 0.884705.[0m


In [27]:
# Export all trials, best objective value first, to disk and to Neptune.
results = study.trials_dataframe().sort_values(by="value", ascending=False)
results.to_csv(f"{save_dir}/results.csv")

run['results_df'].upload(File.as_html(results))
run['optuna/study'].upload(File.as_pickle(study))

# Neptune field -> Optuna visualisation uploaded to it (uploaded in this order)
optuna_plots = {
    'optuna/param_importances_2': vis.plot_param_importances,
    'optuna/optimization_history': vis.plot_optimization_history,
    'optuna/param_slice': vis.plot_slice,
    'optuna/parallel_coordinate': vis.plot_parallel_coordinate,
    'optuna/param_contour': vis.plot_contour,
}
for neptune_field, make_plot in optuna_plots.items():
    run[neptune_field].upload(make_plot(study))