In [11]:
# Reload imported modules automatically before each cell is executed, so edits
# to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [12]:
import numpy as np
import pandas as pd
import util.common as util

# Load Data

In [13]:
# Directory handed to the project's data loader (per its path, the cleaned CICIDS2017 data).
clean_dir = "/project/data/cicids2017/clean/"
# Unpack the eight values returned by util.load_data.
# NOTE(review): the last two names pair `x_multi_train` with `y_multi_test`;
# confirm against util.load_data's return order that the final value is not
# actually the multi-class training labels under a misleading name.
x_binary_train, y_binary_train, x_binary_val, y_binary_val, x_binary_test, y_binary_test, x_multi_train, y_multi_test = util.load_data(clean_dir, train_size=20000, sample_size=1948)

(D)DOS          321637
Port Scan        90694
Brute Force       9150
Web Attack        2143
Botnet            1948
Infiltration        36
Heartbleed          11
Name: Label, dtype: int64
Attack type:    #Original:     #Sampled:      #Train:       #Test:
      (D)DOS        321637          1948         1363          585
      Botnet          1948          1948         1363          585
 Brute Force          9150          1948         1363          585
  Heartbleed            11            11            0           11
Infiltration            36            36            0           36
   Port Scan         90694          1948         1363          585
  Web Attack          2143          1948         1363          585


In [14]:
# Distinct label values and their counts in the binary training split
# (the recorded output contains only the label 1.).
np.unique(y_binary_train, return_counts=True)

(array([1.]), array([20000]))

In [15]:
# Distinct label values and their counts in the binary validation split
# (the recorded output contains both -1. and 1.).
np.unique(y_binary_val, return_counts=True)

(array([-1.,  1.]), array([  6815, 100000]))

In [16]:
# Distinct label values and their counts in the binary test split
# (the recorded output contains both -1. and 1.).
np.unique(y_binary_test, return_counts=True)

(array([-1.,  1.]), array([ 2972, 30000]))

## Normalise data

In [7]:
# from sklearn.preprocessing import QuantileTransformer

# binary_scaler = QuantileTransformer(output_distribution='normal')
# x_binary_train_s = binary_scaler.fit_transform(x_binary_train)
# x_binary_val_s = binary_scaler.transform(x_binary_val)
# x_binary_test_s = binary_scaler.transform(x_binary_test)

In [8]:
# from sklearn.preprocessing import MinMaxScaler

# binary_scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
# x_binary_train_s = binary_scaler.fit_transform(x_binary_train)
# x_binary_val_s = binary_scaler.transform(x_binary_val)
# x_binary_test_s = binary_scaler.transform(x_binary_test)

In [17]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, MinMaxScaler
# Candidate feature scalers, keyed by the name later used to pick a scaled dataset.
scalers = {
    'standard': StandardScaler(),
    'robust': RobustScaler(quantile_range=(25, 75)),
    # Seeded (like the PCA step in this notebook) so that any random subsampling
    # done while estimating the quantiles is repeatable across runs.
    'quantile': QuantileTransformer(output_distribution='normal', random_state=42),
    'minmax': MinMaxScaler(feature_range=(0, 1), copy=True)
}
# Scaled copies of the training and validation features, one entry per scaler name.
x_train = {}
x_val = {}
for key, value in scalers.items():
    # Fit on the training split only, then apply the fitted transform to validation.
    x_train[key] = value.fit_transform(x_binary_train)
    x_val[key] = value.transform(x_binary_val)

# Train Model

In [18]:
from sklearn.decomposition import PCA
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline

In [19]:
def create_model(params):
    """Build the PCA -> One-Class SVM pipeline and apply hyper-parameter overrides.

    Parameters
    ----------
    params : dict
        Overrides in scikit-learn's ``<step>__<parameter>`` form, e.g.
        ``{"pca__n_components": 19, "ocsvm__nu": 0.0358}``. A ``"scaler"``
        entry is ignored: this notebook stores the name of the pre-scaled
        dataset in the same dict, and it is not a parameter of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``"pca"`` and ``"ocsvm"``.

    Raises
    ------
    ValueError
        Raised by ``Pipeline.set_params`` for any other key that is not a
        valid parameter of the pipeline.
    """
    pipeline = Pipeline(
        [
            ("pca", PCA(n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=42)),
            ("ocsvm", OneClassSVM(kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, nu=0.5, shrinking=True, cache_size=200, verbose=True, max_iter=-1))
        ]
    )
    # Drop only the dataset-selection key; every other unknown key must still
    # fail loudly so that typos in hyper-parameter names are not hidden.
    step_params = {name: value for name, value in params.items() if name != "scaler"}
    return pipeline.set_params(**step_params)

In [20]:
# Hyper-parameter overrides passed to create_model, keyed as <pipeline step>__<parameter>.
params = {
#     "scaler": "standard",
    "pca__n_components": 19,
    "ocsvm__kernel": "rbf",
    "ocsvm__gamma": 0.1318,
    'ocsvm__nu': 0.0358
}

In [56]:
# Alternative hyper-parameter set (6 PCA components; nu is left at the value
# hard-coded in create_model because its override is commented out).
# It is stored under its own name: this cell sits below the primary `params`
# cell, so assigning to `params` here would silently replace the values that
# the recorded fitted-pipeline output reflects (n_components=19, gamma=0.1318,
# nu=0.0358) whenever the notebook is run top to bottom.
# To try this configuration, pass `params_alt` to create_model explicitly.
params_alt = {
#     "scaler": "standard",
    "pca__n_components": 6,
    "ocsvm__kernel": "rbf",
    "ocsvm__gamma": 0.162411,
#     'ocsvm__nu': 0.635848336984203345
}

In [21]:
# Build the unfitted PCA + One-Class SVM pipeline from the current `params`.
model = create_model(params)

In [58]:
# Select the standard-scaled dataset.
# NOTE(review): the following cell reassigns this key to "quantile", so on a
# top-to-bottom run this value is never used; this cell's execution count (58)
# is also out of sequence with its neighbours.
params["scaler"] = "standard"

In [22]:
# Name of the pre-scaled dataset (a key of x_train / x_val) used for fitting and scoring below.
params["scaler"] = "quantile"

In [23]:
# Fit the pipeline on the training features scaled with the selected scaler; no labels are passed.
model.fit(x_train[params["scaler"]])

[LibSVM]

Pipeline(steps=[('pca', PCA(n_components=19, random_state=42)),
                ('ocsvm', OneClassSVM(gamma=0.1318, nu=0.0358, verbose=True))])

In [24]:
# Decision-function values of the fitted pipeline for the validation features,
# scaled with the same scaler that was used for training.
scores = model.decision_function(x_val[params["scaler"]])

In [25]:
# Compute the validation metrics from the labels and the negated scores.
# NOTE(review): the negation presumably turns the decision values into
# "higher = more anomalous" scores expected by util.evaluate_results — confirm
# against that helper's contract.
val_metrics = util.evaluate_results(y_binary_val, -scores)
val_metrics

f1                   precision    0.405302
                     recall       0.482318
                     f1           0.440469
                     f2           0.464659
f2                   precision    0.300039
                     recall       0.560235
                     f1           0.390788
                     f2           0.477429
f1threshold                       0.052452
f2threshold                      -0.002256
au_precision_recall               0.310581
auroc                             0.812241
dtype: float64

# Log final results to disk and Neptune

In [1]:
import optuna

In [2]:
# SQLite file that holds the Optuna studies of the binary OCSVM experiments.
study_storage = 'results/binary/ocsvm.db'
# Show the names of all studies found in that storage.
[
    summary.study_name
    for summary in optuna.study.get_all_study_summaries(storage=f"sqlite:///{study_storage}")
]

['TWOS-47',
 'TWOS-48',
 'TWOS-49',
 'TWOS-50',
 'TWOS-51',
 'TWOS-52',
 'TWOS-53',
 'TWOS-54',
 'TWOS-55']

In [3]:
# Study to load, and the study-specific directory path derived from its name.
study_name = "TWOS-55"
save_dir = f'results/binary/ocsvm/{study_name}'

In [5]:
import os

# Load the selected study from the SQLite storage.
study = optuna.load_study(study_name=study_name, storage=f"sqlite:///{study_storage}")
# Tabulate all trials, highest objective value first. The sort is chained rather
# than done in place so that re-running the cell always starts from a fresh frame.
results = study.trials_dataframe().sort_values(by=['value'], ascending=False)
# DataFrame.to_csv does not create missing directories, so make sure the
# study-specific output directory exists before writing.
os.makedirs(save_dir, exist_ok=True)
results.to_csv(f'{save_dir}/result.csv')
results.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_gamma,params_n_comp,params_nu,params_scaler,user_attrs_au_precision_recall,...,user_attrs_f1f2,user_attrs_f1precision,user_attrs_f1recall,user_attrs_f1threshold,user_attrs_f2f1,user_attrs_f2f2,user_attrs_f2precision,user_attrs_f2recall,user_attrs_f2threshold,state
1067,1067,0.885405,2021-03-04 15:46:29.951131,2021-03-04 15:46:45.883670,0 days 00:00:15.932539,0.061968,22.0,0.084535,quantile,0.327007,...,0.525975,0.34098,0.608511,0.085758,0.40938,0.55366,0.285417,0.723698,-0.003608,COMPLETE
828,828,0.884954,2021-03-04 15:25:56.776596,2021-03-04 15:26:10.968010,0 days 00:00:14.191414,0.063872,25.0,0.088535,quantile,0.326744,...,0.545055,0.346267,0.63639,0.100979,0.44139,0.552707,0.330463,0.664417,0.055183,COMPLETE
1666,1666,0.88325,2021-03-04 16:38:27.843841,2021-03-04 16:38:39.820689,0 days 00:00:11.976848,0.065431,26.0,0.07623,quantile,0.317972,...,0.537845,0.373155,0.604549,0.116185,0.453525,0.558835,0.345129,0.661189,0.045097,COMPLETE
1079,1079,0.882658,2021-03-04 15:47:23.818171,2021-03-04 15:47:37.811332,0 days 00:00:13.993161,0.050753,26.0,0.079925,quantile,0.321438,...,0.408571,0.43829,0.401761,0.666783,0.406865,0.543107,0.28691,0.699193,-0.003747,COMPLETE
1104,1104,0.882135,2021-03-04 15:49:11.975766,2021-03-04 15:49:22.706789,0 days 00:00:10.731023,0.057988,24.0,0.073578,quantile,0.316762,...,0.492283,0.353752,0.545708,0.09996,0.399862,0.53055,0.283481,0.678357,-0.003097,COMPLETE


In [6]:
import neptune
from neptunecontrib.monitoring.optuna import log_study_info

import os

# The API token is a credential and must not be stored in the notebook: read it
# from the NEPTUNE_API_TOKEN environment variable instead (a missing variable
# raises KeyError). The token that was previously committed here should be
# treated as leaked and revoked.
project = neptune.init(project_qualified_name='verkerken/two-stage-binary', api_token=os.environ["NEPTUNE_API_TOKEN"])
# Look up the Neptune experiment whose id equals the study name and take the first match.
my_exp = project.get_experiments(id=study_name)[0]
# Attach the Optuna study information to that experiment.
log_study_info(study, experiment=my_exp)

In [7]:
from neptunecontrib.api.table import log_table
# Log the sorted trials table to the Neptune experiment under the name "results_overview".
log_table("results_overview", results, experiment=my_exp)

# Explore Results

In [27]:
# Rank the trials by their recorded AUROC attribute, keep the best 100 and
# display the last five of those (ranks 96-100).
(
    results
    .sort_values(by=['user_attrs_auroc'], ascending=False)
    .head(100)
    .tail()
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_gamma,params_n_comp,params_nu,params_scaler,user_attrs_au_precision_recall,...,user_attrs_f1f2,user_attrs_f1precision,user_attrs_f1recall,user_attrs_f1threshold,user_attrs_f2f1,user_attrs_f2f2,user_attrs_f2precision,user_attrs_f2recall,user_attrs_f2threshold,state
823,823,0.86733,2021-03-04 15:25:33.765429,2021-03-04 15:25:56.538397,0 days 00:00:22.772968,0.050731,22.0,0.204466,quantile,0.287657,...,0.509094,0.269995,0.653852,1.947488,0.360162,0.511985,0.241035,0.712106,0.780013,COMPLETE
758,758,0.86725,2021-03-04 15:19:07.416138,2021-03-04 15:19:18.805844,0 days 00:00:11.389706,0.092558,21.0,0.075272,quantile,0.292621,...,0.468881,0.30302,0.543213,0.045633,0.366499,0.488092,0.258974,0.626706,-0.002916,COMPLETE
405,405,0.867204,2021-03-04 14:47:29.990543,2021-03-04 14:47:52.951196,0 days 00:00:22.960653,0.127771,55.0,0.139007,quantile,0.299362,...,0.51946,0.367564,0.57931,0.871544,0.445139,0.524526,0.355472,0.595304,0.756419,COMPLETE
286,286,0.867191,2021-03-04 14:35:32.326251,2021-03-04 14:35:53.952912,0 days 00:00:21.626661,0.121541,48.0,0.131462,quantile,0.300593,...,0.520171,0.370548,0.578577,0.757501,0.449282,0.525295,0.361981,0.592076,0.673755,COMPLETE
1061,1061,0.867124,2021-03-04 15:45:43.861240,2021-03-04 15:46:04.667962,0 days 00:00:20.806722,0.098894,41.0,0.133105,quantile,0.298383,...,0.510127,0.34711,0.57799,0.702278,0.427867,0.516111,0.33298,0.598386,0.561144,COMPLETE
