In [1]:
%load_ext autoreload
%autoreload 2

In [48]:
import numpy as np
import pandas as pd
import util.common as util

# Load Data

In [65]:
clean_dir = "/project/data/cicids2017/clean/"
x_benign_train, y_benign_train, x_binary_val, y_binary_val, x_binary_test, y_binary_test, x_multi_train, y_multi_test = util.load_data(clean_dir, sample_size=1948)

(D)DOS          321637
Port Scan        90694
Brute Force       9150
Web Attack        2143
Botnet            1948
Infiltration        36
Heartbleed          11
Name: Label, dtype: int64
Attack type:    #Original:     #Sampled:      #Train:       #Test:
      (D)DOS        321637          1948         1363          585
      Botnet          1948          1948         1363          585
 Brute Force          9150          1948         1363          585
  Heartbleed            11            11            0           11
Infiltration            36            36            0           36
   Port Scan         90694          1948         1363          585
  Web Attack          2143          1948         1363          585


In [66]:
x_benign_train.shape

(100000, 67)

In [72]:
np.unique(y_binary_val, return_counts=True)

(array([-1.,  1.]), array([  6815, 100000]))

In [73]:
np.unique(y_binary_test, return_counts=True)

(array([-1.,  1.]), array([ 2972, 30000]))

In [50]:
df_benign = pd.read_parquet("/project/data/cicids2017/clean/all_benign.parquet")
df_malicious = pd.read_parquet("/project/data/cicids2017/clean/all_malicious.parquet")

In [4]:
df_benign.columns

Index(['Destination Port', 'Protocol', 'Timestamp', 'Flow Duration',
       'Total Fwd Packets', 'Total Backward Packets',
       'Fwd Packets Length Total', 'Bwd Packets Length Total',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
       'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count

In [5]:
df_benign.shape

(2071822, 70)

In [6]:
df_malicious.shape

(425619, 70)

In [7]:
df_benign['Label'].value_counts()

Benign    2071822
Name: Label, dtype: int64

In [8]:
df_malicious['Label'].value_counts()

DoS Hulk                      172726
DDoS                          128014
PortScan                       90694
DoS GoldenEye                  10286
FTP-Patator                     5931
DoS slowloris                   5383
DoS Slowhttptest                5228
SSH-Patator                     3219
Bot                             1948
Web Attack  Brute Force        1470
Web Attack  XSS                 652
Infiltration                      36
Web Attack  Sql Injection        21
Heartbleed                        11
Name: Label, dtype: int64

# Preprocess Data

## Benign

In [9]:
from sklearn.model_selection import train_test_split

train_size = 10000 # 1M
val_size = 100000   # 100k
test_size = 30000   # 100k
x_binary = df_benign.drop(columns=['Label', 'Timestamp', 'Destination Port'])
x_binary_train, x_benign_test = train_test_split(x_binary, train_size=train_size, random_state=42, shuffle=True)
x_benign_val, x_benign_test = train_test_split(x_benign_test, train_size=val_size, test_size=test_size, random_state=42, shuffle=True)

In [10]:
x_binary_train.shape

(10000, 67)

In [11]:
x_benign_val.shape

(100000, 67)

In [12]:
x_benign_test.shape

(30000, 67)

## Malicious

In [13]:
df_malicious.Label = df_malicious.Label.map({
    'DoS Hulk':'(D)DOS', 
    'PortScan':'Port Scan', 
    'DDoS':'(D)DOS', 
    'DoS slowloris':'(D)DOS', 
    'DoS Slowhttptest':'(D)DOS', 
    'DoS GoldenEye':'(D)DOS', 
    'SSH-Patator':'Brute Force', 
    'FTP-Patator':'Brute Force', 
    'Bot': 'Botnet', 
    'Web Attack \x96 Brute Force': 'Web Attack', 
    'Web Attack \x96 Sql Injection': 'Web Attack', 
    'Web Attack \x96 XSS': 'Web Attack',
    'Infiltration': 'Infiltration',
    'Heartbleed': 'Heartbleed'
})
df_malicious.Label.value_counts()

(D)DOS          321637
Port Scan        90694
Brute Force       9150
Web Attack        2143
Botnet            1948
Infiltration        36
Heartbleed          11
Name: Label, dtype: int64

In [14]:
random_state = np.random.RandomState(42)
sample_size = 1948 # balance dataset, minimum reasonable number of samples per attack type
train_size = 0.7

train_idx = np.empty((0,), dtype=int)
test_idx = np.empty((0,), dtype=int)

print('{:>12}  {:>12}  {:>12} {:>12} {:>12}'.format("Attack type:", "#Original:", "#Sampled:", "#Train:", "#Test:"))
for attack_type in np.unique(df_malicious.Label):
    attack_type_count = np.count_nonzero(df_malicious.Label == attack_type)
    attack_idx = random_state.choice(range(attack_type_count), size=min(attack_type_count, sample_size), replace=False)
    if attack_type_count < sample_size:
        # Use attack class for testing only, not enough samples for training
        test_idx = np.concatenate((test_idx, np.flatnonzero(df_malicious.Label == attack_type)[attack_idx]))
        print('{:>12}  {:>12}  {:>12} {:>12} {:>12}'.format(attack_type, attack_type_count, attack_idx.shape[0], 0, attack_idx.shape[0]))
    else:
        # Splits attack class over train and test set in stratified manner
        attack_train_idx = random_state.choice(range(attack_idx.shape[0]), size=int(train_size*sample_size), replace=False)
        attack_test_mask = np.ones(sample_size, dtype=bool)
        attack_test_mask[attack_train_idx] = False
        attack_test_idx = attack_idx[attack_test_mask]
        attack_original_idx = np.flatnonzero(df_malicious.Label == attack_type)
        train_idx = np.concatenate((train_idx, attack_original_idx[attack_train_idx]))
        test_idx = np.concatenate((test_idx, attack_original_idx[attack_test_idx]))
        print('{:>12}  {:>12}  {:>12} {:>12} {:>12}'.format(attack_type, attack_type_count, attack_idx.shape[0], attack_train_idx.shape[0], attack_test_idx.shape[0]))

random_state.shuffle(train_idx)
random_state.shuffle(test_idx)

# df_multi_train = df_malicious.iloc[train_idx]
# df_multi_test = df_malicious.iloc[test_idx]
# df_multi_leftover = df_malicious[~df_malicious.index.isin(np.concatenate((train_idx, test_idx)))]

Attack type:    #Original:     #Sampled:      #Train:       #Test:
      (D)DOS        321637          1948         1363          585
      Botnet          1948          1948         1363          585
 Brute Force          9150          1948         1363          585
  Heartbleed            11            11            0           11
Infiltration            36            36            0           36
   Port Scan         90694          1948         1363          585
  Web Attack          2143          1948         1363          585


In [15]:
y_multi = df_malicious.Label
x_multi = df_malicious.drop(columns=['Label', 'Timestamp', 'Destination Port'])
x_multi_train, x_malicious_test, y_multi_train, y_malicious_test = (x_multi.iloc[train_idx], x_multi.iloc[test_idx], y_multi.iloc[train_idx], y_multi.iloc[test_idx])

In [16]:
y_multi_train.value_counts()

Web Attack     1363
Brute Force    1363
(D)DOS         1363
Port Scan      1363
Botnet         1363
Name: Label, dtype: int64

In [17]:
y_malicious_test.value_counts()

(D)DOS          585
Web Attack      585
Port Scan       585
Botnet          585
Brute Force     585
Infiltration     36
Heartbleed       11
Name: Label, dtype: int64

## Binary

In [18]:
# Add malicious samples to binary validation and test data
x_binary_val = np.concatenate((x_benign_val, x_multi_train))
y_binary_val = np.concatenate((np.full(x_benign_val.shape[0], 1), np.full(x_multi_train.shape[0], -1)))

x_binary_test = np.concatenate((x_benign_test, x_malicious_test))
y_binary_test = np.concatenate((np.full(x_benign_test.shape[0], 1), np.full(x_malicious_test.shape[0], -1)))

In [19]:
print(x_binary_train.shape)
print(x_binary_val.shape)
print(y_binary_val.shape)
print(x_binary_test.shape)
print(y_binary_test.shape)

(10000, 67)
(106815, 67)
(106815,)
(32972, 67)
(32972,)


## Multi-Class

In [20]:
print(x_multi_train.shape)
print(y_multi_train.shape)
print(x_malicious_test.shape)
print(y_malicious_test.shape)

(6815, 67)
(6815,)
(2972, 67)
(2972,)


## Normalisation

In [21]:
from sklearn.preprocessing import QuantileTransformer

binary_scaler = QuantileTransformer(output_distribution='normal')
x_binary_train = binary_scaler.fit_transform(x_binary_train)
x_binary_val = binary_scaler.transform(x_binary_val)
x_binary_test = binary_scaler.transform(x_binary_test)

In [None]:
multi_scaler = QuantileTransformer(output_distribution='normal')
x_multi_train = multi_scaler.fit_transform(x_multi_train)
x_malicious_test = multi_scaler.transform(x_malicious_test)

# Model Training

## Binary

In [22]:
from keras.models import Model
from keras.layers import Dense, Input
from keras.regularizers import l2

# TODO Tied Weights AE
# https://towardsdatascience.com/build-the-right-autoencoder-tune-and-optimize-using-pca-principles-part-ii-24b9cca69bd6

def create_model(trial, input_dim):
    input_layer = Input(shape=(input_dim,))
    model = input_layer

    encoder_layers = trial.suggest_int('encoder_layers', 1, 5)
    l2_reg = trial.suggest_loguniform('l2', 1e-10, 1e-1)
    
    # Encoder
    hidden_neurons = [input_dim]
    for i in range(encoder_layers):
        n_neuron = trial.suggest_int(f'n_layer_{i}', 1, max(1, hidden_neurons[-1] - 1))
        hidden_neurons.append(n_neuron)
        model = Dense(n_neuron, activation='relu', activity_regularizer=l2(l2_reg))(model)

    # Decoder
    hidden_neurons = hidden_neurons[1:-1]
    for i in reversed(hidden_neurons):
        model = Dense(i, activation='relu', activity_regularizer=l2(l2_reg))(model)

    # Output Layer
    model = Dense(input_dim, activation='sigmoid', activity_regularizer=l2(l2_reg))(model)
    autoencoder = Model(inputs=input_layer, outputs=model)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    return autoencoder

In [23]:
def anomaly_scores(original, transformed):
    sse = np.sum((original - transformed)**2, axis=1)
    return sse

In [24]:
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, average_precision_score, roc_auc_score, precision_recall_curve, auc, roc_curve

def evaluate_results(y_true, score):
    precision, recall, threshold = precision_recall_curve(y_true, score, pos_label=-1)
    au_precision_recall = auc(recall, precision)
    results = pd.DataFrame({'precision': precision, 'recall': recall})
    results["f1"] = 2*precision*recall/(precision+recall)
    max_index = results["f1"].idxmax()
    best = results.loc[results["f1"].idxmax()]
    best["threshold"] = threshold[max_index]
    best["au_precision_recall"] = au_precision_recall
    fpr, tpr, thresholds = roc_curve(y_true, score, pos_label=-1)
    best["auroc"] = auc(fpr, tpr)
    return best

In [25]:
from util.AUROCEarlyStoppingPruneCallback import AUROCEarlyStoppingPruneCallback

def objective(trial, x_train, x_val, y_val):
    autoencoder = create_model(trial, input_dim=x_train.shape[1])
    history = autoencoder.fit(
        x_train,
        x_train,
        epochs=20, 
        shuffle=True,
        verbose=0,
        callbacks=[
            AUROCEarlyStoppingPruneCallback(
                x_val, 
                y_val, 
                trial,
                min_delta=0.001,
                patience=3,
                mode='max',
                restore_best_weights=True,
                verbose=1
            )]
    )
    trial.set_user_attr('epochs', len(history.history['loss']))
    trial.set_user_attr('losses', history.history['loss'])
    x_val_autoencoder = autoencoder.predict(x_val)
    val_score = anomaly_scores(x_val, x_val_autoencoder)
    val_metrics = evaluate_results(y_val, val_score)

    # Save Keras model
    autoencoder.save(f'{save_dir}models/autoencoder_{trial.number}.h5')
    return val_metrics['auroc']

In [26]:
study_name = "binary_ae"
save_dir = "results/binary/"
study_storage = "results/binary/optuna.db"

In [27]:
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import ThresholdPruner

study = optuna.create_study(
    study_name=study_name, 
    direction='maximize', 
    sampler=TPESampler(n_startup_trials=10, n_ei_candidates=24, multivariate=True),
    pruner=ThresholdPruner(lower=0.5),
    storage=f'sqlite:///{study_storage}',
    load_if_exists=True
)
study.optimize(lambda trial: objective(trial, x_binary_train, x_binary_val, y_binary_val), n_trials=5, n_jobs=-1)

# Save results
results = study.trials_dataframe()
results.sort_values(by=['value'], inplace=True, ascending=False)
results.to_csv(f'{save_dir}result.csv')

[32m[I 2021-02-25 14:24:21,724][0m A new study created in RDB with name: binary_ae[0m


Epoch #0	Validation AUROC: 0.5307303983859134	Best AUROC: -inf
Epoch #1	Validation AUROC: 0.5289071951577403	Best AUROC: 0.5307303983859134
Epoch #2	Validation AUROC: 0.5290251320616287	Best AUROC: 0.5307303983859134
Epoch #3	Validation AUROC: 0.5286518701393984	Best AUROC: 0.5307303983859134
Restoring model weights from the end of the best epoch
Epoch 00004: early stopping


  results["f1"] = 2*precision*recall/(precision+recall)
[32m[I 2021-02-25 14:24:48,419][0m Trial 0 finished with value: 0.5307303983859134 and parameters: {'encoder_layers': 1, 'l2': 0.01428888487323317, 'n_layer_0': 58}. Best is trial 0 with value: 0.5307303983859134.[0m


Epoch #0	Validation AUROC: 0.5297442956713133	Best AUROC: -inf
Epoch #1	Validation AUROC: 0.5301041166544387	Best AUROC: 0.5297442956713133
Epoch #2	Validation AUROC: 0.5302693462949376	Best AUROC: 0.5297442956713133
Epoch #3	Validation AUROC: 0.5302994622157007	Best AUROC: 0.5297442956713133
Restoring model weights from the end of the best epoch
Epoch 00004: early stopping


  results["f1"] = 2*precision*recall/(precision+recall)
[32m[I 2021-02-25 14:25:13,397][0m Trial 1 finished with value: 0.5297442956713133 and parameters: {'encoder_layers': 1, 'l2': 0.011846262117309953, 'n_layer_0': 62}. Best is trial 0 with value: 0.5307303983859134.[0m


Epoch #0	Validation AUROC: 0.5305614578136464	Best AUROC: -inf
Epoch #1	Validation AUROC: 0.5308417057960382	Best AUROC: 0.5305614578136464
Epoch #2	Validation AUROC: 0.5324568870139398	Best AUROC: 0.5305614578136464
Epoch #3	Validation AUROC: 0.5321327417461482	Best AUROC: 0.5324568870139398
Epoch #4	Validation AUROC: 0.5323611614086574	Best AUROC: 0.5324568870139398
Epoch #5	Validation AUROC: 0.5322787079970652	Best AUROC: 0.5324568870139398
Restoring model weights from the end of the best epoch
Epoch 00006: early stopping


  results["f1"] = 2*precision*recall/(precision+recall)
[32m[I 2021-02-25 14:25:53,097][0m Trial 2 finished with value: 0.5324568870139398 and parameters: {'encoder_layers': 4, 'l2': 0.0001114057508392709, 'n_layer_0': 24, 'n_layer_1': 16, 'n_layer_2': 14, 'n_layer_3': 7}. Best is trial 2 with value: 0.5324568870139398.[0m


Epoch #0	Validation AUROC: 0.5296885568598679	Best AUROC: -inf
Epoch #1	Validation AUROC: 0.5301917982391784	Best AUROC: 0.5296885568598679
Epoch #2	Validation AUROC: 0.530098345561262	Best AUROC: 0.5296885568598679
Epoch #3	Validation AUROC: 0.5295600432868672	Best AUROC: 0.5296885568598679
Restoring model weights from the end of the best epoch
Epoch 00004: early stopping


  results["f1"] = 2*precision*recall/(precision+recall)
[32m[I 2021-02-25 14:26:16,570][0m Trial 3 finished with value: 0.5296885568598679 and parameters: {'encoder_layers': 1, 'l2': 3.7410141038754e-10, 'n_layer_0': 9}. Best is trial 2 with value: 0.5324568870139398.[0m


Epoch #0	Validation AUROC: 0.5298644541452677	Best AUROC: -inf
Epoch #1	Validation AUROC: 0.5297821841526046	Best AUROC: 0.5298644541452677
Epoch #2	Validation AUROC: 0.5286525685986794	Best AUROC: 0.5298644541452677
Epoch #3	Validation AUROC: 0.527615412325752	Best AUROC: 0.5298644541452677
Restoring model weights from the end of the best epoch
Epoch 00004: early stopping


  results["f1"] = 2*precision*recall/(precision+recall)
[32m[I 2021-02-25 14:26:41,448][0m Trial 4 finished with value: 0.5298644541452677 and parameters: {'encoder_layers': 2, 'l2': 1.1395932123991422e-07, 'n_layer_0': 38, 'n_layer_1': 23}. Best is trial 2 with value: 0.5324568870139398.[0m


In [28]:
from keras.models import load_model

model = load_model(f"{save_dir}/models/autoencoder_10.h5")

In [31]:
x_pred = model.predict(x_binary_val)
scores = anomaly_scores(x_binary_val, x_pred)

In [33]:
evaluate_results(y_binary_val, scores)

  results["f1"] = 2*precision*recall/(precision+recall)


precision              1.139032e-01
recall                 5.879677e-01
f1                     1.908368e-01
threshold              4.785408e+12
au_precision_recall    7.837432e-02
auroc                  6.064048e-01
Name: 67804, dtype: float64

In [29]:
from keras.models import load_model
ae_2017 = load_model(f'/project/cic-ids-2017/results/autoencoder/optuna/models/autoencoder_443.h5')
x_pred = ae_2017.predict(x_binary_val)
score = anomaly_scores(x_binary_val, x_pred)

In [30]:
evaluate_results(y_binary_val, score)

  results["f1"] = 2*precision*recall/(precision+recall)


precision               0.223064
recall                  0.345708
f1                      0.271163
threshold              54.456198
au_precision_recall     0.232508
auroc                   0.797346
Name: 76628, dtype: float64