In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import random as rn

import os

import tensorflow as tf

#DNN
from tensorflow.keras.layers import Lambda, Concatenate, Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras import regularizers
from tensorflow.keras.initializers import glorot_normal

#sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler


import matplotlib.ticker as ticker
from numpy import load
import pickle
from tensorflow.compat.v1.keras import backend as K
import math

import time

from sklearn.metrics import *
from metrics import save_metrics_iiet

import warnings
warnings.filterwarnings("ignore")

# Expose only GPU "0" to CUDA, then report that the import cell completed.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})
print("libraries loaded")

In [None]:
def refresh_riproducibility(seed):
    """Re-seed the random number generators and request deterministic TF ops.

    Args:
        seed: integer seed applied to NumPy, TensorFlow (v2 and compat.v1
            APIs) and Python's ``random`` module; its string form is also
            written to the ``PYTHONHASHSEED`` environment variable.

    Side effects:
        Limits TensorFlow inter-op and intra-op parallelism to one thread,
        installs the current default v1 session into the compat.v1 Keras
        backend, and sets ``TF_DETERMINISTIC_OPS`` / ``TF_CUDNN_DETERMINISTIC``
        to ``'1'``.
    """
    from tensorflow.compat.v1.keras import backend as K

    os.environ['PYTHONHASHSEED'] = str(seed)

    # Seed NumPy, TensorFlow and the stdlib generator, in that order.
    for seed_rng in (np.random.seed, tf.random.set_seed, rn.seed):
        seed_rng(seed)

    # Run TensorFlow ops on a single thread.
    for limit_threads in (tf.config.threading.set_inter_op_parallelism_threads,
                          tf.config.threading.set_intra_op_parallelism_threads):
        limit_threads(1)

    # Hand the default v1 session to the compat.v1 Keras backend.
    K.set_session(tf.compat.v1.get_default_session())

    # Seed the compat.v1 graph-level generator as well.
    tf.compat.v1.set_random_seed(seed)

    # Environment flags asking TensorFlow / cuDNN for deterministic kernels.
    for flag in ('TF_DETERMINISTIC_OPS', 'TF_CUDNN_DETERMINISTIC'):
        os.environ[flag] = '1'

In [None]:
def sampling(args):
    """Reparameterised draw: ``z_mean + exp(z_logvar / 2) * eps``.

    Args:
        args: pair ``(z_mean, z_logvar)`` of tensors; ``eps`` is sampled from
            ``tf.random.normal`` with the shape of ``z_mean``.

    Returns:
        A tensor with the shape of ``z_mean``.
    """
    mean, logvar = args
    noise = tf.random.normal(shape=tf.shape(mean))
    std = tf.exp(0.5 * logvar)
    return mean + std * noise


def init_sparse_vae(params):
    """Build and compile the 'SparseVAE' model (a VAE with skip connections).

    Architecture: Dense(1024) -> Dense(512) -> (z_mean, z_logvar: Dense(256))
    -> ``sampling`` -> Dense(512) -> concat with the 512 encoder layer ->
    Dense(1024) -> concat with the 1024 encoder layer -> Dense(input_dim,
    sigmoid).

    Args:
        params: dict providing ``"input_dimension"`` (number of input and
            output features) and ``"optimizer"`` (passed to ``compile``).
            Other keys are ignored: the loss is attached with ``add_loss``,
            so ``params["loss"]`` plays no role here.

    Returns:
        The compiled ``tf.keras`` model, with ``kl_loss`` and
        ``reconstruction_loss`` registered as metrics.
    """
    input_dim = params["input_dimension"]
    output_dim = input_dim
    optimizer = params["optimizer"]

    # this is our input placeholder
    input_layer = tf.keras.layers.Input(shape=(input_dim,))

    # encoding phase
    sparse = tf.keras.layers.Dense(1024, activation='relu')(input_layer)
    encoded_l0 = tf.keras.layers.Dense(512, activation='relu')(sparse)

    z_mean = tf.keras.layers.Dense(256, name="z_mean")(encoded_l0)
    z_logvar = tf.keras.layers.Dense(256, name="z_log_var")(encoded_l0)
    latent_space = tf.keras.layers.Lambda(sampling, output_shape=(256,))([z_mean, z_logvar])

    # decoding phase (each decoder layer also sees the matching encoder layer)
    decoded_l0 = tf.keras.layers.Dense(512, activation="relu")(latent_space)

    l0_in = tf.keras.layers.Concatenate()([encoded_l0, decoded_l0])
    sparse_dec = tf.keras.layers.Dense(1024, activation="relu")(l0_in)

    l_in = tf.keras.layers.Concatenate()([sparse, sparse_dec])
    output_layer = tf.keras.layers.Dense(output_dim, activation="sigmoid")(l_in)

    # this model maps an input to its reconstruction
    model = tf.keras.models.Model(input_layer, output_layer, name='SparseVAE')
    model.summary()

    reconstruction_loss = tf.keras.losses.mean_squared_error(input_layer, output_layer)
    # Closed-form KL(N(mu, sigma^2) || N(0, 1)) with z_logvar = log(sigma^2),
    # which is the parameterisation `sampling` uses (std = exp(0.5 * z_logvar)).
    # The previous expression used 0.5*z_logvar and exp(0.5*z_logvar), i.e.
    # log(sigma) and sigma, which is not the KL divergence.
    # The mean over all latent units (rather than a per-sample sum) is kept.
    kl_loss = -0.5 * tf.reduce_mean(1. + z_logvar - tf.square(z_mean) - tf.exp(z_logvar))
    vae_loss = tf.reduce_mean(reconstruction_loss + kl_loss)

    model.add_loss(vae_loss)
    model.add_metric(kl_loss, name="kl_loss")
    model.add_metric(reconstruction_loss, name="reconstruction_loss")
    model.compile(optimizer=optimizer)

    return model
    

def build_detector(data, columns_names, params, model_selector, model_path):
    """Build the selected reconstruction model and fit it on ``data``.

    Args:
        data: DataFrame holding the training rows.
        columns_names: columns of ``data`` used both as input and as target.
        params: dict with ``"batch_size"``, ``"num_epoch"`` and
            ``"verbose_output"``, plus whatever the selected builder reads.
        model_selector: 0 = init_vae, 1 = init_deep_ae, 2 = init_sparse_ae,
            3 = init_unet, 4 = init_sparse_vae.
        model_path: where ``ModelCheckpoint`` writes the best weights
            (lowest training ``loss``).

    Returns:
        ``(model, history)`` as returned by the builder and ``model.fit``.

    Raises:
        ValueError: if ``model_selector`` is not one of the values above.
            (The original called ``sys.exit()`` without importing ``sys``,
            which surfaced as a ``NameError``.)
    """
    view = data[columns_names]
    batch_size = params["batch_size"]
    num_epoch = params["num_epoch"]
    verbose = params["verbose_output"]
    print("Model Selector "+str(model_selector))

    # Lambdas keep the lookup lazy: only the selected builder has to be
    # defined when this function runs (some live in a later notebook cell).
    builders = {
        0: lambda: init_vae(params),
        1: lambda: init_deep_ae(params),
        2: lambda: init_sparse_ae(params),
        3: lambda: init_unet(params),
        4: lambda: init_sparse_vae(params),
    }
    if model_selector not in builders:
        raise ValueError(
            "ERROR: NO MODEL for model_selector=" + str(model_selector)
            + " (expected one of " + str(sorted(builders)) + ")"
        )
    model = builders[model_selector]()

    print("best model path: ", model_path)
    check = ModelCheckpoint(model_path, monitor='loss', verbose=2, save_best_only=True, save_weights_only=True, mode='min')

    history = model.fit(view, view, batch_size=batch_size, epochs=num_epoch, verbose=verbose, callbacks=[check])

    return model, history

In [None]:
#Further models
def init_deep_ae(params):
    """Build and compile the 'DAE' model, a symmetric dense autoencoder.

    Layer widths: 256 -> 128 -> 64 -> 32 -> 64 -> 128 -> 256 (all ReLU),
    followed by a sigmoid layer as wide as the input.

    Args:
        params: dict with ``"input_dimension"``, ``"optimizer"``, ``"loss"``
            and ``"seed"`` (the seed is read but not used here).

    Returns:
        The compiled ``tf.keras`` model.
    """
    input_dim = params["input_dimension"]
    optimizer, loss = params["optimizer"], params["loss"]
    inner_seed = params["seed"]  # looked up like the other builders; unused

    dense = tf.keras.layers.Dense
    inputs = tf.keras.layers.Input(shape=(input_dim,))

    # Encoder, bottleneck (32) and decoder as one chain of ReLU layers.
    hidden = inputs
    for units in (256, 128, 64, 32, 64, 128, 256):
        hidden = dense(units, activation='relu')(hidden)

    outputs = dense(input_dim, activation="sigmoid")(hidden)

    # The model maps an input to its reconstruction.
    model = tf.keras.models.Model(inputs, outputs, name='DAE')
    model.compile(optimizer=optimizer, loss=loss)
    return model


def init_sparse_ae(params):
    """Build and compile the 'SAE' model: one wide hidden layer.

    The input goes through Dense(1024, ReLU) and then a sigmoid layer as
    wide as the input.

    Args:
        params: dict with ``"input_dimension"``, ``"optimizer"``, ``"loss"``
            and ``"seed"`` (the seed is read but not used here).

    Returns:
        The compiled ``tf.keras`` model.
    """
    input_dim = params["input_dimension"]
    optimizer, loss = params["optimizer"], params["loss"]
    inner_seed = params["seed"]  # looked up like the other builders; unused

    dense = tf.keras.layers.Dense
    inputs = tf.keras.layers.Input(shape=(input_dim,))

    # Single 1024-unit hidden representation, then the reconstruction layer.
    hidden = dense(1024, activation='relu')(inputs)
    outputs = dense(input_dim, activation="sigmoid")(hidden)

    model = tf.keras.models.Model(inputs, outputs, name='SAE')
    model.compile(optimizer=optimizer, loss=loss)
    return model

def init_unet(params):
    """Build and compile the 'UNET' model: a dense autoencoder with skips.

    Encoder widths 256 -> 128 -> 64, bottleneck 32, decoder widths
    64 -> 128 -> 256. Every decoder stage after the first receives the
    previous decoder output concatenated with the encoder output of the same
    width; the final sigmoid layer is as wide as the input.

    Args:
        params: dict with ``"input_dimension"``, ``"optimizer"``, ``"loss"``
            and ``"seed"`` (the seed is read but not used here).

    Returns:
        The compiled ``tf.keras`` model.
    """
    input_dim = params["input_dimension"]
    optimizer, loss = params["optimizer"], params["loss"]
    inner_seed = params["seed"]  # looked up like the other builders; unused

    layers = tf.keras.layers
    inputs = layers.Input(shape=(input_dim,))

    # Encoder: remember each activation so the decoder can reuse it.
    skips = []
    tensor = inputs
    for units in (256, 128, 64):
        tensor = layers.Dense(units, activation='relu')(tensor)
        skips.append(tensor)

    tensor = layers.Dense(32, activation='relu')(tensor)  # bottleneck
    tensor = layers.Dense(64, activation="relu")(tensor)  # first decoder stage

    # Decoder: [decoder output, matching encoder output] feeds the next stage.
    for units in (128, 256):
        merged = layers.Concatenate()([tensor, skips.pop()])
        tensor = layers.Dense(units, activation="relu")(merged)

    merged = layers.Concatenate()([tensor, skips.pop()])
    outputs = layers.Dense(input_dim, activation="sigmoid")(merged)

    model = tf.keras.models.Model(inputs, outputs, name='UNET')
    model.compile(optimizer=optimizer, loss=loss)
    return model

def init_vae(params):
    """Build and compile the 'VAE' model: a single-hidden-layer VAE.

    Architecture: Dense(512) -> (z_mean, z_logvar: Dense(256)) -> ``sampling``
    -> Dense(512) -> Dense(input_dim, sigmoid).

    Args:
        params: dict providing ``"input_dimension"`` (number of input and
            output features) and ``"optimizer"`` (passed to ``compile``).
            Other keys are ignored: the loss is attached with ``add_loss``,
            so ``params["loss"]`` plays no role here.

    Returns:
        The compiled ``tf.keras`` model, with ``kl_loss`` and
        ``reconstruction_loss`` registered as metrics.
    """
    input_dim = params["input_dimension"]
    output_dim = input_dim
    optimizer = params["optimizer"]

    # this is our input placeholder
    input_layer = tf.keras.layers.Input(shape=(input_dim,))

    # encoding phase
    encoded_l0 = tf.keras.layers.Dense(512, activation='relu')(input_layer)

    z_mean = tf.keras.layers.Dense(256, name="z_mean")(encoded_l0)
    z_logvar = tf.keras.layers.Dense(256, name="z_log_var")(encoded_l0)
    latent_space = tf.keras.layers.Lambda(sampling, output_shape=(256,))([z_mean, z_logvar])

    # decoding phase
    decoded_l0 = tf.keras.layers.Dense(512, activation="relu")(latent_space)

    output_layer = tf.keras.layers.Dense(output_dim, activation="sigmoid")(decoded_l0)

    # this model maps an input to its reconstruction
    model = tf.keras.models.Model(input_layer, output_layer, name='VAE')
    model.summary()

    reconstruction_loss = tf.keras.losses.mean_squared_error(input_layer, output_layer)
    # Closed-form KL(N(mu, sigma^2) || N(0, 1)) with z_logvar = log(sigma^2),
    # which is the parameterisation `sampling` uses (std = exp(0.5 * z_logvar)).
    # The previous expression used 0.5*z_logvar and exp(0.5*z_logvar), i.e.
    # log(sigma) and sigma, which is not the KL divergence.
    # The mean over all latent units (rather than a per-sample sum) is kept.
    kl_loss = -0.5 * tf.reduce_mean(1. + z_logvar - tf.square(z_mean) - tf.exp(z_logvar))
    vae_loss = tf.reduce_mean(reconstruction_loss + kl_loss)

    model.add_loss(vae_loss)
    model.add_metric(kl_loss, name="kl_loss")
    model.add_metric(reconstruction_loss, name="reconstruction_loss")
    model.compile(optimizer=optimizer)

    return model

In [None]:
def add_more_feat(d, columns, add_to_list=False):
    """Add non-linear transforms of each column in ``columns`` to ``d``.

    For every column ``c`` the following columns are written into ``d``
    (in place), in this order: ``one_minus_c``, ``power_{2,4,8,16}_c``,
    ``root_{2,4,8,16}_c``, ``sin_c``, ``log_c``, ``exp_c``.

    ``one_minus``, ``root``, ``sin`` and ``log`` are computed on the value
    clipped to [0, 1]; ``power`` and ``exp`` use the raw value, as before.

    Args:
        d: DataFrame to extend; it is mutated and also returned.
        columns: names of the source columns.
        add_to_list: when True the returned list also contains the names of
            the derived columns; otherwise it is just a copy of ``columns``.

    Returns:
        ``(d, names)``.
    """
    range_values = [2, 4, 8, 16]

    list_copy = list(columns)

    for c in columns:
        raw = d[c]
        clipped = raw.clip(0, 1)

        # Insertion order of this dict defines the column order.
        derived = {"one_minus_"+c: 1 - clipped}

        for v in range_values:
            derived["power_"+str(v)+"_"+c] = raw**v

        for v in range_values:
            derived["root_"+str(v)+"_"+c] = clipped**(1/v)

        derived["sin_"+c] = np.sin(math.pi * clipped)

        # log2(1 + x) maps [0, 1] onto [0, 1]. The previous expression,
        # np.log((x + 1) / math.log(2)), divided inside the logarithm and
        # produced values in roughly [0.37, 1.06].
        # NOTE(review): weights trained on the old feature need retraining.
        derived["log_"+c] = np.log2(clipped + 1)

        derived["exp_"+c] = np.exp(raw - 1)

        for name, values in derived.items():
            d[name] = values

        if add_to_list:
            list_copy.extend(derived)

    return d, list_copy

In [None]:
training_percentage = 100
model_ = 'SUVNet'
dataset = 'SOHO'
# Run identifier, reused below in the checkpoint and history file names.
model_name = "model_{}_training_perc_{}_dataset_{}".format(model_, training_percentage, dataset)
model_name

In [None]:
# Location of the weights checkpoint for this run.
model_path = f"./output/model/{model_name}"

In [None]:
## SET PARAMETERS

# set random seed for reproducibility
seed = 23071983
np.random.seed(seed)

# Columns excluded from the feature set.
to_remove = ['flow', 'index', 'src', 'dst', 'src_port', 'dst_port', 'protocol', 'flowstarttime', 'flowendtime', 'label']

VERBOSE_OUTPUT = 1

ADD_FEATURES = 1
DROP_REDUNDANT_FEATURES = 1
ADD_NOISE = 0
ADD_BATCH = 0 #FIXED
num_epochs = 250
batch_size = 1024

work_area_data = './dataset/SOHO'

# Read the training set
training = pd.read_csv(os.path.join(work_area_data, 'training_set.csv'))
print(training.shape)
print(training.columns)
# A bare `training.describe()` in the middle of a cell is discarded, so the
# summary is printed explicitly.
print(training.describe())

# Features = every column that is not listed in to_remove.
columns_of_interest = [x for x in training.columns if x not in to_remove]
# Initial feature set, before any derived feature is added.
original_feat = columns_of_interest.copy()
print(columns_of_interest)

In [None]:
# Drop features that are strongly correlated with an earlier feature

if DROP_REDUNDANT_FEATURES:
    cor_matrix = training.drop(columns=to_remove).corr().abs()
    # Keep the strict upper triangle so each pair is inspected only once.
    upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1))
    # A column goes when its |correlation| with any earlier column is > 0.8.
    to_drop = [name for name in upper_tri.columns if (upper_tri[name] > 0.8).any()]
    print()
    print("number to drop:", len(to_drop))
    print("to drop: ", to_drop)

    # Update both the working feature list and the "original" feature list.
    columns_of_interest = list(filter(lambda x: x not in to_drop, columns_of_interest))
    original_feat = list(columns_of_interest)

    print("Final Columns:", columns_of_interest)


In [None]:
# Fit the scaler on the training features and rescale them in place
scaler_x = MinMaxScaler(feature_range=(0, 1))  # alternative: StandardScaler()
training[columns_of_interest] = scaler_x.fit_transform(training[columns_of_interest])
print(training[columns_of_interest].shape)
print("done")

In [None]:
# Optionally extend the training set with the derived features
if ADD_FEATURES:
    training, columns_of_interest = add_more_feat(training, columns_of_interest, add_to_list=True)

    print(len(columns_of_interest), training.shape, "done", sep="\n")

In [None]:
# Shape and summary statistics of the final feature matrix
print(training[columns_of_interest].shape, training[columns_of_interest].describe(), sep="\n")

In [None]:
print("input dim: ", len(columns_of_interest))

# Parameters handed to the model builder and to the training loop
reconstructor_params = dict(
    input_dimension=len(columns_of_interest),
    batch_size=batch_size,
    num_epoch=num_epochs,
    verbose_output=VERBOSE_OUTPUT,
    optimizer='adam',
    seed=seed,
    use_bn=ADD_BATCH,
    add_noise=ADD_NOISE,
    loss="mse",
)

# Re-seed all generators right before the model is built
refresh_riproducibility(seed)

model_id = 4  # build_detector maps 4 to init_sparse_vae

# Training phase, timed in milliseconds
start_time = int(round(time.time() * 1000))
detector, history = build_detector(training, columns_of_interest, reconstructor_params, model_id, model_path)
end_time = int(round(time.time() * 1000))
total_time = end_time - start_time
print(f'Total Learning Time:{total_time}')

In [None]:
# Per-epoch reconstruction loss as a two-column table (Loss, epoch)
his = history.history['reconstruction_loss']
df_his = pd.DataFrame(his, columns=["Loss"]).assign(epoch=lambda frame: range(1, len(frame) + 1))
print(df_his)

In [None]:
# Persist the training history (np.save appends the ".npy" extension)
file_history = f"./output/history/history_{model_name}"
print(file_history)
np.save(file_history, history.history)

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Reload the saved training history and plot the reconstruction loss.
file_history = "./output/history/history_"+model_name
# allow_pickle takes a boolean (the string 'TRUE' only worked because it is
# truthy). The history dict was saved as a pickled object array, so pickle is
# required here; only load files produced by this notebook.
history_read = np.load(file_history+'.npy', allow_pickle=True).item()
his = history_read['reconstruction_loss']
df_his = pd.DataFrame(data=his, columns=["Loss"])
df_his['epoch'] = range(1, len(df_his) + 1)
df_his['Model']='SUVNet'

plt.plot(df_his['epoch'], df_his['Loss'])
plt.xlabel('epoch')
plt.ylabel('reconstruction loss')
plt.show()

In [None]:
# Same loss curve on a logarithmic y axis
plt.semilogy(df_his['epoch'], df_his['Loss'])
plt.show()

In [None]:
print("input dim: ", len(columns_of_interest))

# Parameters for rebuilding the network before the saved weights are loaded
reconstructor_params = {"input_dimension": len(columns_of_interest),
                        "batch_size" : batch_size,  # was `batc_size`, a NameError
                        "num_epoch" : num_epochs,
                        "verbose_output" : VERBOSE_OUTPUT,
                        "optimizer":'adam',
                        "seed": seed,
                        "use_bn": ADD_BATCH,
                        "add_noise": ADD_NOISE,
                        #"reg_value": 10e-7,
                        "loss":"mse"}

# Re-seed all generators before the model is rebuilt
refresh_riproducibility(seed)

detector =  init_sparse_vae(reconstructor_params)

In [None]:
# Reload the weights that the ModelCheckpoint callback in build_detector wrote to model_path
detector.load_weights(model_path)

In [None]:
# Reconstruct the training rows with the detector (inference batches of 4096 rows)
training_predictions = detector.predict(training[columns_of_interest], batch_size=4096)

In [None]:
# Outlierness of each training row: sum of absolute reconstruction errors over its features
outlierness_training = np.absolute(training_predictions - training[columns_of_interest]).sum(axis=1)

In [None]:
def test(detector, file_path, threshold, attack_type=None, th=None, model='VAE'):
    """Score a labelled test set with the detector and print its metrics.

    The CSV at ``file_path`` is scaled with the notebook-level ``scaler_x``,
    extended with the derived features when ``ADD_FEATURES`` is set, and
    reconstructed by ``detector`` after reloading the weights at the
    notebook-level ``model_path``. A row is flagged as an attack when its
    outlierness (sum of absolute reconstruction errors) exceeds ``threshold``.

    Args:
        detector: model exposing ``load_weights`` and ``predict``.
        file_path: CSV with the feature columns and a ``"class"`` column.
        threshold: outlierness value above which a row is predicted positive.
        attack_type: label printed in the header only.
        th: unused; kept for call compatibility.
        model: unused; kept for call compatibility.

    Returns:
        ``(outlierness, y_test)``: per-row scores and the ``"class"`` column.
    """
    print(f'Attack type: {attack_type}')
    print(f'Threshold: {threshold}')

    # Read the test set
    testset = pd.read_csv(file_path)

    # extract class
    y_test = testset["class"]
    print(y_test)

    # apply the scaler fitted on the training set
    testset[original_feat] = scaler_x.transform(testset[original_feat])

    # add derived features
    if ADD_FEATURES:
        testset, _ = add_more_feat(testset, original_feat, False)
        print("done")

    detector.load_weights(model_path)

    # compute reconstructions, timed in milliseconds
    start_time = int(round(time.time() * 1000))

    predictions = detector.predict(testset[columns_of_interest], batch_size=4096)

    end_time = int(round(time.time() * 1000))
    total_time = end_time - start_time
    print('Total Prediction Time:'+str(total_time))
    print('Single Prediction Time:'+str(total_time/testset.shape[0]))

    # compute outlierness
    outlierness = np.sum(np.absolute(predictions - testset[columns_of_interest]), axis=1)

    # Work on integer 0/1 labels so the report key for the positive class is
    # always '1'. The previous lookup used '1.0', which only exists when the
    # CSV stores the class as a float.
    y_true = y_test.astype(int)
    y_pred = (outlierness > threshold).astype(int)

    report_map = classification_report(y_true, y_pred, labels=[0, 1], output_dict=True)
    print(report_map)
    acc_score = accuracy_score(y_true, y_pred)

    positive = report_map['1']
    result = str(acc_score)+";"+str(positive['precision']) + ";" + str(positive['recall']) + ";" + str(positive['f1-score'])
    print("acc;prec;rec;f1")
    print(result)

    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    # False alarm rate = FP / (FP + TN). The previous FP / (FP + TP) is the
    # false discovery rate.
    far = fp / (fp + tn) if (fp + tn) > 0 else float("nan")
    print("FAR: ", far)

    auc_score = roc_auc_score(y_true, outlierness)
    print("AUC: ", auc_score)

    pr1, rec1, thr1 = precision_recall_curve(y_true, outlierness)
    auc_score_pr = auc(rec1,pr1)
    print("AUC-PR: ", auc_score_pr)

    from imblearn.metrics import geometric_mean_score
    g_mean = str(round(geometric_mean_score(y_true, y_pred, average = 'binary'), 3))
    print("G-Mean: ", g_mean)

    return outlierness, y_test

In [None]:
# Mean and standard deviation of the training outlierness (np.std, not Series.std)
_mean, _std = np.mean(outlierness_training), np.std(outlierness_training)

In [None]:
# Labels passed to test(); note that this rebinds model_name, which earlier
# cells used to build the checkpoint and history file names.
model_name, attack_type = 'SparseVAE', 'slowDoS'

In [None]:
# Evaluate the detector at thresholds of the form mean + alpha * std.
# One loop replaces three copies of the same three statements; the values
# returned by test() are not used, its printed report is the result.
for alpha in (2.5, 2.75, 3.):
    threshold = _mean + alpha * _std
    test(detector, os.path.join(work_area_data, 'test_soho.csv'), threshold, attack_type, th=str(alpha), model=model_name)


--------------