In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, matthews_corrcoef, roc_auc_score
import gc
import tensorflow as tf



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nfunswnb15v2/NF-UNSW-NB15-V2.parquet
/kaggle/input/nfunswnb15v2/NetFlow v2 Features.csv


In [2]:
features_to_be_removed = ['L4_SRC_PORT', 'L4_DST_PORT', 'Attack', 'Label']
seed = 42

def remove_features(df, feats=features_to_be_removed):
    X = df.drop(columns=feats)
    y = df.Label
    return X, y

def train_test_validation_scaled(X, y, test_size):
    scaler = MinMaxScaler()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,random_state=seed, stratify=y)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    train_data = X_train[y_train == 0]  # only benign samples from train set
    attack_data = X_train[y_train == 1]  # only attack samples from train set
    
    # Define a validation set
    idx_for_validation_set = int(train_data.shape[0] * 0.9)
    val_data = train_data[idx_for_validation_set:]  
    train_data = train_data[:idx_for_validation_set]
    
    del scaler
    del X_train
    del y_train
    gc.collect()
    
    return train_data, val_data, X_test, y_test


def load_data(path, test_size=0.2):
    df = pd.read_parquet(path)
    X, y = remove_features(df)
    del df
    gc.collect()
    train_data, val_data, X_test, y_test = train_test_validation_scaled(X, y, test_size)
    return train_data, val_data, X_test, y_test

def load_model(input_shape):
    model = tf.keras.models.Sequential([
                tf.keras.layers.Input(shape=(input_shape,)),
                tf.keras.layers.Dense(32, activation='relu'),
                tf.keras.layers.Dense(16, activation='relu'),
                tf.keras.layers.Dense(8, activation='relu'),
                tf.keras.layers.Dense(4, activation='relu'),
                tf.keras.layers.Dense(8, activation='relu'),
                tf.keras.layers.Dense(16, activation='relu'),
                tf.keras.layers.Dense(32, activation='relu'),
                tf.keras.layers.Dense(input_shape, activation='sigmoid')]
    )
    
    return model

def eval_training(y_test, preds):
    acc = accuracy_score(y_test, preds)
    rec = recall_score(y_test, preds)
    prec = precision_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    missrate = fn / (fn + tp)
    fallout = fp / (fp + tn)
    auc = roc_auc_score(y_test, preds)
    
    return acc, rec, prec, f1, mcc, missrate, fallout, auc


def test_model_on_dataset(path, supplied_model=None, optimizer='adam', loss='mean_squared_error', batch_size=128, epochs=20):
    print(f'\nTESTING DATASET WITH PATH {path}')
    print('='*80)
    print()
        
    train_data, val_data, X_test, y_test = load_data(path)
        
    model = None
        
    if supplied_model == None:
        model = load_model(train_data.shape[1])
    else:
        model = supplied_model
            
    model.compile(optimizer=optimizer, loss=loss)

    history = model.fit(
            train_data,
            train_data,
            batch_size=batch_size,
            shuffle=True,
            epochs=epochs
    )
    
    del train_data 
    gc.collect()
        
    val_inference = model.predict(val_data)
    val_losses = np.mean(abs(val_data - val_inference), axis=1)
    del val_data
    gc.collect()
    
    threshold = np.quantile(val_losses, 0.95)
    inference = model.predict(X_test)
    losses = np.mean(abs(X_test - inference), axis=1)
    del X_test
    gc.collect()
    
    test_eval = losses > threshold
    acc, rec, prec, f1, mcc, missrate, fallout, auc = eval_training(y_test, test_eval)
    
    print("Saving the model")
    model.save("autoencoder.h5")
    print()
    print(f'ACCURACY: {acc}')
    print(f'RECALL: {rec}')
    print(f'PRECISION: {prec}')
    print(f'F1-SCORE: {f1}')
    print(f'MATTHEWS CORRELATION COEFFICIENT: {mcc}')
    print(f'MISSRATE: {missrate}')
    print(f'FALLOUT: {fallout}')
    print(f'AUC: {auc}')
    print()
        
        

def test_model_on_datasets(paths, supplied_model=None, optimizer='adam', loss='mean_squared_error', batch_size=128, epochs=20):
    
    for path in paths:
        
        test_model_on_dataset(path, supplied_model, optimizer, loss, batch_size, epochs)

In [3]:
data_paths = ['/kaggle/input/nfunswnb15v2/NF-UNSW-NB15-V2.parquet']
sampled_data_paths = ['/kaggle/input/sampled-datasets-v2/NF-UNSW-NB15-V2.parquet']

test_model_on_datasets(data_paths, epochs=10)


TESTING DATASET WITH PATH /kaggle/input/nfunswnb15v2/NF-UNSW-NB15-V2.parquet

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Saving the model

ACCURACY: 0.9396349305019014
RECALL: 0.6856686201385189
PRECISION: 0.3482849604221636
F1-SCORE: 0.46193189465655704
MATTHEWS CORRELATION COEFFICIENT: 0.4616328851518688
MISSRATE: 0.3143313798614811
FALLOUT: 0.050390628065063704
AUC: 0.8176389960367276

