In [None]:
# Mount Google Drive into the Colab filesystem under /content/drive.
from google.colab import drive  # Colab-only helper for accessing Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): the following cell assigns `data_dir` again, so this value only
# takes effect if that cell is skipped — confirm which location is intended.
data_dir = "/content/drive/My Drive/C3I_NIDS"

In [None]:
# Folder on the mounted Drive that the CSV loaders read from.
# NOTE(review): this replaces the `data_dir` value assigned in the previous cell.
data_dir = "/content/drive/MyDrive/c3i/2018/proc"

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
import os
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve,auc,roc_curve
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, matthews_corrcoef, average_precision_score, roc_auc_score, precision_recall_curve, auc, roc_curve

In [None]:
def load_data(data_dir, filename, verbose=True, train_size=50000, test_size=None, data2018=False):
    """Read one CSV and split it into stratified train/test feature sets.

    Parameters
    ----------
    data_dir : str
        Directory containing the CSV file.
    filename : str
        Name of the CSV file inside ``data_dir``.
    verbose : bool
        When true, print the label counts of both splits.
    train_size, test_size
        Forwarded unchanged to ``train_test_split``.
    data2018 : bool
        Selects the column naming scheme: when true the benign label is
        ``"BENIGN"`` and the port column is ``"Destination Port"``; otherwise
        ``"Benign"`` and ``"Dst Port"``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, train_labels, test_labels)`` where
        the ``y_*`` values are ``1`` for benign rows and ``-1`` for everything
        else, and the ``*_labels`` are the original ``Label`` strings.
    """
    frame = pd.read_csv(f"{data_dir}/{filename}")

    # The two supported schemas differ only in these two names.
    if data2018:
        benign_name, port_column = "BENIGN", "Destination Port"
    else:
        benign_name, port_column = "Benign", "Dst Port"

    labels = frame["Label"]
    Y = labels.map(lambda value: 1 if (value == benign_name) else -1)
    features = frame.drop(columns=["Label", "Timestamp", port_column])

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        Y,
        train_size=train_size,
        test_size=test_size,
        shuffle=True,
        stratify=Y,
        random_state=42,
    )

    # Recover the original string labels for each split through the row index.
    train_labels = labels.loc[y_train.index]
    test_labels = labels.loc[y_test.index]

    if verbose:
        print("***** Train Data *****")
        print(train_labels.value_counts())
        print("***** Test Data *****")
        print(test_labels.value_counts())
    return X_train, X_test, y_train, y_test, train_labels, test_labels


def load_data_fraud(data_dir, verbose=True, data2018=False):
    """Read ``all_malicious.csv`` from ``data_dir`` and separate features from labels.

    Parameters
    ----------
    data_dir : str
        Directory containing ``all_malicious.csv``.
    verbose : bool
        When true, print the counts of each label value.
    data2018 : bool
        Selects the column naming scheme: when true the benign label is
        ``"BENIGN"`` and the port column is ``"Destination Port"``; otherwise
        ``"Benign"`` and ``"Dst Port"``.

    Returns
    -------
    tuple
        ``(features, Y, labels)``: the frame without the label, timestamp and
        port columns; a series holding ``1`` for benign rows and ``-1``
        otherwise; and the original ``Label`` column.
    """
    frame = pd.read_csv(f"{data_dir}/all_malicious.csv")

    # The two supported schemas differ only in these two names.
    if data2018:
        benign_name, port_column = "BENIGN", "Destination Port"
    else:
        benign_name, port_column = "Benign", "Dst Port"

    labels = frame["Label"]
    Y = labels.map(lambda value: 1 if (value == benign_name) else -1)
    features = frame.drop(columns=["Label", "Timestamp", port_column])

    if verbose:
        print("***** Data *****")
        print(labels.value_counts())
    return features, Y, labels

In [None]:
# Experiment configuration.
train_size = 50000       # passed to load_data as the training split size
test_size = 500000       # passed to load_data as the held-out split size
epochs = 30              # not used in this cell; presumably consumed by a later training cell
validation_perc = 0.15   # share of the combined hold-out that becomes the validation set

# Rows of benign_1M.csv, split into a training set and a held-out set.
X_train, X_test, y_train, y_test, train_labels, test_labels = load_data(data_dir, "benign_1M.csv", train_size=train_size, test_size=test_size, verbose=False)
# Every row of all_malicious.csv.
X_fraud, y_fraud, labels_fraud = load_data_fraud(data_dir, verbose=False)

# Add the all_malicious.csv rows to the held-out set only; the training set is left as loaded.
# pd.concat replaces DataFrame.append / Series.append, which were deprecated in
# pandas 1.4 and removed in pandas 2.0. Like append, it keeps the original index labels.
X_test = pd.concat([X_test, X_fraud])
y_test = pd.concat([y_test, y_fraud])
test_labels = pd.concat([test_labels, labels_fraud])

# Carve the combined hold-out into validation / test, stratified on the string
# labels so each label value keeps its proportion in both parts.
X_val, X_t, y_val, y_t, label_val, label_t = train_test_split(X_test, y_test, test_labels, train_size=validation_perc, random_state=42, stratify=test_labels, shuffle=True)

In [None]:
# Sanity check: (rows, columns) of the training feature frame.
X_train.shape

(50000, 67)

In [None]:
# Label distribution of each split: header line, then the value counts.
print("***** Train Data *****", train_labels.value_counts(), sep="\n")
print("***** Validation Data *****", label_val.value_counts(), sep="\n")
print("***** Test Data *****", label_t.value_counts(), sep="\n")

***** Train Data *****
Benign    50000
Name: Label, dtype: int64
***** Validation Data *****
Benign                      75000
DDOS attack-HOIC            29829
DoS attacks-Hulk            21780
Bot                         21680
Infilteration               19468
SSH-Bruteforce              14106
DoS attacks-GoldenEye        6211
DoS attacks-Slowloris        1486
DDOS attack-LOIC-UDP          260
Brute Force -Web               83
Brute Force -XSS               34
SQL Injection                  12
FTP-BruteForce                  8
DoS attacks-SlowHTTPTest        6
Name: Label, dtype: int64
***** Test Data *****
Benign                      425000
DDOS attack-HOIC            169032
DoS attacks-Hulk            123419
Bot                         122855
Infilteration               110318
SSH-Bruteforce               79935
DoS attacks-GoldenEye        35195
DoS attacks-Slowloris         8422
DDOS attack-LOIC-UDP          1470
Brute Force -Web               471
Brute Force -XSS               19

In [None]:
def anomaly_scores(original, transformed):
    """Return the sum of squared differences between two inputs along axis 1.

    ``original`` and ``transformed`` are subtracted elementwise; for 2-D inputs
    of matching shape this yields one value per row.
    """
    residual = original - transformed
    return np.sum(np.square(residual), axis=1)

def evaluate_results(y_true, score):
    """Find the best-F1 operating point of an anomaly score and summarise it.

    The anomaly class is ``-1`` (``pos_label=-1``), so larger ``score`` values
    must mean "more anomalous".

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; ``-1`` marks the positive (anomalous) class.
    score : array-like
        Continuous anomaly score per sample.

    Returns
    -------
    pandas.Series
        ``precision``, ``recall`` and ``f1`` at the threshold that maximises
        F1, plus that ``threshold``, the area under the precision-recall curve
        (``au_precision_recall``) and the area under the ROC curve (``auroc``).
    """
    precision, recall, threshold = precision_recall_curve(y_true, score, pos_label=-1)
    au_precision_recall = auc(recall, precision)
    results = pd.DataFrame({'precision': precision, 'recall': recall})

    # F1 is defined as 0 where precision + recall == 0; a plain division would
    # emit a RuntimeWarning there and leave NaN in the table.
    denominator = precision + recall
    results["f1"] = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )

    # precision/recall have one more entry than threshold: the final point
    # (precision=1, recall=0) has no threshold, so it is not a valid choice.
    max_index = results["f1"].iloc[:-1].idxmax()

    # Copy the row so the extra fields are added to an independent Series.
    best = results.loc[max_index].copy()
    best["threshold"] = threshold[max_index]
    best["au_precision_recall"] = au_precision_recall
    fpr, tpr, _ = roc_curve(y_true, score, pos_label=-1)
    best["auroc"] = auc(fpr, tpr)
    return best

def evaluate_predictions(y_true, y_pred):
    """Score hard predictions, treating ``-1`` as the positive class.

    Returns a dict with the keys ``'recall'``, ``'precision'`` and ``'f1'``
    (in that order); each metric is computed with ``zero_division=0``.
    """
    scorers = (
        ('recall', recall_score),
        ('precision', precision_score),
        ('f1', f1_score),
    )
    return {
        metric_name: scorer(y_true, y_pred, pos_label=-1, zero_division=0)
        for metric_name, scorer in scorers
    }

def evaluate_test_data(y_true, score, threshold):
    """Evaluate an anomaly score at a fixed, previously chosen threshold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; ``-1`` marks the positive (anomalous) class.
    score : array-like
        Continuous anomaly score per sample; larger means more anomalous.
    threshold : float
        Samples with ``score < threshold`` are predicted ``1``; all others
        are predicted ``-1``.

    Returns
    -------
    dict
        ``recall``, ``precision`` and ``f1`` of the thresholded predictions,
        plus the threshold-independent ``au_precision_recall`` and ``auroc``.
    """
    # Vectorised thresholding; the comparison is applied elementwise exactly as
    # the per-sample rule above describes.
    y_pred = np.where(np.asarray(score) < threshold, 1, -1)
    results = evaluate_predictions(y_true, y_pred)

    # The curve thresholds are not needed here, so they are discarded rather
    # than rebinding the `threshold` argument.
    precision, recall, _ = precision_recall_curve(y_true, score, pos_label=-1)
    results['au_precision_recall'] = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_true, score, pos_label=-1)
    results["auroc"] = auc(fpr, tpr)
    return results

In [None]:
# Map every feature to a normal distribution via its quantiles. The transformer
# is fitted on the training rows only and then applied to the validation rows.
# random_state is fixed so that the transformer's internal randomness (e.g. any
# subsampling it performs) gives the same result on every run.
scaler = QuantileTransformer(output_distribution='normal', random_state=42)
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)

In [None]:
# PCA reconstruction-error detector: fit a 32-component projection on the scaled
# training rows, project and reconstruct the validation rows, and use the
# per-row squared reconstruction error as the anomaly score.
# random_state is fixed so the fit is reproducible if svd_solver='auto' selects
# a randomized solver.
pca = PCA(n_components=32, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=42)
pca.fit(X_train_s)
x_train_feature = pca.transform(X_train_s)
x_val_feature = pca.transform(X_val_s)
X_val_pca_inv = pca.inverse_transform(x_val_feature)
val_score_pca = anomaly_scores(X_val_s, X_val_pca_inv)
val_metrics_pca = evaluate_results(y_val, val_score_pca)


In [None]:
# Best-F1 validation metrics of the PCA reconstruction-error score.
val_metrics_pca

precision              0.773014
recall                 0.874238
f1                     0.820516
threshold              0.251872
au_precision_recall    0.791557
auroc                  0.770298
Name: 57527, dtype: float64

In [None]:
# Isolation Forest fitted on the PCA-projected training features.
# NOTE(review): the fractional hyperparameters look like the output of an
# automated search — confirm their provenance.
forest = IsolationForest(bootstrap=False, contamination=4.161677249308696e-05, max_features=0.6674219692639616, max_samples=0.8646939376341813, random_state=42, verbose=0, warm_start=False)
forest.fit(x_train_feature)

# Metrics of the forest's own hard predictions. Stored under a separate name so
# they are not overwritten by the score-based metrics below.
y_val_pred = forest.predict(x_val_feature)
val_pred_metrics_if = evaluate_predictions(y_val, y_val_pred)

# decision_function is lower for more abnormal samples; negate it so that
# larger values mean "more anomalous", as evaluate_results expects.
y_val_score = forest.decision_function(x_val_feature)
val_metrics_if = evaluate_results(y_val, -y_val_score)

In [None]:
# Best-F1 validation metrics of the Isolation Forest anomaly score.
val_metrics_if

precision              0.805743
recall                 0.905596
f1                     0.852756
threshold             -0.319912
au_precision_recall    0.809632
auroc                  0.810264
Name: 42428, dtype: float64

In [None]:
from sklearn.svm import OneClassSVM

# Re-fit the 32-component PCA projection on the scaled training rows and
# project the validation rows. The estimator is constructed once, and
# random_state is fixed so the fit is reproducible if svd_solver='auto'
# selects a randomized solver.
pca = PCA(n_components=32, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=42)
pca.fit(X_train_s)
x_train_feature = pca.transform(X_train_s)
x_val_feature = pca.transform(X_val_s)

# One-Class SVM with an RBF kernel fitted on the projected training features,
# evaluated on its own hard predictions for the validation rows.
svm = OneClassSVM(kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1)
svm.fit(x_train_feature)
y_val_pred = svm.predict(x_val_feature)
val_metrics = evaluate_predictions(y_val, y_val_pred)



In [None]:
# Validation metrics of the One-Class SVM hard predictions.
val_metrics

{'f1': 0.5133642524477633,
 'precision': 0.581903344905494,
 'recall': 0.45926950410131956}

In [None]:
# Continuous One-Class SVM scores for the validation rows.
# NOTE: this rebinds `y_val_score`, which an earlier cell used for the
# Isolation Forest scores.
y_val_score = svm.decision_function(x_val_feature)

# The score is negated before evaluation, so larger values count as more anomalous.
evaluate_results(y_val, -y_val_score)

  if __name__ == '__main__':


precision                0.620985
recall                   0.992702
f1                       0.764031
threshold             -550.215984
au_precision_recall      0.545456
auroc                    0.400312
Name: 6182, dtype: float64

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input
from keras.regularizers import l2
from keras.losses import mean_squared_error

