In [1]:
import os

In [2]:
# Set before TensorFlow is imported (next cell) so the library can read it at
# import time. NOTE(review): presumably toggles oneDNN-optimised kernels —
# confirm against the TensorFlow docs for the installed version.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "1"

In [3]:
import tensorflow as tf

In [4]:
# Display the installed TensorFlow version.
tf.__version__

'2.9.1'

In [5]:
# Display the physical GPU devices TensorFlow reports.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler,MinMaxScaler,LabelEncoder
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Flatten, Conv1D, MaxPooling1D, Conv2D,\
    MaxPooling2D

In [7]:
from DL_Models import one_hot, ANN, CNN_1D, CNN_2D, LSTM, CNN_LSTM

In [8]:
import dpkt
import time

In [9]:
# Columns routed to the 'num' branch (StandardScaler) of F17_preprocessor.
numeric_features = ["duration","orig_bytes","resp_bytes", "missed_bytes","local_orig","local_resp",
                    "orig_pkts","orig_ip_bytes","resp_pkts","resp_ip_bytes"]

# Columns routed to the 'cat2' branch (OneHotEncoder) of F17_preprocessor.
onehotlists = ["proto","service",'conn_state','history',"tunnel_parents"]

In [10]:
def detailed_label_converter(x):
    """Map a detailed-label value to an integer class id.

    '-' -> 0, "PartOfAHorizontalPortScan" -> 1, "DDoS" -> 2;
    every other value -> 3.
    """
    # The position in this tuple is the class id; comparisons run in order.
    known_labels = ('-', "PartOfAHorizontalPortScan", "DDoS")
    for class_id, label in enumerate(known_labels):
        if x == label:
            return class_id
    return len(known_labels)
    
def binary_label_converter(x):
    """Return 0 when str(x) equals 'benign' ignoring case, otherwise 1."""
    is_benign = str(x).lower() == 'benign'
    return 0 if is_benign else 1
    
def insertnumbers(x):
    """Replace the placeholders '-' and '(empty)' with 99; return any other value unchanged."""
    is_placeholder = x == '-' or x == '(empty)'
    return 99 if is_placeholder else x

In [11]:
def matrix_to3D(X_train, X_test):
    """Reshape two 2-D matrices into 4-D arrays of shape (rows, h, w, 1).

    h is the middle entry of the sorted divisor list of the column count and
    w is the column count divided by h. When the column count has exactly two
    divisors (a prime), one all-zero column is appended to both matrices first
    and the divisors are recomputed. The column count of X_train decides the
    layout for both matrices.

    Returns:
        (X_train_reshaped, X_test_reshaped)
    """
    def _divisors(n):
        # All positive divisors of n, ascending.
        return [d for d in range(1, n + 1) if n % d == 0]

    width = X_train.shape[1]
    factors = _divisors(width)

    if len(factors) == 2:  # prime column count: pad with one zero column
        train_pad = np.zeros((X_train.shape[0], 1))
        test_pad = np.zeros((X_test.shape[0], 1))
        X_train = np.concatenate((X_train, train_pad), axis=1)
        X_test = np.concatenate((X_test, test_pad), axis=1)
        width = X_train.shape[1]
        factors = _divisors(width)

    height = factors[len(factors) // 2]
    target_shape = (-1, height, int(width / height), 1)
    return X_train.reshape(target_shape), X_test.reshape(target_shape)

In [None]:
# Read in csv data
# NOTE(review): the path is relative to the notebook's working directory.
total_df = pd.read_csv("./IoT23_Dataset/original/datasets/3_data_v2/S04_R_5_000_000.csv")
#train_df = pd.read_csv("./IoT23_Dataset/original/datasets/3_data_v2/S04_R_5_000_000_clean.csv_train.csv")
#test_df = pd.read_csv("./IoT23_Dataset/original/datasets/3_data_v2/S04_R_5_000_000_clean.csv_test.csv")

In [13]:
# Remove the columns that are not fed to the preprocessor: 'ts', 'uid', the
# four 'id.*' endpoint columns and 'detailed-label' (only 'label' is kept as
# the target).
total_df = total_df.drop(
    columns=['ts', 'uid', 'id.orig_h', 'id.orig_p','id.resp_h','id.resp_p', 'detailed-label']
)
#train_df.drop(columns=['id.orig_h', 'id.orig_p','id.resp_h','id.resp_p'], inplace=True)
#test_df.drop(columns=['id.orig_h', 'id.orig_p','id.resp_h','id.resp_p'], inplace=True)

In [14]:
# Keep 40% of the rows as the working set (total_df_1); total_df_2 holds the
# other 60%. The split shuffles, so a fixed random_state is required for the
# fitted pipeline, the positional label slices and the reported accuracies to
# be reproducible after Restart & Run All.
total_df_1, total_df_2 = train_test_split(total_df, test_size=0.6, random_state=42)

In [15]:
# Replace the '-' / '(empty)' placeholders in the numeric columns with the
# value 99 (see insertnumbers) before the columns are scaled.
total_df_1[numeric_features] = total_df_1[numeric_features].applymap(insertnumbers)

In [16]:
# Numeric branch: a single StandardScaler step.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: a single OneHotEncoder step producing a dense array.
# drop='first' omits the first category of each feature; handle_unknown='ignore'
# avoids raising on categories that were not present during fit.
categorical_transformer2 = Pipeline(steps=[('onehotencoder', OneHotEncoder(sparse = False,handle_unknown='ignore',drop = 'first'))])

In [17]:
# Combine both branches: scale `numeric_features`, one-hot encode `onehotlists`,
# and drop every other column (remainder="drop").
F17_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        #('cat1', categorical_transformer1,['id.orig_h','id.resp_h']),
        ('cat2', categorical_transformer2, onehotlists)],remainder = "drop",verbose_feature_names_out=True)

In [18]:
# Binarise the target: 0 for 'benign' (case-insensitive), 1 for anything else.
# NOTE: not idempotent — re-running this cell on the already converted 0/1
# values maps every row to 1, because str(0) is not 'benign'.
total_df_1['label'] = total_df_1['label'].map(binary_label_converter)

In [19]:
# Remove the target column from the feature frame and keep it separately.
labels = total_df_1.pop('label')

In [20]:
# Convert the label Series to a plain array; it is sliced by position later.
labels = labels.values

In [None]:
# Fit the scaler and the one-hot encoder.
# NOTE(review): this fits on all of total_df_1, i.e. also on the rows that are
# later split off as validation and test data, so their statistics and
# categories leak into preprocessing. Consider splitting first and fitting on
# the training rows only.
F17_preprocessor.fit(total_df_1)

In [22]:
# Persist the fitted preprocessor so the same transformation can be reapplied
# later. The context manager closes the file even if pickle.dump raises.
filename = "f17_pipeline.pkl"
with open(filename, 'wb') as file:
    pickle.dump(F17_preprocessor, file)

In [23]:
# Apply the fitted preprocessor.
# NOTE: this rebinds total_df_1 from the raw DataFrame to the transformed
# output, so the cell cannot be re-run without re-running the cells above.
total_df_1 = F17_preprocessor.transform(total_df_1)

In [24]:
# Length of the first transformed row, i.e. the number of features per sample.
len(total_df_1[0])

206

In [25]:
# Hold out the last 20% of rows as the test set. shuffle=False preserves row
# order, which the positional slicing of `labels` further down relies on.
train_df, X_test = train_test_split(total_df_1, test_size=0.2, shuffle=False)

In [26]:
# Number of rows left for training + validation.
len(train_df)

6400000

In [28]:
# Number of rows in the test hold-out.
len(X_test)

1600000

In [18]:
#train_df['label'] = train_df['label'].map(binary_label_converter)

In [19]:
#test_df['label'] = test_df['label'].map(binary_label_converter)

In [20]:
#train_df['label'].unique()

array([0, 1], dtype=int64)

In [21]:
#train_df['label'].unique()

array([0, 1], dtype=int64)

In [29]:
# Split train_df into training and validation sets (80/20).
# shuffle=False keeps row order: the validation set is the last 20% of train_df.
X_train_orig, X_validation = train_test_split(train_df, test_size=0.2,shuffle=False)

In [30]:
# Keep the first 80% of X_train_orig as the actual training set.
# NOTE(review): X_train_2 (the remaining 20%) is not referenced by any later
# cell visible here — confirm that leaving these rows out is intentional.
X_train, X_train_2 = train_test_split(X_train_orig, test_size=0.2, shuffle=False)

In [34]:
# Slice the labels by position so they line up with the ordered
# (shuffle=False) feature splits. Row layout of the transformed data:
#   [ X_train | X_train_2 | X_validation | X_test ]
# Boundaries are derived from the split sizes rather than hard-coded row
# counts, so the labels stay aligned if the dataset size changes.
n_train = len(X_train)
validation_start = len(X_train_orig)   # rows of X_train + X_train_2
test_start = len(train_df)             # every row before the test hold-out
y_train = labels[:n_train]
y_test = labels[test_start:test_start + len(X_test)]
y_validation = labels[validation_start:test_start]

In [35]:
# All model parameters
# learning_rate, decay_rate, n_batch, n_epochs and save_dir_k are passed to
# every model's train(); the remaining values are constructor arguments.
learning_rate = 1e-3
decay_rate = 1e-5
dropout_rate = 0.5
n_batch = 100
n_epochs = 1  # Number of passes over the training data (currently a single epoch)
filters = 128
kernel_size = 4
strides = 1
CNN_layers = 2
clf_reg = 1e-5
save_dir_k = './ANN_model'

In [37]:
# Sanity check: list the distinct label values present in each split.
print(np.unique(y_train))
print(np.unique(y_test))
print(np.unique(y_validation))

[0 1]
[0 1]
[0 1]


In [38]:
# Shape of the test features: (rows, features).
np.shape(X_test)

(1600000, 206)

In [39]:
# Shape of the validation features: (rows, features).
np.shape(X_validation)

(1280000, 206)

In [40]:
# Shape of the training features: (rows, features).
np.shape(X_train)

(4096000, 206)

# ANN

In [41]:
# FOR ANN
# Append a trailing axis of size 1 so each sample has shape (features, 1).
# Every split is reshaped with its own column count; previously the validation
# set reused X_train's width, which would silently re-chunk rows if the two
# ever differed.
X_train_ANN = X_train.reshape(-1, X_train.shape[1], 1)
X_validation_ANN = X_validation.reshape(-1, X_validation.shape[1], 1)
X_test_ANN = X_test.reshape(-1, X_test.shape[1], 1)

In [42]:
# Build the ANN (class imported from DL_Models) for two output classes, with
# one input of shape (features, 1).
model = ANN(input_shape=(X_train_ANN.shape[1],1,), n_classes=2)

In [None]:
# Train the ANN on the training split and evaluate on the validation split.
# save_dir_k is './ANN_model' at this point (set in the parameter cell).
history=model.train(X_train_ANN, y_train, X_validation_ANN, y_validation,
                                n_batch, 
                                n_epochs,
                                learning_rate,
                                decay_rate,
                                save_dir_k)

In [44]:
# Per-epoch training and validation accuracy recorded during training.
print(history.history['accuracy'])
print(history.history['val_accuracy'])

[0.9962910413742065]
[0.9969648718833923]


In [None]:
# Score the test set using the same (rows, features, 1) layout the ANN was
# trained and validated on. X_test_ANN was prepared for exactly this purpose,
# but the raw 2-D X_test was being passed instead.
ypred = model.classify(X_test_ANN)

In [46]:
# Test accuracy: a row is correct when the arg-max over its class scores equals
# the true label. Vectorised instead of looping over every row in Python.
# Assumes ypred holds one score per class for each row of y_test.
predicted_classes = np.argmax(ypred, axis=1)
count = int(np.sum(predicted_classes == y_test))

acc = float(count)/len(ypred)

In [47]:
# ANN accuracy on the test hold-out.
print(acc)

0.99706375


In [48]:
# Save the trained ANN under 'ANN_Binary_3' (relative to the working directory).
model.model.save('ANN_Binary_3')

INFO:tensorflow:Assets written to: ANN_Binary_3\assets


# CNN 1D

In [49]:
# Build the 1-D CNN (class imported from DL_Models) for two output classes,
# using the shared hyper-parameters from the parameter cell.
model2 = CNN_1D(input_shape=(X_train.shape[1],1,), 
                    n_classes=2,
                    filters=filters,
                    kernel_size=kernel_size,
                    strides=strides,
                    dense_units=128,
                    dropout_rate=dropout_rate,
                    CNN_layers=CNN_layers,
                    clf_reg=clf_reg)

INPUT SHAPE: (206, 1)


In [None]:
# Point the save directory at the CNN run, then train the 1-D CNN.
# NOTE: save_dir_k is a notebook-level variable, so this value stays in effect
# for later cells until it is reassigned.
save_dir_k = './CNN_model'
history2 = model2.train(X_train, y_train, X_validation, y_validation,
                                n_batch, 
                                n_epochs,
                                learning_rate,
                                decay_rate,
                                save_dir_k)

In [51]:
# Per-epoch training and validation accuracy of the 1-D CNN.
print(history2.history['accuracy'])
print(history2.history['val_accuracy'])

[0.9958486557006836]
[0.9975000023841858]


In [52]:
# Score the test set with the 1-D CNN (overwrites the previous model's ypred).
ypred = model2.classify(X_test)



In [53]:
# Inspect the first prediction: one score per class.
ypred[0]

array([1.6754926e-05, 9.9998319e-01], dtype=float32)

In [54]:
# Test accuracy: a row is correct when the arg-max over its class scores equals
# the true label. Vectorised instead of looping over every row in Python.
# Assumes ypred holds one score per class for each row of y_test.
predicted_classes = np.argmax(ypred, axis=1)
count = int(np.sum(predicted_classes == y_test))

acc = float(count)/len(ypred)
print(acc)

0.9976125


In [None]:
# Save the trained 1-D CNN under './CNN_Binary_3'.
model2.model.save('./CNN_Binary_3')

# CNN 2D

In [56]:
# Reshape training and test features to (rows, h, w, 1) for the 2-D CNN.
X_train_2D, X_test_2D = matrix_to3D(X_train, X_test)

In [57]:
# Same reshape for the validation features. matrix_to3D derives the layout from
# its first argument, so X_train is passed again; X_train_2D is reassigned here
# with the same reshape of X_train as in the previous cell.
X_train_2D, X_validation_2D = matrix_to3D(X_train, X_validation)

In [58]:
# Build the 2-D CNN (class imported from DL_Models) for two output classes; the
# input shape is taken from the reshaped training tensor.
model3 = CNN_2D(input_shape=(X_train_2D.shape[1],X_train_2D.shape[2],1),
        n_classes=2,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        dense_units=128,
        dropout_rate=dropout_rate,
        CNN_layers=CNN_layers,
        clf_reg=clf_reg)

In [None]:
# Give the 2-D CNN its own save directory. Without this assignment save_dir_k
# still holds the directory set for an earlier model, so both runs would be
# handed the same location.
# NOTE(review): what train() writes there is defined in DL_Models — confirm.
save_dir_k = './CNN2D_model'
history3 = model3.train(X_train_2D, y_train, X_validation_2D, y_validation,
                        n_batch,
                        n_epochs,
                        learning_rate,
                        decay_rate,
                        save_dir_k)

In [60]:
# Per-epoch training and validation accuracy of the 2-D CNN.
print(history3.history['accuracy'])
print(history3.history['val_accuracy'])

[0.996063232421875]
[0.9976351857185364]


In [61]:
# Score the reshaped test set with the 2-D CNN (overwrites the previous ypred).
ypred = model3.classify(X_test_2D)



In [62]:
# Test accuracy: a row is correct when the arg-max over its class scores equals
# the true label. Vectorised instead of looping over every row in Python.
# Assumes ypred holds one score per class for each row of y_test.
predicted_classes = np.argmax(ypred, axis=1)
count = int(np.sum(predicted_classes == y_test))

acc = float(count)/len(ypred)
print(acc)

0.997708125


In [None]:
# Save the trained 2-D CNN under './CNN2D_Binary_3'.
model3.model.save('./CNN2D_Binary_3')

# LSTM

In [None]:
# Rebuild the (rows, h, w, 1) tensors for the LSTM section. These are the same
# two calls as in the CNN 2D section, repeated so this section can be run
# without it.
X_train_2D, X_test_2D = matrix_to3D(X_train, X_test)
X_train_2D, X_validation_2D = matrix_to3D(X_train, X_validation)

In [64]:
# Drop the trailing size-1 axis added by matrix_to3D: (rows, h, w, 1) -> (rows, h, w).
X_train_LSTM = X_train_2D.reshape(-1, X_train_2D.shape[1], X_train_2D.shape[2])
X_validation_LSTM = X_validation_2D.reshape(-1, X_validation_2D.shape[1], X_validation_2D.shape[2])
X_test_LSTM = X_test_2D.reshape(-1, X_test_2D.shape[1], X_test_2D.shape[2])

In [65]:
# Build the LSTM model (class imported from DL_Models) for two output classes;
# the input shape is taken from the reshaped training tensor.
model4 = LSTM(input_shape=(X_train_LSTM.shape[1],X_train_LSTM.shape[2]), 
        n_classes=2,
        dense_units=128,
        dropout_rate=dropout_rate,
        LSTM_layers=2,
        LSTM_units=128,
        lstm_reg=1e-4,
        clf_reg=clf_reg)

In [None]:
# Give the LSTM its own save directory. Without this assignment save_dir_k
# still holds the directory set for an earlier model, so both runs would be
# handed the same location.
# NOTE(review): what train() writes there is defined in DL_Models — confirm.
save_dir_k = './LSTM_model'
history4 = model4.train(X_train_LSTM, y_train, X_validation_LSTM, y_validation,
                        n_batch,
                        n_epochs,
                        learning_rate,
                        decay_rate,
                        save_dir_k)

In [67]:
# Per-epoch training and validation accuracy of the LSTM.
print(history4.history['accuracy'])
print(history4.history['val_accuracy'])

[0.7499873042106628]
[0.7501156330108643]


In [68]:
# Score the reshaped test set with the LSTM (overwrites the previous ypred).
ypred = model4.classify(X_test_LSTM)



In [69]:
# Test accuracy: a row is correct when the arg-max over its class scores equals
# the true label. Vectorised instead of looping over every row in Python.
# Assumes ypred holds one score per class for each row of y_test.
predicted_classes = np.argmax(ypred, axis=1)
count = int(np.sum(predicted_classes == y_test))

acc = float(count)/len(ypred)
print(acc)

0.75023125


In [None]:
# Save the trained LSTM under './LSTM_Binary_3'.
model4.model.save('./LSTM_Binary_3')

# CNN + LSTM

In [71]:
# Build the combined CNN + LSTM model (class imported from DL_Models) for two
# output classes, with one input of shape (features, 1).
model5 = CNN_LSTM(input_shape=(X_train.shape[1],1,), 
                    n_classes=2,
                    dropout_rate=dropout_rate,
                    lstm_reg=1e-4,
                    clf_reg=clf_reg)

In [None]:
# Give the CNN + LSTM model its own save directory. Without this assignment
# save_dir_k still holds the directory set for an earlier model, so both runs
# would be handed the same location.
# NOTE(review): what train() writes there is defined in DL_Models — confirm.
save_dir_k = './CNN_LSTM_model'
history5 = model5.train(X_train, y_train, X_validation, y_validation,
                        n_batch,
                        n_epochs,
                        learning_rate,
                        decay_rate,
                        save_dir_k)

In [73]:
# Per-epoch training and validation accuracy of the CNN + LSTM model.
print(history5.history['accuracy'])
print(history5.history['val_accuracy'])

[0.749539315700531]
[0.7501156330108643]


In [74]:
# Score the test set with the CNN + LSTM model (overwrites the previous ypred).
ypred = model5.classify(X_test)



In [75]:
# Test accuracy: a row is correct when the arg-max over its class scores equals
# the true label. Vectorised instead of looping over every row in Python.
# Assumes ypred holds one score per class for each row of y_test.
predicted_classes = np.argmax(ypred, axis=1)
count = int(np.sum(predicted_classes == y_test))

acc = float(count)/len(ypred)
print(acc)

0.75023125


In [None]:
# Save the trained CNN + LSTM model under './CNN_LSTM_Binary_3'.
model5.model.save('./CNN_LSTM_Binary_3')

## Binary Classification Accuracies

Without custom preprocessing:

- ANN: 90.409%
- CNN_1D: 85.128%
- CNN_2D: 95.214%
- LSTM: 99.738%
- CNN_LSTM: 99.712%

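With custom preprocessing (test-set accuracy after one epoch, from the cells above):

- ANN: 99.706%
- CNN_1D: 99.761%
- CNN_2D: 99.771%
- LSTM: 75.023%
- CNN_LSTM: 75.023%

Note: LSTM and CNN_LSTM report exactly the same validation accuracy (0.75012) and test accuracy (0.75023) as each other. This suggests both models collapsed to predicting a single class, so these two figures should be treated as a failed training run rather than as a fair comparison with the other models.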