In [1]:
# This is an implementation of (Gwon et. al., 2017)

import numpy as np
import csv
import tensorflow as tf
import pickle
from scipy.io import arff
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
train_data, meta = arff.loadarff('KDDTrain+.arff')
test_data, meta_test = arff.loadarff('KDDTest+.arff')
print(meta)

Dataset: 'KDDTrain'
	duration's type is numeric
	protocol_type's type is nominal, range is ('tcp', 'udp', 'icmp')
	service's type is nominal, range is ('aol', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf', 'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i', 'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'harvest', 'hostnames', 'http', 'http_2784', 'http_443', 'http_8001', 'imap4', 'IRC', 'iso_tsap', 'klogin', 'kshell', 'ldap', 'link', 'login', 'mtp', 'name', 'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u', 'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private', 'red_i', 'remote_job', 'rje', 'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat', 'telnet', 'tftp_u', 'tim_i', 'time', 'urh_i', 'urp_i', 'uucp', 'uucp_path', 'vmnet', 'whois', 'X11', 'Z39_50')
	flag's type is nominal, range is ('OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3', 'SF', 'SH')
	src_bytes's type is numeric
	dst_bytes's type is numeric

In [3]:
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

train_feat = train_df.iloc[:,:-1]
test_feat = test_df.iloc[:,:-1]

train_labels = train_df.iloc[:,-1]
test_labels = test_df.iloc[:,-1]

train_feat

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.0,b'tcp',b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,150.0,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00
1,0.0,b'udp',b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,255.0,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00
2,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00
3,0.0,b'tcp',b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,30.0,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01
4,0.0,b'tcp',b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00
125969,8.0,b'udp',b'private',b'SF',105.0,145.0,b'0',0.0,0.0,0.0,...,255.0,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00
125970,0.0,b'tcp',b'smtp',b'SF',2231.0,384.0,b'0',0.0,0.0,0.0,...,255.0,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00
125971,0.0,b'tcp',b'klogin',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00


In [4]:
# Need one embedding layer for each categorical variable
# Variables are protocol, service, flag, land, logged_in, is_host_login, is_guest_login

nom_vars = [1,2,3,6,11,20,21]

# prepare input data
def prepare_inputs(train_feat, test_feat):
    x_train_enc, x_test_enc = list(), list()
    for i, col in enumerate(train_feat):
        if i in nom_vars:
            # First, convert each nominal variable such that 
            #  each unique value of the variable maps to a number 
            #  from 0 to n_classes - 1
            le = LabelEncoder()
            train_enc = le.fit_transform(train_feat[col])
            test_enc = le.transform(test_feat.iloc[:,i])
            
            # Add these encoded layers to the lists to return
            x_train_enc.append(train_enc)
            x_test_enc.append(test_enc)

    return x_train_enc, x_test_enc

In [5]:
# prepare target
def prepare_targets(y_train, y_test):
    y_train = y_train.to_numpy()
    y_test = y_test.to_numpy()
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

In [6]:
# Create the embedding layers

x_train_enc, x_test_enc = prepare_inputs(train_feat, test_feat)
y_train_enc, y_test_enc = prepare_targets(train_labels, test_labels)
in_layers = list()
em_layers = list()

for i, feat in enumerate(x_train_enc):
        # Input layer
        in_layer = tf.keras.layers.Input(shape=(1,))


        # Calculate the number of classes in each encoded variable
        n_classes = len(np.unique(feat))

        # Embedding layer
        em_layer = tf.keras.layers.Embedding(n_classes, 8)(in_layer)
        em_layers.append(em_layer)

        in_layers.append(in_layer)
        


In [7]:
print(len(x_train_enc))
print(len(in_layers))

7
7


In [8]:
# Remove the necessary columns in train_feat and test_feat
train_feat.drop(train_feat.columns[nom_vars],axis=1,inplace=True)
test_feat.drop(test_feat.columns[nom_vars],axis=1,inplace=True)


np_train = train_feat.to_numpy().astype('float32')
np_test = test_feat.to_numpy().astype('float32')


In [9]:
x_train_normal, x_test_normal = list(), list()
for i, col in enumerate(train_feat):
        # Add these encoded layers to the lists to return
        x_train_normal.append(train_feat[col].values)
        x_test_normal.append(test_feat.iloc[:,i].values)

In [10]:
print(len(x_train_normal))

34


In [11]:
in_layers_normal = list()
for i, feat in enumerate(x_train_enc):
        # Input layer
        normal_input = tf.keras.layers.Input(shape=(1,))
        in_layers_normal.append(normal_input)

In [12]:
embedding = tf.keras.layers.concatenate(em_layers) # shape is 1 x 56 (7*8)
embedding = tf.keras.layers.BatchNormalization(center=False, scale=False)(embedding)

normal = tf.keras.layers.Dense(56)(normal_input)
normal = tf.keras.layers.Reshape((1,56))(normal)
normal = tf.keras.layers.BatchNormalization(center=False, scale=False)(normal)

#concatenated = tf.keras.layers.Concatenate(axis=1)([normal, embedding]) 
#lstm = tf.keras.layers.LSTM(100,activation='relu')(concatenated)


In [13]:
# Please note that all of the nominal values have now been embedded

model1 = tf.keras.models.Model(inputs=in_layers, outputs=embedding)
model2 = tf.keras.models.Model(inputs=in_layers_normal, outputs=normal)

mergedOut = tf.keras.layers.Add()([model1.output,model2.output])
mergedOut = tf.keras.layers.LSTM(100,activation='relu')(mergedOut)
#mergedOut = tf.keras.layers.LeakyRelu(alpha=0.1)(mergedOut) #LeakyRelu requires Tensorflow 2.6

mergedOut = tf.keras.layers.Dense(50,activation='relu')(mergedOut)
#mergedOut = tf.keras.layers.LeakyRelu(alpha=0.1)(mergedOut)

mergedOut = tf.keras.layers.Dropout(0.5)(mergedOut)

mergedOut = tf.keras.layers.Dense(10,activation='relu')(mergedOut)
#mergedOut = tf.keras.layers.LeakyRelu(alpha=0.1)(mergedOut)
mergedOut = tf.keras.layers.Dense(1, activation='sigmoid')(mergedOut)
# We have 1 as the output because we want a single value

model = tf.keras.models.Model([model1.input,model2.input], outputs=mergedOut)

# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit([x_train_enc,x_train_normal], y_train_enc, epochs=10, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate([x_test_enc,x_test_normal], y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))


Epoch 1/10
7874/7874 - 11s - loss: 0.1372 - accuracy: 0.9595
Epoch 2/10
7874/7874 - 9s - loss: 0.1235 - accuracy: 0.9623
Epoch 3/10
7874/7874 - 9s - loss: 0.1211 - accuracy: 0.9629
Epoch 4/10
7874/7874 - 9s - loss: 0.1213 - accuracy: 0.9628
Epoch 5/10
7874/7874 - 9s - loss: 0.1209 - accuracy: 0.9628
Epoch 6/10
7874/7874 - 10s - loss: 0.1206 - accuracy: 0.9628
Epoch 7/10
7874/7874 - 10s - loss: 0.1201 - accuracy: 0.9627
Epoch 8/10
7874/7874 - 9s - loss: 0.1200 - accuracy: 0.9629
Epoch 9/10
7874/7874 - 10s - loss: 0.1199 - accuracy: 0.9628
Epoch 10/10
7874/7874 - 10s - loss: 0.1199 - accuracy: 0.9630
Accuracy: 80.08
