In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
warnings.filterwarnings("ignore")
import itertools
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
from keras.utils.data_utils import get_file

In [None]:
# Downloading training and test sets to local drive
try:
    training_set_path = get_file('KDDTrain%2B.csv', origin='https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B.csv')
except:
    print('Error downloading')
    raise


try:
    test_set_path = get_file('KDDTest%2B.csv', origin='https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest%2B.csv')
except:
    print('Error downloading')
    raise
training_df = pd.read_csv(training_set_path, header=None)
testing_df = pd.read_csv(test_set_path, header=None)




Downloading data from https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B.csv
Downloading data from https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest%2B.csv


In [None]:
training_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [None]:
testing_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint,15
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan,11


In [None]:
 columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome',
    'difficulty'
]
training_df.columns = columns
testing_df.columns = columns

In [None]:
print("Training set has {} rows.".format(len(training_df)))
print("Testing set has {} rows.".format(len(testing_df)))

Training set has 125973 rows.
Testing set has 22543 rows.


In [None]:
training_outcomes=training_df["outcome"].unique()
testing_outcomes=testing_df["outcome"].unique()
print("The training set has {} possible outcomes \n".format(len(training_outcomes)) )
print(", ".join(training_outcomes)+".")
print("\nThe testing set has {} possible outcomes \n".format(len(testing_outcomes)))
print(", ".join(testing_outcomes)+".")

The training set has 23 possible outcomes 

normal, neptune, warezclient, ipsweep, portsweep, teardrop, nmap, satan, smurf, pod, back, guess_passwd, ftp_write, multihop, rootkit, buffer_overflow, imap, warezmaster, phf, land, loadmodule, spy, perl.

The testing set has 38 possible outcomes 

neptune, normal, saint, mscan, guess_passwd, smurf, apache2, satan, buffer_overflow, back, warezmaster, snmpgetattack, processtable, pod, httptunnel, nmap, ps, snmpguess, ipsweep, mailbomb, portsweep, multihop, named, sendmail, loadmodule, xterm, worm, teardrop, rootkit, xlock, perl, land, xsnoop, sqlattack, ftp_write, imap, udpstorm, phf.


In [None]:
# A list ot attack names that belong to each general attack type
dos_attacks=["snmpgetattack","back","land","neptune","smurf","teardrop","pod","apache2","udpstorm","processtable","mailbomb"]
r2l_attacks=["snmpguess","worm","httptunnel","named","xlock","xsnoop","sendmail","ftp_write","guess_passwd","imap","multihop","phf","spy","warezclient","warezmaster"]
u2r_attacks=["sqlattack","buffer_overflow","loadmodule","perl","rootkit","xterm","ps"]
probe_attacks=["ipsweep","nmap","portsweep","satan","saint","mscan"]

# Our new labels
classes=["Normal","Dos","R2L","U2R","Probe"]

#Helper function to label samples to 5 classes
def label_attack (row):
    if row["outcome"] in dos_attacks:
        return classes[1]
    if row["outcome"] in r2l_attacks:
        return classes[2]
    if row["outcome"] in u2r_attacks:
        return classes[3]
    if row["outcome"] in probe_attacks:
        return classes[4]
    return classes[0]


#We combine the datasets temporarily to do the labeling
test_samples_length = len(testing_df)
df=pd.concat([training_df,testing_df])
df["Class"]=df.apply(label_attack,axis=1)


# The old outcome field is dropped since it was replaced with the Class field, the difficulty field will be dropped as well.
df=df.drop("outcome",axis=1)
df=df.drop("difficulty",axis=1)

# we again split the data into training and test sets.
type_testing_df = testing_df
training_df= df.iloc[:-test_samples_length, :]
testing_df= df.iloc[-test_samples_length:,:]

In [None]:
training_outcomes=training_df["Class"].unique()
testing_outcomes=testing_df["Class"].unique()
print("The training set has {} possible outcomes \n".format(len(training_outcomes)) )
print(", ".join(training_outcomes)+".")
print("\nThe testing set has {} possible outcomes \n".format(len(testing_outcomes)))
print(", ".join(testing_outcomes)+".")

The training set has 5 possible outcomes 

Normal, Dos, R2L, Probe, U2R.

The testing set has 5 possible outcomes 

Dos, Normal, Probe, R2L, U2R.


In [None]:
# Helper function for scaling continous values
def minmax_scale_values(training_df,testing_df, col_name):
    scaler = MinMaxScaler()
    scaler = scaler.fit(training_df[col_name].values.reshape(-1, 1))
    train_values_standardized = scaler.transform(training_df[col_name].values.reshape(-1, 1))
    training_df[col_name] = train_values_standardized
    test_values_standardized = scaler.transform(testing_df[col_name].values.reshape(-1, 1))
    testing_df[col_name] = test_values_standardized


#Helper function for one hot encoding
def encode_text(training_df,testing_df, name):
    training_set_dummies = pd.get_dummies(training_df[name])
    testing_set_dummies = pd.get_dummies(testing_df[name])
    for x in training_set_dummies.columns:
        dummy_name = "{}_{}".format(name, x)
        training_df[dummy_name] = training_set_dummies[x]
        if x in testing_set_dummies.columns :
            testing_df[dummy_name]=testing_set_dummies[x]
        else :
            testing_df[dummy_name]=np.zeros(len(testing_df))
    training_df.drop(name, axis=1, inplace=True)
    testing_df.drop(name, axis=1, inplace=True)


sympolic_columns=["protocol_type","service","flag"]
label_column="Class"
for column in df.columns :
    if column in sympolic_columns:
        encode_text(training_df,testing_df,column)
    elif not column == label_column:
        minmax_scale_values(training_df,testing_df, column)

In [None]:
training_df.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,3.558064e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0.0,1.057999e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0.0,1.681203e-07,6.223962e-06,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,0.0,1.442067e-07,3.20626e-07,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
training_df.columns

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       ...
       'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0',
       'flag_S1', 'flag_S2', 'flag_S3', 'flag_SF', 'flag_SH'],
      dtype='object', length=123)

In [None]:
# Assuming 'data' is the DataFrame
column_names = training_df.columns.tolist()

# Display all column names
for name in column_names:
    print(name)
np.save('/content/drive/MyDrive/Colab Notebooks/xNIDS/Data/kdd_after_features.npy', column_names)

duration
src_bytes
dst_bytes
land
wrong_fragment
urgent
hot
num_failed_logins
logged_in
num_compromised
root_shell
su_attempted
num_root
num_file_creations
num_shells
num_access_files
num_outbound_cmds
is_host_login
is_guest_login
count
srv_count
serror_rate
srv_serror_rate
rerror_rate
srv_rerror_rate
same_srv_rate
diff_srv_rate
srv_diff_host_rate
dst_host_count
dst_host_srv_count
dst_host_same_srv_rate
dst_host_diff_srv_rate
dst_host_same_src_port_rate
dst_host_srv_diff_host_rate
dst_host_serror_rate
dst_host_srv_serror_rate
dst_host_rerror_rate
dst_host_srv_rerror_rate
Class
protocol_type_icmp
protocol_type_tcp
protocol_type_udp
service_IRC
service_X11
service_Z39_50
service_aol
service_auth
service_bgp
service_courier
service_csnet_ns
service_ctf
service_daytime
service_discard
service_domain
service_domain_u
service_echo
service_eco_i
service_ecr_i
service_efs
service_exec
service_finger
service_ftp
service_ftp_data
service_gopher
service_harvest
service_hostnames
service_http
serv

In [None]:
x,y=training_df,training_df.pop("Class").values
x=x.values
x_test,y_test=testing_df,testing_df.pop("Class").values
x_test=x_test.values
y0=np.ones(len(y),np.int8)
y0[np.where(y==classes[0])]=0
y0_test=np.ones(len(y_test),np.int8)
y0_test[np.where(y_test==classes[0])]=0
input_shape = x.shape[1]

In [None]:
x_test.shape[0]

22543

In [None]:
x_test.shape[-1]

122

In [None]:

from tensorflow import keras
from keras.layers import LSTM, Input, Dense, Dropout
from keras.models import Model, Sequential

# Reshape the training and test data
x_train = np.reshape(x, (x.shape[0], 1, x.shape[-1]))
x_test = np.reshape(x_test, (x_test.shape[0], 1, x_test.shape[-1]))

# LSTM requirements
lst = Sequential()

# Input layer and LSTM layer with 50 neurons
lst.add(LSTM(50, batch_input_shape=(1, 1, x.shape[-1]), stateful=True, return_sequences=True))
lst.add(Dropout(0.2))  # Dropout layer with 20% dropout rate

# Additional LSTM layer with 50 neurons
lst.add(LSTM(10))
lst.add(Dropout(0.2))  # Dropout layer with 20% dropout rate
# Output layer with sigmoid activation
lst.add(Dense(1, activation='sigmoid'))

lst.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lst.summary()

# Training the model with stateful LSTM
for epoch in range(2):
    history = lst.fit(x_train, y0, epochs=1, batch_size=1, validation_split=0.2, shuffle=False)
    lst.reset_states()

test_results = lst.evaluate(x_test, y0_test, batch_size=1, verbose=1)
print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1] * 100}%')

# Save the model
model_path = "/content/drive/MyDrive/Colab Notebooks/xNIDS/Models/lstm_history_model.h5"
lst.save(model_path)
print("Model saved.")

# Load the model
new_model = keras.models.load_model(model_path)
new_model.summary()
print("Model loaded.")

import matplotlib.pyplot as plt

# Plot accuracy vs epoch of train and test dataset
#plt.plot(history.history['accuracy'])
#plt.plot(history.history['val_accuracy'])
#plt.title("Plot of accuracy vs epoch for train and test dataset")
#plt.ylabel('accuracy')
#plt.xlabel('epoch')
#plt.legend(['train', 'test'], loc='best')
#plt.show()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (1, 1, 50)                34600     
                                                                 
 dropout (Dropout)           (1, 1, 50)                0         
                                                                 
 lstm_1 (LSTM)               (1, 10)                   2440      
                                                                 
 dropout_1 (Dropout)         (1, 10)                   0         
                                                                 
 dense (Dense)               (1, 1)                    11        
                                                                 
Total params: 37,051
Trainable params: 37,051
Non-trainable params: 0
_________________________________________________________________
Test results - Loss: 1.213534951210022 - Accuracy: 7

In [None]:
# Prepare data for explanations.
# Make predictions
predicted_probabilities = lst.predict(x_test,batch_size=1)
predicted_labels = (predicted_probabilities >= 0.5).astype(int)

rnn_false_positives = []  # Store indices of false positives
rnn_false_negatives = []  # Store indices of false negatives
for i in range(len(predicted_labels)):
    if predicted_labels[i][0] != y0_test[i]:
        if predicted_labels[i][0] == 1:  # False positive
            rnn_false_positives.append(i)
        else:
           rnn_false_negatives.append(i)



In [None]:
rnn_false_positives

[18,
 29,
 33,
 222,
 230,
 237,
 275,
 295,
 304,
 312,
 331,
 337,
 358,
 366,
 399,
 412,
 427,
 433,
 453,
 469,
 476,
 477,
 495,
 555,
 569,
 571,
 594,
 640,
 645,
 673,
 676,
 720,
 721,
 722,
 753,
 837,
 869,
 890,
 942,
 1017,
 1069,
 1091,
 1111,
 1128,
 1183,
 1199,
 1216,
 1226,
 1261,
 1328,
 1397,
 1434,
 1472,
 1476,
 1484,
 1501,
 1526,
 1580,
 1622,
 1626,
 1730,
 1777,
 1899,
 1929,
 1946,
 2024,
 2052,
 2058,
 2068,
 2075,
 2093,
 2136,
 2164,
 2227,
 2267,
 2273,
 2293,
 2313,
 2315,
 2375,
 2387,
 2466,
 2480,
 2488,
 2489,
 2517,
 2518,
 2567,
 2587,
 2666,
 2688,
 2761,
 2775,
 2817,
 2863,
 2865,
 2868,
 2937,
 2938,
 2943,
 2960,
 2967,
 3017,
 3074,
 3125,
 3176,
 3201,
 3247,
 3317,
 3320,
 3363,
 3411,
 3472,
 3486,
 3494,
 3497,
 3524,
 3551,
 3673,
 3691,
 3747,
 3827,
 3861,
 3863,
 3882,
 3894,
 3923,
 3971,
 4048,
 4082,
 4085,
 4115,
 4195,
 4305,
 4355,
 4405,
 4417,
 4423,
 4468,
 4499,
 4575,
 4611,
 4625,
 4644,
 4711,
 4808,
 4822,
 4854,
 4863,

In [None]:
#rnn_false_negatives

In [None]:
kdd_selected_fp_rows = type_testing_df.loc[19000:19114]
kdd_selected_fp_rows_122 = pd.DataFrame(x_test[19000:19115].reshape(115,122))
kdd_selected_fp_rows.to_csv('/content/drive/MyDrive/Colab Notebooks/xNIDS/Data/kdd_history_selected_fp_rows.csv', index=True)
kdd_selected_fp_rows_122.to_csv('/content/drive/MyDrive/Colab Notebooks/xNIDS/Data/kdd_history_selected_fp_rows_122.csv', index=True)
# 19114

In [None]:
kdd_selected_fp_rows

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome,difficulty
19000,0,tcp,http,SF,142,6103,0,0,0,0,...,1.00,0.00,0.05,0.04,0.00,0.00,0.00,0.00,normal,21
19001,0,udp,private,SF,47,44,0,0,0,0,...,1.00,0.00,0.57,0.00,0.00,0.00,0.00,0.00,normal,14
19002,0,tcp,http,SF,159,601,0,0,0,0,...,1.00,0.00,0.02,0.02,0.00,0.00,0.00,0.00,normal,21
19003,0,tcp,http,S0,0,0,0,0,0,0,...,0.96,0.01,0.00,0.00,0.20,0.21,0.56,0.59,apache2,15
19004,0,tcp,other,REJ,0,0,0,0,0,0,...,0.00,1.00,0.00,0.00,0.13,0.00,0.87,1.00,saint,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19110,0,tcp,http,SF,170,603,0,0,0,0,...,1.00,0.00,1.00,0.02,0.00,0.00,0.00,0.00,normal,21
19111,0,tcp,http,SF,324,1227,0,0,0,0,...,1.00,0.00,0.01,0.00,0.00,0.00,0.00,0.00,normal,21
19112,0,tcp,http,SF,308,825,0,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21
19113,0,tcp,http,SF,243,5333,0,0,0,0,...,1.00,0.00,0.01,0.05,0.00,0.00,0.00,0.00,normal,21


In [None]:
predicted_probabilities[19114]

array([0.92638266], dtype=float32)

In [None]:
idx = 19114

In [None]:
y0_test[idx]

0

In [None]:
lst.predict(x_test[idx].reshape(1,1,-1))



array([[0.9903234]], dtype=float32)

In [None]:
lst.predict(x_test[idx-1:idx+1],batch_size=1)



array([[0.00105977],
       [0.97901255]], dtype=float32)

In [None]:
lst.predict(x_test[idx-3:idx+1],batch_size=1)



array([[0.01316929],
       [0.00207315],
       [0.00095898],
       [0.9317424 ]], dtype=float32)

In [None]:
lst.predict(x_test[idx-7:idx+1],batch_size=1)



array([[9.9961805e-01],
       [2.6946910e-02],
       [9.9867439e-01],
       [1.8075944e-03],
       [3.7709917e-03],
       [1.8523487e-03],
       [9.6929516e-04],
       [9.2648917e-01]], dtype=float32)

In [None]:
lst.predict(x_test[idx-15:idx+1],batch_size=1)



array([[9.9962002e-01],
       [9.9953932e-01],
       [1.7771536e-03],
       [4.7203647e-03],
       [9.2692074e-04],
       [9.9185260e-04],
       [9.9522946e-04],
       [2.5704914e-01],
       [9.9961978e-01],
       [2.7482037e-02],
       [9.9868304e-01],
       [1.8075197e-03],
       [3.7704094e-03],
       [1.8532976e-03],
       [9.6933905e-04],
       [9.2638475e-01]], dtype=float32)

In [None]:
#kdd_selected_fp_rows