### Loading Required Libraries & Functions

In [1]:
# Install TensorFlow with pip. No version is pinned here, so the version that
# gets installed is whatever pip resolves at run time.
# NOTE(review): TensorFlow is not imported in the visible part of this notebook - confirm it is needed.
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Importing useful libraries
import numpy as np
import pandas as pd
import pickle
import itertools
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score

In [3]:
# Defining util Functions
def conv_array(df):
    """Split a labelled frame into features, raw labels and integer labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Class' column; every other column is treated
        as a feature.

    Returns
    -------
    x : numpy.ndarray
        Values of all columns except 'Class'.
    y : array-like
        The raw 'Class' values (``df['Class'].values``).
    y0 : numpy.ndarray of int8
        Integer codes: normal=0, dos=1, r2l=2, u2r=3, probe=4.
    """
    # Keyword form: the positional ``axis`` argument of DataFrame.drop is not
    # accepted by pandas 2.x.
    x = df.drop(columns='Class').values
    y = df['Class'].values

    # Codes start out as 1, so a label outside the five known names keeps
    # code 1 ('dos').
    # NOTE(review): confirm this silent fallback is intended.
    y0 = np.ones(len(y), np.int8)
    for code, label in enumerate(('normal', 'dos', 'r2l', 'u2r', 'probe')):
        y0[y == label] = code
    return x, y, y0

# Function for saving trained models
def save_model(model, filename="model.sav"):
    """Pickle ``model`` to ``filename`` and print where it was written.

    Parameters
    ----------
    model : object
        Any picklable object (here: a fitted estimator).
    filename : str, optional
        Destination path; defaults to "model.sav".
    """
    # The context manager closes (and therefore flushes) the file even if
    # pickling raises part-way through.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    print("Model has been saved at: ", filename)

### Loading and Cleaning Dataset

In [4]:
# Download the NSL-KDD training and test sets to local disk.
# "%2B" in the URLs is the URL-encoded "+" of the upstream file names
# (KDDTrain+.csv / KDDTest+.csv); -O stores them locally without the "+".
!wget "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B.csv" -O "KDDTrain.csv"
!wget 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest%2B.csv' -O 'KDDTest.csv'

--2023-06-02 12:21:26--  https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14612857 (14M) [text/plain]
Saving to: ‘KDDTrain.csv’


2023-06-02 12:21:28 (208 MB/s) - ‘KDDTrain.csv’ saved [14612857/14612857]

--2023-06-02 12:21:28--  https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest%2B.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2639177 (2.5M) [text/plain]
Saving to: ‘KDDTest.csv’


2023-06-02 12:21:28 (185 MB/s) - ‘KDDTest.csv’ saved [263

In [5]:
# Load both downloaded CSV files into pandas DataFrames.

training_set_path = "KDDTrain.csv"
test_set_path = "KDDTest.csv"

# header=None: the first line of each file is read as data, not as column names.
training_df, testing_df = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (training_set_path, test_set_path)
)

train_rows, test_rows = len(training_df), len(testing_df)
print("Training set has {} rows.".format(train_rows))
print("Testing set has {} rows.".format(test_rows))

Training set has 125973 rows.
Testing set has 22543 rows.


In [6]:
# Assign column names to both frames (the CSVs were read with header=None).
# The list holds 43 names: 41 feature columns followed by 'outcome'
# (the specific attack name or "normal") and 'difficulty'.

columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'outcome', 'difficulty']
training_df.columns = columns
testing_df.columns = columns

In [7]:
# Lists of the specific attack names that belong to each general attack type
dos_attacks=["snmpgetattack","back","land","neptune","smurf","teardrop","pod","apache2","udpstorm","processtable","mailbomb"]
r2l_attacks=["snmpguess","worm","httptunnel","named","xlock","xsnoop","sendmail","ftp_write","guess_passwd","imap","multihop","phf","spy","warezclient","warezmaster"]
u2r_attacks=["sqlattack","buffer_overflow","loadmodule","perl","rootkit","xterm","ps"]
probe_attacks=["ipsweep","nmap","portsweep","satan","saint","mscan"]

# Helper function mapping a sample's specific outcome to one of 5 classes
def label_attack (row):
    """Return the general class ("dos", "r2l", "u2r", "probe" or "normal") of a row.

    ``row`` must support ``row["outcome"]``. An outcome that appears in none
    of the four attack lists is labelled "normal".
    """
    outcome = row["outcome"]
    # Checked in the same order as the attack lists are declared above.
    for members, category in (
        (dos_attacks, "dos"),
        (r2l_attacks, "r2l"),
        (u2r_attacks, "u2r"),
        (probe_attacks, "probe"),
    ):
        if outcome in members:
            return category
    return "normal"


# Combine both splits temporarily so the 5-class label is derived in one pass.
train_samples_length = len(training_df)
test_samples_length = len(testing_df)
df = pd.concat([training_df, testing_df])
df["Class"] = df.apply(label_attack, axis=1)

# "outcome" has been replaced by "Class"; "difficulty" is dropped as well.
df = df.drop(columns=["outcome", "difficulty"])

# Split positionally back into the original partitions. Slicing from the
# training length (rather than with a negative test length) stays correct
# even when the test split is empty. The explicit copies make both frames
# independent of `df`, because later cells add and overwrite their columns
# in place.
training_df = df.iloc[:train_samples_length].copy()
testing_df = df.iloc[train_samples_length:].copy()

In [8]:
# Preview the first rows of the labelled training set
# (feature columns plus the derived "Class" column).
training_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [9]:
# Helper function for scaling continuous values
def minmax_scale_values(training_df,testing_df, col_name):
    """Min-max scale one column of both frames in place.

    The scaler is fitted on the training column only and then applied to
    both frames, so the test column is transformed with the minimum and
    maximum learned from the training data. Nothing is returned; both
    frames are modified directly.
    """
    def column_matrix(frame):
        # The scaler is given a 2-D array: one row per sample, one feature.
        return frame[col_name].values.reshape(-1, 1)

    fitted_scaler = MinMaxScaler().fit(column_matrix(training_df))
    for frame in (training_df, testing_df):
        frame[col_name] = fitted_scaler.transform(column_matrix(frame))
    
    
# Helper function for one-hot encoding
def encode_text(training_df,testing_df, name):
    """One-hot encode column ``name`` of both frames in place.

    One indicator column ``"<name>_<category>"`` is added per category found
    in the training frame. A category missing from the test frame produces an
    all-zero test column; a category that occurs only in the test frame gets
    no column. The original column is dropped from both frames afterwards.
    Nothing is returned.
    """
    # Pin one integer dtype for every indicator column. Without it the dtype
    # depends on the pandas version (bool on pandas 2.x), and the zero-filled
    # columns below would be float64, leaving the frames with mixed dtypes.
    dummy_dtype = np.uint8
    training_set_dummies = pd.get_dummies(training_df[name], dtype=dummy_dtype)
    testing_set_dummies = pd.get_dummies(testing_df[name], dtype=dummy_dtype)
    for x in training_set_dummies.columns:
        dummy_name = "{}_{}".format(name, x)
        training_df[dummy_name] = training_set_dummies[x]
        if x in testing_set_dummies.columns:
            testing_df[dummy_name] = testing_set_dummies[x]
        else:
            # Category never occurs in the test frame.
            testing_df[dummy_name] = np.zeros(len(testing_df), dtype=dummy_dtype)
    training_df.drop(columns=name, inplace=True)
    testing_df.drop(columns=name, inplace=True)
    
    
# Symbolic (categorical) columns are one-hot encoded, every other feature
# column is min-max scaled, and the label column is left untouched.
sympolic_columns=["protocol_type","service","flag"]
label_column="Class"
for column in df.columns:
    if column in sympolic_columns:
        encode_text(training_df,testing_df,column)
        continue
    if column != label_column:
        minmax_scale_values(training_df,testing_df, column)

In [10]:
# Preview the training set after one-hot encoding the symbolic columns
# and min-max scaling the remaining feature columns.
training_df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,3.558064e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0.0,1.057999e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0.0,1.681203e-07,6.223962e-06,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,0.0,1.442067e-07,3.20626e-07,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Persist the preprocessed test frame to disk as a pickle file.
testing_df.to_pickle("./testing_df.pkl")

In [12]:
# Read the pickled test frame back from disk.
# NOTE(review): unpickled_df is not referenced again in the visible part of the notebook - confirm it is still needed.
unpickled_df = pd.read_pickle("./testing_df.pkl")

In [13]:
# Number of training samples per class.
training_df.Class.value_counts()

normal    67343
dos       45927
probe     11656
r2l         995
u2r          52
Name: Class, dtype: int64

In [14]:
# Creating final dataset
# conv_array returns (feature values, raw string labels, integer label codes)
# for each split; the prints show the first sample's label in both forms.

x_train, y_train, y0_train = conv_array(training_df)
print(y_train[0], y0_train[0])

x_test,y_test,y0_test = conv_array(testing_df)
print(y_test[0], y0_test[0])

normal 0
dos 1


## Training Part starts from here

### Random Forest Model

In [15]:
# Loading the model
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the fitted forest, and every metric computed
# from it, reproducible across runs.
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Training the model on the integer class codes
random_forest_model.fit(x_train, y0_train)
print("Model has been trained.")

Model has been trained.


In [16]:
# Integer class code of test sample 10 (conv_array assigns 4 to 'probe').
y0_test[10]

4

In [17]:
# Class-probability estimates for test sample 10. reshape(1, -1) builds the
# single-row 2-D input without hard-coding the number of feature columns.
random_forest_model.predict_proba(x_test[10].reshape(1, -1))

array([[0.91654762, 0.03      , 0.02      , 0.        , 0.03345238]])

In [21]:
# Using model for predictions

# Display names keyed by the integer codes that conv_array assigns
# (0=normal, 1=dos, 2=r2l, 3=u2r, 4=probe). The mapping is stored under its
# own name so that the builtin `dict` is not shadowed.
label_names = {0:"Normal    ", 1:"dos ", 2:"r2l", 3:"u2r", 4:"probe"}

y_pred = random_forest_model.predict(x_test)
print("Prediction | Expected")
print("----------------------")
# Show at most the first ten predictions next to the true string labels.
for predicted_code, expected_label in zip(y_pred[:10], y_test[:10]):
    print(label_names[predicted_code],"|",expected_label)

Prediction | Expected
----------------------
dos  | dos
dos  | dos
Normal     | normal
probe | probe
Normal     | probe
Normal     | normal
Normal     | normal
Normal     | r2l
Normal     | normal
Normal     | r2l


In [19]:
# Analysing the model's predictions
# score() is the mean accuracy on the test set, i.e. the same number as
# accuracy_score below.
result = random_forest_model.score(x_test, y0_test)
print(result)

# NOTE(review): with average='micro' on this single-label multiclass task,
# recall, precision and F1 all coincide with accuracy (the recorded output
# shows four identical values). Consider average='macro' or per-class scores
# to see performance on the rare classes.
accuracy=accuracy_score(y0_test,y_pred)
recall=recall_score(y0_test,y_pred,average='micro')
precision=precision_score(y0_test,y_pred,average='micro')
f1=f1_score(y0_test,y_pred,average='micro')
print("Performance over the testing data set \n")
print("Accuracy : {} , Recall : {} , Precision : {} , F1 : {}\n".format(accuracy,recall,precision,f1 ))

0.7538925608836445
Performance over the testing data set 

Accuracy : 0.7538925608836445 , Recall : 0.7538925608836445 , Precision : 0.7538925608836445 , F1 : 0.7538925608836445



In [22]:
# Pickle the trained random forest to disk via the save_model helper.
save_model(random_forest_model, "random_forest_model.sav")

Model has been saved at:  random_forest_model.sav


### Support Vector Machine Model

In [23]:
from sklearn.svm import SVC
# Loading the model
# probability=True is required for predict_proba. The probability estimates
# involve randomised internal data shuffling, so random_state is fixed to
# keep them reproducible.
svm_model = SVC(kernel='linear', probability=True, random_state=42)

In [24]:
# Training the model on the integer class codes
svm_model.fit(x_train, y0_train)
print("Model has been trained.")

# Class-probability estimates for test sample 10 (its true code is
# y0_test[10]). reshape(1, -1) builds the single-row 2-D input without
# hard-coding the number of feature columns.
svm_model.predict_proba(x_test[10].reshape(1, -1))

Model has been trained.


array([[9.92690316e-01, 1.21711377e-03, 9.66368123e-04, 4.03513879e-03,
        1.09106318e-03]])

In [28]:
# Using model for predictions

# Display names keyed by the integer codes that conv_array assigns
# (0=normal, 1=dos, 2=r2l, 3=u2r, 4=probe). The mapping is stored under its
# own name so that the builtin `dict` is not shadowed.
label_names = {0: "Normal    ", 1: "dos ", 2: "r2l", 3: "u2r", 4: "probe"}

y_pred = svm_model.predict(x_test)
print("Prediction | Expected")
print("----------------------")
# Show at most the first ten predictions next to the true string labels.
for predicted_code, expected_label in zip(y_pred[:10], y_test[:10]):
    print(label_names[predicted_code], "|", expected_label)

Prediction | Expected
----------------------
dos  | dos
dos  | dos
Normal     | normal
probe | probe
Normal     | probe
Normal     | normal
Normal     | normal
Normal     | r2l
Normal     | normal
Normal     | r2l


In [29]:
# Analysing the model's predictions
# score() is the mean accuracy on the test set, i.e. the same number as
# accuracy_score below.
result = svm_model.score(x_test, y0_test)
print(result)

# NOTE(review): with average='micro' on this single-label multiclass task,
# recall, precision and F1 all coincide with accuracy (the recorded output
# shows the same value up to floating-point rounding). Consider
# average='macro' or per-class scores to see performance on the rare classes.
accuracy = accuracy_score(y0_test, y_pred)
recall = recall_score(y0_test, y_pred, average='micro')
precision = precision_score(y0_test, y_pred, average='micro')
f1 = f1_score(y0_test, y_pred, average='micro')
print("Performance over the testing data set \n")
print("Accuracy : {}, Recall : {}, Precision : {}, F1 : {}\n".format(accuracy, recall, precision, f1))

0.7712371911458102
Performance over the testing data set 

Accuracy : 0.7712371911458102, Recall : 0.7712371911458102, Precision : 0.7712371911458102, F1 : 0.7712371911458104



In [30]:
# Pickle the trained linear SVM to disk via the save_model helper.
save_model(svm_model, "Linear_SVM_model.sav")

Model has been saved at:  Linear_SVM_model.sav
