In [1]:
# Models: modeling with different algorithms

# imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, auc, roc_auc_score, confusion_matrix

import utils

# global variable
TRAIN_TXT_PATH = "./data/KDDTrain+.txt"
METADATA_PATH = "./data/KDDTrain+.arff"
TEST_PATH = "./data/KDDTest+.txt"
TEST_EXC_21_PATH = "./data/KDDTest-21.txt"
SEED = 111
LABEL = "class"


In [2]:
# load and process data
df = pd.read_csv(TRAIN_TXT_PATH)

def pre_pre_process_data(df):
    df.columns = utils.get_col_names(METADATA_PATH)
    df = utils.convert_label_to_binary(df, LABEL)
    df = utils.get_numeric_cols(df)
    return df

df = pre_pre_process_data(df)
df



Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,level
0,0,146,0,0,0,0,0,0,0,0,...,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,0,15
1,0,0,0,0,0,0,0,0,0,0,...,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,1,19
2,0,232,8153,0,0,0,0,0,1,0,...,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,0,21
3,0,199,420,0,0,0,0,0,1,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,21
4,0,0,0,0,0,0,0,0,0,0,...,0.07,0.07,0.00,0.00,0.00,0.00,1.00,1.00,1,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125967,0,0,0,0,0,0,0,0,0,0,...,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,1,20
125968,8,105,145,0,0,0,0,0,0,0,...,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,0,21
125969,0,2231,384,0,0,0,0,0,1,0,...,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,0,18
125970,0,0,0,0,0,0,0,0,0,0,...,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,1,20


In [3]:
# X-y split (features & label) - avoid data-leakage
X = df.drop(columns=["class", "level"], axis=1)
FEATURES = X.columns

y = df[LABEL]

# train-val split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=SEED)
print(f"{X_train.shape=} {y_train.shape=} {X_val.shape=} {y_val.shape=}")
display(X_train)
display(y_train)


X_train.shape=(88180, 38) y_train.shape=(88180,) X_val.shape=(37792, 38) y_val.shape=(37792,)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
118060,0,1032,0,0,0,0,0,0,0,0,...,255,255,1.00,0.00,1.00,0.00,0.0,0.0,0.0,0.0
437,0,222,310,0,0,0,0,0,1,0,...,170,255,1.00,0.00,0.01,0.01,0.0,0.0,0.0,0.0
95153,0,293,1680,0,0,0,0,0,1,0,...,255,255,1.00,0.00,0.01,0.00,0.0,0.0,0.0,0.0
83297,0,0,0,0,0,0,0,0,0,0,...,255,4,0.02,0.05,0.00,0.00,1.0,1.0,0.0,0.0
85083,0,313,259,0,0,0,0,0,1,0,...,129,255,1.00,0.00,0.01,0.02,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105747,3122,146,105,0,0,0,0,0,0,0,...,255,1,0.00,0.66,0.96,0.00,0.0,0.0,0.0,0.0
102486,0,225,5304,0,0,0,0,0,1,0,...,38,255,1.00,0.00,0.03,0.01,0.0,0.0,0.0,0.0
4820,0,226,1110,0,0,0,0,0,1,0,...,41,255,1.00,0.00,0.02,0.07,0.0,0.0,0.0,0.0
10196,0,8766,0,0,0,0,0,0,1,0,...,133,48,0.32,0.04,0.32,0.04,0.0,0.0,0.0,0.0


118060    1
437       0
95153     0
83297     1
85083     0
         ..
105747    0
102486    0
4820      0
10196     0
77652     1
Name: class, Length: 88180, dtype: int64

In [4]:
# 1: model with naive bayes
from sklearn.naive_bayes import BernoulliNB
nb_model = BernoulliNB().fit(X_train, y_train)
y_val_pred = nb_model.predict(X_val)
print(classification_report(y_val, y_val_pred,  digits=4))
print("AUC: ", roc_auc_score(y_val, y_val_pred))

              precision    recall  f1-score   support

           0     0.8721    0.9344    0.9022     20182
           1     0.9181    0.8430    0.8789     17610

    accuracy                         0.8918     37792
   macro avg     0.8951    0.8887    0.8906     37792
weighted avg     0.8936    0.8918    0.8914     37792

AUC:  0.8886919633267983


In [5]:
# 2: model with logistic regression
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression().fit(X_train, y_train)
y_val_pred = lr_model.predict(X_val)
print(classification_report(y_val, y_val_pred,  digits=4))
print("AUC: ", roc_auc_score(y_val, y_val_pred))


              precision    recall  f1-score   support

           0     0.8793    0.8889    0.8840     20182
           1     0.8710    0.8601    0.8655     17610

    accuracy                         0.8755     37792
   macro avg     0.8751    0.8745    0.8748     37792
weighted avg     0.8754    0.8755    0.8754     37792

AUC:  0.8744988239051886


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# 3: model with KNN
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier().fit(X_train, y_train)
y_val_pred = knn_model.predict(X_val)
print(classification_report(y_val, y_val_pred,  digits=4))
print("AUC: ", roc_auc_score(y_val, y_val_pred))



              precision    recall  f1-score   support

           0     0.9962    0.9942    0.9952     20182
           1     0.9934    0.9957    0.9945     17610

    accuracy                         0.9949     37792
   macro avg     0.9948    0.9949    0.9949     37792
weighted avg     0.9949    0.9949    0.9949     37792

AUC:  0.9949435126155506


In [7]:
# 4: model with SVM
from sklearn.svm import SVC
svm_model = SVC().fit(X_train, y_train)
y_val_pred = svm_model.predict(X_val)
print(classification_report(y_val, y_val_pred,  digits=4))

#               precision    recall  f1-score   support

#            0     0.5343    0.9999    0.6965     20182
#            1     0.9231    0.0014    0.0027     17610

#     accuracy                         0.5346     37792
#    macro avg     0.7287    0.5006    0.3496     37792
# weighted avg     0.7155    0.5346    0.3732     37792


              precision    recall  f1-score   support

           0     0.5343    0.9999    0.6965     20182
           1     0.9231    0.0014    0.0027     17610

    accuracy                         0.5346     37792
   macro avg     0.7287    0.5006    0.3496     37792
weighted avg     0.7155    0.5346    0.3732     37792



In [8]:
# model with xgboost (ensemble)
from xgboost import XGBClassifier

# fit (train) model
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

# test model
y_val_pred = xgb_model.predict(X_val)

# evaluate model
print(classification_report(y_val, y_val_pred,  digits=4))
print("AUC: ", roc_auc_score(y_val, y_val_pred))


              precision    recall  f1-score   support

           0     0.9986    0.9992    0.9989     20182
           1     0.9991    0.9984    0.9988     17610

    accuracy                         0.9988     37792
   macro avg     0.9989    0.9988    0.9988     37792
weighted avg     0.9988    0.9988    0.9988     37792

AUC:  0.9988086043354143


In [9]:
# # model with ensemble majority voting
# from sklearn.ensemble import VotingClassifier

# ensemble_model = VotingClassifier(estimators=[
#     ('xgb', xgb_model),
#     ('lr', lr_model),
#     ('knn', knn_model),
# ], voting='soft')
# ensemble_model.fit(X_train, y_train)

# y_val_pred = ensemble_model.predict(X_val)
# print(classification_report(y_val, y_val_pred,  digits=4))
# print("AUC: ", roc_auc_score(y_val, y_val_pred))

In [10]:
# load test data
df_test = pd.read_csv(TEST_PATH)
display(df_test)
df_test = pre_pre_process_data(df_test)

X_test = df_test[FEATURES]
y_test = df_test[LABEL]

Unnamed: 0,0,tcp,private,REJ,0.1,0.2,0.3,0.4,0.5,0.6,...,0.04.1,0.06.1,0.00.3,0.00.4,0.00.5,0.00.6,1.00.2,1.00.3,neptune,21
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune,21
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00,normal,21
2,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00,saint,15
3,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.00,0.0,0.83,0.71,mscan,11
4,0,tcp,http,SF,267,14515,0,0,0,0,...,1.00,0.00,0.01,0.03,0.01,0.0,0.00,0.00,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22538,0,tcp,smtp,SF,794,333,0,0,0,0,...,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00,normal,21
22539,0,tcp,http,SF,317,938,0,0,0,0,...,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00,normal,21
22540,0,tcp,http,SF,54540,8314,0,0,0,2,...,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07,back,15
22541,0,udp,domain_u,SF,42,42,0,0,0,0,...,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00,normal,21


In [11]:
y_test.value_counts()

class
1    12832
0     9711
Name: count, dtype: int64

In [12]:
# evaluate models on the test set
print("Naive-Bayes MODEL:")
y_test_pred = nb_model.predict(X_test)
print("AUC: ", roc_auc_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, digits=4), end="---\n")

print("Logistic-Regression MODEL:")
y_test_pred = lr_model.predict(X_test)
print("AUC: ", roc_auc_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, digits=4), end="---\n")

print("KNN MODEL:")
y_test_pred = knn_model.predict(X_test)
print("AUC: ", roc_auc_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, digits=4), end="---\n")

print("SVM MODEL:")
y_test_pred = svm_model.predict(X_test)
print("AUC: ", roc_auc_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, digits=4), end="---\n")

print("XGB MODEL:")
y_test_pred = xgb_model.predict(X_test)
print("AUC: ", roc_auc_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, digits=4), end="---\n")

Naive-Bayes MODEL:
AUC:  0.7926630229274408
[[9440  271]
 [4963 7869]]
              precision    recall  f1-score   support

           0     0.6554    0.9721    0.7829      9711
           1     0.9667    0.6132    0.7504     12832

    accuracy                         0.7678     22543
   macro avg     0.8111    0.7927    0.7667     22543
weighted avg     0.8326    0.7678    0.7644     22543
---
Logistic-Regression MODEL:
AUC:  0.7264582540469442
[[8749  962]
 [5749 7083]]
              precision    recall  f1-score   support

           0     0.6035    0.9009    0.7228      9711
           1     0.8804    0.5520    0.6785     12832

    accuracy                         0.7023     22543
   macro avg     0.7419    0.7265    0.7007     22543
weighted avg     0.7611    0.7023    0.6976     22543
---
KNN MODEL:
AUC:  0.7931459195693189
[[9435  276]
 [4944 7888]]
              precision    recall  f1-score   support

           0     0.6562    0.9716    0.7833      9711
           1     0