In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
warnings.filterwarnings('ignore')

In [2]:
# Root data directory plus the train/test sub-directory fragments; the loading cells
# build file paths by concatenating these strings with a file name.
DATA_PATH = './data/'
TRAIN_PATH = 'train/'
TEST_PATH = 'test/'

In [3]:
# File name of the feature-description table; it is read from the training directory.
NB15_FEATURES = 'UNSW-NB15_features-modified.csv'

In [4]:
# Load the feature-description table that sits next to the training files and display it.
metadata = pd.read_csv(f"{DATA_PATH}{TRAIN_PATH}{NB15_FEATURES}")

metadata

Unnamed: 0,No.,Name,Type,Description,feature_type
0,5,proto,nominal,Transaction protocol,flow
1,6,state,nominal,Indicates to the state and its dependent proto...,basic
2,7,dur,Float,Record total duration,basic
3,8,sbytes,Integer,Source to destination transaction bytes,basic
4,9,dbytes,Integer,Destination to source transaction bytes,basic
5,10,sttl,Integer,Source to destination time to live value,basic
6,11,dttl,Integer,Destination to source time to live value,basic
7,12,sloss,Integer,Source packets retransmitted or dropped,basic
8,13,dloss,Integer,Destination packets retransmitted or dropped,basic
9,14,service,nominal,"http, ftp, smtp, ssh, dns, ftp-data ,irc and ...",basic


In [5]:
# File names of the training CSVs (one per feature group, plus the labels file).
# They are loaded from DATA_PATH + TRAIN_PATH and joined on the 'id' column.
ADDITIONAL_FEATURES_TRAIN = 'additional_features_train.csv'
BASIC_FEATURES_TRAIN = 'basic_features_train.csv'
CONTENT_FEATURES_TRAIN = 'content_features_train.csv'
FLOW_FEATURES_TRAIN = 'flow_features_train.csv'
LABELS_TRAIN = 'labels_train.csv'
TIME_FEATURES_TRAIN = 'time_features_train.csv'

In [6]:
# Training files in load order. The first entry becomes the base frame and the
# remaining ones are merged onto it in this order.
dataset_paths_train = [
    ADDITIONAL_FEATURES_TRAIN,
    BASIC_FEATURES_TRAIN,
    CONTENT_FEATURES_TRAIN,
    FLOW_FEATURES_TRAIN,
    LABELS_TRAIN,
    TIME_FEATURES_TRAIN
]

In [7]:
# Read every training CSV, keeping the order of dataset_paths_train.
all_data_train = [
    pd.read_csv(f"{DATA_PATH}{TRAIN_PATH}{dp}")
    for dp in dataset_paths_train
]

# Start from the first frame and join each remaining frame onto it via the 'id' column.
df_train = all_data_train[0]

for df_i in all_data_train[1:]:
    df_train = pd.merge(left=df_train, right=df_i, on='id')

# 'label' is removed here; the notebook uses 'attack_cat' as the target further down.
df_train.drop(columns=["label"], inplace=True)



In [8]:
# File names of the test CSVs (one per feature group). No labels file is listed for test.
ADDITIONAL_FEATURES_TEST = 'additional_features_test.csv'
BASIC_FEATURES_TEST = 'basic_features_test.csv'
CONTENT_FEATURES_TEST = 'content_features_test.csv'
FLOW_FEATURES_TEST = 'flow_features_test.csv'
TIME_FEATURES_TEST = 'time_features_test.csv'

In [9]:
# Test files in load order. The first entry becomes the base frame and the
# remaining ones are merged onto it in this order.
dataset_paths_test = [
    ADDITIONAL_FEATURES_TEST,
    BASIC_FEATURES_TEST,
    CONTENT_FEATURES_TEST,
    FLOW_FEATURES_TEST,
    TIME_FEATURES_TEST
]

In [10]:
# Read every test CSV, keeping the order of dataset_paths_test.
all_data_test = [
    pd.read_csv(f"{DATA_PATH}{TEST_PATH}{dp}")
    for dp in dataset_paths_test
]

# Start from the first frame and join each remaining frame onto it via the 'id' column.
df_test = all_data_test[0]

for df_i in all_data_test[1:]:
    df_test = pd.merge(left=df_test, right=df_i, on='id')

df_test.shape

(20583, 42)

In [11]:
# Column inventory of the merged training frame (still contains 'id' and 'attack_cat').
print(df_train.columns.tolist())

['is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'id', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'proto', 'attack_cat', 'sjit', 'djit', 'sinpkt', 'dinpkt', 'tcprtt', 'synack', 'ackdat']


In [12]:
# Column inventory of the merged test frame, for comparison with the training columns.
print(df_test.columns.tolist())

['is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'id', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'proto', 'sjit', 'djit', 'sinpkt', 'dinpkt', 'tcprtt', 'synack', 'ackdat']


In [13]:
# Columns removed from the training frame before modelling.
list_drop_train = ['id']

# In-place drop: running this cell a second time raises KeyError because 'id' is gone.
df_train.drop(columns=list_drop_train, inplace=True)

In [14]:
# Split the training frame by dtype: numeric columns vs. everything else.
df_numeric = df_train.select_dtypes(include="number")
df_cat = df_train.select_dtypes(exclude="number")

In [15]:
# Show which training columns were classified as numeric.
df_numeric.columns

Index(['is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'dur',
       'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'sjit', 'djit', 'sinpkt', 'dinpkt',
       'tcprtt', 'synack', 'ackdat'],
      dtype='object')

In [16]:
def handle_outliers(X):
    """Clip numeric columns to their IQR fences, then log1p the non-negative ones.

    The frame is modified IN PLACE and the same object is returned.

    For every numeric column the fences are ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``, with the quartiles computed from ``X`` itself. Values
    outside the fences are clipped to them. A column whose clipped values are
    all >= 0 is then replaced by ``log1p`` of itself; columns that still
    contain a negative value are left clipped only.

    Note: when a column's IQR is 0 both fences equal that quartile, so every
    value in the column is clipped to the same number.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to process; non-numeric columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object, after modification.
    """
    numeric_part = X.select_dtypes(include=[np.number])

    first_quartile = numeric_part.quantile(0.25)
    third_quartile = numeric_part.quantile(0.75)
    spread = third_quartile - first_quartile

    floor = first_quartile - 1.5 * spread
    ceiling = third_quartile + 1.5 * spread

    for name in numeric_part.columns:
        # Clip first: the non-negativity test below looks at the clipped values.
        X[name] = np.clip(X[name], floor[name], ceiling[name])

        # log1p(x) = log(1 + x), so zeros are fine; it is skipped when any value is negative.
        if (X[name] >= 0).all():
            X[name] = np.log1p(X[name])

    return X

In [17]:
def feature_engineering(X, features_to_drop=None, threshold=0.8):
    """Recode flag columns, add ``network_bytes`` and drop correlated columns.

    The frame is modified IN PLACE and the same object is returned.

    Steps:
      1. If present, 'is_ftp_login' and 'is_sm_ips_ports' have the values 0 and 1
         replaced by 'false' and 'true'; any other value is left as it is.
      2. 'network_bytes' is added as 'sbytes' + 'dbytes'.
      3. Columns are dropped. With ``features_to_drop=None`` the list is derived
         from ``X``: a numeric column is dropped when its correlation with any
         numeric column placed before it is greater than ``threshold`` (only
         positive correlation is tested). With an explicit ``features_to_drop``
         that list is used as given, which lets a list computed on one frame
         (e.g. training data) be applied unchanged to another (validation/test).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to transform; must contain 'sbytes' and 'dbytes'.
    features_to_drop : iterable of str, optional
        Pre-computed column names to drop. Names missing from ``X`` are ignored.
        When None (default) the list is computed from the correlations in ``X``.
    threshold : float, default 0.8
        Correlation above which a column counts as highly correlated. Only used
        when ``features_to_drop`` is None.

    Returns
    -------
    tuple
        ``(X, dropped)`` where ``X`` is the modified input frame and ``dropped``
        is the list of column names passed to the drop.

    Raises
    ------
    KeyError
        If 'sbytes' or 'dbytes' is missing from ``X``.
    """
    flag_mapping = {0: 'false', 1: 'true'}
    for flag_column in ('is_ftp_login', 'is_sm_ips_ports'):
        if flag_column in X.columns:
            X[flag_column] = X[flag_column].replace(flag_mapping)

    X['network_bytes'] = X['sbytes'] + X['dbytes']

    if features_to_drop is None:
        correlation_matrix = X.select_dtypes(include=[np.number]).corr()

        # Keep only the strict upper triangle so each pair is examined once and a
        # column is compared only against the columns that precede it.
        upper_mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
        upper_triangle = correlation_matrix.where(upper_mask)

        highly_correlated_features = [
            column
            for column in upper_triangle.columns
            if (upper_triangle[column] > threshold).any()
        ]
    else:
        # Copy so the caller's iterable is never shared with the returned list.
        highly_correlated_features = list(features_to_drop)

    X.drop(columns=highly_correlated_features, inplace=True, errors='ignore')

    return X, highly_correlated_features

In [18]:
# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
# df_train is rebound to the 80% part, so re-running this cell alone splits the
# already-reduced frame again.
# NOTE(review): no `stratify` argument is passed; the recorded reports show very small
# classes (e.g. 22 'Worms' rows in validation), so a stratified split may be worth testing.
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=0)

In [19]:
# Separate the predictors from the 'attack_cat' target in each partition.
X_train, y_train = df_train.drop(columns=["attack_cat"]), df_train["attack_cat"]

X_val, y_val = df_val.drop(columns=["attack_cat"]), df_val["attack_cat"]

In [20]:
# handle_outliers() and feature_engineering() both modify the frame they receive and
# return that same object, so X_train_transformed / X_val_transformed are the same
# objects as X_train / X_val.
# NOTE(review): only the training frame goes through handle_outliers(); the validation
# frame is feature-engineered without clipping/log1p, and its list of dropped columns is
# computed from the validation data itself -- confirm this asymmetry is intended.
X_train_transformed, dropped_features_train = feature_engineering(handle_outliers(X_train))
X_val_transformed, dropped_features_val = feature_engineering(X_val)

In [21]:
# Numeric branch: median imputation -> Yeo-Johnson power transform -> standard scaling.
# NOTE(review): PowerTransformer standardizes its output by default according to the
# scikit-learn docs, so the trailing StandardScaler may be redundant -- confirm.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('yeo_johnson', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become the literal 'unknown', then one-hot encoding
# that ignores categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])


In [22]:
# Column groups are taken from the engineered training frame: numeric dtypes go to the
# numeric branch, every other dtype goes to the categorical branch.
numeric_columns = X_train_transformed.select_dtypes(include=[np.number]).columns
categorical_columns = X_train_transformed.select_dtypes(exclude=[np.number]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
    ],
    remainder='passthrough',
)

In [23]:
# Full feature pipeline: column-wise preprocessing followed by PCA.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # A float n_components asks PCA to keep enough components for 95% of the variance.
    ('pca', PCA(n_components=0.95)),
])

In [24]:
# log_transform_and_clip(df_numeric, df_train)

In [25]:
# Shape check. X_train was modified in place by handle_outliers()/feature_engineering()
# earlier, so this is the engineered frame rather than the raw split.
X_train.shape

(140272, 22)

In [26]:
# Shape check for the (in-place engineered) validation frame.
# NOTE(review): the recorded outputs show 22 columns for X_train but 27 here, i.e. the
# two frames had different columns dropped by feature_engineering().
X_val.shape

(35069, 27)

In [27]:
# Fit the preprocessing + PCA pipeline on the engineered training frame only, then apply
# the fitted pipeline to the engineered validation frame.
# The *_transformed names are used explicitly (as the test cell does) so this cell does
# not depend on the helper functions having mutated X_train / X_val in place.
train_X_preprocessor = pipeline.fit_transform(X_train_transformed, y_train)
val_X_preprocessor = pipeline.transform(X_val_transformed)

In [28]:
# smote = SMOTE(sampling_strategy='auto', k_neighbors=7, random_state=0)
# train_X_preprocessor, val_X_preprocessor = smote.fit_resample(train_X_preprocessor, y_train)

In [29]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with k=16 and Euclidean distance on the PCA-reduced features.
knn = KNeighborsClassifier(n_neighbors=16, metric='euclidean')
knn.fit(train_X_preprocessor, y_train)

# Score on the held-out validation split.
knn_pred = knn.predict(val_X_preprocessor)
knn_accuracy = accuracy_score(y_val, knn_pred)
knn_report = classification_report(y_val, knn_pred)
print("Accuracy: ", knn_accuracy)
print("Classification Report: \n", knn_report)

Accuracy:  0.7826000171091277
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.31      0.05      0.08       372
      Backdoor       0.24      0.01      0.03       347
           DoS       0.32      0.29      0.30      2457
      Exploits       0.63      0.77      0.69      6681
       Fuzzers       0.60      0.68      0.64      3633
       Generic       1.00      0.98      0.99      8025
        Normal       0.92      0.87      0.89     11251
Reconnaissance       0.76      0.70      0.72      2073
     Shellcode       0.54      0.21      0.30       208
         Worms       1.00      0.05      0.09        22

      accuracy                           0.78     35069
     macro avg       0.63      0.46      0.47     35069
  weighted avg       0.78      0.78      0.78     35069



In [30]:
from sklearn.naive_bayes import GaussianNB

# scikit-learn Gaussian naive Bayes with default settings, as a reference model.
gnb = GaussianNB()
gnb.fit(train_X_preprocessor, y_train)

# Score on the held-out validation split.
gnb_pred = gnb.predict(val_X_preprocessor)
gnb_accuracy = accuracy_score(y_val, gnb_pred)
gnb_report = classification_report(y_val, gnb_pred)
print("Accuracy: ", gnb_accuracy)
print("Classification Report: \n", gnb_report)

Accuracy:  0.6801163420684936
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.07      0.03      0.04       372
      Backdoor       0.05      0.01      0.01       347
           DoS       0.33      0.66      0.44      2457
      Exploits       0.58      0.40      0.47      6681
       Fuzzers       0.48      0.56      0.52      3633
       Generic       1.00      0.94      0.97      8025
        Normal       0.80      0.79      0.80     11251
Reconnaissance       0.50      0.50      0.50      2073
     Shellcode       0.18      0.11      0.14       208
         Worms       0.04      0.41      0.07        22

      accuracy                           0.68     35069
     macro avg       0.40      0.44      0.40     35069
  weighted avg       0.70      0.68      0.68     35069



In [31]:
from main import GaussianNaiveBayesScratch

# Project-local Gaussian naive Bayes implementation, evaluated the same way as the
# scikit-learn GaussianNB so the two reports can be compared.
gnb_scratch = GaussianNaiveBayesScratch()
gnb_scratch.fit(train_X_preprocessor, y_train)

# Score on the held-out validation split.
gnb_scratch_pred = gnb_scratch.predict(val_X_preprocessor)
gnb_scratch_accuracy = accuracy_score(y_val, gnb_scratch_pred)
gnb_scratch_report = classification_report(y_val, gnb_scratch_pred)
print("Accuracy: ", gnb_scratch_accuracy)
print("Classification Report: \n", gnb_scratch_report)

Accuracy:  0.6801163420684936
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.07      0.03      0.04       372
      Backdoor       0.05      0.01      0.01       347
           DoS       0.33      0.66      0.44      2457
      Exploits       0.58      0.40      0.47      6681
       Fuzzers       0.48      0.56      0.52      3633
       Generic       1.00      0.94      0.97      8025
        Normal       0.80      0.79      0.80     11251
Reconnaissance       0.50      0.50      0.50      2073
     Shellcode       0.18      0.11      0.14       208
         Worms       0.04      0.41      0.07        22

      accuracy                           0.68     35069
     macro avg       0.40      0.44      0.40     35069
  weighted avg       0.70      0.68      0.68     35069



In [32]:
# Keep the ids aside for the submission file; the model input is everything except 'id'.
test_ids = df_test["id"]
X_test = df_test.drop(columns=["id"])

In [33]:
# Feature-engineer the test frame (modified in place; X_test_transformed is X_test).
# The drop list gets its own name so the validation result in dropped_features_val is
# no longer overwritten by the test-set result.
# NOTE(review): as with validation, handle_outliers() is not applied here and the dropped
# columns are derived from the test data itself -- confirm this is intended.
X_test_transformed, dropped_features_test = feature_engineering(X_test)

In [34]:
# Run the fitted pipeline and the KNN model on the engineered test frame.
test_X_preprocessed = pipeline.transform(X_test_transformed)
test_pred = knn.predict(test_X_preprocessed)

# One row per test id with its predicted attack category.
submission_columns = {"id": test_ids, "attack_cat": test_pred}
submission = pd.DataFrame(submission_columns)

submission.to_csv("submission.csv", index=False)
print("Submission file succesfully created [submission.csv]")

Submission file succesfully created [submission.csv]
