In [29]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [30]:
DATA_PATH = './data/'
TRAIN_PATH = 'train/'
TEST_PATH = 'test/'

In [31]:
NB15_FEATURES = 'UNSW-NB15_features-modified.csv'

In [32]:
metadata = pd.read_csv(DATA_PATH + TRAIN_PATH + NB15_FEATURES)

metadata

Unnamed: 0,No.,Name,Type,Description,feature_type
0,5,proto,nominal,Transaction protocol,flow
1,6,state,nominal,Indicates to the state and its dependent proto...,basic
2,7,dur,Float,Record total duration,basic
3,8,sbytes,Integer,Source to destination transaction bytes,basic
4,9,dbytes,Integer,Destination to source transaction bytes,basic
5,10,sttl,Integer,Source to destination time to live value,basic
6,11,dttl,Integer,Destination to source time to live value,basic
7,12,sloss,Integer,Source packets retransmitted or dropped,basic
8,13,dloss,Integer,Destination packets retransmitted or dropped,basic
9,14,service,nominal,"http, ftp, smtp, ssh, dns, ftp-data ,irc and ...",basic


In [33]:
ADDITIONAL_FEATURES_TRAIN = 'additional_features_train.csv'
BASIC_FEATURES_TRAIN = 'basic_features_train.csv'
CONTENT_FEATURES_TRAIN = 'content_features_train.csv'
FLOW_FEATURES_TRAIN = 'flow_features_train.csv'
LABELS_TRAIN = 'labels_train.csv'
TIME_FEATURES_TRAIN = 'time_features_train.csv'

In [34]:
dataset_paths_train = [
    ADDITIONAL_FEATURES_TRAIN,
    BASIC_FEATURES_TRAIN,
    CONTENT_FEATURES_TRAIN,
    FLOW_FEATURES_TRAIN,
    LABELS_TRAIN,
    TIME_FEATURES_TRAIN
]

In [35]:
all_data_train = [pd.read_csv(DATA_PATH + TRAIN_PATH + dp) for dp in dataset_paths_train]

df_train = all_data_train[0]

for df_i in all_data_train[1:]:
    df_train = pd.merge(df_train, df_i, on='id')

df_train.drop(["label"], axis=1, inplace=True)



In [36]:
ADDITIONAL_FEATURES_TEST = 'additional_features_test.csv'
BASIC_FEATURES_TEST = 'basic_features_test.csv'
CONTENT_FEATURES_TEST = 'content_features_test.csv'
FLOW_FEATURES_TEST = 'flow_features_test.csv'
TIME_FEATURES_TEST = 'time_features_test.csv'

In [37]:
dataset_paths_test = [
    ADDITIONAL_FEATURES_TEST,
    BASIC_FEATURES_TEST,
    CONTENT_FEATURES_TEST,
    FLOW_FEATURES_TEST,
    TIME_FEATURES_TEST
]

In [38]:
all_data_test = [pd.read_csv(DATA_PATH + TEST_PATH + dp) for dp in dataset_paths_test]

df_test = all_data_test[0]

for df_i in all_data_test[1:]:
    df_test = pd.merge(df_test, df_i, on='id')

df_test.shape

(20583, 42)

In [39]:
print(df_train.columns.tolist())

['is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'id', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'proto', 'attack_cat', 'sjit', 'djit', 'sinpkt', 'dinpkt', 'tcprtt', 'synack', 'ackdat']


In [40]:
print(df_test.columns.tolist())

['is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'id', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'proto', 'sjit', 'djit', 'sinpkt', 'dinpkt', 'tcprtt', 'synack', 'ackdat']


In [41]:
list_drop_train = ['id']

df_train.drop(list_drop_train,axis=1,inplace=True)

In [42]:
df_numeric = df_train.select_dtypes(include=[np.number])
df_cat = df_train.select_dtypes(exclude=[np.number])

In [43]:
from sklearn.preprocessing import PowerTransformer

def log_transform_and_clip(df_numeric, df_train):
    # Initialize the Yeo-Johnson transformer
    yeo_johnson_transformer = PowerTransformer(method='yeo-johnson')

    # Apply the transformation to columns with skewness > 1 (adjust the threshold as needed)
    for col in df_numeric.columns:
        if df_numeric[col].skew() > 1:  # You can adjust the threshold for skewness
            # Reshape the data to apply the transformer (1D to 2D)
            df_train[col] = yeo_johnson_transformer.fit_transform(df_train[[col]])

    Q1 = df_numeric.quantile(0.25)
    Q3 = df_numeric.quantile(0.75)
    IQR = Q3 - Q1

    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    for column in df_numeric.columns:
        df_train[column] = np.clip(df_train[column], lower_limit[column], upper_limit[column])

In [44]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])


In [45]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, df_numeric.columns),
        ('cat', categorical_transformer, ['state', 'service', 'proto'])
    ], remainder='passthrough')

In [46]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('var_filter', VarianceThreshold(threshold=0.01)),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95))
])


In [47]:
X = df_train.drop("attack_cat", axis = 1)
y = df_train["attack_cat"]

In [48]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

In [49]:
log_transform_and_clip(df_numeric, X_train)

In [50]:
X_val.head()

Unnamed: 0,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,...,trans_depth,response_body_len,proto,sjit,djit,sinpkt,dinpkt,tcprtt,synack,ackdat
87395,0.0,1.0,0.0,0.0,0.0,3.0,3.0,1.0,1.0,1.0,...,0.0,0.0,tcp,3805.728971,185.324109,74.5,122.623797,0.167309,0.090865,0.076444
171525,0.0,0.0,1.0,0.0,0.0,3.0,1.0,3.0,5.0,2.0,...,1.0,3924.0,tcp,9561.133208,7699.332724,93.228091,60.313884,0.000624,,0.000111
100997,0.0,1.0,0.0,0.0,0.0,8.0,2.0,2.0,6.0,1.0,...,0.0,0.0,tcp,,241.358578,89.398455,104.770781,,0.057568,0.023055
106304,0.0,2.0,0.0,0.0,,31.0,31.0,15.0,15.0,,...,0.0,0.0,udp,0.0,0.0,0.003,0.0,0.0,0.0,0.0
170606,0.0,0.0,1.0,0.0,0.0,3.0,2.0,8.0,2.0,1.0,...,2.0,,tcp,9527.262188,7673.271229,91.298997,59.064765,,0.000534,0.000136


In [51]:
train_X_preprocessor = pipeline.fit_transform(X_train, y_train)
val_X_preprocessor = pipeline.transform(X_val)

In [52]:
smote = SMOTE(sampling_strategy='auto', k_neighbors=7, random_state=0)
train_X_preprocessor, val_X_preprocessor = smote.fit_resample(train_X_preprocessor, y_train)

In [53]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=100, metric='euclidean')

knn.fit(train_X_preprocessor, y_train)
knn_pred = knn.predict(val_X_preprocessor)
print("Accuracy: ", accuracy_score(y_val, knn_pred))
print("Classification Report: \n", classification_report(y_val, knn_pred))

ValueError: Found input variables with inconsistent numbers of samples: [447490, 140272]

In [None]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_X_preprocessor, y_train)
gnb_pred = gnb.predict(val_X_preprocessor)
print("Accuracy: ", accuracy_score(y_val, gnb_pred))
print("Classification Report: \n", classification_report(y_val, gnb_pred))

Accuracy:  0.3083920271464827
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       372
      Backdoor       0.00      0.00      0.00       347
           DoS       1.00      0.00      0.00      2457
      Exploits       0.20      0.00      0.00      6681
       Fuzzers       0.01      0.00      0.00      3633
       Generic       0.00      0.00      0.00      8025
        Normal       0.32      0.96      0.48     11251
Reconnaissance       0.00      0.00      0.00      2073
     Shellcode       0.00      0.00      0.00       208
         Worms       0.00      0.00      0.00        22

      accuracy                           0.31     35069
     macro avg       0.15      0.10      0.05     35069
  weighted avg       0.21      0.31      0.15     35069



In [None]:
from main import GaussianNaiveBayesScratch

gnb_scratch = GaussianNaiveBayesScratch()
gnb_scratch.fit(train_X_preprocessor, y_train)
gnb_scratch_pred = gnb_scratch.predict(val_X_preprocessor)
print("Accuracy: ", accuracy_score(y_val, gnb_scratch_pred))
print("Classification Report: \n", classification_report(y_val, gnb_scratch_pred))

Accuracy:  0.011691237275086258
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.01      0.99      0.02       372
      Backdoor       0.00      0.00      0.00       347
           DoS       1.00      0.00      0.00      2457
      Exploits       0.20      0.00      0.00      6681
       Fuzzers       0.00      0.00      0.00      3633
       Generic       0.00      0.00      0.00      8025
        Normal       0.55      0.00      0.01     11251
Reconnaissance       0.00      0.00      0.00      2073
     Shellcode       0.00      0.00      0.00       208
         Worms       0.00      0.00      0.00        22

      accuracy                           0.01     35069
     macro avg       0.18      0.10      0.00     35069
  weighted avg       0.28      0.01      0.00     35069



In [27]:
test_ids = df_test["id"]
X_test = df_test.drop(columns=["id"])

In [28]:

# Preprocess the test data
test_X_preprocessed = pipeline.transform(X_test)
test_pred = gnb_scratch.predict(test_X_preprocessed)

submission = pd.DataFrame({
    "id": test_ids,
    "attack_cat": test_pred
})


submission.to_csv("submission.csv", index=False)
print("Submission file succesfully created [submission.csv]")

Submission file succesfully created [submission.csv]
