In [153]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
warnings.filterwarnings('ignore')

In [154]:
DATA_PATH = './data/'
TRAIN_PATH = 'train/'
TEST_PATH = 'test/'

In [155]:
NB15_FEATURES = 'UNSW-NB15_features-modified.csv'

In [156]:
metadata = pd.read_csv(DATA_PATH + TRAIN_PATH + NB15_FEATURES)

metadata

Unnamed: 0,No.,Name,Type,Description,feature_type
0,5,proto,nominal,Transaction protocol,flow
1,6,state,nominal,Indicates to the state and its dependent proto...,basic
2,7,dur,Float,Record total duration,basic
3,8,sbytes,Integer,Source to destination transaction bytes,basic
4,9,dbytes,Integer,Destination to source transaction bytes,basic
5,10,sttl,Integer,Source to destination time to live value,basic
6,11,dttl,Integer,Destination to source time to live value,basic
7,12,sloss,Integer,Source packets retransmitted or dropped,basic
8,13,dloss,Integer,Destination packets retransmitted or dropped,basic
9,14,service,nominal,"http, ftp, smtp, ssh, dns, ftp-data ,irc and ...",basic


In [157]:
ADDITIONAL_FEATURES_TRAIN = 'additional_features_train.csv'
BASIC_FEATURES_TRAIN = 'basic_features_train.csv'
CONTENT_FEATURES_TRAIN = 'content_features_train.csv'
FLOW_FEATURES_TRAIN = 'flow_features_train.csv'
LABELS_TRAIN = 'labels_train.csv'
TIME_FEATURES_TRAIN = 'time_features_train.csv'

In [158]:
dataset_paths_train = [
    ADDITIONAL_FEATURES_TRAIN,
    BASIC_FEATURES_TRAIN,
    CONTENT_FEATURES_TRAIN,
    FLOW_FEATURES_TRAIN,
    LABELS_TRAIN,
    TIME_FEATURES_TRAIN
]

In [159]:
all_data_train = [pd.read_csv(DATA_PATH + TRAIN_PATH + dp) for dp in dataset_paths_train]

df_train = all_data_train[0]

for df_i in all_data_train[1:]:
    df_train = pd.merge(df_train, df_i, on='id')

df_train.drop(["label"], axis=1, inplace=True)



In [160]:
ADDITIONAL_FEATURES_TEST = 'additional_features_test.csv'
BASIC_FEATURES_TEST = 'basic_features_test.csv'
CONTENT_FEATURES_TEST = 'content_features_test.csv'
FLOW_FEATURES_TEST = 'flow_features_test.csv'
TIME_FEATURES_TEST = 'time_features_test.csv'

In [161]:
dataset_paths_test = [
    ADDITIONAL_FEATURES_TEST,
    BASIC_FEATURES_TEST,
    CONTENT_FEATURES_TEST,
    FLOW_FEATURES_TEST,
    TIME_FEATURES_TEST
]

In [162]:
all_data_test = [pd.read_csv(DATA_PATH + TEST_PATH + dp) for dp in dataset_paths_test]

df_test = all_data_test[0]

for df_i in all_data_test[1:]:
    df_test = pd.merge(df_test, df_i, on='id')

df_test.shape

(20583, 42)

In [163]:
print(df_train.columns.tolist())

['is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'id', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'proto', 'attack_cat', 'sjit', 'djit', 'sinpkt', 'dinpkt', 'tcprtt', 'synack', 'ackdat']


In [164]:
print(df_test.columns.tolist())

['is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'id', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'proto', 'sjit', 'djit', 'sinpkt', 'dinpkt', 'tcprtt', 'synack', 'ackdat']


In [165]:
list_drop_train = ['id']

df_train.drop(list_drop_train,axis=1,inplace=True)

In [166]:
df_numeric = df_train.select_dtypes(include=[np.number])
df_cat = df_train.select_dtypes(exclude=[np.number])

In [167]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Create new features
        X['network_bytes'] = X['sbytes'] + X['dbytes']
        
        # Handle outliers using IQR
        df_numeric = X.select_dtypes(include=[np.number])
        Q1 = df_numeric.quantile(0.25)
        Q3 = df_numeric.quantile(0.75)
        IQR = Q3 - Q1
        
        # Determine lower and upper limits
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        
        # Clip the values in the original DataFrame
        for column in df_numeric.columns:
            X[column] = np.clip(X[column], lower_limit[column], upper_limit[column])
        
        # Drop the original columns used to create the new feature
        X.drop(['sbytes', 'dbytes'], axis=1, inplace=True)  # Adjust the column names as necessary

        # Step 1: Compute the correlation matrix
        correlation_matrix = X.corr()

        # Step 2: Create an upper triangle matrix (since correlation is symmetric, we need to check only one side)
        upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

        # Step 3: Get columns with correlations higher than a threshold (e.g., 0.8)
        highly_correlated_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.8)]

        # Drop the highly correlated features
        X.drop(columns=highly_correlated_features, inplace=True)

        return X


In [168]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])


In [169]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, df_numeric.columns),
        ('cat', categorical_transformer, ['state', 'service', 'proto'])
    ], remainder='passthrough')

In [170]:
pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineering()),
    ('preprocessor', preprocessor),
    ('var_filter', VarianceThreshold(threshold=0.01)),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95))
])


In [171]:
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=0)

In [172]:
# log_transform_and_clip(df_numeric, df_train)

In [173]:
X_train = df_train.drop("attack_cat", axis = 1)
y_train = df_train["attack_cat"]

X_val = df_val.drop("attack_cat", axis = 1)
y_val = df_val["attack_cat"]

In [175]:
train_X_preprocessor = pipeline.fit_transform(X_train, y_train)
val_X_preprocessor = pipeline.transform(X_val)

KeyError: 'sbytes'

In [132]:
# smote = SMOTE(sampling_strategy='auto', k_neighbors=7, random_state=0)
# train_X_preprocessor, val_X_preprocessor = smote.fit_resample(train_X_preprocessor, y_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=100, metric='euclidean')

knn.fit(train_X_preprocessor, y_train)
knn_pred = knn.predict(val_X_preprocessor)
print("Accuracy: ", accuracy_score(y_val, knn_pred))
print("Classification Report: \n", classification_report(y_val, knn_pred))

Accuracy:  0.7502637657190111
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       372
      Backdoor       0.00      0.00      0.00       347
           DoS       0.36      0.13      0.19      2457
      Exploits       0.57      0.83      0.68      6681
       Fuzzers       0.49      0.68      0.57      3633
       Generic       0.99      0.97      0.98      8025
        Normal       0.94      0.83      0.88     11251
Reconnaissance       0.52      0.40      0.45      2073
     Shellcode       0.00      0.00      0.00       208
         Worms       0.00      0.00      0.00        22

      accuracy                           0.75     35069
     macro avg       0.39      0.38      0.38     35069
  weighted avg       0.74      0.75      0.74     35069



In [134]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_X_preprocessor, y_train)
gnb_pred = gnb.predict(val_X_preprocessor)
print("Accuracy: ", accuracy_score(y_val, gnb_pred))
print("Classification Report: \n", classification_report(y_val, gnb_pred))

Accuracy:  0.6664575551056489
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       372
      Backdoor       0.00      0.00      0.00       347
           DoS       0.33      0.73      0.45      2457
      Exploits       0.60      0.48      0.54      6681
       Fuzzers       0.41      0.53      0.47      3633
       Generic       0.96      0.97      0.97      8025
        Normal       0.89      0.75      0.82     11251
Reconnaissance       0.13      0.03      0.05      2073
     Shellcode       0.05      0.36      0.10       208
         Worms       0.00      0.00      0.00        22

      accuracy                           0.67     35069
     macro avg       0.34      0.39      0.34     35069
  weighted avg       0.69      0.67      0.67     35069



In [135]:
from main import GaussianNaiveBayesScratch

gnb_scratch = GaussianNaiveBayesScratch()
gnb_scratch.fit(train_X_preprocessor, y_train)
gnb_scratch_pred = gnb_scratch.predict(val_X_preprocessor)
print("Accuracy: ", accuracy_score(y_val, gnb_scratch_pred))
print("Classification Report: \n", classification_report(y_val, gnb_scratch_pred))

Accuracy:  0.6664575551056489
Classification Report: 
                 precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       372
      Backdoor       0.00      0.00      0.00       347
           DoS       0.33      0.73      0.45      2457
      Exploits       0.60      0.48      0.54      6681
       Fuzzers       0.41      0.53      0.47      3633
       Generic       0.96      0.97      0.97      8025
        Normal       0.89      0.75      0.82     11251
Reconnaissance       0.13      0.03      0.05      2073
     Shellcode       0.05      0.36      0.10       208
         Worms       0.00      0.00      0.00        22

      accuracy                           0.67     35069
     macro avg       0.34      0.39      0.34     35069
  weighted avg       0.69      0.67      0.67     35069



In [136]:
test_ids = df_test["id"]
X_test = df_test.drop(columns=["id"])

In [None]:

# Preprocess the test data
test_X_preprocessed = pipeline.transform(X_test)
test_pred = gnb_scratch.predict(test_X_preprocessed)

submission = pd.DataFrame({
    "id": test_ids,
    "attack_cat": test_pred
})


submission.to_csv("submission.csv", index=False)
print("Submission file succesfully created [submission.csv]")

Submission file succesfully created [submission.csv]
