In [7]:
%%time

from re import X
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder


# Load the training and test datasets
train_data = pd.read_csv('UNSW_NB15_training-set-subset-subset.csv')
test_data = pd.read_csv('UNSW_NB15_testing-set.csv')

# Exclude specified attack categories
exclude_categories = ['Analysis', 'Backdoor', 'Shellcode', 'Worms']
train_data = train_data[~train_data['attack_cat'].isin(exclude_categories)]
test_data_filtered = test_data[~test_data['attack_cat'].isin(exclude_categories)]

# Separate features and labels for both datasets
X_train = train_data.drop(['attack_cat', 'label'], axis=1)  # Drop both target columns
y_train_attack_cat = train_data['attack_cat']  # Target for attack category
y_train_label = train_data['label']  # Target for attack label

X_test = test_data_filtered.drop(['attack_cat', 'label'], axis=1)
y_test_attack_cat = test_data_filtered['attack_cat']
y_test_label = test_data_filtered['label']

X_test_all = test_data.drop(['attack_cat','label'], axis=1)
y_test_attack_cat_all = test_data['attack_cat']
y_test_label_all = test_data['label']

print('Training data shape: ', X_train.shape)
print('Training attack category shape: ', y_train_attack_cat.shape)
print('Training labels shape: ', y_train_label.shape)
print('Test data shape: ', X_test.shape)
print('Test attack filtered category shape: ', y_test_attack_cat.shape)
print('Test labels filtered shape: ', y_test_label.shape)
print('Test data all shape: ', X_test_all.shape)
print('Test attack all category shape: ', y_test_attack_cat_all.shape)
print('Test labels all shape: ', y_test_label_all.shape)

label_encoder = LabelEncoder()

# Iterate through columns and encode string values
for col in X_train.columns:
    if X_train[col].dtype == 'object':  # Check if column is of object type (string)
        # Fit the encoder on training data and transform both train and test data

        # Get unique values from both train and test data for this column
        unique_values = pd.concat([X_train[col], X_test[col]]).unique()

        # Fit the encoder on the combined unique values
        label_encoder.fit(unique_values)

        X_train[col] = label_encoder.transform(X_train[col])
        X_test[col] = label_encoder.transform(X_test[col])

for col in X_test_all.columns:
    if X_test_all[col].dtype == 'object':  # Check if column is of object type (string)
        # Fit the encoder
        label_encoder.fit(X_test_all[col])

        X_test_all[col] = label_encoder.transform(X_test_all[col])


# Convert to numpy arrays
X_train = X_train.values
X_test = X_test.values
X_test_all = X_test_all.values

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_all_scaled = scaler.transform(X_test_all)

# KNN
# Attack Category
knn_attack_cat = KNeighborsClassifier(n_neighbors=5)
knn_attack_cat.fit(X_train_scaled, y_train_attack_cat)

knn_attack_cat_all = KNeighborsClassifier(n_neighbors=5)
knn_attack_cat_all.fit(X_train_scaled, y_train_attack_cat)

# Attack Label prediction
knn_label = KNeighborsClassifier(n_neighbors=5)
knn_label.fit(X_train_scaled, y_train_label)

knn_label_all = KNeighborsClassifier(n_neighbors=5)
knn_label_all.fit(X_train_scaled, y_train_label)

# Make predictions
y_pred_attack_cat = knn_attack_cat.predict(X_test_scaled)
y_pred_label = knn_label.predict(X_test_scaled)

y_pred_label_all = knn_label_all.predict(X_test_all_scaled)

# Evaluate the models separately
print("Classification Report for Attack Category:")
print(classification_report(y_test_attack_cat, y_pred_attack_cat))
print("Confusion Matrix for Attack Category:")
print(confusion_matrix(y_test_attack_cat, y_pred_attack_cat))

print("Classification Report for Attack Label:")
print(classification_report(y_test_label, y_pred_label))
print("Confusion Matrix for Attack Label:")
print(confusion_matrix(y_test_label, y_pred_label))

# For no filter
print("Classification Report for Attack Label All:")
print(classification_report(y_test_label_all, y_pred_label_all))
print("Confusion Matrix for Attack Label All:")
print(confusion_matrix(y_test_label_all, y_pred_label_all))


Training data shape:  (19999, 43)
Training attack category shape:  (19999,)
Training labels shape:  (19999,)
Test data shape:  (80650, 43)
Test attack filtered category shape:  (80650,)
Test labels filtered shape:  (80650,)
Test data all shape:  (82332, 43)
Test attack all category shape:  (82332,)
Test labels all shape:  (82332,)
Classification Report for Attack Category:
                precision    recall  f1-score   support

           DoS       0.19      0.47      0.27      4089
      Exploits       0.62      0.48      0.54     11132
       Fuzzers       0.17      0.66      0.27      6062
       Generic       0.99      0.52      0.68     18871
        Normal       0.88      0.48      0.62     37000
Reconnaissance       0.24      0.52      0.33      3496

      accuracy                           0.51     80650
     macro avg       0.51      0.52      0.45     80650
  weighted avg       0.75      0.51      0.57     80650

Confusion Matrix for Attack Category:
[[ 1933   634   813    

In [9]:
%%time
!pip install mealpy

import numpy as np
from mealpy import FloatVar, WOA
from sklearn.metrics import accuracy_score, f1_score

from re import X
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder


# Load the training and test datasets
train_data = pd.read_csv('UNSW_NB15_training-set-subset-subset.csv')
test_data = pd.read_csv('UNSW_NB15_testing-set.csv')

# Exclude specified attack categories
exclude_categories = ['Analysis', 'Backdoor', 'Shellcode', 'Worms']
train_data = train_data[~train_data['attack_cat'].isin(exclude_categories)]
test_data_filtered = test_data[~test_data['attack_cat'].isin(exclude_categories)]

# Separate features and labels for both datasets
X_train = train_data.drop(['attack_cat', 'label'], axis=1)  # Drop both target columns
y_train_attack_cat = train_data['attack_cat']  # Target for attack category
y_train_label = train_data['label']  # Target for attack label

X_test = test_data_filtered.drop(['attack_cat', 'label'], axis=1)
y_test_attack_cat = test_data_filtered['attack_cat']
y_test_label = test_data_filtered['label']

X_test_all = test_data.drop(['attack_cat','label'], axis=1)
y_test_attack_cat_all = test_data['attack_cat']
y_test_label_all = test_data['label']

print('Training data shape: ', X_train.shape)
print('Training attack category shape: ', y_train_attack_cat.shape)
print('Training labels shape: ', y_train_label.shape)
print('Test data shape: ', X_test.shape)
print('Test attack filtered category shape: ', y_test_attack_cat.shape)
print('Test labels filtered shape: ', y_test_label.shape)
print('Test data all shape: ', X_test_all.shape)
print('Test attack all category shape: ', y_test_attack_cat_all.shape)
print('Test labels all shape: ', y_test_label_all.shape)

label_encoder = LabelEncoder()

# Iterate through columns and encode string values
for col in X_train.columns:
    if X_train[col].dtype == 'object':  # Check if column is of object type (string)
        # Fit the encoder on training data and transform both train and test data

        # Get unique values from both train and test data for this column
        unique_values = pd.concat([X_train[col], X_test[col]]).unique()

        # Fit the encoder on the combined unique values
        label_encoder.fit(unique_values)

        X_train[col] = label_encoder.transform(X_train[col])
        X_test[col] = label_encoder.transform(X_test[col])

for col in X_test_all.columns:
    if X_test_all[col].dtype == 'object':  # Check if column is of object type (string)
        # Fit the encoder
        label_encoder.fit(X_test_all[col])

        X_test_all[col] = label_encoder.transform(X_test_all[col])


# Convert to numpy arrays
X_train = X_train.values
X_test = X_test.values
X_test_all = X_test_all.values

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_all_scaled = scaler.transform(X_test_all)

def objective_function_knn_multi(solution):
    # Extract k value from solution (assuming solution[0] represents k)
    k = int(round(solution[0]))

    # Limit k from 1 to 30
    k = max(1, min(k, 30))

    # Create and train KNN model
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train_scaled, y_train_label)

    # Predict on test set and calculate accuracy and F1-score
    y_pred = knn_model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test_label, y_pred)
    f1 = f1_score(y_test_label, y_pred)

    # Since WOA minimizes, return 1 - accuracy and 1 - F1-score to maximize both
    return 1 - accuracy, 1 - f1

# Define the optimization problem
problem_dict = {
    "obj_func": objective_function_knn_multi,  # Objective function
    "bounds": FloatVar(lb=[1], ub=[30], name="k"),  # Bounds for k
    "minmax": "min",  # Minimize (since we are inverting accuracy and F1-score)
    "obj_weights": [0.5, 0.5]  # Equal weights for accuracy and F1-score
}

# Create and run the WOA optimizer
model = WOA.OriginalWOA(epoch=3, pop_size=30)
ideal_k = model.solve(problem_dict)

print(ideal_k)

Training data shape:  (19999, 43)
Training attack category shape:  (19999,)
Training labels shape:  (19999,)
Test data shape:  (80650, 43)
Test attack filtered category shape:  (80650,)
Test labels filtered shape:  (80650,)
Test data all shape:  (82332, 43)
Test attack all category shape:  (82332,)
Test labels all shape:  (82332,)


INFO:mealpy.swarm_based.WOA.OriginalWOA:Solving 2-objective optimization problem with weights: [0.5 0.5].
INFO:mealpy.swarm_based.WOA.OriginalWOA:>>>Problem: P, Epoch: 1, Current best: 0.21333143380643738, Global best: 0.21333143380643738, Runtime: 572.80918 seconds
INFO:mealpy.swarm_based.WOA.OriginalWOA:>>>Problem: P, Epoch: 2, Current best: 0.21333143380643738, Global best: 0.21333143380643738, Runtime: 579.11450 seconds
INFO:mealpy.swarm_based.WOA.OriginalWOA:>>>Problem: P, Epoch: 3, Current best: 0.21333143380643738, Global best: 0.21333143380643738, Runtime: 577.99299 seconds


id: 370, target: Objectives: [0.24239306 0.18426981], Fitness: 0.21333143380643738, solution: [22.52362177]
CPU times: user 38min 20s, sys: 25.8 s, total: 38min 46s
Wall time: 39min 4s


In [12]:
%%time

from re import X
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder


# Load the training and test datasets
train_data = pd.read_csv('UNSW_NB15_training-set-subset-subset.csv')
test_data = pd.read_csv('UNSW_NB15_testing-set.csv')

# Exclude specified attack categories
exclude_categories = ['Analysis', 'Backdoor', 'Shellcode', 'Worms']
train_data = train_data[~train_data['attack_cat'].isin(exclude_categories)]
test_data_filtered = test_data[~test_data['attack_cat'].isin(exclude_categories)]

# Separate features and labels for both datasets
X_train = train_data.drop(['attack_cat', 'label'], axis=1)  # Drop both target columns
y_train_attack_cat = train_data['attack_cat']  # Target for attack category
y_train_label = train_data['label']  # Target for attack label

X_test = test_data_filtered.drop(['attack_cat', 'label'], axis=1)
y_test_attack_cat = test_data_filtered['attack_cat']
y_test_label = test_data_filtered['label']

X_test_all = test_data.drop(['attack_cat','label'], axis=1)
y_test_attack_cat_all = test_data['attack_cat']
y_test_label_all = test_data['label']

print('Training data shape: ', X_train.shape)
print('Training attack category shape: ', y_train_attack_cat.shape)
print('Training labels shape: ', y_train_label.shape)
print('Test data shape: ', X_test.shape)
print('Test attack filtered category shape: ', y_test_attack_cat.shape)
print('Test labels filtered shape: ', y_test_label.shape)
print('Test data all shape: ', X_test_all.shape)
print('Test attack all category shape: ', y_test_attack_cat_all.shape)
print('Test labels all shape: ', y_test_label_all.shape)

label_encoder = LabelEncoder()

# Iterate through columns and encode string values
for col in X_train.columns:
    if X_train[col].dtype == 'object':  # Check if column is of object type (string)
        # Fit the encoder on training data and transform both train and test data

        # Get unique values from both train and test data for this column
        unique_values = pd.concat([X_train[col], X_test[col]]).unique()

        # Fit the encoder on the combined unique values
        label_encoder.fit(unique_values)

        X_train[col] = label_encoder.transform(X_train[col])
        X_test[col] = label_encoder.transform(X_test[col])

for col in X_test_all.columns:
    if X_test_all[col].dtype == 'object':  # Check if column is of object type (string)
        # Fit the encoder
        label_encoder.fit(X_test_all[col])

        X_test_all[col] = label_encoder.transform(X_test_all[col])


# Convert to numpy arrays
X_train = X_train.values
X_test = X_test.values
X_test_all = X_test_all.values

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_all_scaled = scaler.transform(X_test_all)

# KNN
# Attack Category
knn_attack_cat = KNeighborsClassifier(n_neighbors=22)
knn_attack_cat.fit(X_train_scaled, y_train_attack_cat)

knn_attack_cat_all = KNeighborsClassifier(n_neighbors=22)
knn_attack_cat_all.fit(X_train_scaled, y_train_attack_cat)

# Attack Label prediction
knn_label = KNeighborsClassifier(n_neighbors=22)
knn_label.fit(X_train_scaled, y_train_label)

knn_label_all = KNeighborsClassifier(n_neighbors=22)
knn_label_all.fit(X_train_scaled, y_train_label)

# Make predictions
y_pred_attack_cat = knn_attack_cat.predict(X_test_scaled)
y_pred_label = knn_label.predict(X_test_scaled)

y_pred_label_all = knn_label_all.predict(X_test_all_scaled)

# Evaluate the models separately
print("Classification Report for Attack Category:")
print(classification_report(y_test_attack_cat, y_pred_attack_cat))
print("Confusion Matrix for Attack Category:")
print(confusion_matrix(y_test_attack_cat, y_pred_attack_cat))

print("Classification Report for Attack Label:")
print(classification_report(y_test_label, y_pred_label))
print("Confusion Matrix for Attack Label:")
print(confusion_matrix(y_test_label, y_pred_label))

# For no filter
print("Classification Report for Attack Label All:")
print(classification_report(y_test_label_all, y_pred_label_all))
print("Confusion Matrix for Attack Label All:")
print(confusion_matrix(y_test_label_all, y_pred_label_all))


Training data shape:  (19999, 43)
Training attack category shape:  (19999,)
Training labels shape:  (19999,)
Test data shape:  (80650, 43)
Test attack filtered category shape:  (80650,)
Test labels filtered shape:  (80650,)
Test data all shape:  (82332, 43)
Test attack all category shape:  (82332,)
Test labels all shape:  (82332,)
Classification Report for Attack Category:
                precision    recall  f1-score   support

           DoS       0.23      0.56      0.32      4089
      Exploits       0.61      0.50      0.55     11132
       Fuzzers       0.17      0.72      0.28      6062
       Generic       0.97      0.56      0.71     18871
        Normal       0.97      0.48      0.64     37000
Reconnaissance       0.30      0.59      0.40      3496

      accuracy                           0.53     80650
     macro avg       0.54      0.57      0.48     80650
  weighted avg       0.80      0.53      0.59     80650

Confusion Matrix for Attack Category:
[[ 2275   655   894    