In [24]:
%%time

from re import X
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder


# Read the training subset and the full testing set from their CSV files.
train_data = pd.read_csv('UNSW_NB15_training-set-subset.csv')
test_data = pd.read_csv('UNSW_NB15_testing-set.csv')

# Rows whose attack category is listed here are removed from the training
# data and from the *filtered* view of the test data; `test_data` itself is
# left untouched so an unfiltered evaluation remains possible.
exclude_categories = ['Analysis', 'Backdoor', 'Shellcode', 'Worms']
keep_train_rows = ~train_data['attack_cat'].isin(exclude_categories)
keep_test_rows = ~test_data['attack_cat'].isin(exclude_categories)
train_data = train_data[keep_train_rows]
test_data_filtered = test_data[keep_test_rows]


def _split_features_targets(frame):
    """Return (features, attack_cat target, label target) taken from *frame*.

    The feature frame is every column except the two target columns.
    NOTE(review): any identifier-like column present in the CSV stays in the
    feature frame -- confirm that is intended.
    """
    features = frame.drop(columns=['attack_cat', 'label'])
    return features, frame['attack_cat'], frame['label']


# Training split, filtered test split, and unfiltered ("all") test split.
X_train, y_train_attack_cat, y_train_label = _split_features_targets(train_data)
X_test, y_test_attack_cat, y_test_label = _split_features_targets(test_data_filtered)
X_test_all, y_test_attack_cat_all, y_test_label_all = _split_features_targets(test_data)

# Report the shape of every split so row/column counts can be sanity-checked.
shape_report = [
    ('Training data shape: ', X_train),
    ('Training attack category shape: ', y_train_attack_cat),
    ('Training labels shape: ', y_train_label),
    ('Test data shape: ', X_test),
    ('Test attack filtered category shape: ', y_test_attack_cat),
    ('Test labels filtered shape: ', y_test_label),
    ('Test data all shape: ', X_test_all),
    ('Test attack all category shape: ', y_test_attack_cat_all),
    ('Test labels all shape: ', y_test_label_all),
]
for caption, split in shape_report:
    print(caption, split.shape)

# Encode string-valued feature columns as integer codes.
label_encoder = LabelEncoder()

for col in X_train.columns:
    if X_train[col].dtype == 'object':  # Column holds strings
        # Build ONE vocabulary per column from every matrix that will be fed
        # to the models. LabelEncoder numbers classes by their sorted unique
        # values, so fitting X_test_all on its own values (as was done
        # before) could assign a different integer to the same string than
        # the one used in X_train, silently corrupting the categorical
        # features seen by a model trained on X_train.
        unique_values = pd.concat(
            [X_train[col], X_test[col], X_test_all[col]]
        ).unique()

        # Refit the shared encoder on this column's combined vocabulary.
        label_encoder.fit(unique_values)

        # Apply the same string -> integer mapping to all three matrices.
        X_train[col] = label_encoder.transform(X_train[col])
        X_test[col] = label_encoder.transform(X_test[col])
        X_test_all[col] = label_encoder.transform(X_test_all[col])


# Drop the pandas wrappers: from here on the feature matrices are plain arrays.
X_train, X_test, X_test_all = (
    X_train.to_numpy(),
    X_test.to_numpy(),
    X_test_all.to_numpy(),
)

# Standardise features. The scaler's statistics are learned from the training
# matrix only and then applied unchanged to both test matrices.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_all_scaled = scaler.transform(X_test_all)

# k-nearest-neighbours models (k = 5), all trained on the scaled training set.
def _fit_knn(features, targets):
    """Return a 5-neighbour KNN classifier fitted on (features, targets)."""
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(features, targets)
    return model


# Multi-class target: attack category.
# NOTE(review): the `_all` models are fitted on exactly the same data and
# hyperparameters as their counterparts, and `knn_attack_cat_all` is not used
# for prediction in this cell -- confirm whether the duplicates are needed.
knn_attack_cat = _fit_knn(X_train_scaled, y_train_attack_cat)
knn_attack_cat_all = _fit_knn(X_train_scaled, y_train_attack_cat)

# Binary target: attack label.
knn_label = _fit_knn(X_train_scaled, y_train_label)
knn_label_all = _fit_knn(X_train_scaled, y_train_label)

# Predictions on the filtered test set (both targets) ...
y_pred_attack_cat = knn_attack_cat.predict(X_test_scaled)
y_pred_label = knn_label.predict(X_test_scaled)

# ... and on the unfiltered test set (label only).
y_pred_label_all = knn_label_all.predict(X_test_all_scaled)

# Print a classification report and a confusion matrix for each evaluation:
# attack category and attack label on the filtered test set, then attack
# label on the unfiltered test set ("All").
evaluations = [
    (
        "Classification Report for Attack Category:",
        "Confusion Matrix for Attack Category:",
        y_test_attack_cat,
        y_pred_attack_cat,
    ),
    (
        "Classification Report for Attack Label:",
        "Confusion Matrix for Attack Label:",
        y_test_label,
        y_pred_label,
    ),
    (
        "Classification Report for Attack Label All:",
        "Confusion Matrix for Attack Label All:",
        y_test_label_all,
        y_pred_label_all,
    ),
]

for report_header, matrix_header, y_true, y_pred in evaluations:
    print(report_header)
    print(classification_report(y_true, y_pred))
    print(matrix_header)
    print(confusion_matrix(y_true, y_pred))


Training data shape:  (106950, 43)
Training attack category shape:  (106950,)
Training labels shape:  (106950,)
Test data shape:  (80650, 43)
Test attack filtered category shape:  (80650,)
Test labels filtered shape:  (80650,)
Test data all shape:  (82332, 43)
Test attack all category shape:  (82332,)
Test labels all shape:  (82332,)
Classification Report for Attack Category:
                precision    recall  f1-score   support

           DoS       0.26      0.46      0.34      4089
      Exploits       0.64      0.42      0.51     11132
       Fuzzers       0.13      0.36      0.19      6062
       Generic       1.00      0.50      0.66     18871
        Normal       0.74      0.74      0.74     37000
Reconnaissance       0.44      0.38      0.41      3496

      accuracy                           0.58     80650
     macro avg       0.54      0.48      0.48     80650
  weighted avg       0.71      0.58      0.62     80650

Confusion Matrix for Attack Category:
[[ 1897   484   536 