In [36]:
import os
import sys
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split

import matplotlib.pyplot as plt
from tqdm import tqdm

root_dir = os.getcwd().split("AdversarialNIDS")[0] + "AdversarialNIDS"
sys.path.append(root_dir)

from scripts.logger import LoggerManager
from UNSWNB15.preprocessing.dataset import UNSWNB15
from scripts.models import random_forest
from scripts.models import knn

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [14]:
lm = LoggerManager(log_dir=f"{root_dir}/logs", log_name="test_random_forest")
lm.logger.info("Logger initialized")

dataset = UNSWNB15(size="small",logger=lm.logger).encode(attack_encoder="label").scale(scaler="minmax").optimize_memory()


2025-11-18 09:37:32,967 - INFO - Logger initialized
2025-11-18 09:37:32,969 - INFO - Downloading dataset: mrwellsdavid/unsw-nb15
2025-11-18 09:37:42,227 - INFO - Loaded UNSW-NB15_1.csv with shape: (700000, 46)
2025-11-18 09:37:42,260 - INFO - DataFrame shape: (700000, 46)
2025-11-18 09:37:42,262 - INFO - Initial dimensions: 700,000 rows x 46 columns = 32,200,000 cells
2025-11-18 09:37:47,768 - INFO - Preprocessing completed successfully
2025-11-18 09:37:47,769 - INFO - Final dimensions: 640,658 rows x 46 columns
2025-11-18 09:37:47,770 - INFO - Total rows removed: 59,342 (8.48%)
2025-11-18 09:37:47,771 - INFO - data retention rate: 91.52%
2025-11-18 09:37:48,013 - INFO - Encoding attack labels...
2025-11-18 09:37:49,592 - INFO - Attack labels encoded using LabelEncoder() encoder.
2025-11-18 09:37:49,658 - INFO - Scaling dataset features...
2025-11-18 09:37:50,402 - INFO - Features scaled using MinMaxScaler() scaler.
2025-11-18 09:37:50,438 - INFO - Optimizing memory usage of the datase

In [15]:
X = torch.FloatTensor(dataset.scaled_features).to(device)
y1 = torch.FloatTensor(dataset.is_attack).to(device)
y2 = torch.FloatTensor(dataset.attack_classes).to(device)

dataset_size = len(X)

# Randomly get a subset of the data
subset_size = min(50000, dataset_size)
indices = torch.randperm(dataset_size)[:subset_size]
X = X[indices]
y1 = y1[indices]
y2 = y2[indices]

In [16]:
# Display num of elements per class
unique, counts = torch.unique(torch.argmax(y1, dim=1), return_counts=True)
class_distribution = dict(zip(unique.cpu().numpy(), counts.cpu().numpy()))
for cls, count in class_distribution.items():
    print(f"Class {cls}: {count} samples")

difference = subset_size - sum(counts).item()
print(f"Class 'benign' (0): {difference} samples")

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [17]:
# Display num of elements per class
unique, counts = torch.unique(torch.argmax(y2, dim=1), return_counts=True)
class_distribution = dict(zip(unique.cpu().numpy(), counts.cpu().numpy()))
for cls, count in class_distribution.items():
    print(f"Class {cls}: {count} samples")

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [18]:
dataset_tensor = TensorDataset(X, y1, y2)

train_size = int(0.8 * len(dataset_tensor))
val_size = len(dataset_tensor) - train_size

print(f"Dataset size: {len(dataset_tensor)}, Train size: {train_size}, Val size: {val_size}")

Dataset size: 50000, Train size: 40000, Val size: 10000


In [19]:
train_dataset, val_dataset = random_split(dataset_tensor, [train_size, val_size])

In [20]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [21]:
input_size = dataset.scaled_features.shape[1]
num_classes = len(dataset.attack_classes[0])
print(f"Input size: {input_size}, Num classes: {num_classes}")

TypeError: object of type 'numpy.int64' has no len()

In [None]:
rf1 = random_forest.train_random_forest(X,y1,n_estimators=10,random_state=42,logger=lm.logger)
rf1
rf1.predict()

2025-11-18 09:48:28,205 - INFO - Random Forest Model
2025-11-18 09:48:28,206 - INFO - 
Cross-validation scores: 0.9938, 0.9945, 0.9947, 0.9956, 0.9955
2025-11-18 09:48:28,207 - INFO - 
Mean cross-validation score: 0.99


(RandomForestClassifier(max_depth=6, max_features=None, n_estimators=10,
                        random_state=42),
 array([0.9938, 0.9945, 0.9947, 0.9956, 0.9955]))

In [32]:
rf2 = random_forest.train_random_forest(X,y2,logger=lm.logger)
rf2

2025-11-18 09:43:21,461 - INFO - Random Forest Model
2025-11-18 09:43:21,462 - INFO - 
Cross-validation scores: 0.9894, 0.9905, 0.9896, 0.9901, 0.9901
2025-11-18 09:43:21,463 - INFO - 
Mean cross-validation score: 0.99


(RandomForestClassifier(max_depth=6, max_features=None, n_estimators=10,
                        random_state=0),
 array([0.9894, 0.9905, 0.9896, 0.9901, 0.9901]))

In [37]:
k = knn.train_knn(X,y1,logger=lm.logger)

2025-11-18 10:00:03,878 - INFO - K-Nearest Neighbors Model
2025-11-18 10:00:03,880 - INFO - 
Cross-validation scores: 0.9925, 0.9946, 0.9943, 0.9935, 0.9935
2025-11-18 10:00:03,881 - INFO - 
Mean cross-validation score: 0.99


In [38]:
k = knn.train_knn(X,y2,logger=lm.logger)

2025-11-18 10:00:26,755 - INFO - K-Nearest Neighbors Model
2025-11-18 10:00:26,757 - INFO - 
Cross-validation scores: 0.9877, 0.9898, 0.9873, 0.9877, 0.9874
2025-11-18 10:00:26,759 - INFO - 
Mean cross-validation score: 0.99
