In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import copy
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score, roc_auc_score
from shared.utils import load_data
from datasets import preprocess_dataset
from sklearn.preprocessing import StandardScaler
from intrusion_detection_systems.models import cnn_model, mlp_model, rnn_model
from intrusion_detection_systems import train_dl_model, evaluate_dl_model
from agent.DQN import DQNModelSelector

# Select the compute device once for the notebook: the CUDA GPU when PyTorch
# reports one as available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
def _as_int_labels(raw_labels):
    """Return the labels as a NumPy array of ints.

    Each element is converted via ``int(str(value).strip())``, so values
    such as ``" 1"`` or ``"0 "`` are accepted as well as plain integers.
    """
    return np.array([int(str(value).strip()) for value in raw_labels])


# Tag passed to preprocess_dataset as both its save name and its load name.
name = "CIC-IDS_2017_2"

# Value passed to preprocess_dataset(seed=...) and as the second positional
# argument of load_data (presumably a seed there too -- TODO confirm).
SEED = 42

# CIC-IDS 2017 capture files handed to load_data. Only the DDoS capture is
# active; uncomment further entries to include more captures.
csv_files = [
    "./shared/data/CIC_2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    # "./shared/data/CIC_2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    # "./shared/data/CIC_2017/Friday-WorkingHours-Morning.pcap_ISCX.csv",
    # "./shared/data/CIC_2017/Monday-WorkingHours.pcap_ISCX.csv",
    # "./shared/data/CIC_2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    # "./shared/data/CIC_2017/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    # "./shared/data/CIC_2017/Tuesday-WorkingHours.pcap_ISCX.csv"
]

df = load_data(csv_files, SEED)
print("Dataset loaded")

df_preprocessed = preprocess_dataset(
    df,
    save=True,
    dataset_type="CIC_2017",
    seed=SEED,
    load=False,
    name_save=name,
    name_load=name,
)
print("Dataset preprocessed")

# Pull the train / test splits off the preprocessed dataset object.
X_train_full = df_preprocessed.x_train
y_train_full = df_preprocessed.y_train
X_test_full = df_preprocessed.x_test
y_test_full = df_preprocessed.y_test

# Shapes and label counts are reported on the labels as delivered, i.e.
# before they are parsed to ints below.
print(f"Train full shape: {X_train_full.shape}, labels distribution: {np.unique(y_train_full, return_counts=True)}")
print(f"Test full shape: {X_test_full.shape}, labels distribution: {np.unique(y_test_full, return_counts=True)}")

# From here on the label vectors are plain NumPy integer arrays.
y_train_full = _as_int_labels(y_train_full)
y_test_full = _as_int_labels(y_test_full)


In [None]:
def _report_setup(banner, model, criterion, optimizer):
    """Print the banner followed by the model, its loss and its optimizer."""
    print(banner)
    print(model)
    print(f"Criterion: {criterion}")
    print(f"Optimizer: {optimizer}")


# Second dimension of the training matrix, passed to every model factory
# (presumably the number of feature columns -- TODO confirm).
input_dim = X_train_full.shape[1]

# Each architecture gets its own CrossEntropyLoss instance and its own Adam
# optimizer (lr=0.001) bound to that model's parameters. The networks are
# built in the order MLP -> CNN -> RNN.

# MLP model
mlp_model_pt = mlp_model(input_dim)
criterion_mlp_model_pt = nn.CrossEntropyLoss()
optimizer_mlp_model_pt = optim.Adam(mlp_model_pt.parameters(), lr=0.001)
_report_setup(
    "--- MLP Model ---",
    mlp_model_pt,
    criterion_mlp_model_pt,
    optimizer_mlp_model_pt,
)

# CNN model
cnn_model_pt = cnn_model(input_dim)
criterion_cnn_model_pt = nn.CrossEntropyLoss()
optimizer_cnn_model_pt = optim.Adam(cnn_model_pt.parameters(), lr=0.001)
_report_setup(
    "--- CNN Model ---",
    cnn_model_pt,
    criterion_cnn_model_pt,
    optimizer_cnn_model_pt,
)

# RNN model (the banner labels it as an LSTM)
rnn_model_pt = rnn_model(input_dim)
criterion_rnn_model_pt = nn.CrossEntropyLoss()
optimizer_rnn_model_pt = optim.Adam(rnn_model_pt.parameters(), lr=0.001)
_report_setup(
    "--- RNN Model (LSTM) ---",
    rnn_model_pt,
    criterion_rnn_model_pt,
    optimizer_rnn_model_pt,
)

In [None]:
# Row caps for a quick experiment, clamped to the number of rows available.
SUB_TRAIN = min(len(X_train_full), 1000)
SUB_TEST = min(len(X_test_full), 2000)

# Use a small sample: the first SUB_TRAIN / SUB_TEST rows of each split.
# NOTE(review): this is a leading slice, not a random draw, so the class mix
# of the sample depends on the row order of the splits -- verify it is
# representative.
X_train = X_train_full[:SUB_TRAIN]
y_train = y_train_full[:SUB_TRAIN]
X_test = X_test_full[:SUB_TEST]
y_test = y_test_full[:SUB_TEST]

# To use the entire dataset instead, drop the four slices above and use:
# X_train, y_train = X_train_full, y_train_full
# X_test, y_test = X_test_full, y_test_full

# Label counts of the sampled splits (first message: training-set labels,
# second message: test-set labels).
train_label_counts = np.unique(y_train, return_counts=True)
print("📌 Nhãn của tập huấn luyện (y_train):", train_label_counts)
test_label_counts = np.unique(y_test, return_counts=True)
print("📌 Nhãn của tập kiểm thử (y_test):", test_label_counts)

In [None]:
# =====================
# DQN Training and Evaluation
# =====================
# Candidate classifiers handed to the selector.
# NOTE(review): presumably the position in this list is the "arm" index that
# predict() reports -- confirm against agent.DQN.
models = [
    mlp_model_pt,
    cnn_model_pt,
    rnn_model_pt
]

dqn = DQNModelSelector(models, n_clusters=3)

print("Training DQN-based selector with cluster-specific DQNs...")
dqn.train(X_train, y_train)

print("Predicting on test set...")
# predict() returns the predictions plus a second value that is kept in
# `arms` and not used further in this cell.
y_pred, arms = dqn.predict(X_test)

# Reduce the ground truth to a flat sequence of labels. In this notebook
# y_test is already a NumPy array by the time it is sliced from y_test_full,
# so the pandas branches are purely defensive.
if isinstance(y_test, pd.DataFrame):
    if "Label" in y_test.columns:
        y_test = y_test["Label"].values
    else:
        y_test = y_test.iloc[:, 0].values
elif isinstance(y_test, pd.Series):
    y_test = y_test.values

# Both vectors must be plain ints for the sklearn metrics below.
y_test = np.array([int(str(x).strip()) for x in y_test])
y_pred = np.array([int(x) for x in y_pred])

print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall (macro):", recall_score(y_test, y_pred, average='macro'))
print("F1 Score (macro):", f1_score(y_test, y_pred, average='macro'))

# roc_auc_score(y_true, y_score) in this plain form needs binary ground
# truth: with more than two classes it raises, and with a single class the
# score is undefined. y_test is a head-of-split sample, so it is not
# guaranteed to be binary; skip the metric instead of aborting the cell after
# every other result has been printed.
# Note that y_pred holds integer class labels (see the int() cast above), not
# probabilities, so this AUC reflects a single operating point only.
classes_in_test = np.unique(y_test)
if classes_in_test.size == 2:
    print("ROC AUC:", roc_auc_score(y_test, y_pred))
else:
    print(
        "ROC AUC: skipped "
        f"(needs exactly 2 classes in y_test, found {classes_in_test.size})"
    )