In [63]:
import joblib
import sklearn
from skl2onnx import to_onnx
from skl2onnx.common.data_types import FloatTensorType
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, label_binarize
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import onnxruntime as rt
from sklearn.pipeline import Pipeline

In [64]:
from lightgbm import LGBMClassifier
import lightgbm as lgb

In [65]:
from sklearn.metrics import (
    balanced_accuracy_score,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)
from sklearn.preprocessing import label_binarize

In [66]:
from skl2onnx import __max_supported_opset__, __version__

In [67]:
# Record the installed skl2onnx version and the highest ONNX opset it reports as supported,
# so the exported models can be traced back to the converter that produced them.
print("documentation for version:", __version__)
print("Last supported opset:", __max_supported_opset__)

documentation for version: 1.19.1
Last supported opset: 21


In [68]:
# Record the scikit-learn version used to load the saved estimators
print(sklearn.__version__)

1.7.1


In [69]:
# Reproducibility: seed both the standard-library and NumPy global generators
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Cross-validation settings (not referenced by the conversion/evaluation cells in this notebook)
N_FOLDS = 10
N_REPEATS = 3

# Number of target classes; the evaluation cells use it to size the probability
# arrays and to binarize the labels for ROC AUC.
CLASSES = 13

In [70]:
def load_and_preprocess_data(filepath):
    """
    Read the dataset CSV and split it into features and target.

    The '-' placeholder in the 'service' and 'proto' columns is replaced by the
    string 'None'. The 'Attack_type' column is used as the target and removed
    from the feature frame.

    Args:
        filepath (str): Path to the CSV file.

    Returns:
        data (pd.DataFrame): Feature dataframe (every column except 'Attack_type').
        target (pd.Series): The 'Attack_type' column.
        feature_names (list): Column names of the feature dataframe, in order.
    """
    frame = pd.read_csv(filepath)

    # '-' marks "no value" in these two categorical columns
    for column in ('service', 'proto'):
        frame[column] = frame[column].replace('-', 'None')

    # Split off the label column once and derive the feature names from what remains
    target = frame['Attack_type']
    data = frame.drop(columns=['Attack_type'])
    feature_names = data.columns.tolist()

    return data, target, feature_names

# Load the dataset (path is relative to the notebook's working directory)
data, target, feature_names = load_and_preprocess_data('./FCNN/RT_IOT2022_new.csv')



def encode_targets(target):
    """
    Map categorical target labels to integer codes with a LabelEncoder.

    Args:
        target (pd.Series): Categorical target labels.

    Returns:
        target_encoded (np.ndarray): Integer code for each entry of `target`.
        class_labels (np.ndarray): The encoder's `classes_` array; position i holds
            the original label for code i.
        target_encoder (LabelEncoder): The fitted encoder, for mapping codes back to labels.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(target)
    return codes, encoder.classes_, encoder


# In[24]:


# Encode target labels as integer codes; keep the class names and the fitted
# encoder so predictions can be reported with their original labels.
Y_encoded, class_labels, target_encoder = encode_targets(target)


# In[25]:


print(f"Number of unique classes: {len(class_labels)}")
print(f"Class labels: {class_labels}")

# CLASSES is hard-coded in the settings cell, and the evaluation cells use it to size
# probability arrays and to binarize labels. Fail fast if it disagrees with the data
# instead of silently computing metrics over the wrong label set.
if len(class_labels) != CLASSES:
    raise ValueError(
        f"CLASSES is set to {CLASSES} but the dataset contains {len(class_labels)} classes."
    )


# In[26]:


# Stratified 80/20 hold-out split, seeded so the same test rows are selected on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    Y_encoded,
    test_size=0.2,
    stratify=Y_encoded,
    random_state=SEED,
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Release the full frame and the raw labels; only the split pieces are used below.
# (Re-running this cell afterwards requires re-running the load cell first.)
del data, target

Number of unique classes: 13
Class labels: ['ARP_poisioning' 'Alexa' 'DDOS_Slowloris' 'DOS_SYN_Hping' 'MQTT_Publish'
 'Metasploit_Brute_Force_SSH' 'NMAP_FIN_SCAN' 'NMAP_OS_DETECTION'
 'NMAP_TCP_scan' 'NMAP_UDP_SCAN' 'NMAP_XMAS_TREE_SCAN' 'Thing_Speak'
 'Wipro_bulb']
Training set size: (167967, 83)
Test set size: (41992, 83)


### SVM

In [71]:
# Load the saved SVM artifact. For this model the file holds a
# (preprocessor, classifier) tuple rather than a single Pipeline object.
# joblib.load unpickles arbitrary objects, so only load files from a trusted source.
best_model_path = "./ML Models/best_svm_rbf_only_model_with_preproc__100.joblib"
best_model = joblib.load(best_model_path)

In [72]:
# The SVM artifact is a (preprocessor, classifier) tuple. Unpack the object already
# loaded into `best_model` instead of unpickling the same file a second time.
preprocessor, classifier = best_model

# Wrap both parts into a single Pipeline for convenience
pipe = Pipeline([("preprocessor", preprocessor), ("clf", classifier)])

# Transform the test data; the ONNX export below covers only the classifier,
# so inference inputs must already be preprocessed.
X_test_transformed = preprocessor.transform(X_test)

# Print the shape before and after preprocessing
print(f"Before preprocessing: X_test shape: {X_test.shape}")
print(f"After preprocessing: X_test_transformed shape: {X_test_transformed.shape}")

Before preprocessing: X_test shape: (41992, 83)
After preprocessing: X_test_transformed shape: (41992, 94)


In [73]:
# One preprocessed row, cast to float32, used as the example input for the ONNX conversion
sample_input = X_test_transformed[:1].astype(np.float32)  # one sample with 94 features

In [74]:
# Sanity check: the example input should be a single row
sample_input.shape

(1, 94)

In [75]:
# Declare the ONNX input as a float tensor with a dynamic batch dimension.
# The feature count is read from the example input rather than hard-coded, so the
# declaration stays correct if the preprocessor's output width changes.
initial_type = [("input", FloatTensorType([None, sample_input.shape[1]]))]

In [76]:
# Convert only the classifier; the preprocessor is not part of the exported graph,
# so the ONNX model expects already-transformed float32 features.
onnx_model = to_onnx(classifier, sample_input, initial_types=initial_type)

In [77]:
# Save the ONNX model (serialized protobuf bytes) next to the other model artifacts
onnx_model_path = "./ML Models/best_svm_rbf_only_model_100.onnx"
with open(onnx_model_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

print(f"ONNX model saved to {onnx_model_path}")

ONNX model saved to ./ML Models/best_svm_rbf_only_model_100.onnx


In [78]:
# Path to the saved ONNX model
onnx_model_path = "./ML Models/best_svm_rbf_only_model_100.onnx"

# Create an inference session with ONNX Runtime
session = rt.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])

# Retrieve the input name of the ONNX model
input_name = session.get_inputs()[0].name
print("ONNX model input name:", input_name)

# The model was exported with a FloatTensorType input, so feed float32
X_test_processed = X_test_transformed.astype(np.float32)

# Run inference. The exported classifier returns two outputs:
#  - outputs[0]: the predicted labels
#  - outputs[1]: the class probabilities
onnx_outputs = session.run(None, {input_name: X_test_processed})
Y_pred_onnx = onnx_outputs[0]  # Predicted labels

# Extract predicted probabilities
Y_pred_proba_raw = onnx_outputs[1]
if isinstance(Y_pred_proba_raw, dict):
    # Adjust the key if necessary; common key is "probabilities"
    Y_pred_proba = Y_pred_proba_raw.get("probabilities", None)
    if Y_pred_proba is None:
        raise ValueError("Expected key 'probabilities' not found in the output dict.")
else:
    Y_pred_proba = Y_pred_proba_raw

# When the probabilities arrive as one {class_id: probability} dict per row, flatten
# them into a dense (n_samples, CLASSES) array. The emptiness check avoids an
# IndexError on an empty batch.
if isinstance(Y_pred_proba, list) and Y_pred_proba and isinstance(Y_pred_proba[0], dict):
    Y_pred_proba = np.array(
        [[row.get(i, 0.0) for i in range(CLASSES)] for row in Y_pred_proba],
        dtype=np.float32
    )

# Compute evaluation metrics using the ONNX model predictions.
# zero_division=0 is set on every macro metric so they treat empty classes the same way.
test_bal_acc = balanced_accuracy_score(Y_test, Y_pred_onnx)
test_accuracy = accuracy_score(Y_test, Y_pred_onnx)
test_macro_f1 = f1_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)
test_precision = precision_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)
test_recall = recall_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)

# ROC AUC: Y_test is binarized into an indicator matrix, so roc_auc_score computes a
# one-vs-rest AUC per class and macro-averages them. `multi_class` is only used for
# 1-D multiclass targets, so passing 'ovo' here would have no effect and is omitted.
all_labels = np.arange(CLASSES)
Y_test_bin = label_binarize(Y_test, classes=all_labels)
test_auc = roc_auc_score(Y_test_bin, Y_pred_proba, average='macro')

print(f"Test Balanced Accuracy: {test_bal_acc:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Macro F1 Score: {test_macro_f1:.4f}")
print(f"Test Precision (Macro): {test_precision:.4f}")
print(f"Test Recall (Macro): {test_recall:.4f}")
print(f"Test ROC AUC Score: {test_auc:.4f}")

# Classification report and confusion matrix. `labels` pins both to the full class
# list so rows/columns always line up with `class_labels`, even if a class is absent.
report = classification_report(
    Y_test, Y_pred_onnx, labels=all_labels, target_names=class_labels, zero_division=0
)
print("\nClassification Report:")
print(report)

cm = confusion_matrix(Y_test, Y_pred_onnx, labels=all_labels)
print("\nConfusion Matrix:")
print(cm)


ONNX model input name: input
Test Balanced Accuracy: 0.8807
Test Accuracy: 0.9555
Test Macro F1 Score: 0.8888
Test Precision (Macro): 0.9195
Test Recall (Macro): 0.8807
Test ROC AUC Score: 0.9501

Classification Report:
                            precision    recall  f1-score   support

            ARP_poisioning       0.46      0.73      0.56      1550
                     Alexa       0.98      0.97      0.97     17368
            DDOS_Slowloris       1.00      0.79      0.88       107
             DOS_SYN_Hping       1.00      1.00      1.00     18932
              MQTT_Publish       1.00      1.00      1.00       829
Metasploit_Brute_Force_SSH       1.00      1.00      1.00         7
             NMAP_FIN_SCAN       0.71      0.83      0.77         6
         NMAP_OS_DETECTION       1.00      1.00      1.00       400
             NMAP_TCP_scan       0.99      0.99      0.99       200
             NMAP_UDP_SCAN       0.96      0.99      0.98       518
       NMAP_XMAS_TREE_SCAN     

In [79]:
# Set the sample index (change this value to test different samples)
sample_index = 1111  # e.g., 0 for first sample, 10 for the 11th sample, etc.

# Use the SVM ONNX model exported above. The path is set explicitly so this cell does
# not depend on whichever model path the kernel assigned last.
onnx_model_path = "./ML Models/best_svm_rbf_only_model_100.onnx"

# Create an inference session with ONNX Runtime
session = rt.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])

# Retrieve the input name of the ONNX model
input_name = session.get_inputs()[0].name
print("ONNX model input name:", input_name)

# The model was exported with a FloatTensorType input, so feed float32
X_test_processed = X_test_transformed.astype(np.float32)

# An out-of-range (or negative) index would produce an empty slice below and only
# fail later with a confusing error, so validate it up front.
if not 0 <= sample_index < X_test_processed.shape[0]:
    raise IndexError(
        f"sample_index {sample_index} is out of range for a test set of "
        f"{X_test_processed.shape[0]} samples."
    )

# Select the sample as a 2D array with a single row
sample = X_test_processed[sample_index:sample_index+1]

# Run inference on the selected sample. The exported classifier returns two outputs:
#   - outputs[0]: the predicted label
#   - outputs[1]: the predicted probabilities
onnx_outputs = session.run(None, {input_name: sample})
Y_pred_sample = onnx_outputs[0]  # Predicted label(s)

# Extract predicted probabilities
Y_pred_proba_raw = onnx_outputs[1]
if isinstance(Y_pred_proba_raw, dict):
    # Adjust the key if necessary; common key is "probabilities"
    Y_pred_proba_sample = Y_pred_proba_raw.get("probabilities", None)
    if Y_pred_proba_sample is None:
        raise ValueError("Expected key 'probabilities' not found in the output dict.")
else:
    Y_pred_proba_sample = Y_pred_proba_raw

# If probabilities are returned as a list of dicts, convert to a 2D array
if (
    isinstance(Y_pred_proba_sample, list)
    and Y_pred_proba_sample
    and isinstance(Y_pred_proba_sample[0], dict)
):
    Y_pred_proba_sample = np.array(
        [[sample_dict.get(i, 0.0) for i in range(CLASSES)] for sample_dict in Y_pred_proba_sample],
        dtype=np.float32
    )

# --- Compute metrics for the single sample ---

# Get the true label for the selected sample
true_label = Y_test[sample_index]
pred_label = Y_pred_sample[0]

print("\n--- Single Sample Inference ---")
print(f"Sample index: {sample_index}")
print(f"True label: {true_label}")
print(f"Predicted label: {pred_label}")
print("Predicted probabilities:", Y_pred_proba_sample[0])

# For a single sample, accuracy is 1 if prediction matches, else 0.
single_accuracy = 1 if pred_label == true_label else 0
print(f"Accuracy (single sample): {single_accuracy}")

# Create arrays for computing other metrics (they will be 0 or 1)
y_true_single = np.array([true_label])
y_pred_single = np.array([pred_label])

# zero_division=0 on all three keeps a wrong prediction from emitting undefined-metric warnings
f1 = f1_score(y_true_single, y_pred_single, average='macro', zero_division=0)
precision = precision_score(y_true_single, y_pred_single, average='macro', zero_division=0)
recall = recall_score(y_true_single, y_pred_single, average='macro', zero_division=0)

print(f"F1 Score (single sample): {f1:.4f}")
print(f"Precision (single sample): {precision:.4f}")
print(f"Recall (single sample): {recall:.4f}")


ONNX model input name: input

--- Single Sample Inference ---
Sample index: 1111
True label: 1
Predicted label: 1
Predicted probabilities: [0.0882486  0.6195134  0.00472645 0.00253444 0.02091732 0.00258259
 0.00180116 0.00241022 0.00319335 0.00631766 0.00366619 0.20716679
 0.0369219 ]
Accuracy (single sample): 1
F1 Score (single sample): 1.0000
Precision (single sample): 1.0000
Recall (single sample): 1.0000


-----------

------
### RandomForest

In [80]:
# Load the saved RandomForest model; later cells access it as a Pipeline with
# 'preprocessor' and 'classifier' steps.
# joblib.load unpickles arbitrary objects, so only load files from a trusted source.
best_model_path = "./ML Models/best_rf_model_100.joblib"
best_model = joblib.load(best_model_path)

In [81]:
# Access the preprocessor from the pipeline
preprocessor = best_model.named_steps['preprocessor']

# Transform the test data with this model's own fitted preprocessor
# (this rebinds X_test_transformed, replacing the SVM section's array)
X_test_transformed = preprocessor.transform(X_test)

# Print the shape before and after preprocessing
print(f"Before preprocessing: X_test shape: {X_test.shape}")
print(f"After preprocessing: X_test_transformed shape: {X_test_transformed.shape}")  

Before preprocessing: X_test shape: (41992, 83)
After preprocessing: X_test_transformed shape: (41992, 94)


In [82]:
# Pull out the fitted classifier step; only this step is exported to ONNX
classifier = best_model.named_steps['classifier']

In [84]:
# One preprocessed row, cast to float32, used as the example input for the conversion
sample_input = X_test_transformed[:1].astype(np.float32)

# Declare the ONNX input as a float tensor with a dynamic batch dimension. The feature
# count is taken from the example input rather than hard-coded, so the declaration
# stays correct if the preprocessor's output width changes.
initial_type = [("input", FloatTensorType([None, sample_input.shape[1]]))]

# Convert the RandomForest classifier to ONNX. Only the classifier step is exported,
# so the ONNX model expects already-preprocessed features.
onnx_model = to_onnx(classifier, sample_input, initial_types=initial_type)

# Save the ONNX model to a file
onnx_model_path = "./ML Models/best_rf_model_100.onnx"
with open(onnx_model_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

print(f"ONNX model saved to {onnx_model_path}")

ONNX model saved to ./ML Models/best_rf_model_100.onnx


In [85]:
# Path to the saved ONNX model
onnx_model_path = "./ML Models/best_rf_model_100.onnx"

# Create an inference session with ONNX Runtime
session = rt.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])

# Retrieve the input name of the ONNX model
input_name = session.get_inputs()[0].name
print("ONNX model input name:", input_name)

# The model was exported with a FloatTensorType input, so feed float32
X_test_processed = X_test_transformed.astype(np.float32)

# Run inference. The exported RandomForest classifier returns two outputs:
#  - outputs[0]: the predicted labels
#  - outputs[1]: the class probabilities
onnx_outputs = session.run(None, {input_name: X_test_processed})
Y_pred_onnx = onnx_outputs[0]  # Predicted labels

# Extract predicted probabilities
Y_pred_proba_raw = onnx_outputs[1]
if isinstance(Y_pred_proba_raw, dict):
    # Adjust the key if necessary; common key is "probabilities"
    Y_pred_proba = Y_pred_proba_raw.get("probabilities", None)
    if Y_pred_proba is None:
        raise ValueError("Expected key 'probabilities' not found in the output dict.")
else:
    Y_pred_proba = Y_pred_proba_raw

# When the probabilities arrive as one {class_id: probability} dict per row, flatten
# them into a dense (n_samples, CLASSES) array. The emptiness check avoids an
# IndexError on an empty batch.
if isinstance(Y_pred_proba, list) and Y_pred_proba and isinstance(Y_pred_proba[0], dict):
    Y_pred_proba = np.array(
        [[row.get(i, 0.0) for i in range(CLASSES)] for row in Y_pred_proba],
        dtype=np.float32
    )

# Compute evaluation metrics using the ONNX model predictions.
# zero_division=0 is set on every macro metric so they treat empty classes the same way.
test_bal_acc = balanced_accuracy_score(Y_test, Y_pred_onnx)
test_accuracy = accuracy_score(Y_test, Y_pred_onnx)
test_macro_f1 = f1_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)
test_precision = precision_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)
test_recall = recall_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)

# ROC AUC: Y_test is binarized into an indicator matrix, so roc_auc_score computes a
# one-vs-rest AUC per class and macro-averages them. `multi_class` is only used for
# 1-D multiclass targets, so passing 'ovo' here would have no effect and is omitted.
all_labels = np.arange(CLASSES)
Y_test_bin = label_binarize(Y_test, classes=all_labels)
test_auc = roc_auc_score(Y_test_bin, Y_pred_proba, average='macro')

print(f"Test Balanced Accuracy: {test_bal_acc:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Macro F1 Score: {test_macro_f1:.4f}")
print(f"Test Precision (Macro): {test_precision:.4f}")
print(f"Test Recall (Macro): {test_recall:.4f}")
print(f"Test ROC AUC Score: {test_auc:.4f}")

# Classification report and confusion matrix. `labels` pins both to the full class
# list so rows/columns always line up with `class_labels`, even if a class is absent.
report = classification_report(
    Y_test, Y_pred_onnx, labels=all_labels, target_names=class_labels, zero_division=0
)
print("\nClassification Report:")
print(report)

cm = confusion_matrix(Y_test, Y_pred_onnx, labels=all_labels)
print("\nConfusion Matrix:")
print(cm)


ONNX model input name: input
Test Balanced Accuracy: 0.9680
Test Accuracy: 0.9985
Test Macro F1 Score: 0.9807
Test Precision (Macro): 0.9954
Test Recall (Macro): 0.9680
Test ROC AUC Score: 0.9933

Classification Report:
                            precision    recall  f1-score   support

            ARP_poisioning       0.98      0.99      0.99      1550
                     Alexa       1.00      1.00      1.00     17368
            DDOS_Slowloris       1.00      0.95      0.98       107
             DOS_SYN_Hping       1.00      1.00      1.00     18932
              MQTT_Publish       1.00      1.00      1.00       829
Metasploit_Brute_Force_SSH       1.00      1.00      1.00         7
             NMAP_FIN_SCAN       1.00      0.83      0.91         6
         NMAP_OS_DETECTION       1.00      1.00      1.00       400
             NMAP_TCP_scan       0.99      0.99      0.99       200
             NMAP_UDP_SCAN       1.00      0.99      0.99       518
       NMAP_XMAS_TREE_SCAN     

------
### XGB

In [3]:
# !pip install xgboost==3.0.2

In [41]:
import joblib, numpy as np
from sklearn.pipeline import Pipeline
from skl2onnx import to_onnx

In [42]:
import numpy
import onnxruntime as rt
from sklearn.datasets import load_iris, load_diabetes, make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, XGBRegressor, DMatrix, train as train_xgb
from skl2onnx.common.data_types import FloatTensorType
from onnxmltools.convert.common.data_types import FloatTensorType as ml_tools_FloatTensorType
from skl2onnx import convert_sklearn, to_onnx, update_registered_converter
from skl2onnx.common.shape_calculator import (
    calculate_linear_classifier_output_shapes,
    calculate_linear_regressor_output_shapes,
)
from skl2onnx.convert import may_switch_bases_classes_order
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert import convert_xgboost as convert_xgboost_booster


In [43]:
# Load the saved XGBoost model; the next cell accesses it as a Pipeline with
# 'preprocessor' and 'classifier' steps.
# joblib.load unpickles arbitrary objects, so only load files from a trusted source.
pipeline = joblib.load("./ML Models/best_xgb_model_100.joblib")

In [44]:
# === Extract the classifier and preprocessor ===
# Only the classifier is exported to ONNX; the preprocessor is applied in Python beforehand.
classifier = pipeline.named_steps['classifier']
preprocessor = pipeline.named_steps['preprocessor']

In [45]:
# === Transform test data to obtain the proper input shape ===
# (this rebinds X_test_transformed using the XGBoost pipeline's own fitted preprocessor)
X_test_transformed = preprocessor.transform(X_test)
# Print the shape before and after preprocessing
print(f"Before preprocessing: X_test shape: {X_test.shape}")
print(f"After preprocessing: X_test_transformed shape: {X_test_transformed.shape}")  

Before preprocessing: X_test shape: (41992, 83)
After preprocessing: X_test_transformed shape: (41992, 94)


In [46]:
# One preprocessed row as float32; its width gives the feature count for the ONNX input declaration
sample_input = X_test_transformed[:1].astype(np.float32)
n_features = sample_input.shape[1]

In [31]:
# ---------- Register converter ----------
# Tell skl2onnx how to convert XGBClassifier: reuse the linear-classifier shape
# calculator and onnxmltools' XGBoost converter, and declare which converter
# options ("nocl", "zipmap") are accepted for this model type.
# NOTE: this registration must have run before the to_onnx call on the classifier.
update_registered_converter(
    XGBClassifier,
    "XGBoostXGBClassifier",
    calculate_linear_classifier_output_shapes,
    convert_xgboost,
    options={"nocl": [True, False], "zipmap": [True, False, "columns"]},
)

In [47]:
# === Define ONNX input type ===
# Float tensor with a dynamic batch dimension and n_features columns
initial_type = [('input', FloatTensorType([None, n_features]))]

In [50]:
# Convert the XGBoost classifier step to ONNX.
# - `initial_types` is passed by keyword; the second positional parameter of to_onnx
#   is the sample data `X`, not the type declaration.
# - Converter options are keyed by the id of the object being converted. That object is
#   `classifier`; keying by the enclosing pipeline would leave `zipmap=False` unapplied.
#   With zipmap disabled the probabilities come back as a dense array instead of a
#   list of dicts.
onnx_model = to_onnx(
    classifier,
    initial_types=initial_type,
    target_opset={"": 12, "ai.onnx.ml": 3},
    options={id(classifier): {"zipmap": False}},
)

# === Save the ONNX model ===
onnx_model_path = "./ML Models/best_xgb_model_100.onnx"
with open(onnx_model_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

print(f"ONNX model saved to {onnx_model_path}")

ONNX model saved to ./ML Models/best_xgb_model_100.onnx


In [51]:
# Path to the saved ONNX model
onnx_model_path = "./ML Models/best_xgb_model_100.onnx"

# Create an inference session with ONNX Runtime
session = rt.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])

# Retrieve the input name of the ONNX model
input_name = session.get_inputs()[0].name
print("ONNX model input name:", input_name)

# The model was exported with a FloatTensorType input, so feed float32
X_test_processed = X_test_transformed.astype(np.float32)

# Run inference. The exported XGBoost classifier returns two outputs:
#  - outputs[0]: the predicted labels
#  - outputs[1]: the class probabilities
onnx_outputs = session.run(None, {input_name: X_test_processed})
Y_pred_onnx = onnx_outputs[0]  # Predicted labels

# Extract predicted probabilities
Y_pred_proba_raw = onnx_outputs[1]
if isinstance(Y_pred_proba_raw, dict):
    # Adjust the key if necessary; common key is "probabilities"
    Y_pred_proba = Y_pred_proba_raw.get("probabilities", None)
    if Y_pred_proba is None:
        raise ValueError("Expected key 'probabilities' not found in the output dict.")
else:
    Y_pred_proba = Y_pred_proba_raw

# When the probabilities arrive as one {class_id: probability} dict per row, flatten
# them into a dense (n_samples, CLASSES) array. The emptiness check avoids an
# IndexError on an empty batch.
if isinstance(Y_pred_proba, list) and Y_pred_proba and isinstance(Y_pred_proba[0], dict):
    Y_pred_proba = np.array(
        [[row.get(i, 0.0) for i in range(CLASSES)] for row in Y_pred_proba],
        dtype=np.float32
    )

# Compute evaluation metrics using the ONNX model predictions.
# zero_division=0 is set on every macro metric so they treat empty classes the same way.
test_bal_acc = balanced_accuracy_score(Y_test, Y_pred_onnx)
test_accuracy = accuracy_score(Y_test, Y_pred_onnx)
test_macro_f1 = f1_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)
test_precision = precision_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)
test_recall = recall_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)

# ROC AUC: Y_test is binarized into an indicator matrix, so roc_auc_score computes a
# one-vs-rest AUC per class and macro-averages them. `multi_class` is only used for
# 1-D multiclass targets, so passing 'ovo' here would have no effect and is omitted.
all_labels = np.arange(CLASSES)
Y_test_bin = label_binarize(Y_test, classes=all_labels)
test_auc = roc_auc_score(Y_test_bin, Y_pred_proba, average='macro')

print(f"Test Balanced Accuracy: {test_bal_acc:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Macro F1 Score: {test_macro_f1:.4f}")
print(f"Test Precision (Macro): {test_precision:.4f}")
print(f"Test Recall (Macro): {test_recall:.4f}")
print(f"Test ROC AUC Score: {test_auc:.4f}")

# Classification report and confusion matrix. `labels` pins both to the full class
# list so rows/columns always line up with `class_labels`, even if a class is absent.
report = classification_report(
    Y_test, Y_pred_onnx, labels=all_labels, target_names=class_labels, zero_division=0
)
print("\nClassification Report:")
print(report)

cm = confusion_matrix(Y_test, Y_pred_onnx, labels=all_labels)
print("\nConfusion Matrix:")
print(cm)


ONNX model input name: input
Test Balanced Accuracy: 0.9796
Test Accuracy: 0.9989
Test Macro F1 Score: 0.9703
Test Precision (Macro): 0.9685
Test Recall (Macro): 0.9796
Test ROC AUC Score: 1.0000

Classification Report:
                            precision    recall  f1-score   support

            ARP_poisioning       0.99      0.99      0.99      1550
                     Alexa       1.00      1.00      1.00     17368
            DDOS_Slowloris       1.00      0.95      0.98       107
             DOS_SYN_Hping       1.00      1.00      1.00     18932
              MQTT_Publish       1.00      1.00      1.00       829
Metasploit_Brute_Force_SSH       0.64      1.00      0.78         7
             NMAP_FIN_SCAN       1.00      0.83      0.91         6
         NMAP_OS_DETECTION       1.00      1.00      1.00       400
             NMAP_TCP_scan       1.00      1.00      1.00       200
             NMAP_UDP_SCAN       0.99      0.99      0.99       518
       NMAP_XMAS_TREE_SCAN     

------
### LGB

In [48]:
import joblib
import numpy as np
from skl2onnx import convert_sklearn, update_registered_converter
from onnxmltools.convert.common.data_types import FloatTensorType
from skl2onnx.common.shape_calculator import (
    calculate_linear_classifier_output_shapes,
)
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import (
    convert_lightgbm,
)
from onnxmltools.convert import convert_lightgbm as convert_lgbm  # <-- Booster converter
from lightgbm import LGBMClassifier
import onnx

In [49]:
# 1) Load fitted preprocessor + booster string
# The path gets its own name: binding it to `pipeline` would overwrite the XGBoost
# Pipeline object with a plain string.
# joblib.load unpickles arbitrary objects, so only load files from a trusted source.
lgbm_artifact_path = "./ML Models/best_lgbm_model_with_preproc__100.joblib"
preprocessor, booster = joblib.load(lgbm_artifact_path)

In [50]:
# 2) Reconstruct Booster
# The artifact stores the model as a LightGBM model string. The type guard makes this
# cell safe to re-run: once `booster` is already a Booster it is left untouched instead
# of being passed back in as `model_str`.
if isinstance(booster, str):
    booster = lgb.Booster(model_str=booster)

In [51]:
# Sanity check: confirm `booster` is now a lightgbm Booster object
booster

<lightgbm.basic.Booster at 0x7cdad754f610>

In [52]:
# Inspect the fitted preprocessor's configuration
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [53]:
# Apply the LightGBM artifact's own fitted preprocessor (rebinds X_test_transformed)
X_test_transformed = preprocessor.transform(X_test)
# One preprocessed row as float32; its width gives the feature count for the ONNX input
sample_input = X_test_transformed[:1].astype(np.float32)
n_features = sample_input.shape[1]

# === Define the input type for ONNX conversion ===
# FloatTensorType here is the onnxmltools class imported in this section's import cell.
initial_type = [('input', FloatTensorType([None, n_features]))]


In [54]:
# Convert the raw LightGBM Booster with onnxmltools' converter (imported as convert_lgbm).
# Cap the target opset at 13, or lower if the installed onnx package supports less.
target_opset = min(13, onnx.defs.onnx_opset_version())

onnx_model = convert_lgbm(
    booster,
    initial_types=initial_type,
    target_opset=target_opset,
    zipmap=False,  # second output should be a dense [N, C] tensor
)

In [55]:
# === Save the ONNX model ===
# Writes the serialized protobuf bytes; the file contains only the converted booster,
# not the preprocessor.
onnx_model_path = "./ML Models/best_lgb_model_100.onnx"
with open(onnx_model_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

print(f"ONNX model saved to {onnx_model_path}")

ONNX model saved to ./ML Models/best_lgb_model_100.onnx


In [56]:
# Path to the saved ONNX model
onnx_model_path = "./ML Models/best_lgb_model_100.onnx"

# Create an inference session with ONNX Runtime
session = rt.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])

# Retrieve the input name of the ONNX model
input_name = session.get_inputs()[0].name
print("ONNX model input name:", input_name)

# The model was exported with a FloatTensorType input, so feed float32
X_test_processed = X_test_transformed.astype(np.float32)

# Run inference. The converted LightGBM model returns two outputs:
#  - outputs[0]: the predicted labels
#  - outputs[1]: the class probabilities
# NOTE(review): ONNX Runtime has logged a shape warning for the `label` output of this
# model when scoring the full batch; confirm the declared output shape if it matters.
onnx_outputs = session.run(None, {input_name: X_test_processed})
Y_pred_onnx = onnx_outputs[0]  # Predicted labels

# Extract predicted probabilities
Y_pred_proba_raw = onnx_outputs[1]
if isinstance(Y_pred_proba_raw, dict):
    # Adjust the key if necessary; common key is "probabilities"
    Y_pred_proba = Y_pred_proba_raw.get("probabilities", None)
    if Y_pred_proba is None:
        raise ValueError("Expected key 'probabilities' not found in the output dict.")
else:
    Y_pred_proba = Y_pred_proba_raw

# When the probabilities arrive as one {class_id: probability} dict per row, flatten
# them into a dense (n_samples, CLASSES) array. The emptiness check avoids an
# IndexError on an empty batch.
if isinstance(Y_pred_proba, list) and Y_pred_proba and isinstance(Y_pred_proba[0], dict):
    Y_pred_proba = np.array(
        [[row.get(i, 0.0) for i in range(CLASSES)] for row in Y_pred_proba],
        dtype=np.float32
    )

# Compute evaluation metrics using the ONNX model predictions.
# zero_division=0 is set on every macro metric so they treat empty classes the same way.
test_bal_acc = balanced_accuracy_score(Y_test, Y_pred_onnx)
test_accuracy = accuracy_score(Y_test, Y_pred_onnx)
test_macro_f1 = f1_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)
test_precision = precision_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)
test_recall = recall_score(Y_test, Y_pred_onnx, average='macro', zero_division=0)

# ROC AUC: Y_test is binarized into an indicator matrix, so roc_auc_score computes a
# one-vs-rest AUC per class and macro-averages them. `multi_class` is only used for
# 1-D multiclass targets, so passing 'ovo' here would have no effect and is omitted.
all_labels = np.arange(CLASSES)
Y_test_bin = label_binarize(Y_test, classes=all_labels)
test_auc = roc_auc_score(Y_test_bin, Y_pred_proba, average='macro')

print(f"Test Balanced Accuracy: {test_bal_acc:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Macro F1 Score: {test_macro_f1:.4f}")
print(f"Test Precision (Macro): {test_precision:.4f}")
print(f"Test Recall (Macro): {test_recall:.4f}")
print(f"Test ROC AUC Score: {test_auc:.4f}")

# Classification report and confusion matrix. `labels` pins both to the full class
# list so rows/columns always line up with `class_labels`, even if a class is absent.
report = classification_report(
    Y_test, Y_pred_onnx, labels=all_labels, target_names=class_labels, zero_division=0
)
print("\nClassification Report:")
print(report)

cm = confusion_matrix(Y_test, Y_pred_onnx, labels=all_labels)
print("\nConfusion Matrix:")
print(cm)


ONNX model input name: input
Test Balanced Accuracy: 0.9674
Test Accuracy: 0.9981
Test Macro F1 Score: 0.9810
Test Precision (Macro): 0.9970
Test Recall (Macro): 0.9674
Test ROC AUC Score: 0.9993

Classification Report:
                            precision    recall  f1-score   support

            ARP_poisioning       0.98      0.99      0.98      1550
                     Alexa       1.00      1.00      1.00     17368
            DDOS_Slowloris       1.00      0.95      0.98       107
             DOS_SYN_Hping       1.00      1.00      1.00     18932
              MQTT_Publish       1.00      1.00      1.00       829
Metasploit_Brute_Force_SSH       1.00      1.00      1.00         7
             NMAP_FIN_SCAN       1.00      0.83      0.91         6
         NMAP_OS_DETECTION       1.00      1.00      1.00       400
             NMAP_TCP_scan       0.99      0.99      0.99       200
             NMAP_UDP_SCAN       1.00      0.98      0.99       518
       NMAP_XMAS_TREE_SCAN     

[0;93m2025-08-12 15:45:08.628133661 [W:onnxruntime:, execution_frame.cc:876 VerifyOutputSizes] Expected shape from model of {1} does not match actual shape of {41992} for output label[m
