Import Libraries

In [None]:
from pathlib import Path
from typing import List, Tuple, Dict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc,
    precision_recall_curve,
    PrecisionRecallDisplay,
    RocCurveDisplay,
)
from minisom import MiniSom


SOM classifier

In [None]:

def fit_som(
    data: np.ndarray,
    grid: Tuple[int, int] = (22, 22),
    seed: int = 42,
    sigma: float = 3.0,
    learning_rate: float = 0.5,
    num_iteration: int = 10_000,
) -> "MiniSom":
    """Train a Self-Organizing Map on ``data`` and return the fitted model.

    Parameters
    ----------
    data : np.ndarray
        2-D array with one sample per row and one feature per column.
    grid : tuple of (rows, cols)
        Size of the SOM lattice. Defaults to 22 x 22.
    seed : int
        Random seed passed to MiniSom for reproducibility.
    sigma : float
        Spread of the Gaussian neighbourhood function.
    learning_rate : float
        Initial learning rate passed to MiniSom.
    num_iteration : int
        Number of training iterations passed to ``train_batch``.

    Returns
    -------
    MiniSom
        The trained map.

    Raises
    ------
    ValueError
        If ``data`` is not a non-empty 2-D array.
    """
    data = np.asarray(data)
    # Fail early with a readable message instead of an opaque error from
    # inside MiniSom when the feature matrix is malformed.
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError(
            "data must be a non-empty 2-D array of shape (n_samples, n_features); "
            f"got shape {data.shape}"
        )

    rows, cols = grid
    som = MiniSom(
        x=rows,
        y=cols,
        input_len=data.shape[1],           # number of features
        sigma=sigma,                       # spread of the neighbourhood
        learning_rate=learning_rate,       # speed of learning
        neighborhood_function="gaussian",  # type of neighbourhood function
        random_seed=seed,                  # for reproducibility
    )
    som.random_weights_init(data)  # start the node weights from samples of the data
    som.train_batch(data, num_iteration=num_iteration, verbose=False)
    return som

def majority_vote_lookup(som: "MiniSom", data: np.ndarray, labels: np.ndarray) -> Dict[Tuple[int, int], int]:
    """Build a lookup table from each SOM node (BMU) to its majority label.

    Every sample in ``data`` is mapped to its best-matching unit via
    ``som.winner``; the labels of all samples landing on the same node are
    pooled and the most frequent one is stored for that node.

    Parameters
    ----------
    som : MiniSom
        Trained map; only its ``winner`` method is used.
    data : np.ndarray
        Samples, one per row, aligned with ``labels``.
    labels : np.ndarray
        Integer class label of each sample.

    Returns
    -------
    dict
        ``{bmu_coordinates: majority_label}`` for every node that received at
        least one sample. Nodes that received none are absent.
    """
    votes: Dict[Tuple[int, int], List[int]] = {}

    for vec, lbl in zip(data, labels):
        # collect every label that lands on this sample's best-matching unit
        votes.setdefault(som.winner(vec), []).append(lbl)

    lookup: Dict[Tuple[int, int], int] = {}
    for bmu, node_labels in votes.items():
        # A true mode, not a rounded mean: the mean is only a majority vote
        # for 0/1 labels (e.g. labels {0, 2, 2} would average to 1).
        classes, counts = np.unique(node_labels, return_counts=True)
        # np.unique returns classes sorted and argmax picks the first maximum,
        # so ties resolve to the smallest label (0 for a binary 0/1 tie).
        lookup[bmu] = int(classes[np.argmax(counts)])
    return lookup

def predict_som(
    som: "MiniSom",
    vote_map: Dict[Tuple[int, int], int],
    data: np.ndarray,
    default_label: int = 0,
) -> np.ndarray:
    """Predict a label for every row of ``data`` using a trained SOM.

    Each sample is mapped to its best-matching unit with ``som.winner`` and
    receives the label stored for that node in ``vote_map``.

    Parameters
    ----------
    som : MiniSom
        Trained map; only its ``winner`` method is used.
    vote_map : dict
        ``{bmu_coordinates: label}`` lookup built from the training data.
    data : np.ndarray
        Samples to classify, one per row.
    default_label : int
        Label assigned when a sample's BMU is missing from ``vote_map``
        (i.e. no training sample landed on that node). Defaults to 0.

    Returns
    -------
    np.ndarray
        Predicted labels, one per input row.
    """
    predictions = []
    for vec in data:
        bmu = som.winner(vec)
        # nodes never hit during training have no vote; fall back explicitly
        predictions.append(vote_map.get(bmu, default_label))
    return np.array(predictions)


K-fold cross-validation

In [None]:

def _plot_cv_diagnostics(all_true: List[int], all_pred: List[int], all_scores: List[float]) -> None:
    """Show the pooled confusion matrix, ROC curve and precision-recall curve."""
    cm = confusion_matrix(all_true, all_pred)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap="Blues")
    plt.title("Confusion Matrix (CV)")
    plt.show()

    fpr, tpr, _ = roc_curve(all_true, all_scores)
    roc_auc = auc(fpr, tpr)
    RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()
    plt.title(f"ROC Curve (CV) (AUC = {roc_auc:.4f})")
    plt.show()

    precision, recall, _ = precision_recall_curve(all_true, all_scores)
    PrecisionRecallDisplay(precision=precision, recall=recall).plot()
    plt.title("Precision-Recall Curve (CV)")
    plt.show()


def som_cross_validate(Syn_df: pd.DataFrame, feature_columns: List[str], grid: Tuple[int, int] = (22,22)):
    """Cross-validate a SOM classifier over the folds defined in ``Syn_df``.

    For every distinct value of the ``"Fold"`` column, the rows of that fold
    are held out for validation and the remaining rows are used for training.
    Features are standardised with a scaler fitted on the training rows only,
    a SOM is trained, its nodes are labelled by majority vote, and the
    held-out rows are classified.

    Parameters
    ----------
    Syn_df : pd.DataFrame
        Must contain the columns in ``feature_columns`` plus ``"Label"`` and
        ``"Fold"``.
    feature_columns : list of str
        Names of the columns used as model inputs.
    grid : tuple of (rows, cols)
        SOM lattice size forwarded to ``fit_som``.

    Returns
    -------
    list of float
        Validation accuracy of each fold, in sorted fold order.

    Side effects
    ------------
    Prints per-fold progress and a summary, and shows three matplotlib
    figures (confusion matrix, ROC curve, precision-recall curve) built from
    the predictions pooled across all folds.
    """
    accuracies: List[float] = []
    all_true: List[int] = []     # true labels across folds
    all_pred: List[int] = []     # predicted labels across folds
    all_scores: List[float] = [] # positive-class scores across folds

    folds = sorted(Syn_df["Fold"].unique())
    n_folds = len(folds)

    # Number folds by position so the progress lines agree with the summary
    # below and do not assume 0-based integer fold ids.
    for fold_no, fold in enumerate(folds, 1):
        print(f"Fold {fold_no} / {n_folds}")

        # Split into training and validation sets
        train_df = Syn_df[Syn_df["Fold"] != fold]
        validate_df = Syn_df[Syn_df["Fold"] == fold]

        # Scale features; the scaler is fitted on the training rows only so
        # no validation statistics leak into training.
        standard_scaler = StandardScaler()
        X_train = standard_scaler.fit_transform(train_df[feature_columns])
        X_validate = standard_scaler.transform(validate_df[feature_columns])
        y_train = train_df["Label"].values
        y_validate = validate_df["Label"].values

        # train SOM and assign labels to nodes via majority voting
        som = fit_som(X_train, grid)
        vote_map = majority_vote_lookup(som, X_train, y_train)

        # predict labels for validation set
        y_predict = predict_som(som, vote_map, X_validate)

        # Score for ROC / PR curves: mean training label of the sample's BMU.
        # The previous score (negative quantization error) measures how well a
        # sample fits the map regardless of class, so it is not a ranking of
        # "how likely positive" as roc_curve / precision_recall_curve expect.
        # NOTE(review): assumes binary labels with the larger value as the
        # positive class - confirm against the dataset.
        node_labels: Dict[Tuple[int, int], List[int]] = {}
        for vec, lbl in zip(X_train, y_train):
            node_labels.setdefault(som.winner(vec), []).append(lbl)
        node_score = {bmu: float(np.mean(v)) for bmu, v in node_labels.items()}
        # nodes unseen in training score 0.0, mirroring predict_som's fallback label 0
        scores = [node_score.get(som.winner(v), 0.0) for v in X_validate]

        # collect for visual diagnostics
        all_true.extend(y_validate)
        all_pred.extend(y_predict)
        all_scores.extend(scores)

        # accuracy
        acc = (y_predict == y_validate).mean()
        accuracies.append(float(acc))
        print(f"Fold {fold_no}: accuracy = {acc:.4f}")

    # summary of final results
    print("\n══════ SOM Validation Summary ══════")
    for i, a in enumerate(accuracies, 1):
        print(f"Fold {i}: {a:.4f}")
    print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
    print(f"Standard Deviation: {np.std(accuracies):.4f}")

    # Visual evaluation (from cross-validation only)
    _plot_cv_diagnostics(all_true, all_pred, all_scores)

    return accuracies



Results and running it all together

In [None]:


if __name__ == "__main__":
    # Load the dataset.
    # Raw string: the Windows path contains backslash sequences (\C, \D, \K)
    # that are invalid escapes in a normal string literal and trigger a
    # warning on current Python versions. The resulting path is unchanged.
    # NOTE(review): this is a machine-specific absolute path; consider making
    # it relative to a configurable data directory.
    data_path = Path(r"D:\Coding Projects\Detection-of-SYN-Flood-Attacks-Using-Machine-Learning-and-Deep-Learning-Techniques-with-Feature-Base\Data\K5_Dataset.csv")
    Syn_df = pd.read_csv(data_path)

    # Select the first 12 feature columns in file order (exclude label and
    # fold info). Index.difference() sorts its result, so slicing it would
    # pick the first 12 names alphabetically instead.
    feature_columns = [col for col in Syn_df.columns if col not in ("Label", "Fold")][:12]

    # run cross-validation using a Self-Organizing Map
    accs = som_cross_validate(Syn_df, feature_columns)
