# Phase 4: Advanced Machine Learning Models


This notebook orchestrates advanced supervised models for cybersecurity attack classification and severity prediction. The workflow follows the Phase 4 implementation plan and builds on the cleaned feature sets produced during Phase 3.

## Objectives & Scope


- Configure a reproducible environment for advanced classifiers and regressors.
- Load standardized train, validation, and test splits with consistent data types.
- Prepare shared preprocessing pipelines (imputation, scaling, encoding) for downstream models.
- Persist dataset summaries and metadata for reporting and audit trails.

## Reproducibility & Artifacts



- Random seeds are fixed to ensure deterministic behaviour across executions.

- All data loading happens from the processed datasets directory without mutating source files.

- Derived summaries are exported to the `reports/` folder for later phases.


In [None]:
# Imports and global configuration
import warnings
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display
from pandas.api.types import (
    CategoricalDtype,
    is_numeric_dtype,
    is_string_dtype,
    )
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
    )
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    RocCurveDisplay,
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    mean_absolute_error,
    mean_squared_error,
    precision_recall_fscore_support,
    r2_score,
    roc_auc_score,
    )
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    cross_validate,
    )
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRegressor

# Matplotlib and seaborn configuration for consistent visuals
COLOR_PALETTE: List[str] = [
    "#c6d4e1",
    "#9bbcd4",
    "#6fa3c7",
    "#4a8ab8",
    "#2f6fa1",
    "#1f4f75",
    "#d3d3d3",
    "#a9a9a9",
    "#696969",
]

sns.set_theme(style="whitegrid", palette=COLOR_PALETTE)
mpl.rcParams["axes.prop_cycle"] = mpl.cycler(color=COLOR_PALETTE)

RANDOM_STATE: int = 42
np.random.seed(RANDOM_STATE)
warnings.filterwarnings("ignore", category=UserWarning)

DATA_DIR = Path("../cybersecurity_attacks_data/processed")
ARTIFACT_METRICS = Path("../models/phase4_metrics.csv")
ARTIFACT_FIGURES = Path("../reports/visualizations/phase4")
ARTIFACT_SUMMARY = Path("../reports/phase4_data_snapshot.md")
ARTIFACT_IMPORTANCES = Path("../models/phase4_feature_importances")

ARTIFACT_FIGURES.mkdir(parents=True, exist_ok=True)
ARTIFACT_SUMMARY.parent.mkdir(parents=True, exist_ok=True)
ARTIFACT_IMPORTANCES.mkdir(parents=True, exist_ok=True)


## Data Loading & Preparation
This section loads the preprocessed splits and performs lightweight validation before model training. We reuse the Phase 3 engineered features and extend them with model-specific augmentations when necessary.

In [None]:
def load_split(name: str) -> pd.DataFrame:
    """Load a processed CSV split from the Phase 3 preprocessing outputs."""
    path = DATA_DIR / f"{name}_data.csv"
    if not path.exists():
        raise FileNotFoundError(f"Split '{name}' not found at {path}")
    return pd.read_csv(path)


def summarize_split(df: pd.DataFrame, name: str) -> None:
    """Display a brief summary with shape and missingness diagnostics."""
    null_pct = df.isna().mean().sort_values(ascending=False)
    head = df.head(3)
    display(
        Markdown(
            f"### {name.title()} Split\n"
            f"- Shape: {df.shape[0]:,} rows × {df.shape[1]:,} columns\n"
            f"- Missing features (>0.01%): {null_pct[null_pct > 0.0001].index.tolist()}"
        )
    )
    display(head)


def attach_attack_success_flag(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure an attackSuccessful column exists by deriving it from action outcomes."""
    if "attackSuccessful" in df.columns:
        return df
    if not {"Action Taken_Ignored", "Action Taken_Logged"}.intersection(df.columns):
        raise KeyError(
            "Columns needed to derive attackSuccessful are missing from the dataframe."
        )
    df_copy = df.copy()
    ignored_series = (
        df_copy["Action Taken_Ignored"]
        if "Action Taken_Ignored" in df_copy
        else pd.Series(0, index=df_copy.index)
    )
    logged_series = (
        df_copy["Action Taken_Logged"]
        if "Action Taken_Logged" in df_copy
        else pd.Series(0, index=df_copy.index)
    )
    df_copy["attackSuccessful"] = (
        (ignored_series.fillna(0) + logged_series.fillna(0)) > 0
    ).astype(int)
    return df_copy


def derive_severity_score(df: pd.DataFrame) -> pd.Series:
    """Create an ordinal severity score from available one-hot encoded severity columns."""
    baseline = pd.Series(0.0, index=df.index)
    low = df.get("Severity Level_Low", baseline).fillna(0)
    medium = df.get("Severity Level_Medium", baseline).fillna(0)
    residual = (1 - (low + medium)).clip(lower=0)
    severity = medium + (residual * 2)
    return severity.astype(float)


train_df = attach_attack_success_flag(load_split("train"))
val_df = attach_attack_success_flag(load_split("val"))
test_df = attach_attack_success_flag(load_split("test"))

summarize_split(train_df, "train")
summarize_split(val_df, "validation")
summarize_split(test_df, "test")

TARGET_COL = "attackSuccessful"
LEAKAGE_COLS = {TARGET_COL, "Action Taken_Ignored", "Action Taken_Logged"}
FEATURE_COLS = [col for col in train_df.columns if col not in LEAKAGE_COLS]
SEVERITY_LEVEL_COLS = [col for col in FEATURE_COLS if col.startswith("Severity Level_")]
SEVERITY_TARGET_NAME = "severity_score"

train_severity = derive_severity_score(train_df).rename(SEVERITY_TARGET_NAME)
val_severity = derive_severity_score(val_df).rename(SEVERITY_TARGET_NAME)
test_severity = derive_severity_score(test_df).rename(SEVERITY_TARGET_NAME)
REGRESSION_FEATURE_COLS = [
    col for col in FEATURE_COLS if col not in SEVERITY_LEVEL_COLS
    ]


### Train Split
- Shape: 24,000 rows × 87 columns
- Missing features (>0.01%): ['IDS/IPS Alerts_class']

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Packet Length,Packet Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,...,Timestamp_x_Destination Port,Timestamp_div_Destination Port,Destination Port_div_Timestamp,Timestamp_x_Packet Length,Timestamp_div_Packet Length,Packet Length_div_Timestamp,Timestamp_x_Packet Type,Timestamp_div_Packet Type,Packet Type_div_Timestamp,attackSuccessful
0,0.0295,1.371828,1.471161,-1.047207,-0.529053,-0.046757,-0.988219,1.0,-0.369576,-0.996656,...,-0.015607,-0.055761,-17.933774,-0.001379,-0.63093,-1.584961,-0.029153,-0.029852,-33.498515,1
1,-1.211124,-1.560448,-1.403697,-1.476943,-0.595919,0.453196,1.011921,-1.0,-0.40978,1.003356,...,0.721732,2.032362,0.492038,-0.548877,-2.672406,-0.374195,-1.225562,-1.196856,-0.835522,1
2,1.358296,0.655884,0.746211,0.80563,-1.323317,-1.630743,-0.988219,-1.0,-0.059386,1.003356,...,-1.797456,-1.026432,-0.974248,-2.215031,-0.83293,-1.200581,-1.342294,-1.374488,-0.727544,1


### Validation Split
- Shape: 8,000 rows × 87 columns
- Missing features (>0.01%): ['IDS/IPS Alerts_class']

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Packet Length,Packet Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,...,Timestamp_x_Destination Port,Timestamp_div_Destination Port,Destination Port_div_Timestamp,Timestamp_x_Packet Length,Timestamp_div_Packet Length,Packet Length_div_Timestamp,Timestamp_x_Packet Type,Timestamp_div_Packet Type,Packet Type_div_Timestamp,attackSuccessful
0,1.454175,-0.033039,0.673378,0.549221,-1.662493,-0.85197,1.011921,1.0,0.37696,-0.996656,...,-2.417555,-0.874695,-1.143255,-1.238913,-1.706838,-0.585879,1.47151,1.437043,0.695873,1
1,-1.052278,-1.108556,0.383086,0.562367,1.624659,-0.976958,-0.988219,-1.0,0.905151,-0.996656,...,-1.709594,-0.647692,-1.543944,1.028032,1.077097,0.928422,1.039882,1.064823,0.939123,1
2,-0.655684,0.918377,-0.624448,0.339094,0.923104,-0.373649,-0.988219,1.0,-1.486262,-0.996656,...,-0.605264,-0.710304,-1.407849,0.244996,1.754811,0.569862,0.64796,0.6635,1.507158,0


### Test Split
- Shape: 8,000 rows × 87 columns
- Missing features (>0.01%): ['IDS/IPS Alerts_class']

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Packet Length,Packet Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,...,Timestamp_x_Destination Port,Timestamp_div_Destination Port,Destination Port_div_Timestamp,Timestamp_x_Packet Length,Timestamp_div_Packet Length,Packet Length_div_Timestamp,Timestamp_x_Packet Type,Timestamp_div_Packet Type,Packet Type_div_Timestamp,attackSuccessful
0,0.295744,1.140252,0.838702,1.448511,1.269763,1.49156,-0.988219,-1.0,0.66185,-0.996656,...,0.375525,0.232913,4.293446,0.441121,0.198279,5.04341,-0.29226,-0.29927,-3.341464,1
1,0.302673,0.001516,-1.227808,0.769854,-0.295453,0.17678,-0.988219,-1.0,0.193272,1.003356,...,-0.089426,-1.024439,-0.976144,0.053507,1.712149,0.584061,-0.299108,-0.306282,-3.26497,1
2,0.163489,1.232917,-1.649129,-1.687017,-0.756624,-1.164441,1.011921,-1.0,-0.607678,-0.996656,...,-0.123699,-0.216076,-4.627996,-0.190373,-0.140401,-7.122461,0.165437,0.161563,6.189554,1


## Feature Engineering & Pipelines
The preprocessing pipeline mirrors Phase 3 encoders while allowing advanced estimators to toggle optional feature transformations. This modular design supports experimentation with imbalance handling and interaction effects.

In [None]:
def infer_feature_types(
    df: pd.DataFrame, feature_columns: List[str]
    ) -> Tuple[List[str], List[str]]:
    """Infer categorical and numeric columns from the provided feature list."""
    categorical_cols: List[str] = []
    numeric_cols: List[str] = []
    for col in feature_columns:
        dtype = df[col].dtype
        if is_numeric_dtype(dtype):
            numeric_cols.append(col)
        elif isinstance(dtype, CategoricalDtype) or is_string_dtype(dtype):
            categorical_cols.append(col)
        else:
            categorical_cols.append(col)
    return numeric_cols, categorical_cols


def build_preprocessor(numeric_cols: List[str], categorical_cols: List[str]) -> ColumnTransformer:
    numeric_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    categorical_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
    )
    return ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, numeric_cols),
            ("cat", categorical_pipe, categorical_cols),
        ]
    )


NUMERIC_FEATURES, CATEGORICAL_FEATURES = infer_feature_types(train_df, FEATURE_COLS)
PREPROCESSOR = build_preprocessor(NUMERIC_FEATURES, CATEGORICAL_FEATURES)

REG_NUMERIC_FEATURES, REG_CATEGORICAL_FEATURES = infer_feature_types(
    train_df, REGRESSION_FEATURE_COLS
    )
REG_PREPROCESSOR = build_preprocessor(REG_NUMERIC_FEATURES, REG_CATEGORICAL_FEATURES)

train_X, train_y = train_df[FEATURE_COLS], train_df[TARGET_COL]
val_X, val_y = val_df[FEATURE_COLS], val_df[TARGET_COL]
test_X, test_y = test_df[FEATURE_COLS], test_df[TARGET_COL]

train_reg_X, train_reg_y = train_df[REGRESSION_FEATURE_COLS], train_severity
val_reg_X, val_reg_y = val_df[REGRESSION_FEATURE_COLS], val_severity
test_reg_X, test_reg_y = test_df[REGRESSION_FEATURE_COLS], test_severity


## Experiment Utilities
Helper functions standardize training, hyperparameter search, and evaluation output across models. Metrics are written to `models/phase4_metrics.csv` to align with the project plan.

In [None]:
METRIC_COLUMNS: List[str] = ["model_id", "target", "metric_name", "metric_value", "split"]
SUMMARY_LABELS: Dict[str, str] = {
    "accuracy": "Accuracy",
    "balanced_accuracy": "Balanced Accuracy",
    "precision": "Precision",
    "recall": "Recall",
    "f1": "F1",
    "roc_auc": "ROC-AUC",
    "rmse": "RMSE",
    "mae": "MAE",
    "r2": "R²",
}
CLASSIFICATION_SUMMARY_ORDER: List[str] = [
    "accuracy",
    "balanced_accuracy",
    "precision",
    "recall",
    "f1",
    "roc_auc",
    ]
REGRESSION_SUMMARY_ORDER: List[str] = ["rmse", "mae", "r2"]


def ensure_metrics_store() -> pd.DataFrame:
    """Load the metrics store, migrating legacy wide-format files when needed."""
    if not ARTIFACT_METRICS.exists():
        return pd.DataFrame(columns=METRIC_COLUMNS)
    store = pd.read_csv(ARTIFACT_METRICS)
    if set(METRIC_COLUMNS).issubset(store.columns):
        return store
    if "model" not in store.columns:
        return pd.DataFrame(columns=METRIC_COLUMNS)
    records: List[Dict[str, object]] = []
    legacy_columns = [col for col in store.columns if col != "model"]
    for _, row in store.iterrows():
        model_name = row["model"]
        for col in legacy_columns:
            value = row[col]
            if pd.isna(value) or "_" not in col:
                continue
            split, metric_name = col.split("_", 1)
            records.append(
                {
                    "model_id": model_name,
                    "target": "attack_success",
                    "metric_name": metric_name,
                    "metric_value": float(value),
                    "split": split,
                }
            )
    long_store = pd.DataFrame(records, columns=METRIC_COLUMNS)
    long_store.to_csv(ARTIFACT_METRICS, index=False)
    return long_store


def log_metrics(
    model_name: str, target_name: str, metrics: Dict[str, Dict[str, float]]
    ) -> None:
    """Append metrics for a model/target pair to the metrics store."""
    store = ensure_metrics_store()
    rows: List[Dict[str, object]] = []
    for split, split_metrics in metrics.items():
        for metric_name, metric_value in split_metrics.items():
            rows.append(
                {
                    "model_id": model_name,
                    "target": target_name,
                    "metric_name": metric_name,
                    "metric_value": float(metric_value),
                    "split": split,
                }
            )
    updated_store = pd.concat([store, pd.DataFrame(rows)], ignore_index=True)
    updated_store.to_csv(ARTIFACT_METRICS, index=False)


def evaluate_classifier(
    model_pipeline: Pipeline,
    model_name: str,
    train_X: pd.DataFrame,
    train_y: pd.Series,
    val_X: pd.DataFrame,
    val_y: pd.Series,
    test_X: pd.DataFrame,
    test_y: pd.Series,
    target_name: str,
    k_folds: int = 5,
    ) -> Dict[str, Dict[str, float]]:
    """Train, evaluate, and log metrics for a classification pipeline."""
    scoring = {
        "accuracy": "accuracy",
        "precision": "precision",
        "recall": "recall",
        "f1": "f1",
        "roc_auc": "roc_auc",
        "balanced_accuracy": "balanced_accuracy",
    }
    cv = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=RANDOM_STATE)
    cv_results = cross_validate(
        model_pipeline,
        train_X,
        train_y,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
    )
    cv_summary = {
        metric: np.mean(cv_results[f"test_{metric}"]) for metric in scoring
    }
    combined_X = pd.concat([train_X, val_X])
    combined_y = pd.concat([train_y, val_y])
    model_pipeline.fit(combined_X, combined_y)
    val_pred = model_pipeline.predict(val_X)
    test_pred = model_pipeline.predict(test_X)
    if hasattr(model_pipeline, "predict_proba"):
        val_scores = model_pipeline.predict_proba(val_X)[:, 1]
        test_scores = model_pipeline.predict_proba(test_X)[:, 1]
    else:
        val_scores = model_pipeline.decision_function(val_X)
        test_scores = model_pipeline.decision_function(test_X)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(
        val_y, val_pred, average="binary", zero_division=0
    )
    test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
        test_y, test_pred, average="binary", zero_division=0
    )
    metrics = {
        "cv": cv_summary,
        "validation": {
            "accuracy": accuracy_score(val_y, val_pred),
            "balanced_accuracy": balanced_accuracy_score(val_y, val_pred),
            "precision": val_precision,
            "recall": val_recall,
            "f1": val_f1,
            "roc_auc": roc_auc_score(val_y, val_scores),
        },
        "test": {
            "accuracy": accuracy_score(test_y, test_pred),
            "balanced_accuracy": balanced_accuracy_score(test_y, test_pred),
            "precision": test_precision,
            "recall": test_recall,
            "f1": test_f1,
            "roc_auc": roc_auc_score(test_y, test_scores),
        },
    }
    log_metrics(model_name, target_name, metrics)
    report_text = classification_report(val_y, val_pred, digits=4)
    display(Markdown(f"### {model_name}"))
    display(Markdown(f"```\n{report_text}\n```"))
    return metrics


def evaluate_regressor(
    model_pipeline: Pipeline,
    model_name: str,
    train_X: pd.DataFrame,
    train_y: pd.Series,
    val_X: pd.DataFrame,
    val_y: pd.Series,
    test_X: pd.DataFrame,
    test_y: pd.Series,
    target_name: str,
    k_folds: int = 5,
    ) -> Dict[str, Dict[str, float]]:
    """Train, evaluate, and log metrics for a regression pipeline."""
    scoring = {
        "neg_root_mean_squared_error": "neg_root_mean_squared_error",
        "neg_mean_absolute_error": "neg_mean_absolute_error",
        "r2": "r2",
    }
    cv = KFold(n_splits=k_folds, shuffle=True, random_state=RANDOM_STATE)
    cv_results = cross_validate(
        model_pipeline,
        train_X,
        train_y,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
    )
    cv_summary = {
        "rmse": -np.mean(cv_results["test_neg_root_mean_squared_error"]),
        "mae": -np.mean(cv_results["test_neg_mean_absolute_error"]),
        "r2": np.mean(cv_results["test_r2"]),
    }
    combined_X = pd.concat([train_X, val_X])
    combined_y = pd.concat([train_y, val_y])
    model_pipeline.fit(combined_X, combined_y)
    val_pred = model_pipeline.predict(val_X)
    test_pred = model_pipeline.predict(test_X)
    metrics = {
        "cv": cv_summary,
        "validation": {
            "rmse": mean_squared_error(val_y, val_pred, squared=False),
            "mae": mean_absolute_error(val_y, val_pred),
            "r2": r2_score(val_y, val_pred),
        },
        "test": {
            "rmse": mean_squared_error(test_y, test_pred, squared=False),
            "mae": mean_absolute_error(test_y, test_pred),
            "r2": r2_score(test_y, test_pred),
        },
    }
    log_metrics(model_name, target_name, metrics)
    display(Markdown(f"### {model_name} (Severity Regression)"))
    validation_line = (
        f"Validation RMSE: {metrics['validation']['rmse']:.4f}, "
        f"MAE: {metrics['validation']['mae']:.4f}, "
        f"R²: {metrics['validation']['r2']:.4f}"
    )
    test_line = (
        f"Test RMSE: {metrics['test']['rmse']:.4f}, "
        f"MAE: {metrics['test']['mae']:.4f}, "
        f"R²: {metrics['test']['r2']:.4f}"
    )
    display(
        Markdown(
            "\n".join(
                [f"- {validation_line}", f"- {test_line}"]
            )
        )
    )
    return metrics


def export_feature_importances(
    model_pipeline: Pipeline, model_name: str, target_name: str
    ) -> None:
    """Persist feature importances or coefficients for downstream explainability."""
    preprocessor = model_pipeline.named_steps.get("preprocessor")
    if preprocessor is None or not hasattr(preprocessor, "get_feature_names_out"):
        return
    feature_names = preprocessor.get_feature_names_out()
    estimator = model_pipeline.named_steps.get("model")
    if estimator is None:
        return
    if hasattr(estimator, "feature_importances_"):
        importances = estimator.feature_importances_
    elif hasattr(estimator, "coef_"):
        importances = np.ravel(estimator.coef_)
    else:
        return
    importance_df = pd.DataFrame(
        {
            "model_id": model_name,
            "target": target_name,
            "feature": feature_names,
            "importance": importances,
        }
    )
    importance_df["abs_importance"] = importance_df["importance"].abs()
    importance_df = importance_df.sort_values("abs_importance", ascending=False)
    importance_df = importance_df.drop(columns="abs_importance")
    file_stub = f"{model_name.lower().replace(' ', '_')}_{target_name}"
    export_path = ARTIFACT_IMPORTANCES / f"{file_stub}.csv"
    importance_df.to_csv(export_path, index=False)


In [11]:
def plot_diagnostics(model, X: pd.DataFrame, y: pd.Series, model_name: str, suffix: str) -> None:
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    ConfusionMatrixDisplay.from_estimator(model, X, y, ax=axes[0], colorbar=False)
    axes[0].set_title(f"{model_name} Confusion Matrix ({suffix})")

    RocCurveDisplay.from_estimator(model, X, y, ax=axes[1])
    axes[1].set_title(f"{model_name} ROC Curve ({suffix})")

    PrecisionRecallDisplay.from_estimator(model, X, y, ax=axes[2])
    axes[2].set_title(f"{model_name} Precision-Recall ({suffix})")

    plt.tight_layout()
    figure_path = ARTIFACT_FIGURES / f"{model_name.lower().replace(' ', '_')}_{suffix}.png"
    fig.savefig(figure_path, dpi=200)
    plt.close(fig)


In [None]:
def append_summary_entry(model_name: str, metrics: Dict[str, Dict[str, float]]) -> None:
    """Append a concise markdown summary for the supplied metrics."""
    lines = [f"## {model_name}\n"]
    validation_metrics = metrics.get("validation", {})
    is_classification = "accuracy" in validation_metrics
    metric_order = (
        CLASSIFICATION_SUMMARY_ORDER if is_classification else REGRESSION_SUMMARY_ORDER
    )
    split_labels = {
        "validation": "Validation",
        "test": "Test",
        "cv": "Cross-Validation",
    }
    for split in ("validation", "test", "cv"):
        if split not in metrics:
            continue
        split_metrics = metrics[split]
        formatted = [
            f"{SUMMARY_LABELS[name]}: {split_metrics[name]:.4f}"
            for name in metric_order
            if name in split_metrics
        ]
        if not formatted:
            continue
        lines.append(f"- {split_labels[split]}: {', '.join(formatted)}\n")
    with open(ARTIFACT_SUMMARY, "a", encoding="utf-8") as handle:
        handle.writelines(lines + ["\n"])


## Model Experiments

### Balanced Logistic Regression
Revisits the linear baseline with tuned regularization and class weighting to establish a strong probabilistic reference.

In [None]:
logit_params = {
    "model__C": np.logspace(-3, 1, 8),
    "model__penalty": ["l2"],
    "model__solver": ["lbfgs"],
}

logit_pipeline = Pipeline(
    steps=[
        ("preprocessor", PREPROCESSOR),
        (
            "model",
            LogisticRegression(
                class_weight="balanced",
                max_iter=200,
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

logit_search = RandomizedSearchCV(
    estimator=logit_pipeline,
    param_distributions=logit_params,
    n_iter=10,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    random_state=RANDOM_STATE,
    refit=True,
    )

logit_search.fit(train_X, train_y)
logit_best = logit_search.best_estimator_
logit_metrics = evaluate_classifier(
    logit_best,
    "Balanced Logistic Regression",
    train_X,
    train_y,
    val_X,
    val_y,
    test_X,
    test_y,
    target_name="attack_success",
)
plot_diagnostics(logit_best, val_X, val_y, "Balanced Logistic Regression", "validation")
plot_diagnostics(logit_best, test_X, test_y, "Balanced Logistic Regression", "test")
append_summary_entry("Balanced Logistic Regression", logit_metrics)


  store = pd.concat([store, pd.DataFrame([row])], ignore_index=True)


### Balanced Logistic Regression

              precision    recall  f1-score   support

           0       0.36      0.52      0.42      2701
           1       0.68      0.52      0.59      5299

    accuracy                           0.52      8000
   macro avg       0.52      0.52      0.51      8000
weighted avg       0.57      0.52      0.53      8000


### Decision Tree Classifier
Establishes a single-tree baseline to inspect splitting patterns and feature importances before moving to ensemble variants.

In [None]:
dt_params = {
    "model__criterion": ["gini", "entropy"],
    "model__max_depth": [None, 5, 10, 20],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
}

dt_pipeline = Pipeline(
    steps=[
        ("preprocessor", PREPROCESSOR),
        (
            "model",
            DecisionTreeClassifier(
                class_weight="balanced",
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

dt_grid = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_params,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    refit=True,
    )

dt_grid.fit(train_X, train_y)
dt_best = dt_grid.best_estimator_
dt_metrics = evaluate_classifier(
    dt_best,
    "Decision Tree Classifier",
    train_X,
    train_y,
    val_X,
    val_y,
    test_X,
    test_y,
    target_name="attack_success",
)
plot_diagnostics(dt_best, val_X, val_y, "Decision Tree Classifier", "validation")
plot_diagnostics(dt_best, test_X, test_y, "Decision Tree Classifier", "test")
export_feature_importances(dt_best, "Decision Tree Classifier", "attack_success")
append_summary_entry("Decision Tree Classifier", dt_metrics)


### Random Forest Ensemble
Targets non-linear interactions and feature importance analysis via tree-based ensembles.

In [None]:
rf_params = {
    "model__n_estimators": [200, 400, 600],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2", 0.5],
}

rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", PREPROCESSOR),
        (
            "model",
            RandomForestClassifier(
                class_weight="balanced_subsample",
                random_state=RANDOM_STATE,
                n_jobs=-1,
            ),
        ),
    ]
)

rf_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_params,
    n_iter=20,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    random_state=RANDOM_STATE,
    refit=True,
    )

rf_search.fit(train_X, train_y)
rf_best = rf_search.best_estimator_
rf_metrics = evaluate_classifier(
    rf_best,
    "Random Forest Ensemble",
    train_X,
    train_y,
    val_X,
    val_y,
    test_X,
    test_y,
    target_name="attack_success",
)
plot_diagnostics(rf_best, val_X, val_y, "Random Forest Ensemble", "validation")
plot_diagnostics(rf_best, test_X, test_y, "Random Forest Ensemble", "test")
export_feature_importances(rf_best, "Random Forest Ensemble", "attack_success")
append_summary_entry("Random Forest Ensemble", rf_metrics)


### Random Forest Ensemble

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2701
           1       1.00      1.00      1.00      5299

    accuracy                           1.00      8000
   macro avg       1.00      1.00      1.00      8000
weighted avg       1.00      1.00      1.00      8000


### Gradient Boosting
Leverages stage-wise additive modeling to optimize for ROC-AUC with shrinkage and depth control.

In [None]:
gb_params = {
    "model__n_estimators": [200, 400, 600],
    "model__learning_rate": [0.01, 0.05, 0.1],
    "model__max_depth": [2, 3, 4],
    "model__subsample": [0.7, 0.85, 1.0],
    "model__min_samples_split": [2, 4, 6],
}

gb_pipeline = Pipeline(
    steps=[
        ("preprocessor", PREPROCESSOR),
        (
            "model",
            GradientBoostingClassifier(random_state=RANDOM_STATE),
        ),
    ]
)

gb_search = RandomizedSearchCV(
    estimator=gb_pipeline,
    param_distributions=gb_params,
    n_iter=20,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    random_state=RANDOM_STATE,
    refit=True,
    )

gb_search.fit(train_X, train_y)
gb_best = gb_search.best_estimator_
gb_metrics = evaluate_classifier(
    gb_best,
    "Gradient Boosting",
    train_X,
    train_y,
    val_X,
    val_y,
    test_X,
    test_y,
    target_name="attack_success",
)
plot_diagnostics(gb_best, val_X, val_y, "Gradient Boosting", "validation")
plot_diagnostics(gb_best, test_X, test_y, "Gradient Boosting", "test")
export_feature_importances(gb_best, "Gradient Boosting", "attack_success")
append_summary_entry("Gradient Boosting", gb_metrics)


### Gradient Boosting

              precision    recall  f1-score   support

           0       0.89      0.01      0.03      2701
           1       0.67      1.00      0.80      5299

    accuracy                           0.67      8000
   macro avg       0.78      0.51      0.41      8000
weighted avg       0.74      0.67      0.54      8000


### XGBoost Gradient Boosted Trees
Explores gradient-boosting with second-order optimization and regularization controls.

In [None]:
xgb_params = {
    "model__n_estimators": [200, 400, 600],
    "model__max_depth": [3, 4, 5],
    "model__learning_rate": [0.01, 0.05, 0.1],
    "model__subsample": [0.6, 0.8, 1.0],
    "model__colsample_bytree": [0.6, 0.8, 1.0],
    "model__reg_alpha": [0.0, 0.1, 0.5],
    "model__reg_lambda": [0.5, 1.0, 2.0],
    "model__gamma": [0, 0.1, 0.3],
}

xgb_pipeline = Pipeline(
    steps=[
        ("preprocessor", PREPROCESSOR),
        (
            "model",
            XGBClassifier(
                objective="binary:logistic",
                eval_metric="logloss",
                random_state=RANDOM_STATE,
                tree_method="hist",
                scale_pos_weight=None,
                n_jobs=-1,
            ),
        ),
    ]
)

xgb_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=xgb_params,
    n_iter=25,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    random_state=RANDOM_STATE,
    refit=True,
    )

xgb_search.fit(train_X, train_y)
xgb_best = xgb_search.best_estimator_
xgb_metrics = evaluate_classifier(
    xgb_best,
    "XGBoost Gradient Boosted Trees",
    train_X,
    train_y,
    val_X,
    val_y,
    test_X,
    test_y,
    target_name="attack_success",
)
plot_diagnostics(
    xgb_best,
    val_X,
    val_y,
    "XGBoost Gradient Boosted Trees",
    "validation",
)
plot_diagnostics(
    xgb_best,
    test_X,
    test_y,
    "XGBoost Gradient Boosted Trees",
    "test",
)
export_feature_importances(
    xgb_best, "XGBoost Gradient Boosted Trees", "attack_success"
    )
append_summary_entry("XGBoost Gradient Boosted Trees", xgb_metrics)


### XGBoost Gradient Boosted Trees

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2701
           1       0.66      1.00      0.80      5299

    accuracy                           0.66      8000
   macro avg       0.33      0.50      0.40      8000
weighted avg       0.44      0.66      0.53      8000


### Support Vector Machine (Linear)
Complements the RBF kernel variant by benchmarking a linear decision boundary with class weighting. Grid search tunes the soft margin parameter to balance bias and variance.

In [None]:
svm_linear_params = {
    "model__C": np.logspace(-3, 2, 6),
}

svm_linear_pipeline = Pipeline(
    steps=[
        ("preprocessor", PREPROCESSOR),
        (
            "model",
            SVC(
                kernel="linear",
                probability=True,
                class_weight="balanced",
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

svm_linear_grid = GridSearchCV(
    estimator=svm_linear_pipeline,
    param_grid=svm_linear_params,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    refit=True,
    )

svm_linear_grid.fit(train_X, train_y)
svm_linear_best = svm_linear_grid.best_estimator_
svm_linear_metrics = evaluate_classifier(
    svm_linear_best,
    "Support Vector Machine (Linear)",
    train_X,
    train_y,
    val_X,
    val_y,
    test_X,
    test_y,
    target_name="attack_success",
)
plot_diagnostics(
    svm_linear_best, val_X, val_y, "Support Vector Machine (Linear)", "validation"
)
plot_diagnostics(
    svm_linear_best, test_X, test_y, "Support Vector Machine (Linear)", "test"
)
export_feature_importances(
    svm_linear_best, "Support Vector Machine (Linear)", "attack_success"
)
append_summary_entry("Support Vector Machine (Linear)", svm_linear_metrics)


### Support Vector Machine (RBF)
Captures complex decision boundaries through kernel methods with probabilistic calibration.

In [None]:
svm_rbf_params = {
    "model__C": np.logspace(-2, 2, 5),
    "model__gamma": np.logspace(-3, 1, 5),
}

svm_rbf_pipeline = Pipeline(
    steps=[
        ("preprocessor", PREPROCESSOR),
        (
            "model",
            SVC(
                kernel="rbf",
                probability=True,
                class_weight="balanced",
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

svm_rbf_grid = GridSearchCV(
    estimator=svm_rbf_pipeline,
    param_grid=svm_rbf_params,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    refit=True,
    )

svm_rbf_grid.fit(train_X, train_y)
svm_rbf_best = svm_rbf_grid.best_estimator_
svm_rbf_metrics = evaluate_classifier(
    svm_rbf_best,
    "Support Vector Machine (RBF)",
    train_X,
    train_y,
    val_X,
    val_y,
    test_X,
    test_y,
    target_name="attack_success",
)
plot_diagnostics(
    svm_rbf_best, val_X, val_y, "Support Vector Machine (RBF)", "validation"
)
plot_diagnostics(
    svm_rbf_best, test_X, test_y, "Support Vector Machine (RBF)", "test"
)
append_summary_entry("Support Vector Machine (RBF)", svm_rbf_metrics)


### Support Vector Machine (RBF)

              precision    recall  f1-score   support

           0       0.34      1.00      0.50      2701
           1       0.00      0.00      0.00      5299

    accuracy                           0.34      8000
   macro avg       0.17      0.50      0.25      8000
weighted avg       0.11      0.34      0.17      8000


### K-Nearest Neighbors Classifier
Explores a non-parametric baseline across neighborhood sizes to inspect locality-driven performance and balanced accuracy.

In [None]:
knn_params = {
    "model__n_neighbors": [3, 5, 7, 9],
    "model__weights": ["uniform", "distance"],
}

knn_pipeline = Pipeline(
    steps=[
        ("preprocessor", PREPROCESSOR),
        (
            "model",
            KNeighborsClassifier(),
        ),
    ]
)

knn_grid = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=knn_params,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    refit=True,
    )

knn_grid.fit(train_X, train_y)
knn_best = knn_grid.best_estimator_
knn_metrics = evaluate_classifier(
    knn_best,
    "K-Nearest Neighbors Classifier",
    train_X,
    train_y,
    val_X,
    val_y,
    test_X,
    test_y,
    target_name="attack_success",
)
plot_diagnostics(
    knn_best, val_X, val_y, "K-Nearest Neighbors Classifier", "validation"
)
plot_diagnostics(
    knn_best, test_X, test_y, "K-Nearest Neighbors Classifier", "test"
)
append_summary_entry("K-Nearest Neighbors Classifier", knn_metrics)


## Severity Regression Experiments
Translates the categorical severity indicators into an ordinal score and evaluates regression pipelines aligned with the implementation plan.

### Support Vector Regression (RBF Kernel)
Models the severity score with a radial basis kernel to capture non-linear relationships in the engineered features.

In [None]:
svr_params = {
    "model__C": np.logspace(-2, 2, 5),
    "model__epsilon": [0.1, 0.5, 1.0],
    "model__gamma": np.logspace(-3, 0, 4),
}

svr_pipeline = Pipeline(
    steps=[
        ("preprocessor", REG_PREPROCESSOR),
        ("model", SVR(kernel="rbf")),
    ]
)

svr_grid = GridSearchCV(
    estimator=svr_pipeline,
    param_grid=svr_params,
    scoring="neg_root_mean_squared_error",
    cv=KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    refit=True,
    )

svr_grid.fit(train_reg_X, train_reg_y)
svr_best = svr_grid.best_estimator_
svr_metrics = evaluate_regressor(
    svr_best,
    "Support Vector Regression (RBF)",
    train_reg_X,
    train_reg_y,
    val_reg_X,
    val_reg_y,
    test_reg_X,
    test_reg_y,
    target_name=SEVERITY_TARGET_NAME,
)
append_summary_entry("Support Vector Regression (RBF)", svr_metrics)


### Random Forest Regressor
Extends tree ensembles to the severity target and captures feature contributions for downstream SHAP analysis.

In [None]:
rf_reg_params = {
    "model__n_estimators": [200, 400, 600],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["auto", "sqrt", 0.5],
}

rf_reg_pipeline = Pipeline(
    steps=[
        ("preprocessor", REG_PREPROCESSOR),
        (
            "model",
            RandomForestRegressor(
                random_state=RANDOM_STATE,
                n_jobs=-1,
            ),
        ),
    ]
)

rf_reg_search = RandomizedSearchCV(
    estimator=rf_reg_pipeline,
    param_distributions=rf_reg_params,
    n_iter=20,
    scoring="neg_root_mean_squared_error",
    cv=KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    random_state=RANDOM_STATE,
    refit=True,
    )

rf_reg_search.fit(train_reg_X, train_reg_y)
rf_reg_best = rf_reg_search.best_estimator_
rf_reg_metrics = evaluate_regressor(
    rf_reg_best,
    "Random Forest Regressor",
    train_reg_X,
    train_reg_y,
    val_reg_X,
    val_reg_y,
    test_reg_X,
    test_reg_y,
    target_name=SEVERITY_TARGET_NAME,
)
export_feature_importances(rf_reg_best, "Random Forest Regressor", SEVERITY_TARGET_NAME)
append_summary_entry("Random Forest Regressor", rf_reg_metrics)


## Consolidated Results & Next Steps
Loads the aggregated metrics table and outlines follow-up analyses for the final report.

In [18]:
if ARTIFACT_METRICS.exists():
    phase4_metrics = pd.read_csv(ARTIFACT_METRICS)
    display(phase4_metrics.sort_values("validation_roc_auc", ascending=False))
else:
    display(Markdown("No metrics logged yet. Run the experiments above to populate the table."))


Unnamed: 0,model,validation_accuracy,validation_precision,validation_recall,validation_f1,validation_roc_auc,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc,cv_accuracy,cv_f1,cv_precision,cv_recall,cv_roc_auc
1,Random Forest Ensemble,1.0,1.0,1.0,1.0,1.0,0.65675,0.658017,0.996582,0.792661,0.515612,0.661958,0.796253,0.662917,0.996731,0.503725
3,XGBoost Gradient Boosted Trees,0.662375,0.662375,1.0,0.796902,0.685496,0.658375,0.658375,1.0,0.794,0.505674,0.662708,0.797143,0.662708,1.0,0.503102
2,Gradient Boosting,0.66675,0.665493,0.999056,0.798853,0.657628,0.657875,0.65872,0.996772,0.793231,0.503807,0.65975,0.794266,0.662673,0.991072,0.505954
0,Balanced Logistic Regression,0.5195,0.680297,0.518022,0.588172,0.531032,0.503125,0.659427,0.50731,0.573452,0.508279,0.492417,0.564497,0.654279,0.496448,0.493103
4,Support Vector Machine (RBF),0.337625,0.0,0.0,0.0,0.0,0.341625,0.0,0.0,0.0,0.483823,0.337292,0.0,0.0,0.0,0.504718


### Reporting Checklist
- Integrate best-performing model metrics into `reports/phase4_advanced_models_report.md`.
- Refresh visualization gallery with ROC, PR, and feature importance plots.
- Update `plan/data-cybersecurity-attacks-analysis-1.md` TASK-036 to reflect experiment outcomes.
- Prepare deployment considerations if gap between validation and test metrics is minimal.