# Synthetic SOC Alert Anomaly Detector Walkthrough



## Notebook roadmap

1. Configure environment & imports
2. Generate a synthetic SOC dataset and inspect distributions
3. Split into train/test using only normal events for training
4. Train IsolationForest and OneClassSVM
5. Evaluate metrics (ROC AUC, precision@k, confusion matrices)
6. Interpret results with global correlations and local z-score explanations
7. Produce inline visualizations (score histograms, confusion matrices, feature deviations)


In [None]:
from __future__ import annotations

from pathlib import Path
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Absolute path of the directory one level above the current working directory.
PROJECT_ROOT = Path("..").resolve()
_project_root_entry = str(PROJECT_ROOT)
if _project_root_entry not in sys.path:
    # Prepend (not append) so this location wins over any other entry on the
    # import path; presumably this is where the `soc_anomaly` package lives.
    sys.path.insert(0, _project_root_entry)

from soc_anomaly.config import (
    DEFAULT_ANOMALY_FRACTION,
    DEFAULT_EVENTS_PER_USER,
    DEFAULT_N_USERS,
    DEFAULT_RANDOM_STATE,
)
from soc_anomaly.data_generation import generate_synthetic_soc_dataset
from soc_anomaly.anomaly_detection import (
    FEATURE_COLS,
    LABEL_COL,
    compute_feature_stats,
    evaluate_model,
    explain_top_anomalies,
    global_feature_correlations,
    isolation_forest_scores,
    load_dataset,
    oneclass_svm_scores,
    prepare_train_test,
    train_isolation_forest,
    train_oneclass_svm,
)

# Notebook-wide plotting defaults: seaborn grid theme first, then the
# matplotlib figure size and font size applied on top of it.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5), "font.size": 11})


In [None]:
# --- Evaluation settings -------------------------------------------------
# Passed to evaluate_model() as `k` and `threshold_percentile` respectively.
PRECISION_K = 50
THRESHOLD_PERCENTILE = 99.0

# --- Dataset generation settings (package defaults) ----------------------
N_USERS = DEFAULT_N_USERS
EVENTS_PER_USER = DEFAULT_EVENTS_PER_USER
# Also reused below as IsolationForest `contamination` and OneClassSVM `nu`.
ANOMALY_FRACTION = DEFAULT_ANOMALY_FRACTION
# Shared seed for data generation, the train/test split and IsolationForest.
RANDOM_STATE = DEFAULT_RANDOM_STATE


## 1. Generate synthetic SOC events


In [None]:
# Build the synthetic event table from the notebook-level settings.
generation_params = {
    "n_users": N_USERS,
    "events_per_user": EVENTS_PER_USER,
    "anomaly_fraction": ANOMALY_FRACTION,
    "random_state": RANDOM_STATE,
}
df = generate_synthetic_soc_dataset(**generation_params)

# Report the dataset size plus the count and share of rows flagged in LABEL_COL.
n_events = len(df)
label_values = df[LABEL_COL]
print(f"Generated {n_events:,} events with {label_values.sum():,} anomalies ({label_values.mean():.2%}).")


Quick peek at the dataset:


In [None]:
# Show the first five rows (the pandas default) as the cell output.
df.head(n=5)


In [None]:
# Summary statistics for every model feature plus the label column,
# transposed so each column of `df` becomes one row of the summary.
summary_columns = FEATURE_COLS + [LABEL_COL]
df[summary_columns].describe().transpose()


## 2. Train/test preparation

In [None]:
# Split and scale the data. The helper hands back seven objects: scaled
# train/test matrices, the matching labels, the fitted scaler, and the
# train/test feature frames that are used later for the explanations.
split_outputs = prepare_train_test(
    df,
    feature_cols=FEATURE_COLS,
    label_col=LABEL_COL,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
(
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    scaler,
    X_train_df,
    X_test_df,
) = split_outputs

# Quick sanity report on the resulting split sizes and anomaly share.
print(
    f"Training samples (normal only): {X_train_scaled.shape[0]:,}",
    f"Test samples (normals + anomalies): {X_test_scaled.shape[0]:,}",
    f"Test anomaly fraction: {y_test.mean():.2%}",
    sep="\n",
)


## 3. Train IsolationForest & OneClassSVM


In [None]:
# IsolationForest: fit on the scaled training matrix, then score the test set.
iso_params = {"contamination": ANOMALY_FRACTION, "random_state": RANDOM_STATE}
iso_model = train_isolation_forest(X_train_scaled, **iso_params)
iso_scores = isolation_forest_scores(iso_model, X_test_scaled)

# OneClassSVM: same data, with `nu` tied to the configured anomaly fraction.
ocsvm_params = {"nu": ANOMALY_FRACTION, "kernel": "rbf", "gamma": "scale"}
ocsvm_model = train_oneclass_svm(X_train_scaled, **ocsvm_params)
ocsvm_scores = oneclass_svm_scores(ocsvm_model, X_test_scaled)

print("Models trained and scoring arrays computed.")


## 4. Core metrics


In [None]:
# Both detectors are evaluated with identical settings so their metrics are
# directly comparable.
shared_eval_kwargs = {
    "k": PRECISION_K,
    "threshold_percentile": THRESHOLD_PERCENTILE,
}

iso_metrics = evaluate_model("IsolationForest", y_test, iso_scores, **shared_eval_kwargs)
ocsvm_metrics = evaluate_model("OneClassSVM", y_test, ocsvm_scores, **shared_eval_kwargs)

# Display both metric objects side by side as the cell output.
(iso_metrics, ocsvm_metrics)


## 5. Global signals

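Correlate each unscaled feature with the IsolationForest anomaly score to see which attributes are most strongly associated with high scores. Correlation shows association only; it does not prove that a feature causes an alert.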


In [None]:
# Relate the IsolationForest scores to the test-set feature frame; the
# result is displayed as the cell output and plotted in the next cell.
iso_corr = global_feature_correlations(
    scores=iso_scores,
    score_col_name="iso_score",
    X_test_df=X_test_df,
    y_test=y_test,
    feature_cols=FEATURE_COLS,
)
iso_corr


In [None]:
# Horizontal bar chart of the per-feature correlations, one bar per feature.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; confirm against the installed seaborn version before changing it.
corr_fig, corr_ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=iso_corr.values, y=iso_corr.index, palette="coolwarm", ax=corr_ax)
# Dashed zero line separates negative from positive correlations.
corr_ax.axvline(0, color="black", linestyle="--", linewidth=0.8)
corr_ax.set_title("IsolationForest feature correlations")
corr_ax.set_xlabel("Pearson correlation with anomaly score")
corr_ax.set_ylabel("Feature")
plt.show()


## 6. Score distributions


In [None]:
# Side-by-side score histograms for both detectors, coloured by true label
# (0 -> green, 1 -> red) and sharing one y-axis.
label_palette = {0: "#2ca02c", 1: "#d62728"}
fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

# (axes, scores, title, y-label); only the left panel carries a y-label
# because the y-axis is shared.
histogram_panels = [
    (axes[0], iso_scores, "IsolationForest scores", "Count"),
    (axes[1], ocsvm_scores, "OneClassSVM scores", ""),
]
for ax, model_scores, panel_title, y_label in histogram_panels:
    sns.histplot(
        x=model_scores,
        hue=y_test,
        bins=60,
        ax=ax,
        palette=label_palette,
        legend=True,
    )
    ax.set_title(panel_title)
    ax.set_xlabel("Score (higher = more anomalous)")
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()


## 7. Confusion matrices at the percentile threshold (`THRESHOLD_PERCENTILE`, 99th by default)


In [None]:
# Turn the continuous scores into 0/1 predictions using the per-model
# "threshold" entry returned by evaluate_model(); scores at or above the
# threshold are flagged as anomalies (1).
iso_preds = (iso_scores >= iso_metrics["threshold"]).astype(int)
ocsvm_preds = (ocsvm_scores >= ocsvm_metrics["threshold"]).astype(int)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, preds, title in [
    (axes[0], iso_preds, "IsolationForest"),
    (axes[1], ocsvm_preds, "OneClassSVM"),
]:
    # Pin the label order so the matrix is always 2x2 (row/col 0 = Normal,
    # row/col 1 = Anomaly). Without `labels`, a run where the test labels or
    # the predictions contain a single class yields a 1x1 matrix, which does
    # not match the two display labels below.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])
    disp = ConfusionMatrixDisplay(cm, display_labels=["Normal", "Anomaly"])
    disp.plot(ax=ax, cmap="Blues", colorbar=False)
    ax.set_title(title)
plt.tight_layout()
plt.show()


## 8. Local explanations (z-scores)

Each of the top-scoring events is compared against the per-feature mean and standard deviation of the training data (normal events only) to describe how far it deviates from typical behaviour.


In [None]:
# Per-feature baseline statistics come from the training feature frame.
feature_stats = compute_feature_stats(X_train_df, FEATURE_COLS)
means, stds = feature_stats

# Explain the ten highest-ranked test events under the IsolationForest scores.
local_explanations = explain_top_anomalies(
    X_test_df=X_test_df,
    y_test=y_test,
    scores=iso_scores,
    feature_cols=FEATURE_COLS,
    means=means,
    stds=stds,
    top_m=10,
)

# Preview the first three explanation records as the cell output.
local_explanations[:3]


In [None]:
# Tabulate the explanation records and swap the numeric label for a readable name.
label_names = {0: "Normal", 1: "Anomaly"}
explain_df = pd.DataFrame(local_explanations).assign(
    true_label=lambda frame: frame["true_label"].map(label_names)
)

# Show only the columns of interest as the cell output.
display_columns = ["index", "true_label", "score", "explanations"]
explain_df.loc[:, display_columns]


In [None]:
from collections import Counter

# Count how often each feature is named across all explanation strings.
# The feature name is taken to be the text before the first " is " in each
# explanation; Counter replaces the manual dict.get(..., 0) + 1 bookkeeping.
feature_counts = Counter(
    expl.split(" is ")[0]
    for expl_list in explain_df["explanations"]
    for expl in expl_list
)

# Most frequently mentioned features first; displayed as the cell output.
feature_rank = pd.Series(feature_counts).sort_values(ascending=False)
feature_rank


In [None]:
# Horizontal bars for the ten most frequently cited features.
rank_ax = feature_rank.head(10).plot.barh(color="#ff7f0e")
rank_ax.set_title("Most frequent deviant features (top anomalies)")
rank_ax.set_xlabel("Frequency in top explanations")
rank_ax.set_ylabel("Feature")
# Flip the y-axis so the first (most frequent) entry is drawn at the top.
rank_ax.invert_yaxis()
plt.show()


## 9. Wrap-up

Feel free to tweak the generation parameters (user count, anomaly fraction, random seed) and re-run the notebook to stress-test different SOC baselines.
