# In [None]:
import time, json, warnings, os
from collections import Counter
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox

from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest, RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score
import hdbscan

from imblearn.over_sampling import SMOTE

# Apply seaborn's "whitegrid" style globally; this affects every plot drawn below.
sns.set(style="whitegrid")

# -------------------------
# CONFIG - adapt paths & sizes
# -------------------------
# NOTE(review): TRAIN_PATH points at "Testing_file.csv" while TEST_PATH points
# at the "Training_file" capture. This looks swapped - confirm against the
# dataset layout before trusting the train/test split.
TRAIN_PATH = r'X:\Dissertacao\python_projects\dataset\ISCX-Bot-2014\ISCX_csv\Testing_file.csv'
# NOTE(review): the ".csv" extension is assumed by analogy with TRAIN_PATH - confirm.
TEST_PATH  = r'X:\Dissertacao\python_projects\dataset\ISCX-Bot-2014\ISCX_csv\Training_file.csv'
SAMPLE_SIZE = 100_000                 # reduce if memory constrained
RANDOM_STATE = 42                     # shared seed for sampling and every seeded estimator

# IForest / HDBSCAN / PCA params
IFOREST_PARAMS = dict(n_estimators=250, contamination=0.05, max_samples=0.5, random_state=RANDOM_STATE)
HDB_MIN_CLUSTER = 500
HDB_MIN_SAMPLES = 10
PCA_COMPONENTS = 3

# RandomForest params (used for feature importance)
RF_PARAMS = dict(n_estimators=300, class_weight="balanced_subsample", random_state=RANDOM_STATE, n_jobs=-1)

# Features to use (will be ensured during build).
# This list is extended in place later in the pipeline, so it must stay a
# mutable list rather than a tuple.
NUM_FEATURES = [
    "Time_Diff","Log_IATime","Log_BRate",
    "BoxCox_Length","BoxCox_PRate",
    "Length_Mean","Length_Std","Pkt_Per_Src",
    "Session_Dur_Src","Session_Dur_Dst",
    "Rate_to_Length","IAT_to_Session"
]

# File the JSON run summary is written to (relative to the working directory).
OUTPUT_SUMMARY = "detection_summary_extended.json"

# -------------------------
# Helpers
# -------------------------
def safe_boxcox(series):
    """Box-Cox transform a pandas Series, degrading to ``log1p`` on failure.

    Missing values are replaced by 0 and a small positive offset is added
    before the transform (once when building the array and once more on the
    way into ``boxcox``). If ``boxcox`` raises for any reason, the
    once-shifted array is passed through ``np.log1p`` instead.

    Returns a NumPy array with one value per input row.
    """
    shifted = np.asarray(series.fillna(0.0).astype(float) + 1e-6)
    try:
        transformed, _fitted_lambda = boxcox(shifted + 1e-6)
    except Exception:
        # Best-effort fallback: keep the pipeline running with a log transform.
        return np.log1p(shifted)
    else:
        return transformed

def safe_sample(df, n, seed=RANDOM_STATE):
    """Return a seeded random sample of *df* with a fresh 0..k-1 index.

    When *n* is at least the number of rows, the whole frame is returned in
    shuffled order; otherwise exactly *n* rows are drawn.
    """
    how_many = {"frac": 1.0} if n >= len(df) else {"n": n}
    drawn = df.sample(random_state=seed, **how_many)
    return drawn.reset_index(drop=True)

def ensure_numeric(df, cols):
    """Make sure every column in *cols* exists in *df* and is numeric.

    Columns that are absent are created filled with 0.0. Every listed column
    is then coerced with ``pd.to_numeric`` and unparseable/missing entries
    become 0.0. The frame is modified in place and also returned.
    """
    for name in cols:
        if name not in df.columns:
            df[name] = 0.0
        coerced = pd.to_numeric(df[name], errors='coerce')
        df[name] = coerced.fillna(0.0)
    return df

def safe_pct(col):
    """Return the mean of *col* as a percentage rounded to two decimals.

    *col* is typically a boolean mask, so the result is the share of True
    entries. An empty input yields 0.0 instead of NaN so that the value can
    always be serialised as valid JSON. The result is a built-in ``float``.
    """
    if np.size(col) == 0:
        return 0.0
    return round(float(100.0 * np.mean(col)), 2)

# -------------------------
# 1. Load CSVs
# -------------------------
print("Loading data...")
df_train = pd.read_csv(TRAIN_PATH, low_memory=False)
df_test  = pd.read_csv(TEST_PATH, low_memory=False)

# Basic cleaning, applied in place to both frames: discard rows that lack any
# of the critical packet fields (only those fields that actually exist in the
# frame are checked) and give packets without an 'Info' text a placeholder.
for df in (df_train, df_test):
    critical_present = [c for c in ('Time', 'Source', 'Destination', 'Length') if c in df.columns]
    if critical_present:
        df.dropna(subset=critical_present, inplace=True)
    if 'Info' in df.columns:
        df['Info'] = df['Info'].fillna("Unknown")

# -------------------------
# 2. Safe combined encoding for categorical columns (vectorized)
# -------------------------
def safe_map(train, test, col):
    """Integer-encode a categorical column consistently across two frames.

    The code book is built from the union of the string values seen in
    *train* and *test* (in order of first appearance, train first), so the
    same value always receives the same code in both frames. The codes are
    written to a new ``<col>_enc`` column in each frame (in place).

    If *col* is absent from a frame, it is created there with the constant
    value "Unknown" before encoding.

    Returns the ``{value: code}`` mapping.
    """
    for frame in (train, test):
        if col not in frame.columns:
            # DataFrame.get() would hand back the bare default here, so the
            # placeholder column has to be assigned explicitly.
            frame[col] = "Unknown"

    train_vals = train[col].astype(str)
    test_vals = test[col].astype(str)
    uniques = pd.unique(pd.concat([train_vals, test_vals], axis=0))
    mapping = {v: i for i, v in enumerate(uniques)}

    # -1 is a defensive default; every value is in the mapping by construction.
    train[col + "_enc"] = train_vals.map(mapping).fillna(-1).astype(int)
    test[col + "_enc"] = test_vals.map(mapping).fillna(-1).astype(int)
    return mapping

print("Encoding categorical columns (Protocol/Source/Destination) ...")
# Each call adds a '<column>_enc' integer column to both frames in place;
# the returned mappings are not kept.
for categorical_col in ('Protocol', 'Source', 'Destination'):
    safe_map(df_train, df_test, categorical_col)

# -------------------------
# 3. Feature engineering (vectorized + group operations)
# -------------------------
def _pkt_rate_series(s):
    """Estimate one packet rate for a group of timestamps and broadcast it.

    The rate is ``count / (max - min)`` computed on the timestamps after
    clipping them to the 1.5 * IQR fences around the quartiles. Groups with
    fewer than two valid timestamps, or with a (clipped) time span of at most
    1e-6, get a rate of 0.0.

    Returns a float Series aligned to ``s.index``.
    """
    valid = s.dropna()
    # Work in float explicitly so an integer-typed time column cannot truncate
    # the rate to a whole number.
    a = valid.to_numpy(dtype=float)
    if a.size < 2:
        return pd.Series(0.0, index=s.index)
    q1, q3 = np.percentile(a, [25, 75])
    iqr = max(q3 - q1, 1e-9)
    cl = np.clip(a, q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    denom = cl.max() - cl.min()
    rate = cl.size / denom if denom > 1e-6 else 0.0
    out = pd.Series(rate, index=valid.index, dtype=float)
    return out.reindex(s.index).fillna(0.0)


def build_features(df):
    """Return a copy of *df* with the engineered traffic features added.

    Rows are grouped by the encoded source/destination columns when present
    (``Source_enc`` / ``Destination_enc``), otherwise by the raw ``Source`` /
    ``Destination`` columns. Rows are processed in their existing order; no
    sorting by time is performed here.

    Columns added: Time_Diff, Packet_Rate, Inter_Arrival_Time, Burst_Rate,
    Length_Mean, Length_Std, Pkt_Per_Src, Session_Dur_Src, Session_Dur_Dst,
    Log_IATime, Log_BRate, BoxCox_Length, BoxCox_PRate, Rate_to_Length and
    IAT_to_Session. ``Time`` and ``Length`` are coerced to numeric (invalid
    entries become 0); when either column is missing the features derived
    from it fall back to 0. Finally every name in ``NUM_FEATURES`` is
    guaranteed to exist and be numeric.
    """
    df = df.copy()
    src = 'Source_enc' if 'Source_enc' in df.columns else 'Source'
    dst = 'Destination_enc' if 'Destination_enc' in df.columns else 'Destination'
    has_time = 'Time' in df.columns

    # Time_Diff per source
    if has_time:
        df['Time'] = pd.to_numeric(df['Time'], errors='coerce').fillna(0.0).astype(float)
        df['Time_Diff'] = df.groupby(src)['Time'].diff()
        # The first packet of each source has no predecessor: use that
        # source's median gap (not its median timestamp), then 0.0 when the
        # source has a single packet.
        median_gap = df.groupby(src)['Time_Diff'].transform('median')
        df['Time_Diff'] = df['Time_Diff'].fillna(median_gap).fillna(0.0)
    else:
        df['Time_Diff'] = 0.0

    # Packet_Rate per source (IQR robust)
    if has_time:
        df['Packet_Rate'] = df.groupby(src)['Time'].transform(_pkt_rate_series).fillna(0.0)
    else:
        df['Packet_Rate'] = 0.0

    # Inter-arrival and burst features: rolling mean of the last 10 gaps per
    # source, floored at 1e-6 so the reciprocal below is always defined.
    df['Inter_Arrival_Time'] = (
        df.groupby(src)['Time_Diff']
        .transform(lambda s: s.rolling(10, min_periods=1).mean())
        .fillna(0.0)
        .clip(lower=1e-6)
    )
    df['Burst_Rate'] = np.where(df['Inter_Arrival_Time'] > 1e-6, 1.0 / df['Inter_Arrival_Time'], 0.0)

    # variability features
    if 'Length' in df.columns:
        df['Length'] = pd.to_numeric(df['Length'], errors='coerce').fillna(0)
    else:
        df['Length'] = 0.0
    length_by_src = df.groupby(src)['Length']
    df['Length_Mean'] = length_by_src.transform('mean').fillna(df['Length'].mean())
    # std is NaN for single-packet sources
    df['Length_Std'] = length_by_src.transform('std').fillna(0.0)
    df['Pkt_Per_Src'] = length_by_src.transform('count').fillna(0).astype(int)

    # session durations (time span covered by each source / destination)
    if has_time:
        time_by_src = df.groupby(src)['Time']
        time_by_dst = df.groupby(dst)['Time']
        df['Session_Dur_Src'] = (time_by_src.transform('max') - time_by_src.transform('min')).fillna(0.0)
        df['Session_Dur_Dst'] = (time_by_dst.transform('max') - time_by_dst.transform('min')).fillna(0.0)
    else:
        df['Session_Dur_Src'] = 0.0
        df['Session_Dur_Dst'] = 0.0

    # transforms
    df['Log_IATime'] = np.log1p(df['Inter_Arrival_Time'])
    df['Log_BRate'] = np.log1p(df['Burst_Rate'].clip(lower=0))

    df['BoxCox_Length'] = safe_boxcox(df['Length'])
    df['BoxCox_PRate'] = safe_boxcox(df['Packet_Rate'])

    # ratios; the 1e-6 offset keeps a zero denominator from dividing by zero
    df['Rate_to_Length'] = df['Burst_Rate'] / (df['BoxCox_Length'] + 1e-6)
    df['IAT_to_Session'] = df['Inter_Arrival_Time'] / (df['Session_Dur_Src'] + 1e-6)

    # ensure numeric features present
    df = ensure_numeric(df, NUM_FEATURES)
    return df

print("Building features...")
df_train = build_features(df_train)
df_test  = build_features(df_test)

# -------------------------
# 4. Add temporal/variability features requested
# -------------------------
# Both features are computed over consecutive rows of the whole frame (not per
# source): a 5-row rolling std of Burst_Rate and the row-to-row change of
# Packet_Rate. Missing results (e.g. the first row) become 0.0.
for df in (df_train, df_test):
    rolling_burst_std = df['Burst_Rate'].rolling(5, min_periods=1).std()
    df['Burst_Variability'] = rolling_burst_std.fillna(0.0).values
    rate_delta = df['Packet_Rate'].diff()
    df['Pkt_Rate_Change'] = rate_delta.fillna(0.0).values

# Register the new columns as model features (idempotent on re-runs).
newly_added = [name for name in ('Burst_Variability', 'Pkt_Rate_Change') if name not in NUM_FEATURES]
NUM_FEATURES.extend(newly_added)

print(f"NUM_FEATURES used: {NUM_FEATURES}")

# -------------------------
# 5. Sampling for memory (safe)
# -------------------------
print(f"Sampling up to {SAMPLE_SIZE} rows for train/test (reduce SAMPLE_SIZE to lower memory).")
df_train_sample = safe_sample(df_train, SAMPLE_SIZE)
df_test_sample  = safe_sample(df_test, SAMPLE_SIZE)

# -------------------------
# 6. Scaling (fit on train sample and transform both)
# -------------------------
print("Scaling numeric features (RobustScaler)...")
train_feature_frame = df_train_sample[NUM_FEATURES]
test_feature_frame = df_test_sample[NUM_FEATURES]

# The scaler is fitted on the train sample only and then reused for the test
# sample; the scaled matrices keep the samples' row index and column names.
scaler = RobustScaler().fit(train_feature_frame)
X_train = pd.DataFrame(
    scaler.transform(train_feature_frame),
    columns=NUM_FEATURES,
    index=df_train_sample.index,
)
X_test = pd.DataFrame(
    scaler.transform(test_feature_frame),
    columns=NUM_FEATURES,
    index=df_test_sample.index,
)

# -------------------------
# 7. Isolation Forest
# -------------------------
print("Training IsolationForest...")
iso = IsolationForest(**IFOREST_PARAMS)
t0 = time.time()
iso.fit(X_train)
t1 = time.time()
print(f"IForest trained in {(t1-t0):.1f}s")

# predict() labels each row -1 (anomaly) or 1 (normal); the forest is fitted
# on the scaled train sample and applied to both samples.
for sample_frame, scaled_matrix in ((df_train_sample, X_train), (df_test_sample, X_test)):
    sample_frame['Anomaly_IForest'] = iso.predict(scaled_matrix)

# -------------------------
# 8. Global PCA + HDBSCAN (fit PCA once on train)
# -------------------------
print("PCA reduction and HDBSCAN clustering (global PCA fit on train sample)...")
# PCA is fitted once on the scaled train sample; the same projection is then
# applied to the test sample so both live in the same reduced space.
pca = PCA(n_components=min(PCA_COMPONENTS, len(NUM_FEATURES)), random_state=RANDOM_STATE)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca  = pca.transform(X_test)

# prediction_data=True makes HDBSCAN keep the extra structures that
# hdbscan.approximate_predict() needs to label unseen points; without it the
# test-set prediction below raises.
clusterer = hdbscan.HDBSCAN(min_cluster_size=HDB_MIN_CLUSTER, min_samples=HDB_MIN_SAMPLES,
                            metric='euclidean', cluster_selection_method='eom', core_dist_n_jobs=-1,
                            prediction_data=True)
t0 = time.time()
train_labels = clusterer.fit_predict(X_train_pca)
t1 = time.time()
print(f"HDBSCAN on train done in {(t1-t0):.1f}s. Clusters found (excl noise): {len(set(train_labels)) - (1 if -1 in train_labels else 0)}")
# Label -1 is HDBSCAN's noise label; it is exposed as a 0/1 flag as well.
df_train_sample['HDBSCAN_Label'] = train_labels
df_train_sample['HDBSCAN_IsNoise'] = (train_labels == -1).astype(int)

# approximate_predict for test: assigns test points to the clusters learned on
# the train sample (or to noise) without refitting. `strengths` is kept for
# inspection but not used further in this section.
test_labels, strengths = hdbscan.approximate_predict(clusterer, X_test_pca)
df_test_sample['HDBSCAN_Label'] = test_labels
df_test_sample['HDBSCAN_IsNoise'] = (test_labels == -1).astype(int)

# -------------------------
# 9. Consensus anomaly mask
# -------------------------
# A row is a consensus anomaly (1) only when IsolationForest labels it -1 AND
# HDBSCAN marks it as noise; otherwise 0.
for sample_frame in (df_train_sample, df_test_sample):
    flagged_by_iforest = sample_frame['Anomaly_IForest'] == -1
    flagged_by_hdbscan = sample_frame['HDBSCAN_IsNoise'] == 1
    sample_frame['Consensus_Anomaly'] = (flagged_by_iforest & flagged_by_hdbscan).astype(int)

# -------------------------
# 10. Percentiles & botnet profiles (Neris/Virut/Zeus/Generic)
# -------------------------
print("Computing percentiles for profile generation...")
# Every percentile referenced by the profiles below must be listed here:
# pp() silently answers 0.0 for anything that was not computed, which would
# turn a band's bound into 0.0. 40 and 60 are needed by Neris/Virut/Zeus.
percentiles = [5,10,30,40,50,60,70,80,90,95]
p = {}
features_for_percentiles = ["Pkt_Per_Src","Burst_Rate","Session_Dur_Src","Inter_Arrival_Time","BoxCox_Length","Rate_to_Length","IAT_to_Session","Burst_Variability","Pkt_Rate_Change"]

# p[feature][percentile] -> value observed on the train sample (0.0 when the
# feature is missing or has no non-null values).
for feat in features_for_percentiles:
    if feat in df_train_sample.columns:
        vals = df_train_sample[feat].dropna().values
    else:
        vals = np.empty(0)
    if vals.size == 0:
        p[feat] = {perc: 0.0 for perc in percentiles}
    else:
        q = np.percentile(vals, percentiles)
        p[feat] = {perc: float(x) for perc, x in zip(percentiles, q)}

def pp(feat, perc):
    """Return the train-sample percentile *perc* of *feat* (0.0 if not computed)."""
    return p.get(feat, {}).get(perc, 0.0)

# Revised profiles (more forgiving bands to increase recall).
# Each rule is feature -> (low, high), an inclusive band; np.inf means the
# band has no upper limit.
botnet_profiles = {
    "Neris": {
        "Pkt_Per_Src": (pp("Pkt_Per_Src", 70), np.inf),
        "Rate_to_Length": (pp("Rate_to_Length", 50), np.inf),
        "Burst_Rate": (pp("Burst_Rate", 40), np.inf)
    },
    "Virut": {
        "BoxCox_Length": (pp("BoxCox_Length", 60), np.inf),
        "Burst_Rate": (pp("Burst_Rate", 40), np.inf)
    },
    "Zeus": {
        "Inter_Arrival_Time": (pp("Inter_Arrival_Time", 5), pp("Inter_Arrival_Time", 60)),
        "IAT_to_Session": (pp("IAT_to_Session", 10), pp("IAT_to_Session", 90)),
        "Burst_Variability": (pp("Burst_Variability", 5), pp("Burst_Variability", 80))
    },
    "GenericBotnet": {
        "Pkt_Rate_Change": (pp("Pkt_Rate_Change", 70), np.inf),
        "Burst_Variability": (pp("Burst_Variability", 70), np.inf)
    }
}

print("Botnet profiles defined (percentile-driven).")

# Score-based profile matcher - partial matches allowed
def score_profiles(row, profiles, min_matches=1):
    """Return the name of the profile whose rules *row* satisfies best.

    Parameters
    ----------
    row : mapping-like with ``.get`` (a dict or a pandas Series)
        Feature values of one record.
    profiles : dict
        ``{profile_name: {feature: (low, high)}}``; a rule matches when
        ``low <= value <= high``. Features that are missing or NaN in *row*
        never match.
    min_matches : int
        Minimum number of matching rules required to accept a profile.

    Returns
    -------
    str
        The winning profile name, or "Unknown" when *profiles* is empty or
        the best profile has fewer than ``max(min_matches, 1)`` matches.
        Ties on the match count go to the profile with more rules, then to
        the one defined first.
    """
    if not profiles:
        # max() below would raise on an empty mapping.
        return "Unknown"

    scores = {}
    for name, rules in profiles.items():
        matches = 0
        for feat, (low, high) in rules.items():
            val = row.get(feat, np.nan)
            if pd.isna(val):
                continue
            if low <= val <= high:
                matches += 1
        scores[name] = (matches, len(rules))

    # Rank by (matches, total rules); max() keeps the first of equal entries.
    best_name, (best_matches, _total) = max(scores.items(), key=lambda kv: kv[1])
    if best_matches >= min_matches and best_matches > 0:
        return best_name
    return "Unknown"

# Apply profile matching to consensus anomalies only
# Every row starts as "Normal"; only consensus anomalies are scored against
# the botnet profiles.
for df_s in (df_train_sample, df_test_sample):
    df_s['Pred_Botnet_Profile'] = "Normal"
    is_consensus = df_s['Consensus_Anomaly'] == 1
    if not is_consensus.any():
        continue
    matched_profiles = df_s.loc[is_consensus].apply(
        score_profiles, axis=1, args=(botnet_profiles,), min_matches=1
    )
    df_s.loc[is_consensus, 'Pred_Botnet_Profile'] = matched_profiles
    # A consensus anomaly that matched no profile is still treated as a bot:
    # relabel "Unknown" as GenericBotnet.
    unresolved = is_consensus & (df_s['Pred_Botnet_Profile'] == "Unknown")
    df_s.loc[unresolved, 'Pred_Botnet_Profile'] = "GenericBotnet"

print("Profile assignment done.")
train_profile_counts = df_train_sample['Pred_Botnet_Profile'].value_counts().to_dict()
test_profile_counts = df_test_sample['Pred_Botnet_Profile'].value_counts().to_dict()
print("Train profile counts:", train_profile_counts)
print("Test profile counts: ", test_profile_counts)

# -------------------------
# 11. Prepare supervised dataset (pseudo-labels)
# -------------------------
def prepare_rf_df(df_s):
    """Return a copy of *df_s* with an added ``RF_Label`` column.

    ``RF_Label`` mirrors ``Pred_Botnet_Profile``, with missing values
    replaced by "Unknown". The input frame is left untouched.
    """
    pseudo_labels = df_s['Pred_Botnet_Profile'].fillna("Unknown")
    return df_s.assign(RF_Label=pseudo_labels)

rf_train_df = prepare_rf_df(df_train_sample)
rf_test_df  = prepare_rf_df(df_test_sample)

# Build training set: keep known profiles and sample normals to balance.
# "Normal" must be excluded from the anomaly set; otherwise every normal row
# is kept, the down-sampled normals are added on top as duplicates, and the
# "no anomalies" check can never trigger.
label_is_normal = rf_train_df['RF_Label'] == "Normal"
label_is_unknown = rf_train_df['RF_Label'] == "Unknown"
anom_known = rf_train_df[~(label_is_normal | label_is_unknown)]
normals = rf_train_df[label_is_normal]

if len(anom_known) == 0:
    print("No known anomalies to train supervised models. Skipping supervised stage.")
    do_supervised = False
    # Keep the name defined (empty) so later code can reference it safely.
    rf_train_ready = rf_train_df.iloc[0:0]
else:
    do_supervised = True
    # target: sample normals to not dwarf anomalies. The cap is the size of
    # the smallest botnet class times max(3, number of botnet classes).
    class_counts = anom_known['RF_Label'].value_counts()
    min_non_norm = class_counts.min()
    n_normals_keep = min(len(normals), int(min_non_norm * max(3, len(class_counts))))
    normals_sampled = normals.sample(n=n_normals_keep, random_state=RANDOM_STATE) if len(normals)>0 else pd.DataFrame()
    # Shuffle so classes are interleaved, then renumber the rows.
    rf_train_ready = pd.concat([anom_known, normals_sampled]).sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)
    rf_train_ready = rf_train_ready[rf_train_ready['RF_Label'] != "Unknown"]
    print("Supervised training distribution:", Counter(rf_train_ready['RF_Label']))

# -------------------------
# 12. Train classifiers (RF, LinearSVC, RBF SVM, Logistic) and ensemble
# -------------------------
# Fitted estimators by display name; stays empty when the supervised stage is skipped.
models = {}
# Placeholder importances, replaced once the RandomForest has been fitted.
feat_imp = pd.Series(0, index=NUM_FEATURES)

if do_supervised and len(rf_train_ready) > 10:
    # NOTE(review): these are the engineered feature columns of the sample
    # frame, i.e. not the RobustScaler-transformed matrix; the SVM/logistic
    # models may benefit from scaled inputs - confirm this is intended.
    X_train_sup = rf_train_ready[NUM_FEATURES].fillna(0.0)
    y_train_sup = rf_train_ready['RF_Label']

    # Try SMOTE to handle class imbalance (may fail on tiny classes)
    try:
        sm = SMOTE(random_state=RANDOM_STATE)
        X_bal, y_bal = sm.fit_resample(X_train_sup, y_train_sup)
        print("SMOTE applied: balanced shape:", X_bal.shape)
    except Exception as e:
        print("SMOTE failed (fall back to original):", e)
        X_bal, y_bal = X_train_sup, y_train_sup

    # Models
    rf = RandomForestClassifier(**RF_PARAMS)
    lsvc = LinearSVC(C=0.5, class_weight='balanced', max_iter=5000, random_state=RANDOM_STATE)
    rbf_svc = SVC(C=1.0, kernel='rbf', probability=True, class_weight='balanced', random_state=RANDOM_STATE)
    logreg = LogisticRegression(max_iter=5000, class_weight='balanced', random_state=RANDOM_STATE)

    # Fit RF
    print("Training RandomForest...")
    t0 = time.time()
    rf.fit(X_bal, y_bal)
    t1 = time.time()
    print(f"RF trained in {(t1-t0):.1f}s")
    models['RandomForest'] = rf
    feat_imp = pd.Series(rf.feature_importances_, index=NUM_FEATURES).sort_values(ascending=False)

    # Fit LinearSVC
    print("Training LinearSVC...")
    t0 = time.time()
    lsvc.fit(X_bal, y_bal)
    t1 = time.time()
    print(f"LinearSVC trained in {(t1-t0):.1f}s")
    models['LinearSVC'] = lsvc

    # Fit RBF SVM (may be slow); a failure here only drops this one model.
    try:
        print("Training RBF SVC (may be slow)...")
        t0 = time.time()
        rbf_svc.fit(X_bal, y_bal)
        t1 = time.time()
        print(f"RBF SVC trained in {(t1-t0):.1f}s")
        models['RBF_SVC'] = rbf_svc
    except Exception as e:
        print("RBF SVC training failed/too slow; skipping:", e)

    # Fit Logistic Regression
    print("Training LogisticRegression...")
    t0 = time.time()
    logreg.fit(X_bal, y_bal)
    t1 = time.time()
    print(f"LogisticRegression trained in {(t1-t0):.1f}s")
    models['LogisticRegression'] = logreg

    # Voting ensemble (soft) using available models.
    # Soft voting averages predict_proba() outputs, which LinearSVC does not
    # provide, so only the probabilistic models take part; LinearSVC is still
    # trained and reported on its own. Weights are built alongside the
    # estimator list so the two can never get out of step.
    estimators = [('rf', rf)]
    ensemble_weights = [3]
    if 'RBF_SVC' in models:
        estimators.append(('rbf', models['RBF_SVC']))
        ensemble_weights.append(1)
    estimators.append(('lr', logreg))
    ensemble_weights.append(1)

    ensemble = VotingClassifier(estimators=estimators, voting='soft', weights=ensemble_weights, n_jobs=-1)
    print("Training Voting Ensemble...")
    t0 = time.time()
    ensemble.fit(X_bal, y_bal)
    t1 = time.time()
    print(f"Ensemble trained in {(t1-t0):.1f}s")
    models['Ensemble'] = ensemble

    # Prepare test supervised set (keep Unknown as label if present).
    # The "true" labels here are the profile-derived pseudo-labels of the
    # test sample, not ground truth from the dataset.
    X_test_sup = rf_test_df[NUM_FEATURES].fillna(0.0)
    y_test_sup = rf_test_df['RF_Label']

    # Predictions and reports
    print("\nRandomForest classification report (test sample):")
    y_pred_rf = rf.predict(X_test_sup)
    print(classification_report(y_test_sup, y_pred_rf, zero_division=0))

    print("\nLinearSVC classification report (test sample):")
    y_pred_lsvc = lsvc.predict(X_test_sup)
    print(classification_report(y_test_sup, y_pred_lsvc, zero_division=0))

    if 'RBF_SVC' in models:
        print("\nRBF SVC classification report (test sample):")
        y_pred_rbf = models['RBF_SVC'].predict(X_test_sup)
        print(classification_report(y_test_sup, y_pred_rbf, zero_division=0))

    print("\nLogisticRegression classification report (test sample):")
    y_pred_log = logreg.predict(X_test_sup)
    print(classification_report(y_test_sup, y_pred_log, zero_division=0))

    print("\nEnsemble classification report (test sample):")
    y_pred_ens = ensemble.predict(X_test_sup)
    print(classification_report(y_test_sup, y_pred_ens, zero_division=0))
else:
    print("Skipping supervised training due to insufficient pseudo-labeled anomalies.")
    models = {}

# -------------------------
# 13. Metrics & summary
# -------------------------
# Boolean masks reused for the percentages and the silhouette scores below.
iforest_flag_train = df_train_sample['Anomaly_IForest'] == -1
iforest_flag_test = df_test_sample['Anomaly_IForest'] == -1
hdb_noise_train = df_train_sample['HDBSCAN_IsNoise']
hdb_noise_test = df_test_sample['HDBSCAN_IsNoise']

# *_pct values are percentages of rows in the respective sample; "Agreement"
# is the share of rows on which both detectors give the same verdict.
summary = {
    "IForest_train_pct": safe_pct(iforest_flag_train),
    "IForest_test_pct": safe_pct(iforest_flag_test),
    "HDBSCAN_train_pct": safe_pct(hdb_noise_train == 1),
    "HDBSCAN_test_pct": safe_pct(hdb_noise_test == 1),
    "Consensus_train_pct": safe_pct(df_train_sample['Consensus_Anomaly'] == 1),
    "Consensus_test_pct": safe_pct(df_test_sample['Consensus_Anomaly'] == 1),
    "Agreement_train_pct": safe_pct(iforest_flag_train.astype(int) == hdb_noise_train),
    "Agreement_test_pct": safe_pct(iforest_flag_test.astype(int) == hdb_noise_test),
    "Silhouette_IF_train": None,
    "Silhouette_HDB_train": None
}

# Silhouette needs at least two distinct labels; the scores stay None
# otherwise, or when the computation fails for any reason.
# NOTE(review): silhouette_score is called on the full train sample with no
# sub-sampling, which may be slow for large SAMPLE_SIZE - confirm acceptable.
try:
    iforest_binary = iforest_flag_train.astype(int)
    if len(np.unique(iforest_binary)) > 1:
        if_silhouette = silhouette_score(X_train, iforest_binary)
        summary["Silhouette_IF_train"] = round(if_silhouette, 4)
    if len(np.unique(hdb_noise_train)) > 1:
        hdb_silhouette = silhouette_score(X_train, hdb_noise_train)
        summary["Silhouette_HDB_train"] = round(hdb_silhouette, 4)
except Exception as e:
    print("Silhouette calc failed:", e)

print("=== SUMMARY ===")
print(json.dumps(summary, indent=2))
print("Top predicted botnet profiles (test sample):")
test_profile_ranking = df_test_sample['Pred_Botnet_Profile'].value_counts()
print(test_profile_ranking.head(10))

# -------------------------
# 14. Diagnostics plots
# -------------------------
# Feature importances
# Horizontal bar chart of the 12 most important RandomForest features
# (largest at the top). Also refreshes feat_imp from the stored model.
if 'RandomForest' in models:
    rf_importances = models['RandomForest'].feature_importances_
    feat_imp = pd.Series(rf_importances, index=NUM_FEATURES).sort_values(ascending=False)
    plt.figure(figsize=(8,5))
    feat_imp.head(12).plot(kind='barh')
    plt.title("RF Top Features")
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

# Confusion matrix for ensemble if exists, else RF
if 'Ensemble' in models:
    cm_model = 'Ensemble'
elif 'RandomForest' in models:
    cm_model = 'RandomForest'
else:
    cm_model = None

if cm_model and do_supervised:
    preds = models[cm_model].predict(X_test_sup)
    # Axes use the labels present in the test pseudo-labels, sorted by name.
    labels = sorted(pd.unique(y_test_sup))
    cm = confusion_matrix(y_test_sup, preds, labels=labels)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, cmap="Blues")
    plt.title(f"Confusion matrix - {cm_model}")
    plt.ylabel("True")
    plt.xlabel("Predicted")
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

# KDE plots per feature (consensus anomaly vs normal), one figure per feature
# that exists in the test sample.
kde_feats = ["Pkt_Per_Src","Burst_Rate","Inter_Arrival_Time","BoxCox_Length","Burst_Variability"]
consensus_is_normal = df_test_sample['Consensus_Anomaly'] == 0
consensus_is_anomaly = df_test_sample['Consensus_Anomaly'] == 1
for feature_name in kde_feats:
    if feature_name not in df_test_sample.columns:
        continue
    plt.figure(figsize=(7,4))
    sns.kdeplot(df_test_sample.loc[consensus_is_normal, feature_name], label="Normal (consensus)", fill=True)
    sns.kdeplot(df_test_sample.loc[consensus_is_anomaly, feature_name], label="Consensus Anom", fill=True)
    plt.title(f"KDE Test Sample - {feature_name}")
    plt.legend()
    plt.tight_layout()
    plt.show()

# -------------------------
# 15. Save JSON summary + top counts
# -------------------------
# Collect the run summary, the 20 most frequent predicted profiles on the test
# sample and (when a RandomForest was trained) its top-20 feature importances.
top_test_profiles = df_test_sample['Pred_Botnet_Profile'].value_counts().head(20).to_dict()
if 'RandomForest' in models:
    top_importances = feat_imp.head(20).to_dict()
else:
    top_importances = {}

out = {
    "summary": summary,
    "top_botnet_profiles_test": top_test_profiles,
    "rf_feature_importances": top_importances
}
serialized_summary = json.dumps(out, indent=2)
with open(OUTPUT_SUMMARY, "w") as fh:
    fh.write(serialized_summary)

print(f"Saved summary to {OUTPUT_SUMMARY}")
print("Pipeline finished.")
