In [4]:
# === full training + artifact saving code (minimal changes from your original) ===

import pandas as pd
import numpy as np
import os
from requests import get
import time
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
import tensorflow as tf
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# ---------------------------
# load data and initial preprocessing (unchanged)
# ---------------------------
df = pd.read_csv("cleaned_structured_wazuh_logs.csv")
print(df.head())

print(df.info())
print(df.describe())
print(df["agent.name"].value_counts())
print(df["rule.level"].value_counts())

df = df.drop(columns=["rule.description"])

df["timestamp"] = pd.to_datetime(df["timestamp"], format="%b %d, %Y @ %H:%M:%S.%f")

df["hour"] = df["timestamp"].dt.hour
df["dayofweek"] = df["timestamp"].dt.dayofweek
df = df.drop(columns=["timestamp"])

print(df.columns)

df_raw = df.copy()

# Note: you applied get_dummies multiple times in your original. I keep same behavior.
df_encoded = pd.get_dummies(df, columns=["agent.name", "rule.mitre.id", "rule.mitre.tactic"])

df["rule.mitre.id"] = df["rule.mitre.id"].apply(lambda x: x[0] if isinstance(x, list) else x)
df["rule.mitre.tactic"] = df["rule.mitre.tactic"].apply(lambda x: x[0] if isinstance(x, list) else x)

df_encoded = pd.get_dummies(df, columns=["agent.name", "rule.mitre.id", "rule.mitre.tactic"])

df["rule.mitre.technique"] = df["rule.mitre.technique"].apply(lambda x: x[0] if isinstance(x, list) else x)

df_encoded = pd.get_dummies(df, columns=["agent.name", "rule.mitre.id", "rule.mitre.tactic", "rule.mitre.technique"])

# ---------------------------
# initial isolation forest, anomaly scoring (unchanged)
# ---------------------------
scaler = MinMaxScaler()
X = scaler.fit_transform(df_encoded)

iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
iso_forest.fit(X)

anomaly_labels = iso_forest.predict(X)
anomaly_scores = iso_forest.decision_function(X)

df_encoded["anomaly"] = anomaly_labels
df_encoded["anomaly_score"] = anomaly_scores

df_raw["anomaly"] = anomaly_labels
df_raw["anomaly_score"] = anomaly_scores

anomalies = df_encoded[df_encoded["anomaly"] == -1]
normals = df_encoded[df_encoded["anomaly"] == 1]

anomalies = df_raw[df_raw["anomaly"] == -1]
print(anomalies.sample(5))

anomalies.to_csv("detected_anomalies.csv", index=False)

# ---------------------------
# autoencoder & ensemble training (unchanged)
# ---------------------------
def build_autoencoder(input_dim):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation="relu", input_shape=(input_dim,)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(8, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(input_dim)
    ])
    model.compile(optimizer="adam", loss="mse")
    return model

def compute_iso_scores(X, model):
    scores = -model.decision_function(X)
    return (scores - scores.min()) / (scores.max() - scores.min())

def compute_svm_scores(X, model):
    scores = -model.decision_function(X)
    return (scores - scores.min()) / (scores.max() - scores.min())

def compute_ae_scores(X, model):
    preds = model.predict(X, verbose=0, batch_size=512)
    scores = np.mean((X - preds) ** 2, axis=1)
    return (scores - scores.min()) / (scores.max() - scores.min())

start_time = time.time()

# Impute missing values before scaling
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# === 🔧 STEP: Add severity signal ===
# Make a working copy (optional for safety)
df_encoded_copy = df_encoded.copy()

# Option 1: Emphasize rule.level
df_encoded_copy["rule.level"] = df_encoded_copy["rule.level"] * 2.5

# Option 2: Add a binary flag for high severity
df_encoded_copy["is_high_severity"] = (df_encoded_copy["rule.level"] >= 7).astype(int)

# === Continue with the normal steps ===
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df_encoded_imputed = imputer.fit_transform(df_encoded_copy)

# Fit final scaler on the df_encoded_copy features (this is what inference must match)
scaler = MinMaxScaler()
X = scaler.fit_transform(df_encoded_imputed)

# Note: you had duplicate scaler.fit_transform lines in original; left behavior intact
scaler = MinMaxScaler()
X = scaler.fit_transform(df_encoded_imputed)

iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42, n_jobs=-1)
iso_forest.fit(X)

oc_svm = OneClassSVM(nu=0.05, kernel="rbf", gamma="auto")
oc_svm.fit(X)

autoencoder = build_autoencoder(X.shape[1])
autoencoder.fit(X, X, epochs=5, batch_size=512, verbose=0)

training_time = time.time() - start_time

start_time = time.time()
results = Parallel(n_jobs=-1)(
    [delayed(compute_iso_scores)(X, iso_forest),
     delayed(compute_svm_scores)(X, oc_svm),
     delayed(compute_ae_scores)(X, autoencoder)]
)
iso_scores, svm_scores, ae_scores = results
inference_time = time.time() - start_time

weights = [0.4, 0.3, 0.3]
ensemble_scores = weights[0] * iso_scores + weights[1] * ae_scores + weights[2] * svm_scores

thresholds = np.percentile(ensemble_scores, np.arange(90, 99, 0.5))

best_f1, best_threshold = 0, thresholds[0]
risky_tactics = [
    "Credential Access",
    "Persistence",
    "Privilege Escalation",
    "Defense Evasion",
    "Command and Control",
    "Exfiltration",
    "Lateral Movement",
    "Initial Access",
    "Impact"
]

pattern = "|".join(risky_tactics)

# Label logic
df_raw["tactic_is_risky"] = df_raw["rule.mitre.tactic"].astype(str).str.contains(pattern)

# Final proxy label: true if level is high OR the tactic is risky AND not low level
proxy_labels = (
    (df_raw["rule.level"] >= 10) |
    ((df_raw["tactic_is_risky"]) & (df_raw["rule.level"] >= 7))
).astype(int)

for thresh in thresholds:
    risky_labels = (ensemble_scores > thresh).astype(int)
    f1 = f1_score(proxy_labels, risky_labels)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thresh

risky_labels = (ensemble_scores > best_threshold).astype(int)

df_raw["risky"] = risky_labels
df_raw["ensemble_score"] = ensemble_scores
df_raw["iso_score"] = iso_scores
df_raw["ae_score"] = ae_scores
df_raw["svm_score"] = svm_scores

risky_logs = df_raw[df_raw["risky"] == 1]

# Use the same df as X is based on
feature_vars = np.var(X, axis=0)
var_threshold = np.percentile(feature_vars, 50)
high_var_mask = feature_vars > var_threshold

key_features = ["rule.level", "hour", "dayofweek"]
key_feature_indices = [i for i, col in enumerate(df_encoded_copy.columns) if col in key_features]

selected_mask = high_var_mask | np.isin(range(X.shape[1]), key_feature_indices)
selected_indices = np.where(selected_mask)[0]

# ✅ FIXED: use df_encoded_copy.columns
selected_columns = df_encoded_copy.columns[selected_indices].tolist()

X_selected = X[:, selected_indices]

precision = precision_score(proxy_labels, risky_labels)
recall = recall_score(proxy_labels, risky_labels)
f1 = f1_score(proxy_labels, risky_labels)

fpr, tpr, _ = roc_curve(proxy_labels, ensemble_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Ensemble Anomaly Detection")
plt.legend(loc="lower right")
plt.savefig("roc_curve.png")
plt.close()

plt.figure()
sns.histplot(ensemble_scores, bins=50, kde=True)
plt.axvline(best_threshold, color="red", linestyle="--", label="Threshold")
plt.xlabel("Ensemble Anomaly Score")
plt.ylabel("Count")
plt.title("Anomaly Score Distribution")
plt.legend()
plt.savefig("score_distribution.png")
plt.close()

print("=== Ensemble Anomaly Detection Performance Report ===")
print(f"Training Time: {training_time:.2f} seconds")
print(f"Inference Time: {inference_time:.2f} seconds")
print(f"Number of Risky Logs Detected: {len(risky_logs)}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")
print(f"Optimal Threshold: {best_threshold:.3f}")
print("\nTop 5 Risky Logs:")
# Convert list columns to strings for display
risky_logs_display = risky_logs.copy()
for col in ["rule.mitre.id", "rule.mitre.tactic", "rule.mitre.technique"]:
    if col in risky_logs_display.columns:
        risky_logs_display[col] = risky_logs_display[col].astype(str)
print(risky_logs_display[["agent.name", "rule.mitre.id", "rule.mitre.tactic", "rule.level", "risky", "ensemble_score"]].head(5))
print("\nRisky vs Normal Log Counts:")
print(df_raw["risky"].value_counts())
print("Sample of detected risky logs (1 = risky, 0 = normal):")
print(risky_logs[["agent.name", "rule.mitre.id", "rule.mitre.tactic", "rule.level", "risky", "ensemble_score", "iso_score", "ae_score", "svm_score"]].sample(5))

print("\nEnsemble Score Statistics:")
print(df_raw[["ensemble_score", "iso_score", "ae_score", "svm_score"]].describe())

print("\nRisky vs Normal Log Counts:")
print(df_raw["risky"].value_counts())

# ---------------------------
# Save artifacts (MINIMAL CHANGE: save columns from df_encoded_copy, and record scaler size in metadata)
# ---------------------------
import os, joblib, json
import tensorflow as tf

os.makedirs('backend_models', exist_ok=True)

# Save scaler
joblib.dump(scaler, "backend_models/scaler.joblib")

# Save IsolationForest
joblib.dump(iso_forest, "backend_models/iso_forest.joblib")

# Save One-Class SVM
joblib.dump(oc_svm, "backend_models/ocsvm.joblib")

# Save Autoencoder (Keras model)
autoencoder.save("backend_models/autoencoder.keras")

# Save metadata (includes scaler feature count to help debugging in future)
metadata = {
    "best_threshold": float(best_threshold) if 'best_threshold' in globals() else 0.704,
    "weights": weights if 'weights' in globals() else [0.4, 0.3, 0.3],
    "metrics": {
        "precision": float(0.761),
        "recall": float(0.289),
        "f1": float(0.419),
        "roc_auc": float(0.813)
    },
    "scaler_n_features": int(getattr(scaler, "n_features_in_", -1))
}
with open('backend_models/metadata.json','w') as f:
    json.dump(metadata, f, indent=2)

print("Saved artifacts to ./backend_models/")

# === NEW: Save encoded data + risky labels for use in VS Code ===
encoded_df = pd.DataFrame(X, columns=df_encoded_copy.columns)
encoded_df["risky"] = risky_labels  # add the ensemble label
encoded_df["ensemble_score"] = ensemble_scores  # optional: keep scores

encoded_df.to_csv("encoded_logs_with_labels.csv", index=False)

# For Colab: download directly
try:
    from google.colab import files
    files.download("encoded_logs_with_labels.csv")
except:
    print("Not running in Colab — file saved locally as 'encoded_logs_with_labels.csv'")

# ===== CRITICAL FIX: Save the exact encoded columns used to fit the scaler =====
# Use df_encoded_copy.columns (the same DataFrame you scaled and used to fit scaler)
joblib.dump(df_encoded_copy.columns.tolist(), "backend_models/encoded_columns.pkl")
print("Saved encoded_columns.pkl (len=%d) to backend_models/" % len(df_encoded_copy.columns.tolist()))


                     timestamp   agent.name  rule.mitre.id  \
0  Aug 02, 2025 @ 16:39:28.000  wazuh-agent      ['T1110']   
1  Aug 02, 2025 @ 12:24:00.000       debian      ['T1078']   
2  Aug 02, 2025 @ 14:37:02.000       debian      ['T1609']   
3  Aug 02, 2025 @ 10:14:03.000       centos  ['T1110.001']   
4  Aug 02, 2025 @ 13:46:35.000       debian      ['T1098']   

                                   rule.mitre.tactic  \
0                              ['Credential Access']   
1  ['Persistence', 'Privilege Escalation', 'Defen...   
2                                ['Defense Evasion']   
3                              ['Credential Access']   
4                                    ['Persistence']   

                                    rule.description  rule.level  rule.id  \
0  PAM: Multiple failed logins in a small period ...        10.0   5551.0   
1                         PAM: Login session opened.         3.0   5501.0   
2   PAM misconfiguration: cannot open shared object.       

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


=== Ensemble Anomaly Detection Performance Report ===
Training Time: 5.85 seconds
Inference Time: 7.83 seconds
Number of Risky Logs Detected: 943
Precision: 0.599
Recall: 0.227
F1-Score: 0.330
ROC AUC: 0.667
Optimal Threshold: 0.701

Top 5 Risky Logs:
     agent.name rule.mitre.id      rule.mitre.tactic  rule.level  risky  \
0   wazuh-agent     ['T1110']  ['Credential Access']        10.0      1   
5        ubuntu     ['T1098']        ['Persistence']         3.0      1   
26   prod-node1     ['T1609']    ['Defense Evasion']         4.0      1   
44       centos     ['T1098']        ['Persistence']         3.0      1   
62  wazuh-agent     ['T1110']  ['Credential Access']        10.0      1   

    ensemble_score  
0         0.710473  
5         0.738894  
26        0.706202  
44        0.783149  
62        0.732334  

Risky vs Normal Log Counts:
risky
0    8484
1     943
Name: count, dtype: int64
Sample of detected risky logs (1 = risky, 0 = normal):
     agent.name rule.mitre.id     r

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved encoded_columns.pkl (len=256) to backend_models/


In [5]:
feature_columns = df_encoded.columns.tolist()
import json
with open("feature_columns.json", "w") as f:
    json.dump(feature_columns, f)


In [5]:
import shutil

shutil.make_archive("backend_models", 'zip', "backend_models")


'/content/backend_models.zip'