<a href="https://colab.research.google.com/github/CyberMetrics/Prototypes/blob/main/Prototype_002.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [3]:
file_path = '/content/drive/My Drive/Capstone Mark-01/wazuh_sample(csv).csv'

df_full = pd.read_csv(file_path)
print("File loaded successfully — shape:", df_full.shape)
print("Columns:", list(df_full.columns))
print(df_full.head())

File loaded successfully — shape: (1000, 14)
Columns: ['timestamp', 'rule_id', 'rule_level', 'agent_id', 'agent_name', 'srcip', 'dstip', 'event_type', 'action', 'status', 'severity', 'group', 'message', 'system']
              timestamp  rule_id  rule_level  agent_id    agent_name  \
0  2025-10-13T18:00:00Z     1005           7         1  web-server-1   
1  2025-10-13T18:00:15Z     1031           2         4   mail-server   
2  2025-10-13T18:00:30Z     1036           3         3    firewall-1   
3  2025-10-13T18:00:45Z     1048           1         1  web-server-1   
4  2025-10-13T18:01:00Z     1021           3         5  proxy-server   

             srcip         dstip              event_type   action   status  \
0  192.168.216.215    10.0.81.30               port_scan   detect  success   
1   192.168.91.137    10.0.3.165          file_integrity     scan  success   
2  192.168.228.116   10.0.34.134  authentication_failure  execute  success   
3  192.168.213.103  10.0.184.248       dat

In [4]:

class SimpleLogisticRegression:
    """A minimal implementation of Logistic Regression using Gradient Descent."""
    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        self.w = None
        self.b = 0

    def _sigmoid(self, z):
        z_clip = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z_clip))

    def fit(self, X, y):
        X = np.array(X, dtype=float)
        y = np.array(y, dtype=float)
        n, d = X.shape
        self.w = np.zeros(d)
        self.b = 0

        for _ in range(self.epochs):
            z = X.dot(self.w) + self.b
            pred = self._sigmoid(z)
            grad_w = (1/n) * X.T.dot(pred - y)
            grad_b = (1/n) * np.sum(pred - y)
            self.w -= self.lr * grad_w
            self.b -= self.lr * grad_b

    def predict_proba(self, X):
        X = np.array(X, dtype=float)
        z = X.dot(self.w) + self.b
        return self._sigmoid(z)

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X) >= threshold).astype(int)


In [5]:
class WazuhEventAnalyzer:
    """
    A lightweight SIEM-style analyzer for Wazuh-like logs.
    Performs preprocessing, anomaly detection, ML classification, and sequence analysis.
    """
    def __init__(self, lr=0.01, epochs=1000):
        self.classifier = SimpleLogisticRegression(lr=lr, epochs=epochs)
        self.label_encoders = {}
        self.is_trained = False
        # Adjusted to fit your dataset
        self.feature_cols = [
            'rule_level_scaled',
            'rule_id_scaled',
            'severity_encoded',
            'status_encoded',
            'Anomaly_Flag'
        ]

    def _preprocess(self, df):
        df_proc = df.copy()

        # 1️ Binary target based on severity
        df_proc['is_critical_event'] = df_proc['severity'].apply(
            lambda x: 1 if str(x).lower() in ['high', 'critical'] else 0
        )

        # 2️ Label encode categorical columns safely
        categorical_cols = ['agent_name', 'status', 'severity', 'action', 'event_type', 'system']
        for col in categorical_cols:
            if col in df_proc.columns:
                if col not in self.label_encoders:
                    le = LabelEncoder()
                    df_proc[f'{col}_encoded'] = le.fit_transform(df_proc[col].astype(str))
                    self.label_encoders[col] = le
                else:
                    le = self.label_encoders[col]
                    df_proc[f'{col}_encoded'] = df_proc[col].apply(
                        lambda x: le.transform([x])[0] if x in le.classes_ else -1
                    )

        # 3️ Scale numeric columns (rule_id, rule_level)
        numeric_cols_to_scale = ['rule_id', 'rule_level']
        for col in numeric_cols_to_scale:
            if col in df_proc.columns:
                data = df_proc[col].values.reshape(-1, 1)
                if col not in self.label_encoders:
                    scaler = StandardScaler()
                    df_proc[f'{col}_scaled'] = scaler.fit_transform(data)
                    self.label_encoders[col] = scaler
                else:
                    scaler = self.label_encoders[col]
                    df_proc[f'{col}_scaled'] = scaler.transform(data)

        # 4️ Time delta inverse (event frequency indicator)
        df_proc['timestamp'] = pd.to_datetime(df_proc['timestamp'])
        df_proc = df_proc.sort_values('timestamp').reset_index(drop=True)
        time_diff = df_proc['timestamp'].diff().dt.total_seconds().fillna(0)
        df_proc['time_delta_inv'] = 1 / (time_diff + 1e-6)

        return df_proc

    def fit(self, df):
            df_proc = self._preprocess(df)
            # Anomaly Detection (rare rule IDs)
            event_counts = df_proc["rule_id"].value_counts()
            df_proc['Anomaly_Flag'] = df_proc["rule_id"].apply(
                lambda x: 1 if event_counts[x] == 1 else 0
            )

            X = df_proc[self.feature_cols]
            y = df_proc['is_critical_event']

            self.classifier.fit(X.values, y.values)
            self.is_trained = True
            print(f"Classifier trained with {len(X)} samples.")

    def analyze(self, df):
            if df.empty:
                return pd.DataFrame()

            df_proc = self._preprocess(df)
            event_counts = df_proc["rule_id"].value_counts()
            df_proc['Anomaly_Flag'] = df_proc["rule_id"].apply(
                lambda x: 1 if event_counts[x] == 1 else 0
            )

            # ML-based classification
            if not self.is_trained:
                df_proc['ML_Proba'] = 0.5
                df_proc['ML_Prediction'] = -1
            else:
                X_test = df_proc[self.feature_cols]
                df_proc['ML_Proba'] = self.classifier.predict_proba(X_test.values)
                df_proc['ML_Prediction'] = (df_proc['ML_Proba'] >= 0.5).astype(int)

            # Simple sequence detection (fast bursts from same srcip)
            seq_alert_check = df_proc.groupby('srcip')['time_delta_inv'].transform(
                lambda x: x.rolling(window=3, min_periods=1).mean()
            )
            df_proc['Seq_Alert'] = (df_proc['time_delta_inv'] > seq_alert_check.shift(1).fillna(0) * 2).astype(int)

            # Weighted final score
            df_proc['Final_Score'] = (
                df_proc['ML_Proba'] * 0.5 +
                df_proc['Anomaly_Flag'] * 0.3 +
                df_proc['Seq_Alert'] * 0.2
            )
            df_proc['FinalAlert'] = (df_proc['Final_Score'] > 0.7).astype(int)

            return df_proc[['timestamp', 'rule_id', 'severity', 'Anomaly_Flag', 'ML_Proba',
                            'Seq_Alert', 'Final_Score', 'FinalAlert']]

In [6]:
df_train = df_full.head(200).copy()
df_batch = df_full.tail(df_full.shape[0] - 200).copy()

event_analyzer = WazuhEventAnalyzer(lr=0.05, epochs=2000)
print("\n--- Training the WazuhEventAnalyzer ---")
event_analyzer.fit(df_train)

print("\n--- Sequential Event Analysis on New Batch ---")
results = event_analyzer.analyze(df_batch)
print(results.head(20).to_markdown(index=False, floatfmt=".2f"))


--- Training the WazuhEventAnalyzer ---
Classifier trained with 200 samples.

--- Sequential Event Analysis on New Batch ---
| timestamp                 |   rule_id | severity   |   Anomaly_Flag |   ML_Proba |   Seq_Alert |   Final_Score |   FinalAlert |
|:--------------------------|----------:|:-----------|---------------:|-----------:|------------:|--------------:|-------------:|
| 2025-10-13 18:50:00+00:00 |      1013 | critical   |              0 |       1.00 |           1 |          0.70 |            0 |
| 2025-10-13 18:50:15+00:00 |      1012 | high       |              0 |       0.77 |           0 |          0.38 |            0 |
| 2025-10-13 18:50:30+00:00 |      1005 | high       |              0 |       0.90 |           0 |          0.45 |            0 |
| 2025-10-13 18:50:45+00:00 |      1010 | medium     |              0 |       0.00 |           0 |          0.00 |            0 |
| 2025-10-13 18:51:00+00:00 |      1012 | medium     |              0 |       0.00 |          