<a href="https://colab.research.google.com/github/CyberMetrics/Prototypes/blob/main/Prototype_002.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [163]:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [164]:
import pandas as pd
import numpy as np
import json
from collections import Counter
from sklearn.preprocessing import StandardScaler, LabelEncoder
from pandas import json_normalize
import plotly.express as px
import plotly.graph_objects as go


In [165]:
# file_path = '/content/drive/My Drive/Capstone Mark-01/wazuh_sample(csv).csv'
# df_full = pd.read_csv(file_path)
# print("File loaded successfully — shape:", df_full.shape)
# print("Columns:", list(df_full.columns))
# print(df_full.head(10))

#json
file_path = '/content/drive/My Drive/Capstone Mark-01/wazuh_sample(json).json'
# Load NDJSON properly
df_full = pd.read_json(file_path, lines=True)
df_flat = json_normalize(df_full.to_dict(orient='records'), sep='_')
print("Flattened columns:", df_flat.columns.tolist())
print(df_flat.head(3))
print("File loaded successfully!")
print("Shape:", df_full.shape)
print(df_full.head(3))


Flattened columns: ['timestamp', 'srcip', 'dstip', 'event_type', 'action', 'status', 'severity', 'group', 'message', 'system', 'rule_id', 'rule_level', 'agent_id', 'agent_name']
                  timestamp            srcip        dstip  \
0 2025-10-13 18:00:00+00:00  192.168.216.215   10.0.81.30   
1 2025-10-13 18:00:15+00:00   192.168.91.137   10.0.3.165   
2 2025-10-13 18:00:30+00:00  192.168.228.116  10.0.34.134   

               event_type   action   status  severity             group  \
0               port_scan   detect  success      high  [authentication]   
1          file_integrity     scan  success  critical            [user]   
2  authentication_failure  execute  success  critical          [system]   

                                             message system  rule_id  \
0             Port scan detected with status success  macos     1005   
1        File integrity detected with status success  macos     1031   
2  Authentication failure detected with status su...  linux 

In [166]:

class SimpleLogisticRegression:

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        self.w = None
        self.b = 0

    def _sigmoid(self, z):
        z_clip = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z_clip))

    def fit(self, X, y):
        X = np.array(X, dtype=float)
        y = np.array(y, dtype=float)
        n, d = X.shape
        self.w = np.zeros(d)
        self.b = 0

        for _ in range(self.epochs):
            z = X.dot(self.w) + self.b
            pred = self._sigmoid(z)
            grad_w = (1/n) * X.T.dot(pred - y)
            grad_b = (1/n) * np.sum(pred - y)
            self.w -= self.lr * grad_w
            self.b -= self.lr * grad_b

    def predict_proba(self, X):
        X = np.array(X, dtype=float)
        z = X.dot(self.w) + self.b
        return self._sigmoid(z)

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X) >= threshold).astype(int)


In [167]:
class WazuhEventAnalyzer:
    def __init__(self, lr=0.01, epochs=1000):
        self.classifier = SimpleLogisticRegression(lr=lr, epochs=epochs)
        self.label_encoders = {}
        self.is_trained = False
        # Adjusted to fit your dataset
        self.feature_cols = [
            'rule_level_scaled',
            'rule_id_scaled',
            'severity_encoded',
            'status_encoded',
            'Anomaly_Flag'
        ]
    def _preprocess(self, df):
        df_proc = df.copy()
        # 1️ Binary target based on severity
        df_proc['is_critical_event'] = df_proc['severity'].apply(
            lambda x: 1 if str(x).lower() in ['high', 'critical'] else 0
        )
        # 2️ Label encode categorical columns safely
        categorical_cols = ['agent_name', 'status', 'severity', 'action', 'event_type', 'system']
        for col in categorical_cols:
            if col in df_proc.columns:
                if col not in self.label_encoders:
                    le = LabelEncoder()
                    df_proc[f'{col}_encoded'] = le.fit_transform(df_proc[col].astype(str))
                    self.label_encoders[col] = le
                else:
                    le = self.label_encoders[col]
                    df_proc[f'{col}_encoded'] = df_proc[col].apply(
                        lambda x: le.transform([x])[0] if x in le.classes_ else -1
                    )
        # 3️ Scale numeric columns (rule_id, rule_level)
        numeric_cols_to_scale = ['rule_id', 'rule_level']
        for col in numeric_cols_to_scale:
            if col in df_proc.columns:
                data = df_proc[col].values.reshape(-1, 1)
                if col not in self.label_encoders:
                    scaler = StandardScaler()
                    df_proc[f'{col}_scaled'] = scaler.fit_transform(data)
                    self.label_encoders[col] = scaler
                else:
                    scaler = self.label_encoders[col]
                    df_proc[f'{col}_scaled'] = scaler.transform(data)
        # 4️ Time delta inverse (event frequency indicator)
        df_proc['timestamp'] = pd.to_datetime(df_proc['timestamp'])
        df_proc = df_proc.sort_values('timestamp').reset_index(drop=True)
        time_diff = df_proc['timestamp'].diff().dt.total_seconds().fillna(0)
        df_proc['time_delta_inv'] = 1 / (time_diff + 1e-6)
        return df_proc
    def fit(self, df):
            df_proc = self._preprocess(df)
            # Anomaly Detection (rare rule IDs)
            event_counts = df_proc["rule_id"].value_counts()
            df_proc['Anomaly_Flag'] = df_proc["rule_id"].apply(
                lambda x: 1 if event_counts[x] == 1 else 0
            )
            X = df_proc[self.feature_cols]
            y = df_proc['is_critical_event']
            self.classifier.fit(X.values, y.values)
            self.is_trained = True
            print(f"Classifier trained with {len(X)} samples.")
    def analyze(self, df):
            if df.empty:
                return pd.DataFrame()
            df_proc = self._preprocess(df)
            event_counts = df_proc["rule_id"].value_counts()
            df_proc['Anomaly_Flag'] = df_proc["rule_id"].apply(
                lambda x: 1 if event_counts[x] == 1 else 0
            )
            # ML-based classification
            if not self.is_trained:
                df_proc['ML_Proba'] = 0.5
                df_proc['ML_Prediction'] = -1
            else:
                X_test = df_proc[self.feature_cols]
                df_proc['ML_Proba'] = self.classifier.predict_proba(X_test.values)
                df_proc['ML_Prediction'] = (df_proc['ML_Proba'] >= 0.5).astype(int)
            # Simple sequence detection (fast bursts from same srcip)
            seq_alert_check = df_proc.groupby('srcip')['time_delta_inv'].transform(
                lambda x: x.rolling(window=3, min_periods=1).mean()
            )
            df_proc['Seq_Alert'] = (df_proc['time_delta_inv'] > seq_alert_check.shift(1).fillna(0) * 2).astype(int)
            # Weighted final score
            df_proc['Final_Score'] = (
                df_proc['ML_Proba'] * 0.5 +
                df_proc['Anomaly_Flag'] * 0.3 +
                df_proc['Seq_Alert'] * 0.2
            )
            df_proc['FinalAlert'] = (df_proc['Final_Score'] > 0.49).astype(int)
            return df_proc[['timestamp' ,'rule_id', 'severity', 'Anomaly_Flag', 'ML_Proba',
                            'Seq_Alert', 'Final_Score', 'FinalAlert']]

Training and testing The data

In [168]:
df_train = df_flat.head(800).copy()
df_batch = df_flat.tail(df_flat.shape[0] - 200).copy()
event_analyzer = WazuhEventAnalyzer(lr=0.05, epochs=2000)
print("\n--- Training the WazuhEventAnalyzer ---")
event_analyzer.fit(df_train)
print("\n--- Sequential Event Analysis on New Batch ---")
results = event_analyzer.analyze(df_batch)
print(results.head(20).to_markdown(index=False, floatfmt=".2f"))


--- Training the WazuhEventAnalyzer ---
Classifier trained with 800 samples.

--- Sequential Event Analysis on New Batch ---
| timestamp                 |   rule_id | severity   |   Anomaly_Flag |   ML_Proba |   Seq_Alert |   Final_Score |   FinalAlert |
|:--------------------------|----------:|:-----------|---------------:|-----------:|------------:|--------------:|-------------:|
| 2025-10-13 18:50:00+00:00 |      1013 | critical   |              0 |       1.00 |           1 |          0.70 |            1 |
| 2025-10-13 18:50:15+00:00 |      1012 | high       |              0 |       0.72 |           0 |          0.36 |            0 |
| 2025-10-13 18:50:30+00:00 |      1005 | high       |              0 |       0.86 |           0 |          0.43 |            0 |
| 2025-10-13 18:50:45+00:00 |      1010 | medium     |              0 |       0.00 |           0 |          0.00 |            0 |
| 2025-10-13 18:51:00+00:00 |      1012 | medium     |              0 |       0.00 |          

Explicitly testing with custom data for model perfomance

In [169]:
def create_data_to_force_high_score(df_base):
    """
    Creates synthetic data with an ABSURDLY high rule_level (100.0)
    to force the ML_Proba to be near 1.0, thereby guaranteeing the Final_Score > 0.7.
    """

    base_event = df_base.iloc[-1].copy()

    forced_alert_event = base_event.copy()
    forced_alert_event['timestamp'] = pd.to_datetime('2025-01-01 00:00:01')

    # Critical inputs to push ML_Proba to 1.0 (assuming the model learned that higher levels mean higher risk)
    forced_alert_event['rule_level'] = 100.0 # UNREALISTICALLY HIGH LEVEL
    forced_alert_event['severity'] = 'Critical'

    # Logic-based inputs to boost the score further
    forced_alert_event['rule_id'] = 999999 # Unique Rule ID (Anomaly_Flag=1)
    forced_alert_event['srcip'] = '10.0.0.100' # Unique source (no Seq_Alert needed here, but kept)
    low_score_event = base_event.copy()
    low_score_event['timestamp'] = pd.to_datetime('2025-01-01 00:00:02')
    low_score_event['rule_id'] = 10000 # Common rule
    low_score_event['rule_level'] = 3.0 # Low level
    low_score_event['severity'] = 'Low'
    low_score_event['srcip'] = '10.0.0.101'

    test_data = pd.DataFrame([forced_alert_event, low_score_event])
    test_data = test_data.sort_values('timestamp').reset_index(drop=True)
    return test_data

# --- 2. Run Analysis with FORCED DATA ---

df_forced_alert_test = create_data_to_force_high_score(df_batch)

print("\n--- Synthetic Test: FORCING SCORE > 0.7 by Extreme Feature Value ---")
forced_alert_results = event_analyzer.analyze(df_forced_alert_test)
print(forced_alert_results.to_markdown(index=False, floatfmt=".4f"))


--- Synthetic Test: FORCING SCORE > 0.7 by Extreme Feature Value ---
| timestamp           |   rule_id | severity   |   Anomaly_Flag |   ML_Proba |   Seq_Alert |   Final_Score |   FinalAlert |
|:--------------------|----------:|:-----------|---------------:|-----------:|------------:|--------------:|-------------:|
| 2025-01-01 00:00:01 |    999999 | Critical   |              1 |     1.0000 |           1 |        1.0000 |            1 |
| 2025-01-01 00:00:02 |     10000 | Low        |              1 |     1.0000 |           0 |        0.8000 |            1 |


In [170]:
results = event_analyzer.analyze(df_batch)
fig1 = px.pie(
    results,
    names='FinalAlert',
    title='Critical vs Non-Critical Alerts',
    color='FinalAlert',
    color_discrete_map={0: 'lightgreen', 1: 'red'},
    hole=0.4
)
fig1.update_traces(textinfo='value+percent', pull=[0, 0.1])
fig1.show()


**Overall system health (how many alerts are critical)**

In [171]:
fig2 = px.line(
    results,
    x='timestamp',
    y='Final_Score',
    color='FinalAlert',
    title='Event Severity Score Over Time',
    color_discrete_map={0: 'blue', 1: 'red'}
)
fig2.update_traces(mode='lines+markers')
fig2.show()


**Behavior of alert severity over time (detect peaks)**

In [172]:
df_vis = df_batch.join(results[['FinalAlert']])
top_src = df_vis[df_vis['FinalAlert'] == 1]['srcip'].value_counts().head(10)

fig3 = px.bar(
    x=top_src.index,
    y=top_src.values,
    title='Top Source IPs Causing Alerts',
    labels={'x': 'Source IP', 'y': 'Alert Count'},
    color=top_src.values,
    color_continuous_scale='Reds'
)
fig3.show()


**Identify risky IPs or agents (threat sources)**

In [173]:
def wazuh_dashboard(results, df_batch):
    import plotly.express as px

    df_vis = df_batch.join(results[['FinalAlert']])

    # 1. Alert distribution
    fig1 = px.pie(results, names='FinalAlert', title='Critical vs Non-Critical Alerts')
    fig1.show()

    # 2. Severity over time
    fig2 = px.line(results, x='timestamp', y='Final_Score', color='FinalAlert',
                   title='Severity Score Over Time')
    fig2.show()

    # 3. Top source IPs
    top_src = df_vis[df_vis['FinalAlert'] == 1]['srcip'].value_counts().head(10)
    fig3 = px.bar(x=top_src.index, y=top_src.values, title='Top Alert Source IPs')
    fig3.show()

wazuh_dashboard(results, df_batch)


**DashBoard\**