<a href="https://colab.research.google.com/github/CyberMetrics/Prototypes/blob/prototype02_modified/Prototype_002.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
# Mount Google Drive so the files under /content/drive (training data and
# live feed configured below) are reachable from this runtime.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
import pandas as pd
import numpy as np
import json
import time
from collections import deque
from sklearn.preprocessing import StandardScaler, LabelEncoder
from pandas import json_normalize
from google.colab import drive
from datetime import datetime
import os
from typing import Dict, Any, List
import io # Needed for DataFrame to_markdown fallback (optional)

# --- Configuration: Google Drive locations used by this notebook ---
# WARNING: adjust DRIVE_BASE_PATH to match your own Drive layout before running.
# The base path ends with a slash, so file names are appended directly.
DRIVE_BASE_PATH = '/content/drive/MyDrive/Capstone Mark-01/'
# JSON-lines sample used to train the analyzer.
TRAIN_FILE_PATH = f"{DRIVE_BASE_PATH}wazuh_sample(json).json"
# JSON-lines file that is tailed for new events.
LIVE_INPUT_PATH = f"{DRIVE_BASE_PATH}live_security_feed.json"

In [72]:

# ====================================================================
# 1. CORE ML MODEL (LOGISTIC REGRESSION)
# ====================================================================

class SimpleLogisticRegression:
    """Binary logistic regression fitted with full-batch gradient descent.

    Attributes:
        lr: Step size applied to every gradient update.
        epochs: Number of full passes over the training data.
        w: Weight vector (``None`` until ``fit`` has been called).
        b: Bias term.
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr, self.epochs = lr, epochs
        self.w, self.b = None, 0

    def _sigmoid(self, z):
        """Logistic function; inputs are clipped to [-500, 500] before ``exp``."""
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def fit(self, X, y):
        """Learn ``w`` and ``b`` from a 2-D feature matrix ``X`` and 0/1 targets ``y``.

        Weights and bias are reset to zero at the start of every call.
        """
        features = np.array(X, dtype=float)
        targets = np.array(y, dtype=float)
        n_samples, n_features = features.shape
        self.w = np.zeros(n_features)
        self.b = 0

        for _epoch in range(self.epochs):
            # Residual of the current predictions against the targets.
            residual = self._sigmoid(features.dot(self.w) + self.b) - targets
            # Mean gradients of the log-loss with respect to w and b.
            weight_grad = (1 / n_samples) * features.T.dot(residual)
            bias_grad = (1 / n_samples) * np.sum(residual)
            self.w -= self.lr * weight_grad
            self.b -= self.lr * bias_grad

    def predict_proba(self, X):
        """Return the predicted probability of the positive class for each row of ``X``."""
        features = np.array(X, dtype=float)
        return self._sigmoid(features.dot(self.w) + self.b)

In [73]:
# ====================================================================
# 2. WAZUH EVENT ANALYZER (CORE LOGIC)
# ====================================================================

class WazuhEventAnalyzer:
    """Scores flattened Wazuh-style events with a blended risk score.

    ``analyze`` combines three per-event signals:

    * ``ML_Proba``     - probability from the logistic-regression classifier (weight 0.5)
    * ``Anomaly_Flag`` - 1 when the event's ``rule_id`` occurred at most once in the
      training data, or not at all (weight 0.3)
    * ``Seq_Alert``    - 1 when the event's inverse inter-arrival time exceeds twice a
      rolling baseline (weight 0.2)

    ``FinalAlert`` is 1 when the weighted sum is strictly greater than 0.75.

    NOTE(review): the training target ``is_critical_event`` is derived from
    ``severity`` while ``severity_encoded`` is also an input feature, so the
    classifier can read the label from its inputs. Confirm this is intended.
    """

    def __init__(self, lr=0.01, epochs=1000):
        """Create an untrained analyzer; ``lr`` and ``epochs`` are passed to the classifier."""
        self.classifier = SimpleLogisticRegression(lr=lr, epochs=epochs)
        # Fitted transformers keyed by column name: LabelEncoder objects for the
        # categorical columns, StandardScaler objects for 'rule_id'/'rule_level'.
        self.label_encoders = {}
        self.is_trained = False
        # Explicit dtype: some pandas releases warn on a dtype-less empty Series.
        self.rule_counts = pd.Series(dtype='int64')
        self.feature_cols = [
            'rule_level_scaled',
            'rule_id_scaled',
            'severity_encoded',
            'status_encoded',
            'Anomaly_Flag'
        ]

    def _preprocess(self, df):
        """Return a copy of ``df`` with the derived columns the model needs.

        Adds ``is_critical_event``, ``<col>_encoded`` for each categorical column
        present, ``<col>_scaled`` for ``rule_id``/``rule_level`` when present, and
        ``time_delta_inv``. Rows are returned sorted by parsed ``timestamp``.

        A transformer is fitted (and stored in ``self.label_encoders``) the first
        time its column is seen; later calls reuse the stored transformer.

        Raises:
            KeyError: if ``df`` has no ``severity`` or ``timestamp`` column.
        """
        df_proc = df.copy()
        df_proc['is_critical_event'] = df_proc['severity'].apply(
            lambda x: 1 if str(x).lower() in ['high', 'critical'] else 0
        )

        categorical_cols = ['agent_name', 'status', 'severity', 'action', 'event_type', 'system']
        for col in categorical_cols:
            if col not in df_proc.columns:
                continue
            # Everything is encoded through its string form (NaN becomes 'nan').
            values = df_proc[col].map(str)
            le = self.label_encoders.get(col)
            if le is None:
                # Fit ONCE on the whole column. Fitting per row (as a per-value
                # fit_transform would) maps every value to 0 and leaves the
                # encoder knowing only the last label it saw.
                le = LabelEncoder()
                le.fit(values)
                self.label_encoders[col] = le
            # Labels that were not present when the encoder was fitted map to -1.
            code_by_label = {str(label): code for code, label in enumerate(le.classes_)}
            df_proc[f'{col}_encoded'] = values.map(code_by_label).fillna(-1).astype(int)

        numeric_cols_to_scale = ['rule_id', 'rule_level']
        for col in numeric_cols_to_scale:
            if col not in df_proc.columns:
                continue
            # Coerce so numeric strings scale correctly and junk becomes NaN
            # instead of raising inside the scaler.
            numeric = pd.to_numeric(df_proc[col], errors='coerce')
            data = numeric.to_numpy(dtype=float).reshape(-1, 1)
            scaler = self.label_encoders.get(col)

            if scaler is None:
                scaler = StandardScaler()
                scaled = scaler.fit_transform(data)
                self.label_encoders[col] = scaler
            else:
                scaled = scaler.transform(data)

            # Missing/unparseable values are imputed with 0.0 in scaled space so a
            # single NaN cannot turn every classifier weight into NaN.
            scaled = np.asarray(scaled, dtype=float)
            df_proc[f'{col}_scaled'] = np.where(np.isnan(scaled), 0.0, scaled).ravel()

        df_proc['timestamp'] = pd.to_datetime(df_proc['timestamp'], errors='coerce')
        df_proc = df_proc.sort_values('timestamp').reset_index(drop=True)
        # Seconds since the previous row (any source); the first row gets 0.
        time_diff = df_proc['timestamp'].diff().dt.total_seconds().fillna(0)
        # The small epsilon avoids division by zero for simultaneous events.
        df_proc['time_delta_inv'] = 1 / (time_diff + 1e-6)

        return df_proc

    def _flag_rare_rules(self, rule_ids):
        """Return 1 per rule id counted at most once in ``self.rule_counts``, else 0."""
        return rule_ids.apply(
            lambda rule_id: 1 if self.rule_counts.get(rule_id, 0) <= 1 else 0
        )

    def fit(self, df):
        """Fit encoders, scalers, rule frequencies and the classifier on ``df``.

        Every call refits from scratch, so transformers from a previous ``fit``
        are discarded first.

        Raises:
            KeyError: if ``df`` lacks a column needed for ``self.feature_cols``,
                ``rule_id``, ``severity`` or ``timestamp``.
        """
        self.label_encoders = {}
        self.is_trained = False

        df_proc = self._preprocess(df)
        self.rule_counts = df_proc["rule_id"].value_counts()
        df_proc['Anomaly_Flag'] = self._flag_rare_rules(df_proc["rule_id"])

        X = df_proc[self.feature_cols]
        y = df_proc['is_critical_event']
        self.classifier.fit(X.values, y.values)
        self.is_trained = True
        print(f"Classifier trained with {len(X)} samples. Feature columns: {self.feature_cols}")

    def analyze(self, df):
        """Score a batch of events.

        Returns:
            A DataFrame sorted by timestamp with columns ``timestamp``, ``srcip``,
            ``severity``, ``rule_id``, ``rule_level``, ``Anomaly_Flag``,
            ``Seq_Alert``, ``ML_Proba``, ``Final_Score`` and ``FinalAlert``.
            An empty input yields an empty DataFrame with no columns.
        """
        if df.empty:
            return pd.DataFrame()

        df_proc = self._preprocess(df)
        df_proc['Anomaly_Flag'] = self._flag_rare_rules(df_proc["rule_id"])

        if not self.is_trained:
            # No model yet: use a neutral probability.
            df_proc['ML_Proba'] = 0.5
        else:
            X_test = df_proc[self.feature_cols]
            df_proc['ML_Proba'] = self.classifier.predict_proba(X_test.values)

        # Rolling mean (window 3) of the inverse inter-arrival time per source IP.
        seq_alert_check = df_proc.groupby('srcip')['time_delta_inv'].transform(
            lambda x: x.rolling(window=3, min_periods=1).mean()
        )
        # NOTE(review): the shift below runs over the whole batch, not per srcip, so
        # a row is compared against the previous row's baseline even when that row
        # belongs to another source. The first row's baseline is filled with 0 and
        # its time_delta_inv is 1e6, so the first row of every batch gets
        # Seq_Alert = 1. Behaviour kept as-is; confirm whether it is intended.
        df_proc['Seq_Alert'] = (df_proc['time_delta_inv'] > seq_alert_check.shift(1).fillna(0) * 2).astype(int)

        df_proc['Final_Score'] = (
            df_proc['ML_Proba'] * 0.5 +
            df_proc['Anomaly_Flag'] * 0.3 +
            df_proc['Seq_Alert'] * 0.2
        )
        df_proc['FinalAlert'] = (df_proc['Final_Score'] > 0.75).astype(int)

        return df_proc[['timestamp', 'srcip', 'severity', 'rule_id', 'rule_level',
                        'Anomaly_Flag', 'Seq_Alert', 'ML_Proba', 'Final_Score', 'FinalAlert']]



In [74]:
# ====================================================================
# 3. CONTINUOUS DRIVE MONITORING LOGIC
# ====================================================================

def create_scaler_booster_data(agent_id_prefix):
    """Build two synthetic events (one high-level, one low-level) with rule ids in the 900xxx range.

    Both rows share the same source/destination, group, system and agent;
    ``agent_id`` is ``agent_id_prefix`` followed by ``'99'``.
    """
    booster_agent_id = agent_id_prefix + '99'

    def _event(timestamp, event_type, action, status, severity, message, rule_id, rule_level):
        # Key order is significant: it determines the DataFrame's column order.
        return {
            'timestamp': timestamp, 'srcip': '1.1.1.1', 'dstip': '2.2.2.2',
            'event_type': event_type, 'action': action, 'status': status,
            'severity': severity, 'group': '[api_booster]', 'message': message,
            'system': 'linux', 'rule_id': rule_id, 'rule_level': rule_level,
            'agent_id': booster_agent_id, 'agent_name': 'firewall-gateway',
        }

    high_level_event = _event(
        '2025-01-01T00:00:00Z', 'brute_force_attempt', 'block', 'failed',
        'Critical', 'Booster data high level', 900101, 16,
    )
    low_level_event = _event(
        '2025-01-01T00:00:01Z', 'network_connection', 'allow', 'success',
        'Low', 'Booster data low level', 900350, 4,
    )
    return pd.DataFrame([high_level_event, low_level_event])

def load_and_prepare_training_data(file_path, max_rows=800):
    """Load a JSON-lines training file, flatten it and append the booster rows.

    Args:
        file_path: Path to a JSON-lines file whose records carry the context
            columns listed below plus nested ``rule`` (``id``, ``level``) and
            ``agent`` (``id``, ``name``) objects.
        max_rows: Number of leading records to keep before the booster rows are
            appended. ``None`` keeps every record. Defaults to 800.

    Returns:
        A DataFrame with the context columns followed by ``rule_id``,
        ``rule_level``, ``agent_id`` and ``agent_name``; ``rule_id`` and
        ``rule_level`` are numeric (unparseable values become NaN).

    Raises:
        KeyError: if the flattened data lacks any required column.
    """
    df_full = pd.read_json(file_path, lines=True)
    # Round-trip through records so nested objects become dotted columns.
    df_flat = json_normalize(df_full.to_dict(orient='records'), sep='.')

    required_context_cols = ['timestamp', 'srcip', 'dstip', 'event_type', 'action', 'status',
                             'severity', 'group', 'message', 'system']
    # Output column name -> flattened source column.
    nested_cols = {
        'rule_id': 'rule.id',
        'rule_level': 'rule.level',
        'agent_id': 'agent.id',
        'agent_name': 'agent.name',
    }

    missing = [col for col in required_context_cols + list(nested_cols.values())
               if col not in df_flat.columns]
    if missing:
        raise KeyError(f"Training data is missing required columns: {missing}")

    final_df_original = df_flat[required_context_cols].copy()
    for target_col, source_col in nested_cols.items():
        final_df_original[target_col] = df_flat[source_col]

    sample = final_df_original if max_rows is None else final_df_original.head(max_rows)
    df_booster = create_scaler_booster_data('00')
    df_combined = pd.concat([sample, df_booster], ignore_index=True)

    for col in ('rule_id', 'rule_level'):
        df_combined[col] = pd.to_numeric(df_combined[col], errors='coerce')

    return df_combined



In [75]:
def _read_available_entries(handle, pending=''):
    """Read every line currently available from ``handle`` and decode it as JSON.

    Returns ``(entries, pending, skipped)``: the decoded JSON objects, any
    unterminated text that could not be parsed yet (to be prefixed to the next
    read), and the number of complete lines dropped as malformed.
    """
    entries = []
    skipped = 0
    while True:
        chunk = handle.readline()
        if not chunk:
            return entries, pending, skipped
        line = pending + chunk
        pending = ''
        if not line.strip():
            continue
        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            if line.endswith('\n'):
                skipped += 1
            else:
                # No newline yet: the writer is probably mid-line, so keep the
                # fragment and retry once the rest has been appended.
                pending = line
            continue
        if isinstance(entry, dict):
            entries.append(entry)
        else:
            # Valid JSON but not an event object (e.g. a bare number or list).
            skipped += 1


def _print_batch_report(results, label, total_processed_count):
    """Print the per-event result table for one analyzed batch."""
    display_cols = ['timestamp', 'srcip', 'severity', 'rule_id', 'rule_level',
                    'Anomaly_Flag', 'Seq_Alert', 'ML_Proba', 'Final_Score', 'FinalAlert']
    display_df = results[display_cols].copy()
    critical_count = int((results['FinalAlert'] == 1).sum())

    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] --- {label} (Total: {total_processed_count}) ---")
    if critical_count > 0:
        print(f"🚨 ALERT COUNT: {critical_count} CRITICAL EVENTS FOUND 🚨")

    try:
        print(display_df.to_markdown(index=False, floatfmt=".4f"))
    except Exception as markdown_error:
        print(f"[ERROR DURING FORMATTING: {markdown_error}] Falling back to string output.")
        print(display_df.to_string(index=False))
    print("--------------------------------------------------")


def continuous_live_analysis(analyzer, batch_size=5, poll_interval=1,
                             input_path=None, max_idle_polls=None):
    """Process the backlog of a JSON-lines feed, then tail it for new events.

    The file is opened once and the same handle is used for the backlog and for
    live tailing, so no event is read twice and nothing appended in between is
    missed. Events are analyzed in batches of ``batch_size``; events that do not
    fill a batch wait in the queue until enough new ones arrive.

    Args:
        analyzer: Object with an ``analyze(DataFrame)`` method returning the
            result columns printed by the batch report.
        batch_size: Number of events per analyzed batch (must be >= 1).
        poll_interval: Seconds to sleep when no new data is available.
        input_path: Feed file to monitor; defaults to ``LIVE_INPUT_PATH``.
        max_idle_polls: Stop after this many consecutive polls without new
            data. ``None`` (default) monitors forever.

    Returns:
        The total number of events analyzed (only reachable when
        ``max_idle_polls`` is set).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    feed_path = LIVE_INPUT_PATH if input_path is None else input_path
    new_logs_queue = deque()
    total_processed_count = 0
    pending = ''      # unterminated tail of the feed, waiting for the rest of the line
    idle_polls = 0
    f = None

    def _now():
        return datetime.now().strftime('%H:%M:%S')

    def _process_next_batch(label):
        nonlocal total_processed_count
        batch_list = [new_logs_queue.popleft() for _ in range(batch_size)]
        results = analyzer.analyze(pd.DataFrame(batch_list))
        total_processed_count += len(batch_list)
        _print_batch_report(results, label, total_processed_count)

    print("\n--- 🧠 Continuous Analyzer Started ---")
    print(f"Monitoring Live Feed at: {feed_path}")

    try:
        # CATCH-UP: read everything already in the file and analyze full batches.
        try:
            if os.path.exists(feed_path):
                f = open(feed_path, 'r', encoding="utf-8")
                entries, pending, skipped = _read_available_entries(f, pending)
                new_logs_queue.extend(entries)

                print(f"[{_now()}] ANALYZER: Found {len(new_logs_queue)} backlog events. Starting detailed batch analysis...")
                if skipped:
                    print(f"[{_now()}] ANALYZER: Skipped {skipped} malformed backlog line(s).")

                while len(new_logs_queue) >= batch_size:
                    _process_next_batch("BACKLOG BATCH")

                print(f"[{_now()}] ANALYZER: Finished initial processing. Total processed: {total_processed_count}")
        except Exception as e:
            print(f"[{_now()}] ANALYZER: Error during initial catch-up: {e}. Clearing queue.")
            new_logs_queue.clear()

        # LIVE MONITORING: keep reading from where the catch-up stopped.
        while True:
            if f is None:
                try:
                    f = open(feed_path, 'r', encoding="utf-8")
                except FileNotFoundError:
                    idle_polls += 1
                    if max_idle_polls is not None and idle_polls >= max_idle_polls:
                        return total_processed_count
                    time.sleep(5)  # feed not created yet; retry less aggressively
                    continue

            entries, pending, skipped = _read_available_entries(f, pending)
            new_logs_queue.extend(entries)
            if skipped:
                print(f"[{_now()}] ANALYZER: Skipped {skipped} malformed line(s) in live feed.")

            if len(new_logs_queue) >= batch_size:
                idle_polls = 0
                try:
                    _process_next_batch("LIVE BATCH ANALYSIS")
                except Exception as e:
                    # Only the failing batch (already removed from the queue) is
                    # dropped; other queued events stay for the next batch.
                    print(f"[{_now()}] ANALYZER: Runtime Error: {e}. Skipping batch.")
            elif entries:
                idle_polls = 0
            else:
                idle_polls += 1
                if max_idle_polls is not None and idle_polls >= max_idle_polls:
                    return total_processed_count
                time.sleep(poll_interval)
    finally:
        # Runs on normal return, errors and KeyboardInterrupt alike.
        if f is not None:
            f.close()



In [79]:
# --- Main Analyzer Execution Block ---

if __name__ == "__main__":
    # Local import: only this entry-point block needs it.
    import traceback

    # 1. Mount Drive (Essential for file access)
    try:
        drive.mount('/content/drive', force_remount=True)
    except Exception as e:
        print(f"Drive Mounting Warning: {e}")

    # 2. Train the Analyzer (using stability-enhanced data)
    event_analyzer = None
    training_succeeded = False
    try:
        print(f"\nAttempting to load and train model using: {TRAIN_FILE_PATH}")

        # Load data including the custom rule range booster
        df_train = load_and_prepare_training_data(TRAIN_FILE_PATH)

        event_analyzer = WazuhEventAnalyzer(lr=0.05, epochs=2000)
        event_analyzer.fit(df_train)
        training_succeeded = True

        print("\n==================================================")
        print("SERVER: WazuhEventAnalyzer **RETRAINED** with custom rule ranges.")
        print("==================================================")

    except KeyboardInterrupt:
        print("\nANALYZER STOPPED by user.")
    except Exception as e:
        print(f"\nCRITICAL ANALYZER STARTUP ERROR: {e}")
        print(f"Please check path: '{TRAIN_FILE_PATH}'.")
        # Keep the traceback: the cause is not always the training path.
        traceback.print_exc()

    # 3. Start the continuous monitoring loop.
    # Handled separately so a failure while monitoring is not reported as a
    # startup problem with the training file.
    if training_succeeded:
        try:
            continuous_live_analysis(event_analyzer)
        except KeyboardInterrupt:
            print("\nANALYZER STOPPED by user.")
        except Exception as e:
            print(f"\nCRITICAL ANALYZER RUNTIME ERROR: {e}")
            traceback.print_exc()

Mounted at /content/drive

Attempting to load and train model using: /content/drive/MyDrive/Capstone Mark-01/wazuh_sample(json).json
Classifier trained with 802 samples. Feature columns: ['rule_level_scaled', 'rule_id_scaled', 'severity_encoded', 'status_encoded', 'Anomaly_Flag']

SERVER: WazuhEventAnalyzer **RETRAINED** with custom rule ranges.

--- 🧠 Continuous Analyzer Started ---
Monitoring Live Feed at: /content/drive/MyDrive/Capstone Mark-01/live_security_feed.json
[19:49:48] ANALYZER: Found 37 backlog events. Starting detailed batch analysis...

[19:49:48] --- BACKLOG BATCH (Total: 5) ---
| timestamp                        | srcip        | severity   |   rule_id |   rule_level |   Anomaly_Flag |   Seq_Alert |   ML_Proba |   Final_Score |   FinalAlert |
|:---------------------------------|:-------------|:-----------|----------:|-------------:|---------------:|------------:|-----------:|--------------:|-------------:|
| 2025-10-30 19:48:49.815000+00:00 | 172.16.1.10  | high       