<a href="https://colab.research.google.com/github/CyberMetrics/Prototypes/blob/prototype02_modified/Prototype_002.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [123]:
from google.colab import drive;drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [124]:
import pandas as pd
import numpy as np
import json
import time
from collections import deque
from sklearn.preprocessing import StandardScaler, LabelEncoder
from pandas import json_normalize
from google.colab import drive
from datetime import datetime
import os
import plotly.express as px
import plotly.graph_objects as go
import threading
from IPython.display import display, clear_output

# --- Configuration: Define the Drive Paths ---
DRIVE_BASE_PATH = '/content/drive/MyDrive/Capstone Mark-01/'
TRAIN_FILE_PATH = DRIVE_BASE_PATH + 'wazuh_sample(json).json'
LIVE_INPUT_PATH = DRIVE_BASE_PATH + 'live_security_feed.json'

# --- Global Data Storage for Dashboard ---
# This DataFrame accumulates all analyzed events for plotting historical trends.
ANALYSIS_RESULTS_DF = pd.DataFrame()
ANALYZER_IS_ACTIVE = True

In [125]:
# ====================================================================
# 1. CORE ML MODEL (LOGISTIC REGRESSION) & WAZUH ANALYZER CLASSES
# (These remain the same as the validated code)
# ====================================================================

class SimpleLogisticRegression:
    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        self.w = None
        self.b = 0
    def _sigmoid(self, z):
        z_clip = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z_clip))
    def fit(self, X, y):
        X = np.array(X, dtype=float)
        y = np.array(y, dtype=float)
        n, d = X.shape
        self.w = np.zeros(d)
        self.b = 0
        for _ in range(self.epochs):
            z = X.dot(self.w) + self.b
            pred = self._sigmoid(z)
            grad_w = (1/n) * X.T.dot(pred - y)
            grad_b = (1/n) * np.sum(pred - y)
            self.w -= self.lr * grad_w
            self.b -= self.lr * grad_b
    def predict_proba(self, X):
        X = np.array(X, dtype=float)
        z = X.dot(self.w) + self.b
        return self._sigmoid(z)

class WazuhEventAnalyzer:
    def __init__(self, lr=0.01, epochs=1000):
        self.classifier = SimpleLogisticRegression(lr=lr, epochs=epochs)
        self.label_encoders = {}
        self.is_trained = False
        self.rule_counts = pd.Series()
        self.feature_cols = [
            'rule_level_scaled', 'rule_id_scaled', 'severity_encoded',
            'status_encoded', 'Anomaly_Flag'
        ]

    def _preprocess(self, df):
        df_proc = df.copy()
        df_proc['is_critical_event'] = df_proc['severity'].apply(
            lambda x: 1 if str(x).lower() in ['high', 'critical'] else 0
        )

        categorical_cols = ['agent_name', 'status', 'severity', 'action', 'event_type', 'system']
        for col in categorical_cols:
            if col in df_proc.columns:
                le = self.label_encoders.get(col, LabelEncoder())
                def safe_transform(x):
                    x_str = str(x)
                    if col not in self.label_encoders: return le.fit_transform([x_str])[0]
                    try: return le.transform([x_str])[0]
                    except ValueError: return -1

                df_proc[f'{col}_encoded'] = df_proc[col].apply(safe_transform)
                if col not in self.label_encoders: self.label_encoders[col] = le

        numeric_cols_to_scale = ['rule_id', 'rule_level']
        for col in numeric_cols_to_scale:
            if col in df_proc.columns:
                data = df_proc[col].values.reshape(-1, 1)
                scaler = self.label_encoders.get(col)
                if scaler is None:
                    scaler = StandardScaler()
                    df_proc[f'{col}_scaled'] = scaler.fit_transform(data)
                    self.label_encoders[col] = scaler
                else:
                    df_proc[f'{col}_scaled'] = scaler.transform(data)

        df_proc['timestamp'] = pd.to_datetime(df_proc['timestamp'], errors='coerce')
        df_proc = df_proc.sort_values('timestamp').reset_index(drop=True)
        time_diff = df_proc['timestamp'].diff().dt.total_seconds().fillna(0)
        df_proc['time_delta_inv'] = 1 / (time_diff + 1e-6)
        return df_proc

    def fit(self, df):
        df_proc = self._preprocess(df)
        self.rule_counts = df_proc["rule_id"].value_counts()
        df_proc['Anomaly_Flag'] = df_proc["rule_id"].apply(
            lambda x: 1 if self.rule_counts.get(x, 0) <= 1 else 0
        )
        X = df_proc[self.feature_cols]
        y = df_proc['is_critical_event']
        self.classifier.fit(X.values, y.values)
        self.is_trained = True
        print(f"Classifier trained with {len(X)} samples. Feature columns: {self.feature_cols}")

    def analyze(self, df):
        if df.empty: return pd.DataFrame()
        df_proc = self._preprocess(df)

        def get_anomaly_flag(rule_id):
            is_rare_or_new = self.rule_counts.get(rule_id, 0) <= 1
            return 1 if is_rare_or_new else 0

        df_proc['Anomaly_Flag'] = df_proc["rule_id"].apply(get_anomaly_flag)
        df_proc['ML_Proba'] = self.classifier.predict_proba(df_proc[self.feature_cols].values)

        seq_alert_check = df_proc.groupby('srcip')['time_delta_inv'].transform(
             lambda x: x.rolling(window=3, min_periods=1).mean()
        )
        df_proc['Seq_Alert'] = (df_proc['time_delta_inv'] > seq_alert_check.shift(1).fillna(0) * 2).astype(int)

        df_proc['Final_Score'] = (
            df_proc['ML_Proba'] * 0.5 + df_proc['Anomaly_Flag'] * 0.3 + df_proc['Seq_Alert'] * 0.2
        )
        df_proc['FinalAlert'] = (df_proc['Final_Score'] > 0.75).astype(int)

        return df_proc[['timestamp', 'srcip', 'severity', 'rule_id', 'rule_level',
                        'Anomaly_Flag', 'Seq_Alert', 'ML_Proba', 'Final_Score', 'FinalAlert']]


In [126]:

# ====================================================================
# 3. DASHBOARD AND DATA LOADING LOGIC
# ====================================================================

def create_scaler_booster_data(agent_id_prefix):
    """Creates synthetic data mirroring Agent's high rule ranges for stable scaling."""
    return pd.DataFrame([
        {'timestamp': '2025-01-01T00:00:00Z', 'srcip': '1.1.1.1', 'dstip': '2.2.2.2',
         'event_type': 'brute_force_attempt', 'action': 'block', 'status': 'failed',
         'severity': 'Critical', 'group': '[api_booster]', 'message': 'Booster data high level',
         'system': 'linux', 'rule_id': 900101, 'rule_level': 16,
         'agent_id': agent_id_prefix + '99', 'agent_name': 'firewall-gateway'},
        {'timestamp': '2025-01-01T00:00:01Z', 'srcip': '1.1.1.1', 'dstip': '2.2.2.2',
         'event_type': 'network_connection', 'action': 'allow', 'status': 'success',
         'severity': 'Low', 'group': '[api_booster]', 'message': 'Booster data low level',
         'system': 'linux', 'rule_id': 900350, 'rule_level': 4,
         'agent_id': agent_id_prefix + '99', 'agent_name': 'firewall-gateway'},
    ])

def load_and_prepare_training_data(file_path):
    """Loads and flattens training data, injecting custom rules for stable scaling."""
    df_full = pd.read_json(file_path, lines=True)
    df_flat = json_normalize(df_full.to_dict(orient='records'), sep='.')

    required_context_cols = ['timestamp', 'srcip', 'dstip', 'event_type', 'action', 'status',
                             'severity', 'group', 'message', 'system']

    final_df_original = df_flat[required_context_cols].copy()
    final_df_original['rule_id'] = df_flat['rule.id']
    final_df_original['rule_level'] = df_flat['rule.level']
    final_df_original['agent_id'] = df_flat['agent.id']
    final_df_original['agent_name'] = df_flat['agent.name']

    df_booster = create_scaler_booster_data('00')
    df_combined = pd.concat([final_df_original.head(800), df_booster], ignore_index=True)

    df_combined['rule_id'] = pd.to_numeric(df_combined['rule_id'], errors='coerce')
    df_combined['rule_level'] = pd.to_numeric(df_combined['rule_level'], errors='coerce')

    return df_combined

def display_live_dashboard():
    """Generates and displays the dashboard using accumulated data."""
    global ANALYSIS_RESULTS_DF

    if ANALYSIS_RESULTS_DF.empty:
        print("Dashboard: Waiting for first batch of data...")
        return

    clear_output(wait=True)

    # --- 1. Alert Distribution (Pie Chart) ---
    alert_counts = ANALYSIS_RESULTS_DF['FinalAlert'].map({1: 'Critical', 0: 'Non-Critical'}).value_counts().reset_index()
    alert_counts.columns = ['AlertStatus', 'Count']

    fig1 = px.pie(
        alert_counts,
        names='AlertStatus',
        values='Count',
        title='Total Critical vs Non-Critical Alerts',
        color='AlertStatus',
        color_discrete_map={'Critical': 'red', 'Non-Critical': 'lightgreen'},
        hole=0.4
    )
    fig1.update_traces(textinfo='value+percent', pull=[0.1 if s == 'Critical' else 0 for s in alert_counts['AlertStatus']])
    fig1.show()

    # --- 2. Severity Score Over Time (Line Chart) ---
    fig2 = px.line(
        ANALYSIS_RESULTS_DF,
        x='timestamp',
        y='Final_Score',
        color=ANALYSIS_RESULTS_DF['FinalAlert'].map({1: 'Critical (1)', 0: 'Non-Critical (0)'}),
        title='Event Severity Score Over Time (FinalAlert = 1 is Threat)',
        color_discrete_map={'Critical (1)': 'red', 'Non-Critical (0)': 'blue'},
        markers=True
    )
    fig2.update_layout(showlegend=True, yaxis_title="Final Score")
    fig2.show()


In [127]:
def continuous_live_analysis(analyzer):
    """Monitors the Drive file for new lines and processes them in batches."""
    # --- FIX: Global declarations moved to the top of the function ---
    global ANALYSIS_RESULTS_DF, ANALYZER_IS_ACTIVE

    BATCH_SIZE = 5
    f = None
    new_logs_queue = deque()
    total_processed_count = 0

    print("\n--- 🧠 Continuous Analyzer Started ---")
    print(f"Monitoring Live Feed at: {LIVE_INPUT_PATH}")

    # CATCH-UP LOGIC: Reads all existing data on startup (FIXES MISSING BACKLOG DATA)
    try:
        # Load and process any backlog data
        if os.path.exists(LIVE_INPUT_PATH):
            with open(LIVE_INPUT_PATH, 'r', encoding="utf-8") as init_f:
                for line in init_f:
                    try:
                        log_entry = json.loads(line)
                        new_logs_queue.append(log_entry)
                    except json.JSONDecodeError:
                        pass

        # ... (Rest of the backlog processing logic remains the same) ...
        # (This section now correctly uses the globally accessible ANALYSIS_RESULTS_DF)

        while len(new_logs_queue) >= BATCH_SIZE:
            batch_list = [new_logs_queue.popleft() for _ in range(BATCH_SIZE)]
            df_batch = pd.DataFrame(batch_list)
            results = analyzer.analyze(df_batch)
            total_processed_count += len(df_batch)

            # ACCUMULATE RESULTS
            ANALYSIS_RESULTS_DF = pd.concat([ANALYSIS_RESULTS_DF, results], ignore_index=True)
            print(f"[{datetime.now().strftime('%H:%M:%S')}] ANALYZER: Processed backlog batch. Total: {total_processed_count}")

        print(f"[{datetime.now().strftime('%H:%M:%S')}] ANALYZER: Finished initial processing. Total processed: {total_processed_count}")

    except Exception as e:
        print(f"[{datetime.now().strftime('%H:%M:%S')}] ANALYZER: Error during initial catch-up: {e}. Clearing queue.")
        new_logs_queue.clear()

    # --- LIVE MONITORING LOOP ---
    while ANALYZER_IS_ACTIVE:
        if f is None:
             try:
                f = open(LIVE_INPUT_PATH, 'r', encoding="utf-8")
                f.seek(0, 1)
             except FileNotFoundError:
                time.sleep(5)
                continue

        new_line = f.readline()

        if new_line:
            try:
                log_entry = json.loads(new_line)
                new_logs_queue.append(log_entry)

                if len(new_logs_queue) >= BATCH_SIZE:
                    batch_list = [new_logs_queue.popleft() for _ in range(BATCH_SIZE)]
                    df_batch = pd.DataFrame(batch_list)
                    results = analyzer.analyze(df_batch)
                    total_processed_count += len(df_batch)

                    # ACCUMULATE RESULTS
                    # The global statement is implicitly working here now
                    ANALYSIS_RESULTS_DF = pd.concat([ANALYSIS_RESULTS_DF, results], ignore_index=True)

                    # Log activity
                    critical_count = len(results[results['FinalAlert'] == 1])
                    print(f"[{datetime.now().strftime('%H:%M:%S')}] LIVE BATCH PROCESSED. Critical Alerts: {critical_count}. Total: {total_processed_count}")

            except Exception as e:
                print(f"[{datetime.now().strftime('%H:%M:%S')}] ANALYZER: Runtime Error: {e}. Skipping batch.")
                new_logs_queue.clear()

        else:
            time.sleep(1)

In [130]:
# --- RUN THIS CODE IN A SEPARATE CELL ---

# WARNING: Ensure the main Analyzer code is running first!

def print_latest_analysis_detail(num_rows=15):
    """Prints the latest N processed analysis rows for debugging and review."""

    # NOTE: Since global ANALYSIS_RESULTS_DF is defined in the main Analyzer thread,
    # it must be imported/referenced correctly. We assume it's accessible globally
    # across your notebook session.
    global ANALYSIS_RESULTS_DF

    if ANALYSIS_RESULTS_DF.empty:
        print("Analysis results table is currently empty.")
        return

    display_cols = [
        'timestamp', 'srcip', 'severity', 'rule_id', 'rule_level',
        'ML_Proba', 'Seq_Alert', 'Final_Score', 'FinalAlert'
    ]

    # Check if ANALYSIS_RESULTS_DF has the required columns before proceeding
    if not all(col in ANALYSIS_RESULTS_DF.columns for col in display_cols):
        print("Analysis results table not fully initialized yet (missing ML scores).")
        return

    latest_df = ANALYSIS_RESULTS_DF[display_cols].tail(num_rows).copy()

    print(f"\n--- LATEST {len(latest_df)} PROCESSED EVENTS (Detail Log) ---")

    try:
        # We ensure a minimum number of rows are available for clean display
        if len(latest_df) > 0:
            print(latest_df.to_markdown(index=False, floatfmt=".4f"))
        else:
            print("Not enough rows processed yet for the requested display count.")
    except Exception as e:
        print(f"Error displaying detailed table: {e}")
        print(latest_df.to_string(index=False))
    print("--------------------------------------------------")

# --- ACTION: Call the function here! ---
print_latest_analysis_detail(num_rows=15)


--- LATEST 15 PROCESSED EVENTS (Detail Log) ---
| timestamp                        | srcip        | severity   |   rule_id |   rule_level |   ML_Proba |   Seq_Alert |   Final_Score |   FinalAlert |
|:---------------------------------|:-------------|:-----------|----------:|-------------:|-----------:|------------:|--------------:|-------------:|
| 2025-10-30 20:13:31.673000+00:00 | 203.0.113.5  | critical   |    900101 |           15 |     0.4773 |           1 |        0.7386 |            0 |
| 2025-10-30 20:13:32.145000+00:00 | 192.168.1.67 | medium     |    900347 |            5 |     0.5226 |           0 |        0.5613 |            0 |
| 2025-10-30 20:13:32.525000+00:00 | 192.168.1.59 | medium     |    900324 |            5 |     0.5226 |           0 |        0.5613 |            0 |
| 2025-10-30 20:13:32.887000+00:00 | 192.168.1.80 | medium     |    900395 |            5 |     0.5226 |           0 |        0.5613 |            0 |
| 2025-10-30 20:13:33.216000+00:00 | 192.168.1.31 |

In [None]:
 # --- Renamed the function for clarity and avoiding SyntaxError ---
def start_dashboard_loop():
    """Runs the dashboard display function periodically in the main thread."""
    global ANALYZER_IS_ACTIVE
    while ANALYZER_IS_ACTIVE:
        display_live_dashboard() # This updates the plot
        time.sleep(15) # Update the dashboard every 5 seconds

# --- Main Analyzer Execution Block (FIXED AND CLARIFIED) ---

if __name__ == "__main__":

    # 1. Thread Control Initialization
    global ANALYZER_IS_ACTIVE
    ANALYZER_IS_ACTIVE = True

    try:
        # Drive mounting and Training setup
        drive.mount('/content/drive', force_remount=True)

        print(f"\nAttempting to load and train model using: {TRAIN_FILE_PATH}")
        df_train = load_and_prepare_training_data(TRAIN_FILE_PATH)
        event_analyzer = WazuhEventAnalyzer(lr=0.05, epochs=2000)
        event_analyzer.fit(df_train)

        print("\n==================================================")
        print("SERVER: WazuhEventAnalyzer **RETRAINED** with custom rule ranges.")
        print("Starting continuous monitoring and dashboard updates...")
        print("==================================================")

        # 2. Start Monitoring in a Background Thread
        monitor_thread = threading.Thread(target=continuous_live_analysis, args=(event_analyzer,))
        monitor_thread.daemon = True
        monitor_thread.start()

        # 3. Start Dashboard Updater in the Main Thread
        # This function blocks the main kernel execution and refreshes the plots.
        start_dashboard_loop()

    except KeyboardInterrupt:
        # User pressed the Stop button (Ctrl+C)
        ANALYZER_IS_ACTIVE = False
        print("\nANALYZER STOPPED by user. Shutting down threads.")
    except Exception as e:
        ANALYZER_IS_ACTIVE = False
        print(f"\nCRITICAL ANALYZER STARTUP ERROR: {e}")
        print("Please check all class definitions and file paths.")

