In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import os
import pickle
import json

# --- PyTorch ANN Model Definition ---
class ANNModel(nn.Module):
    """Feed-forward binary classifier: input_dim -> 128 -> 64 -> 1.

    ReLU follows both hidden layers, dropout (p=0.5) is applied after the
    first hidden layer only, and a sigmoid squashes the single output unit.
    """

    def __init__(self, input_dim):
        super().__init__()
        # The attribute names below become the state_dict keys, so the
        # checkpoint written with torch.save(...state_dict()) depends on them.
        self.layer_1 = nn.Linear(input_dim, 128)
        self.layer_2 = nn.Linear(128, 64)
        self.layer_3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid-activated output of the final layer for ``x``."""
        first_hidden = self.dropout(self.relu(self.layer_1(x)))
        second_hidden = self.relu(self.layer_2(first_hidden))
        return self.sigmoid(self.layer_3(second_hidden))

# --- 0. Setup and File Management ---
def clear_stale_model_files():
    """Remove previously saved model and preprocessing artifacts.

    Paths are relative to the current working directory; artifacts that do
    not exist are skipped.
    """
    print("--- Clearing stale model files ---")
    stale_artifacts = (
        'ann_model.pth', 'rf_model.pkl', 'iso_forest_model.pkl',
        'scaler.pkl', 'feature_columns.pkl', 'label_encoders.pkl',
    )
    # filter() is lazy, so each existence check runs right before its removal.
    for artifact in filter(os.path.exists, stale_artifacts):
        os.remove(artifact)

def create_default_policy():
    """Write a default ``policy.json`` to the working directory if none exists.

    An existing ``policy.json`` is left untouched.
    """
    if os.path.exists('policy.json'):
        return

    # (policy name, confidence threshold, actions in priority order)
    tiers = (
        ("LowConfidenceThreat", 0.50, ["MONITOR", "LOG_EVENT"]),
        ("MediumConfidenceThreat", 0.75, ["FLAG_FOR_REVIEW", "MONITOR"]),
        ("HighConfidenceThreat", 0.90, ["BLOCK_IP", "ISOLATE_HOST"]),
        ("KnownThreatIoC", 0.99, ["BLOCK_IP", "ISOLATE_HOST", "RUN_SCAN"]),
    )
    default_policy = {
        tier_name: {"confidence_threshold": threshold, "priority_actions": actions}
        for tier_name, threshold, actions in tiers
    }
    with open('policy.json', 'w') as policy_handle:
        json.dump(default_policy, policy_handle, indent=2)
    print("--- Created default policy.json ---")

# --- 1. Data Loading and Preprocessing ---
def preprocess_data(net_path='Problem_Statement/network_traffic_logs.csv', sys_path='Problem_Statement/system_event_logs.csv', intel_path='Problem_Statement/threat_intelligence.csv'):
    """Load the three CSV sources and build a scaled feature matrix with labels.

    Each network-traffic row is enriched with:
      * ``is_known_threat_ip`` - 1 when its source or destination IP appears
        among the non-phishing IOCs of the threat-intelligence file;
      * counts of 'Failed Login' / 'Privilege Escalation' system events and a
        system-threat flag, taken from the 10 hours up to and including the
        row's timestamp.

    Args:
        net_path: CSV of network traffic logs.
        sys_path: CSV of system event logs.
        intel_path: CSV of threat intelligence (columns 'Threat Type', 'IOC').

    Returns:
        Tuple ``(X_scaled, y, original_df, scaler, feature_columns, encoders)``:
        the standardised feature array, the ``Combined_Threat`` label array,
        the enriched frame before label encoding and column dropping, the
        fitted ``StandardScaler``, the feature column names, and a dict mapping
        each encoded categorical column to its fitted ``LabelEncoder``.

    Raises:
        FileNotFoundError: if any of the CSV paths does not exist.
    """
    print("--- Starting Data Preprocessing ---")
    df_net = pd.read_csv(net_path, skipinitialspace=True)
    df_sys = pd.read_csv(sys_path, skipinitialspace=True)
    df_intel = pd.read_csv(intel_path, skipinitialspace=True)

    # skipinitialspace only removes whitespace that follows a delimiter, so
    # trailing blanks in header names must be stripped explicitly.
    for frame in (df_net, df_sys, df_intel):
        frame.columns = frame.columns.str.strip()

    # Missing IOCs are dropped so a blank IP can never match a blank IOC.
    known_threat_ips = set(df_intel.loc[df_intel['Threat Type'] != 'Phishing', 'IOC'].dropna())

    # Vectorised membership test instead of a Python-level row-wise apply.
    df_net['is_known_threat_ip'] = (
        df_net['Source IP'].isin(known_threat_ips)
        | df_net['Destination IP'].isin(known_threat_ips)
    ).astype(int)

    df_net['Timestamp'] = pd.to_datetime(df_net['Timestamp'])
    df_sys['Timestamp'] = pd.to_datetime(df_sys['Timestamp'])
    df_sys.sort_values('Timestamp', inplace=True)

    lookback_window = pd.Timedelta(hours=10)
    sys_times = df_sys['Timestamp']
    sys_event_types = df_sys['Event Type']
    sys_is_threat = df_sys['Threat'] == 'Yes'

    aggregated_features = []
    for end_time in df_net['Timestamp']:
        # Window is (end_time - lookback, end_time]: open at the start, closed at the end.
        in_window = (sys_times > end_time - lookback_window) & (sys_times <= end_time)
        recent_event_types = sys_event_types[in_window]
        aggregated_features.append({
            'timestamp': end_time,
            'recent_failed_logins': int((recent_event_types == 'Failed Login').sum()),
            'recent_priv_escalations': int((recent_event_types == 'Privilege Escalation').sum()),
            'is_system_threat_present': int(sys_is_threat[in_window].any()),
        })

    # The aggregates are row-aligned with df_net, so attach them by index.
    # Merging on the timestamp instead would pair every network row with the
    # aggregate of every other row sharing that timestamp (k duplicates -> k*k
    # rows), silently inflating the dataset.
    df_agg_sys = pd.DataFrame(
        aggregated_features,
        index=df_net.index,
        columns=['timestamp', 'recent_failed_logins', 'recent_priv_escalations', 'is_system_threat_present'],
    )
    df = pd.concat([df_net, df_agg_sys], axis=1).fillna(0)

    original_df = df.copy()

    # LabelEncoder assigns codes in sorted label order.
    # NOTE(review): presumably 'Threat' holds 'No'/'Yes' so the codes are 0/1 - confirm against the data.
    df['Threat'] = LabelEncoder().fit_transform(df['Threat'])
    df['Combined_Threat'] = (df['Threat'] | df['is_system_threat_present'] | df['is_known_threat_ip']).astype(int)

    # NOTE(review): 'is_known_threat_ip' stays in the features while also
    # feeding Combined_Threat, so the label is partly derivable from an input.
    df_features = df.drop(['Timestamp', 'Source IP', 'Destination IP', 'Threat Type', 'Threat', 'timestamp', 'is_system_threat_present', 'Combined_Threat'], axis=1)

    encoders = {}
    for col in ['Protocol']:
        le = LabelEncoder()
        df_features[col] = le.fit_transform(df_features[col].astype(str))
        encoders[col] = le

    X = df_features
    y = df['Combined_Threat']

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    print("--- Data Preprocessing Complete ---\n")
    return X_scaled, y.values, original_df, scaler, X.columns, encoders

# --- 2. Model Training ---
def train_models(X, y, scaler, encoders):
    """Fit the three detectors on a 70/30 stratified split and persist everything.

    Trains a Random Forest, the PyTorch ``ANNModel`` (50 epochs, batch size 32,
    Adam with lr=0.001, BCE loss) and an Isolation Forest, prints the test
    accuracy of the two supervised models, then writes the models together
    with ``scaler`` and ``encoders`` to the working directory.

    Args:
        X: Scaled feature array.
        y: Binary label array aligned with ``X``.
        scaler: Fitted scaler, saved as ``scaler.pkl``.
        encoders: Fitted label encoders, saved as ``label_encoders.pkl``.
    """
    print("\n--- Training Hybrid Detection Models ---")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y
    )

    # a) Supervised: Random Forest
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    forest_accuracy = forest.score(X_test, y_test)
    print(f"Random Forest Test Accuracy: {forest_accuracy*100:.2f}%")

    # b) Supervised: ANN. The network is built before the loader so the torch
    # RNG is consumed in a fixed order (weight init first, then shuffling).
    network = ANNModel(X_train.shape[1])
    loss_fn = nn.BCELoss()
    adam = optim.Adam(network.parameters(), lr=0.001)

    def as_float_tensor(array):
        return torch.tensor(array.astype(np.float32))

    train_features = as_float_tensor(X_train)
    train_targets = as_float_tensor(y_train).reshape(-1, 1)
    test_features = as_float_tensor(X_test)
    test_targets = as_float_tensor(y_test).reshape(-1, 1)

    batches = DataLoader(
        dataset=TensorDataset(train_features, train_targets),
        batch_size=32,
        shuffle=True,
    )

    network.train()
    for _ in range(50):
        for batch_features, batch_targets in batches:
            adam.zero_grad()
            batch_loss = loss_fn(network(batch_features), batch_targets)
            batch_loss.backward()
            adam.step()

    # Evaluate with dropout disabled and no gradient tracking.
    network.eval()
    with torch.no_grad():
        predicted_classes = network(test_features).round()
        ann_accuracy = accuracy_score(test_targets, predicted_classes)
    print(f"ANN Model Test Accuracy: {ann_accuracy*100:.2f}%")

    # c) Unsupervised: Isolation Forest
    isolation_forest = IsolationForest(contamination='auto', random_state=42)
    isolation_forest.fit(X_train)

    # Persist the ANN weights first, then the pickled artifacts.
    torch.save(network.state_dict(), 'ann_model.pth')
    pickled_artifacts = (
        ('rf_model.pkl', forest),
        ('iso_forest_model.pkl', isolation_forest),
        ('scaler.pkl', scaler),
        ('label_encoders.pkl', encoders),
    )
    for artifact_path, artifact in pickled_artifacts:
        with open(artifact_path, 'wb') as handle:
            pickle.dump(artifact, handle)
    print("--- All Models and Preprocessing Objects Saved ---")

# --- 3. Multi-Agent System ---
class DetectionAgent:
    """Scores one event by averaging ANN, Random Forest and Isolation Forest outputs."""

    def __init__(self, input_dim, feature_columns):
        """Load the saved models and preprocessing objects from the working directory.

        Args:
            input_dim: Number of input features the saved ANN was built with.
            feature_columns: Feature names, in the order the scaler expects.
        """
        print("--- Initializing Detection Agent ---")
        network = ANNModel(input_dim)
        network.load_state_dict(torch.load('ann_model.pth'))
        network.eval()
        self.ann_model = network

        # NOTE: pickle.load can execute arbitrary code; these files are the
        # ones written by train_models, so only load them from a trusted location.
        self.rf_model = self._unpickle('rf_model.pkl')
        self.iso_forest_model = self._unpickle('iso_forest_model.pkl')
        self.scaler = self._unpickle('scaler.pkl')
        self.encoders = self._unpickle('label_encoders.pkl')
        self.feature_columns = feature_columns

    @staticmethod
    def _unpickle(path):
        """Return the object stored in the pickle file at ``path``."""
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def analyze(self, new_data):
        """Run the hybrid models on a single event and return an alert dict.

        Args:
            new_data: Mapping of column name to value for one event; must
                contain every feature column and every encoded column.

        Returns:
            Dict with ``is_threat`` (bool), ``is_known_threat`` (bool),
            ``confidence`` (percentage string with two decimals) and
            ``details`` (the event as passed in).
        """
        print("\n>>> [Detection Agent] Analyzing new data with hybrid model...")
        event_frame = pd.DataFrame([new_data])

        # Encode on a copy so the reported details keep the raw values.
        encoded_frame = event_frame.copy()
        for column, encoder in self.encoders.items():
            encoded_frame[column] = encoder.transform(encoded_frame[column].astype(str))

        model_input = encoded_frame[self.feature_columns]
        scaled_input = self.scaler.transform(model_input)

        with torch.no_grad():
            input_tensor = torch.tensor(scaled_input.astype(np.float32))
            ann_probability = self.ann_model(input_tensor).item()

        class_probabilities = self.rf_model.predict_proba(scaled_input)
        if class_probabilities.shape[1] > 1:
            forest_probability = class_probabilities[0][1]
        elif self.rf_model.classes_[0] == 1:
            # The forest only ever saw the positive class.
            forest_probability = 1.0
        else:
            forest_probability = 0.0

        # The Isolation Forest votes 1 when it labels the event -1, else 0.
        flagged_as_anomaly = self.iso_forest_model.predict(scaled_input)[0] == -1
        anomaly_vote = 1 if flagged_as_anomaly else 0
        final_confidence = (ann_probability + forest_probability + anomaly_vote) / 3.0

        known_threat_flag = model_input['is_known_threat_ip'].iloc[0]
        alert = {
            'is_threat': bool(final_confidence > 0.5),
            'is_known_threat': bool(known_threat_flag == 1),
            'confidence': f"{final_confidence:.2%}",
            'details': event_frame.to_dict('records')[0]
        }
        print(f">>> [Detection Agent] Analysis complete. Threat Detected: {alert['is_threat']} (Confidence: {alert['confidence']})")
        return alert

class ResponseAgent:
    """Chooses response actions for an alert from a JSON policy file."""

    # Confidence tiers, checked from most to least severe.
    _CONFIDENCE_TIERS = ("HighConfidenceThreat", "MediumConfidenceThreat", "LowConfidenceThreat")

    def __init__(self, policy_file='policy.json'):
        """Load the policy mapping from ``policy_file``."""
        print("--- Initializing Response Agent ---")
        with open(policy_file, 'r') as policy_handle:
            self.policy = json.load(policy_handle)

    def formulate_response(self, alert):
        """Return the policy actions matching ``alert``.

        A known-threat alert always maps to ``KnownThreatIoC``. Otherwise the
        first tier whose threshold the confidence strictly exceeds is used.

        Returns:
            The tier's ``priority_actions`` list, or a plain message string
            when no tier applies.
        """
        print(">>> [Response Agent] Consulting policy...")
        # The alert carries confidence as a percentage string such as '87.50%'.
        confidence = float(alert['confidence'].strip('%')) / 100

        if alert['is_known_threat']:
            threat_level = "KnownThreatIoC"
        else:
            # Generator keeps the policy lookups lazy: lower tiers are only
            # consulted when the higher ones did not match.
            threat_level = next(
                (
                    tier for tier in self._CONFIDENCE_TIERS
                    if confidence > self.policy[tier]['confidence_threshold']
                ),
                None,
            )
            if threat_level is None:
                return "No action required. Continue monitoring."

        actions = self.policy[threat_level]['priority_actions']
        print(f">>> [Response Agent] Threat Level '{threat_level}'. Executing actions: {actions}")
        return actions

class LearningAgent:
    """Feedback stage of the pipeline.

    Only the reward heuristic is implemented; ``update_policy`` and
    ``update_models`` are placeholders that just print a message.
    """

    def __init__(self, log_file='incident_log.csv'):
        """Remember ``log_file``; nothing is read from or written to it here."""
        print("--- Initializing Learning Agent (RL Core) ---")
        self.log_file = log_file

    def get_feedback_and_reward(self, alert, action):
        """Return +1 when ``action`` suits the alert confidence, otherwise -1.

        Blocking counts as correct above 80% confidence and monitoring below
        it; every other combination (including exactly 80%) scores -1.
        """
        print(">>> [Learning Agent] Analyzing action results...")
        confidence = float(alert['confidence'].strip('%')) / 100

        if 'BLOCK_IP' in action and confidence > 0.8:
            reward = 1
        elif 'MONITOR' in action and confidence < 0.8:
            reward = 1
        else:
            reward = -1

        print(f">>> [Learning Agent] Action effectiveness assessed. Reward: {reward}")
        return reward

    def update_policy(self, reward):
        """Placeholder: announce the reward without changing any policy."""
        print(f">>> [Learning Agent] Policy refinement signal sent (Reward: {reward}).")

    def update_models(self, alert, action):
        """Placeholder: announce a model update without retraining anything."""
        print(">>> [Learning Agent] Model update signal sent.")

# --- 4. Main Execution ---
def main():
    """Retrain everything from scratch, then push one sample event through the agents."""
    clear_stale_model_files()
    create_default_policy()

    # --- Training Phase ---
    # preprocess_data() reads its three CSV inputs from their default paths;
    # a missing file surfaces as FileNotFoundError.
    X_scaled, y, _original_df, scaler, feature_columns, encoders = preprocess_data()
    with open('feature_columns.pkl', 'wb') as columns_handle:
        pickle.dump(feature_columns, columns_handle)
    train_models(X_scaled, y, scaler, encoders)

    # --- Operational Phase ---
    section_break = "\n" + "=" * 50
    print(section_break)
    print("--- Initializing Multi-Agent Cybersecurity Watchdog ---")
    detection_agent = DetectionAgent(input_dim=X_scaled.shape[1], feature_columns=feature_columns)
    response_agent = ResponseAgent()
    learning_agent = LearningAgent()

    # --- Simulation Loop ---
    print(section_break)
    print("--- Simulating New Incoming Data Event ---")

    # One hand-written event: the model features plus the two raw IP fields.
    new_event_data = {
        'Packet Size (bytes)': 1490.0,
        'Flow Duration (s)': 0.2,
        'Protocol': 'TCP',
        'is_known_threat_ip': 0,
        'recent_failed_logins': 8,
        'recent_priv_escalations': 2,
        'Source IP': '198.51.100.23',
        'Destination IP': '10.0.0.5',
    }

    # Detect -> respond -> learn, each stage consuming the previous result.
    alert = detection_agent.analyze(new_event_data)
    actions = response_agent.formulate_response(alert)
    reward = learning_agent.get_feedback_and_reward(alert, actions)
    learning_agent.update_policy(reward)
    learning_agent.update_models(alert, actions)

    print("\n--- System Cycle Complete ---")


if __name__ == "__main__":
    # Seed the torch and NumPy global RNGs for reproducibility.
    torch.manual_seed(42)
    np.random.seed(42)
    try:
        main()
    except FileNotFoundError as err:
        # Report the path that actually failed: the data files default to the
        # 'Problem_Statement/' directory, and model or policy files opened
        # during the run raise this same exception type.
        print(f"\nERROR: Required file not found: {err.filename!r}")
        print("Data files ('network_traffic_logs.csv', 'system_event_logs.csv', 'threat_intelligence.csv') are read from 'Problem_Statement/' by default.")
        print("Please create these CSV files with appropriate data to run the pipeline.")

--- Clearing stale model files ---
--- Starting Data Preprocessing ---
--- Data Preprocessing Complete ---


--- Training Hybrid Detection Models ---
Random Forest Test Accuracy: 99.07%
ANN Model Test Accuracy: 99.07%
--- All Models and Preprocessing Objects Saved ---

--- Initializing Multi-Agent Cybersecurity Watchdog ---
--- Initializing Detection Agent ---
--- Initializing Response Agent ---
--- Initializing Learning Agent (RL Core) ---

--- Simulating New Incoming Data Event ---

>>> [Detection Agent] Analyzing new data with hybrid model...
>>> [Detection Agent] Analysis complete. Threat Detected: True (Confidence: 100.00%)
>>> [Response Agent] Consulting policy...
>>> [Response Agent] Threat Level 'HighConfidenceThreat'. Executing actions: ['BLOCK_IP', 'ISOLATE_HOST']
>>> [Learning Agent] Analyzing action results...
>>> [Learning Agent] Action effectiveness assessed. Reward: 1
>>> [Learning Agent] Policy refinement signal sent (Reward: 1).
>>> [Learning Agent] Model update signal