In [1]:
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt

In [2]:
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
os.chdir(project_root)  # 👈 this changes the working directory
sys.path.append(project_root)

print("New working directory:", os.getcwd())

New working directory: C:\Users\Atharva Kulkarni\Desktop\Programming\ML PROJECTS\SIEM\SIEM project\AI-driven-SIEM-System


In [3]:
from src.utils.config_loader import load_config
config = load_config("configs/log-anomaly-detection.yml")
print("Raw data path:", config["data_paths"]["input_dir"])

Raw data path: data/logs/raw/


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
import pickle
import json
from datetime import datetime
import warnings
from tqdm import tqdm
from collections import Counter
import re
warnings.filterwarnings('ignore')

In [None]:
# ================== DATA PROCESSING ==================
class LogDataset(Dataset):
    def __init__(self, data, seq_len=8, stride=8, return_indices=False):
        self.seq_len = seq_len
        self.stride = stride
        self.data = data
        self.return_indices = return_indices
        self.samples = []
        self.indices = []
        
        for i in range(0, len(data)-seq_len, stride):
            self.samples.append(data[i:i+seq_len])
            self.indices.append(i)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = torch.tensor(self.samples[idx], dtype=torch.float)
        if self.return_indices:
            return sample, self.indices[idx]
        return sample

def load_and_preprocess(path):
    print("Loading and preprocessing data...")
    df = pd.read_csv(path)
    
    # Store original data for reference
    original_df = df.copy()
    
    # Remove only clearly problematic columns
    columns_to_drop = ["LineId", "Time"]  # Keep Content and EventTemplate for now
    
    # Remove temporal identifiers that cause overfitting
    if "Date" in df.columns:
        columns_to_drop.append("Date")
        print("Removed 'Date' column - temporal identifiers cause overfitting")
    
    if "PID" in df.columns:
        columns_to_drop.append("PID")
        print("Removed 'PID' column - process IDs are not semantically meaningful")
    
    print(f"Dropping columns: {columns_to_drop}")
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    # Define categorical columns properly
    categorical_cols = ["Level", "Component", "EventId"]
    
    # Add EventTemplate if it exists and has reasonable cardinality
    if "EventTemplate" in df.columns:
        unique_templates = df["EventTemplate"].nunique()
        print(f"EventTemplate has {unique_templates} unique values")
        
        if unique_templates < 1000:  # Reasonable threshold
            categorical_cols.append("EventTemplate")
            print("Including EventTemplate as categorical feature")
        else:
            print(f"EventTemplate has too many unique values ({unique_templates}), dropping")
            df = df.drop(columns=["EventTemplate"])
    
    # Handle Content with better feature extraction
    if "Content" in df.columns:
        print("Processing Content column...")
        
        # Basic length and word count features
        df['content_length'] = df['Content'].str.len()
        df['content_word_count'] = df['Content'].str.split().str.len()
        
        # More discriminative error/warning detection with word boundaries
        df['content_has_error'] = df['Content'].str.contains(
            r'\b(error|fail|exception|crash|abort|fault)\b', case=False, na=False
        ).astype(int)
        
        df['content_has_warning'] = df['Content'].str.contains(
            r'\b(warn|alert|caution)\b', case=False, na=False
        ).astype(int)
        
        # More specific numeric patterns
        df['content_has_large_numbers'] = df['Content'].str.contains(
            r'\b\d{4,}\b', na=False
        ).astype(int)
        
        # Critical system indicators
        df['content_has_critical'] = df['Content'].str.contains(
            r'\b(critical|fatal|panic|segfault|timeout|killed)\b', case=False, na=False
        ).astype(int)
        
        # Network/security related patterns
        df['content_has_network'] = df['Content'].str.contains(
            r'\b(connection|socket|port|network|tcp|udp)\b', case=False, na=False
        ).astype(int)
        
        # Memory/resource related patterns
        df['content_has_memory'] = df['Content'].str.contains(
            r'\b(memory|malloc|free|leak|oom|out of memory)\b', case=False, na=False
        ).astype(int)
        
        # Drop the original Content column
        df = df.drop(columns=["Content"])
        print("Extracted features: length, word_count, has_error, has_warning, has_large_numbers, has_critical, has_network, has_memory")
        
        # Validation: Check feature distributions
        content_features = ['content_has_error', 'content_has_warning', 'content_has_large_numbers', 
                           'content_has_critical', 'content_has_network', 'content_has_memory']
        for feature in content_features:
            if feature in df.columns:
                percentage = df[feature].mean() * 100
                print(f"  {feature}: {percentage:.1f}% of logs")
    
    # Remove Month entirely
    if "Month" in df.columns:
        df = df.drop(columns=["Month"])
        print("Removed 'Month' column - avoiding temporal overfitting")
    
    # Filter to only existing categorical columns
    categorical_cols = [col for col in categorical_cols if col in df.columns]
    print(f"Using categorical columns: {categorical_cols}")
    
    if categorical_cols:
        if "EventTemplate" in categorical_cols:
            from sklearn.feature_extraction.text import TfidfVectorizer
            
            print("Processing EventTemplate with TF-IDF for better semantic representation...")
            
            # Custom stopwords for log data
            log_stopwords = [
                'at', 'by', 'for', 'from', 'to', 'in', 'on', 'with', 'of', 'the', 'a', 'an',
                'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
                'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
                'and', 'or', 'but', 'if', 'then', 'else', 'when', 'where', 'why', 'how',
                'this', 'that', 'these', 'those', 'it', 'its', 'they', 'them', 'their'
            ]
            
            # Use TF-IDF for EventTemplate
            tfidf = TfidfVectorizer(
                max_features=50,
                stop_words=log_stopwords,
                ngram_range=(1, 2),
                min_df=3,
                max_df=0.7,
                token_pattern=r'\b[a-zA-Z]{3,}\b'
            )
            
            template_tfidf = tfidf.fit_transform(df["EventTemplate"].astype(str))
            template_features = template_tfidf.toarray()
            
            print(f"EventTemplate TF-IDF generated {template_features.shape[1]} meaningful features")
            
            # Process other categoricals normally
            other_cats = [col for col in categorical_cols if col != "EventTemplate"]
            if other_cats:
                other_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, max_categories=50)
                other_features = other_ohe.fit_transform(df[other_cats].astype(str))
                cat_features = np.hstack([template_features, other_features])
                print(f"Other categoricals generated {other_features.shape[1]} features")
                
                ohe = (tfidf, other_ohe)
            else:
                cat_features = template_features
                ohe = (tfidf, None)
            
            df = df.drop(columns=["EventTemplate"])
        else:
            ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, max_categories=100)
            cat_features = ohe.fit_transform(df[categorical_cols].astype(str))
        
        df = df.drop(columns=[col for col in categorical_cols if col != "EventTemplate"])
    else:
        ohe = None
        cat_features = np.array([]).reshape(len(df), 0)

    # Process numerical features
    numerical_cols = list(df.columns)
    print(f"Numerical columns: {numerical_cols}")
    
    if len(numerical_cols) > 0:
        imputer = SimpleImputer(strategy="mean")
        num_features = imputer.fit_transform(df[numerical_cols])
    else:
        imputer = None
        num_features = np.array([]).reshape(len(df), 0)

    # Feature combination
    if cat_features.shape[1] > 0 and num_features.shape[1] > 0:
        combined = np.hstack([cat_features, num_features])
    elif cat_features.shape[1] > 0:
        combined = cat_features
    elif num_features.shape[1] > 0:
        combined = num_features
    else:
        raise ValueError("No features remaining after preprocessing!")
    
    # Apply scaling
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(combined)
    
    print(f"Final feature dimension: {scaled_data.shape[1]}")
    print(f"Numerical features: {num_features.shape[1]}, Categorical features: {cat_features.shape[1]}")

    return scaled_data, ohe, imputer, scaler, original_df

In [None]:
# ================== HYBRID ATTENTION LSTM AUTOENCODER ==================
class HybridAttentionLSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim=16, dropout=0.4, enable_single_log=True):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.enable_single_log = enable_single_log
        
        # Sequential processing components (original approach)
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True, 
                              num_layers=2, dropout=dropout, bidirectional=True)
        
        self.attention = nn.MultiheadAttention(hidden_dim * 2, num_heads=4, 
                                             dropout=dropout, batch_first=True)
        
        self.decoder = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, 
                              num_layers=2, dropout=dropout)
        
        # Single log processing components (new approach)
        if enable_single_log:
            self.single_log_encoder = nn.Sequential(
                nn.Linear(input_dim, hidden_dim * 2),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim * 2, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, hidden_dim // 2)
            )
            
            self.single_log_decoder = nn.Sequential(
                nn.Linear(hidden_dim // 2, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, hidden_dim * 2),
                nn.ReLU(),
                nn.Linear(hidden_dim * 2, input_dim)
            )
        
        # Shared output layers
        self.batch_norm = nn.BatchNorm1d(hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, input_dim)
        self.dropout = nn.Dropout(dropout)
        
        # Fusion layer for combining both approaches
        self.fusion_layer = nn.Linear(input_dim * 2, input_dim)
        
    def forward(self, x, mode='hybrid'):
        """
        Forward pass with multiple modes:
        - 'sequential': Original sequence-based processing only
        - 'single': Single log processing only  
        - 'hybrid': Both approaches combined (default)
        """
        batch_size, seq_len, feature_dim = x.shape
        
        if mode == 'single' or (mode == 'hybrid' and self.enable_single_log):
            # Single log processing path
            single_outputs = []
            single_attentions = []
            
            for i in range(seq_len):
                log_features = x[:, i, :]  # Shape: (batch_size, input_dim)
                
                # Process single log
                encoded_single = self.single_log_encoder(log_features)
                decoded_single = self.single_log_decoder(encoded_single)
                
                single_outputs.append(decoded_single)
                # Create dummy attention for single logs (uniform attention)
                dummy_attention = torch.ones(batch_size, 1, 1) / seq_len
                single_attentions.append(dummy_attention)
            
            single_reconstruction = torch.stack(single_outputs, dim=1)
            single_attention_weights = torch.cat(single_attentions, dim=2)
        
        if mode == 'sequential' or mode == 'hybrid':
            # Sequential processing path (original)
            enc_out, (hidden, cell) = self.encoder(x)
            attn_out, attn_weights = self.attention(enc_out, enc_out, enc_out)
            attn_out = self.dropout(attn_out)
            dec_out, _ = self.decoder(attn_out)
            
            # Batch normalization and output
            dec_out_norm = self.batch_norm(dec_out.transpose(1, 2)).transpose(1, 2)
            sequential_reconstruction = self.output_layer(dec_out_norm)
        
        # Return based on mode
        if mode == 'single':
            return single_reconstruction, single_attention_weights
        elif mode == 'sequential':
            return sequential_reconstruction, attn_weights
        else:  # hybrid mode
            # Combine both reconstructions
            combined_features = torch.cat([sequential_reconstruction, single_reconstruction], dim=-1)
            hybrid_reconstruction = self.fusion_layer(combined_features)
            
            # Combine attention weights (weighted average)
            combined_attention = 0.6 * attn_weights + 0.4 * single_attention_weights.expand_as(attn_weights)
            
            return hybrid_reconstruction, combined_attention

In [None]:
# ================== SEVERITY CLASSIFICATION ==================
class EnhancedSeverityManager:
    def __init__(self, percentiles=None, severity_labels=None):
        self.percentiles = percentiles or [85, 95, 99]
        self.severity_labels = severity_labels or ['Low', 'Medium', 'High', 'Critical']
        self.threshold_values = {}
        self.error_stats = {}
        
    def learn_thresholds(self, error_distribution, validation_errors=None):
        """Learn thresholds with optional validation for stability"""
        error_array = np.array(error_distribution)
        
        # Learn primary thresholds
        for p in self.percentiles:
            self.threshold_values[f'p{p}'] = np.percentile(error_array, p)
        
        # Store distribution statistics
        self.error_stats = {
            'mean': np.mean(error_array),
            'std': np.std(error_array),
            'median': np.median(error_array),
            'iqr': np.percentile(error_array, 75) - np.percentile(error_array, 25)
        }
        
        print(f"✅ Learned severity thresholds: {self.threshold_values}")
        print(f"📊 Error distribution stats: {self.error_stats}")
    
    def classify_with_confidence(self, error):
        """Classify severity with confidence score"""
        if not self.threshold_values:
            raise RuntimeError("Thresholds not learned. Call learn_thresholds() first.")
        
        # Determine severity level
        severity_idx = 0
        for i, p in enumerate(self.percentiles):
            if error > self.threshold_values[f'p{p}']:
                severity_idx = i + 1
        
        severity = self.severity_labels[severity_idx]
        
        # Calculate confidence based on distance from threshold
        if severity_idx == 0:
            threshold = self.threshold_values[f'p{self.percentiles[0]}']
            confidence = max(0.1, 1.0 - (error / threshold))
        else:
            current_threshold = self.threshold_values[f'p{self.percentiles[severity_idx-1]}']
            confidence = min(1.0, (error - current_threshold) / current_threshold + 0.5)
        
        return severity, min(1.0, max(0.1, confidence))

In [None]:
# ================== RULE-BASED LOG TYPE CLASSIFIER ==================
class RuleBasedLogClassifier:
    def __init__(self):
        self.classification_rules = {
            'memory_error': [
                r'\b(out of memory|oom|page allocation failure|dma timeout)\b',
                r'\b(malloc failed|memory leak|segfault|kernel panic)\b',
                r'\b(swap.*full|virtual memory|memory pressure)\b'
            ],
            'authentication_error': [
                r'\b(authentication failure|invalid username|login failed)\b',
                r'\b(kerberos.*failed|pam_unix.*failed|ssh.*failed)\b',
                r'\b(password.*incorrect|access denied|unauthorized)\b'
            ],
            'filesystem_error': [
                r'\b(no such file|permission denied|disk full|quota exceeded)\b',
                r'\b(failed command|status timeout|drive not ready|io error)\b',
                r'\b(filesystem.*corrupt|bad sector|read.*error)\b'
            ],
            'network_error': [
                r'\b(connection timed out|connection refused|peer died)\b',
                r'\b(network unreachable|socket error|host.*down)\b',
                r'\b(dns.*failed|routing.*error|packet.*lost)\b'
            ],
            'permission_error': [
                r'\b(permission denied|operation not supported|access forbidden)\b',
                r'\b(selinux.*denied|capability.*denied|privilege.*error)\b',
                r'\b(sudo.*failed|su.*failed|root.*access)\b'
            ],
            'system_critical': [
                r'\b(critical|fatal|panic|emergency|alert)\b',
                r'\b(system.*halt|kernel.*oops|hardware.*error)\b',
                r'\b(temperature.*critical|power.*failure)\b'
            ]
        }
        
        self.pattern_weights = {
            'memory_error': 0.9,
            'authentication_error': 0.95,
            'filesystem_error': 0.85,
            'network_error': 0.8,
            'permission_error': 0.9,
            'system_critical': 0.95
        }
    
    def classify_log(self, event_template, content=""):
        """Classify a single log entry"""
        combined_text = f"{event_template} {content}".lower()
        
        for category, patterns in self.classification_rules.items():
            for pattern in patterns:
                if re.search(pattern, combined_text, re.IGNORECASE):
                    confidence = self._calculate_confidence(pattern, combined_text, category)
                    return {
                        'log_type': category,
                        'confidence': confidence,
                        'matched_pattern': pattern,
                        'is_critical': category in ['system_critical', 'authentication_error']
                    }
        
        return {
            'log_type': 'normal',
            'confidence': 0.7,
            'matched_pattern': None,
            'is_critical': False
        }
    
    def _calculate_confidence(self, pattern, text, category):
        """Calculate confidence based on pattern specificity and context"""
        base_confidence = self.pattern_weights.get(category, 0.7)
        pattern_specificity = min(len(pattern) / 50.0, 0.3)
        keywords = re.findall(r'\w+', pattern.lower())
        keyword_matches = sum(1 for keyword in keywords if keyword in text)
        keyword_bonus = min(keyword_matches * 0.05, 0.2)
        final_confidence = min(base_confidence + pattern_specificity + keyword_bonus, 0.98)
        return round(final_confidence, 3)

In [None]:
# ================== HYBRID ENSEMBLE DETECTOR ==================
class HybridEnsembleDetector:
    def __init__(self, input_dim, num_models=3, enable_single_log=True):
        self.models = []
        self.weights = []
        self.input_dim = input_dim
        self.enable_single_log = enable_single_log
        
        # Create diverse hybrid models
        configs = [
            {'hidden_dim': 16, 'dropout': 0.3},
            {'hidden_dim': 24, 'dropout': 0.4},
            {'hidden_dim': 32, 'dropout': 0.2}
        ]
        
        for i, config in enumerate(configs[:num_models]):
            model = HybridAttentionLSTMAutoencoder(
                input_dim, 
                enable_single_log=enable_single_log,
                **config
            )
            self.models.append(model)
            self.weights.append(1.0)
    
    def train_ensemble(self, train_loader, val_loader, patience=5, max_epochs=100):
        model_performances = []
        
        for i, model in enumerate(self.models):
            print(f"\n=== Training Hybrid Model {i+1}/{len(self.models)} ===")
            train_losses, val_losses = self.train_hybrid_model(
                model, train_loader, val_loader, patience, max_epochs, 
                model_name=f'hybrid_ensemble_model_{i}.pth'
            )
            
            final_val_loss = val_losses[-1] if val_losses else float('inf')
            model_performances.append(final_val_loss)
        
        # Update weights based on performance
        total_inv_loss = sum(1/loss for loss in model_performances)
        self.weights = [(1/loss) / total_inv_loss for loss in model_performances]
        
        print(f"\nHybrid Ensemble weights: {[f'{w:.3f}' for w in self.weights]}")
        return model_performances
    
    def train_hybrid_model(self, model, train_loader, val_loader, patience=5, max_epochs=100, model_name='hybrid_model.pth'):
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)
        criterion = nn.MSELoss()
        
        best_loss = float('inf')
        no_improve = 0
        train_losses = []
        val_losses = []

        for epoch in range(max_epochs):
            model.train()
            train_loss = 0
            
            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
                optimizer.zero_grad()
                
                # Train with hybrid mode (combines both approaches)
                reconstructions, _ = model(batch, mode='hybrid')
                loss = criterion(reconstructions, batch)
                
                # Add regularization for single log consistency
                if model.enable_single_log:
                    single_recon, _ = model(batch, mode='single')
                    seq_recon, _ = model(batch, mode='sequential')
                    consistency_loss = 0.1 * criterion(single_recon, seq_recon)
                    loss += consistency_loss
                
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                train_loss += loss.item()

            # Validation
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in val_loader:
                    reconstructions, _ = model(batch, mode='hybrid')
                    val_loss += criterion(reconstructions, batch).item()

            avg_train = train_loss/len(train_loader)
            avg_val = val_loss/len(val_loader)
            
            train_losses.append(avg_train)
            val_losses.append(avg_val)
            
            if epoch % 10 == 0:
                print(f"Epoch {epoch+1}: Train Loss: {avg_train:.4f} | Val Loss: {avg_val:.4f}")
            
            scheduler.step(avg_val)

            if avg_val < best_loss:
                best_loss = avg_val
                no_improve = 0
                torch.save(model.state_dict(), model_name)
            else:
                no_improve += 1
                if no_improve >= patience:
                    print(f"Early stopping triggered after {epoch+1} epochs")
                    break
        
        return train_losses, val_losses
    
    def predict(self, dataloader, mode='hybrid'):
        """Predict with specified mode"""
        all_errors = []
        
        for i, model in enumerate(self.models):
            model.load_state_dict(torch.load(f'hybrid_ensemble_model_{i}.pth'))
            errors = self.evaluate_hybrid_model(model, dataloader, mode)
            all_errors.append(errors)
        
        # Weighted ensemble prediction
        ensemble_errors = np.average(all_errors, axis=0, weights=self.weights)
        
        return ensemble_errors, all_errors
    
    def evaluate_hybrid_model(self, model, dataloader, mode='hybrid'):
        model.eval()
        errors = []
        with torch.no_grad():
            for batch in dataloader:
                reconstructions, _ = model(batch, mode=mode)
                batch_errors = torch.mean((batch - reconstructions)**2, dim=(1,2))
                errors.extend(batch_errors.numpy())
        return np.array(errors)

In [None]:
# ================== OUTPUT PROCESSING FUNCTIONS ==================

def process_single_log_outputs(single_errors, single_threshold, test_data, original_df, 
                              severity_manager, log_classifier, seq_len, stride):
    """Process and format single log anomaly outputs - only for non-normal types"""
    single_anomalies = single_errors > single_threshold
    single_results = []
    
    for seq_idx, is_anomaly in enumerate(single_anomalies):
        if is_anomaly:
            # Get each log in the sequence
            start_idx = seq_idx * stride
            for log_offset in range(seq_len):
                log_idx = start_idx + log_offset
                if log_idx < len(original_df):
                    log_entry = original_df.iloc[log_idx]
                    
                    # CHANGE: First pass - check log type classification
                    classification = log_classifier.classify_log(
                        log_entry.get('EventTemplate', ''),
                        log_entry.get('Content', '')
                    )
                    
                    # CHANGE: Only process if anomaly type is NOT normal
                    if classification['log_type'] != 'normal':
                        # Get severity
                        error = single_errors[seq_idx]
                        severity, confidence = severity_manager.classify_with_confidence(error)
                        
                        single_results.append({
                            'log': {
                                'content': log_entry.get('Content', ''),
                                'event_template': log_entry.get('EventTemplate', ''),
                                'level': log_entry.get('Level', ''),
                                'component': log_entry.get('Component', ''),
                                'line_id': log_entry.get('LineId', log_idx)
                            },
                            'anomaly_type': classification['log_type'],
                            'severity': severity,
                            'confidence': confidence,
                            'timestamp': log_entry.get('Time', ''),
                            'anomaly_score': float(error),
                            'processing_mode': 'single_log'
                        })
    
    return single_results

def process_sequential_outputs(seq_errors, seq_threshold, test_data, original_df, 
                              severity_manager, log_classifier, seq_len, stride):
    """Process and format sequential anomaly outputs - only for non-normal types"""
    seq_anomalies = seq_errors > seq_threshold
    sequential_results = []
    
    for seq_idx, is_anomaly in enumerate(seq_anomalies):
        if is_anomaly:
            # Get the entire sequence
            start_idx = seq_idx * stride
            sequence_logs = []
            sequence_classifications = []
            
            for log_offset in range(seq_len):
                log_idx = start_idx + log_offset
                if log_idx < len(original_df):
                    log_entry = original_df.iloc[log_idx]
                    
                    # CHANGE: First pass - check log type for each log in sequence
                    classification = log_classifier.classify_log(
                        log_entry.get('EventTemplate', ''),
                        log_entry.get('Content', '')
                    )
                    sequence_classifications.append(classification['log_type'])
                    
                    sequence_logs.append({
                        'content': log_entry.get('Content', ''),
                        'event_template': log_entry.get('EventTemplate', ''),
                        'level': log_entry.get('Level', ''),
                        'component': log_entry.get('Component', ''),
                        'line_id': log_entry.get('LineId', log_idx),
                        'timestamp': log_entry.get('Time', '')
                    })
            
            # CHANGE: Check if sequence has any non-normal anomaly types
            non_normal_types = [t for t in sequence_classifications if t != 'normal']
            
            # CHANGE: Only process if sequence contains non-normal anomaly types
            if non_normal_types:
                # Get severity for the sequence
                error = seq_errors[seq_idx]
                severity, confidence = severity_manager.classify_with_confidence(error)
                
                # Get most common non-normal anomaly type
                from collections import Counter
                if non_normal_types:
                    anomaly_type_counts = Counter(non_normal_types)
                    dominant_anomaly_type = anomaly_type_counts.most_common(1)[0][0]
                else:
                    # Fallback to most common overall type
                    anomaly_type_counts = Counter(sequence_classifications)
                    dominant_anomaly_type = anomaly_type_counts.most_common(1)[0][0]
                
                sequential_results.append({
                    'logs': sequence_logs,
                    'anomaly_type': dominant_anomaly_type,
                    'severity': severity,
                    'confidence': confidence,
                    'timestamp': sequence_logs[0]['timestamp'] if sequence_logs else '',
                    'sequence_length': len(sequence_logs),
                    'anomaly_score': float(error),
                    'processing_mode': 'sequential',
                    'non_normal_count': len(non_normal_types),
                    'total_logs_in_sequence': len(sequence_classifications)
                })
    
    return sequential_results


def display_single_log_results(results, max_display=10):
    """Display single log anomaly results"""
    print(f"\n🔍 SINGLE LOG ANOMALIES ({len(results)} found - non-normal types only):")
    print("=" * 80)
    
    for i, result in enumerate(results[:max_display]):
        print(f"\nAnomaly #{i+1}:")
        print(f"Log: {result['log']['content'][:100]}...")
        print(f"Anomaly Type: {result['anomaly_type']}")
        print(f"Severity: {result['severity']}")
        print(f"Timestamp: {result['timestamp']}")
        print(f"Confidence: {result['confidence']:.3f}")
        print(f"Score: {result['anomaly_score']:.4f}")
        print("-" * 40)
    
    if len(results) > max_display:
        print(f"... and {len(results) - max_display} more")

def display_sequential_results(results, max_display=5):
    """Display sequential anomaly results"""
    print(f"\n📊 SEQUENTIAL ANOMALIES ({len(results)} found - sequences with non-normal types only):")
    print("=" * 80)
    
    for i, result in enumerate(results[:max_display]):
        print(f"\nSequence #{i+1}:")
        print(f"Logs in sequence: {result['sequence_length']}")
        print(f"Non-normal logs: {result['non_normal_count']}/{result['total_logs_in_sequence']}")
        print(f"Dominant Anomaly Type: {result['anomaly_type']}")
        print(f"Severity: {result['severity']}")
        print(f"Timestamp: {result['timestamp']}")
        print(f"Confidence: {result['confidence']:.3f}")
        print(f"Score: {result['anomaly_score']:.4f}")
        
        print("Sequence logs:")
        for j, log in enumerate(result['logs'][:3]):  # Show first 3 logs
            print(f"  {j+1}. {log['content'][:80]}...")
        if len(result['logs']) > 3:
            print(f"  ... and {len(result['logs']) - 3} more logs")
        print("-" * 40)
    
    if len(results) > max_display:
        print(f"... and {len(results) - max_display} more sequences")

In [None]:
# ================== MAIN EXECUTION ==================
if __name__ == "__main__":
    # Configuration
    DATA_PATH = "data/logs/processed/Linux.log_structured.csv"
    SEQ_LEN = 8
    STRIDE = 8
    BATCH_SIZE = 32

    # Load and preprocess data
    data, ohe, imputer, scaler, original_df = load_and_preprocess(DATA_PATH)
    print(f"Data shape: {data.shape}")

    # Create datasets
    train_size = int(0.7 * len(data))
    val_size = int(0.15 * len(data))
    
    train_data = data[:train_size]
    val_data = data[train_size:train_size + val_size]
    test_data = data[train_size + val_size:]

    train_dataset = LogDataset(train_data, SEQ_LEN, STRIDE)
    val_dataset = LogDataset(val_data, SEQ_LEN, STRIDE)
    test_dataset = LogDataset(test_data, SEQ_LEN, STRIDE)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Initialize hybrid ensemble
    input_dim = data.shape[1]
    ensemble = HybridEnsembleDetector(input_dim, num_models=3, enable_single_log=True)

    # Train ensemble
    print("Training Hybrid Ensemble Models...")
    model_performances = ensemble.train_ensemble(train_loader, val_loader, patience=5)

    # Evaluate with different processing modes
    print("\nEvaluating different processing modes...")

    # Sequential processing
    print("Processing sequences...")
    seq_errors, _ = ensemble.predict(test_loader, mode='sequential')
    seq_threshold = np.percentile(seq_errors, 95)
    seq_anomalies = seq_errors > seq_threshold

    # Single log processing  
    print("Processing individual logs...")
    single_errors, _ = ensemble.predict(test_loader, mode='single')
    single_threshold = np.percentile(single_errors, 95)
    single_anomalies = single_errors > single_threshold

    print(f"Sequential anomalies detected: {seq_anomalies.sum()}")
    print(f"Single log anomalies detected: {single_anomalies.sum()}")

    # Initialize severity manager and classifier
    print("\nInitializing severity classification...")
    severity_manager = EnhancedSeverityManager(
        percentiles=[85, 95, 99],
        severity_labels=['Low', 'Medium', 'High', 'Critical']
    )

    # Learn thresholds from both processing modes
    all_errors = np.concatenate([seq_errors, single_errors])
    severity_manager.learn_thresholds(all_errors)

    print("\nInitializing rule-based log classification...")
    log_classifier = RuleBasedLogClassifier()

    # Process outputs separately
    print("\nProcessing single log outputs...")
    single_log_results = process_single_log_outputs(
        single_errors, single_threshold, test_data, original_df,
        severity_manager, log_classifier, SEQ_LEN, STRIDE
    )

    print("\nProcessing sequential outputs...")
    sequential_results = process_sequential_outputs(
        seq_errors, seq_threshold, test_data, original_df,
        severity_manager, log_classifier, SEQ_LEN, STRIDE
    )

    print(f"Single log anomalies found: {len(single_log_results)}")
    print(f"Sequential anomalies found: {len(sequential_results)}")

    # Display results
    display_single_log_results(single_log_results)
    display_sequential_results(sequential_results)

    # Save structured outputs
    output_data = {
        'single_log_anomalies': single_log_results,
        'sequential_anomalies': sequential_results,
        'summary': {
            'total_single_anomalies': len(single_log_results),
            'total_sequential_anomalies': len(sequential_results),
            'processing_timestamp': datetime.now().isoformat(),
            'thresholds': {
                'single_log_threshold': float(single_threshold),
                'sequential_threshold': float(seq_threshold)
            }
        }
    }

    # Save to JSON files
    with open('single_log_anomalies.json', 'w') as f:
        json.dump(single_log_results, f, indent=2, default=str)

    with open('sequential_anomalies.json', 'w') as f:
        json.dump(sequential_results, f, indent=2, default=str)

    with open('anomaly_outputs.json', 'w') as f:
        json.dump(output_data, f, indent=2, default=str)

    # Save model artifacts
    artifacts = {
        'ohe': ohe,
        'imputer': imputer,
        'scaler': scaler,
        'single_log_threshold': single_threshold,
        'sequential_threshold': seq_threshold,
        'ensemble_weights': ensemble.weights,
        'input_dim': input_dim,
        'seq_len': SEQ_LEN,
        'stride': STRIDE,
        'severity_manager': severity_manager,
        'processing_modes_available': ['sequential', 'single', 'hybrid']
    }

    with open('hybrid_ensemble_artifacts.pkl', 'wb') as f:
        pickle.dump(artifacts, f)

    # Comparison statistics
    print(f"\n📈 COMPARISON SUMMARY:")
    print(f"Single Log Processing:")
    print(f"   • Total anomalies: {len(single_log_results)}")
    print(f"   • Mean error: {np.mean(single_errors):.4f}")
    print(f"   • Threshold: {single_threshold:.4f}")

    print(f"\nSequential Processing:")
    print(f"   • Total anomalies: {len(sequential_results)}")
    print(f"   • Mean error: {np.mean(seq_errors):.4f}")
    print(f"   • Threshold: {seq_threshold:.4f}")

    # Severity distribution for both modes
    single_severities = [r['severity'] for r in single_log_results]
    seq_severities = [r['severity'] for r in sequential_results]

    print(f"\nSeverity Distribution:")
    print(f"Single Logs: {Counter(single_severities)}")
    print(f"Sequential: {Counter(seq_severities)}")

    print(f"\n✅ Results saved to:")
    print(f"   • single_log_anomalies.json")
    print(f"   • sequential_anomalies.json") 
    print(f"   • anomaly_outputs.json")
    print(f"   • hybrid_ensemble_artifacts.pkl")

    print(f"\n✅ Hybrid testing complete! Ready for mode-specific analysis.")

Loading and preprocessing data...
Removed 'Date' column - temporal identifiers cause overfitting
Removed 'PID' column - process IDs are not semantically meaningful
Dropping columns: ['LineId', 'Time', 'Date', 'PID']
EventTemplate has 519 unique values
Including EventTemplate as categorical feature
Processing Content column...
  content_has_error: 0.2% of logs
  content_has_large_numbers: 57.3% of logs
  content_has_critical: 40.9% of logs
  content_has_network: 14.1% of logs
  content_has_memory: 41.0% of logs
Removed 'Month' column - avoiding temporal overfitting
Using categorical columns: ['Level', 'Component', 'EventId', 'EventTemplate']
Processing EventTemplate with TF-IDF for better semantic representation...
EventTemplate TF-IDF generated 50 meaningful features
Other categoricals generated 101 features
Final feature dimension: 159
Numerical features: 8, Categorical features: 151
Data shape: (25456, 159)
Training Hybrid Ensemble Models...

=== Training Hybrid Model 1/3 ===


                                                                                                                                                          

Epoch 1: Train Loss: 0.8688 | Val Loss: 0.5688


                                                                                                                                                          

Epoch 11: Train Loss: 0.6017 | Val Loss: 0.5132


                                                                                                                                                          

Epoch 21: Train Loss: 0.5644 | Val Loss: 0.4886


                                                                                                                                                          

Epoch 31: Train Loss: 0.5541 | Val Loss: 0.4683


                                                                                                                                                          

Epoch 41: Train Loss: 0.5441 | Val Loss: 0.4639


                                                                                                                                                          

Early stopping triggered after 44 epochs

=== Training Hybrid Model 2/3 ===


                                                                                                                                                          

Epoch 1: Train Loss: 0.8566 | Val Loss: 0.5736


                                                                                                                                                          

Epoch 11: Train Loss: 0.5791 | Val Loss: 0.5024


                                                                                                                                                          

Epoch 21: Train Loss: 0.5362 | Val Loss: 0.4834


                                                                                                                                                          

Epoch 31: Train Loss: 0.5089 | Val Loss: 0.4419


                                                                                                                                                          

Epoch 41: Train Loss: 0.4931 | Val Loss: 0.4317


                                                                                                                                                          

Epoch 51: Train Loss: 0.4841 | Val Loss: 0.4261


                                                                                                                                                          

Epoch 61: Train Loss: 0.4770 | Val Loss: 0.4139


                                                                                                                                                          

Epoch 71: Train Loss: 0.4717 | Val Loss: 0.4098


                                                                                                                                                          

Early stopping triggered after 75 epochs

=== Training Hybrid Model 3/3 ===


                                                                                                                                                          

Epoch 1: Train Loss: 0.7993 | Val Loss: 0.5450


                                                                                                                                                          

Epoch 11: Train Loss: 0.5029 | Val Loss: 0.4360


                                                                                                                                                          

Epoch 21: Train Loss: 0.4411 | Val Loss: 0.4004


                                                                                                                                                          

Epoch 31: Train Loss: 0.4096 | Val Loss: 0.3722


                                                                                                                                                          

Epoch 41: Train Loss: 0.3850 | Val Loss: 0.3576


                                                                                                                                                          

Epoch 51: Train Loss: 0.3644 | Val Loss: 0.3426


                                                                                                                                                          

Epoch 61: Train Loss: 0.3501 | Val Loss: 0.3372


                                                                                                                                                          

Epoch 71: Train Loss: 0.3330 | Val Loss: 0.3258


                                                                                                                                                          

Epoch 81: Train Loss: 0.3200 | Val Loss: 0.3163


                                                                                                                                                          

Epoch 91: Train Loss: 0.3169 | Val Loss: 0.3147


                                                                                                                                                          


Hybrid Ensemble weights: ['0.276', '0.314', '0.410']

Evaluating different processing modes...
Processing sequences...
Processing individual logs...
Sequential anomalies detected: 24
Single log anomalies detected: 24

Initializing severity classification...
✅ Learned severity thresholds: {'p85': 2.730556753315499, 'p95': 4.080528542445025, 'p99': 7.0022962475051465}
📊 Error distribution stats: {'mean': 1.4596107065258013, 'std': 1.512213478614884, 'median': 0.9437472978112085, 'iqr': 0.9271088474004039}

Initializing rule-based log classification...

Processing single log outputs...

Processing sequential outputs...
Single log anomalies found: 56
Sequential anomalies found: 13

🔍 SINGLE LOG ANOMALIES (56 found - non-normal types only):

Anomaly #1:
Log: ALERT exited abnormally with [1]...
Anomaly Type: system_critical
Severity: Critical
Timestamp: 04:02:55
Confidence: 0.722
Score: 8.5563
----------------------------------------

Anomaly #2:
Log: Kerberos authentication failed...
Anoma