In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import savgol_filter
import warnings
warnings.filterwarnings('ignore')
import joblib
import pickle
import os
import json

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

class LSTMAutoencoder(nn.Module):
    """
    LSTM Autoencoder for temporal anomaly detection in PyTorch
    """
    
    def __init__(self, input_dim, hidden_dim=64, latent_dim=10, num_layers=2, sequence_length=20):
        super(LSTMAutoencoder, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.sequence_length = sequence_length
        
        # Encoder
        self.encoder_lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers, 
            batch_first=True, dropout=0.2 if num_layers > 1 else 0
        )
        self.encoder_fc = nn.Linear(hidden_dim, latent_dim)
        
        # Decoder
        self.decoder_fc = nn.Linear(latent_dim, hidden_dim)
        self.decoder_lstm = nn.LSTM(
            hidden_dim, hidden_dim, num_layers, 
            batch_first=True, dropout=0.2 if num_layers > 1 else 0
        )
        self.output_projection = nn.Linear(hidden_dim, input_dim)
        
        self.dropout = nn.Dropout(0.2)
        
    def encode(self, x):
        # x shape: (batch_size, sequence_length, input_dim)
        lstm_out, (hidden, cell) = self.encoder_lstm(x)
        # Use the last hidden state
        encoded = self.encoder_fc(hidden[-1])  # Shape: (batch_size, latent_dim)
        return encoded
    
    def decode(self, encoded):
        batch_size = encoded.size(0)
        
        # Project to hidden dimension
        decoded = self.decoder_fc(encoded)  # Shape: (batch_size, hidden_dim)
        
        # Repeat for sequence length
        decoded = decoded.unsqueeze(1).repeat(1, self.sequence_length, 1)  # Shape: (batch_size, seq_len, hidden_dim)
        
        # Pass through LSTM
        lstm_out, _ = self.decoder_lstm(decoded)
        
        # Project to output dimension
        output = self.output_projection(lstm_out)  # Shape: (batch_size, seq_len, input_dim)
        
        return output
    
    def forward(self, x):
        encoded = self.encode(x)
        decoded = self.decode(encoded)
        return decoded

class SequenceDataset(Dataset):
    """
    Custom Dataset for sequence data
    """
    
    def __init__(self, sequences, labels=None):
        self.sequences = torch.FloatTensor(sequences)
        self.labels = torch.FloatTensor(labels) if labels is not None else None
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        if self.labels is not None:
            return self.sequences[idx], self.labels[idx]
        return self.sequences[idx]

class EarlyStopping:
    """
    Early stopping utility class
    """
    
    def __init__(self, patience=10, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None
        self.counter = 0
        self.best_weights = None
        
    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.save_checkpoint(model)
        else:
            self.counter += 1
            
        if self.counter >= self.patience:
            if self.restore_best_weights:
                model.load_state_dict(self.best_weights)
            return True
        return False
    
    def save_checkpoint(self, model):
        self.best_weights = model.state_dict().copy()

class DrivingAnomalyDetectorPyTorchFixed:
    """
    FIXED: Advanced anomaly detection system for dangerous driving behaviors using PyTorch
    """
    
    def __init__(self, df):
        """
        Initialize with geotrack data
        
        Args:
            df: DataFrame with columns: randomized_id, lat, lng, alt, spd, azm
        """
        self.df = df.copy()
        self.features_df = None
        self.scaler = RobustScaler()
        
        # Models
        self.isolation_forest = None
        self.one_class_svm = None
        self.lstm_autoencoder = None
        
        # Store reconstruction threshold
        self.lstm_threshold = None
        
    def safe_divide(self, numerator, denominator, default_value=0.0):
        """Safe division to avoid NaN and infinite values"""
        with np.errstate(divide='ignore', invalid='ignore'):
            result = np.divide(numerator, denominator)
            result = np.where(np.isfinite(result), result, default_value)
            return result
    
    def calculate_driving_physics(self):
        """
        FIXED: Calculate advanced physics-based features for driving analysis
        """
        print("Calculating driving physics features...")
        
        # Sort by trip and sequence
        self.df = self.df.sort_values(['randomized_id', 'lat', 'lng'])
        self.df['sequence'] = self.df.groupby('randomized_id').cumcount()
        
        # Time estimation (assuming 1-second intervals)
        self.df['time_delta'] = 1.0  # seconds
        
        def calculate_trip_features(group):
            """Calculate features for each trip with NaN/inf protection"""
            if len(group) < 3:
                # Fill with safe default values for short trips
                group['distance'] = 0.0
                group['speed_smooth'] = group['spd']
                group['acceleration'] = 0.0
                group['jerk'] = 0.0
                group['angular_velocity'] = 0.0
                group['lateral_acceleration'] = 0.0
                group['heading_change_rate'] = 0.0
                group['curvature'] = 0.0
                return group
            
            # Distance calculation using Haversine formula
            def haversine_distance(lat1, lon1, lat2, lon2):
                R = 6371000  # Earth's radius in meters
                lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
                dlat = lat2 - lat1
                dlon = lon2 - lon1
                a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
                c = 2 * np.arcsin(np.sqrt(np.clip(a, 0, 1)))  # Clip to avoid numerical errors
                return R * c
            
            # Calculate distances
            distances = [0]  # First point
            for i in range(1, len(group)):
                try:
                    dist = haversine_distance(
                        group.iloc[i-1]['lat'], group.iloc[i-1]['lng'],
                        group.iloc[i]['lat'], group.iloc[i]['lng']
                    )
                    # Limit maximum distance to avoid GPS errors
                    dist = min(dist, 1000)  # Max 1km between consecutive points
                    distances.append(dist)
                except:
                    distances.append(0)
            
            group['distance'] = distances
            
            # Smooth speed data to reduce GPS noise
            if len(group) >= 5:
                try:
                    group['speed_smooth'] = savgol_filter(group['spd'], 5, 2)
                except:
                    group['speed_smooth'] = group['spd']
            else:
                group['speed_smooth'] = group['spd']
            
            # Ensure positive speeds
            group['speed_smooth'] = np.maximum(group['speed_smooth'], 0)
            
            # Calculate acceleration (m/s²) with error handling
            speed_ms = group['speed_smooth'] / 3.6  # Convert km/h to m/s
            try:
                acceleration = np.gradient(speed_ms, group['time_delta'])
                # Limit extreme accelerations
                acceleration = np.clip(acceleration, -15, 15)  # Reasonable limits for cars
            except:
                acceleration = np.zeros(len(group))
            group['acceleration'] = acceleration
            
            # Calculate jerk (m/s³) with error handling
            try:
                jerk = np.gradient(acceleration, group['time_delta'])
                jerk = np.clip(jerk, -20, 20)  # Reasonable limits for jerk
            except:
                jerk = np.zeros(len(group))
            group['jerk'] = jerk
            
            # Calculate angular velocity (rad/s) with error handling
            try:
                azimuth_rad = np.radians(group['azm'])
                azimuth_unwrapped = np.unwrap(azimuth_rad)
                angular_velocity = np.gradient(azimuth_unwrapped, group['time_delta'])
                angular_velocity = np.clip(angular_velocity, -np.pi, np.pi)  # Reasonable limits
            except:
                angular_velocity = np.zeros(len(group))
            group['angular_velocity'] = angular_velocity
            
            # Calculate lateral acceleration (cornering force)
            lateral_acceleration = speed_ms * angular_velocity
            lateral_acceleration = np.clip(lateral_acceleration, -20, 20)  # Reasonable limits
            group['lateral_acceleration'] = lateral_acceleration
            
            # Calculate heading change rate
            heading_change = np.abs(angular_velocity)
            group['heading_change_rate'] = heading_change
            
            # Calculate curvature (1/radius of turn) with safe division
            group['curvature'] = self.safe_divide(
                np.abs(angular_velocity), 
                speed_ms + 0.1,  # Add small value to avoid division by zero
                default_value=0.0
            )
            
            return group
        
        self.df = self.df.groupby('randomized_id').apply(calculate_trip_features)
        self.df = self.df.reset_index(drop=True)
        
        # Final check for any remaining NaN or inf values
        numeric_columns = ['distance', 'speed_smooth', 'acceleration', 'jerk', 
                          'angular_velocity', 'lateral_acceleration', 'heading_change_rate', 'curvature']
        
        for col in numeric_columns:
            if col in self.df.columns:
                self.df[col] = self.df[col].fillna(0)
                self.df[col] = self.df[col].replace([np.inf, -np.inf], 0)
        
        print("Physics features calculated successfully")
        return self.df
    
    def engineer_anomaly_features(self):
        """
        FIXED: Engineer specific features for anomaly detection with NaN/inf protection
        """
        print("Engineering anomaly detection features...")
        
        if 'acceleration' not in self.df.columns:
            self.calculate_driving_physics()
        
        # Rolling window statistics for pattern detection
        window_sizes = [3, 5, 10]
        
        for window in window_sizes:
            # Speed patterns with error handling
            try:
                self.df[f'speed_std_{window}'] = self.df.groupby('randomized_id')['spd'].rolling(window, center=True, min_periods=1).std().reset_index(0, drop=True).fillna(0)
                self.df[f'speed_max_{window}'] = self.df.groupby('randomized_id')['spd'].rolling(window, center=True, min_periods=1).max().reset_index(0, drop=True).fillna(0)
                self.df[f'speed_min_{window}'] = self.df.groupby('randomized_id')['spd'].rolling(window, center=True, min_periods=1).min().reset_index(0, drop=True).fillna(0)
            except:
                self.df[f'speed_std_{window}'] = 0
                self.df[f'speed_max_{window}'] = self.df['spd']
                self.df[f'speed_min_{window}'] = self.df['spd']
            
            # Acceleration patterns with error handling
            try:
                self.df[f'accel_std_{window}'] = self.df.groupby('randomized_id')['acceleration'].rolling(window, center=True, min_periods=1).std().reset_index(0, drop=True).fillna(0)
                self.df[f'accel_max_{window}'] = self.df.groupby('randomized_id')['acceleration'].rolling(window, center=True, min_periods=1).max().reset_index(0, drop=True).fillna(0)
                self.df[f'accel_min_{window}'] = self.df.groupby('randomized_id')['acceleration'].rolling(window, center=True, min_periods=1).min().reset_index(0, drop=True).fillna(0)
            except:
                self.df[f'accel_std_{window}'] = 0
                self.df[f'accel_max_{window}'] = self.df['acceleration']
                self.df[f'accel_min_{window}'] = self.df['acceleration']
        
        # Extreme behavior indicators with safe thresholds
        self.df['hard_braking'] = (self.df['acceleration'] < -4.0).astype(int)
        self.df['hard_acceleration'] = (self.df['acceleration'] > 3.0).astype(int)
        self.df['excessive_speed'] = (self.df['spd'] > 80).astype(int)
        self.df['sharp_turn'] = (np.abs(self.df['lateral_acceleration']) > 4.0).astype(int)
        self.df['erratic_steering'] = (np.abs(self.df['heading_change_rate']) > 0.5).astype(int)
        
        # Composite risk scores with safe division and clipping
        self.df['acceleration_risk'] = np.clip(np.abs(self.df['acceleration']) / 10.0, 0, 1)
        self.df['jerk_risk'] = np.clip(np.abs(self.df['jerk']) / 5.0, 0, 1)
        self.df['lateral_risk'] = np.clip(np.abs(self.df['lateral_acceleration']) / 8.0, 0, 1)
        self.df['speed_risk'] = np.clip(np.maximum(0, (self.df['spd'] - 60) / 40.0), 0, 1)
        
        # Overall risk score with weights summing to 1
        self.df['overall_risk'] = (
            self.df['acceleration_risk'] * 0.25 +
            self.df['jerk_risk'] * 0.20 +
            self.df['lateral_risk'] * 0.25 +
            self.df['speed_risk'] * 0.15 +
            (self.df['hard_braking'] + self.df['hard_acceleration'] + 
             self.df['sharp_turn'] + self.df['erratic_steering']) * 0.15 / 4  # Normalize by 4
        )
        
        # Ensure overall risk is between 0 and 1
        self.df['overall_risk'] = np.clip(self.df['overall_risk'], 0, 1)
        
        print("Anomaly features engineered successfully")
        return self.df
    
    def prepare_ml_features(self):
        """
        FIXED: Prepare feature matrix for ML models with proper handling
        """
        if 'overall_risk' not in self.df.columns:
            self.engineer_anomaly_features()
        
        # Select features for ML models
        feature_columns = [
            'spd', 'acceleration', 'jerk', 'angular_velocity', 'lateral_acceleration',
            'heading_change_rate', 'curvature', 'overall_risk',
            'speed_std_3', 'speed_std_5', 'speed_std_10',
            'accel_std_3', 'accel_std_5', 'accel_std_10',
            'acceleration_risk', 'jerk_risk', 'lateral_risk', 'speed_risk'
        ]
        
        # Handle missing values and infinite values
        self.features_df = self.df[feature_columns].copy()
        
        # Replace any remaining NaN or infinite values
        for col in feature_columns:
            self.features_df[col] = self.features_df[col].fillna(0)
            self.features_df[col] = self.features_df[col].replace([np.inf, -np.inf], 0)
        
        print(f"Prepared {len(feature_columns)} features for ML models")
        print(f"Feature matrix shape: {self.features_df.shape}")
        print(f"Any NaN values: {self.features_df.isnull().sum().sum()}")
        print(f"Any infinite values: {np.isinf(self.features_df.values).sum()}")
        
        return self.features_df
    
    def train_isolation_forest(self, contamination=0.05):
        """
        Train Isolation Forest for anomaly detection
        """
        print("Training Isolation Forest...")
        
        if self.features_df is None:
            self.prepare_ml_features()
        
        # Scale features
        X_scaled = self.scaler.fit_transform(self.features_df)
        
        # Train Isolation Forest
        self.isolation_forest = IsolationForest(
            contamination=contamination,
            random_state=42,
            n_estimators=200,
            max_samples='auto',
            max_features=0.8,
            n_jobs=-1
        )
        
        self.isolation_forest.fit(X_scaled)
        
        # Predict anomalies
        anomaly_scores = self.isolation_forest.decision_function(X_scaled)
        anomaly_labels = self.isolation_forest.predict(X_scaled)
        
        self.df['if_anomaly_score'] = anomaly_scores
        self.df['if_anomaly'] = (anomaly_labels == -1).astype(int)
        
        anomaly_rate = self.df['if_anomaly'].mean()
        print(f"Isolation Forest trained. Anomaly rate: {anomaly_rate:.3f}")
        
        return self.isolation_forest
    
    def train_one_class_svm(self, nu=0.05):
        """
        Train One-Class SVM for anomaly detection
        """
        print("Training One-Class SVM...")
        
        if self.features_df is None:
            self.prepare_ml_features()
        
        # Scale features
        X_scaled = self.scaler.transform(self.features_df)
        
        # Train One-Class SVM
        self.one_class_svm = OneClassSVM(
            nu=nu,
            kernel='rbf',
            gamma='scale',
            cache_size=1000
        )
        
        # Sample for training if dataset is too large
        if len(X_scaled) > 50000:
            sample_idx = np.random.choice(len(X_scaled), 50000, replace=False)
            X_sample = X_scaled[sample_idx]
        else:
            X_sample = X_scaled
        
        self.one_class_svm.fit(X_sample)
        
        # Predict anomalies on full dataset
        anomaly_scores = self.one_class_svm.decision_function(X_scaled)
        anomaly_labels = self.one_class_svm.predict(X_scaled)
        
        self.df['svm_anomaly_score'] = anomaly_scores
        self.df['svm_anomaly'] = (anomaly_labels == -1).astype(int)
        
        anomaly_rate = self.df['svm_anomaly'].mean()
        print(f"One-Class SVM trained. Anomaly rate: {anomaly_rate:.3f}")
        
        return self.one_class_svm
    
    def prepare_sequences(self, sequence_length=20):
        """
        Prepare sequential data for LSTM training
        """
        if self.features_df is None:
            self.prepare_ml_features()
        
        # Scale features
        X_scaled = self.scaler.transform(self.features_df)
        
        sequences = []
        trip_ids = []
        indices = []
        
        # Create sequences for each trip
        for trip_id in self.df['randomized_id'].unique():
            trip_mask = self.df['randomized_id'] == trip_id
            trip_data = X_scaled[trip_mask]
            
            if len(trip_data) >= sequence_length:
                for i in range(len(trip_data) - sequence_length + 1):
                    sequences.append(trip_data[i:i + sequence_length])
                    trip_ids.append(trip_id)
                    indices.append(np.where(trip_mask)[0][i + sequence_length - 1])
        
        return np.array(sequences), trip_ids, indices
    
    def train_lstm_autoencoder(self, sequence_length=20, epochs=50, batch_size=32, lr=0.001):
        """
        FIXED: Train LSTM Autoencoder with better parameters
        """
        print("Training LSTM Autoencoder with PyTorch...")
        
        # Prepare sequences
        X_sequences, trip_ids, indices = self.prepare_sequences(sequence_length)
        
        if len(X_sequences) == 0:
            print("No sequences could be created. Check sequence_length parameter.")
            return None
        
        print(f"Created {len(X_sequences)} sequences of length {sequence_length}")
        
        # Create model
        input_dim = X_sequences.shape[2]
        self.lstm_autoencoder = LSTMAutoencoder(
            input_dim=input_dim,
            hidden_dim=64,
            latent_dim=10,
            num_layers=2,
            sequence_length=sequence_length
        ).to(device)
        
        # Split data
        X_train, X_val = train_test_split(X_sequences, test_size=0.2, random_state=42)
        
        # Create datasets and dataloaders
        train_dataset = SequenceDataset(X_train)
        val_dataset = SequenceDataset(X_val)
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        
        # Optimizer and loss function
        optimizer = optim.Adam(self.lstm_autoencoder.parameters(), lr=lr, weight_decay=1e-5)
        criterion = nn.MSELoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5, min_lr=1e-6)
        
        # Early stopping
        early_stopping = EarlyStopping(patience=15, restore_best_weights=True)
        
        # Training loop
        train_losses = []
        val_losses = []
        
        print(f"Training on {device} for {epochs} epochs...")
        
        for epoch in range(epochs):
            # Training
            self.lstm_autoencoder.train()
            train_loss = 0.0
            
            for batch in train_loader:
                sequences = batch.to(device)
                
                optimizer.zero_grad()
                reconstructed = self.lstm_autoencoder(sequences)
                loss = criterion(reconstructed, sequences)
                loss.backward()
                
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(self.lstm_autoencoder.parameters(), max_norm=1.0)
                
                optimizer.step()
                train_loss += loss.item()
            
            avg_train_loss = train_loss / len(train_loader)
            train_losses.append(avg_train_loss)
            
            # Validation
            self.lstm_autoencoder.eval()
            val_loss = 0.0
            
            with torch.no_grad():
                for batch in val_loader:
                    sequences = batch.to(device)
                    reconstructed = self.lstm_autoencoder(sequences)
                    loss = criterion(reconstructed, sequences)
                    val_loss += loss.item()
            
            avg_val_loss = val_loss / len(val_loader)
            val_losses.append(avg_val_loss)
            
            # Learning rate scheduling
            scheduler.step(avg_val_loss)
            
            # Print progress
            if (epoch + 1) % 5 == 0 or epoch == 0:
                print(f"Epoch [{epoch + 1}/{epochs}], Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}")
            
            # Early stopping
            if early_stopping(avg_val_loss, self.lstm_autoencoder):
                print(f"Early stopping at epoch {epoch + 1}")
                break
        
        # Calculate reconstruction errors
        print("Calculating reconstruction errors...")
        self.lstm_autoencoder.eval()
        
        reconstruction_errors = []
        batch_size_eval = 64
        
        with torch.no_grad():
            for i in range(0, len(X_sequences), batch_size_eval):
                batch_sequences = torch.FloatTensor(X_sequences[i:i+batch_size_eval]).to(device)
                reconstructed = self.lstm_autoencoder(batch_sequences)
                batch_errors = torch.mean((batch_sequences - reconstructed) ** 2, dim=(1, 2)).cpu().numpy()
                reconstruction_errors.extend(batch_errors)
        
        reconstruction_errors = np.array(reconstruction_errors)
        
        # Determine anomaly threshold (95th percentile)
        self.lstm_threshold = np.percentile(reconstruction_errors, 95)
        
        # Mark anomalies
        self.df['lstm_reconstruction_error'] = 0.0
        self.df['lstm_anomaly'] = 0
        
        for i, idx in enumerate(indices):
            self.df.loc[idx, 'lstm_reconstruction_error'] = reconstruction_errors[i]
            self.df.loc[idx, 'lstm_anomaly'] = int(reconstruction_errors[i] > self.lstm_threshold)
        
        anomaly_rate = self.df['lstm_anomaly'].mean()
        print(f"LSTM Autoencoder trained successfully!")
        print(f"Anomaly rate: {anomaly_rate:.3f}")
        print(f"Reconstruction error threshold: {self.lstm_threshold:.6f}")
        print(f"Final train loss: {avg_train_loss:.6f}")
        print(f"Final val loss: {avg_val_loss:.6f}")
        
        return {
            'train_losses': train_losses,
            'val_losses': val_losses,
            'threshold': self.lstm_threshold,
            'final_train_loss': avg_train_loss,
            'final_val_loss': avg_val_loss
        }
    
    def ensemble_anomaly_detection(self, weights=None):
        """
        FIXED: Combine multiple anomaly detection methods
        """
        print("Creating ensemble anomaly detection...")
        
        if weights is None:
            weights = {
                'isolation_forest': 0.35,
                'one_class_svm': 0.30,
                'lstm': 0.35
            }
        
        # Initialize ensemble score
        self.df['ensemble_anomaly_score'] = 0.0
        n_methods = 0
        
        # Isolation Forest contribution
        if 'if_anomaly_score' in self.df.columns:
            if_min, if_max = self.df['if_anomaly_score'].min(), self.df['if_anomaly_score'].max()
            if if_max > if_min:  # Avoid division by zero
                if_scores_norm = (self.df['if_anomaly_score'] - if_min) / (if_max - if_min)
                self.df['ensemble_anomaly_score'] += weights['isolation_forest'] * (1 - if_scores_norm)
                n_methods += 1
                print(f"✓ Isolation Forest scores integrated (range: {if_min:.4f} to {if_max:.4f})")
        
        # One-Class SVM contribution
        if 'svm_anomaly_score' in self.df.columns:
            svm_min, svm_max = self.df['svm_anomaly_score'].min(), self.df['svm_anomaly_score'].max()
            if svm_max > svm_min:  # Avoid division by zero
                svm_scores_norm = (self.df['svm_anomaly_score'] - svm_min) / (svm_max - svm_min)
                self.df['ensemble_anomaly_score'] += weights['one_class_svm'] * (1 - svm_scores_norm)
                n_methods += 1
                print(f"✓ SVM scores integrated (range: {svm_min:.4f} to {svm_max:.4f})")
        
        # LSTM contribution
        if 'lstm_reconstruction_error' in self.df.columns:
            lstm_max = self.df['lstm_reconstruction_error'].max()
            if lstm_max > 0:  # Avoid division by zero
                lstm_scores_norm = self.df['lstm_reconstruction_error'] / lstm_max
                self.df['ensemble_anomaly_score'] += weights['lstm'] * lstm_scores_norm
                n_methods += 1
                print(f"✓ LSTM scores integrated (max error: {lstm_max:.6f})")
        
        if n_methods == 0:
            print("⚠️ No trained models found for ensemble!")
            return self.df
        
        # Normalize ensemble score
        if n_methods > 0:
            self.df['ensemble_anomaly_score'] = self.df['ensemble_anomaly_score'] / sum(weights.values())
        
        # Determine final anomaly threshold (top 5% as anomalies)
        anomaly_threshold = np.percentile(self.df['ensemble_anomaly_score'], 95)
        self.df['ensemble_anomaly'] = (self.df['ensemble_anomaly_score'] > anomaly_threshold).astype(int)
        
        anomaly_rate = self.df['ensemble_anomaly'].mean()
        print(f"Ensemble anomaly detection complete.")
        print(f"Anomaly threshold: {anomaly_threshold:.4f}")
        print(f"Final anomaly rate: {anomaly_rate:.3f}")
        print(f"Methods used: {n_methods}/3")
        
        return self.df
    
    def save_models(self, save_dir='./saved_models/'):
        """
        Save all trained models and parameters
        """
        print("Saving all trained models...")
        
        # Create save directory
        os.makedirs(save_dir, exist_ok=True)
        
        models_saved = []
        
        # Save Isolation Forest
        if hasattr(self, 'isolation_forest') and self.isolation_forest is not None:
            joblib.dump(self.isolation_forest, os.path.join(save_dir, 'isolation_forest.pkl'))
            models_saved.append('Isolation Forest')
            print(f"✓ Isolation Forest saved to {save_dir}isolation_forest.pkl")
        
        # Save One-Class SVM
        if hasattr(self, 'one_class_svm') and self.one_class_svm is not None:
            joblib.dump(self.one_class_svm, os.path.join(save_dir, 'one_class_svm.pkl'))
            models_saved.append('One-Class SVM')
            print(f"✓ One-Class SVM saved to {save_dir}one_class_svm.pkl")
        
        # Save LSTM Autoencoder
        if hasattr(self, 'lstm_autoencoder') and self.lstm_autoencoder is not None:
            torch.save({
                'model_state_dict': self.lstm_autoencoder.state_dict(),
                'model_config': {
                    'input_dim': self.lstm_autoencoder.input_dim,
                    'hidden_dim': self.lstm_autoencoder.hidden_dim,
                    'latent_dim': self.lstm_autoencoder.latent_dim,
                    'num_layers': self.lstm_autoencoder.num_layers,
                    'sequence_length': self.lstm_autoencoder.sequence_length
                }
            }, os.path.join(save_dir, 'lstm_autoencoder.pth'))
            models_saved.append('LSTM Autoencoder')
            print(f"✓ LSTM Autoencoder saved to {save_dir}lstm_autoencoder.pth")
        
        # Save LSTM threshold
        if hasattr(self, 'lstm_threshold') and self.lstm_threshold is not None:
            with open(os.path.join(save_dir, 'lstm_threshold.json'), 'w') as f:
                json.dump({'lstm_threshold': float(self.lstm_threshold)}, f)
            models_saved.append('LSTM Threshold')
            print(f"✓ LSTM threshold saved to {save_dir}lstm_threshold.json")
        
        # Save Scaler
        if hasattr(self, 'scaler') and self.scaler is not None:
            joblib.dump(self.scaler, os.path.join(save_dir, 'scaler.pkl'))
            models_saved.append('Scaler')
            print(f"✓ Scaler saved to {save_dir}scaler.pkl")
        
        # Save feature names if available
        if hasattr(self, 'features_df') and self.features_df is not None:
            feature_names = list(self.features_df.columns)
            with open(os.path.join(save_dir, 'feature_names.json'), 'w') as f:
                json.dump({'feature_names': feature_names}, f)
            models_saved.append('Feature Names')
            print(f"✓ Feature names saved to {save_dir}feature_names.json")
        
        # Save model metadata
        metadata = {
            'models_saved': models_saved,
            'save_timestamp': pd.Timestamp.now().isoformat(),
            'device_used': str(device),
            'total_samples': len(self.df) if hasattr(self, 'df') else 0
        }
        
        with open(os.path.join(save_dir, 'model_metadata.json'), 'w') as f:
            json.dump(metadata, f, indent=2)
        
        print(f"\n🎉 Successfully saved {len(models_saved)} models and components:")
        for model in models_saved:
            print(f"   • {model}")
        print(f"📁 All models saved in: {save_dir}")
        
        return save_dir
    
    def load_models(self, save_dir='./saved_models/'):
        """
        Load all previously saved models and parameters
        """
        print(f"Loading models from {save_dir}...")
        
        if not os.path.exists(save_dir):
            print(f"❌ Save directory {save_dir} not found!")
            return False
        
        models_loaded = []
        
        # Load Isolation Forest
        if_path = os.path.join(save_dir, 'isolation_forest.pkl')
        if os.path.exists(if_path):
            self.isolation_forest = joblib.load(if_path)
            models_loaded.append('Isolation Forest')
            print(f"✓ Isolation Forest loaded from {if_path}")
        
        # Load One-Class SVM
        svm_path = os.path.join(save_dir, 'one_class_svm.pkl')
        if os.path.exists(svm_path):
            self.one_class_svm = joblib.load(svm_path)
            models_loaded.append('One-Class SVM')
            print(f"✓ One-Class SVM loaded from {svm_path}")
        
        # Load LSTM Autoencoder
        lstm_path = os.path.join(save_dir, 'lstm_autoencoder.pth')
        if os.path.exists(lstm_path):
            checkpoint = torch.load(lstm_path, map_location=device)
            config = checkpoint['model_config']
            
            self.lstm_autoencoder = LSTMAutoencoder(
                input_dim=config['input_dim'],
                hidden_dim=config['hidden_dim'],
                latent_dim=config['latent_dim'],
                num_layers=config['num_layers'],
                sequence_length=config['sequence_length']
            ).to(device)
            
            self.lstm_autoencoder.load_state_dict(checkpoint['model_state_dict'])
            self.lstm_autoencoder.eval()
            models_loaded.append('LSTM Autoencoder')
            print(f"✓ LSTM Autoencoder loaded from {lstm_path}")
        
        # Load LSTM threshold
        threshold_path = os.path.join(save_dir, 'lstm_threshold.json')
        if os.path.exists(threshold_path):
            with open(threshold_path, 'r') as f:
                threshold_data = json.load(f)
            self.lstm_threshold = threshold_data['lstm_threshold']
            models_loaded.append('LSTM Threshold')
            print(f"✓ LSTM threshold loaded from {threshold_path}")
        
        # Load Scaler
        scaler_path = os.path.join(save_dir, 'scaler.pkl')
        if os.path.exists(scaler_path):
            self.scaler = joblib.load(scaler_path)
            models_loaded.append('Scaler')
            print(f"✓ Scaler loaded from {scaler_path}")
        
        # Load feature names
        features_path = os.path.join(save_dir, 'feature_names.json')
        if os.path.exists(features_path):
            with open(features_path, 'r') as f:
                feature_data = json.load(f)
            self.feature_names = feature_data['feature_names']
            models_loaded.append('Feature Names')
            print(f"✓ Feature names loaded from {features_path}")
        
        # Load metadata
        metadata_path = os.path.join(save_dir, 'model_metadata.json')
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as f:
                self.model_metadata = json.load(f)
            print(f"✓ Model metadata loaded - saved on {self.model_metadata['save_timestamp']}")
        
        if models_loaded:
            print(f"\n🎉 Successfully loaded {len(models_loaded)} models:")
            for model in models_loaded:
                print(f"   • {model}")
            print(f"📁 Models loaded from: {save_dir}")
            return True
        else:
            print("❌ No models found to load!")
            return False
    
    def save_lstm_model_only(self, save_path='./lstm_model.pth'):
        """
        Save only the LSTM Autoencoder model and threshold
        """
        if not hasattr(self, 'lstm_autoencoder') or self.lstm_autoencoder is None:
            print("❌ No LSTM model to save!")
            return False
        
        save_data = {
            'model_state_dict': self.lstm_autoencoder.state_dict(),
            'model_config': {
                'input_dim': self.lstm_autoencoder.input_dim,
                'hidden_dim': self.lstm_autoencoder.hidden_dim,
                'latent_dim': self.lstm_autoencoder.latent_dim,
                'num_layers': self.lstm_autoencoder.num_layers,
                'sequence_length': self.lstm_autoencoder.sequence_length
            },
            'threshold': self.lstm_threshold if hasattr(self, 'lstm_threshold') else None,
            'device': str(device)
        }
        
        torch.save(save_data, save_path)
        print(f"✓ LSTM Autoencoder saved to {save_path}")
        return True
    
    def load_lstm_model_only(self, load_path='./lstm_model.pth'):
        """
        Load only the LSTM Autoencoder model and threshold
        """
        if not os.path.exists(load_path):
            print(f"❌ LSTM model file {load_path} not found!")
            return False
        
        checkpoint = torch.load(load_path, map_location=device)
        config = checkpoint['model_config']
        
        self.lstm_autoencoder = LSTMAutoencoder(
            input_dim=config['input_dim'],
            hidden_dim=config['hidden_dim'],
            latent_dim=config['latent_dim'],
            num_layers=config['num_layers'],
            sequence_length=config['sequence_length']
        ).to(device)
        
        self.lstm_autoencoder.load_state_dict(checkpoint['model_state_dict'])
        self.lstm_autoencoder.eval()
        
        if 'threshold' in checkpoint and checkpoint['threshold'] is not None:
            self.lstm_threshold = checkpoint['threshold']
        
        print(f"✓ LSTM Autoencoder loaded from {load_path}")
        return True
    
    def analyze_anomaly_patterns(self):
        """
        FIXED: Analyze detected anomalies with proper handling of NaN values
        """
        print("Analyzing anomaly patterns...")
        
        if 'ensemble_anomaly' not in self.df.columns:
            self.ensemble_anomaly_detection()
        
        anomalies = self.df[self.df['ensemble_anomaly'] == 1]
        normal = self.df[self.df['ensemble_anomaly'] == 0]
        
        # Safe calculation of statistics
        def safe_mean(series):
            return series.fillna(0).replace([np.inf, -np.inf], 0).mean()
        
        def safe_max(series):
            return series.fillna(0).replace([np.inf, -np.inf], 0).max()
        
        def safe_min(series):
            return series.fillna(0).replace([np.inf, -np.inf], 0).min()
        
        analysis = {
            'total_anomalies': len(anomalies),
            'anomaly_rate': len(anomalies) / len(self.df),
            'speed_comparison': {
                'anomaly_avg_speed': safe_mean(anomalies['spd']),
                'normal_avg_speed': safe_mean(normal['spd']),
                'anomaly_max_speed': safe_max(anomalies['spd'])
            },
            'acceleration_comparison': {
                'anomaly_avg_accel': safe_mean(anomalies['acceleration']),
                'normal_avg_accel': safe_mean(normal['acceleration']),
                'anomaly_max_accel': safe_max(anomalies['acceleration']),
                'anomaly_min_accel': safe_min(anomalies['acceleration'])
            },
            'behavior_patterns': {
                'hard_braking_rate': safe_mean(anomalies['hard_braking']),
                'hard_acceleration_rate': safe_mean(anomalies['hard_acceleration']),
                'sharp_turn_rate': safe_mean(anomalies['sharp_turn']),
                'erratic_steering_rate': safe_mean(anomalies['erratic_steering'])
            },
            'risk_scores': {
                'avg_overall_risk': safe_mean(anomalies['overall_risk']),
                'avg_acceleration_risk': safe_mean(anomalies['acceleration_risk']),
                'avg_lateral_risk': safe_mean(anomalies['lateral_risk']),
                'avg_speed_risk': safe_mean(anomalies['speed_risk'])
            }
        }
        
        return analysis
    
    def get_anomaly_report(self):
        """
        FIXED: Generate comprehensive anomaly detection report
        """
        analysis = self.analyze_anomaly_patterns()
        
        report = f"""
        🚨 DRIVING ANOMALY DETECTION REPORT (PyTorch - FIXED) 🚨
        
        📊 DATASET OVERVIEW:
        • Total Records: {len(self.df):,}
        • Unique Trips: {self.df['randomized_id'].nunique():,}
        • Total Anomalies Detected: {analysis['total_anomalies']:,}
        • Anomaly Rate: {analysis['anomaly_rate']:.2%}
        
        🏎️ SPEED ANALYSIS:
        • Normal Driving Avg Speed: {analysis['speed_comparison']['normal_avg_speed']:.1f} km/h
        • Anomalous Driving Avg Speed: {analysis['speed_comparison']['anomaly_avg_speed']:.1f} km/h
        • Maximum Anomalous Speed: {analysis['speed_comparison']['anomaly_max_speed']:.1f} km/h
        
        ⚡ ACCELERATION ANALYSIS:
        • Normal Avg Acceleration: {analysis['acceleration_comparison']['normal_avg_accel']:.2f} m/s²
        • Anomalous Avg Acceleration: {analysis['acceleration_comparison']['anomaly_avg_accel']:.2f} m/s²
        • Max Acceleration (Anomaly): {analysis['acceleration_comparison']['anomaly_max_accel']:.2f} m/s²
        • Min Acceleration (Hard Braking): {analysis['acceleration_comparison']['anomaly_min_accel']:.2f} m/s²
        
        🚨 DANGEROUS BEHAVIOR PATTERNS:
        • Hard Braking Rate: {analysis['behavior_patterns']['hard_braking_rate']:.1%}
        • Hard Acceleration Rate: {analysis['behavior_patterns']['hard_acceleration_rate']:.1%}
        • Sharp Turn Rate: {analysis['behavior_patterns']['sharp_turn_rate']:.1%}
        • Erratic Steering Rate: {analysis['behavior_patterns']['erratic_steering_rate']:.1%}
        
        ⚠️ RISK ASSESSMENT:
        • Overall Risk Score: {analysis['risk_scores']['avg_overall_risk']:.3f}
        • Acceleration Risk: {analysis['risk_scores']['avg_acceleration_risk']:.3f}
        • Lateral Force Risk: {analysis['risk_scores']['avg_lateral_risk']:.3f}
        • Speed Risk: {analysis['risk_scores']['avg_speed_risk']:.3f}
        
        🎯 MODEL PERFORMANCE (PyTorch - FIXED):
        • Device Used: {device}
        • Isolation Forest: {'✓' if hasattr(self, 'isolation_forest') and self.isolation_forest else '✗'}
        • One-Class SVM: {'✓' if hasattr(self, 'one_class_svm') and self.one_class_svm else '✗'}
        • LSTM Autoencoder (PyTorch): {'✓' if hasattr(self, 'lstm_autoencoder') and self.lstm_autoencoder else '✗'}
        • LSTM Threshold: {getattr(self, 'lstm_threshold', 'N/A')}
        • Ensemble Method: {'✓' if 'ensemble_anomaly' in self.df.columns else '✗'}
        """
        
        print(report)
        return analysis

def run_complete(df, output_dir='./anomaly_analysis_pytorch_fixed/', epochs=30):
    """
    Run complete anomaly detection pipeline with PyTorch
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    print("🚀 Starting Driving Anomaly Detection Pipeline")
    print("=" * 75)
    
    # Initialize detector
    detector = DrivingAnomalyDetectorPyTorchFixed(df)
    
    # Step 1: Feature Engineering
    print("\n1️⃣ Calculating physics features...")
    detector.calculate_driving_physics()
    
    print("\n2️⃣ Engineering anomaly features...")
    detector.engineer_anomaly_features()
    
    # Step 2: Train ML Models
    print("\n3️⃣ Training Isolation Forest...")
    detector.train_isolation_forest(contamination=0.05)
    
    print("\n4️⃣ Training One-Class SVM...")
    detector.train_one_class_svm(nu=0.05)
    
    # Step 3: Train PyTorch LSTM Model with proper epochs
    print(f"\n5️⃣ Training LSTM Autoencoder (PyTorch) for {epochs} epochs...")
    lstm_history = detector.train_lstm_autoencoder(sequence_length=15, epochs=500, batch_size=32)
    
    # Step 4: Ensemble Method
    print("\n6️⃣ Creating ensemble anomaly detection...")
    detector.ensemble_anomaly_detection()
    
    # Step 5: Analysis
    print("\n7️⃣ Generating comprehensive report...")
    analysis = detector.get_anomaly_report()
    
    # Step 6: Save Results and Models
    print("\n💾 Saving results and models...")
    detector.df.to_csv(f'{output_dir}/anomaly_results_fixed.csv', index=False)
    
    # Save all trained models
    model_save_dir = os.path.join(output_dir, 'models')
    detector.save_models(model_save_dir)
    
    print(f"\n✅ Analysis complete! Results and models saved in: {output_dir}")
    
    return detector, analysis

def load_trained_models(df, model_dir='./saved_models/'):
    """
    Create a new detector instance and load previously trained models
    
    Args:
        df: New dataframe to analyze
        model_dir: Directory containing saved models
    
    Returns:
        detector: DrivingAnomalyDetectorPyTorchFixed instance with loaded models
    """
    print("🔄 Creating detector with pre-trained models...")
    
    # Initialize detector with new data
    detector = DrivingAnomalyDetectorPyTorchFixed(df)
    
    # Load pre-trained models
    success = detector.load_models(model_dir)
    
    if success:
        print("✅ Ready to analyze new data with pre-trained models!")
        return detector
    else:
        print("❌ Failed to load models. You may need to train new models.")
        return detector

# Example usage with better parameters
if __name__ == "__main__":
    # Generate synthetic driving data
    print("Generating synthetic driving data with anomalies...")
    
    np.random.seed(42)
    n_trips = 2000
    data = []
    
    for trip_id in range(n_trips):
        trip_length = np.random.randint(20, 100)
        base_speed = np.random.uniform(30, 70)
        base_lat = np.random.uniform(55.7, 55.8)
        base_lng = np.random.uniform(37.5, 37.7)
        is_anomalous = np.random.random() < 0.1
        
        for i in range(trip_length):
            if not is_anomalous or np.random.random() > 0.3:
                speed = base_speed + np.random.normal(0, 5)
                speed = max(0, speed)
                lat = base_lat + i * 0.001 + np.random.normal(0, 0.0001)
                lng = base_lng + i * 0.001 + np.random.normal(0, 0.0001)
                azm = np.random.uniform(0, 360)
            else:
                anomaly_type = np.random.choice(['speeding', 'hard_braking', 'sharp_turn'])
                if anomaly_type == 'speeding':
                    speed = np.random.uniform(90, 150)
                elif anomaly_type == 'hard_braking':
                    speed = max(0, base_speed - np.random.uniform(30, 50))
                else:
                    speed = base_speed
                    azm = (np.random.uniform(0, 360) + np.random.uniform(45, 90)) % 360
                
                lat = base_lat + i * 0.001 + np.random.normal(0, 0.0002)
                lng = base_lng + i * 0.001 + np.random.normal(0, 0.0002)
                if 'azm' not in locals():
                    azm = np.random.uniform(0, 360)
            
            data.append({
                'randomized_id': f'trip_{trip_id}',
                'lat': lat,
                'lng': lng,
                'alt': np.random.uniform(100, 200),
                'spd': speed,
                'azm': azm
            })
    
    df = pd.DataFrame(data)
    print(f"Generated {len(df)} data points from {n_trips} trips")
    
    # Run FIXED analysis with proper epochs
    detector, analysis = run_complete(df)

Using device: cuda
Generating synthetic driving data with anomalies...
Generated 118166 data points from 2000 trips
🚀 Starting Driving Anomaly Detection Pipeline

1️⃣ Calculating physics features...
Calculating driving physics features...
Physics features calculated successfully

2️⃣ Engineering anomaly features...
Engineering anomaly detection features...
Anomaly features engineered successfully

3️⃣ Training Isolation Forest...
Training Isolation Forest...
Prepared 18 features for ML models
Feature matrix shape: (118166, 18)
Any NaN values: 0
Any infinite values: 0
Isolation Forest trained. Anomaly rate: 0.050

4️⃣ Training One-Class SVM...
Training One-Class SVM...
One-Class SVM trained. Anomaly rate: 0.050

5️⃣ Training LSTM Autoencoder (PyTorch) for 30 epochs...
Training LSTM Autoencoder with PyTorch...
Created 90166 sequences of length 15
Training on cuda for 500 epochs...
Epoch [1/500], Train Loss: 12.096553, Val Loss: 10.446937
Epoch [5/500], Train Loss: 6.435770, Val Loss: 5.3

In [2]:
class RealTimeAnomalyDetectorFinalFixed:
    """  
    FINAL FIX: Real-time anomaly detection with proper confidence scoring
    """
    
    def __init__(self, trained_detector):
        self.detector = trained_detector
        self.buffer = []
        self.buffer_size = 20
        self.alert_threshold = 0.3  # Adjusted threshold
        
        # Pre-calculate normalization parameters from training data
        self.setup_normalization_parameters()
        
    def setup_normalization_parameters(self):
        """Setup normalization parameters from training data"""
        if hasattr(self.detector, 'df') and 'if_anomaly_score' in self.detector.df.columns:
            self.if_min = self.detector.df['if_anomaly_score'].min()
            self.if_max = self.detector.df['if_anomaly_score'].max()
        else:
            self.if_min, self.if_max = -0.5, 0.5
            
        if hasattr(self.detector, 'df') and 'svm_anomaly_score' in self.detector.df.columns:
            self.svm_min = self.detector.df['svm_anomaly_score'].min()
            self.svm_max = self.detector.df['svm_anomaly_score'].max()
        else:
            self.svm_min, self.svm_max = -2.0, 2.0
            
        print(f"Normalization setup - IF range: [{self.if_min:.4f}, {self.if_max:.4f}]")
        print(f"Normalization setup - SVM range: [{self.svm_min:.4f}, {self.svm_max:.4f}]")
        print(f"LSTM threshold: {getattr(self.detector, 'lstm_threshold', 'N/A')}")
        
    def process_real_time_point(self, data_point):
        """
        FINAL FIXED: Process a single data point in real-time
        """
        # Add to buffer
        self.buffer.append(data_point)
        
        # Maintain buffer size
        if len(self.buffer) > self.buffer_size:
            self.buffer.pop(0)
        
        # Need minimum points for analysis
        if len(self.buffer) < 5:
            return {
                'anomaly_detected': False,
                'confidence': 0.0,
                'alert_level': 'NORMAL',
                'buffer_size': len(self.buffer),
                'message': f'Building buffer... {len(self.buffer)}/{self.buffer_size}'
            }
        
        try:
            # Calculate features for current window
            df_buffer = pd.DataFrame(self.buffer)
            df_buffer['randomized_id'] = 'real_time'
            
            # Use the trained detector's feature engineering
            temp_detector = DrivingAnomalyDetectorPyTorchFixed(df_buffer)
            temp_detector.scaler = self.detector.scaler  # Use trained scaler
            temp_detector.calculate_driving_physics()
            temp_detector.engineer_anomaly_features()
            
            # Get the latest point's features
            features_df = temp_detector.prepare_ml_features()
            if len(features_df) == 0:
                return {'anomaly_detected': False, 'confidence': 0.0, 'alert_level': 'NORMAL', 'error': 'No features'}
            
            latest_features = features_df.iloc[-1:].values
            latest_scaled = self.detector.scaler.transform(latest_features)
            
            # Get anomaly scores from trained models
            scores = {}
            
            # Isolation Forest
            if self.detector.isolation_forest:
                if_score = self.detector.isolation_forest.decision_function(latest_scaled)[0]
                scores['isolation_forest'] = if_score
            
            # One-Class SVM  
            if self.detector.one_class_svm:
                svm_score = self.detector.one_class_svm.decision_function(latest_scaled)[0]
                scores['one_class_svm'] = svm_score
            
            # LSTM Autoencoder
            lstm_score = 0.0
            if self.detector.lstm_autoencoder and len(self.buffer) >= 15:  # Need at least 15 points
                try:
                    # Get last 15 points for sequence
                    sequence_features = features_df.iloc[-15:].values
                    sequence_scaled = self.detector.scaler.transform(sequence_features)
                    sequence_tensor = torch.FloatTensor(sequence_scaled).unsqueeze(0).to(device)
                    
                    self.detector.lstm_autoencoder.eval()
                    with torch.no_grad():
                        reconstructed = self.detector.lstm_autoencoder(sequence_tensor)
                        reconstruction_error = torch.mean((sequence_tensor - reconstructed) ** 2).item()
                        scores['lstm'] = reconstruction_error
                        lstm_score = reconstruction_error
                except Exception as e:
                    print(f"LSTM error: {e}")
                    scores['lstm'] = 0.0
            
            # Calculate ensemble score with proper normalization
            ensemble_score = 0.0
            score_components = []
            
            # Isolation Forest contribution (lower = more anomalous)
            if 'isolation_forest' in scores:
                if_raw = scores['isolation_forest']
                if_range = self.if_max - self.if_min
                if if_range > 0:
                    if_normalized = (if_raw - self.if_min) / if_range
                    if_anomaly_score = 1.0 - np.clip(if_normalized, 0, 1)  # Invert: lower IF score = higher anomaly
                else:
                    if_anomaly_score = 0.5
                
                ensemble_score += 0.35 * if_anomaly_score
                score_components.append(f"IF: {if_anomaly_score:.3f}")
            
            # SVM contribution (negative = more anomalous)
            if 'one_class_svm' in scores:
                svm_raw = scores['one_class_svm']
                svm_range = self.svm_max - self.svm_min
                if svm_range > 0:
                    svm_normalized = (svm_raw - self.svm_min) / svm_range
                    svm_anomaly_score = 1.0 - np.clip(svm_normalized, 0, 1)  # Invert: lower SVM score = higher anomaly
                else:
                    svm_anomaly_score = 0.5
                
                ensemble_score += 0.30 * svm_anomaly_score
                score_components.append(f"SVM: {svm_anomaly_score:.3f}")
            
            # LSTM contribution (higher reconstruction error = more anomalous)
            if 'lstm' in scores and hasattr(self.detector, 'lstm_threshold'):
                lstm_raw = scores['lstm']
                lstm_threshold = self.detector.lstm_threshold
                if lstm_threshold > 0:
                    lstm_anomaly_score = np.clip(lstm_raw / lstm_threshold, 0, 1)
                else:
                    lstm_anomaly_score = 0.0
                
                ensemble_score += 0.35 * lstm_anomaly_score
                score_components.append(f"LSTM: {lstm_anomaly_score:.3f}")
            
            # Add rule-based component from current driving behavior
            current_point = temp_detector.df.iloc[-1]
            rule_based_score = current_point['overall_risk']
            ensemble_score += 0.1 * rule_based_score  # Small weight for immediate behavior
            score_components.append(f"Rule: {rule_based_score:.3f}")
            
            # Normalize ensemble score to 0-1 range
            ensemble_score = np.clip(ensemble_score, 0, 1)
            
            # Determine alert level
            if ensemble_score > 0.8:
                alert_level = 'CRITICAL'
            elif ensemble_score > 0.6:
                alert_level = 'HIGH'  
            elif ensemble_score > 0.4:
                alert_level = 'MEDIUM'
            elif ensemble_score > 0.2:
                alert_level = 'LOW'
            else:
                alert_level = 'NORMAL'
            
            return {
                'anomaly_detected': ensemble_score > self.alert_threshold,
                'confidence': ensemble_score,
                'alert_level': alert_level,
                'raw_scores': scores,
                'score_components': score_components,
                'details': {
                    'speed': current_point['spd'],
                    'acceleration': current_point['acceleration'],
                    'lateral_acceleration': current_point['lateral_acceleration'],
                    'overall_risk': current_point['overall_risk'],
                    'risk_factors': {
                        'hard_braking': bool(current_point['hard_braking']),
                        'hard_acceleration': bool(current_point['hard_acceleration']),
                        'sharp_turn': bool(current_point['sharp_turn']),
                        'erratic_steering': bool(current_point['erratic_steering'])
                    }
                },
                'buffer_info': {
                    'size': len(self.buffer),
                    'lstm_ready': len(self.buffer) >= 15
                }
            }
            
        except Exception as e:
            print(f"Error in real-time processing: {e}")
            import traceback
            traceback.print_exc()
            return {
                'anomaly_detected': False,
                'confidence': 0.0,
                'alert_level': 'ERROR',
                'error': str(e)
            }

# Test the final fixed version
def test_realtime_final_fix(detector):
    """Test the final fixed real-time detector"""
    print("\n🔴 Testing FINAL Real-Time Anomaly Detection:")
    
    real_time_detector = RealTimeAnomalyDetectorFinalFixed(detector)
    
    # More comprehensive test points
    test_points = [
        # Normal driving
        {'lat': 55.75, 'lng': 37.6, 'spd': 45, 'azm': 90, 'alt': 150},
        {'lat': 55.751, 'lng': 37.601, 'spd': 47, 'azm': 92, 'alt': 152},
        {'lat': 55.752, 'lng': 37.602, 'spd': 46, 'azm': 94, 'alt': 154},
        {'lat': 55.753, 'lng': 37.603, 'spd': 48, 'azm': 96, 'alt': 156},
        {'lat': 55.754, 'lng': 37.604, 'spd': 49, 'azm': 98, 'alt': 158},
        
        # Build up buffer with normal points
        *[{'lat': 55.75 + i*0.001, 'lng': 37.6 + i*0.001, 'spd': 45 + i, 'azm': 90 + i*2, 'alt': 150 + i} 
          for i in range(5, 15)],
        
        # Now test anomalous behaviors
        {'lat': 55.765, 'lng': 37.615, 'spd': 120, 'azm': 120, 'alt': 170},  # Excessive speed
        {'lat': 55.766, 'lng': 37.616, 'spd': 15, 'azm': 125, 'alt': 172},   # Hard braking
        {'lat': 55.767, 'lng': 37.617, 'spd': 50, 'azm': 200, 'alt': 174},   # Sharp turn
        {'lat': 55.768, 'lng': 37.618, 'spd': 55, 'azm': 140, 'alt': 176},   # Recovery
    ]
    
    for i, point in enumerate(test_points):
        result = real_time_detector.process_real_time_point(point)
        
        if i < 15:  # Buffer building phase
            if i % 5 == 0:  # Show every 5th point during buffer building
                print(f"Point {i+1}: {result['alert_level']} (Confidence: {result['confidence']:.3f}) - {result.get('message', '')}")
        else:  # Testing phase
            print(f"\nPoint {i+1}: {result['alert_level']} (Confidence: {result['confidence']:.3f})")
            if 'score_components' in result:
                print(f"  Score breakdown: {', '.join(result['score_components'])}")
            if 'raw_scores' in result:
                print(f"  Raw scores: {result['raw_scores']}")
            if result['anomaly_detected']:
                print(f"  ⚠️ ANOMALY DETECTED! Speed: {result['details']['speed']:.1f} km/h")
                print(f"  Risk factors: {result['details']['risk_factors']}")
            print(f"  Buffer: {result['buffer_info']['size']}/20, LSTM ready: {result['buffer_info']['lstm_ready']}")

# Run the test
if __name__ == "__main__":
    # Assuming you have the detector from the previous run
    test_realtime_final_fix(detector)


🔴 Testing FINAL Real-Time Anomaly Detection:
Normalization setup - IF range: [-0.2400, 0.1680]
Normalization setup - SVM range: [-381.6356, 106.7346]
LSTM threshold: 2.9153685569763184
Point 1: NORMAL (Confidence: 0.000) - Building buffer... 1/20
Calculating driving physics features...
Physics features calculated successfully
Engineering anomaly detection features...
Anomaly features engineered successfully
Prepared 18 features for ML models
Feature matrix shape: (5, 18)
Any NaN values: 0
Any infinite values: 0
Calculating driving physics features...
Physics features calculated successfully
Engineering anomaly detection features...
Anomaly features engineered successfully
Prepared 18 features for ML models
Feature matrix shape: (6, 18)
Any NaN values: 0
Any infinite values: 0
Point 6: MEDIUM (Confidence: 0.421) - 
Calculating driving physics features...
Physics features calculated successfully
Engineering anomaly detection features...
Anomaly features engineered successfully
Prepared 

In [20]:
import os
import json
import time
import logging
import numpy as np
import pandas as pd
import torch
import joblib
from datetime import datetime
from collections import deque
from typing import Dict, List, Optional, Any
import asyncio
import aiofiles
from dataclasses import dataclass, asdict
from pathlib import Path
from scipy.signal import savgol_filter

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

@dataclass
class GPSPoint:
    """GPS data point from tracker - matches your dataset structure"""
    vehicle_id: str  # This will be our randomized_id
    lat: float
    lng: float
    alt: float
    spd: float  # speed in km/h
    azm: float  # azimuth/heading 0-360
    timestamp: str = None  # Added for real-time tracking
    
    @classmethod
    def from_tracker_data(cls, tracker_data: Dict) -> 'GPSPoint':
        """Convert from real GPS tracker format to our dataset format"""
        return cls(
            vehicle_id=tracker_data.get('vehicle_id', tracker_data.get('device_id')),
            lat=tracker_data['lat'],
            lng=tracker_data['lng'],
            alt=tracker_data.get('alt', tracker_data.get('altitude', 0.0)),
            spd=tracker_data.get('spd', tracker_data.get('speed', 0.0)),
            azm=tracker_data.get('azm', tracker_data.get('heading', 0.0)),
            timestamp=tracker_data.get('timestamp', datetime.now().isoformat())
        )
    
    def to_dataset_format(self) -> Dict:
        """Convert to the format expected by your trained model"""
        return {
            'randomized_id': self.vehicle_id,
            'lat': self.lat,
            'lng': self.lng,
            'alt': self.alt,
            'spd': self.spd,
            'azm': self.azm
        }

@dataclass
class AnomalyResult:
    """Anomaly detection result"""
    timestamp: str
    vehicle_id: str
    anomaly_detected: bool
    confidence: float
    alert_level: str
    raw_scores: Dict[str, float]
    driving_metrics: Dict[str, float]
    risk_factors: Dict[str, bool]
    
    def to_dict(self) -> Dict:
        return asdict(self)

# Import the LSTM model from your training code
class LSTMAutoencoder(torch.nn.Module):
    """LSTM Autoencoder - same as your training code"""
    
    def __init__(self, input_dim, hidden_dim=64, latent_dim=10, num_layers=2, sequence_length=20):
        super(LSTMAutoencoder, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.sequence_length = sequence_length
        
        # Encoder
        self.encoder_lstm = torch.nn.LSTM(
            input_dim, hidden_dim, num_layers, 
            batch_first=True, dropout=0.2 if num_layers > 1 else 0
        )
        self.encoder_fc = torch.nn.Linear(hidden_dim, latent_dim)
        
        # Decoder
        self.decoder_fc = torch.nn.Linear(latent_dim, hidden_dim)
        self.decoder_lstm = torch.nn.LSTM(
            hidden_dim, hidden_dim, num_layers, 
            batch_first=True, dropout=0.2 if num_layers > 1 else 0
        )
        self.output_projection = torch.nn.Linear(hidden_dim, input_dim)
        
        self.dropout = torch.nn.Dropout(0.2)
        
    def encode(self, x):
        lstm_out, (hidden, cell) = self.encoder_lstm(x)
        encoded = self.encoder_fc(hidden[-1])
        return encoded
    
    def decode(self, encoded):
        batch_size = encoded.size(0)
        decoded = self.decoder_fc(encoded)
        decoded = decoded.unsqueeze(1).repeat(1, self.sequence_length, 1)
        lstm_out, _ = self.decoder_lstm(decoded)
        output = self.output_projection(lstm_out)
        return output
    
    def forward(self, x):
        encoded = self.encode(x)
        decoded = self.decode(encoded)
        return decoded

class ProductionAnomalyDetector:
    """
    Production-ready driving anomaly detection system
    Works with your exact dataset format: randomized_id,lat,lng,alt,spd,azm
    """
    
    def __init__(self, model_dir: str, config: Dict = None):
        """
        Initialize with pre-trained models
        """
        self.model_dir = Path(model_dir)
        self.config = config or self._default_config()
        
        # Model components
        self.scaler = None
        self.isolation_forest = None
        self.one_class_svm = None
        self.lstm_autoencoder = None
        self.lstm_threshold = None
        
        # Vehicle buffers for real-time processing
        self.vehicle_buffers = {}  # vehicle_id -> deque of GPS points
        self.buffer_size = self.config['buffer_size']
        
        # Normalization parameters
        self.if_min = None
        self.if_max = None
        self.svm_min = None
        self.svm_max = None
        
        # Load models
        self._load_models()
        
        logger.info(f"ProductionAnomalyDetector initialized with models from {model_dir}")
        logger.info(f"Using device: {device}")
        
    def _default_config(self) -> Dict:
        """Default configuration matching your training setup"""
        return {
            'buffer_size': 20,
            'min_points_for_detection': 5,
            'lstm_sequence_length': 15,
            'alert_threshold': 0.3,
            'weights': {
                'isolation_forest': 0.35,
                'one_class_svm': 0.30,
                'lstm': 0.35
            }
        }
    
    def _load_models(self):
        """Load all pre-trained models"""
        try:
            # Load scaler (required)
            scaler_path = self.model_dir / 'scaler.pkl'
            if scaler_path.exists():
                self.scaler = joblib.load(scaler_path)
                logger.info("✓ Feature scaler loaded")
            else:
                raise FileNotFoundError(f"Feature scaler not found: {scaler_path}")
            
            # Load Isolation Forest
            if_path = self.model_dir / 'isolation_forest.pkl'
            if if_path.exists():
                self.isolation_forest = joblib.load(if_path)
                logger.info("✓ Isolation Forest loaded")
            
            # Load One-Class SVM
            svm_path = self.model_dir / 'one_class_svm.pkl'
            if svm_path.exists():
                self.one_class_svm = joblib.load(svm_path)
                logger.info("✓ One-Class SVM loaded")
            
            # Load LSTM Autoencoder
            lstm_path = self.model_dir / 'lstm_autoencoder.pth'
            if lstm_path.exists():
                checkpoint = torch.load(lstm_path, map_location=device)
                lstm_config = checkpoint["model_config"]
                self.lstm_autoencoder = LSTMAutoencoder(**lstm_config).to(device)
            
                self.lstm_autoencoder.load_state_dict(checkpoint["model_state_dict"])
                self.lstm_autoencoder.eval()
                logger.info("✓ LSTM Autoencoder loaded")
                self.lstm_threshold = 2.9153685569763184 # fallback threshold
                logger.info(f"✓ LSTM threshold: {self.lstm_threshold}")

            # Load normalization parameters
            norm_path = self.model_dir / 'normalization_params.json'
            if norm_path.exists():
                with open(norm_path, 'r') as f:
                    norm_params = json.load(f)
                    self.if_min = norm_params.get('if_min', -0.2400)
                    self.if_max = norm_params.get('if_max', 0.1680)
                    self.svm_min = norm_params.get('svm_min', -381.6356)
                    self.svm_max = norm_params.get('svm_max', 106.7346)
                logger.info("✓ Normalization parameters loaded")
            else:
                # Use your actual training values
                self.if_min, self.if_max = -0.2400, 0.1680
                self.svm_min, self.svm_max = -381.6356, 106.7346
                logger.info("Using training normalization parameters")
            
            logger.info("All models loaded successfully!")
            
        except Exception as e:
            logger.error(f"Error loading models: {e}")
            raise
    
    def process_gps_point(self, gps_point: GPSPoint) -> Optional[AnomalyResult]:
        """
        Process a single GPS point - main entry point for real-time detection
        """
        vehicle_id = gps_point.vehicle_id
        
        # Initialize vehicle buffer if needed
        if vehicle_id not in self.vehicle_buffers:
            self.vehicle_buffers[vehicle_id] = deque(maxlen=self.buffer_size)
        
        # Add point to buffer
        self.vehicle_buffers[vehicle_id].append(gps_point)
        buffer = self.vehicle_buffers[vehicle_id]
        
        # Need minimum points for detection
        if len(buffer) < self.config['min_points_for_detection']:
            return None
        
        try:
            # Convert buffer to DataFrame in your exact format
            buffer_data = []
            for point in buffer:
                buffer_data.append(point.to_dataset_format())
            
            df_buffer = pd.DataFrame(buffer_data)
            
            # Calculate features using your exact feature engineering pipeline
            features_df = self._calculate_features_exact_pipeline(df_buffer)
            
            if len(features_df) == 0:
                return None
            
            # Get latest point features
            latest_features = features_df.iloc[-1:].values
            latest_scaled = self.scaler.transform(latest_features)
            
            # Get anomaly scores
            scores = self._get_anomaly_scores(features_df, latest_scaled)
            
            # Calculate ensemble score
            ensemble_score = self._calculate_ensemble_score(scores)
            
            # Determine alert level
            alert_level = self._get_alert_level(ensemble_score)
            
            # Extract metrics from the processed features
            latest_processed = features_df.iloc[-1]
            driving_metrics = self._extract_driving_metrics_from_features(latest_processed)
            risk_factors = self._extract_risk_factors_from_features(latest_processed)
            
            return AnomalyResult(
                timestamp=gps_point.timestamp or datetime.now().isoformat(),
                vehicle_id=vehicle_id,
                anomaly_detected=ensemble_score > self.config['alert_threshold'],
                confidence=float(ensemble_score),
                alert_level=alert_level,
                raw_scores=scores,
                driving_metrics=driving_metrics,
                risk_factors=risk_factors
            )
            
        except Exception as e:
            logger.error(f"Error processing GPS point for vehicle {vehicle_id}: {e}")
            return None
    
    def _calculate_features_exact_pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate features using EXACT same pipeline as your training code
        Input: DataFrame with columns [randomized_id, lat, lng, alt, spd, azm]
        Output: DataFrame with 18 features ready for ML models
        """
        # Apply the EXACT same feature engineering as your training
        df_processed = self._apply_physics_calculations(df.copy())
        df_processed = self._apply_anomaly_feature_engineering(df_processed)
        features_df = self._prepare_ml_features_exact(df_processed)
        
        return features_df
    
    def _apply_physics_calculations(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply exact physics calculations from your training code"""
        
        # Sort by trip and create sequence
        df = df.sort_values(['randomized_id', 'lat', 'lng'])
        df['sequence'] = df.groupby('randomized_id').cumcount()
        df['time_delta'] = 1.0  # 1 second intervals
        
        def calculate_trip_features(group):
            if len(group) < 3:
                # Fill with safe defaults for short trips
                group['distance'] = 0.0
                group['speed_smooth'] = group['spd']
                group['acceleration'] = 0.0
                group['jerk'] = 0.0
                group['angular_velocity'] = 0.0
                group['lateral_acceleration'] = 0.0
                group['heading_change_rate'] = 0.0
                group['curvature'] = 0.0
                return group
            
            # Haversine distance calculation
            def haversine_distance(lat1, lon1, lat2, lon2):
                R = 6371000  # Earth radius in meters
                lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
                dlat = lat2 - lat1
                dlon = lon2 - lon1
                a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
                c = 2 * np.arcsin(np.sqrt(np.clip(a, 0, 1)))
                return R * c
            
            # Calculate distances
            distances = [0]
            for i in range(1, len(group)):
                try:
                    dist = haversine_distance(
                        group.iloc[i-1]['lat'], group.iloc[i-1]['lng'],
                        group.iloc[i]['lat'], group.iloc[i]['lng']
                    )
                    dist = min(dist, 1000)  # Cap at 1km to avoid GPS errors
                    distances.append(dist)
                except:
                    distances.append(0)
            
            group['distance'] = distances
            
            # Smooth speed data
            if len(group) >= 5:
                try:
                    group['speed_smooth'] = savgol_filter(group['spd'], 5, 2)
                except:
                    group['speed_smooth'] = group['spd']
            else:
                group['speed_smooth'] = group['spd']
            
            group['speed_smooth'] = np.maximum(group['speed_smooth'], 0)
            
            # Calculate acceleration
            speed_ms = group['speed_smooth'] / 3.6  # km/h to m/s
            try:
                acceleration = np.gradient(speed_ms, group['time_delta'])
                acceleration = np.clip(acceleration, -15, 15)
            except:
                acceleration = np.zeros(len(group))
            group['acceleration'] = acceleration
            
            # Calculate jerk
            try:
                jerk = np.gradient(acceleration, group['time_delta'])
                jerk = np.clip(jerk, -20, 20)
            except:
                jerk = np.zeros(len(group))
            group['jerk'] = jerk
            
            # Calculate angular velocity
            try:
                azimuth_rad = np.radians(group['azm'])
                azimuth_unwrapped = np.unwrap(azimuth_rad)
                angular_velocity = np.gradient(azimuth_unwrapped, group['time_delta'])
                angular_velocity = np.clip(angular_velocity, -np.pi, np.pi)
            except:
                angular_velocity = np.zeros(len(group))
            group['angular_velocity'] = angular_velocity
            
            # Calculate lateral acceleration
            lateral_acceleration = speed_ms * angular_velocity
            lateral_acceleration = np.clip(lateral_acceleration, -20, 20)
            group['lateral_acceleration'] = lateral_acceleration
            
            # Calculate heading change rate
            group['heading_change_rate'] = np.abs(angular_velocity)
            
            # Calculate curvature with safe division
            denominator = speed_ms + 0.1
            group['curvature'] = np.divide(
                np.abs(angular_velocity), 
                denominator, 
                out=np.zeros_like(angular_velocity), 
                where=denominator!=0
            )
            
            return group
        
        df = df.groupby('randomized_id').apply(calculate_trip_features)
        df = df.reset_index(drop=True)
        
        # Clean any remaining NaN/inf values
        numeric_columns = ['distance', 'speed_smooth', 'acceleration', 'jerk', 
                          'angular_velocity', 'lateral_acceleration', 'heading_change_rate', 'curvature']
        
        for col in numeric_columns:
            if col in df.columns:
                df[col] = df[col].fillna(0)
                df[col] = df[col].replace([np.inf, -np.inf], 0)
        
        return df
    
    def _apply_anomaly_feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply exact anomaly feature engineering from your training code"""
        
        # Rolling window statistics
        window_sizes = [3, 5, 10]
        
        for window in window_sizes:
            try:
                # Speed patterns
                df[f'speed_std_{window}'] = df.groupby('randomized_id')['spd'].rolling(
                    window, center=True, min_periods=1).std().reset_index(0, drop=True).fillna(0)
                df[f'speed_max_{window}'] = df.groupby('randomized_id')['spd'].rolling(
                    window, center=True, min_periods=1).max().reset_index(0, drop=True).fillna(0)
                df[f'speed_min_{window}'] = df.groupby('randomized_id')['spd'].rolling(
                    window, center=True, min_periods=1).min().reset_index(0, drop=True).fillna(0)
                
                # Acceleration patterns
                df[f'accel_std_{window}'] = df.groupby('randomized_id')['acceleration'].rolling(
                    window, center=True, min_periods=1).std().reset_index(0, drop=True).fillna(0)
                df[f'accel_max_{window}'] = df.groupby('randomized_id')['acceleration'].rolling(
                    window, center=True, min_periods=1).max().reset_index(0, drop=True).fillna(0)
                df[f'accel_min_{window}'] = df.groupby('randomized_id')['acceleration'].rolling(
                    window, center=True, min_periods=1).min().reset_index(0, drop=True).fillna(0)
            except:
                # Fallback values
                df[f'speed_std_{window}'] = 0
                df[f'speed_max_{window}'] = df['spd']
                df[f'speed_min_{window}'] = df['spd']
                df[f'accel_std_{window}'] = 0
                df[f'accel_max_{window}'] = df['acceleration']
                df[f'accel_min_{window}'] = df['acceleration']
        
        # Extreme behavior indicators (exact thresholds from training)
        df['hard_braking'] = (df['acceleration'] < -4.0).astype(int)
        df['hard_acceleration'] = (df['acceleration'] > 3.0).astype(int)
        df['excessive_speed'] = (df['spd'] > 80).astype(int)
        df['sharp_turn'] = (np.abs(df['lateral_acceleration']) > 4.0).astype(int)
        df['erratic_steering'] = (np.abs(df['heading_change_rate']) > 0.5).astype(int)
        
        # Composite risk scores (exact same calculations)
        df['acceleration_risk'] = np.clip(np.abs(df['acceleration']) / 10.0, 0, 1)
        df['jerk_risk'] = np.clip(np.abs(df['jerk']) / 5.0, 0, 1)
        df['lateral_risk'] = np.clip(np.abs(df['lateral_acceleration']) / 8.0, 0, 1)
        df['speed_risk'] = np.clip(np.maximum(0, (df['spd'] - 60) / 40.0), 0, 1)
        
        # Overall risk score (exact same weights)
        df['overall_risk'] = (
            df['acceleration_risk'] * 0.25 +
            df['jerk_risk'] * 0.20 +
            df['lateral_risk'] * 0.25 +
            df['speed_risk'] * 0.15 +
            (df['hard_braking'] + df['hard_acceleration'] + 
             df['sharp_turn'] + df['erratic_steering']) * 0.15 / 4
        )
        
        df['overall_risk'] = np.clip(df['overall_risk'], 0, 1)
        
        return df
    
    def _prepare_ml_features_exact(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepare exact same 18 features as in training"""
        
        # Exact same feature columns as your training
        feature_columns = [
            'spd', 'acceleration', 'jerk', 'angular_velocity', 'lateral_acceleration',
            'heading_change_rate', 'curvature', 'overall_risk',
            'speed_std_3', 'speed_std_5', 'speed_std_10',
            'accel_std_3', 'accel_std_5', 'accel_std_10',
            'acceleration_risk', 'jerk_risk', 'lateral_risk', 'speed_risk'
        ]
        
        features_df = df[feature_columns].copy()
        
        # Clean any remaining issues
        for col in feature_columns:
            features_df[col] = features_df[col].fillna(0)
            features_df[col] = features_df[col].replace([np.inf, -np.inf], 0)
        
        return features_df
    
    def _get_anomaly_scores(self, features_df: pd.DataFrame, latest_scaled: np.ndarray) -> Dict[str, float]:
        """Get anomaly scores from all models"""
        scores = {}
        
        # Isolation Forest
        if self.isolation_forest:
            scores['isolation_forest'] = float(self.isolation_forest.decision_function(latest_scaled)[0])
        
        # One-Class SVM
        if self.one_class_svm:
            scores['one_class_svm'] = float(self.one_class_svm.decision_function(latest_scaled)[0])
        
        # LSTM Autoencoder
        if self.lstm_autoencoder and len(features_df) >= self.config['lstm_sequence_length']:
            try:
                sequence_length = self.config['lstm_sequence_length']
                sequence_features = features_df.iloc[-sequence_length:].values
                sequence_scaled = self.scaler.transform(sequence_features)
                sequence_tensor = torch.FloatTensor(sequence_scaled).unsqueeze(0).to(device)
                
                with torch.no_grad():
                    reconstructed = self.lstm_autoencoder(sequence_tensor)
                    reconstruction_error = torch.mean((sequence_tensor - reconstructed) ** 2).item()
                    scores['lstm'] = float(reconstruction_error)
            except Exception as e:
                logger.warning(f"LSTM inference error: {e}")
                scores['lstm'] = 0.0
        
        return scores
    
    def _calculate_ensemble_score(self, scores: Dict[str, float]) -> float:
        """Calculate ensemble score using exact same logic as training"""
        ensemble_score = 0.0
        weights = self.config['weights']
        
        # Isolation Forest (lower = more anomalous)
        if 'isolation_forest' in scores:
            if_range = self.if_max - self.if_min
            if if_range > 0:
                if_normalized = (scores['isolation_forest'] - self.if_min) / if_range
                if_anomaly_score = 1.0 - np.clip(if_normalized, 0, 1)
            else:
                if_anomaly_score = 0.5
            ensemble_score += weights['isolation_forest'] * if_anomaly_score
        
        # SVM (negative = more anomalous)
        if 'one_class_svm' in scores:
            svm_range = self.svm_max - self.svm_min
            if svm_range > 0:
                svm_normalized = (scores['one_class_svm'] - self.svm_min) / svm_range
                svm_anomaly_score = 1.0 - np.clip(svm_normalized, 0, 1)
            else:
                svm_anomaly_score = 0.5
            ensemble_score += weights['one_class_svm'] * svm_anomaly_score
        
        # LSTM (higher reconstruction error = more anomalous)
        if 'lstm' in scores and self.lstm_threshold:
            lstm_anomaly_score = np.clip(scores['lstm'] / self.lstm_threshold, 0, 1)
            ensemble_score += weights['lstm'] * lstm_anomaly_score
        
        return np.clip(ensemble_score, 0, 1)
    
    def _get_alert_level(self, confidence: float) -> str:
        """Determine alert level"""
        if confidence > 0.8:
            return 'CRITICAL'
        elif confidence > 0.6:
            return 'HIGH'
        elif confidence > 0.4:
            return 'MEDIUM'
        elif confidence > 0.2:
            return 'LOW'
        else:
            return 'NORMAL'
    
    def _extract_driving_metrics_from_features(self, features_row: pd.Series) -> Dict[str, float]:
        """Extract driving metrics from processed features"""
        return {
            'speed': float(features_row['spd']),
            'acceleration': float(features_row['acceleration']),
            'lateral_acceleration': float(features_row['lateral_acceleration']),
            'jerk': float(features_row['jerk']),
            'heading_change_rate': float(features_row['heading_change_rate']),
            'overall_risk': float(features_row['overall_risk'])
        }
    
    def _extract_risk_factors_from_features(self, features_row):
        """
        Extract boolean risk factors from a row of driving features.
        """
    
        return {
            'hard_braking': bool(features_row['acceleration'] < -2.5),        # sudden deceleration
            'hard_acceleration': bool(features_row['acceleration'] > 2.5),    # sudden acceleration
            'excessive_speed': bool(features_row['spd'] > 120),               # overspeeding (km/h)
            'sharp_turn': bool(abs(features_row['lateral_acceleration']) > 3.0),  # strong lateral g-force
            'erratic_steering': bool(abs(features_row['angular_velocity']) > 30)  # quick steering angle change
        }

    def get_vehicle_status(self, vehicle_id: str) -> Dict[str, Any]:
        """Get current status of a vehicle"""
        if vehicle_id not in self.vehicle_buffers:
            return {'vehicle_id': vehicle_id, 'status': 'no_data'}
        
        buffer = self.vehicle_buffers[vehicle_id]
        return {
            'vehicle_id': vehicle_id,
            'buffer_size': len(buffer),
            'last_update': buffer[-1].timestamp if buffer else None,
            'ready_for_detection': len(buffer) >= self.config['min_points_for_detection']
        }

# Updated API input model to match your data structure
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional

class GPSPointRequest(BaseModel):
    """API request model matching your dataset columns"""
    vehicle_id: str  # maps to randomized_id
    lat: float
    lng: float
    alt: float = 0.0
    spd: float  # speed in km/h
    azm: float  # azimuth/heading 0-360
    timestamp: Optional[str] = None

# Updated sample input/output for your exact data structure
sample_input_output = {
    "input": {
        "vehicle_id": "fleet_001",
        "lat": 55.7558,
        "lng": 37.6176,
        "alt": 156.0,
        "spd": 45.5,
        "azm": 85.0,
        "timestamp": "2025-09-13T10:31:18Z"
    },
    "output": {
        "status": "detected",
        "result": {
            "timestamp": "2025-09-13T10:31:18Z",
            "vehicle_id": "fleet_001",
            "anomaly_detected": False,
            "confidence": 0.156,
            "alert_level": "NORMAL",
            "raw_scores": {
                "isolation_forest": 0.045,
                "one_class_svm": 12.34,
                "lstm": 0.234
            },
            "driving_metrics": {
                "speed": 45.5,
                "acceleration": 0.12,
                "lateral_acceleration": 0.08,
                "jerk": 0.05,
                "heading_change_rate": 0.02,
                "overall_risk": 0.089
            },
            "risk_factors": {
                "hard_braking": False,
                "hard_acceleration": False,
                "excessive_speed": False,
                "sharp_turn": False,
                "erratic_steering": False
            }
        }
    }
}

if __name__ == "__main__":
    print("Production Anomaly Detector for AdilzhanB's dataset format:")
    print("Columns: randomized_id, lat, lng, alt, spd, azm")
    print("Ready for deployment!")

Production Anomaly Detector for AdilzhanB's dataset format:
Columns: randomized_id, lat, lng, alt, spd, azm
Ready for deployment!


In [21]:
import numpy as np
import pandas as pd
from typing import List, Dict, Optional, Tuple, Any
from datetime import datetime, timedelta
import logging

logger = logging.getLogger(__name__)

class BatchAnomalyDetector(ProductionAnomalyDetector):
    """
    Extended ProductionAnomalyDetector with batch processing capabilities
    Processes data as list of lists: [[id, lat, lng, azm, spd, alt], ...]
    """
    
    def __init__(self, model_dir: str, config: Dict = None):
        super().__init__(model_dir, config)
        self.batch_results = []
        
    def process_batch_list_of_lists(self, 
                                   data: List[List], 
                                   column_order: List[str] = None,
                                   sort_by_vehicle: bool = True,
                                   generate_timestamps: bool = True) -> Dict[str, Any]:
        """
        Process batch data as list of lists
        
        Args:
            data: List of lists in format [[id, lat, lng, azm, spd, alt], ...]
            column_order: Order of columns if different from default
            sort_by_vehicle: Whether to sort by vehicle_id for proper sequence
            generate_timestamps: Whether to generate timestamps automatically
            
        Returns:
            Dictionary with batch processing results
        """
        
        if column_order is None:
            column_order = ['vehicle_id', 'lat', 'lng', 'azm', 'spd', 'alt']
        
        print(f"🔄 Processing batch of {len(data)} GPS points...")
        
        # Convert list of lists to DataFrame
        df = pd.DataFrame(data, columns=column_order)
        
        # Rename to match your training format
        column_mapping = {
            'vehicle_id': 'randomized_id',
            'azm': 'azm',
            'spd': 'spd', 
            'alt': 'alt',
            'lat': 'lat',
            'lng': 'lng'
        }
        
        # Apply column mapping if needed
        for old_col, new_col in column_mapping.items():
            if old_col in df.columns and old_col != new_col:
                df = df.rename(columns={old_col: new_col})
        
        # Ensure we have the right columns
        required_columns = ['randomized_id', 'lat', 'lng', 'alt', 'spd', 'azm']
        missing_columns = [col for col in required_columns if col not in df.columns]
        
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")
        
        # Sort by vehicle and add sequence if requested
        if sort_by_vehicle:
            df = df.sort_values(['randomized_id', 'lat', 'lng']).reset_index(drop=True)
        
        # Generate timestamps if requested
        if generate_timestamps:
            df['timestamp'] = self._generate_timestamps(df)
        
        # Process batch
        return self._process_dataframe_batch(df)
    
    def process_batch_by_vehicle(self, 
                                data: List[List],
                                column_order: List[str] = None,
                                time_interval_seconds: int = 2) -> Dict[str, List[AnomalyResult]]:
        """
        Process batch data vehicle by vehicle to maintain proper sequence
        
        Args:
            data: List of lists format
            column_order: Column order specification
            time_interval_seconds: Time interval between GPS points
            
        Returns:
            Dictionary with vehicle_id as key and list of results as value
        """
        
        if column_order is None:
            column_order = ['vehicle_id', 'lat', 'lng', 'azm', 'spd', 'alt']
        
        # Convert to DataFrame
        df = pd.DataFrame(data, columns=column_order)
        
        # Group by vehicle
        vehicle_results = {}
        total_anomalies = 0
        
        print(f"🚛 Processing {df['vehicle_id'].nunique()} vehicles with {len(df)} total points...")
        
        for vehicle_id in df['vehicle_id'].unique():
            vehicle_data = df[df['vehicle_id'] == vehicle_id].copy()
            vehicle_data = vehicle_data.sort_values(['lat', 'lng']).reset_index(drop=True)
            
            print(f"\n📍 Processing vehicle: {vehicle_id} ({len(vehicle_data)} points)")
            
            # Clear vehicle buffer to start fresh
            if vehicle_id in self.vehicle_buffers:
                del self.vehicle_buffers[vehicle_id]
            
            vehicle_results[vehicle_id] = []
            vehicle_anomalies = 0
            
            # Process points sequentially for this vehicle
            for idx, row in vehicle_data.iterrows():
                timestamp = datetime.now() + timedelta(seconds=idx * time_interval_seconds)
                
                gps_point = GPSPoint(
                    vehicle_id=vehicle_id,
                    lat=row['lat'],
                    lng=row['lng'],
                    alt=row.get('alt', 0.0),
                    spd=row.get('spd', 0.0),
                    azm=row.get('azm', 0.0),
                    timestamp=timestamp.isoformat()
                )
                
                result = self.process_gps_point(gps_point)
                
                if result:
                    vehicle_results[vehicle_id].append(result)
                    if result.anomaly_detected:
                        vehicle_anomalies += 1
                        total_anomalies += 1
                        
                        # Print anomaly details
                        print(f"   🚨 Point {idx+1}: {result.alert_level} "
                              f"(Speed: {result.driving_metrics['speed']:.1f} km/h, "
                              f"Conf: {result.confidence:.3f})")
                        print(f"      Risk factors: {result.risk_factors}")
            
            detection_rate = vehicle_anomalies / len(vehicle_results[vehicle_id]) if vehicle_results[vehicle_id] else 0
            print(f"   📊 Vehicle summary: {vehicle_anomalies} anomalies out of {len(vehicle_results[vehicle_id])} detections ({detection_rate:.1%})")
        
        print(f"\n🎯 Batch Summary:")
        print(f"   Total vehicles: {len(vehicle_results)}")
        print(f"   Total points processed: {len(df)}")
        print(f"   Total anomalies detected: {total_anomalies}")
        print(f"   Overall anomaly rate: {total_anomalies/len(df):.1%}")
        
        return vehicle_results
    
    def process_realtime_stream(self, data_stream: List[List], 
                               column_order: List[str] = None,
                               delay_seconds: float = 2.0,
                               callback_function = None) -> List[AnomalyResult]:
        """
        Simulate real-time processing of list-of-lists data
        
        Args:
            data_stream: List of lists to process as real-time stream
            column_order: Column order
            delay_seconds: Delay between processing points (simulate real-time)
            callback_function: Function to call when anomaly is detected
            
        Returns:
            List of all detection results
        """
        
        import time
        
        if column_order is None:
            column_order = ['vehicle_id', 'lat', 'lng', 'azm', 'spd', 'alt']
        
        print(f"🔴 Starting real-time stream simulation with {len(data_stream)} points...")
        print(f"⏱️ Processing delay: {delay_seconds} seconds between points")
        
        all_results = []
        anomaly_count = 0
        
        for i, point_data in enumerate(data_stream):
            # Convert list to GPSPoint
            point_dict = dict(zip(column_order, point_data))
            
            gps_point = GPSPoint(
                vehicle_id=point_dict['vehicle_id'],
                lat=point_dict['lat'],
                lng=point_dict['lng'],
                alt=point_dict.get('alt', 0.0),
                spd=point_dict.get('spd', 0.0),
                azm=point_dict.get('azm', 0.0),
                timestamp=datetime.now().isoformat()
            )
            
            # Process point
            result = self.process_gps_point(gps_point)
            
            if result:
                all_results.append(result)
                
                # Print status
                status_icon = "🟢" if result.alert_level == "NORMAL" else "🟡" if result.alert_level in ["LOW", "MEDIUM"] else "🔴"
                print(f"{status_icon} Point {i+1:3d}: {result.vehicle_id:12s} | "
                      f"{result.alert_level:8s} | Speed: {result.driving_metrics['speed']:5.1f} km/h | "
                      f"Conf: {result.confidence:.3f}")
                
                if result.anomaly_detected:
                    anomaly_count += 1
                    print(f"      🚨 ANOMALY DETECTED! {result.risk_factors}")
                    
                    # Call callback function if provided
                    if callback_function:
                        callback_function(result, gps_point)
            else:
                print(f"⏳ Point {i+1:3d}: {point_dict['vehicle_id']:12s} | Building buffer...")
            
            # Simulate real-time delay
            if i < len(data_stream) - 1:  # Don't delay after last point
                time.sleep(delay_seconds)
        
        print(f"\n📊 Stream Complete:")
        print(f"   Points processed: {len(data_stream)}")
        print(f"   Detections made: {len(all_results)}")
        print(f"   Anomalies found: {anomaly_count}")
        print(f"   Anomaly rate: {anomaly_count/len(all_results)*100:.1f}%" if all_results else "   No detections made")
        
        return all_results
    
    def _generate_timestamps(self, df: pd.DataFrame) -> List[str]:
        """Generate realistic timestamps for GPS data"""
        base_time = datetime.now()
        timestamps = []
        
        for vehicle_id in df['randomized_id'].unique():
            vehicle_mask = df['randomized_id'] == vehicle_id
            vehicle_count = vehicle_mask.sum()
            
            # Generate timestamps for this vehicle (2-second intervals)
            for i in range(vehicle_count):
                timestamp = base_time + timedelta(seconds=i * 2)
                timestamps.append(timestamp.isoformat())
        
        return timestamps
    
    def _process_dataframe_batch(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Process DataFrame using the existing feature pipeline"""
        
        # Use your exact feature engineering pipeline
        features_df = self._calculate_features_exact_pipeline(df)
        
        if len(features_df) == 0:
            return {
                "status": "error",
                "message": "No features could be calculated",
                "processed": 0,
                "anomalies": 0
            }
        
        # Scale features
        features_scaled = self.scaler.transform(features_df)
        
        # Get anomaly scores for all points
        anomaly_results = []
        
        print("🔍 Running anomaly detection on all points...")
        
        for i in range(len(features_scaled)):
            point_scaled = features_scaled[i:i+1]
            
            # Get scores from all models
            scores = {}
            
            # Isolation Forest
            if self.isolation_forest:
                scores['isolation_forest'] = float(self.isolation_forest.decision_function(point_scaled)[0])
            
            # One-Class SVM
            if self.one_class_svm:
                scores['one_class_svm'] = float(self.one_class_svm.decision_function(point_scaled)[0])
            
            # LSTM (only if we have enough sequence data)
            if self.lstm_autoencoder and i >= self.config['lstm_sequence_length'] - 1:
                try:
                    sequence_start = max(0, i - self.config['lstm_sequence_length'] + 1)
                    sequence_features = features_scaled[sequence_start:i+1]
                    
                    if len(sequence_features) == self.config['lstm_sequence_length']:
                        sequence_tensor = torch.FloatTensor(sequence_features).unsqueeze(0).to(device)
                        
                        with torch.no_grad():
                            reconstructed = self.lstm_autoencoder(sequence_tensor)
                            reconstruction_error = torch.mean((sequence_tensor - reconstructed) ** 2).item()
                            scores['lstm'] = float(reconstruction_error)
                except:
                    scores['lstm'] = 0.0
            
            # Calculate ensemble score
            ensemble_score = self._calculate_ensemble_score(scores)
            alert_level = self._get_alert_level(ensemble_score)
            
            # Extract metrics
            feature_row = features_df.iloc[i]
            driving_metrics = self._extract_driving_metrics_from_features(feature_row)
            risk_factors = self._extract_risk_factors_from_features(feature_row)
            
            anomaly_results.append({
                'index': i,
                'vehicle_id': df.iloc[i]['randomized_id'],
                'anomaly_detected': ensemble_score > self.config['alert_threshold'],
                'confidence': ensemble_score,
                'alert_level': alert_level,
                'raw_scores': scores,
                'driving_metrics': driving_metrics,
                'risk_factors': risk_factors
            })
        
        # Generate summary
        total_anomalies = sum(1 for r in anomaly_results if r['anomaly_detected'])
        
        return {
            "status": "completed",
            "processed": len(anomaly_results),
            "anomalies": total_anomalies,
            "anomaly_rate": total_anomalies / len(anomaly_results) if anomaly_results else 0,
            "results": anomaly_results,
            "summary": {
                "total_vehicles": df['randomized_id'].nunique(),
                "total_points": len(df),
                "detection_ready_points": len(anomaly_results),
                "anomalies_by_level": {
                    level: sum(1 for r in anomaly_results if r['alert_level'] == level)
                    for level in ['NORMAL', 'LOW', 'MEDIUM', 'HIGH', 'CRITICAL']
                }
            }
        }

# Example usage functions
def example_list_of_lists_usage():
    """Example of how to use the batch processor with list of lists"""
    
    print("🔄 Example: Processing List of Lists Data")
    print("=" * 50)
    
    # Initialize batch detector
    detector = BatchAnomalyDetector("/kaggle/working/anomaly_analysis_pytorch_fixed/models")
    
    # Sample data as list of lists: [vehicle_id, lat, lng, azm, spd, alt]
    sample_data = [
        # Normal driving for vehicle_001
        ["vehicle_001", 55.7558, 37.6176, 90.0, 45.0, 156.0],
        ["vehicle_001", 55.7559, 37.6177, 92.0, 47.0, 157.0],
        ["vehicle_001", 55.7560, 37.6178, 94.0, 46.0, 158.0],
        ["vehicle_001", 55.7561, 37.6179, 96.0, 48.0, 159.0],
        ["vehicle_001", 55.7562, 37.6180, 98.0, 49.0, 160.0],
        
        # Aggressive driving for vehicle_002
        ["vehicle_002", 55.7600, 37.6200, 180.0, 70.0, 150.0],
        ["vehicle_002", 55.7601, 37.6201, 182.0, 125.0, 151.0],  # Speeding
        ["vehicle_002", 55.7602, 37.6202, 184.0, 15.0, 152.0],   # Hard braking
        ["vehicle_002", 55.7603, 37.6203, 250.0, 55.0, 153.0],   # Sharp turn
        
        # Mixed behavior for vehicle_003
        ["vehicle_003", 55.7700, 37.6300, 45.0, 40.0, 145.0],
        ["vehicle_003", 55.7701, 37.6301, 47.0, 42.0, 146.0],
        ["vehicle_003", 55.7702, 37.6302, 49.0, 110.0, 147.0],   # Speed violation
        ["vehicle_003", 55.7703, 37.6303, 51.0, 43.0, 148.0],
    ]
    
    print(f"Processing {len(sample_data)} GPS points from {len(set(row[0] for row in sample_data))} vehicles...")
    
    # Method 1: Process as batch
    print("\n📊 Method 1: Batch Processing")
    batch_results = detector.process_batch_list_of_lists(sample_data)
    
    print(f"Batch Results:")
    print(f"  Status: {batch_results['status']}")
    print(f"  Points processed: {batch_results['processed']}")
    print(f"  Anomalies detected: {batch_results['anomalies']}")
    print(f"  Anomaly rate: {batch_results['anomaly_rate']:.1%}")
    
    # Method 2: Process by vehicle
    print("\n🚛 Method 2: Vehicle-by-Vehicle Processing")
    vehicle_results = detector.process_batch_by_vehicle(sample_data)
    
    for vehicle_id, results in vehicle_results.items():
        anomaly_count = sum(1 for r in results if r.anomaly_detected)
        print(f"  {vehicle_id}: {anomaly_count} anomalies out of {len(results)} detections")
    
    # Method 3: Real-time simulation
    print("\n🔴 Method 3: Real-time Stream Simulation (first 8 points)")
    
    def anomaly_callback(result, gps_point):
        """Callback function for when anomaly is detected"""
        print(f"      📧 ALERT SENT: {result.vehicle_id} - {result.alert_level}")
    
    stream_results = detector.process_realtime_stream(
        sample_data[:8],  # First 8 points
        delay_seconds=0.5,  # Faster for demo
        callback_function=anomaly_callback
    )

def load_from_csv_example():
    """Example of loading data from CSV and converting to list of lists"""
    
    print("\n📁 Example: Loading from CSV")
    print("=" * 50)
    
    # Simulate CSV loading (you would use pd.read_csv('your_file.csv'))
    csv_data = """vehicle_id,lat,lng,azm,spd,alt
vehicle_001,55.7558,37.6176,90.0,45.0,156.0
vehicle_001,55.7559,37.6177,92.0,47.0,157.0
vehicle_002,55.7600,37.6200,180.0,125.0,150.0
vehicle_002,55.7601,37.6201,182.0,15.0,151.0"""
    
    # Convert CSV to list of lists
    from io import StringIO
    df = pd.read_csv(StringIO(csv_data))
    
    # Convert DataFrame to list of lists
    data_as_lists = df.values.tolist()
    
    print(f"Loaded {len(data_as_lists)} rows from CSV")
    print(f"Column order: {df.columns.tolist()}")
    print(f"Sample data: {data_as_lists[0]}")
    
    # Process with detector
    detector = BatchAnomalyDetector("/kaggle/working/anomaly_analysis_pytorch_fixed/models")
    results = detector.process_batch_list_of_lists(
        data_as_lists, 
        column_order=df.columns.tolist()
    )
    
    print(f"Processing complete: {results['anomalies']} anomalies detected")

def large_dataset_example():
    """Example for processing large datasets efficiently"""
    
    print("\n🔢 Example: Large Dataset Processing")
    print("=" * 50)
    
    # Simulate large dataset
    np.random.seed(42)
    large_data = []
    
    vehicles = [f"vehicle_{i:03d}" for i in range(1, 11)]  # 10 vehicles
    
    for vehicle in vehicles:
        for point in range(100):  # 100 points per vehicle
            lat = 55.7500 + np.random.uniform(-0.01, 0.01)
            lng = 37.6000 + np.random.uniform(-0.01, 0.01)
            azm = np.random.uniform(0, 360)
            spd = np.random.uniform(20, 80) if np.random.random() > 0.1 else np.random.uniform(90, 140)  # 10% aggressive
            alt = 150 + np.random.uniform(-20, 20)
            
            large_data.append([vehicle, lat, lng, azm, spd, alt])
    
    print(f"Generated large dataset: {len(large_data)} points from {len(vehicles)} vehicles")
    
    # Process efficiently
    detector = BatchAnomalyDetector("/kaggle/working/anomaly_analysis_pytorch_fixed/models")
    
    # Process in chunks for memory efficiency
    chunk_size = 500
    total_anomalies = 0
    
    for i in range(0, len(large_data), chunk_size):
        chunk = large_data[i:i + chunk_size]
        print(f"Processing chunk {i//chunk_size + 1}: points {i+1}-{i+len(chunk)}")
        
        results = detector.process_batch_list_of_lists(chunk)
        total_anomalies += results['anomalies']
        
        print(f"  Chunk anomalies: {results['anomalies']}")
    
    print(f"\nLarge dataset complete:")
    print(f"  Total points: {len(large_data)}")
    print(f"  Total anomalies: {total_anomalies}")
    print(f"  Overall anomaly rate: {total_anomalies/len(large_data):.1%}")

if __name__ == "__main__":
    print("🚀 Batch Processing Examples for AdilzhanB")
    print("List of Lists Format: [[id, lat, lng, azm, spd, alt], ...]")
    print("=" * 60)

    example_list_of_lists_usage()
    load_from_csv_example()
    large_dataset_example()
    
    print("\n🎉 All batch processing examples completed!")

🚀 Batch Processing Examples for AdilzhanB
List of Lists Format: [[id, lat, lng, azm, spd, alt], ...]
🔄 Example: Processing List of Lists Data
Processing 13 GPS points from 3 vehicles...

📊 Method 1: Batch Processing
🔄 Processing batch of 13 GPS points...
🔍 Running anomaly detection on all points...
Batch Results:
  Status: completed
  Points processed: 13
  Anomalies detected: 9
  Anomaly rate: 69.2%

🚛 Method 2: Vehicle-by-Vehicle Processing
🚛 Processing 3 vehicles with 13 total points...

📍 Processing vehicle: vehicle_001 (5 points)
   🚨 Point 5: LOW (Speed: 49.0 km/h, Conf: 0.357)
      Risk factors: {'hard_braking': False, 'hard_acceleration': True, 'excessive_speed': False, 'sharp_turn': True, 'erratic_steering': False}
   📊 Vehicle summary: 1 anomalies out of 1 detections (100.0%)

📍 Processing vehicle: vehicle_002 (4 points)
   📊 Vehicle summary: 0 anomalies out of 0 detections (0.0%)

📍 Processing vehicle: vehicle_003 (4 points)
   📊 Vehicle summary: 0 anomalies out of 0 detect