# Anomaly Detection Model Development
## Isolation Forest for Network, Site, and Link Anomalies

This notebook demonstrates how to:
1. Load data from PostgreSQL
2. Engineer features for anomaly detection
3. Train Isolation Forest models
4. Evaluate model performance
5. Export models to pickle for web application deployment

**Note**: Update database connection details and feature columns based on your actual schema.


In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import pickle
import logging
from pathlib import Path
from datetime import datetime
import psycopg
import json

# Module-level logger used by the loader, feature and training helpers below.
# INFO level keeps their progress messages visible in the notebook output.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

print("Libraries imported successfully!")


## Database Configuration

Update these connection details to match your PostgreSQL setup:


In [None]:
# Database connection parameters
# IMPORTANT: supply your real credentials either through the standard libpq
# environment variables (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD) or by
# editing the fallback values below. Prefer the environment so that a real
# password never ends up saved inside the notebook.
import os

DB_CONFIG = {
    "host": os.environ.get("PGHOST", "localhost"),
    "port": int(os.environ.get("PGPORT", "5432")),
    # These keys are expanded into psycopg.connect(**DB_CONFIG). psycopg 3
    # forwards them to libpq, whose keyword for the database name is "dbname";
    # "database" is rejected as an invalid connection option.
    "dbname": os.environ.get("PGDATABASE", "bcom_bolt"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "your_password_here"),
}

# Directory that receives the exported anomaly-detection artifacts. The path is
# relative, so it resolves against the notebook's working directory; the
# directory and any missing parents are created immediately.
MODELS_DIR = Path("../../ml_models/anomaly_detection")
MODELS_DIR.mkdir(exist_ok=True, parents=True)

print(f"Models will be saved to: {MODELS_DIR.absolute()}")


## Load and Prepare Data

Load data from PostgreSQL and prepare features for anomaly detection.

**Note**: Adjust the SQL queries to match your actual table schemas (as provided by the user).


In [None]:
def load_network_metrics():
    """
    Fetch recent rows of the ``network_metrics`` table as a DataFrame.

    The query keeps rows whose ``created_at`` lies within the last 90 days,
    newest first, capped at 100000 rows.

    TEMPLATE: Update query based on your network_metrics table schema
    Expected columns: [id, user_id, timestamp, latency, packet_loss, bandwidth, etc.]

    Returns:
        pandas.DataFrame with the query result, or None when connecting or
        querying raised an exception (the error is logged).
    """
    sql = """
    SELECT 
        id,
        user_id,
        timestamp,
        -- Add your actual metric columns here
        -- latency,
        -- packet_loss,
        -- bandwidth,
        created_at
    FROM network_metrics
    WHERE created_at >= NOW() - INTERVAL '90 days'
    ORDER BY created_at DESC
    LIMIT 100000
    """

    frame = None
    try:
        # The connection is only needed while the query runs.
        with psycopg.connect(**DB_CONFIG) as connection:
            frame = pd.read_sql(sql, connection)
        logger.info(f"Loaded {len(frame)} network metric records")
    except Exception as exc:
        # Best-effort loader: report the failure and signal it with None.
        logger.error(f"Error loading network metrics: {exc!s}")
        return None
    return frame

def load_site_metrics():
    """
    Fetch recent rows of the ``site_metrics`` table as a DataFrame.

    The query keeps rows whose ``created_at`` lies within the last 90 days,
    newest first, capped at 100000 rows.

    TEMPLATE: Update query based on your site_metrics table schema
    Expected columns: [id, site_id, timestamp, response_time, error_rate, etc.]

    Returns:
        pandas.DataFrame with the query result, or None when connecting or
        querying raised an exception (the error is logged).
    """
    sql = """
    SELECT 
        id,
        site_id,
        timestamp,
        -- Add your actual metric columns here
        -- response_time,
        -- error_rate,
        -- uptime,
        created_at
    FROM site_metrics
    WHERE created_at >= NOW() - INTERVAL '90 days'
    ORDER BY created_at DESC
    LIMIT 100000
    """

    frame = None
    try:
        # The connection is only needed while the query runs.
        with psycopg.connect(**DB_CONFIG) as connection:
            frame = pd.read_sql(sql, connection)
        logger.info(f"Loaded {len(frame)} site metric records")
    except Exception as exc:
        # Best-effort loader: report the failure and signal it with None.
        logger.error(f"Error loading site metrics: {exc!s}")
        return None
    return frame

def load_link_metrics():
    """
    Fetch recent rows of the ``link_metrics`` table as a DataFrame.

    The query keeps rows whose ``created_at`` lies within the last 90 days,
    newest first, capped at 100000 rows.

    TEMPLATE: Update query based on your link_metrics table schema
    Expected columns: [id, link_id, timestamp, jitter, throughput, etc.]

    Returns:
        pandas.DataFrame with the query result, or None when connecting or
        querying raised an exception (the error is logged).
    """
    sql = """
    SELECT 
        id,
        link_id,
        timestamp,
        -- Add your actual metric columns here
        -- jitter,
        -- throughput,
        -- packet_drop_rate,
        created_at
    FROM link_metrics
    WHERE created_at >= NOW() - INTERVAL '90 days'
    ORDER BY created_at DESC
    LIMIT 100000
    """

    frame = None
    try:
        # The connection is only needed while the query runs.
        with psycopg.connect(**DB_CONFIG) as connection:
            frame = pd.read_sql(sql, connection)
        logger.info(f"Loaded {len(frame)} link metric records")
    except Exception as exc:
        # Best-effort loader: report the failure and signal it with None.
        logger.error(f"Error loading link metrics: {exc!s}")
        return None
    return frame

# Pull the three datasets; each loader returns None when it fails.
print("Loading data from PostgreSQL...")
network_df = load_network_metrics()
site_df = load_site_metrics()
link_df = load_link_metrics()

# Report the shape of every dataset, or flag the ones that did not load.
print("\nData Summary:")
for label, frame in (("Network", network_df), ("Site", site_df), ("Link", link_df)):
    shape_text = 'FAILED TO LOAD' if frame is None else frame.shape
    print(f"  {label} metrics: {shape_text}")


## Feature Engineering

Prepare and engineer features for anomaly detection models.


In [None]:
def prepare_network_features(df):
    """
    Prepare features for network anomaly detection.

    Selects every numeric column except the identifier columns, treats
    infinite values as missing, drops columns that contain no usable value at
    all, and fills the remaining gaps with the column mean.

    TEMPLATE: Update feature engineering based on your available metrics

    Args:
        df: DataFrame of network metrics, or None.

    Returns:
        DataFrame of imputed numeric features (free of NaN and infinite
        values), or None when `df` is None/empty or no usable numeric feature
        column exists.
    """
    if df is None or len(df) == 0:
        return None

    # Create a copy to avoid modifying original
    df_processed = df.copy()

    # TODO: Add your feature engineering logic here
    # Examples:
    # - Calculate rolling averages
    # - Compute statistical features (std, min, max)
    # - Create interaction features
    # - Normalize/scale features

    # Placeholder: Select numeric columns for training
    numeric_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()

    # Identifier columns carry no signal for anomaly detection.
    id_cols = {'id', 'user_id', 'site_id', 'link_id'}
    feature_cols = [col for col in numeric_cols if col not in id_cols]

    if not feature_cols:
        logger.warning("No numeric features found for network data")
        return None

    # Infinite values cannot be imputed or scaled; handle them like missing data.
    features = df_processed[feature_cols].replace([np.inf, -np.inf], np.nan)

    # A column without any usable value has a NaN mean, so fillna() would leave
    # it full of NaN. Drop such columns instead of passing NaN downstream.
    empty_cols = [col for col in feature_cols if features[col].isna().all()]
    if empty_cols:
        logger.warning(f"Dropping network features with no usable values: {empty_cols}")
        features = features.drop(columns=empty_cols)
        feature_cols = [col for col in feature_cols if col not in empty_cols]
        if not feature_cols:
            logger.warning("No numeric features found for network data")
            return None

    logger.info(f"Network features selected: {feature_cols}")
    return features.fillna(features.mean())

def prepare_site_features(df):
    """
    Prepare features for site anomaly detection.

    Selects every numeric column except the identifier columns, treats
    infinite values as missing, drops columns that contain no usable value at
    all, and fills the remaining gaps with the column mean.

    TEMPLATE: Update feature engineering based on your available metrics

    Args:
        df: DataFrame of site metrics, or None.

    Returns:
        DataFrame of imputed numeric features (free of NaN and infinite
        values), or None when `df` is None/empty or no usable numeric feature
        column exists.
    """
    if df is None or len(df) == 0:
        return None

    # Work on a copy so the caller's frame is never modified.
    df_processed = df.copy()

    # TODO: Add your feature engineering logic here
    numeric_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()

    # Identifier columns carry no signal for anomaly detection.
    id_cols = {'id', 'user_id', 'site_id', 'link_id'}
    feature_cols = [col for col in numeric_cols if col not in id_cols]

    if not feature_cols:
        logger.warning("No numeric features found for site data")
        return None

    # Infinite values cannot be imputed or scaled; handle them like missing data.
    features = df_processed[feature_cols].replace([np.inf, -np.inf], np.nan)

    # A column without any usable value has a NaN mean, so fillna() would leave
    # it full of NaN. Drop such columns instead of passing NaN downstream.
    empty_cols = [col for col in feature_cols if features[col].isna().all()]
    if empty_cols:
        logger.warning(f"Dropping site features with no usable values: {empty_cols}")
        features = features.drop(columns=empty_cols)
        feature_cols = [col for col in feature_cols if col not in empty_cols]
        if not feature_cols:
            logger.warning("No numeric features found for site data")
            return None

    logger.info(f"Site features selected: {feature_cols}")
    return features.fillna(features.mean())

def prepare_link_features(df):
    """
    Prepare features for link anomaly detection.

    Selects every numeric column except the identifier columns, treats
    infinite values as missing, drops columns that contain no usable value at
    all, and fills the remaining gaps with the column mean.

    TEMPLATE: Update feature engineering based on your available metrics

    Args:
        df: DataFrame of link metrics, or None.

    Returns:
        DataFrame of imputed numeric features (free of NaN and infinite
        values), or None when `df` is None/empty or no usable numeric feature
        column exists.
    """
    if df is None or len(df) == 0:
        return None

    # Work on a copy so the caller's frame is never modified.
    df_processed = df.copy()

    # TODO: Add your feature engineering logic here
    numeric_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()

    # Identifier columns carry no signal for anomaly detection.
    id_cols = {'id', 'user_id', 'site_id', 'link_id'}
    feature_cols = [col for col in numeric_cols if col not in id_cols]

    if not feature_cols:
        logger.warning("No numeric features found for link data")
        return None

    # Infinite values cannot be imputed or scaled; handle them like missing data.
    features = df_processed[feature_cols].replace([np.inf, -np.inf], np.nan)

    # A column without any usable value has a NaN mean, so fillna() would leave
    # it full of NaN. Drop such columns instead of passing NaN downstream.
    empty_cols = [col for col in feature_cols if features[col].isna().all()]
    if empty_cols:
        logger.warning(f"Dropping link features with no usable values: {empty_cols}")
        features = features.drop(columns=empty_cols)
        feature_cols = [col for col in feature_cols if col not in empty_cols]
        if not feature_cols:
            logger.warning("No numeric features found for link data")
            return None

    logger.info(f"Link features selected: {feature_cols}")
    return features.fillna(features.mean())

# Build the feature matrix for each dataset that was actually loaded.
print("Preparing features...")
network_features = None if network_df is None else prepare_network_features(network_df)
site_features = None if site_df is None else prepare_site_features(site_df)
link_features = None if link_df is None else prepare_link_features(link_df)

# Only datasets that produced features are listed.
print("\nFeature shapes:")
for label, feats in (
    ("Network", network_features),
    ("Site", site_features),
    ("Link", link_features),
):
    if feats is None:
        continue
    print(f"  {label}: {feats.shape}")


## Model Training

Train Isolation Forest models for anomaly detection on each metric type.


In [None]:
def train_isolation_forest(X_train, contamination=0.1, random_state=42):
    """
    Fit an Isolation Forest on the given feature matrix.

    The forest always uses 100 estimators with max_samples='auto' and
    max_features=1.0; only the two keyword arguments below are configurable.

    Args:
        X_train: Training feature matrix
        contamination: Proportion of outliers in the dataset (0-1)
        random_state: Random seed for reproducibility

    Returns:
        Trained IsolationForest model
    """
    forest_params = {
        'contamination': contamination,
        'random_state': random_state,
        'n_estimators': 100,
        'max_samples': 'auto',
        'max_features': 1.0,
    }
    forest = IsolationForest(**forest_params)
    forest.fit(X_train)

    logger.info(f"Model trained with contamination={contamination}")
    return forest

# Registries filled below: trained detectors keyed by model name, and the
# scaler fitted for each metric type keyed by that type.
models = {}
scalers = {}

print("Training Isolation Forest models...")

# --- Network ---
if network_features is None or len(network_features) == 0:
    print("✗ Skipped network model (no data)")
else:
    # Standardize the features, then fit the detector on the scaled matrix.
    scaler_network = StandardScaler()
    network_scaled = scaler_network.fit_transform(network_features)
    scalers['network'] = scaler_network

    models['isolation_forest_network'] = train_isolation_forest(network_scaled)
    print("✓ Network anomaly detector trained")

# --- Site ---
if site_features is None or len(site_features) == 0:
    print("✗ Skipped site model (no data)")
else:
    scaler_site = StandardScaler()
    site_scaled = scaler_site.fit_transform(site_features)
    scalers['site'] = scaler_site

    models['isolation_forest_site'] = train_isolation_forest(site_scaled)
    print("✓ Site anomaly detector trained")

# --- Link ---
if link_features is None or len(link_features) == 0:
    print("✗ Skipped link model (no data)")
else:
    scaler_link = StandardScaler()
    link_scaled = scaler_link.fit_transform(link_features)
    scalers['link'] = scaler_link

    models['isolation_forest_link'] = train_isolation_forest(link_scaled)
    print("✓ Link anomaly detector trained")

print(f"\n{len(models)} models trained successfully")


## Model Evaluation

Evaluate model performance and generate predictions.


In [None]:
print("Evaluating models...\n")

# Evaluate each model
evaluation_results = {}

# One entry per detector: (result key, model name, console heading, features).
# The headings reproduce the original spacing: only the first has no leading
# blank line.
evaluation_targets = [
    ('network', 'isolation_forest_network', "Network Anomaly Detector:", network_features),
    ('site', 'isolation_forest_site', "\nSite Anomaly Detector:", site_features),
    ('link', 'isolation_forest_link', "\nLink Anomaly Detector:", link_features),
]

for result_key, model_name, heading, features in evaluation_targets:
    # Detectors that were skipped during training are simply not evaluated.
    if model_name not in models:
        continue

    model = models[model_name]
    scaler = scalers[result_key]
    X_scaled = scaler.transform(features)

    predictions = model.predict(X_scaled)
    anomaly_scores = model.score_samples(X_scaled)

    # Rows predicted as -1 are counted as anomalies. Convert the numpy scalar
    # to a builtin int: this dict is later passed as `metrics` to ModelMetadata,
    # and numpy integers are not JSON-serializable.
    # NOTE(review): assumes the metadata is serialized as JSON - confirm.
    n_anomalies = int((predictions == -1).sum())
    total_samples = len(predictions)
    anomaly_percentage = 100 * n_anomalies / total_samples

    evaluation_results[result_key] = {
        'model': model_name,
        'total_samples': total_samples,
        'anomalies_detected': n_anomalies,
        'anomaly_percentage': float(anomaly_percentage),
        'min_score': float(anomaly_scores.min()),
        'max_score': float(anomaly_scores.max()),
        'mean_score': float(anomaly_scores.mean())
    }

    print(heading)
    print(f"  Total samples: {total_samples}")
    print(f"  Anomalies detected: {n_anomalies} ({anomaly_percentage:.2f}%)")
    print(f"  Score range: [{anomaly_scores.min():.4f}, {anomaly_scores.max():.4f}]")

# Display summary
print("\n" + "="*60)
print("EVALUATION SUMMARY")
print("="*60)
for name, results in evaluation_results.items():
    print(f"\n{name.upper()}:")
    for key, value in results.items():
        if isinstance(value, float):
            print(f"  {key}: {value:.4f}")
        else:
            print(f"  {key}: {value}")


## Serialize Models to Pickle

Export the trained models to pickle format, and the scaler parameters (mean and scale) to a JSON file, for web application deployment.


In [None]:
import hashlib
import sys
sys.path.insert(0, '../../')

from app.services.model_management import ModelManager, ModelMetadata

# The manager is rooted at the parent of MODELS_DIR; the 'anomaly_detection'
# category is passed separately on every save.
model_manager = ModelManager(str(MODELS_DIR.parent))

print("Serializing models to pickle...\n")

# Names of the models that were written successfully.
saved_models = []

for kind in ('network', 'site', 'link'):
    model_name = f'isolation_forest_{kind}'
    if model_name not in models:
        continue

    model = models[model_name]
    metadata = ModelMetadata(
        model_name=model_name,
        version='1.0.0',
        model_type='isolation_forest',
        training_date=datetime.now().isoformat(),
        description=f'Isolation Forest for detecting {kind} metric anomalies',
        # Mirrors the values used by train_isolation_forest() and its defaults.
        hyperparameters={
            'contamination': 0.1,
            'n_estimators': 100,
            'max_samples': 'auto',
            'max_features': 1.0,
            'random_state': 42
        },
        metrics=evaluation_results.get(kind, {})
    )

    success, path = model_manager.save_model(
        model, model_name, 'anomaly_detection', '1.0.0', metadata
    )

    if not success:
        print(f"✗ Failed to save {kind} model: {path}")
        continue

    print(f"✓ {kind.capitalize()} model saved: {path}")
    saved_models.append(model_name)

print(f"\n{len(saved_models)} models saved successfully!")

# Also save scalers separately for reference: per metric type the fitted mean
# and scale vectors, or None when no scaler was fitted for that type.
scalers_data = {
    kind: (
        {
            'mean': scalers[kind].mean_.tolist(),
            'scale': scalers[kind].scale_.tolist(),
        }
        if kind in scalers
        else None
    )
    for kind in ('network', 'site', 'link')
}

scalers_path = MODELS_DIR / 'scalers_metadata.json'
with open(scalers_path, 'w') as fh:
    json.dump(scalers_data, fh, indent=2)
print(f"✓ Scalers metadata saved: {scalers_path}")


## Test Serialized Models

Load the pickled models and verify they work correctly.


In [None]:
print("Testing serialized models...\n")

# List available models
available_models = model_manager.list_models('anomaly_detection')
print("Available models:")
for model_name, versions in available_models.items():
    print(f"  {model_name}: {versions}")

print("\n" + "="*60)
print("LOADING AND TESTING MODELS")
print("="*60)

# (metric type, display label, feature frame) for each detector to verify.
test_targets = [
    ('network', 'Network', network_features),
    ('site', 'Site', site_features),
    ('link', 'Link', link_features),
]

tested = []   # metric types whose reloaded model produced predictions
failed = []   # metric types whose model could not be loaded
skipped = []  # metric types with no features/scaler in this session

for position, (kind, label, features) in enumerate(test_targets, start=1):
    print(f"\n{position}. Testing {label} Anomaly Detector:")

    # Without features and a fitted scaler from this session there is nothing
    # to feed the model, even if an older pickle exists on disk.
    if features is None or kind not in scalers:
        print("  ✗ Skipped (no model was trained for this metric type in this session)")
        skipped.append(kind)
        continue

    loaded_model, loaded_metadata = model_manager.load_model(
        f'isolation_forest_{kind}', 'anomaly_detection', '1.0.0'
    )

    if loaded_model is None:
        print(f"  ✗ Failed to load model")
        failed.append(kind)
        continue

    # Keep the sample as a DataFrame: the scaler was fitted on a DataFrame, and
    # passing a bare array makes scikit-learn warn about missing feature names.
    test_input = features.iloc[:5]
    scaled_input = scalers[kind].transform(test_input)
    predictions = loaded_model.predict(scaled_input)
    scores = loaded_model.score_samples(scaled_input)

    print(f"  ✓ Model loaded successfully")
    print(f"  ✓ Test predictions: {predictions[:3]}")
    print(f"  ✓ Anomaly scores: {scores[:3]}")

    if loaded_metadata:
        print(f"  ✓ Metadata: v{loaded_metadata.version} - {loaded_metadata.description}")

    tested.append(kind)

# Report the real outcome instead of claiming success unconditionally.
print("\n" + "="*60)
if failed:
    print(f"TESTS FAILED - could not load: {', '.join(failed)}")
elif not tested:
    print("NO MODELS WERE TESTED")
else:
    print("ALL TESTS COMPLETED SUCCESSFULLY!")
print("="*60)

if skipped:
    print(f"\nSkipped (not trained in this session): {', '.join(skipped)}")
if tested and not failed:
    print("\nModels are ready for deployment in the web application.")
print(f"Model location: {MODELS_DIR}")
