# 02. Baseline: Isolation Forest

Train and evaluate an Isolation Forest anomaly detector as a baseline for identifying early hospital readmissions (the positive class, `readmitted <30`).

In [None]:
# === UNIVERSAL PATH SETUP (Works in both Local and Colab) ===
# Goal: make the project's `src` package importable from this notebook.
import sys
import os

try:
    # Preferred route: let the project's own helper configure the paths.
    from src.utils import setup_paths
    env_type = setup_paths()
except ImportError:
    # `src` is not importable yet (e.g. first run): locate the project root by hand.
    print("⚙️  Setting up paths...")
    try:
        import google.colab
    except ImportError:
        # No Colab runtime available: the parent of the working directory
        # is taken as the project root.
        # NOTE(review): presumably the notebook is launched from a directory
        # directly under the project root — confirm.
        in_colab = False
        project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
        print("💻 Detected: Local Environment")
    else:
        in_colab = True
        # Step up one level when the working directory path mentions 'notebooks'.
        # NOTE(review): this is a substring test on the whole path, so any
        # ancestor directory containing 'notebooks' also triggers it — confirm intent.
        if 'notebooks' in os.getcwd():
            os.chdir('..')
        project_root = os.getcwd()
        print("☁️  Detected: Google Colab")

    # Put the project root first on the import path, without duplicating it.
    if project_root not in sys.path:
        sys.path.insert(0, project_root)
    print(f"✅ Project root: {project_root}")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from src.preprocessing import build_feature_matrix, train_test_split_stratified
from src.models import fit_isolation_forest, score_isolation_forest
from src.evaluation import evaluate_anomaly_detector, plot_roc_pr, save_metrics_summary

# Output directory for this notebook's saved plot and metrics file;
# created if it does not exist yet.
results_dir = Path('..') / 'results'
results_dir.mkdir(exist_ok=True)

## Step 1: Load and Preprocess Data

In [None]:
# Turn the raw CSV into features X, target y and the preprocessor object
# returned by the project helper.
data_path = '../data/raw/diabetic_data.csv'
X, y, preprocessor = build_feature_matrix(data_path)

print(f"Feature matrix shape: {X.shape}")
print(f"Target distribution: {y.value_counts()}")

# Positive-class summary.
# NOTE(review): assumes y is coded 0/1 so that sum/mean give count/rate — confirm.
n_positive = y.sum()
positive_pct = 100 * y.mean()
print(f"Positive class (readmitted <30): {n_positive} ({positive_pct:.2f}%)")

## Step 2: Train-Test Split

In [None]:
# Stratified train/test split with test_size=0.2 and a fixed random_state
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split_stratified(X, y, **split_options)

# Report the size of each partition ...
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Train set: {n_train} samples")
print(f"Test set:  {n_test} samples")

# ... and its positive rate, to check the two partitions are comparable.
train_rate, test_rate = y_train.mean(), y_test.mean()
print(f"\nTrain positive rate: {train_rate:.4f}")
print(f"Test positive rate:  {test_rate:.4f}")

## Step 3: Train Isolation Forest

In [None]:
# Fit the Isolation Forest on the training features only (no labels are passed)
print("Training Isolation Forest...")
train_features = X_train.values
iso_forest = fit_isolation_forest(train_features, contamination=0.1, random_state=42)
print("✓ Training complete!")

## Step 4: Compute Anomaly Scores

In [None]:
# Score the test features with the fitted model.
# NOTE(review): scores are treated as "higher = more anomalous"; this depends on
# the sign convention of score_isolation_forest — confirm.
test_features = X_test.values
if_scores_test = score_isolation_forest(iso_forest, test_features)

# Quick summary of the score distribution
score_lo, score_hi = if_scores_test.min(), if_scores_test.max()
print(f"Anomaly scores range: [{score_lo:.4f}, {score_hi:.4f}]")
score_mean = if_scores_test.mean()
print(f"Mean score: {score_mean:.4f}")
score_std = if_scores_test.std()
print(f"Std score:  {score_std:.4f}")

## Step 5: Evaluate Performance

In [None]:
# Compare the anomaly scores against the true test labels
y_test_labels = y_test.values
if_metrics = evaluate_anomaly_detector(
    y_test_labels,
    if_scores_test,
    model_name="Isolation Forest",
)

In [None]:
# Draw ROC and PR curves for the model, keyed by its display name
scores_by_model = {'Isolation Forest': if_scores_test}
plot_roc_pr(y_test.values, scores_by_model)

# Write matplotlib's current figure to the results directory.
# NOTE(review): if plot_roc_pr shows or closes its figure, the saved file
# may be blank — confirm against the helper.
roc_pr_path = results_dir / 'if_roc_pr_curves.png'
plt.savefig(roc_pr_path, dpi=150, bbox_inches='tight')
print(f"✓ Plots saved to {roc_pr_path}")

## Step 6: Save Results

In [None]:
# Write the Isolation Forest metrics to a CSV in the results directory
metrics_csv = results_dir / 'isolation_forest_metrics.csv'
save_metrics_summary([if_metrics], metrics_csv)
print("✓ Baseline evaluation complete!")