# 02. Baseline: Isolation Forest

This notebook implements the Isolation Forest baseline for anomaly detection.

In [None]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Add src to path
sys.path.append('../src')
from preprocessing import load_data, get_selected_features, clean_data, create_target, fit_transform_data
from models import train_isolation_forest, get_if_anomaly_scores
from evaluation import evaluate_anomaly_detector, plot_evaluation_curves
from sklearn.model_selection import train_test_split

## Load and Preprocess Data

In [None]:
# Load the raw CSV and clean it using the project's selected feature list.
data_path = '../data/diabetic_data.csv'
df = load_data(data_path)
features = get_selected_features()
df_clean = clean_data(df, features)

# Derive the (X, y) pair, then run the project preprocessing step on X.
# NOTE(review): fit_transform_data is called on the full dataset before the
# train/test split below. If it learns statistics from the data (scaling,
# encoding), held-out rows influence the transform -- confirm, and consider
# fitting on the training rows only.
X, y = create_target(df_clean)
X_processed, preprocessor = fit_transform_data(X)

# Stratified 80/20 hold-out split on the underlying arrays; the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed.values,
    y.values,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Train Isolation Forest

In [None]:
# Semi-supervised setting: fit only on training rows whose label is 0 (treated
# as "normal"), mirroring the autoencoder setup. The fully unsupervised
# alternative would pass all of X_train instead.
normal_mask = y_train == 0
X_train_normal = X_train[normal_mask]

# NOTE(review): contamination=0.1 is passed even though this subset excludes
# label-1 rows -- confirm how train_isolation_forest uses the value.
iso_forest = train_isolation_forest(X_train_normal, contamination=0.1)
print("Isolation Forest trained.")

## Evaluate

In [None]:
# Score the held-out set, then report metrics and curves under one shared label.
model_label = "Isolation Forest"
if_scores = get_if_anomaly_scores(iso_forest, X_test)
metrics = evaluate_anomaly_detector(y_test, if_scores, model_name=model_label)

# The plotting helper takes a mapping of model label -> score array.
plot_evaluation_curves(y_test, {model_label: if_scores})