# 04. Ontology-Aware Evaluation

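This notebook evaluates the impact of the ontology-inspired rule layer on anomaly detection performance by comparing a baseline Isolation Forest against the same scores combined with ontology-rule penalties.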

In [None]:
import pandas as pd
import numpy as np
import sys
import torch
from pathlib import Path

# Add src to path
sys.path.append('../src')
from preprocessing import load_data, get_selected_features, clean_data, create_target, fit_transform_data
from models import train_isolation_forest, get_if_anomaly_scores
from ontology import apply_ontology_rules, combine_scores
from evaluation import evaluate_anomaly_detector, plot_evaluation_curves, print_comparison_table
from sklearn.model_selection import train_test_split

## Load and Preprocess Data

In [None]:
# Load the CSV, restrict it to the selected features, and derive the feature
# matrix / target pair used by the rest of the notebook.
data_path = '../data/diabetic_data.csv'
df = load_data(data_path)
features = get_selected_features()
df_clean = clean_data(df, features)
X, y = create_target(df_clean)

# NOTE(review): the preprocessor is fitted on every row of X before the
# train/test split below, so test rows take part in fitting the transform.
# Confirm this is intended.
X_processed, preprocessor = fit_transform_data(X)

# Split on row labels rather than on the matrices themselves: the same labels
# then select both the processed features and the untransformed rows that the
# ontology rules are applied to.
train_idx, test_idx = train_test_split(
    df_clean.index,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

X_train, X_test = X_processed.loc[train_idx], X_processed.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]

# Untransformed test rows, selected with the same labels as X_test / y_test.
df_test_raw = df_clean.loc[test_idx]

## Baseline: Isolation Forest

In [None]:
# Fit the baseline only on training rows whose label is 0, then score the
# held-out test rows.
X_train_normal = X_train[y_train == 0]
iso_forest = train_isolation_forest(X_train_normal.values, contamination=0.1)
if_scores = get_if_anomaly_scores(iso_forest, X_test.values)

# Min-max normalize IF scores to the 0-1 range for combination. The min and
# max are taken from the test-set scores themselves.
# Guard the degenerate case where every score is identical: dividing by a
# zero range would fill the result with NaN, so the shifted (all-zero) scores
# are kept as-is instead.
if_score_min = if_scores.min()
if_score_range = if_scores.max() - if_score_min
if_scores_shifted = if_scores - if_score_min
if if_score_range > 0:
    if_scores_norm = if_scores_shifted / if_score_range
else:
    if_scores_norm = if_scores_shifted

## Ontology Layer

In [None]:
# Apply the ontology rule layer to the raw test rows (df_test_raw), not to the
# preprocessed feature matrix.
# presumably returns a pandas Series of per-row penalties, since it is used
# via .mean() here and .values when combining scores — confirm in ontology.py
ontology_penalties = apply_ontology_rules(df_test_raw)
# Quick sanity check on the overall penalty level.
print(f"Ontology penalties computed. Mean: {ontology_penalties.mean():.4f}")

## Combine and Evaluate

In [None]:
# Blend the normalized Isolation Forest scores with the ontology penalties.
# presumably alpha weights the IF score and beta the penalty — confirm
# against combine_scores in ontology.py
final_scores = combine_scores(
    if_scores_norm,
    ontology_penalties.values,
    alpha=0.7,
    beta=0.3,
)

# Evaluate the baseline and the combined detector against the same labels.
metrics_if = evaluate_anomaly_detector(
    y_test, if_scores_norm, model_name="Isolation Forest"
)
metrics_combined = evaluate_anomaly_detector(
    y_test, final_scores, model_name="Combined (IF + Ontology)"
)

# Side-by-side comparison, baseline first.
comparison_rows = [metrics_if, metrics_combined]
print_comparison_table(comparison_rows)

# Curves for both score vectors, keyed by the label passed to the plot helper.
curve_scores = {
    "Isolation Forest": if_scores_norm,
    "Combined": final_scores,
}
plot_evaluation_curves(y_test, curve_scores)