# 04 - Ontology Rules Validation

Validate ML-detected anomalies using domain-specific ontological rules.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import load_raw_data
from src.ontology_rules import (
    OntologyRuleEngine,
    create_diabetes_rules,
    validate_with_ontology
)

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

## Load Data with Anomaly Predictions

In [None]:
# Load the saved anomaly-detection output when it exists; otherwise fall back
# to the raw dataset with a placeholder anomaly flag.
from pathlib import Path

results_path = Path('..', 'results', 'reports', 'anomaly_detection_results.csv')

if not results_path.exists():
    print("Anomaly detection results not found. Using raw data...")
    df = load_raw_data()
    # No detector output available: mark every record as non-anomalous so the
    # cells below can still run for demonstration purposes.
    df['high_confidence_anomaly'] = False
else:
    df = pd.read_csv(results_path)
    print(f"Loaded results with anomaly predictions: {df.shape}")

df.head()

## Define Ontology Rules

In [None]:
# Build the diabetes rule engine and list each registered rule.
rule_engine = create_diabetes_rules()

print("Ontology Rules:")
for position, entry in enumerate(rule_engine.rules, start=1):
    label = entry['name']
    detail = entry['description']
    print(f"{position}. {label}: {detail}")

## Apply Ontology Rules

In [None]:
# Pass the ML flag column to the validator only if the frame actually has it.
anomaly_col = None
if 'high_confidence_anomaly' in df.columns:
    anomaly_col = 'high_confidence_anomaly'

# Yields the validated frame plus an analysis mapping used by the cells below.
df_validated, analysis = validate_with_ontology(df, anomaly_col=anomaly_col)

## Rule Violation Analysis

In [None]:
# Print the per-rule violation summary under a heading.
print("\nRule Violation Summary:", analysis['rule_violation_summary'], sep="\n")

In [None]:
# Horizontal bar chart of violation counts per rule (skipped when the summary is empty).
violation_summary = analysis['rule_violation_summary']

if len(violation_summary) > 0:
    fig, ax = plt.subplots(figsize=(10, 6))
    violation_summary['violations'].plot(kind='barh', color='coral', ax=ax)
    ax.set_xlabel('Number of Violations')
    ax.set_title('Ontology Rule Violations by Rule Type')
    fig.tight_layout()
    plt.show()

## Compare ML Anomalies vs Rule Violations

In [None]:
# Report how the ML flags and the rule violations overlap.
if anomaly_col and 'has_rule_violation' in df_validated.columns:
    # (caption, analysis key) pairs, in print order; missing keys report as 0.
    report_rows = (
        ("Total ML anomalies", 'total_ml_anomalies'),
        ("Total rule violations", 'total_rule_violations'),
        ("\nConfirmed anomalies (both ML & rules)", 'confirmed_anomalies'),
        ("ML only (no rule violation)", 'ml_only_anomalies'),
        ("Rules only (not ML anomaly)", 'rule_only_violations'),
    )

    print("\n=== ML vs Ontology Comparison ===")
    for caption, key in report_rows:
        print(f"{caption}: {analysis.get(key, 0)}")

In [None]:
# Two-set Venn diagram of ML anomalies versus ontology rule violations.
if anomaly_col and 'has_rule_violation' in df_validated.columns:
    from matplotlib_venn import venn2

    # Region sizes; any key missing from the analysis counts as 0.
    only_ml = analysis.get('ml_only_anomalies', 0)
    only_rules = analysis.get('rule_only_violations', 0)
    in_both = analysis.get('confirmed_anomalies', 0)

    plt.figure(figsize=(8, 6))
    venn2(
        subsets=(only_ml, only_rules, in_both),
        set_labels=('ML Anomalies', 'Rule Violations'),
        set_colors=('skyblue', 'coral'),
        alpha=0.7,
    )
    plt.title('Overlap Between ML Anomalies and Ontology Rule Violations')
    plt.tight_layout()
    plt.show()

## Examine Confirmed Anomalies

In [None]:
# Records that are flagged by the ML detector AND violate at least one rule.
if anomaly_col and 'has_rule_violation' in df_validated.columns:
    confirmed = df_validated[
        (df_validated[anomaly_col] == True) &
        (df_validated['has_rule_violation'] == True)
    ]

    print(f"\nConfirmed Anomalies: {len(confirmed)}")
    print("\nSample confirmed anomalies:")

    # Per-rule indicator columns; show at most the first five of them.
    violation_cols = [col for col in confirmed.columns if col.startswith('violates_')]
    display_cols = violation_cols[:5]

    # A bare expression nested inside an `if` block is not echoed by the
    # notebook, so the sample has to be printed explicitly to be visible.
    print(confirmed[display_cols].head())

In [None]:
# Count, per rule, how many ML-detected anomalies violate it, then chart the counts.
if anomaly_col and 'has_rule_violation' in df_validated.columns:
    ml_anomalies = df_validated[df_validated[anomaly_col] == True]
    ml_total = len(ml_anomalies)

    # Derive the indicator columns here rather than relying on a variable
    # left behind by another cell, so this cell also works when run alone.
    violation_cols = [col for col in df_validated.columns if col.startswith('violates_')]

    print("\nRule violations among ML-detected anomalies:")
    violation_counts = {}

    for col in violation_cols:
        rule_name = col.replace('violates_', '')
        count = ml_anomalies[col].sum()
        # Guard against division by zero when no ML anomalies exist.
        pct = (count / ml_total * 100) if ml_total > 0 else 0
        violation_counts[rule_name] = count
        print(f"  {rule_name}: {count} ({pct:.1f}%)")

    # Visualize
    if violation_counts:
        plt.figure(figsize=(10, 5))
        # Materialise the dict views: plain lists are accepted by every
        # matplotlib version, whereas view objects are not guaranteed to be.
        plt.bar(
            list(violation_counts.keys()),
            list(violation_counts.values()),
            color='steelblue',
            alpha=0.7,
        )
        plt.xlabel('Rule Type')
        plt.ylabel('Violation Count')
        plt.title('Rule Violations Among ML-Detected Anomalies')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

## Save Final Results

In [None]:
# Persist the validated records (CSV) and the analysis summary (JSON).
import json

output_path = Path('..') / 'results' / 'reports' / 'ontology_validated_results.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df_validated.to_csv(output_path, index=False)

print(f"\nValidated results saved to: {output_path}")

# Save analysis summary
summary_path = Path('..') / 'results' / 'reports' / 'validation_summary.json'
summary_path.parent.mkdir(parents=True, exist_ok=True)


def _to_builtin(value):
    """Convert NumPy scalars/arrays to plain Python objects for ``json.dump``.

    The json module cannot encode NumPy types (e.g. ``numpy.int64`` counts);
    anything else unsupported still raises ``TypeError`` as usual.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# The summary DataFrame is not JSON serializable as-is, so store it as a dict.
analysis_json = {k: v for k, v in analysis.items() if k != 'rule_violation_summary'}
analysis_json['rule_violation_summary'] = analysis['rule_violation_summary'].to_dict()

with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(analysis_json, f, indent=2, default=_to_builtin)

print(f"Validation summary saved to: {summary_path}")

## Conclusions

**Key Findings:**

1. **ML Anomaly Detection**: Identified anomalies using an ensemble of methods
2. **Ontology Validation**: Applied domain-specific rules to validate findings
3. **Confirmed Anomalies**: Records flagged by both ML and ontology rules represent high-confidence issues
4. **ML-Only Anomalies**: May represent novel patterns not captured by current rules
5. **Rule-Only Violations**: Data quality issues that don't show unusual statistical patterns

**Next Steps:**
- Investigate confirmed anomalies for clinical significance
- Refine ontology rules based on domain expert feedback
- Update ML models with validated anomaly labels
- Consider semi-supervised approaches combining both methods