# 03 - Anomaly Detection

Apply multiple anomaly detection algorithms and analyze results.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import load_processed_data
from src.anomaly_detection import (
    AnomalyDetector,
    detect_anomalies,
    compare_methods
)

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

## Load Preprocessed Data

In [None]:
# Load the dataset through the project's load_processed_data helper and
# report its (rows, columns) shape.
df = load_processed_data()
print(f"Loaded data shape: {df.shape}")

# Last expression of the cell: the notebook renders the first five rows.
df.head(n=5)

## Select Features for Anomaly Detection

In [None]:
# Use every numeric column of df as an anomaly-detection feature.
feature_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Using {len(feature_cols)} features for anomaly detection")

# Preview at most the first 10 feature names. The trailing "..." is shown only
# when the list was actually truncated, so a short list is not misreported.
preview_limit = 10
truncation_suffix = "..." if len(feature_cols) > preview_limit else ""
print(f"\nFeatures: {feature_cols[:preview_limit]}{truncation_suffix}")

## Method 1: Isolation Forest

In [None]:
# Run the Isolation Forest detector over the selected features.
# detect_anomalies returns the annotated frame and the detector object.
if_params = {'method': 'isolation_forest', 'contamination': 0.1}
df_if, detector_if = detect_anomalies(df, feature_cols=feature_cols, **if_params)

# Summing the 'is_anomaly' flags gives the number of flagged records.
print(f"\nAnomalies detected: {df_if['is_anomaly'].sum()}")

In [None]:
# Visualize Isolation Forest output: score distribution (left) and the
# normal-vs-anomaly record counts (right).
fig, (ax_scores, ax_counts) = plt.subplots(1, 2, figsize=(12, 4))

ax_scores.hist(df_if['anomaly_score'], bins=50, edgecolor='black', alpha=0.7)
ax_scores.set_xlabel('Anomaly Score')
ax_scores.set_ylabel('Frequency')
ax_scores.set_title('Distribution of Anomaly Scores (Isolation Forest)')

# Reindex to both classes so a missing class (no anomalies, or only anomalies)
# is plotted as 0 instead of raising KeyError on the lookup.
# Assumes 'is_anomaly' holds boolean flags — TODO confirm against detect_anomalies.
anomaly_counts = (
    df_if['is_anomaly']
    .value_counts()
    .reindex([False, True], fill_value=0)
)
ax_counts.bar(['Normal', 'Anomaly'], anomaly_counts.tolist(),
              color=['green', 'red'], alpha=0.7)
ax_counts.set_ylabel('Count')
ax_counts.set_title('Normal vs Anomaly')

fig.tight_layout()
plt.show()

## Method 2: Local Outlier Factor (LOF)

In [None]:
# Run the Local Outlier Factor detector with the same features and
# contamination rate used for the other methods.
lof_params = {'method': 'lof', 'contamination': 0.1}
df_lof, detector_lof = detect_anomalies(df, feature_cols=feature_cols, **lof_params)

## Method 3: One-Class SVM

In [None]:
# Run the One-Class SVM detector with the same features and contamination
# rate used for the other methods.
svm_params = {'method': 'ocsvm', 'contamination': 0.1}
df_svm, detector_svm = detect_anomalies(df, feature_cols=feature_cols, **svm_params)

## Compare Methods

In [None]:
# Run every detection method in one call; results is keyed by method name and
# each entry exposes a 'predictions' sequence.
results = compare_methods(df, feature_cols=feature_cols, contamination=0.1)

# One column per method, in display order (column label -> results key).
method_labels = {
    'IsolationForest': 'isolation_forest',
    'LOF': 'lof',
    'OneClassSVM': 'ocsvm',
}
comparison_df = pd.DataFrame(
    {label: results[key]['predictions'] for label, key in method_labels.items()}
)

# Row-wise sum of the per-method flags = number of methods flagging the record.
# Assumes predictions are 0/1 or boolean with 1/True meaning anomaly — TODO confirm.
comparison_df['agreement_count'] = comparison_df.sum(axis=1)
votes = comparison_df['agreement_count']

print("\nMethod Agreement:")
print(votes.value_counts().sort_index())
print(f"\nAll methods agree (anomaly): {(votes == 3).sum()}")
print(f"All methods agree (normal): {(votes == 0).sum()}")

In [None]:
# Visualize how many records were flagged by 0, 1, 2 or 3 methods.
plt.figure(figsize=(10, 6))

agreement_counts = comparison_df['agreement_count'].value_counts().sort_index()

# Pick each bar's colour from its agreement level rather than its position:
# a positional colour list shifts colours whenever a level is absent
# (e.g. no record flagged by all three methods would paint level 3's colour
# onto a different bar). Unexpected levels fall back to gray.
severity_colors = {0: 'green', 1: 'yellow', 2: 'orange', 3: 'red'}
bar_colors = [severity_colors.get(level, 'gray') for level in agreement_counts.index]

plt.bar(agreement_counts.index, agreement_counts.values,
        color=bar_colors, alpha=0.7)
plt.xlabel('Number of Methods Detecting as Anomaly')
plt.ylabel('Count of Records')
plt.title('Agreement Between Anomaly Detection Methods')
plt.xticks([0, 1, 2, 3])
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## Analyze Detected Anomalies

In [None]:
# Focus on high-confidence anomalies: records flagged by at least two of the
# three methods (agreement_count >= 2).
high_confidence_anomalies = comparison_df['agreement_count'] >= 2
anomaly_indices = comparison_df[high_confidence_anomalies].index

print(f"High-confidence anomalies: {len(anomaly_indices)}")
print("\nSample anomalous records:")

# Select rows with a positional boolean mask. Passing index labels to
# df.iloc is only correct when the labels happen to equal row positions.
# Assumes comparison_df rows are in the same order as df rows.
df[high_confidence_anomalies.to_numpy()].head()

In [None]:
# Split df with the high-confidence mask and print summary statistics for
# each group so the two can be compared.
anomaly_data = df[high_confidence_anomalies]
normal_data = df[~high_confidence_anomalies]

print("\nComparison of Normal vs Anomaly Statistics:")
for heading, subset in (
    ("\nNormal Records:", normal_data),
    ("\nAnomalous Records:", anomaly_data),
):
    print(heading)
    print(subset.describe())

## Save Results

In [None]:
# Persist the per-method predictions and the high-confidence flag next to the
# original data as a CSV report.
from pathlib import Path

# Output column -> key in results; insertion order fixes the column order.
prediction_columns = {
    'if_anomaly': 'isolation_forest',
    'lof_anomaly': 'lof',
    'svm_anomaly': 'ocsvm',
}

# Work on a copy so df itself is left untouched.
df_results = df.copy()
for column_name, method_key in prediction_columns.items():
    df_results[column_name] = results[method_key]['predictions']
df_results['high_confidence_anomaly'] = high_confidence_anomalies

# Create the report directory if needed, then write without the index column.
output_path = Path('..') / 'results' / 'reports' / 'anomaly_detection_results.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(output_path, index=False)

print(f"Results saved to: {output_path}")

## Next Steps

Proceed to `04_ontology_rules.ipynb` to validate anomalies using domain-specific ontological rules.