# 03. Autoencoder Anomaly Detection

Train a PyTorch feedforward autoencoder on normal samples and use reconstruction error for anomaly detection.

In [None]:
# CRITICAL FOR COLAB: make the project root importable so `from src...` works.
import sys
import os

# The notebook is expected to run from a directory one level below the project
# root, so the parent of the current working directory is added to the import
# search path (this is what allows importing from src/).
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Guard against duplicates: without this check, every re-run of the cell would
# append the same directory to sys.path again.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from pathlib import Path

from src.preprocessing import build_feature_matrix, train_test_split_stratified
from src.models import train_autoencoder, reconstruction_error
from src.evaluation import evaluate_anomaly_detector, plot_roc_pr, save_metrics_summary

# Choose the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Directory that receives the plots, metrics, and model weights written by the
# later cells; exist_ok=True makes re-running this cell harmless.
results_dir = Path('../results')
results_dir.mkdir(exist_ok=True)

## Step 1: Load and Preprocess Data

In [None]:
# Location of the raw CSV, relative to the notebook's working directory
data_path = '../data/raw/diabetic_data.csv'

# build_feature_matrix returns three objects: the features (X), the labels (y),
# and a third value kept here as `preprocessor`.
# NOTE(review): presumably X is a DataFrame and y a Series (later cells use
# .values and .value_counts()) — confirm against src.preprocessing.
X, y, preprocessor = build_feature_matrix(data_path)

# Quick sanity check on what was loaded
print(
    f"Feature matrix shape: {X.shape}",
    f"Target distribution: {y.value_counts()}",
    sep="\n",
)

## Step 2: Train-Test Split

In [None]:
# Hold out 20% of the data for testing; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split_stratified(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The autoencoder is trained on normal samples only, i.e. rows labelled 0.
normal_mask = y_train == 0
X_train_normal = X_train[normal_mask]

# Report the size of each partition
print(
    f"Train set (all):    {X_train.shape[0]} samples",
    f"Train set (normal): {X_train_normal.shape[0]} samples",
    f"Test set:           {X_test.shape[0]} samples",
    sep="\n",
)

## Step 3: Train Autoencoder (on Normal Samples Only)

In [None]:
# Train autoencoder
# Seed the global PyTorch and NumPy RNGs before training so that repeated runs
# of this cell give the same losses and scores. The train/test split is already
# seeded with 42; the same value is reused here for consistency.
# NOTE(review): this only helps for randomness drawn from these global RNGs
# (presumably weight initialisation and batch shuffling inside
# train_autoencoder) — confirm against src.models.
torch.manual_seed(42)
np.random.seed(42)

# The network's input width is the number of feature columns
input_dim = X_train.shape[1]
# Size of the compressed representation in the middle of the autoencoder
bottleneck_dim = 32

print(f"Training autoencoder (input_dim={input_dim}, bottleneck_dim={bottleneck_dim})...")
# Fit on the normal-only subset; train_autoencoder returns the trained model
# and a sequence of training losses (plotted per epoch in the next cell).
autoencoder, train_losses = train_autoencoder(
    X_train_normal.values,
    input_dim=input_dim,
    bottleneck_dim=bottleneck_dim,
    epochs=20,
    batch_size=256,
    learning_rate=0.001,
    device=device,
    verbose=True
)
print("✓ Training complete!")

In [None]:
# Draw the training-loss curve using matplotlib's explicit Figure/Axes API
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, linewidth=2, color='#2E86AB')

# Axis labels, title, and a light grid for readability
ax.set_xlabel('Epoch', fontsize=12, fontweight='bold')
ax.set_ylabel('MSE Loss', fontsize=12, fontweight='bold')
ax.set_title('Autoencoder Training Loss', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
fig.tight_layout()

# Write the figure to disk first, then display it
fig.savefig(results_dir / 'autoencoder_training_loss.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"✓ Training plot saved to {results_dir / 'autoencoder_training_loss.png'}")

## Step 4: Compute Reconstruction Errors

In [None]:
# Score the test set with the trained autoencoder; the reconstruction error is
# used as the anomaly score in the evaluation cells below.
test_features = X_test.values
ae_scores_test = reconstruction_error(autoencoder, test_features, device=device)

# Summary statistics of the score distribution
print(
    f"Reconstruction error range: [{ae_scores_test.min():.6f}, {ae_scores_test.max():.6f}]",
    f"Mean reconstruction error: {ae_scores_test.mean():.6f}",
    f"Std reconstruction error:  {ae_scores_test.std():.6f}",
    sep="\n",
)

## Step 5: Evaluate Performance

In [None]:
# Compare the anomaly scores against the true test labels; the returned
# metrics are kept in ae_metrics so they can be saved in the final cell.
ae_metrics = evaluate_anomaly_detector(
    y_test.values,
    ae_scores_test,
    model_name="Autoencoder",
)

In [None]:
# ROC and precision-recall curves; scores are passed as a {model name: scores}
# mapping, here with the autoencoder as the only entry.
scores_by_model = {'Autoencoder': ae_scores_test}
plot_roc_pr(y_test.values, scores_by_model)

# NOTE(review): plt.savefig writes whichever figure is current once
# plot_roc_pr returns — presumably the helper leaves its figure open rather
# than showing/closing it; confirm, otherwise the saved image may be blank.
plt.savefig(results_dir / 'autoencoder_roc_pr_curves.png', dpi=150, bbox_inches='tight')
print(f"✓ Plots saved to {results_dir / 'autoencoder_roc_pr_curves.png'}")

## Step 6: Save Results

In [None]:
# Write the metrics summary alongside the other result files
save_metrics_summary([ae_metrics], results_dir / 'autoencoder_metrics.csv')

# Persist the model's state_dict (its parameters) rather than the module object
model_state = autoencoder.state_dict()
torch.save(model_state, results_dir / 'autoencoder_model.pth')

print(
    f"✓ Model saved to {results_dir / 'autoencoder_model.pth'}",
    "✓ Autoencoder evaluation complete!",
    sep="\n",
)