# 01: Agent Trace Baseline Results

Trains Isolation Forest, LSTM Autoencoder, and Deep Clustering on TRAIL and TRACE datasets.
Evaluates within-domain detection performance.

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve

from src.data.trail_loader import load_trail_dataset, get_trail_labels
from src.data.trace_loader import load_trace_dataset, trace_to_otel_format
from src.features.agent_extractor import AgentTraceFeatureExtractor
from src.features.ubfs_schema import UBFSNormalizer, ubfs_feature_names
from src.models.isolation_forest import IsolationForestDetector
from src.models.lstm_autoencoder import LSTMAutoencoderDetector
from src.models.deep_clustering import DeepClusteringDetector
from src.evaluation.metrics import compute_metrics

## Load TRAIL Dataset

In [None]:
# Load the TRAIL dataset and build the label vector from its annotations.
trail = load_trail_dataset()
trail_annotations = trail['annotations']
y_trail = get_trail_labels(trail_annotations)

# Turn the raw traces into a feature matrix, then z-score it.
# NOTE(review): the normalizer is fit on every trace, including the ones
# labelled as errors -- confirm this is intended for the baseline.
extractor = AgentTraceFeatureExtractor()
X_trail, ids, ts = extractor.extract_batch(trail['traces'])
normalizer = UBFSNormalizer(method='zscore')
X_trail = normalizer.fit_transform(X_trail)

# Quick sanity summary: dataset size, label sum, and feature width.
n_traces = X_trail.shape[0]
n_error_traces = y_trail.sum()
n_features = X_trail.shape[1]
print(f'TRAIL: {n_traces} traces, {n_error_traces} with errors')
print(f'Feature dimensions: {n_features}')

## Train and Evaluate Models

In [None]:
# Every detector is fit on the traces labelled 0 only and then scored on
# the full TRAIL feature matrix; metrics are collected per model name.
normal_mask = y_trail == 0
trail_normal = X_trail[normal_mask]
print(f'Training on {len(trail_normal)} normal traces')

results = {}

# --- Isolation Forest ---
if_model = IsolationForestDetector(n_estimators=200, seed=42)
if_model.fit(trail_normal)
if_scores = if_model.score(X_trail)
if_metrics = compute_metrics(y_trail, if_scores)
results['Isolation Forest'] = if_metrics
print(f'IF AUC-ROC: {if_metrics.auc_roc:.4f}')

# --- LSTM Autoencoder ---
lstm = LSTMAutoencoderDetector(
    epochs=30,
    batch_size=16,
    device='cpu',
    seed=42,
    verbose=False,
)
# A length-1 axis is inserted at position 1 before the data reaches the LSTM
# (presumably so each trace is a single-step sequence -- TODO confirm the
# layout the detector expects).
normal_sequences = trail_normal[:, np.newaxis, :]
lstm.fit(normal_sequences)
all_sequences = X_trail[:, np.newaxis, :]
lstm_scores = lstm.score(all_sequences)
lstm_metrics = compute_metrics(y_trail, lstm_scores)
results['LSTM Autoencoder'] = lstm_metrics
print(f'LSTM AUC-ROC: {lstm_metrics.auc_roc:.4f}')

# --- Deep Clustering ---
dc = DeepClusteringDetector(pretrain_epochs=30, batch_size=16, seed=42)
dc.fit(trail_normal)
dc_scores = dc.score(X_trail)
dc_metrics = compute_metrics(y_trail, dc_scores)
results['Deep Clustering'] = dc_metrics
print(f'DC AUC-ROC: {dc_metrics.auc_roc:.4f}')

## ROC Curves

In [None]:
# One ROC curve per detector, drawn on a shared axis.
# Each entry: (key into `results`, anomaly scores, line colour).
roc_series = [
    ('Isolation Forest', if_scores, '#2196F3'),
    ('LSTM Autoencoder', lstm_scores, '#FF9800'),
    ('Deep Clustering', dc_scores, '#9C27B0'),
]

fig, ax = plt.subplots(figsize=(7, 5))
for detector_name, detector_scores, line_color in roc_series:
    false_pos, true_pos, _ = roc_curve(y_trail, detector_scores)
    auc_value = results[detector_name].auc_roc
    ax.plot(
        false_pos,
        true_pos,
        color=line_color,
        label=f'{detector_name} (AUC={auc_value:.3f})',
        linewidth=1.5,
    )

# Dashed diagonal as the chance-level reference line.
diagonal = [0, 1]
ax.plot(diagonal, diagonal, 'k--', alpha=0.3)
ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate')
ax.legend(loc='lower right')
plt.tight_layout()
plt.show()