# Temporal Drift-Aware Fraud Detection Exploration

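This notebook demonstrates the key features of the temporal drift-aware fraud detection system on synthetic data that mimics the IEEE-CIS structure. Run it from its own directory, one level below the project root: the cells resolve `../src`, `../configs/default.yaml`, and `../models_demo` relative to the current working directory.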

In [None]:
# Make the project's `src/` directory importable. The path is resolved from the
# current working directory, so the notebook must be started one level below
# the project root.
import sys
from pathlib import Path

SRC_DIR = str(Path.cwd().parent / "src")
if SRC_DIR not in sys.path:  # keep re-runs of this cell from stacking duplicate entries
    sys.path.insert(0, SRC_DIR)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning to keep the demo output readable.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Set style. matplotlib 3.6 renamed the bundled seaborn style sheets to
# 'seaborn-v0_8*'; fall back to the legacy name on older releases instead of
# failing with an unknown-style error.
SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(SEABORN_STYLE)
sns.set_palette("husl")

## 1. Data Loading and Exploration

In [None]:
from temporal_drift_aware_fraud_detection_with_adversarial_validation.data.loader import DataLoader
from temporal_drift_aware_fraud_detection_with_adversarial_validation.utils.config import load_config

# Read the default experiment configuration (path is relative to the working directory)
config = load_config('../configs/default.yaml')
print("Configuration loaded successfully")

# Build a seeded loader and fetch both splits
# (synthetic data that mimics the IEEE-CIS structure)
data_loader = DataLoader(random_seed=42)
train_data, test_data = data_loader.load_data()

# Report the shape of each split first, then the share of rows flagged as fraud
splits = (("Train", train_data), ("Test", test_data))
for split_name, split_frame in splits:
    print(f"{split_name} data shape: {split_frame.shape}")
for split_name, split_frame in splits:
    print(f"{split_name} fraud rate: {split_frame['isFraud'].mean():.4f}")

In [None]:
# Ask the loader for its summary of the training frame and print it entry by entry
feature_info = data_loader.get_feature_info(train_data)
print("Dataset Information:")
for field_name, field_value in feature_info.items():
    print(f"  {field_name}: {field_value}")

## 2. Temporal Analysis

In [None]:
# Analyze temporal patterns.
#
# The helper Series below are deliberately NOT stored as columns on
# `train_data`: adding columns here would leak them into the feature
# engineering and training cells further down, while `test_data` would not
# have them at prediction time.
N_TIME_BINS = 20
SECONDS_PER_HOUR = 3600
SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR

# Equal-width bins over the observed TransactionDT range
time_bin = pd.cut(train_data['TransactionDT'], bins=N_TIME_BINS).rename('time_bin')
# assumes TransactionDT is an offset measured in seconds — TODO confirm against the loader
hour_of_day = ((train_data['TransactionDT'] % SECONDS_PER_DAY) // SECONDS_PER_HOUR).rename('hour')
is_fraud = train_data['isFraud'] == 1
log_amount = np.log1p(train_data['TransactionAmt'])

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Transaction volume over time. observed=False keeps empty bins, so both
# time-series panels always span all N_TIME_BINS periods and stay aligned.
time_counts = train_data['TransactionDT'].groupby(time_bin, observed=False).size()
axes[0, 0].plot(range(len(time_counts)), time_counts.values)
axes[0, 0].set_title('Transaction Volume Over Time')
axes[0, 0].set_xlabel('Time Period')
axes[0, 0].set_ylabel('Number of Transactions')

# Fraud rate over time (NaN for bins that contain no transactions)
fraud_rate_over_time = train_data['isFraud'].groupby(time_bin, observed=False).mean()
axes[0, 1].plot(range(len(fraud_rate_over_time)), fraud_rate_over_time.values, color='red')
axes[0, 1].set_title('Fraud Rate Over Time')
axes[0, 1].set_xlabel('Time Period')
axes[0, 1].set_ylabel('Fraud Rate')

# Transaction amount distribution. The 'Normal' series contains only
# non-fraud rows, so the two histograms are disjoint and match their labels.
axes[1, 0].hist(log_amount[~is_fraud], bins=50, alpha=0.7, label='Normal')
axes[1, 0].hist(log_amount[is_fraud], bins=50, alpha=0.7, label='Fraud', color='red')
axes[1, 0].set_title('Log Transaction Amount Distribution')
axes[1, 0].set_xlabel('Log(1 + Amount)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].legend()

# Hour of day patterns
hourly_fraud = train_data['isFraud'].groupby(hour_of_day).mean()
axes[1, 1].bar(hourly_fraud.index, hourly_fraud.values)
axes[1, 1].set_title('Fraud Rate by Hour of Day')
axes[1, 1].set_xlabel('Hour')
axes[1, 1].set_ylabel('Fraud Rate')

plt.tight_layout()
plt.show()

## 3. Feature Engineering and Preprocessing

In [None]:
from temporal_drift_aware_fraud_detection_with_adversarial_validation.data.preprocessing import FeatureEngineer, TemporalSplitter

# Seeded feature engineer, fitted and applied to the training frame in one step
feature_engineer = FeatureEngineer(random_seed=42)

# Column count before and after the transformation
print("Original features:", train_data.shape[1])
train_processed = feature_engineer.fit_transform(train_data)
print("After feature engineering:", train_processed.shape[1])

# List, alphabetically, up to ten columns present only in the transformed frame
print("\nNew features created:")
original_cols = set(train_data.columns)
new_cols = set(train_processed.columns).difference(original_cols)
for engineered_col in sorted(new_cols)[:10]:
    print(f"  {engineered_col}")

In [None]:
# Temporal splitting demonstration
splitter = TemporalSplitter(random_seed=42)

# Ask the splitter for five drift periods built from the training frame
drift_periods = splitter.create_drift_periods(train_data, n_periods=5)

# One summary record per period: row count, fraud share, mean amount and the
# TransactionDT span it covers
period_stats = [
    {
        'period': period_idx,
        'size': len(period_frame),
        'fraud_rate': period_frame['isFraud'].mean(),
        'avg_amount': period_frame['TransactionAmt'].mean(),
        'time_range': f"{period_frame['TransactionDT'].min():.1f} - {period_frame['TransactionDT'].max():.1f}",
    }
    for period_idx, period_frame in enumerate(drift_periods)
]

period_df = pd.DataFrame(period_stats)
print("Drift periods analysis:")
print(period_df)

## 4. Adversarial Validation Demo

In [None]:
from temporal_drift_aware_fraud_detection_with_adversarial_validation.models.model import AdversarialValidator

# Build the adversarial validator
adv_validator = AdversarialValidator(
    model_type='lightgbm',
    random_seed=42,
    n_estimators=100,  # Reduced for demo speed
)

# Compare the first drift period (source) against the last one (target)
source_period, target_period = drift_periods[0], drift_periods[-1]

print(f"Source period size: {len(source_period)}")
print(f"Target period size: {len(target_period)}")

# Fit the validator on the two periods; the returned mapping holds its metrics
metrics = adv_validator.fit(source_period, target_period)

print("\nAdversarial validation results:")
for metric_name, metric_value in metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

# A drift score above 0.6 is reported as significant drift
drift_detected = metrics['drift_score'] > 0.6
print("\n⚠️  Significant drift detected!" if drift_detected else "\n✅ Low drift detected.")

## 5. Ensemble Training Demo

In [None]:
from temporal_drift_aware_fraud_detection_with_adversarial_validation.training.trainer import DriftAwareTrainer

# Demo-sized settings, gathered here so they are easy to tune
DEMO_BOOSTING_ROUNDS = 50
DEMO_DRIFT_PERIODS = 3
DEMO_TRAIN_ROWS = 2000

# Create a small config for demo
# NOTE(review): assumes to_dict() returns a dict that can be mutated without
# affecting `config` — confirm against the config class.
demo_config = config.to_dict()

# Reduce model complexity for quick demo.
# Only the round-count key(s) a model already uses are overridden; a model with
# neither key gets 'n_estimators'. Injecting 'n_estimators' next to an existing
# 'iterations' would hand both spellings to the model, and libraries that treat
# them as aliases of one setting (e.g. CatBoost) reject such a config.
for model_cfg in demo_config.get('base_models', {}).values():
    if 'params' not in model_cfg:
        continue
    params = model_cfg['params']
    round_keys = [key for key in ('n_estimators', 'iterations') if key in params]
    for key in round_keys or ['n_estimators']:
        params[key] = DEMO_BOOSTING_ROUNDS

# setdefault: do not fail when the config has no 'training' section yet
demo_config.setdefault('training', {})['n_drift_periods'] = DEMO_DRIFT_PERIODS

# Initialize trainer
trainer = DriftAwareTrainer(
    config=demo_config,
    save_dir='../models_demo'
)

# Use smaller sample for demo, capped at the rows actually available so that
# sample() cannot raise on a small training frame
demo_data = train_data.sample(n=min(DEMO_TRAIN_ROWS, len(train_data)), random_state=42)

print("Training ensemble on demo data...")
print("(This may take a few minutes)")

# Train the system
results = trainer.train(demo_data, save_best=False)

print("\nTraining completed!")
print("Key results:")
# Only scalar numeric entries are printed; other result values are skipped
for key, value in results.items():
    if isinstance(value, (int, float)):
        print(f"  {key}: {value:.4f}")

## 6. Prediction and Evaluation

In [None]:
from sklearn.metrics import roc_curve, auc

# Make predictions on a reproducible sample of the test data, capped at the
# rows actually available so that sample() cannot raise on a small test set
test_sample = test_data.sample(n=min(500, len(test_data)), random_state=42)
probs, preds = trainer.predict(test_sample)

print(f"Made predictions on {len(test_sample)} transactions")
print(f"Predicted fraud rate: {preds.mean():.4f}")
print(f"Actual fraud rate: {test_sample['isFraud'].mean():.4f}")

# Plain NumPy labels: test_sample keeps its shuffled original index, so the
# masks below are applied by position rather than by index label.
y_true = test_sample['isFraud'].to_numpy()

# Analyze prediction distribution
fig, (ax_dist, ax_roc) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: predicted probabilities split by the true label
ax_dist.hist(probs[y_true == 0], bins=30, alpha=0.7, label='Normal', density=True)
ax_dist.hist(probs[y_true == 1], bins=30, alpha=0.7, label='Fraud', density=True)
ax_dist.set_xlabel('Predicted Probability')
ax_dist.set_ylabel('Density')
ax_dist.set_title('Prediction Distribution by True Label')
ax_dist.legend()

# Right panel: ROC curve with the chance diagonal for reference
fpr, tpr, _ = roc_curve(y_true, probs)
roc_auc = auc(fpr, tpr)
ax_roc.plot(fpr, tpr, linewidth=2, label=f'ROC Curve (AUC = {roc_auc:.3f})')
ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.5)
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()

plt.tight_layout()
plt.show()

## 7. Model Interpretability

In [None]:
# Get model contributions: pass every column except the label and the transaction id
excluded_cols = {'isFraud', 'TransactionID'}
feature_cols = [col for col in test_sample.columns if col not in excluded_cols]
test_features = test_sample[feature_cols]

contributions = trainer.ensemble.get_model_contributions(test_features)

print("Individual model contributions:")
# squeeze=False always yields a 2-D grid, so a single-model ensemble needs no special case
fig, axes_grid = plt.subplots(1, len(contributions), figsize=(15, 4), squeeze=False)
axes = axes_grid[0]

# One panel per base model: its predictions against the ensemble's, with the
# identity line as a visual reference
for ax, (model_name, model_preds) in zip(axes, contributions.items()):
    ax.scatter(probs, model_preds, alpha=0.6)
    ax.plot([0, 1], [0, 1], 'r--', alpha=0.5)
    ax.set_xlabel('Ensemble Prediction')
    ax.set_ylabel(f'{model_name} Prediction')
    ax.set_title(f'{model_name} vs Ensemble')

    # Annotate the panel with the Pearson correlation between the two prediction vectors
    corr = np.corrcoef(probs, model_preds)[0, 1]
    ax.text(
        0.05, 0.95, f'Correlation: {corr:.3f}',
        transform=ax.transAxes,
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
    )

plt.tight_layout()
plt.show()

# Show model weights
print("\nModel weights in ensemble:")
for model_label, model_weight in trainer.ensemble.model_weights.items():
    print(f"  {model_label}: {model_weight:.4f}")

## 8. Calibration Analysis

In [None]:
from temporal_drift_aware_fraud_detection_with_adversarial_validation.evaluation.metrics import CalibrationMetrics
from sklearn.calibration import calibration_curve

# Score the predicted probabilities against the true labels using ten bins
calibration_evaluator = CalibrationMetrics(n_bins=10)
cal_metrics = calibration_evaluator.evaluate_calibration(test_sample['isFraud'], probs)

print("Calibration metrics:")
for metric_name, metric_value in cal_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

# Reliability curve data: observed positive fraction and mean prediction per bin
fraction_of_positives, mean_predicted_value = calibration_curve(
    test_sample['isFraud'], probs, n_bins=10
)

fig, (ax_curve, ax_hist) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: calibration curve against the diagonal of perfect calibration
ax_curve.plot(mean_predicted_value, fraction_of_positives, "s-", linewidth=2, label="Model")
ax_curve.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
ax_curve.set_xlabel("Mean Predicted Probability")
ax_curve.set_ylabel("Fraction of Positives")
ax_curve.set_title("Calibration Plot")
ax_curve.legend()

# Right panel: how the predicted probabilities are distributed overall
ax_hist.hist(probs, bins=20, alpha=0.7, density=True)
ax_hist.set_xlabel("Predicted Probability")
ax_hist.set_ylabel("Density")
ax_hist.set_title("Prediction Distribution")

plt.tight_layout()
plt.show()

## Summary

This notebook demonstrated the key capabilities of the temporal drift-aware fraud detection system:

1. **Temporal Pattern Analysis**: Understanding how fraud patterns evolve over time
2. **Advanced Feature Engineering**: Creating time-aware and interaction features
3. **Adversarial Validation**: Detecting distribution drift between time periods
4. **Ensemble Learning**: Combining multiple models with dynamic weighting
5. **Drift-Aware Predictions**: Making predictions while monitoring reliability
6. **Model Interpretability**: Understanding individual model contributions
7. **Calibration Assessment**: Ensuring probabilistic outputs are well-calibrated

The system provides a production-ready framework for fraud detection that explicitly handles the challenge of temporal drift, ensuring robust performance as data distributions evolve.