# Temporal Credit Degradation Detector - Exploration Notebook

This notebook demonstrates the capabilities of the temporal credit degradation detection system, including:

1. **Data Loading and Exploration**
2. **Temporal Data Splitting**
3. **Preprocessing with Temporal Features**
4. **Stability-Weighted Ensemble Training**
5. **Drift Detection Analysis**
6. **Comprehensive Model Evaluation**
7. **Model Interpretability and Insights**

## Key Innovation: Stability-Weighted Ensemble

Unlike standard drift detection systems that trigger binary alerts, this implementation uses a novel **stability-weighted ensemble** that:
- Continuously monitors model calibration quality
- Automatically reweights base models based on recent performance
- Enables graceful degradation rather than catastrophic failure
- Adapts to changing economic regimes without manual intervention

In [None]:
# Setup and imports
import sys
from pathlib import Path
import warnings

# NOTE(review): this silences every warning for the rest of the session
# (including deprecation warnings); narrow the filter when debugging.
warnings.filterwarnings('ignore')

# Add src to path so the project package is importable from the notebook.
# Guarded so re-running this cell does not keep prepending duplicate entries.
src_dir = str(Path.cwd().parent / "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# calibration_curve is provided by sklearn.calibration, not sklearn.metrics;
# importing it from sklearn.metrics raises ImportError.
from sklearn.calibration import calibration_curve
from sklearn.metrics import (
    brier_score_loss,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)

# Project imports
from temporal_credit_degradation_detector.data.loader import DataLoader
from temporal_credit_degradation_detector.data.preprocessing import CreditDataPreprocessor
from temporal_credit_degradation_detector.models.model import StabilityWeightedEnsemble
from temporal_credit_degradation_detector.training.trainer import ModelTrainer
from temporal_credit_degradation_detector.evaluation.metrics import ModelEvaluator, DriftDetector
from temporal_credit_degradation_detector.utils.config import Config

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

print("Setup complete!")

## 1. Data Loading and Exploration

We'll start by loading synthetic data that mimics the characteristics of real credit datasets, including temporal patterns that simulate economic regime changes.

In [None]:
# Create the configuration object and the data loader used by the later cells.
config = Config()
data_loader = DataLoader()

# Request a Home Credit-like sample (features and default labels).
print("Loading Home Credit data...")
X_home, y_home = data_loader.load_home_credit_data(sample_size=5000)

# Headline numbers: frame shape, share of defaults, and the first ten columns.
home_shape = X_home.shape
home_default_rate = y_home.mean()
leading_columns = list(X_home.columns[:10])
print(f"Home Credit data shape: {home_shape}")
print(f"Default rate: {home_default_rate:.3f}")
print(f"\nFeature names: {leading_columns}...")

# Class balance of the target.
print("\nTarget distribution:")
target_counts = y_home.value_counts()
print(target_counts)

In [None]:
# Explore temporal patterns in the data
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Temporal Patterns in Credit Data', fontsize=16)

# Period boundaries (in APPLICATION_MONTH units) for the early / middle / late
# comparison. Defined once so the histograms and the printed summary cannot
# drift apart.
EARLY_END_MONTH = 8      # early period: month < 8
LATE_START_MONTH = 16    # late period:  month >= 16

application_month = X_home['APPLICATION_MONTH']
early_mask = application_month < EARLY_END_MONTH
middle_mask = (application_month >= EARLY_END_MONTH) & (application_month < LATE_START_MONTH)
late_mask = application_month >= LATE_START_MONTH

# Default rate by month.
# Grouping the target directly by the month column uses pandas' built-in
# aggregation instead of a Python-level lambda per group; like the lookup it
# replaces, it relies on X_home and y_home sharing the same index.
monthly_default_rate = y_home.groupby(application_month).mean()
axes[0, 0].plot(monthly_default_rate.index, monthly_default_rate.values, marker='o')
axes[0, 0].set_title('Default Rate by Application Month')
axes[0, 0].set_xlabel('Application Month')
axes[0, 0].set_ylabel('Default Rate')
axes[0, 0].grid(True, alpha=0.3)

# Application volume by month
monthly_volume = application_month.value_counts().sort_index()
axes[0, 1].bar(monthly_volume.index, monthly_volume.values, alpha=0.7)
axes[0, 1].set_title('Application Volume by Month')
axes[0, 1].set_xlabel('Application Month')
axes[0, 1].set_ylabel('Number of Applications')

# Income distribution over time
income_col = 'AMT_INCOME_TOTAL'
early_income = X_home.loc[early_mask, income_col]
late_income = X_home.loc[late_mask, income_col]

# density=True puts both periods on a comparable scale despite different sizes.
axes[1, 0].hist(early_income, bins=30, alpha=0.6, label='Early Period', density=True)
axes[1, 0].hist(late_income, bins=30, alpha=0.6, label='Late Period', density=True)
axes[1, 0].set_title('Income Distribution: Early vs Late Period')
axes[1, 0].set_xlabel('Income')
axes[1, 0].set_ylabel('Density')
axes[1, 0].legend()

# Credit amount vs time
credit_monthly = X_home.groupby('APPLICATION_MONTH')['AMT_CREDIT'].mean()
axes[1, 1].plot(credit_monthly.index, credit_monthly.values, marker='s', color='orange')
axes[1, 1].set_title('Average Credit Amount by Month')
axes[1, 1].set_xlabel('Application Month')
axes[1, 1].set_ylabel('Average Credit Amount')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Printed summary uses the same masks as the plots above.
print("\nTemporal Analysis:")
print(f"Early period default rate (months 0-{EARLY_END_MONTH - 1}): {y_home[early_mask].mean():.3f}")
print(f"Middle period default rate (months {EARLY_END_MONTH}-{LATE_START_MONTH - 1}): {y_home[middle_mask].mean():.3f}")
print(f"Late period default rate (months {LATE_START_MONTH}+): {y_home[late_mask].mean():.3f}")

## 2. Temporal Data Splitting

Instead of splitting the data at random, we split it by time. This simulates real-world deployment, where a model is trained on historical data and then evaluated on data from later periods.

In [None]:
# Partition the data chronologically into train / validation / test.
print("Creating temporal splits...")
splits = data_loader.create_temporal_splits(
    X_home,
    y_home,
    time_column='APPLICATION_MONTH',
    train_months=12,
    val_months=4,
    test_months=8,
)

X_train, y_train = splits['train']
X_val, y_val = splits['val']
X_test, y_test = splits['test']

# One summary line per partition: row count and observed default rate.
for split_name, split_X, split_y in (
    ('Train', X_train, y_train),
    ('Validation', X_val, y_val),
    ('Test', X_test, y_test),
):
    print(f"{split_name} set: {split_X.shape[0]} samples, default rate: {split_y.mean():.3f}")

# Visualize the split: bars show partition size, the red line the default rate.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

# (label, sample count, default rate, bar colour) for each partition
split_info = [
    ('Train', len(X_train), y_train.mean(), 'blue'),
    ('Validation', len(X_val), y_val.mean(), 'orange'),
    ('Test', len(X_test), y_test.mean(), 'green'),
]

# Transpose the records into one list per column.
labels, sizes, default_rates, colors = (list(column) for column in zip(*split_info))
x_pos = np.arange(len(split_info))

# Left axis: sample counts as bars.
ax1 = ax
bars1 = ax1.bar(x_pos, sizes, alpha=0.7, color=colors, label='Sample Size')
ax1.set_xlabel('Data Split')
ax1.set_ylabel('Number of Samples', color='black')
ax1.set_xticks(x_pos)
ax1.set_xticklabels(labels)

# Right axis (shared x): default rate as a line.
ax2 = ax1.twinx()
line = ax2.plot(
    x_pos,
    default_rates,
    color='red',
    marker='o',
    linewidth=3,
    markersize=8,
    label='Default Rate',
)
ax2.set_ylabel('Default Rate', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Annotate each bar with its count, placed 2% above the bar top.
for bar, size in zip(bars1, sizes):
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax1.text(bar_center, bar_top + bar_top * 0.02, f'{size:,}', ha='center', va='bottom')

# Annotate each point on the line with its default rate.
for position, rate in enumerate(default_rates):
    ax2.text(position, rate + 0.005, f'{rate:.3f}', ha='center', va='bottom', color='red')

plt.title('Temporal Data Split Overview', fontsize=14, fontweight='bold')
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')
plt.tight_layout()
plt.show()

## 3. Advanced Preprocessing with Temporal Features

Our preprocessor creates temporal-aware features that help detect drift and maintain model stability.

In [None]:
# Build the preprocessor from the preprocessing section of the config.
print("Initializing preprocessor...")
preprocessor = CreditDataPreprocessor(config.preprocessing.__dict__)

# Fit on the training partition only, then reuse the fitted transform on the
# validation and test partitions below.
print("Fitting preprocessor on training data...")
X_train_processed = preprocessor.fit_transform(X_train, y_train)

n_original = X_train.shape[1]
n_processed = X_train_processed.shape[1]
print(f"Original features: {n_original}")
print(f"Processed features: {n_processed}")
print(f"Feature engineering added {n_processed - n_original} new features")

# Transform validation and test sets with the already-fitted preprocessor.
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

# Report how many columns the preprocessor placed in each feature group.
print("\nFeature types detected:")
for group_name, group_columns in (
    ('Numeric', preprocessor.numeric_features),
    ('Categorical', preprocessor.categorical_features),
    ('Temporal', preprocessor.temporal_features),
):
    print(f"{group_name} features: {len(group_columns)}")

# Rank the per-feature weights reported by the preprocessor, largest first.
feature_weights = preprocessor.get_feature_importance_weights(X_train_processed)
top_weighted_features = sorted(
    feature_weights.items(),
    key=lambda name_and_weight: name_and_weight[1],
    reverse=True,
)[:10]

print("\nTop 10 weighted features:")
for feature, weight in top_weighted_features:
    print(f"  {feature}: {weight:.3f}")

In [None]:
# Visualize preprocessing effects
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Preprocessing Effects', fontsize=16)

# Feature distributions before/after scaling
sample_feature = 'AMT_INCOME_TOTAL'
if sample_feature in X_train.columns:
    original_values = X_train[sample_feature]

    axes[0, 0].hist(original_values, bins=30, alpha=0.7, label='Original')
    axes[0, 0].set_title(f'{sample_feature} - Original')
    axes[0, 0].set_xlabel('Value')
    axes[0, 0].set_ylabel('Frequency')

    axes[0, 1].set_title(f'{sample_feature} - Processed (Scaled)')
    # Look the processed column up by NAME. Reusing the raw frame's column
    # position is only correct if preprocessing keeps every column in place;
    # if it reorders or inserts columns, the positional lookup would plot a
    # different feature under this title without any error.
    if sample_feature in X_train_processed.columns:
        processed_values = X_train_processed[sample_feature]
        axes[0, 1].hist(processed_values, bins=30, alpha=0.7, label='Processed', color='orange')
        axes[0, 1].set_xlabel('Value')
        axes[0, 1].set_ylabel('Frequency')
    else:
        # Say so explicitly instead of plotting an unrelated column.
        axes[0, 1].text(
            0.5, 0.5,
            f"'{sample_feature}' not present\nin processed features",
            ha='center', va='center', transform=axes[0, 1].transAxes,
        )

# Missing value patterns: total NaN count across the whole frame, before/after.
missing_before = X_train.isnull().sum().sum()
missing_after = X_train_processed.isnull().sum().sum()

axes[1, 0].bar(['Before', 'After'], [missing_before, missing_after],
               color=['red', 'green'], alpha=0.7)
axes[1, 0].set_title('Missing Values')
axes[1, 0].set_ylabel('Total Missing Values')
for i, v in enumerate([missing_before, missing_after]):
    # Place the count 1% above the bar top.
    axes[1, 0].text(i, v + v*0.01, str(v), ha='center', va='bottom')

# Feature type distribution
feature_types = {
    'Numeric': len(preprocessor.numeric_features),
    'Categorical': len(preprocessor.categorical_features),
    'Temporal': len(preprocessor.temporal_features)
}

# A pie chart cannot be normalised when every wedge is zero, so only draw it
# when at least one feature group is non-empty.
if sum(feature_types.values()) > 0:
    axes[1, 1].pie(feature_types.values(), labels=feature_types.keys(),
                   autopct='%1.1f%%', startangle=90)
else:
    axes[1, 1].text(0.5, 0.5, 'No feature groups reported',
                    ha='center', va='center', transform=axes[1, 1].transAxes)
axes[1, 1].set_title('Feature Type Distribution')

plt.tight_layout()
plt.show()

print(f"\nPreprocessing Summary:")
print(f"Missing values eliminated: {missing_before} → {missing_after}")
print(f"Features scaled and normalized: {len(preprocessor.numeric_features)}")
print(f"Categorical features encoded: {len(preprocessor.categorical_features)}")

## 4. Stability-Weighted Ensemble Training

Now we'll train our novel stability-weighted ensemble that adapts to changing conditions by reweighting models based on their calibration quality.

In [None]:
# Metrics used at the end of this cell to score the first validation pass.
from sklearn.metrics import roc_auc_score, brier_score_loss

# Construct the ensemble from the model section of the config.
print("Initializing Stability-Weighted Ensemble...")
model_config = config.model
ensemble = StabilityWeightedEnsemble(
    stability_alpha=model_config.stability_alpha,
    min_weight=model_config.min_weight,
    recalibration_threshold=model_config.recalibration_threshold,
    calibration_window=model_config.calibration_window,
    random_state=config.random_state,
)

# Echo the settings as stored on the ensemble object itself.
print(f"Ensemble configuration:")
for setting_label, attribute in (
    ("Stability alpha", "stability_alpha"),
    ("Minimum weight", "min_weight"),
    ("Recalibration threshold", "recalibration_threshold"),
    ("Calibration window", "calibration_window"),
):
    print(f"  {setting_label}: {getattr(ensemble, attribute)}")

# Fit on the processed training partition.
print("\nTraining ensemble...")
ensemble.fit(X_train_processed, y_train)

n_base_models = len(ensemble.models_)
print(f"Number of base models: {n_base_models}")
print(f"Initial model weights: {ensemble.weights_}")

# Score the validation partition; column 1 of predict_proba is used as the
# default probability.
print("\nMaking predictions...")
val_proba_matrix = ensemble.predict_proba(X_val_processed)
y_val_proba = val_proba_matrix[:, 1]
y_val_pred = ensemble.predict(X_val_processed)

# Baseline performance, kept for comparison after the weight updates.
initial_auc = roc_auc_score(y_val, y_val_proba)
initial_brier = brier_score_loss(y_val, y_val_proba)

print(f"Initial validation performance:")
print(f"  AUC-ROC: {initial_auc:.4f}")
print(f"  Brier Score: {initial_brier:.4f}")

In [None]:
# Run several weight-update cycles and record how weights and metrics evolve.
print("Demonstrating adaptive weight updates...")

# weight_history[0] is the pre-update snapshot; one more entry is appended per cycle.
weight_history = [ensemble.weights_.copy()]
performance_history = []

n_updates = 10
for cycle in range(1, n_updates + 1):
    print(f"\nUpdate {cycle}:")

    # Let the ensemble re-derive its weights from the validation partition.
    stats = ensemble.update_weights(X_val_processed, y_val, update_monitors=True)
    weight_history.append(ensemble.weights_.copy())

    # Re-score the same validation partition with the refreshed weights.
    y_val_proba_updated = ensemble.predict_proba(X_val_processed)[:, 1]
    current_auc = roc_auc_score(y_val, y_val_proba_updated)
    current_brier = brier_score_loss(y_val, y_val_proba_updated)

    cycle_record = {
        'auc': current_auc,
        'brier': current_brier,
        'weight_entropy': stats['weight_entropy'],
        'needs_recalibration': stats['needs_recalibration'],
    }
    performance_history.append(cycle_record)

    print(f"  Current weights: {ensemble.weights_.round(3)}")
    print(f"  AUC: {current_auc:.4f}, Brier: {current_brier:.4f}")
    print(f"  Weight entropy: {cycle_record['weight_entropy']:.4f}")
    print(f"  Needs recalibration: {cycle_record['needs_recalibration']}")
    print(f"  Dominant model: {stats['dominant_model']}")

In [None]:
# Visualize the adaptation process
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Stability-Weighted Ensemble Adaptation', fontsize=16)

# Per-cycle series, built unconditionally so that the summary at the end of the
# cell never hits an undefined name when no update cycle was recorded.
auc_scores = [p['auc'] for p in performance_history]
brier_scores = [p['brier'] for p in performance_history]
entropy_scores = [p['weight_entropy'] for p in performance_history]
update_cycles = range(1, len(performance_history) + 1)

# Weight evolution over time (index 0 is the pre-update snapshot).
weight_array = np.array(weight_history)
updates = range(len(weight_history))

for i in range(weight_array.shape[1]):
    axes[0, 0].plot(updates, weight_array[:, i], marker='o', label=f'Model {i+1}')

axes[0, 0].set_title('Model Weight Evolution')
axes[0, 0].set_xlabel('Update Cycle')
axes[0, 0].set_ylabel('Weight')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

if performance_history:
    # Performance evolution: AUC on the left axis, Brier score on a twin axis,
    # because the two metrics live on different scales.
    ax_auc = axes[0, 1]
    ax_brier = ax_auc.twinx()

    line1 = ax_auc.plot(update_cycles, auc_scores, 'b-o', label='AUC-ROC')
    line2 = ax_brier.plot(update_cycles, brier_scores, 'r-s', label='Brier Score')

    ax_auc.set_xlabel('Update Cycle')
    ax_auc.set_ylabel('AUC-ROC', color='blue')
    ax_brier.set_ylabel('Brier Score', color='red')
    ax_auc.set_title('Performance During Adaptation')

    # Combine the handles of both axes into a single legend.
    lines = line1 + line2
    legend_labels = [l.get_label() for l in lines]
    ax_auc.legend(lines, legend_labels, loc='upper left')
    ax_auc.grid(True, alpha=0.3)

    # Weight entropy over time
    axes[1, 0].plot(update_cycles, entropy_scores, 'g-o', linewidth=2, markersize=6)
    axes[1, 0].set_title('Weight Entropy Evolution')
    axes[1, 0].set_xlabel('Update Cycle')
    axes[1, 0].set_ylabel('Weight Entropy')
    axes[1, 0].grid(True, alpha=0.3)

# Final weight distribution
final_weights = ensemble.weights_
model_names = [f'Model {i+1}' for i in range(len(final_weights))]
axes[1, 1].pie(final_weights, labels=model_names, autopct='%1.1f%%', startangle=90)
axes[1, 1].set_title('Final Model Weight Distribution')

plt.tight_layout()
plt.show()

print(f"\nAdaptation Summary:")
if performance_history:
    print(f"Initial AUC: {initial_auc:.4f} → Final AUC: {auc_scores[-1]:.4f}")
    print(f"Initial Brier: {initial_brier:.4f} → Final Brier: {brier_scores[-1]:.4f}")
else:
    print("No weight-update cycles were recorded.")

# Only report a change of dominant model when the arg-max actually moved
# between the first and the last weight snapshot.
initial_dominant = np.argmax(weight_history[0]) + 1
final_dominant = np.argmax(final_weights) + 1
if final_dominant != initial_dominant:
    print(f"Dominant model changed to: Model {final_dominant}")
else:
    print(f"Dominant model unchanged: Model {final_dominant}")

## 5. Advanced Drift Detection

We'll now demonstrate the drift detection capabilities by comparing two datasets: the Home Credit test period serves as the reference, and the Lending Club test period serves as the current data.

In [None]:
# Bring in a second dataset to serve as the "current" side of the drift comparison.
print("Loading Lending Club data for drift analysis...")
X_lc, y_lc = data_loader.load_lending_club_data(sample_size=2000)

# Split it chronologically; only the test partition is used below.
lc_splits = data_loader.create_temporal_splits(
    X_lc,
    y_lc,
    time_column='ISSUE_MONTH',
    train_months=8,
    val_months=3,
    test_months=7,
)
X_lc_test, y_lc_test = lc_splits['test']

lc_test_rows = X_lc_test.shape[0]
lc_test_default_rate = y_lc_test.mean()
print(f"Lending Club test set: {lc_test_rows} samples, default rate: {lc_test_default_rate:.3f}")

# Evaluator that produces the drift report and metrics in later cells.
evaluator = ModelEvaluator()

# Score both test sets with the trained ensemble (column 1 of predict_proba).
print("\nGenerating predictions for drift analysis...")
y_home_test_proba = ensemble.predict_proba(X_test_processed)[:, 1]

# The Lending Club frame goes through the preprocessor fitted on Home Credit.
# NOTE(review): presumably the two datasets share a compatible schema — confirm.
X_lc_test_processed = preprocessor.transform(X_lc_test)
y_lc_test_proba = ensemble.predict_proba(X_lc_test_processed)[:, 1]

for dataset_name, dataset_proba in (
    ("Home Credit", y_home_test_proba),
    ("Lending Club", y_lc_test_proba),
):
    print(f"{dataset_name} test predictions: {len(dataset_proba)} samples")

In [None]:
# Build and print the drift report.
print("Performing comprehensive drift analysis...")

# Reference = Home Credit test set, current = Lending Club test set.
# Arguments are passed as (features, features, predictions, predictions, labels, labels).
drift_report = evaluator.create_drift_report(
    X_test,
    X_lc_test,
    y_home_test_proba,
    y_lc_test_proba,
    y_test.values,
    y_lc_test.values,
)

print("\nCross-Dataset Drift Analysis Results:")
overall_score = drift_report['overall_drift_score']
print(f"Overall drift score: {overall_score:.4f}")

# Feature-level drift: one count and ratio per statistical test.
feature_summary = drift_report['feature_drift_summary']
print(f"\nFeature Drift Summary:")
print(f"Total features analyzed: {feature_summary['total_features']}")
for test_code in ('ks', 'js'):
    drifted_count = feature_summary[test_code + '_drift_count']
    drifted_ratio = feature_summary[test_code + '_drift_ratio']
    print(f"Features with {test_code.upper()} drift: {drifted_count} ({drifted_ratio:.2%})")

# Drift in the distribution of predicted probabilities.
prediction_drift = drift_report['prediction_drift']
print(f"\nPrediction Drift:")
print(f"KS statistic: {prediction_drift['prediction_ks_statistic']:.4f}")
print(f"P-value: {prediction_drift['prediction_ks_p_value']:.4f}")
print(f"Prediction drift detected: {prediction_drift['prediction_drift']}")
print(f"Mean prediction shift: {prediction_drift['mean_prediction_shift']:.4f}")

# Optional section: only present when the report includes a performance comparison.
if 'performance_comparison' in drift_report:
    perf_comparison = drift_report['performance_comparison']
    print(f"\nPerformance Comparison:")
    for row_label, report_key in (
        ("Reference AUC", 'auc_roc_reference'),
        ("Current AUC", 'auc_roc_current'),
        ("AUC degradation", 'auc_roc_degradation'),
    ):
        # Missing entries are shown as 0.
        print(f"{row_label}: {perf_comparison.get(report_key, 0):.4f}")

print(f"\nRecommendations:")
for recommendation in drift_report['recommendations']:
    print(f"  - {recommendation}")

In [None]:
# Visualize drift detection results
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Drift Detection Analysis', fontsize=16)

# Prediction distribution comparison (density=True because the two test sets
# differ in size).
axes[0, 0].hist(y_home_test_proba, bins=30, alpha=0.6, label='Home Credit (Reference)', density=True)
axes[0, 0].hist(y_lc_test_proba, bins=30, alpha=0.6, label='Lending Club (Current)', density=True)
axes[0, 0].set_title('Prediction Distribution Comparison')
axes[0, 0].set_xlabel('Predicted Default Probability')
axes[0, 0].set_ylabel('Density')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Feature drift bar chart (top drifted features).
# Rank by KS statistic first: slicing the dict directly would show whichever
# 15 features happen to come first, not the 15 that drifted the most.
TOP_N_DRIFT_FEATURES = 15
feature_drift_ks = drift_report['feature_drift_ks']
most_drifted = sorted(
    feature_drift_ks.items(),
    key=lambda item: item[1]['statistic'],
    reverse=True,
)[:TOP_N_DRIFT_FEATURES]

drift_data = []
for feature, stats in most_drifted:
    drift_data.append({
        # Truncate long names so the y-axis labels stay readable.
        'Feature': feature[:20] + '...' if len(feature) > 20 else feature,
        'KS_Statistic': stats['statistic'],
        'Is_Drift': stats['is_drift']
    })

drift_df = pd.DataFrame(drift_data)
if not drift_df.empty:
    # Red = flagged as drifted by the report, green = not flagged.
    colors = ['red' if x else 'green' for x in drift_df['Is_Drift']]

    bars = axes[0, 1].barh(drift_df['Feature'], drift_df['KS_Statistic'], color=colors, alpha=0.7)
    # barh draws the first row at the bottom; invert so the largest statistic
    # is at the top of the panel.
    axes[0, 1].invert_yaxis()
    axes[0, 1].set_title('Feature Drift (KS Test)')
    axes[0, 1].set_xlabel('KS Statistic')
    # NOTE(review): 0.1 is a visual guide only; the actual 'is_drift' decision
    # comes from the report — confirm it matches the detector's threshold.
    axes[0, 1].axvline(x=0.1, color='black', linestyle='--', alpha=0.5, label='Drift Threshold')
    axes[0, 1].legend()

# Default rate comparison
default_rates = {
    'Home Credit\n(Reference)': y_test.mean(),
    'Lending Club\n(Current)': y_lc_test.mean()
}

bars = axes[1, 0].bar(default_rates.keys(), default_rates.values(),
                     color=['blue', 'orange'], alpha=0.7)
axes[1, 0].set_title('Default Rate Comparison')
axes[1, 0].set_ylabel('Default Rate')
for bar, rate in zip(bars, default_rates.values()):
    height = bar.get_height()
    axes[1, 0].text(bar.get_x() + bar.get_width()/2., height + 0.005,
                   f'{rate:.3f}', ha='center', va='bottom')

# Drift score summary
drift_scores = {
    'Feature Drift\n(KS)': feature_summary['ks_drift_ratio'],
    'Feature Drift\n(JS)': feature_summary['js_drift_ratio'],
    'Overall\nDrift Score': drift_report['overall_drift_score']
}

bars = axes[1, 1].bar(drift_scores.keys(), drift_scores.values(),
                     color=['lightcoral', 'lightblue', 'gold'], alpha=0.8)
axes[1, 1].set_title('Drift Score Summary')
axes[1, 1].set_ylabel('Drift Score')
axes[1, 1].axhline(y=0.3, color='red', linestyle='--', alpha=0.7, label='High Drift Threshold')
axes[1, 1].legend()
for bar, score in zip(bars, drift_scores.values()):
    height = bar.get_height()
    axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + 0.01,
                   f'{score:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 6. Comprehensive Model Evaluation

Let's evaluate our model's performance across multiple dimensions including calibration, discrimination, and business metrics.

In [None]:
# Comprehensive evaluation
def _print_scalar_metrics(title, metrics):
    """Print ``title`` followed by every numeric scalar entry of ``metrics``.

    Args:
        title: Heading line printed before the metric rows.
        metrics: Mapping of metric name to value. Non-numeric entries are
            skipped.

    NumPy scalar types are accepted explicitly: ``np.float32`` and NumPy
    integers are not subclasses of the built-in ``float``/``int`` and would
    otherwise be dropped from the listing without notice.
    """
    print(title)
    for name, value in metrics.items():
        if isinstance(value, (int, float, np.integer, np.floating)):
            print(f"  {name}: {value:.4f}")


print("Performing comprehensive evaluation...")

# Evaluate on Home Credit test set
home_metrics = evaluator.calculate_metrics(y_test.values, y_home_test_proba)

# Evaluate on Lending Club test set
lc_metrics = evaluator.calculate_metrics(y_lc_test.values, y_lc_test_proba)

_print_scalar_metrics("Home Credit Test Performance:", home_metrics)
_print_scalar_metrics("\nLending Club Test Performance:", lc_metrics)

# Target metrics comparison
target_metrics = {
    'auc_roc': 0.82,
    'brier_score': 0.12,
    'calibration_error': 0.05
}
# Metrics where larger is better; every other target is an upper bound.
HIGHER_IS_BETTER = {'auc_roc'}

print("\nTarget Metrics Comparison:")
for metric, target in target_metrics.items():
    if metric not in home_metrics:
        # Report the gap instead of skipping the target silently.
        print(f"  {metric}: not reported by the evaluator")
        continue
    actual = home_metrics[metric]
    if metric in HIGHER_IS_BETTER:
        target_met = actual >= target
    else:
        target_met = actual <= target
    status = "✓" if target_met else "✗"
    print(f"  {metric}: {actual:.4f} vs {target:.4f} {status}")

In [None]:
# Six-panel evaluation figure comparing the two test sets.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Comprehensive Model Evaluation', fontsize=16)

# (display name, true labels, predicted probabilities, metric dict, calibration line style)
evaluated_sets = (
    ('Home Credit', y_test, y_home_test_proba, home_metrics, 'o-'),
    ('Lending Club', y_lc_test, y_lc_test_proba, lc_metrics, 's-'),
)

# --- Panel (0, 0): ROC curves, with the chance diagonal for reference ---
roc_ax = axes[0, 0]
for set_name, y_true, y_score, set_metrics, _style in evaluated_sets:
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_ax.plot(fpr, tpr, label=f'{set_name} (AUC={set_metrics["auc_roc"]:.3f})', linewidth=2)
roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)

# --- Panel (0, 1): precision-recall curves ---
pr_ax = axes[0, 1]
for set_name, y_true, y_score, set_metrics, _style in evaluated_sets:
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    pr_ax.plot(recall, precision, label=f'{set_name} (AP={set_metrics["auc_pr"]:.3f})', linewidth=2)
# Baseline is the Home Credit test default rate.
pr_ax.axhline(y=y_test.mean(), color='k', linestyle='--', alpha=0.5, label='Baseline')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curves')
pr_ax.legend()
pr_ax.grid(True, alpha=0.3)

# --- Panel (0, 2): calibration (reliability) curves over 10 bins ---
cal_ax = axes[0, 2]
for set_name, y_true, y_score, _metrics, line_style in evaluated_sets:
    # calibration_curve returns (fraction of positives, mean predicted value).
    fraction_pos, mean_pred = calibration_curve(y_true, y_score, n_bins=10)
    cal_ax.plot(mean_pred, fraction_pos, line_style, label=set_name, linewidth=2, markersize=6)
cal_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Perfect Calibration')
cal_ax.set_xlabel('Mean Predicted Probability')
cal_ax.set_ylabel('Fraction of Positives')
cal_ax.set_title('Calibration Plots')
cal_ax.legend()
cal_ax.grid(True, alpha=0.3)

# --- Panel (1, 0): grouped bars of headline metrics ---
metrics_ax = axes[1, 0]
metrics_to_compare = ['auc_roc', 'precision', 'recall', 'f1_score']
home_values = [home_metrics[m] for m in metrics_to_compare]
lc_values = [lc_metrics[m] for m in metrics_to_compare]

x = np.arange(len(metrics_to_compare))
width = 0.35  # bar width; the two groups sit half a width either side of each tick

bars1 = metrics_ax.bar(x - width/2, home_values, width, label='Home Credit', alpha=0.8)
bars2 = metrics_ax.bar(x + width/2, lc_values, width, label='Lending Club', alpha=0.8)

metrics_ax.set_xlabel('Metrics')
metrics_ax.set_ylabel('Score')
metrics_ax.set_title('Performance Metrics Comparison')
metrics_ax.set_xticks(x)
metrics_ax.set_xticklabels([m.replace('_', ' ').title() for m in metrics_to_compare])
metrics_ax.legend()
metrics_ax.grid(True, alpha=0.3)

# Write each bar's value just above it.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    metrics_ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                    f'{height:.3f}', ha='center', va='bottom', fontsize=8)

# --- Panel (1, 1): the 15 largest feature importances ---
importance_ax = axes[1, 1]
feature_importance = ensemble.get_feature_importance()
top_features = sorted(
    feature_importance.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)[:15]
# Truncate long names so the axis labels stay readable.
feature_names = [f[:20] + '...' if len(f) > 20 else f for f, _ in top_features]
importance_values = [imp for _, imp in top_features]

importance_ax.barh(feature_names, importance_values, alpha=0.8)
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Top 15 Feature Importances')
importance_ax.grid(True, alpha=0.3)

# --- Panel (1, 2): business metrics, drawn only when the evaluator reports any ---
business_keys = ['optimal_threshold', 'max_profit', 'approval_rate_optimal']
home_business = {k: v for k, v in home_metrics.items() if k in business_keys}
lc_business = {k: v for k, v in lc_metrics.items() if k in business_keys}

if home_business:
    business_ax = axes[1, 2]
    business_metrics = list(home_business.keys())
    home_bus_values = [home_business[m] for m in business_metrics]
    # A metric missing on the Lending Club side is drawn as 0.
    lc_bus_values = [lc_business.get(m, 0) for m in business_metrics]

    x = np.arange(len(business_metrics))
    bars1 = business_ax.bar(x - width/2, home_bus_values, width, label='Home Credit', alpha=0.8)
    bars2 = business_ax.bar(x + width/2, lc_bus_values, width, label='Lending Club', alpha=0.8)

    business_ax.set_xlabel('Business Metrics')
    business_ax.set_ylabel('Value')
    business_ax.set_title('Business Metrics Comparison')
    business_ax.set_xticks(x)
    business_ax.set_xticklabels([m.replace('_', ' ').title() for m in business_metrics])
    business_ax.legend()
    business_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Model Interpretability and Insights

Finally, let's explore what our stability-weighted ensemble has learned and how it adapts to different conditions.

In [None]:
# Text summary of what the ensemble relies on and how it currently behaves.
print("Analyzing model interpretability...")

# Ten largest feature importances reported by the ensemble.
feature_importance = ensemble.get_feature_importance()
print(f"\nTop 10 Most Important Features:")
top_features = sorted(
    feature_importance.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)[:10]
for rank, (feature, importance) in enumerate(top_features, start=1):
    print(f"{rank:2d}. {feature}: {importance:.4f}")

# Current weight of each base model.
print(f"\nCurrent Model Weights:")
for model_idx, weight in enumerate(ensemble.weights_):
    print(f"  Model {model_idx + 1}: {weight:.3f}")

# Shannon entropy of the weight vector; 1e-8 keeps log() finite for zero weights.
# exp(entropy) is reported as the "effective number of models".
current_weights = ensemble.weights_
entropy_of_weights = -np.sum(current_weights * np.log(current_weights + 1e-8))
dominant_model_number = np.argmax(current_weights) + 1

print(f"\nModel Configuration:")
print(f"  Dominant model: Model {dominant_model_number}")
print(f"  Weight entropy: {entropy_of_weights:.4f}")
print(f"  Effective number of models: {np.exp(entropy_of_weights):.2f}")

# Per-model calibration: latest recorded value and trend, for monitors with history.
print(f"\nCalibration Analysis:")
for model_idx, monitor in enumerate(ensemble.monitors_):
    if len(monitor.calibration_history) == 0:
        continue
    recent_brier = monitor.calibration_history[-1]
    trend = monitor.get_calibration_trend()
    print(f"  Model {model_idx + 1} - Recent Brier: {recent_brier:.4f}, Trend: {trend:.4f}")

# Compare realised default rates on either side of a 0.5 score cut-off.
print(f"\nPrediction Analysis:")

high_risk_mask = y_home_test_proba > 0.5
low_risk_mask = y_home_test_proba <= 0.5
n_high_risk = high_risk_mask.sum()
n_low_risk = low_risk_mask.sum()

# An empty segment reports a rate of 0 rather than NaN.
if n_high_risk > 0:
    high_risk_actual = y_test[high_risk_mask].mean()
else:
    high_risk_actual = 0
if n_low_risk > 0:
    low_risk_actual = y_test[low_risk_mask].mean()
else:
    low_risk_actual = 0

print(f"  High risk predictions (>0.5): {n_high_risk} samples, actual default rate: {high_risk_actual:.3f}")
print(f"  Low risk predictions (≤0.5): {n_low_risk} samples, actual default rate: {low_risk_actual:.3f}")

# Spread of the predicted probabilities on the Home Credit test set.
proba_low, proba_high = y_home_test_proba.min(), y_home_test_proba.max()
print(f"  Mean prediction: {y_home_test_proba.mean():.3f}")
print(f"  Prediction std: {y_home_test_proba.std():.3f}")
print(f"  Prediction range: [{proba_low:.3f}, {proba_high:.3f}]")

In [None]:
# Compare model behaviour across the three chronological partitions.
print("Analyzing temporal stability...")

# Each entry maps a period name to (true labels, predicted default probability).
# The test-period predictions are reused from the drift-analysis cell.
train_period_proba = ensemble.predict_proba(X_train_processed)[:, 1]
val_period_proba = ensemble.predict_proba(X_val_processed)[:, 1]
predictions_by_time = {
    'train_period': (y_train.values, train_period_proba),
    'val_period': (y_val.values, val_period_proba),
    'test_period': (y_test.values, y_home_test_proba),
}

time_periods = list(predictions_by_time)
stability_results = evaluator.evaluate_temporal_stability(predictions_by_time, time_periods)

print(f"\nTemporal Stability Results:")
print(f"Overall stability score: {stability_results['overall_stability_score']:.4f}")

# Dispersion statistics per metric, when the evaluator provides them.
if 'stability_stats' in stability_results:
    stability_stats = stability_results['stability_stats']
    for metric in ('auc_roc', 'brier_score', 'calibration_error'):
        if metric not in stability_stats:
            continue
        stats = stability_stats[metric]
        print(f"  {metric}:")
        print(f"    Mean: {stats['mean']:.4f} ± {stats['std']:.4f}")
        print(f"    Range: [{stats['min']:.4f}, {stats['max']:.4f}]")
        print(f"    CV: {stats['cv']:.4f}")

# Raw metric values per period; a missing metric is printed as 0.
if 'metrics_by_time' in stability_results:
    metrics_by_time = stability_results['metrics_by_time']
    print(f"\nPerformance by Time Period:")
    for period, metrics in metrics_by_time.items():
        print(f"  {period}:")
        for row_label, metric_key in (
            ("AUC-ROC", 'auc_roc'),
            ("Brier Score", 'brier_score'),
            ("Calibration Error", 'calibration_error'),
        ):
            print(f"    {row_label}: {metrics.get(metric_key, 0):.4f}")

In [None]:
# Closing report: condenses the results of the earlier cells into six sections.
banner = "=" * 60
print(banner)
print("FINAL INSIGHTS AND RECOMMENDATIONS")
print(banner)

# 1. Headline metrics on the Home Credit test set versus their targets.
print("\n1. PERFORMANCE SUMMARY:")
home_auc = home_metrics['auc_roc']
auc_note = '(Target: ≥0.82)' if home_auc >= 0.82 else '(Below target 0.82)'
print(f"   • Home Credit Test AUC: {home_auc:.4f} {auc_note}")

home_brier = home_metrics['brier_score']
brier_note = '(Target: ≤0.12)' if home_brier <= 0.12 else '(Above target 0.12)'
print(f"   • Brier Score: {home_brier:.4f} {brier_note}")

home_calibration_error = home_metrics['calibration_error']
calibration_note = '(Target: ≤0.05)' if home_calibration_error <= 0.05 else '(Above target 0.05)'
print(f"   • Calibration Error: {home_calibration_error:.4f} {calibration_note}")

# Generalization is judged by the AUC gap between the two test sets.
auc_gap = abs(home_metrics['auc_roc'] - lc_metrics['auc_roc'])
generalization_label = 'Good' if auc_gap < 0.1 else 'Needs attention'
print(f"   • Cross-dataset generalization: {generalization_label}")

# 2. Ensemble weight diversity and stability across time periods.
print("\n2. STABILITY INSIGHTS:")
weight_entropy = -np.sum(ensemble.weights_ * np.log(ensemble.weights_ + 1e-8))
if weight_entropy > 1.0:
    diversity_level = "High"
elif weight_entropy > 0.5:
    diversity_level = "Medium"
else:
    diversity_level = "Low"
print(f"   • Model diversity: {diversity_level} (entropy: {weight_entropy:.3f})")
print(f"   • Dominant model: Model {np.argmax(ensemble.weights_) + 1} (weight: {ensemble.weights_.max():.3f})")

stability_score = stability_results['overall_stability_score']
if stability_score < 0.1:
    stability_label = 'Good'
elif stability_score < 0.2:
    stability_label = 'Moderate'
else:
    stability_label = 'Needs attention'
print(f"   • Temporal stability: {stability_label}")

# 3. Cross-dataset drift, taken from the drift report built earlier.
print("\n3. DRIFT ANALYSIS:")
overall_drift = drift_report['overall_drift_score']
if overall_drift > 0.3:
    drift_level = "High"
elif overall_drift > 0.1:
    drift_level = "Medium"
else:
    drift_level = "Low"
print(f"   • Overall drift score: {overall_drift:.3f} ({drift_level})")
print(f"   • Feature drift: {feature_summary['ks_drift_ratio']:.1%} of features show significant drift")
prediction_drift_label = 'Yes' if prediction_drift['prediction_drift'] else 'No'
print(f"   • Prediction drift: {prediction_drift_label}")

# 4. Business metrics, printed only when the evaluator reported a profit figure.
print("\n4. BUSINESS IMPACT:")
if 'max_profit' in home_metrics:
    print(f"   • Optimal threshold: {home_metrics.get('optimal_threshold', 0.5):.3f}")
    print(f"   • Expected profit improvement: {home_metrics.get('max_profit', 0):.2f} units")
    print(f"   • Recommended approval rate: {home_metrics.get('approval_rate_optimal', 0):.1%}")

# 5. Rule-based recommendations: each rule pairs a trigger with its advice.
print("\n5. RECOMMENDATIONS:")

recommendation_rules = [
    (home_metrics['auc_roc'] < 0.82,
     "Consider feature engineering or hyperparameter tuning to improve discrimination"),
    (home_metrics['calibration_error'] > 0.05,
     "Implement calibration techniques to improve probability estimates"),
    (overall_drift > 0.2,
     "Monitor for concept drift and consider model retraining"),
    (weight_entropy < 0.5,
     "Consider increasing model diversity to improve ensemble robustness"),
    (stability_results['overall_stability_score'] > 0.15,
     "Implement more robust temporal validation strategies"),
]
recommendations = [advice for triggered, advice in recommendation_rules if triggered]

# When no rule fires, fall back to the generic next steps.
if not recommendations:
    recommendations.extend([
        "Model performance is meeting targets - continue monitoring",
        "Consider A/B testing for production deployment",
        "Implement automated drift monitoring in production",
    ])

for number, rec in enumerate(recommendations, start=1):
    print(f"   {number}. {rec}")

# 6. Fixed closing text; these lines are printed regardless of the results above.
print("\n6. INNOVATION HIGHLIGHTS:")
for highlight in (
    "Novel stability-weighted ensemble successfully adapts to changing conditions",
    "Graceful degradation prevents catastrophic model failure",
    "Comprehensive drift detection provides early warning system",
    "Business-aware metrics optimize for real-world impact",
):
    print(f"   • {highlight}")

print(f"\n{banner}")
print("EXPLORATION COMPLETE")
print(banner)

closing_lines = (
    "\nThis notebook demonstrated the key capabilities of the temporal credit",
    "degradation detection system. The stability-weighted ensemble provides",
    "a robust, adaptive approach to credit risk modeling that handles",
    "changing economic conditions gracefully while maintaining high performance.",
)
for closing_line in closing_lines:
    print(closing_line)