# Validate Bayesian Model

This notebook validates the Bayesian regression model for uncertainty quantification.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Extend the import search path with the sibling src/ directory so project
# modules imported by later cells can be resolved (path is relative to the
# notebook's working directory).
sys.path.append('../src')

# Plot defaults shared by every figure in this notebook:
# seaborn's white grid theme and a 12x8 inch default figure.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Check if NumPyro is Available

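First, let's check whether NumPyro is available. If it is not, the cells below that depend on it (training, posterior inspection, prediction, and save/load) are skipped and print a message instead.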

In [None]:
# Probe for the optional NumPyro/JAX stack.
# On success `numpyro`, `jnp` and `random` stay bound for later cells and
# NUMPYRO_AVAILABLE is True; on any ImportError the flag is False so the
# NumPyro-dependent cells can skip themselves.
try:
    import numpyro
    import jax.numpy as jnp
    import jax.random as random
except ImportError:
    NUMPYRO_AVAILABLE = False
    print("NumPyro is not available. Please install it with 'pip install numpyro jax jaxlib'.")
else:
    NUMPYRO_AVAILABLE = True
    print("NumPyro is available.")

## 2. Load Data

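Let's load the data for training the Bayesian model. The sources are tried in order (batch features, then processed data, then raw ticks); if none can be loaded, a small synthetic dataset is generated so the rest of the notebook can still run.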

In [None]:
# Import data preparation module
from uncertainty.data_prep import load_df, prepare_features_targets, split_data, normalize_data

# Candidate sources in priority order, each paired with the message printed
# when that source loads successfully.
data_sources = (
    ("../data/features/batch/technical/*.parquet", "Loaded data from batch features."),
    ("../data/processed/training_data.parquet", "Loaded data from processed data."),
    ("../data/raw/ticks/*/*.parquet", "Loaded data from raw data."),
)

# Take the first source that loads. Only ValueError is treated as
# "source unavailable" (presumably what load_df raises when nothing
# matches -- confirm); any other exception propagates.
for source_pattern, loaded_message in data_sources:
    try:
        df = load_df(source_pattern)
        print(loaded_message)
        break
    except ValueError:
        continue
else:
    # No source could be loaded: build a reproducible synthetic regression set.
    print("No data found. Creating dummy data.")
    np.random.seed(42)
    n_samples = 1000
    n_features = 5
    X = np.random.randn(n_samples, n_features)
    # Target is a linear mix of the first three features plus Gaussian noise;
    # the remaining features carry no signal.
    y = X[:, 0] * 0.5 + X[:, 1] * 0.3 + X[:, 2] * 0.2 + np.random.randn(n_samples) * 0.1

    # Assemble the frame: feature columns first, then target and metadata.
    feature_names = [f"feature_{i}" for i in range(n_features)]
    df = pd.DataFrame(X, columns=feature_names).assign(
        label=y,
        symbol="DUMMY",
        timestamp=pd.date_range(start="2023-01-01", periods=n_samples),
    )

In [None]:
# Preview the first five rows of the loaded frame (rendered as the cell output)
df.head(n=5)

In [None]:
# Build the feature matrix and target vector from the loaded frame
X, y = prepare_features_targets(df)

# Shuffled split with a fixed seed: 70% train, 15% validation.
# NOTE(review): shuffle=True on timestamped rows mixes past and future
# observations across splits -- confirm this is acceptable for this validation.
splits = split_data(X, y, train_size=0.7, val_size=0.15, shuffle=True, random_state=42)

# Normalize the three feature splits together; the result is indexed below by
# the same split names and also carries a "params" entry used when training.
feature_split_names = ("X_train", "X_val", "X_test")
normalized = normalize_data(*(splits[split_name] for split_name in feature_split_names))

# Report the shape of each normalized feature split
for split_name in feature_split_names:
    print(f"{split_name} shape: {normalized[split_name].shape}")

## 3. Train Bayesian Model

Now, let's train the Bayesian regression model.

In [None]:
# Import Bayesian model
from uncertainty.bayesian_model import BayesianRegression

if not NUMPYRO_AVAILABLE:
    print("Skipping model training because NumPyro is not available.")
else:
    # Feature names are every column of df except metadata and the target.
    # NOTE(review): assumes this matches the columns (and their order) that
    # prepare_features_targets used to build X -- confirm.
    non_feature_columns = ("timestamp", "symbol", "date", "label")
    feature_cols = [col for col in df.columns if col not in non_feature_columns]

    # Single-chain sampler configuration with a fixed seed
    model = BayesianRegression(
        num_warmup=200,
        num_samples=500,
        num_chains=1,
        random_seed=42,
    )

    # Attach the normalization parameters to the model before fitting
    model.normalization_params = normalized["params"]

    # Fit on the normalized training features and the training targets
    result = model.fit(
        normalized["X_train"],
        splits["y_train"],
        feature_names=feature_cols,
    )

    print("Model fitted successfully.")

## 4. Examine Posterior Distribution

Let's examine the posterior distribution of the model parameters.

In [None]:
if not NUMPYRO_AVAILABLE:
    print("Skipping posterior examination because NumPyro is not available.")
else:
    # Summary statistics of the fitted model, rendered as rich output
    summary = model.get_summary()
    display(summary)

    # Trace plot produced by the model; keep the returned figure handle
    fig = model.plot_trace()
    plt.show()

## 5. Make Predictions with Uncertainty

Now, let's make predictions with uncertainty estimates.

In [None]:
if NUMPYRO_AVAILABLE:
    # Make predictions with credible intervals
    predictions = model.predict_interval(normalized["X_test"], interval=0.9)

    # Work on plain NumPy arrays so that every index below is positional.
    # split_data's return type is not visible here; if y_test is a pandas
    # Series from a shuffled split, y_test[i] would look up the *label* i
    # rather than the i-th element (KeyError or a mismatched point).
    y_test = np.asarray(splits["y_test"])
    pred_mean = np.asarray(predictions["mean"])
    pred_lower = np.asarray(predictions["lower"])
    pred_upper = np.asarray(predictions["upper"])

    fig, ax = plt.subplots(figsize=(12, 8))

    # Actual vs predicted mean
    ax.scatter(y_test, pred_mean, alpha=0.5)

    # Credible intervals for the first (at most) 100 points, drawn in one
    # call as vertical segments at each actual value.
    n_shown = min(100, len(pred_mean))
    ax.vlines(
        y_test[:n_shown],
        pred_lower[:n_shown],
        pred_upper[:n_shown],
        colors="red",
        alpha=0.3,
    )

    # Diagonal y = x reference line spanning both actual and predicted ranges
    min_val = min(np.min(y_test), np.min(pred_mean))
    max_val = max(np.max(y_test), np.max(pred_mean))
    ax.plot([min_val, max_val], [min_val, max_val], "k--", alpha=0.5)

    ax.set_xlabel("Actual")
    ax.set_ylabel("Predicted")
    ax.set_title("Predictions with 90% Credible Intervals")
    ax.grid(True)
    plt.show()

    # Point-prediction error metrics on the test split
    errors = y_test - pred_mean
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(errors))

    print(f"MSE: {mse:.6f}")
    print(f"RMSE: {rmse:.6f}")
    print(f"MAE: {mae:.6f}")

    # Empirical coverage: fraction of actual values inside their interval
    # (ideally close to the nominal 90%).
    coverage = np.mean((y_test >= pred_lower) & (y_test <= pred_upper))
    print(f"90% Credible Interval Coverage: {coverage:.2%}")
else:
    print("Skipping predictions because NumPyro is not available.")

## 6. Sample from Posterior Predictive Distribution

Let's sample from the posterior predictive distribution to get a sense of the uncertainty in our predictions.

In [None]:
if NUMPYRO_AVAILABLE:
    # Sample from posterior predictive distribution
    n_samples = 100
    samples = model.predict(normalized["X_test"][:10], return_samples=True, n_samples=n_samples)

    # Plain NumPy copies so indexing is positional and bounds-checked.
    # (Integer [] on a pandas Series is label-based, and JAX arrays clamp
    # out-of-range indices instead of raising.)
    mu_samples = np.asarray(samples["mu"])
    y_test = np.asarray(splits["y_test"])

    # Axis 1 of "mu" indexes test points; plot at most 10 of them, and never
    # more than were actually returned (the test split may have fewer rows).
    n_points = min(10, mu_samples.shape[1])

    # Plot samples
    plt.figure(figsize=(12, 8))

    for i in range(n_points):
        plt.subplot(2, 5, i + 1)

        point_samples = mu_samples[:, i]

        # Histogram of the sampled values for this test point
        plt.hist(point_samples, bins=20, alpha=0.7, density=True)

        # Actual target value
        plt.axvline(y_test[i], color="red", linestyle="--", label="Actual")

        # Mean of the samples
        plt.axvline(np.mean(point_samples), color="blue", linestyle="-", label="Mean")

        plt.title(f"Test Point {i}")

        # One legend is enough; all panels share the same line styles
        if i == 0:
            plt.legend()

    plt.tight_layout()
    plt.show()
else:
    print("Skipping posterior sampling because NumPyro is not available.")

## 7. Save and Load Model

Let's save the model and then load it back to make sure it works.

In [None]:
if not NUMPYRO_AVAILABLE:
    print("Skipping model saving and loading because NumPyro is not available.")
else:
    model_path = "../models/bayesian_regression.npz"

    # Make sure the target directory exists before saving
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # Persist the fitted model
    model.save(model_path)
    print(f"Model saved to {model_path}")

    # Restore it through the class-level loader
    loaded_model = BayesianRegression.load(model_path)
    print("Model loaded successfully.")

    # Predict the first 10 test rows with the restored model
    loaded_predictions = loaded_model.predict_interval(normalized["X_test"][:10], interval=0.9)

    # Side-by-side table: original model's predictions (from the earlier
    # prediction cell) next to the restored model's, for the same rows.
    print("\nComparing predictions from original and loaded models:")
    comparison_columns = {}
    for stat_key, original_label, loaded_label in (
        ("mean", "Original Mean", "Loaded Mean"),
        ("lower", "Original Lower", "Loaded Lower"),
        ("upper", "Original Upper", "Loaded Upper"),
    ):
        comparison_columns[original_label] = predictions[stat_key][:10]
        comparison_columns[loaded_label] = loaded_predictions[stat_key]
    comparison = pd.DataFrame(comparison_columns)
    display(comparison)

## 8. Summary

In this notebook, we have validated the Bayesian regression model for uncertainty quantification. We have:

1. Loaded and prepared the data
2. Trained a Bayesian regression model
3. Examined the posterior distribution of the model parameters
4. Made predictions with uncertainty estimates
5. Sampled from the posterior predictive distribution
6. Saved and loaded the model

The Bayesian approach provides not only point predictions but also uncertainty estimates, which are crucial for risk management in financial applications.