In [19]:
import sys, os; sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__) if '__file__' in globals() else os.getcwd(), '..')))
#import os; os.chdir(os.path.dirname(os.getcwd()))
from utils.model_loader import get_model_fits
import numpy as np
import pandas as pd
import re
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt



In [None]:
# Directory names for the correlated-data experiment.
data_dir = "datasets/correlated"
results_dir_tanh = "results/correlated"

# Names of the tanh models to load; the same strings are used later as keys of `tanh_fit`.
model_names_tanh = [
    "Gaussian tanh",
    "Regularized Horseshoe tanh",
    "Dirichlet Horseshoe tanh",
    "Dirichlet Student T tanh",
]

# Configuration identifier handed to the loader.
full_config_path = "correlated_N400_p6"

# Load the stored fits for every listed model (prior fits are not requested).
tanh_fit = get_model_fits(
    models=model_names_tanh,
    results_dir=results_dir_tanh,
    config=full_config_path,
    include_prior=False,
)


In [None]:
from sklearn.metrics import mean_squared_error
from properscoring import crps_ensemble
import numpy as np
import pandas as pd

# IMPORTANT: this y_test must correspond to the same test set used to make `output_test` in Stan,
# otherwise scores won’t be comparable.
from utils.generate_data import generate_correlated_data
# Regenerate the data split; y_test is compared against Stan's `output_test` draws below.
# NOTE(review): assumes generate_correlated_data is deterministic for fixed (n, p) — confirm.
X_train, X_test, y_train, y_test = generate_correlated_data(n=500, p=6)

# One summary row per model: RMSE of the posterior-mean prediction, mean of the
# per-draw RMSEs, ensemble CRPS, and the number of posterior draws.
rows = []
for model_name, model_entry in tanh_fit.items():
    post = model_entry["posterior"]

    # Expected (S, n_test) after dropping the trailing axis; squeeze(-1) requires that
    # the last axis of `output_test` has length 1.
    y_samps = post.stan_variable("output_test").squeeze(-1)

    # Posterior-mean predictions and RMSE
    y_mean = y_samps.mean(axis=0)                                   # (n_test,)
    rmse_post_mean = float(np.sqrt(mean_squared_error(y_test, y_mean)))

    # Per-draw RMSEs and their mean
    # y_test[None, :] adds a leading draw axis; this presumes y_test is 1-D — TODO confirm.
    per_draw_rmse = np.sqrt(((y_samps - y_test[None, :])**2).mean(axis=1))  # (S,)
    rmse_draw_mean = float(per_draw_rmse.mean())

    # CRPS across the ensemble (expects shape (n_test, S))
    crps = float(np.mean(crps_ensemble(y_test, y_samps.T)))

    rows.append({
        "Model": model_name,
        "RMSE_posterior_mean": rmse_post_mean,
        "RMSE_mean_over_draws": rmse_draw_mean,
        "CRPS": crps,
        "n_draws": y_samps.shape[0]
    })

# Best model (lowest posterior-mean RMSE) first.
results_df = pd.DataFrame(rows).sort_values("RMSE_posterior_mean")
print(results_df)


In [22]:

def compute_sparse_rmse_results_hastie(models, all_fits, forward_pass,
                         sparsity=0.0, prune_fn=None, X_test=None, y_test=None):
    """
    Evaluate the test RMSE of every posterior draw, optionally after pruning W_1.

    Parameters
    ----------
    models : iterable of str
        Keys of `all_fits` to evaluate. A missing key (or a missing
        'posterior' entry) is reported by name and skipped.
    all_fits : dict
        Maps model name -> dict whose 'posterior' entry exposes
        `stan_variable(name)` for "W_1", "W_L", "hidden_bias", "output_bias".
    forward_pass : callable
        Called as forward_pass(X_test, W1, b1, W2, b2); its squeezed result is
        used as the vector of test predictions.
    sparsity : float
        Forwarded to `prune_fn`; pruning only happens when it is > 0.
    prune_fn : callable or None
        Called as prune_fn([W1, W2], sparsity) and expected to return a
        sequence of masks. Only the first mask (for W1) is applied; the output
        layer W2 is deliberately left dense.
    X_test, y_test : array-like or None
        Test inputs and targets. Whichever is None is taken from
        generate_correlated_data(n=500, p=6), which reproduces the previous
        hard-coded behaviour.

    Returns
    -------
    (df_rmse, df_posterior_rmse) : tuple of pandas.DataFrame
        df_rmse has one row per (model, draw) with columns
        'model', 'sparsity', 'rmse'. df_posterior_rmse has one row per model
        with columns 'model', 'sparsity', 'posterior_mean_rmse'.
    """
    if X_test is None or y_test is None:
        _, default_X, _, default_y = generate_correlated_data(n=500, p=6)
        X_test = default_X if X_test is None else X_test
        y_test = default_y if y_test is None else y_test

    # Flatten the targets once so a column vector (N, 1) cannot silently
    # broadcast against the (N,) predictions into an (N, N) error matrix.
    y_true = np.asarray(y_test).reshape(-1)

    results = []
    posterior_means = []
    for model in models:
        try:
            fit = all_fits[model]['posterior']
            W1_samples = fit.stan_variable("W_1")           # expected (S, P, H)
            W2_samples = fit.stan_variable("W_L")           # expected (S, H, O)
            b1_samples = fit.stan_variable("hidden_bias")   # expected (S, O, H)
            b2_samples = fit.stan_variable("output_bias")   # expected (S, O)
        except KeyError:
            print(f"[SKIP] Model or posterior not found: {model}")
            continue

        S = W1_samples.shape[0]
        rmses = np.zeros(S)
        y_hats = np.zeros((S, y_true.shape[0]))

        for i in range(S):
            W1 = W1_samples[i]
            W2 = W2_samples[i]

            # Apply the pruning mask to the input layer only, if requested.
            if prune_fn is not None and sparsity > 0.0:
                masks = prune_fn([W1, W2], sparsity)
                W1 = W1 * masks[0]

            y_hat = forward_pass(X_test, W1, b1_samples[i][0], W2, b2_samples[i])
            y_pred = y_hat.squeeze()
            y_hats[i] = y_pred  # keep every draw's prediction for the posterior mean
            rmses[i] = np.sqrt(np.mean((y_pred - y_true)**2))

        # RMSE of the draw-averaged prediction.
        posterior_mean = np.mean(y_hats, axis=0)
        posterior_mean_rmse = np.sqrt(np.mean((posterior_mean - y_true)**2))

        posterior_means.append({
            'model': model,
            'sparsity': sparsity,
            'posterior_mean_rmse': posterior_mean_rmse
        })

        results.extend(
            {'model': model, 'sparsity': sparsity, 'rmse': rmse}
            for rmse in rmses
        )

    df_rmse = pd.DataFrame(results)
    df_posterior_rmse = pd.DataFrame(posterior_means)

    return df_rmse, df_posterior_rmse


In [23]:
from utils.sparsity import forward_pass_relu, forward_pass_tanh, local_prune_weights

# Sparsity levels forwarded to the pruning function, one evaluation run per level.
sparsity_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]

# Both dictionaries are keyed by sparsity level:
#   df_rmse_tanh[s]           -> per-draw RMSE frame
#   df_posterior_rmse_tanh[s] -> posterior-mean RMSE frame
df_rmse_tanh = {}
df_posterior_rmse_tanh = {}

for sparsity in sparsity_levels:
    sweep_output = compute_sparse_rmse_results_hastie(
        model_names_tanh,
        tanh_fit,
        forward_pass_tanh,
        sparsity=sparsity,
        prune_fn=local_prune_weights,
    )
    df_rmse_tanh[sparsity], df_posterior_rmse_tanh[sparsity] = sweep_output


In [None]:
# python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Stack the per-sparsity RMSE frames into one long table (one row per model/draw/sparsity).
frames = []
for sp, df in df_rmse_tanh.items():
    df = df.copy()
    # Make sure the sparsity column matches the dict key (overwritten if it differs)
    df['sparsity'] = float(sp)
    frames.append(df)

df_all = pd.concat(frames, axis=0, ignore_index=True)

# Tidy-up: enforce dtypes and drop rows with missing values
df_all['sparsity'] = df_all['sparsity'].astype(float)
df_all['rmse'] = pd.to_numeric(df_all['rmse'], errors='coerce')
df_all = df_all.dropna(subset=['rmse', 'sparsity', 'model'])

# Optional: turn the model names into an ordered categorical for a consistent plotting order
models_order = sorted(df_all['model'].unique())
df_all['model'] = pd.Categorical(df_all['model'], categories=models_order, ordered=True)

# ---- Summary statistics: mean, std, n, 95% CI ----
summary = (
    df_all.groupby(['model', 'sparsity'])
    .agg(n=('rmse', 'size'),
         mean_rmse=('rmse', 'mean'),
         std_rmse=('rmse', 'std'))
    .reset_index()
)

# Avoid division by zero
summary['sem'] = summary['std_rmse'] / summary['n'].replace(0, np.nan).pow(0.5)
# 95% CI via the normal approximation: mean ± 1.96 * SEM
# NOTE(review): std/sqrt(n) treats the per-draw RMSEs as independent; if the draws are
# autocorrelated MCMC samples the interval is too narrow — confirm this is acceptable.
summary['ci95'] = 1.96 * summary['sem']
summary['ymin'] = summary['mean_rmse'] - summary['ci95']
summary['ymax'] = summary['mean_rmse'] + summary['ci95']

# ---- Plot style ----
sns.set_context('talk')
sns.set_style('whitegrid')

# ---- Figure 1: line plot of mean RMSE vs sparsity, coloured per model, with CI ----
plt.figure(figsize=(10, 6))
# Lines
sns.lineplot(
    data=summary.sort_values(['model', 'sparsity']),
    x='sparsity', y='mean_rmse', hue='model',
    linewidth=2.5, marker='o', markersize=7
)
# Error bars (CI)
# The colour is looked up by the model's position in models_order; this presumes seaborn
# assigned the default palette to the hue levels in that same order — TODO confirm.
for _, row in summary.iterrows():
    plt.plot([row['sparsity'], row['sparsity']], [row['ymin'], row['ymax']],
             color=sns.color_palette()[models_order.index(row['model'])], lw=2)

plt.title('RMSE vs sparsity per modell (mean ± 95% CI)')
plt.xlabel('Sparsity')
plt.ylabel('RMSE')
plt.legend(title='Modell', loc='best')
plt.tight_layout()

# ---- Figure 2: box plot of RMSE per sparsity level, split (hue) by model ----
plt.figure(figsize=(12, 6))
sns.boxplot(
    data=df_all,
    x='sparsity', y='rmse', hue='model',
    showfliers=False,  # hide outliers for a cleaner overall picture
    linewidth=1.2
)
sns.stripplot(
    data=df_all.sample(min(len(df_all), 2000), random_state=42),  # show a subsample of points, not all of them
    x='sparsity', y='rmse', hue='model',
    dodge=True, size=2, alpha=0.25, palette='dark'
)
# Remove the duplicated legend entries added by the stripplot
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[:len(models_order)], labels[:len(models_order)], title='Modell', loc='best')

plt.title('RMSE-fordeling per sparsity og modell (box + punkter)')
plt.xlabel('Sparsity')
plt.ylabel('RMSE')
plt.tight_layout()

plt.show()


## POSTERIOR ANALYSIS

In [25]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from utils.generate_data import generate_correlated_data
# Only the training inputs are kept from the regenerated split.
X_train, _, _, _ = generate_correlated_data(p=6, n=500)

# Network dimensions used for the weight shapes below.
P, H = 6, 16
L = 1  # not referenced in the cells visible here
out_nodes = 1

# Stan variable name and matrix shape for each of the two weight layers.
layer_structure = dict(
    input_to_hidden=dict(name='W_1', shape=(P, H)),
    hidden_to_output=dict(name='W_L', shape=(H, out_nodes)),
)


def build_single_draw_weights(fits, layer_structure, draw_idx):
    """
    Collect the input->hidden and hidden->output weights of ONE draw for every model.

    fits: dict {model_name: {'posterior': fit}}, where fit exposes stan_variable(name).
    layer_structure: dict with 'input_to_hidden' and 'hidden_to_output' entries, each
        holding the Stan variable 'name'; the 'shape' of 'hidden_to_output' is used to
        reshape the output-layer weights.
    draw_idx: index of the draw to extract along the first axis.

    Returns {model_name: {'W_1': ..., 'W_L': ...}}.
    """
    def _weights_for(entry):
        posterior = entry["posterior"]
        first_spec = layer_structure['input_to_hidden']
        w_in = posterior.stan_variable(first_spec['name'])[draw_idx]
        last_spec = layer_structure['hidden_to_output']
        w_out = posterior.stan_variable(last_spec['name'])[draw_idx]
        # Only the output layer is reshaped; W_1 is passed through as stored.
        return {"W_1": w_in, "W_L": w_out.reshape(last_spec['shape'])}

    return {model: _weights_for(entry) for model, entry in fits.items()}

def scale_W1_for_plot(model_means, mode='global'):
    """
    Scale every model's W_1 into [-1, 1] so edge thicknesses can be compared fairly.

    model_means: dict {model_name: {'W_1': array, ...}}. Entries other than
        'W_1' are passed through untouched (same objects); inputs are never mutated.

    mode:
      - 'global'   : one shared scale across all models (most comparable)
      - 'per_model': a separate scale per model (independent comparison)
      - 'per_node' : each column (hidden node) is scaled separately to [-1, 1]

    A scale smaller than 1e-12 is replaced by 1e-12, so all-zero weights stay zero
    instead of dividing by zero.

    Returns: (scaled_model_means, scale_info) where scaled_model_means has the same
    structure as the input and scale_info is {'mode': mode} (plus 'scale' for 'global').

    Raises: ValueError for an unknown mode.
    """
    floor = 1e-12
    if mode not in ('global', 'per_model', 'per_node'):
        raise ValueError("mode must be 'global', 'per_model', or 'per_node'")

    def _replace_W1(m, W1_scaled):
        # Shallow copy of one model's dict with only W_1 swapped out.
        out = dict(m)
        out['W_1'] = W1_scaled
        return out

    if mode == 'global':
        # default=0.0 makes an empty model dict return an empty result instead of raising.
        gmax = max((np.abs(m['W_1']).max() for m in model_means.values()), default=0.0)
        gmax = max(gmax, floor)
        scaled = {name: _replace_W1(m, m['W_1'] / gmax)
                  for name, m in model_means.items()}
        return scaled, {'mode': 'global', 'scale': gmax}

    if mode == 'per_model':
        scaled = {}
        for name, m in model_means.items():
            s = max(np.abs(m['W_1']).max(), floor)
            scaled[name] = _replace_W1(m, m['W_1'] / s)
        return scaled, {'mode': 'per_model'}

    # mode == 'per_node'
    scaled = {}
    for name, m in model_means.items():
        W1 = np.asarray(m['W_1'])
        # Dividing into a new array (instead of writing back into a copy of the input)
        # keeps the result floating point even when W_1 has an integer dtype.
        colmax = np.maximum(np.abs(W1).max(axis=0, keepdims=True), floor)
        scaled[name] = _replace_W1(m, W1 / colmax)
    return scaled, {'mode': 'per_node'}
#feature_names = list(X_train.columns)
def plot_models_with_activations(model_means, layer_sizes,
                                 activations=None, activation_color_max=None,
                                 ncols=3, figsize_per_plot=(5,4), signed_colors=False, feature_names=None):
    """
    Draw one network graph per model, with edge width given by |weight|.

    model_means: dict {model_name: {'W_1': (P,H), 'W_L': (H,O), optional 'W_internal': [...]}}
    layer_sizes: e.g. [P, H, O], or [P, H, H, O] when internal layers are present
    activations: dict {model_name: (H,)} – values used to colour the nodes of the
        first hidden layer only; a model missing from the dict is treated as all zeros
    activation_color_max: global maximum used to scale the node colours
        (1.0 is used when None); scaled values are clipped to [0, 1]
    ncols: number of subplot columns
    figsize_per_plot: (width, height) of each subplot
    signed_colors: if True, edges with weight >= 0 are red and negative ones blue;
        otherwise every edge is red
    feature_names: optional labels written to the left of the input nodes

    Returns the matplotlib Figure.
    """
    names = list(model_means.keys())
    n_models = len(names)
    # At least one row so that an empty model dict yields an empty figure, not an error.
    nrows = max(1, int(np.ceil(n_models / ncols)))
    figsize = (figsize_per_plot[0] * ncols, figsize_per_plot[1] * nrows)

    # squeeze=False always returns a 2-D array of axes, whatever the grid size.
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    # Switch off the unused axes
    for ax in axes[n_models:]:
        ax.axis('off')

    scale = activation_color_max if activation_color_max is not None else 1.0

    for ax, name in zip(axes, names):
        weights = model_means[name]
        G = nx.DiGraph()
        pos, nodes_per_layer, node_colors = {}, [], []

        # Nodes with position and colour
        for li, size in enumerate(layer_sizes):
            ids = []
            ycoords = np.linspace(size - 1, 0, size) - (size - 1) / 2

            # Activation vector for this layer (first hidden layer only), looked up once.
            layer_act = None
            if activations is not None and li == 1:
                layer_act = np.asarray(activations.get(name, np.zeros(size))).ravel()

            for i in range(size):
                nid = f"L{li}_{i}"
                G.add_node(nid)
                pos[nid] = (li, ycoords[i])
                ids.append(nid)
                if li == 0 and feature_names is not None:
                    ax.text(pos[nid][0]-0.12, pos[nid][1], feature_names[i],
                            ha='right', va='center', fontsize=8)

                if layer_act is not None:
                    val = float(np.clip(layer_act[i] / max(scale, 1e-12), 0.0, 1.0))
                    color = plt.cm.winter(val)
                else:
                    color = 'lightgray'
                node_colors.append(color)

            nodes_per_layer.append(ids)

        def add_edges(W, inn, ut):
            # Width and sign colour are stored on the edge itself, so they can be read
            # back in whatever order the graph iterates its edges when drawing.
            for j, out_n in enumerate(ut):
                for i, in_n in enumerate(inn):
                    w = float(W[i, j])
                    G.add_edge(in_n, out_n, weight=abs(w),
                               color='red' if w >= 0 else 'blue')

        # input -> hidden(1)
        add_edges(weights['W_1'], nodes_per_layer[0], nodes_per_layer[1])

        # internal layers, if any
        if 'W_internal' in weights:
            for l, Win in enumerate(weights['W_internal']):
                add_edges(Win, nodes_per_layer[l+1], nodes_per_layer[l+2])

        # last hidden -> output
        add_edges(weights['W_L'], nodes_per_layer[-2], nodes_per_layer[-1])

        # Colours and widths are both listed in G.edges() order. A list built in
        # insertion order would not line up with that order and would mis-colour edges.
        edge_list = list(G.edges())
        nx.draw(G, pos, ax=ax,
                node_color=node_colors,
                edge_color=([G[u][v]['color'] for u, v in edge_list]
                            if signed_colors else 'red'),
                width=[G[u][v]['weight'] for u, v in edge_list],
                with_labels=False, node_size=400, arrows=False)

        ax.set_title(name, fontsize=10)
        ax.axis('off')

    plt.tight_layout()
    return fig

def compute_hidden_activation(fit_dict, x_train, draw_idx):
    """
    Mean tanh activation of each first-layer hidden unit over x_train, for one draw.

    fit_dict: dict with a 'posterior' entry exposing stan_variable(name).
    x_train: inputs, one row per observation.
    draw_idx: index of the draw to use.

    Returns the activations averaged over the rows of x_train (one value per hidden unit).
    """
    posterior = fit_dict['posterior']
    weights_in = posterior.stan_variable('W_1')[draw_idx, :, :]      # (P, H)
    try:
        bias = posterior.stan_variable('hidden_bias')[draw_idx, :]
    except Exception:
        # Best-effort fallback: if the bias cannot be read, use a zero bias per hidden unit.
        bias = np.zeros(weights_in.shape[1])
    # tanh output lies in [-1, 1]; one row of activations per observation.
    pre_activation = x_train @ weights_in + bias
    return np.mean(np.tanh(pre_activation), axis=0)


In [None]:
# Pick an observation to "light up" the node colours with
# (obs_idx is not referenced anywhere else in the visible cells)
obs_idx = 3
draw_idx = 69  # one common, hard-coded draw index used for every model
# NOTE: despite the name, these weights come from `tanh_fit`, the same fits used for the
# posterior metrics in this notebook.
prior_draws = build_single_draw_weights(tanh_fit, layer_structure, draw_idx)

# 1) Compute activations for ALL models
activations = {}
for name, fd in tanh_fit.items():
    a = compute_hidden_activation(fd, X_train, draw_idx)
    activations[name] = np.abs(a)

# 2) Scale the weights for plotting (as before)
scaled, _ = scale_W1_for_plot(prior_draws, mode='per_model')

# 3) Call the plot with activations
# Since tanh ∈ [-1,1] and we use |a|, set activation_color_max=1.0
fig = plot_models_with_activations(
    scaled,
    layer_sizes=[P, H, out_nodes],
    activations=activations,
    activation_color_max=1.0,
    ncols=2,
    feature_names = None  # input-node labels are not drawn
)
plt.show()

## TEST SHAPLEY VALUES

In [None]:
from utils.generate_data import generate_correlated_data
X_train, X_test, y_train, y_test = generate_correlated_data(n=500, p=6)
print(X_train.shape, X_test.shape)

import pandas as pd

# Column labels X1..Xp, derived from the width of X_train and used for both frames.
shap_feature_columns = [f"X{col}" for col in range(1, X_train.shape[1] + 1)]

# DataFrame views of the train / test inputs with named columns.
X_train_pd = pd.DataFrame(X_train, columns=shap_feature_columns)
X_test_pd = pd.DataFrame(X_test, columns=list(shap_feature_columns))


In [None]:
import torch
import numpy as np
import shap
import pandas as pd
import matplotlib.pyplot as plt
from utils.robust_utils import build_pytorch_model_from_stan_sample

# Models to explain; each name is used as a key into `tanh_fit`.
models_to_eval = [
    "Gaussian tanh",
    "Regularized Horseshoe tanh",
    "Dirichlet Horseshoe tanh",
    "Dirichlet Student T tanh"
]

H = 16                       # hidden dim
# `P` is not defined in this cell; it must already exist in the kernel from an earlier cell.
feature_names = [f"X{i+1}" for i in range(P)]

# SHAP background & evaluation subsets (reuse across models for fairness)
# The fixed random_state values make the same rows be drawn on every run.
X_bg   = X_train_pd.sample(n=400, random_state=0).to_numpy(float)
X_eval = X_test_pd.sample(n=100, random_state=1).to_numpy(float)

results = {}   # store mean SHAP vectors per model

for model_name in models_to_eval:

    print(f"\n=== Evaluating SHAP for model: {model_name} ===")

    fit = tanh_fit[model_name]['posterior']

    # Build one representative sample from posterior
    # (sample_idx=69 is hard-coded here)
    model = build_pytorch_model_from_stan_sample(
        fit, sample_idx=69, input_dim=P, hidden_dim=H,
        output_dim=1, task="regression", activation=torch.tanh
    )
    model.eval()

    # Wrapper giving SHAP a numpy-in / numpy-out prediction function for the current
    # `model`; gradients are disabled because only forward passes are needed.
    def predict_numpy(X_np):
        with torch.no_grad():
            X_t = torch.tensor(X_np, dtype=torch.float32)
            y = model(X_t).cpu().numpy()
        return y

    explainer = shap.KernelExplainer(predict_numpy, X_bg)
    # NOTE(review): (n_eval, P) presumes predict_numpy yields one value per row; if the
    # model output keeps a trailing axis the SHAP result may carry an extra dimension
    # and the ranking below would need adapting — TODO confirm.
    shap_vals = explainer.shap_values(X_eval)  # shape = (n_eval, P)

    mean_abs_shap = np.abs(shap_vals).mean(axis=0)  # global importance

    results[model_name] = mean_abs_shap

    # Print features from largest to smallest mean |SHAP| (at most ten).
    print(f"Top features for {model_name}:")
    order = np.argsort(mean_abs_shap)[::-1]
    for j in order[:10]:
        print(f"  {feature_names[j]:16s}  SHAP={mean_abs_shap[j]:.4f}")


## TESTING DIFFERENCE IN RMSE METRICS

In [32]:
import numpy as np

def _as_SN(arr):
    arr = np.asarray(arr)
    if arr.ndim == 3 and arr.shape[-1] == 1:
        arr = arr[..., 0]
    return arr

def mse_rmse_summary(y_true, y_samps):
    """
    Compare two ways of scoring draws against the truth.

    y_true: (N,) or (N,1)
    y_samps: (S,N) or (S,N,1)

    Returns a dict with:
      - rmse_mean_pred: RMSE of the draw-averaged prediction (Option 1)
      - rmse_per_sample: array of per-draw RMSEs, length S (Option 2)
      - rmse_per_sample_mean: mean of the per-draw RMSEs
      - avg_sample_mse: mean of the per-draw MSEs
      - mse_mean: MSE of the draw-averaged prediction
      - epistemic_var_est: across-draw variance (ddof=0), averaged over test points
      - mse_gap: avg_sample_mse - mse_mean
    """
    targets = np.asarray(y_true).reshape(-1)
    draws = _as_SN(y_samps)  # (S,N)

    # Option 1: score the prediction averaged over draws.
    mean_prediction = draws.mean(axis=0)                      # (N,)
    mse_mean = ((targets - mean_prediction) ** 2).mean()
    rmse_mean = np.sqrt(mse_mean)

    # Option 2: score every draw separately.
    mse_per_sample = ((targets[None, :] - draws) ** 2).mean(axis=1)   # (S,)
    rmse_per_sample = np.sqrt(mse_per_sample)                         # (S,)

    # Decomposition: avg per-draw MSE = MSE(mean prediction) + mean across-draw variance,
    # so the gap below should match the variance estimate.
    avg_sample_mse = mse_per_sample.mean()
    report = {}
    report["rmse_mean_pred"] = rmse_mean
    report["rmse_per_sample"] = rmse_per_sample
    report["rmse_per_sample_mean"] = rmse_per_sample.mean()
    report["avg_sample_mse"] = avg_sample_mse
    report["mse_mean"] = mse_mean
    report["epistemic_var_est"] = draws.var(axis=0, ddof=0).mean()
    report["mse_gap"] = avg_sample_mse - mse_mean
    return report


In [33]:
# Draws of the Stan variable "output_test" for each of the four models, extracted once
# and reused by the diagnostic cells below.
output_test_gauss = tanh_fit['Gaussian tanh']['posterior'].stan_variable("output_test")
output_test_RHS = tanh_fit['Regularized Horseshoe tanh']['posterior'].stan_variable("output_test")
output_test_DHS = tanh_fit['Dirichlet Horseshoe tanh']['posterior'].stan_variable("output_test")
output_test_DST = tanh_fit['Dirichlet Student T tanh']['posterior'].stan_variable("output_test")

In [None]:
# RMSE summaries per model, all scored against the same y_test.
sum_gauss = mse_rmse_summary(y_test, output_test_gauss)
sum_rhs   = mse_rmse_summary(y_test, output_test_RHS)
sum_dhs   = mse_rmse_summary(y_test, output_test_DHS)
sum_dst   = mse_rmse_summary(y_test, output_test_DST)

# One report line per model: RMSE of the mean prediction, mean per-draw RMSE, and the MSE gap.
for _prefix, _stats in (
    ("Gaussian — RMSE(mean):", sum_gauss),
    ("RHS      — RMSE(mean):", sum_rhs),
    ("DHS      — RMSE(mean):", sum_dhs),
    ("DST      — RMSE(mean):", sum_dst),
):
    print(_prefix, _stats["rmse_mean_pred"],
          " mean per-sample RMSE:", _stats["rmse_per_sample_mean"],
          " gap (avg MSE - mean MSE):", _stats["mse_gap"])


In [None]:
def coverage_table(y_true, y_samps, levels=(0.5, 0.8, 0.9, 0.95)):
    """
    Empirical coverage of central intervals built from the draws.

    For each level q the interval runs from the (1-q)/2 quantile to the
    1-(1-q)/2 quantile of the draws at every test point; the reported value is
    the fraction of test points whose true value lies inside (bounds included).

    Returns a dict {level: coverage}.
    """
    targets = np.asarray(y_true).reshape(-1)
    draws = _as_SN(y_samps)  # (S,N)
    n_draws, n_points = draws.shape  # unpacking fails unless `draws` is 2-D

    def _hit_rate(level):
        tail = (1 - level) / 2
        lower = np.quantile(draws, tail, axis=0)
        upper = np.quantile(draws, 1 - tail, axis=0)
        inside = (targets >= lower) & (targets <= upper)
        return inside.mean()

    return {level: _hit_rate(level) for level in levels}

# Example: central-interval coverage on y_test for each model, at the default levels.
# Each result is a dict {level: fraction of test points inside the interval}.
cov_gauss = coverage_table(y_test, output_test_gauss)
cov_rhs   = coverage_table(y_test, output_test_RHS)
cov_dhs   = coverage_table(y_test, output_test_DHS)
cov_dst   = coverage_table(y_test, output_test_DST)
print("Coverage — Gaussian:", cov_gauss)
print("Coverage — RHS     :", cov_rhs)
print("Coverage — DHS     :", cov_dhs)
print("Coverage — DST     :", cov_dst)


In [None]:
def pit_uniformity(y_true, y_samps, n_bins=10):
    """
    Rank-based PIT: u_i = rank(y_i among draws)/S.

    y_true: (N,) or (N,1)
    y_samps: (S,N) or (S,N,1)
    n_bins: number of equal-width histogram bins on [0, 1]

    Returns a dict with:
      - u: PIT value per test point, shape (N,)
      - hist: fraction of PIT values per bin
      - edges: histogram bin edges
      - mean, var: mean and (ddof=0) variance of u
      - ks_stat: Kolmogorov–Smirnov distance between the empirical CDF of u and
        the Uniform(0, 1) CDF (lower is better)
    """
    y = np.asarray(y_true).reshape(-1)
    Y = _as_SN(y_samps)  # (S,N)
    S, N = Y.shape

    # For each i, PIT u_i = (#{s: Y_s,i <= y_i}) / S
    u = np.mean(Y <= y[None, :], axis=0)   # (N,)

    # Histogram bins (for a quick glance)
    hist, edges = np.histogram(u, bins=n_bins, range=(0.0, 1.0), density=False)
    hist = hist / hist.sum()  # empirical frequencies per bin

    # The empirical CDF jumps from (i-1)/N to i/N at the i-th sorted value, so both
    # sides of each jump must be compared with the uniform CDF F(u) = u. Checking only
    # i/N can understate the distance by up to 1/N.
    u_sorted = np.sort(u)
    ranks = np.arange(1, N + 1)
    d_plus = np.max(ranks / N - u_sorted)
    d_minus = np.max(u_sorted - (ranks - 1) / N)
    ks_stat = max(d_plus, d_minus)  # lower is better

    return dict(
        u=u,
        hist=hist,
        edges=edges,
        mean=np.mean(u),
        var=np.var(u, ddof=0),
        ks_stat=ks_stat
    )

# PIT diagnostics on y_test for each model.
pit_gauss = pit_uniformity(y_test, output_test_gauss)
pit_rhs   = pit_uniformity(y_test, output_test_RHS)
pit_dhs   = pit_uniformity(y_test, output_test_DHS)
pit_dst   = pit_uniformity(y_test, output_test_DST)
print("PIT KS — Gaussian:", pit_gauss["ks_stat"], " mean:", pit_gauss["mean"], " var:", pit_gauss["var"])
print("PIT KS — RHS     :", pit_rhs["ks_stat"],   " mean:", pit_rhs["mean"],   " var:", pit_rhs["var"])
print("PIT KS — DHS     :", pit_dhs["ks_stat"],   " mean:", pit_dhs["mean"],   " var:", pit_dhs["var"])
# This line reports the Dirichlet Student T result; it used to be labelled "DHS".
print("PIT KS — DST     :", pit_dst["ks_stat"],   " mean:", pit_dst["mean"],   " var:", pit_dst["var"])
# For perfect calibration: mean ~ 0.5, var ~ 1/12 ≈ 0.0833, small ks_stat.


In [None]:
def predictive_entropy_gaussian(y_samps, eps=1e-12):
    """
    Gaussian plug-in entropy per test point, computed from the draws.

    The across-draw variance (ddof=0) at each test point, plus `eps` to keep the
    logarithm finite, is plugged into 0.5 * log(2*pi*e*variance).

    Returns (per-point entropies in nats, their mean).
    """
    draws = _as_SN(y_samps)  # (S,N)
    variance = draws.var(axis=0, ddof=0) + eps
    entropy = 0.5 * np.log(2*np.pi*np.e*variance)  # nats
    return entropy, entropy.mean()

# Gaussian plug-in entropy of the `output_test` draws for each model: the per-point
# values are kept in H_*, and the averages over test points are printed.
H_gauss, H_gauss_mean = predictive_entropy_gaussian(output_test_gauss)
H_rhs,   H_rhs_mean   = predictive_entropy_gaussian(output_test_RHS)
H_dhs,   H_dhs_mean   = predictive_entropy_gaussian(output_test_DHS)
H_dst,   H_dst_mean   = predictive_entropy_gaussian(output_test_DST)
print("Mean predictive entropy (nats) — Gaussian:", H_gauss_mean)
print("Mean predictive entropy (nats) — RHS     :", H_rhs_mean)
print("Mean predictive entropy (nats) — DHS     :", H_dhs_mean)
print("Mean predictive entropy (nats) — DST     :", H_dst_mean)
