In [None]:
import sys, os; sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__) if '__file__' in globals() else os.getcwd(), '..')))
#import os; os.chdir(os.path.dirname(os.getcwd()))
from utils.model_loader import get_model_fits
import numpy as np
import pandas as pd
import re
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Dataset location and the two result trees (one per hidden-layer activation).
data_dir = "datasets/abalone"
results_dir_relu = "results/regression/single_layer/relu/abalone"
results_dir_tanh = "results/regression/single_layer/tanh/abalone"

# Display names of the fitted models; the tanh variants carry a " tanh" suffix.
model_names_relu = [
    "Gaussian",
    "Regularized Horseshoe",
    "Dirichlet Horseshoe",
    "Dirichlet Student T",
    "Beta Horseshoe",
    "Beta Student T",
]
model_names_tanh = [
    "Gaussian tanh",
    "Regularized Horseshoe tanh",
    "Dirichlet Horseshoe tanh",
    "Dirichlet Student T tanh",
    "Beta Horseshoe tanh",
    "Beta Student T tanh",
]

# Configuration identifier handed to get_model_fits for both activations.
full_config_path = "abalone_N1670_p8"

# Posterior fits only (include_prior=False), keyed by model name.
relu_fit = get_model_fits(
    models=model_names_relu,
    results_dir=results_dir_relu,
    config=full_config_path,
    include_prior=False,
)

tanh_fit = get_model_fits(
    models=model_names_tanh,
    results_dir=results_dir_tanh,
    config=full_config_path,
    include_prior=False,
)

In [None]:
from sklearn.metrics import mean_squared_error
from properscoring import crps_ensemble
import numpy as np
import pandas as pd
from utils.generate_data import load_abalone_regression_data

# IMPORTANT: y_test must be the same test split that produced `output_test` in Stan;
# otherwise the scores won't be comparable.
X_train, X_test, y_train, y_test = load_abalone_regression_data(frac=0.5, standardized=False)

rows = []
for model_name, model_entry in relu_fit.items():
    post = model_entry["posterior"]

    # Draws of the test-set output with the trailing singleton axis removed.
    y_samps = post.stan_variable("output_test").squeeze(-1)

    # RMSE of the prediction averaged over draws.
    y_mean = np.mean(y_samps, axis=0)                               # (n_test,)
    rmse_post_mean = float(np.sqrt(mean_squared_error(y_test, y_mean)))

    # RMSE of every individual draw, then averaged over the draws.
    squared_errors = np.square(y_samps - y_test[None, :])
    per_draw_rmse = np.sqrt(squared_errors.mean(axis=1))            # (S,)
    rmse_draw_mean = float(per_draw_rmse.mean())

    # Transposed so that the draws sit on the last axis for crps_ensemble.
    crps = float(np.mean(crps_ensemble(y_test, y_samps.T)))

    rows.append(dict(
        Model=model_name,
        RMSE_posterior_mean=rmse_post_mean,
        RMSE_mean_over_draws=rmse_draw_mean,
        CRPS=crps,
        n_draws=y_samps.shape[0],
    ))

# Best (lowest posterior-mean RMSE) model first.
results_df = pd.DataFrame(rows).sort_values(by="RMSE_posterior_mean")
print(results_df)


python3 utils/run_all_regression_models.py --model dirichlet_horseshoe_tanh --output_dir results/regression/single_layer/tanh/friedman/small_a --N 100 && python3 utils/run_all_regression_models.py --model dirichlet_student_t_tanh --output_dir results/regression/single_layer/tanh/friedman/small_a --N 100 && python3 utils/run_all_regression_models.py --model regularized_horseshoe_tanh --output_dir results/regression/single_layer/tanh/friedman/small_a --N 100 && python3 utils/run_all_regression_models.py --model beta_horseshoe_tanh --output_dir results/regression/single_layer/tanh/friedman/small_a --N 100 && python3 utils/run_all_regression_models.py --model beta_student_t_tanh --output_dir results/regression/single_layer/tanh/friedman/small_a --N 100

In [None]:
from sklearn.metrics import mean_squared_error
from properscoring import crps_ensemble
import numpy as np
import pandas as pd
from utils.generate_data import load_abalone_regression_data

# Same split settings as used for the ReLU table.
X_train, X_test, y_train, y_test = load_abalone_regression_data(frac=0.5, standardized=False)

rows = []
for model_name, model_entry in tanh_fit.items():
    post = model_entry["posterior"]

    # Draws of the test-set output with the trailing singleton axis removed.
    y_samps = post.stan_variable("output_test").squeeze(-1)

    # RMSE of the prediction averaged over draws.
    y_mean = np.mean(y_samps, axis=0)                               # (n_test,)
    rmse_post_mean = float(np.sqrt(mean_squared_error(y_test, y_mean)))

    # RMSE of every individual draw, then averaged over the draws.
    squared_errors = np.square(y_samps - y_test[None, :])
    per_draw_rmse = np.sqrt(squared_errors.mean(axis=1))            # (S,)
    rmse_draw_mean = float(per_draw_rmse.mean())

    # Transposed so that the draws sit on the last axis for crps_ensemble.
    crps = float(np.mean(crps_ensemble(y_test, y_samps.T)))

    rows.append(dict(
        Model=model_name,
        RMSE_posterior_mean=rmse_post_mean,
        RMSE_mean_over_draws=rmse_draw_mean,
        CRPS=crps,
        n_draws=y_samps.shape[0],
    ))

# NOTE: this rebinds `results_df`, replacing the table built from the ReLU fits.
results_df = pd.DataFrame(rows).sort_values(by="RMSE_posterior_mean")
print(results_df)


In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from utils.generate_data import load_abalone_regression_data
from utils.sparsity import forward_pass_relu, forward_pass_tanh, local_prune_weights

def evaluate_posterior_on_multiple_testsets(
    relu_fit,
    models,
    frac,
    forward_pass,
    n_testsets=5,
):
    """Score posterior-mean predictions on several resampled abalone test sets.

    For every test set (loaded with ``random_state = 42 + test_id``) each
    posterior draw of each model is pushed through ``forward_pass``; the
    predictions are averaged over draws and the RMSE of that averaged
    prediction against the test targets is recorded.

    Parameters
    ----------
    relu_fit : mapping
        ``{model_name: {"posterior": fit}}`` where ``fit.stan_variable`` can
        return "W_1", "W_L", "hidden_bias" and "output_bias". Despite the
        parameter name, nothing here is ReLU-specific: the activation is
        whatever ``forward_pass`` implements.
    models : iterable of str
        Keys of ``relu_fit`` to evaluate.
    frac : float
        Forwarded unchanged to ``load_abalone_regression_data``.
    forward_pass : callable
        Called as ``forward_pass(X, W1, b1, W2, b2)`` with flattened biases.
    n_testsets : int, default 5
        Number of test sets to draw.

    Returns
    -------
    df_mean : pandas.DataFrame
        One row per model with column "mean_rmse_over_testsets".
    df : pandas.DataFrame
        One row per (test set, model) with columns "model", "test_set" and
        "posterior_rmse".
    """
    # The draws do not depend on the test set, so extract them once per model
    # instead of once per (test set, model) pair.
    draws_by_model = {}
    for model in models:
        fit = relu_fit[model]["posterior"]
        draws_by_model[model] = (
            fit.stan_variable("W_1"),
            fit.stan_variable("W_L"),
            fit.stan_variable("hidden_bias"),
            fit.stan_variable("output_bias"),
        )

    rows = []

    for test_id in range(n_testsets):
        _, X_test, _, y_test = load_abalone_regression_data(
            standardized=False,
            frac=frac,
            random_state=42 + test_id
        )

        # np.asarray accepts both pandas objects and plain arrays.
        X_test_np = np.asarray(X_test)
        y_test_np = np.asarray(y_test).reshape(-1)

        for model, (W1_samples, W2_samples, b1_samples, b2_samples) in draws_by_model.items():
            S = W1_samples.shape[0]
            y_hats = np.zeros((S, y_test_np.shape[0]))

            for i in range(S):
                y_hat = forward_pass(
                    X_test_np,
                    W1_samples[i],
                    np.asarray(b1_samples[i]).reshape(-1),
                    W2_samples[i],
                    np.asarray(b2_samples[i]).reshape(-1),
                )
                y_hats[i] = np.asarray(y_hat).reshape(-1)

            # RMSE of the prediction averaged over draws (not the mean of per-draw RMSEs).
            y_mean = y_hats.mean(axis=0)
            posterior_rmse = np.sqrt(np.mean((y_test_np - y_mean) ** 2))

            rows.append({
                "model": model,
                "test_set": test_id,
                "posterior_rmse": posterior_rmse,
            })

    df = pd.DataFrame(rows)

    # Average the per-test-set RMSE for each model.
    df_mean = (
        df.groupby("model", as_index=False)["posterior_rmse"]
          .mean()
          .rename(columns={"posterior_rmse": "mean_rmse_over_testsets"})
    )

    return df_mean, df


In [None]:
# Evaluate every tanh fit; iterating the dict yields its keys (the model names).
models = [name for name in tanh_fit]

df_results, df = evaluate_posterior_on_multiple_testsets(
    tanh_fit,            # bound to the parameter named `relu_fit`
    models,
    0.5,                 # frac: passed through to the data loader
    forward_pass_tanh,
    n_testsets=5,
)

print(df_results)

In [None]:
# `results_df` is whichever score table was built last (ReLU or tanh cell).
# The column spec must have one entry per column: "l" for the model name and "c"
# for each remaining column. A shorter spec such as "lcc" yields a tabular that
# LaTeX rejects once the rows carry more cells than declared.
latex_table = results_df.to_latex(
    index=False,
    float_format="%.4f",
    column_format="l" + "c" * (results_df.shape[1] - 1),
    caption="RMSE and CRPS per model.",
    label="tab:rmse_crps",
)
print(latex_table)

In [10]:
from utils.generate_data import load_abalone_regression_data
def compute_sparse_rmse_results_abalone(models, all_fits, forward_pass, frac,
                         sparsity=0.0, prune_fn=None):
    """Test RMSE of (optionally pruned) posterior draws on the abalone test split.

    Parameters
    ----------
    models : iterable of str
        Keys of ``all_fits`` to evaluate; missing entries are reported and skipped.
    all_fits : mapping
        ``{model_name: {'posterior': fit}}`` where ``fit.stan_variable`` returns
        "W_1", "W_L", "hidden_bias" and "output_bias".
    forward_pass : callable
        Called as ``forward_pass(X_test, W1, b1, W2, b2)`` for every draw.
    frac : float
        Forwarded to ``load_abalone_regression_data``.
    sparsity : float, default 0.0
        Passed to ``prune_fn``; pruning is applied only when this is > 0.
    prune_fn : callable or None
        Called as ``prune_fn([W1, W2], sparsity)`` and expected to return one mask
        per weight matrix. Only the first-layer mask is applied.

    Returns
    -------
    df_rmse : pandas.DataFrame
        One row per (model, draw) with columns 'model', 'sparsity', 'rmse'.
    df_posterior_rmse : pandas.DataFrame
        One row per model with the RMSE of the draw-averaged prediction in
        'posterior_mean_rmse'.
    """
    # The split does not depend on the model, so load it once.
    _, X_test, _, y_test = load_abalone_regression_data(standardized=False, frac=frac)
    # Flatten the targets so that a column-shaped (n, 1) y cannot broadcast the
    # per-draw residuals into an (n, n) matrix.
    y_true = np.asarray(y_test).reshape(-1)
    apply_pruning = prune_fn is not None and sparsity > 0.0

    results = []
    posterior_means = []
    for model in models:
        try:
            fit = all_fits[model]['posterior']
            W1_samples = fit.stan_variable("W_1")           # (S, P, H)
            W2_samples = fit.stan_variable("W_L")           # (S, H, O)
            b1_samples = fit.stan_variable("hidden_bias")   # (S, O, H)
            b2_samples = fit.stan_variable("output_bias")   # (S, O)
        except KeyError:
            print(f"[SKIP] Model or posterior not found: {model}")
            continue

        S = W1_samples.shape[0]
        y_hats = np.zeros((S, y_true.shape[0]))

        for i in range(S):
            W1 = W1_samples[i]
            W2 = W2_samples[i]

            # Apply the pruning mask to the input->hidden weights only;
            # the output layer is deliberately left dense.
            if apply_pruning:
                masks = prune_fn([W1, W2], sparsity)
                W1 = W1 * masks[0]

            y_hat = forward_pass(X_test, W1, b1_samples[i][0], W2, b2_samples[i])
            y_hats[i] = np.asarray(y_hat).reshape(-1)

        # RMSE of every draw, and RMSE of the prediction averaged over draws.
        rmses = np.sqrt(np.mean((y_hats - y_true[None, :]) ** 2, axis=1))
        posterior_mean = np.mean(y_hats, axis=0)
        posterior_mean_rmse = np.sqrt(np.mean((posterior_mean - y_true) ** 2))

        posterior_means.append({
            'model': model,
            'sparsity': sparsity,
            'posterior_mean_rmse': posterior_mean_rmse
        })

        results.extend(
            {'model': model, 'sparsity': sparsity, 'rmse': draw_rmse}
            for draw_rmse in rmses
        )

    df_rmse = pd.DataFrame(results)
    df_posterior_rmse = pd.DataFrame(posterior_means)

    return df_rmse, df_posterior_rmse


In [11]:
from utils.sparsity import forward_pass_relu, forward_pass_tanh, local_prune_weights

# Pruning levels swept for both activations.
sparsity_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]

# One result frame per sparsity level, kept separately for each activation.
df_rmse_relu = {}
df_posterior_rmse_relu = {}
df_rmse_tanh = {}
df_posterior_rmse_tanh = {}

for sparsity in sparsity_levels:
    # Settings shared by the ReLU and tanh evaluations at this level.
    pruning_kwargs = dict(frac=0.1, sparsity=sparsity, prune_fn=local_prune_weights)

    per_draw, posterior_mean = compute_sparse_rmse_results_abalone(
        model_names_relu, relu_fit, forward_pass_relu, **pruning_kwargs
    )
    df_rmse_relu[sparsity] = per_draw
    df_posterior_rmse_relu[sparsity] = posterior_mean

    per_draw, posterior_mean = compute_sparse_rmse_results_abalone(
        model_names_tanh, tanh_fit, forward_pass_tanh, **pruning_kwargs
    )
    df_rmse_tanh[sparsity] = per_draw
    df_posterior_rmse_tanh[sparsity] = posterior_mean


In [None]:
# Combine
df_rmse_full_relu = pd.concat(
    [df.assign(sparsity=sparsity) for sparsity, df in df_rmse_relu.items()],
    ignore_index=True
)

df_rmse_full_tanh = pd.concat(
    [df.assign(sparsity=sparsity) for sparsity, df in df_rmse_tanh.items()],
    ignore_index=True
)

# Plot (simplified version)
import matplotlib.pyplot as plt
import seaborn as sns
custom_palette = {
    "Gaussian": "C0",
    "Regularized Horseshoe": "C1",
    "Dirichlet Horseshoe": "C2",
    "Dirichlet Student T": "C3",
    "Beta Horseshoe": "C4",
    "Beta Student T": "C5",
}
abbr = {
    "Gaussian": "Gauss",
    "Regularized Horseshoe": "RHS",
    "Dirichlet Horseshoe": "DHS",
    "Dirichlet Student T": "DST",
    "Beta Horseshoe": "BHS",
    "Beta Student T": "BST",
    #"Pred CP": "PCP"
}
# Clean names
df_rmse_full_relu["model"] = df_rmse_full_relu["model"].str.replace(" tanh", "", regex=False)
df_rmse_full_tanh["model"] = df_rmse_full_tanh["model"].str.replace(" tanh", "", regex=False)

fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharex=True, sharey=True)
activation_data = [("ReLU", df_rmse_full_relu), ("tanh", df_rmse_full_tanh)]

for ax, (name, df) in zip(axes, activation_data):
    df["model_abbr"] = df["model"].map(lambda m: abbr.get(m, m))
    sns.lineplot(
        data=df,
        x='sparsity', y='rmse',
        hue='model_abbr', marker='o', errorbar=None, ax=ax,
        markersize=12,
        #palette=custom_palette,
        palette={abbr[k]: v for k, v in custom_palette.items() if k in df["model"].unique()},
        hue_order=[abbr[m] for m in sorted(df["model"].unique(), key=lambda x: list(custom_palette).index(x) if x in custom_palette else 999)],
    )
    
    ax.set_title(name, fontsize = 15)
    ax.set_xlabel("Sparsity level", fontsize = 15)
    ax.set_ylabel("RMSE", fontsize = 15)
    ax.grid(True)
    ax.legend(title="Model", loc="upper left")

plt.tight_layout()
plt.savefig("figures_for_use_in_paper/abalone_sparsity.pdf", bbox_inches="tight")
plt.show()


## Looking at hyperparameters

In [15]:
# Posterior draws of the Dirichlet Horseshoe (tanh) model: the Stan variables
# "tau", "lambda_data", "phi_data" and the first-layer weights "W_1".
_dhs_posterior = tanh_fit['Dirichlet Horseshoe tanh']['posterior']
tau_DHS, lambda_DHS, xi_DHS, W1_DHS = (
    _dhs_posterior.stan_variable(stan_name)
    for stan_name in ("tau", "lambda_data", "phi_data", "W_1")
)

In [16]:
# Posterior draws of the Beta Horseshoe (tanh) model: the Stan variables
# "tau", "lambda_data", "phi_data" and the first-layer weights "W_1".
_bhs_posterior = tanh_fit['Beta Horseshoe tanh']['posterior']
tau_BHS, lambda_BHS, xi_BHS, W1_BHS = (
    _bhs_posterior.stan_variable(stan_name)
    for stan_name in ("tau", "lambda_data", "phi_data", "W_1")
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Remove pathological tiny values so that log(tau) stays finite.
tau_DHS_clipped = tau_DHS[tau_DHS > 1e-10]
tau_BHS_clipped = tau_BHS[tau_BHS > 1e-10]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Log-histogram. The two histograms overlap, so draw them semi-transparent and
# add a legend; without it the `label=` arguments are never displayed.
axes[0].hist(np.log(tau_DHS_clipped), bins=60, density=True, label="DHS", alpha=0.5)
axes[0].hist(np.log(tau_BHS_clipped), bins=60, density=True, label="BHS", alpha=0.5)
axes[0].set_title("log(tau)")
axes[0].set_xlabel("log(tau)")
axes[0].legend()

# ECDF: sorted values against their cumulative rank fraction.
tau_sorted_DHS = np.sort(tau_DHS_clipped)
ecdf_DHS = np.arange(1, len(tau_sorted_DHS)+1) / len(tau_sorted_DHS)

tau_sorted_BHS = np.sort(tau_BHS_clipped)
ecdf_BHS = np.arange(1, len(tau_sorted_BHS)+1) / len(tau_sorted_BHS)

axes[1].plot(tau_sorted_DHS, ecdf_DHS, label="DHS")
axes[1].plot(tau_sorted_BHS, ecdf_BHS, label="BHS")
axes[1].set_xscale("log")
axes[1].set_title("ECDF of tau")
axes[1].set_xlabel("tau")
axes[1].legend()

plt.tight_layout()
plt.show()


In [None]:
# Flatten all local scales, drop numerically-zero entries, and keep the central
# 98% (1st-99th percentile) for the histogram so the tails do not dominate.
lambda_flat_DHS = lambda_DHS.reshape(-1)
lambda_flat_DHS = lambda_flat_DHS[lambda_flat_DHS > 1e-12]
lo, hi = np.quantile(lambda_flat_DHS, [0.01, 0.99])
lambda_clip_DHS = lambda_flat_DHS[(lambda_flat_DHS >= lo) & (lambda_flat_DHS <= hi)]

lambda_flat_BHS = lambda_BHS.reshape(-1)
lambda_flat_BHS = lambda_flat_BHS[lambda_flat_BHS > 1e-12]
lo, hi = np.quantile(lambda_flat_BHS, [0.01, 0.99])
lambda_clip_BHS = lambda_flat_BHS[(lambda_flat_BHS >= lo) & (lambda_flat_BHS <= hi)]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Log-histogram of the quantile-clipped values.
axes[0].hist(np.log(lambda_clip_DHS), bins=60, density=True, label="DHS", alpha=0.5)
axes[0].hist(np.log(lambda_clip_BHS), bins=60, density=True, label="BHS", alpha=0.5)
axes[0].set_title("log(lambda)")
axes[0].set_xlabel("log(lambda)")
axes[0].legend()  # without this the label= arguments are never shown

# ECDF of the unclipped (but positive) values.
lam_sorted_DHS = np.sort(lambda_flat_DHS)
ecdf_DHS = np.arange(1, len(lam_sorted_DHS)+1) / len(lam_sorted_DHS)
lam_sorted_BHS = np.sort(lambda_flat_BHS)
ecdf_BHS = np.arange(1, len(lam_sorted_BHS)+1) / len(lam_sorted_BHS)

axes[1].plot(lam_sorted_DHS, ecdf_DHS, label="DHS")
axes[1].plot(lam_sorted_BHS, ecdf_BHS, label="BHS")
axes[1].set_xscale("log")
axes[1].set_title("ECDF of lambda")
axes[1].set_xlabel("lambda")
axes[1].legend()
plt.tight_layout()
plt.show()




In [None]:

# Summaries of xi taken along its last axis (axis=2), flattened over draws and
# the remaining axis.

# Largest single entry along the last axis.
xi_max_DHS = xi_DHS.max(axis=2).reshape(-1)
xi_max_BHS = xi_BHS.max(axis=2).reshape(-1)

# Inverse sum of squares along the last axis.
xi_eff_DHS = 1.0 / np.sum(xi_DHS**2, axis=2)
xi_eff_DHS = xi_eff_DHS.reshape(-1)

xi_eff_BHS = 1.0 / np.sum(xi_BHS**2, axis=2)
xi_eff_BHS = xi_eff_BHS.reshape(-1)

def topk_mass(xi, k):
    """Sum of the k largest entries of `xi` along its last axis (axis=2)."""
    xi_sorted = np.sort(xi, axis=2)[:, :, ::-1]  # descending along axis 2
    return xi_sorted[:, :, :k].sum(axis=2)

top3_DHS = topk_mass(xi_DHS, k=3).reshape(-1)
top3_BHS = topk_mass(xi_BHS, k=3).reshape(-1)

fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# Histogram (linear scale) of the largest xi, both models overlaid.
axes[0].hist(xi_max_DHS, bins=60, density=True, label="DHS", alpha=0.5)
axes[0].hist(xi_max_BHS, bins=60, density=True, label="BHS", alpha=0.5)
axes[0].set_title("Largest xi")
axes[0].set_xlabel("max_i xi_{j,i}")
axes[0].legend()

# Histogram (linear scale) of 1 / sum(xi^2); only BHS is currently drawn.
#axes[1].hist(xi_eff_DHS, bins=60, density=True, label="DHS")
axes[1].hist(xi_eff_BHS, bins=60, density=True, label="BHS")
axes[1].set_title("effective #inputs")
axes[1].set_xlabel("Dirichlet concentration per unit")
axes[1].legend()  # makes explicit which model the panel shows

# Histogram (linear scale) of the top-3 mass; only DHS is currently drawn.
axes[2].hist(top3_DHS, bins=60, density=True, label="DHS")
#axes[2].hist(top3_BHS, bins=60, density=True, label="BHS", alpha=0.5)
axes[2].set_title("mass in top-3 inputs")
axes[2].set_xlabel("Top-3 concentration")
axes[2].legend()



plt.tight_layout()
plt.show()

In [None]:
# If you saved lambda_tilde in Stan, extract that instead.

# Effective variance tau^2 * lambda * xi; tau is broadcast over the two
# trailing axes of lambda / xi.
var_eff_DHS = (
    tau_DHS[:, None, None]**2
    * lambda_DHS
    * xi_DHS
)

var_flat_DHS = var_eff_DHS.reshape(-1)
var_flat_DHS = var_flat_DHS[var_flat_DHS > 1e-16]  # keep the log-scale axis finite

# Log-scale ECDF
v_sorted_DHS = np.sort(var_flat_DHS)
ecdf_DHS = np.arange(1, len(v_sorted_DHS)+1) / len(v_sorted_DHS)

# The Beta Horseshoe variance must use the Beta Horseshoe draws throughout,
# including xi_BHS (not the Dirichlet Horseshoe xi).
var_eff_BHS = (
    tau_BHS[:, None, None]**2
    * lambda_BHS
    * xi_BHS
)

var_flat_BHS = var_eff_BHS.reshape(-1)
var_flat_BHS = var_flat_BHS[var_flat_BHS > 1e-16]

# Log-scale ECDF
v_sorted_BHS = np.sort(var_flat_BHS)
ecdf_BHS = np.arange(1, len(v_sorted_BHS)+1) / len(v_sorted_BHS)



plt.figure(figsize=(6,4))
plt.plot(v_sorted_DHS, ecdf_DHS, label="DHS")
plt.plot(v_sorted_BHS, ecdf_BHS, label="BHS")
plt.xscale("log")
plt.xlabel("effective variance")
plt.title("ECDF of weight variances")
plt.legend()  # the two curves are otherwise indistinguishable
plt.show()


In [None]:
# First-layer weights are signed. Filtering the raw values with `> 1e-12` (as is
# done for the strictly positive scale parameters) would silently discard every
# negative weight, so the histogram uses the signed weights and the log-scale
# ECDF uses their magnitudes.
W1_flat_DHS = W1_DHS.reshape(-1)
lo, hi = np.quantile(W1_flat_DHS, [0.01, 0.99])
W1_clip_DHS = W1_flat_DHS[(W1_flat_DHS >= lo) & (W1_flat_DHS <= hi)]
W1_abs_DHS = np.abs(W1_flat_DHS)
W1_abs_DHS = W1_abs_DHS[W1_abs_DHS > 1e-12]  # log axis needs strictly positive values

W1_flat_BHS = W1_BHS.reshape(-1)
lo, hi = np.quantile(W1_flat_BHS, [0.01, 0.99])
W1_clip_BHS = W1_flat_BHS[(W1_flat_BHS >= lo) & (W1_flat_BHS <= hi)]
W1_abs_BHS = np.abs(W1_flat_BHS)
W1_abs_BHS = W1_abs_BHS[W1_abs_BHS > 1e-12]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Histogram of the signed weights, restricted to the central 98%.
# The values are not log-transformed, so the labels say W1 rather than log(W1).
axes[0].hist(W1_clip_DHS, bins=60, density=True, label="DHS", alpha=0.5)
axes[0].hist(W1_clip_BHS, bins=60, density=True, label="BHS", alpha=0.5)
axes[0].set_title("W1 (central 98%)")
axes[0].set_xlabel("W1")
axes[0].legend()

# ECDF of the weight magnitudes on a log x-axis.
w_sorted_DHS = np.sort(W1_abs_DHS)
ecdf_DHS = np.arange(1, len(w_sorted_DHS)+1) / len(w_sorted_DHS)
w_sorted_BHS = np.sort(W1_abs_BHS)
ecdf_BHS = np.arange(1, len(w_sorted_BHS)+1) / len(w_sorted_BHS)

axes[1].plot(w_sorted_DHS, ecdf_DHS, label="DHS")
axes[1].plot(w_sorted_BHS, ecdf_BHS, label="BHS")
axes[1].set_xscale("log")
axes[1].set_title("ECDF of |W1|")
axes[1].set_xlabel("|W1|")
axes[1].legend()
plt.tight_layout()
plt.show()

## OLD

In [11]:
from utils.generate_data import load_abalone_regression_data
# Reload the unstandardised abalone data with frac=1.0; this rebinds the
# notebook-level X_train / X_test / y_train / y_test used by the cells below.
# NOTE(review): the Stan predictions read below must come from this same split — confirm.
X_train, X_test, y_train, y_test = load_abalone_regression_data(standardized=False, frac=1.0)


In [12]:
model_names = ["Gaussian", "Regularized Horseshoe", "Dirichlet Horseshoe", "Dirichlet Student T"]

# Per-model summaries of the "output_test_rng" draws, reduced over the draw axis
# (axis 0) with the trailing singleton axis removed.
model_preds = {}
for model_name in model_names:
    preds = relu_fit[model_name]['posterior'].stan_variable("output_test_rng")

    summary = {}
    summary["mean"] = preds.mean(axis=0).squeeze(-1)
    summary["std"] = preds.std(axis=0).squeeze(-1)
    # Bounds of the central 95% interval.
    summary["lower_95"] = np.percentile(preds, 2.5, axis=0).squeeze(-1)
    summary["upper_95"] = np.percentile(preds, 97.5, axis=0).squeeze(-1)

    model_preds[model_name] = summary


In [None]:
# 2x2 grid: one true-vs-predicted scatter per model, coloured by predictive std.
fig, axs = plt.subplots(2, 2, figsize=(14, 12))
axs = axs.flatten()

# Get global limits for consistent scaling
all_preds = np.concatenate([model_preds[m]["mean"] for m in model_names])
y_min = min(y_test.min(), all_preds.min())
y_max = max(y_test.max(), all_preds.max())

for i, model_name in enumerate(model_names):
    ax = axs[i]
    sc = ax.scatter(
        y_test, model_preds[model_name]["mean"],
        c=model_preds[model_name]["std"], cmap='viridis', alpha=0.7
    )
    # Identity line: perfect predictions fall on it.
    ax.plot([y_min, y_max], [y_min, y_max], 'k--', lw=1)
    ax.set_title(f"{model_name}")
    ax.set_xlabel("True y")
    ax.set_ylabel("Predicted mean")
    ax.grid(True)
    ax.set_xlim(y_min, y_max)
    ax.set_ylim(y_min, y_max)

# NOTE(review): the colourbar is built from `sc` of the LAST panel only; each
# scatter has its own colour normalisation, so the bar is exact for that panel only.
cbar = fig.colorbar(sc, ax=axs, orientation='vertical', fraction=0.02, pad=0.04)
cbar.set_label("Predictive Std Dev")
plt.suptitle("Regression Predictions with Uncertainty", fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
# Empirical coverage: share of test targets inside each model's 95% interval.
coverage = {}
for model_name in model_names:
    lower = model_preds[model_name]["lower_95"]
    upper = model_preds[model_name]["upper_95"]
    inside = (y_test >= lower) & (y_test <= upper)
    coverage[model_name] = np.mean(inside)

# Bar plot
plt.figure(figsize=(7, 5))
plt.bar(coverage.keys(), coverage.values())
# Reference line at the nominal 95% level.
plt.axhline(0.95, color='red', linestyle='--', label='Ideal Coverage (95%)')
plt.ylabel("Proportion Covered")
plt.title("Coverage of 95% Predictive Intervals")
plt.ylim(0, 1)
plt.grid(True, axis='y')
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Absolute error of the predictive mean against the predictive std, one point
# per test observation and one colour per model.
plt.figure(figsize=(8, 6))
for model_name in model_names:
    errors = np.abs(model_preds[model_name]["mean"] - y_test)
    stds = model_preds[model_name]["std"]
    plt.scatter(stds, errors, label=model_name, alpha=0.5)

plt.xlabel("Predictive Std Dev")
plt.ylabel("Absolute Error")
plt.title("Error vs. Uncertainty")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Point-prediction accuracy and mean 95% interval width for every model.
for model_name in model_names:
    summary = model_preds[model_name]
    pred = summary["mean"]
    residual = pred - y_test
    mae = np.abs(residual).mean()
    rmse = np.sqrt((residual ** 2).mean())
    width = (summary["upper_95"] - summary["lower_95"]).mean()
    print(f"{model_name}: MAE = {mae:.3f}, RMSE = {rmse:.3f}, Avg Interval Width = {width:.3f}")


In [17]:
model_names = ["Gaussian tanh", "Regularized Horseshoe tanh", "Dirichlet Horseshoe tanh", "Dirichlet Student T tanh"]

# Per-model summaries of the "output_test_rng" draws, reduced over the draw axis
# (axis 0) with the trailing singleton axis removed. This rebinds `model_names`
# and `model_preds`, so the cells below now describe the tanh fits.
model_preds = {}
for model_name in model_names:
    preds = tanh_fit[model_name]['posterior'].stan_variable("output_test_rng")

    summary = {}
    summary["mean"] = preds.mean(axis=0).squeeze(-1)
    summary["std"] = preds.std(axis=0).squeeze(-1)
    # Bounds of the central 95% interval.
    summary["lower_95"] = np.percentile(preds, 2.5, axis=0).squeeze(-1)
    summary["upper_95"] = np.percentile(preds, 97.5, axis=0).squeeze(-1)

    model_preds[model_name] = summary


In [None]:
# 2x2 grid: one true-vs-predicted scatter per model, coloured by predictive std.
# Axis limits are left to matplotlib, so they can differ between panels.
fig, axs = plt.subplots(2, 2, figsize=(14, 12))
axs = axs.flatten()

for i, model_name in enumerate(model_names):
    ax = axs[i]
    sc = ax.scatter(y_test, model_preds[model_name]["mean"],
                    c=model_preds[model_name]["std"], cmap='viridis', alpha=0.7)
    # Identity line over the range of the true targets.
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=1)
    ax.set_title(f"{model_name}")
    ax.set_xlabel("True y")
    ax.set_ylabel("Predicted mean")
    ax.grid(True)

# NOTE(review): the colourbar is built from `sc` of the LAST panel only; each
# scatter has its own colour normalisation, so the bar is exact for that panel only.
cbar = fig.colorbar(sc, ax=axs, orientation='vertical', fraction=0.02, pad=0.04)
cbar.set_label("Predictive Std Dev")
plt.suptitle("Regression Predictions with Uncertainty", fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
# Empirical coverage: share of test targets inside each model's 95% interval.
coverage = {}
for model_name in model_names:
    lower = model_preds[model_name]["lower_95"]
    upper = model_preds[model_name]["upper_95"]
    inside = (y_test >= lower) & (y_test <= upper)
    coverage[model_name] = np.mean(inside)

# Bar plot
plt.figure(figsize=(7, 5))
plt.bar(coverage.keys(), coverage.values())
# Reference line at the nominal 95% level.
plt.axhline(0.95, color='red', linestyle='--', label='Ideal Coverage (95%)')
plt.ylabel("Proportion Covered")
plt.title("Coverage of 95% Predictive Intervals")
plt.ylim(0, 1)
plt.grid(True, axis='y')
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Absolute error of the predictive mean against the predictive std, one point
# per test observation and one colour per model.
plt.figure(figsize=(8, 6))
for model_name in model_names:
    errors = np.abs(model_preds[model_name]["mean"] - y_test)
    stds = model_preds[model_name]["std"]
    plt.scatter(stds, errors, label=model_name, alpha=0.5)

plt.xlabel("Predictive Std Dev")
plt.ylabel("Absolute Error")
plt.title("Error vs. Uncertainty")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Point-prediction accuracy and mean 95% interval width for every model.
for model_name in model_names:
    summary = model_preds[model_name]
    pred = summary["mean"]
    residual = pred - y_test
    mae = np.abs(residual).mean()
    rmse = np.sqrt((residual ** 2).mean())
    width = (summary["upper_95"] - summary["lower_95"]).mean()
    print(f"{model_name}: MAE = {mae:.3f}, RMSE = {rmse:.3f}, Avg Interval Width = {width:.3f}")


## POSTERIOR ANALYSIS

In [11]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from utils.generate_data import load_abalone_regression_data
# Only the training features are needed in this section.
X_train = load_abalone_regression_data(standardized=False, frac=1.0)[0]

# Network dimensions: inputs, hidden units, hidden layers, outputs.
P, H, L, out_nodes = 8, 16, 1, 1

# Stan variable name and matrix shape of each weight layer.
layer_structure = dict(
    input_to_hidden=dict(name='W_1', shape=(P, H)),
    hidden_to_output=dict(name='W_L', shape=(H, out_nodes)),
)


def build_single_draw_weights(fits, layer_structure, draw_idx):
    """Collect the weight matrices of one posterior draw for every model.

    Returns ``{model: {'W_1': ..., 'W_L': ...}}`` where 'W_1' is draw
    ``draw_idx`` of the input-to-hidden variable and 'W_L' is the same draw of
    the hidden-to-output variable reshaped to the shape given in
    ``layer_structure['hidden_to_output']['shape']``.
    """
    def _weights_for(posterior):
        first = posterior.stan_variable(layer_structure['input_to_hidden']['name'])[draw_idx]
        last = posterior.stan_variable(layer_structure['hidden_to_output']['name'])[draw_idx]
        last = last.reshape(layer_structure['hidden_to_output']['shape'])
        return {"W_1": first, "W_L": last}

    return {
        model_name: _weights_for(entry["posterior"])
        for model_name, entry in fits.items()
    }

def scale_W1_for_plot(model_means, mode='global'):
    """Rescale every model's W_1 into [-1, 1] so edge widths are comparable.

    Parameters
    ----------
    model_means : dict
        ``{model_name: {'W_1': array (P, H), ...}}``. All other entries of each
        inner dict are passed through unchanged.
    mode : {'global', 'per_model', 'per_node'}
        - 'global'   : one common scale across all models (most comparable)
        - 'per_model': a separate scale for every model
        - 'per_node' : every column (hidden node) is scaled separately

    Returns
    -------
    scaled : dict
        Same structure as ``model_means`` with a rescaled 'W_1'. The input
        arrays are never modified.
    scale_info : dict
        ``{'mode': mode}``, plus the common divisor under 'scale' for 'global'.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the three supported values.
    """
    tiny = 1e-12  # lower bound on every divisor, guards against all-zero weights

    if mode not in ('global', 'per_model', 'per_node'):
        raise ValueError("mode must be 'global', 'per_model', or 'per_node'")

    scaled = {}

    if mode == 'global':
        # default=0.0 lets an empty model dict return an empty result instead of
        # failing inside max().
        gmax = max((np.abs(m['W_1']).max() for m in model_means.values()), default=0.0)
        gmax = max(gmax, tiny)
        for name, m in model_means.items():
            out = dict(m)
            out['W_1'] = m['W_1'] / gmax
            scaled[name] = out
        return scaled, {'mode': 'global', 'scale': gmax}

    if mode == 'per_model':
        for name, m in model_means.items():
            s = max(np.abs(m['W_1']).max(), tiny)
            out = dict(m)
            out['W_1'] = m['W_1'] / s
            scaled[name] = out
        return scaled, {'mode': 'per_model'}

    # mode == 'per_node'
    for name, m in model_means.items():
        W1 = np.asarray(m['W_1'])
        # Column-wise maxima; dividing out of place (instead of assigning back
        # into a copy) keeps integer inputs from being truncated.
        col_scale = np.maximum(np.abs(W1).max(axis=0), tiny)
        out = dict(m)
        out['W_1'] = W1 / col_scale
        scaled[name] = out
    return scaled, {'mode': 'per_node'}
# Column labels of the training features, available as input-node labels.
feature_names = list(X_train.columns)

# Short display names used as panel titles. This rebinds the notebook-level
# `abbr`, so it has to cover every model that is plotted, including the two
# Beta priors that are part of the fitted model lists.
abbr = {
    "Gaussian": "Gauss",
    "Regularized Horseshoe": "RHS",
    "Dirichlet Horseshoe": "DHS",
    "Dirichlet Student T": "DST",
    "Beta Horseshoe": "BHS",
    "Beta Student T": "BST",
    #"Pred CP": "PCP"
}

def plot_models_with_activations(model_means, layer_sizes,
                                 activations=None, activation_color_max=None,
                                 ncols=3, figsize_per_plot=(5,4), signed_colors=False, feature_names=None):
    """Draw one network diagram per model with edge widths given by |weight|.

    Parameters
    ----------
    model_means : dict
        ``{model_name: {'W_1': (P, H), 'W_L': (H, O), optional 'W_internal': [...]}}``.
    layer_sizes : sequence of int
        Node count per layer, e.g. ``[P, H, O]`` or ``[P, H, H, O]`` with
        internal layers.
    activations : dict, optional
        ``{model_name: (H,)}`` values used to colour the nodes of the first
        hidden layer only. Models missing from the dict are coloured as zeros.
    activation_color_max : float, optional
        Value mapped to the top of the colour map (1.0 when None).
    ncols : int
        Number of panels per row.
    figsize_per_plot : tuple
        Width and height of a single panel.
    signed_colors : bool
        If True, edges with weight >= 0 are red and negative ones blue;
        otherwise every edge is red.
    feature_names : sequence of str, optional
        Labels written to the left of the input nodes.

    Returns
    -------
    matplotlib.figure.Figure
    """
    names = list(model_means.keys())
    n_models = len(names)
    # At least one row, so that an empty model dict still yields a valid figure.
    nrows = max(1, int(np.ceil(n_models / ncols)))
    figsize = (figsize_per_plot[0] * ncols, figsize_per_plot[1] * nrows)

    fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
    if nrows * ncols == 1:
        axes = np.array([axes])
    axes = axes.flatten()

    # Switch off the unused panels.
    for ax in axes[n_models:]:
        ax.axis('off')

    color_scale = activation_color_max if activation_color_max is not None else 1.0

    for ax, name in zip(axes, names):
        weights = model_means[name]
        G = nx.DiGraph()
        pos, nodes_per_layer, node_colors = {}, [], []

        # Nodes with their position and colour.
        for li, size in enumerate(layer_sizes):
            ids = []
            ycoords = np.linspace(size - 1, 0, size) - (size - 1) / 2

            # Activation colouring applies to the first hidden layer only.
            layer_activation = None
            if activations is not None and li == 1:
                layer_activation = np.asarray(activations.get(name, np.zeros(size))).ravel()

            for i in range(size):
                nid = f"L{li}_{i}"
                G.add_node(nid)
                pos[nid] = (li, ycoords[i])
                ids.append(nid)
                if li == 0 and feature_names is not None:
                    ax.text(pos[nid][0]-0.12, pos[nid][1], feature_names[i],
                            ha='right', va='center', fontsize=8)

                if layer_activation is not None:
                    val = float(np.clip(layer_activation[i] / max(color_scale, 1e-12), 0.0, 1.0))
                    color = plt.cm.winter(val)
                else:
                    color = 'lightblue'
                node_colors.append(color)

            nodes_per_layer.append(ids)

        def add_edges(W, inn, ut):
            # Width and sign colour are stored on the edge itself so that they can
            # be read back in exactly the order in which the edges are drawn.
            for j, out_n in enumerate(ut):
                for i, in_n in enumerate(inn):
                    w = float(W[i, j])
                    G.add_edge(in_n, out_n, weight=abs(w),
                               color='red' if w >= 0 else 'blue')

        # input -> hidden(1)
        add_edges(weights['W_1'], nodes_per_layer[0], nodes_per_layer[1])

        # optional internal layers
        if 'W_internal' in weights:
            for l, Win in enumerate(weights['W_internal']):
                add_edges(Win, nodes_per_layer[l+1], nodes_per_layer[l+2])

        # last hidden -> output
        add_edges(weights['W_L'], nodes_per_layer[-2], nodes_per_layer[-1])

        # G.edges() iterates per source node, which differs from the order in
        # which add_edges inserted the edges; build both per-edge lists from it.
        drawn_edges = list(G.edges())
        edge_widths = [G[u][v]['weight'] for u, v in drawn_edges]
        if signed_colors:
            edge_colors = [G[u][v]['color'] for u, v in drawn_edges]
        else:
            edge_colors = 'red'

        nx.draw(G, pos, ax=ax,
                node_color=node_colors,
                edge_color=edge_colors,
                width=edge_widths,
                with_labels=False, node_size=400, arrows=False)

        # Fall back to the full model name when no abbreviation is registered.
        ax.set_title(abbr.get(name, name), fontsize=10)
        ax.axis('off')

    plt.tight_layout()
    return fig

def compute_hidden_activation(fit_dict, x_train, draw_idx):
    """Average tanh hidden-layer activation over the rows of ``x_train``.

    Uses draw ``draw_idx`` of 'W_1' and, when it can be read, of 'hidden_bias';
    if reading the bias fails for any reason a zero bias is used instead.
    Returns the activation averaged over axis 0 (one value per hidden unit).
    """
    posterior = fit_dict['posterior']
    first_layer = posterior.stan_variable('W_1')[draw_idx, :, :]          # (P,H)

    try:
        bias = posterior.stan_variable('hidden_bias')[draw_idx, :]
    except Exception:
        # No usable bias in this fit: fall back to zeros, one per hidden unit.
        bias = np.zeros(first_layer.shape[1])

    # tanh keeps every activation in [-1, 1].
    pre_activation = x_train @ first_layer + bias
    return np.mean(np.tanh(pre_activation), axis=0)


In [None]:
# Velg en observasjon Ã¥ "lyse opp" nodefargene med
obs_idx = 3
draw_idx = 69 #pick_draw_idx(prior_fits, seed=42)      # one common draw across models
prior_draws = build_single_draw_weights(relu_fit, layer_structure, draw_idx)

# 1) Beregn aktivasjoner for ALLE modellene
activations = {}
for name, fd in relu_fit.items():
    a = compute_hidden_activation(fd, X_train, draw_idx)
    activations[name] = np.abs(a)      

# 2) Skaler vekter for plotting (som fÃ¸r)
scaled, _ = scale_W1_for_plot(prior_draws, mode='per_model')

# 3) Kall plottet med aktivasjoner
# Siden tanh âˆˆ [-1,1] og vi bruker |a|, sÃ¥ sett activation_color_max=1.0
fig = plot_models_with_activations(
    scaled,
    layer_sizes=[P, H, out_nodes],
    activations=None,
    activation_color_max=1.0,
    ncols=2,
    feature_names = None
)
plt.savefig("figures_for_use_in_paper/abalone_network_relu.png", bbox_inches="tight")
plt.show()

In [12]:
import numpy as np
from itertools import combinations

def mean_abs(arr):
    """Mean of the absolute values over the leading (sample) axis; arr: (S, ...)."""
    magnitudes = np.abs(np.asarray(arr))
    return magnitudes.mean(axis=0)

def nid_single_hidden(posterior, W1_name="W_1", WL_name_candidates=("W_L","W_2")):
    """
    Interaction-strength scores for a single-hidden-layer network, computed from the
    posterior mean of the absolute weights (a plug-in estimate).

    Parameters
    ----------
    posterior : object exposing ``stan_variable(name)`` (e.g. a CmdStanMCMC fit)
    W1_name : str
        Name of the input -> hidden weights, shaped (S, P, H).
    WL_name_candidates : sequence of str
        Names tried in order for the hidden -> output weights, shaped (S, H) or (S, H, O).

    Returns
    -------
    z1 : (H,) aggregated influence of every hidden node (summed over outputs)
    omega_main : (P,) main-effect strength, sum_i z_i * |W1[j, i]|
    omega_pair : (P, P) symmetric pairwise strength,
        sum_i z_i * min(|W1[j, i]|, |W1[k, i]|); the diagonal stays zero

    Raises
    ------
    ValueError
        If none of the candidate names yields last-layer weights.
    """
    W1_samps = posterior.stan_variable(W1_name)          # (S, P, H)

    # Initialised up front so that an empty candidate list reaches the ValueError
    # below instead of failing on an unbound name.
    WL_samps = None
    for nm in WL_name_candidates:
        try:
            WL_samps = posterior.stan_variable(nm)       # (S, H) or (S, H, O)
        except Exception:
            continue                                     # try the next candidate name
        break
    if WL_samps is None:
        raise ValueError("Fant ikke siste-lag-vekter (prøv å angi riktig navn i WL_name_candidates).")

    # Mean of absolute values over draws, computed inline: the notebook rebinds the
    # global name `mean_abs` to an array in the SHAP cells, so relying on it is fragile.
    W1_abs = np.mean(np.abs(np.asarray(W1_samps)), axis=0)   # (P, H)
    WL_abs = np.mean(np.abs(np.asarray(WL_samps)), axis=0)   # (H,) or (H, O)

    # z^(1): aggregated node influence (sum over outputs when there are several)
    if WL_abs.ndim == 1:
        z1 = WL_abs                                      # (H,)
    else:
        z1 = WL_abs.sum(axis=1)                          # (H,)

    P, H = W1_abs.shape

    # Main effects: omega({j}) = sum_i z_i * |W1[j, i]|
    omega_main = (W1_abs * z1[None, :]).sum(axis=1)      # (P,)

    # Pairwise: omega({j, k}) = sum_i z_i * min(|W1[j, i]|, |W1[k, i]|)
    omega_pair = np.zeros((P, P))
    for j, k in combinations(range(P), 2):
        mins = np.minimum(W1_abs[j, :], W1_abs[k, :])    # (H,)
        omega = np.dot(z1, mins)                         # scalar
        omega_pair[j, k] = omega_pair[k, j] = omega

    return z1, omega_main, omega_pair


In [None]:
# NID scores for one model. Expected shapes: W_1 = (S, P, H), W_L (or W_2) = (S, H[, O]).
post = relu_fit['Dirichlet Student T']['posterior'] 
z1, omega_main, omega_pair = nid_single_hidden(post)
# Examples:
# - hidden nodes ranked by z1, most influential first (full ranking, not cut to 10)
top_nodes = np.argsort(-z1)
# - input features ranked by main effect, strongest first (full ranking)
top_feats = np.argsort(-omega_main)
# - strongest pairwise interactions: upper-triangle entries, ten largest kept
# NOTE: this rebinds the notebook-level name P to the number of input features.
P = omega_pair.shape[0] 
pairs = [(j, k, omega_pair[j, k]) for j in range(P) for k in range(j+1, P)] 
top_pairs = sorted(pairs, key=lambda t: -t[2])[:10]

# Main effects normalised to sum to one, printed with three decimals.
res = np.array(omega_main/(np.sum(omega_main)))
print(np.round(res, 3))

In [None]:
import numpy as np
import pandas as pd
from itertools import combinations

def gini(v):
    """Gini coefficient of the absolute values of `v` (0.0 for empty or all-zero input)."""
    ordered = np.sort(np.abs(np.asarray(v, float)))
    if not ordered.any():
        return 0.0
    count = ordered.size
    running = np.cumsum(ordered)
    # Sum of the cumulative totals relative to the grand total.
    lorenz_ratio = running.sum() / running[-1]
    return (count + 1 - 2 * lorenz_ratio) / count

def topk_share(v, k):
    """Share of sum(v) carried by the k largest entries; 0.0 when the sum is zero."""
    values = np.asarray(v, float)
    total = values.sum()
    if total == 0:
        return 0.0
    largest_first = np.argsort(-values)
    top_total = values[largest_first[:k]].sum()
    return top_total / total

def summarize_main(omega_main):
    """Concentration summary of the main-effect strengths.

    Returns a dict with the Gini coefficient, the top-1/3/5 shares, the number of
    entries that reach at least 10% of the maximum, and the total.
    """
    peak = omega_main.max() if omega_main.size else 0.0

    summary = {"Gini(main)": gini(omega_main)}
    for k in (1, 3, 5):
        summary[f"Top{k}(main)"] = topk_share(omega_main, k)
    summary["#â‰¥10%max(main)"] = int((omega_main >= 0.10 * peak).sum())
    summary["Total(main)"] = omega_main.sum()
    return summary

def summarize_pairs(omega_pair):
    """Concentration summary of the pairwise strengths.

    Only the strict upper triangle (no diagonal) of `omega_pair` is used. Returns a
    dict with the Gini coefficient, the top-5/10 shares, the number of pairs that
    reach at least 10% of the maximum, and the total.
    """
    n_features = omega_pair.shape[0]
    upper = []
    for row in range(n_features):
        upper.extend(omega_pair[row, col] for col in range(row + 1, n_features))
    upper = np.asarray(upper, float)

    peak = upper.max() if upper.size else 0.0

    summary = {"Gini(pair)": gini(upper)}
    for k in (5, 10):
        summary[f"Top{k}(pair)"] = topk_share(upper, k)
    summary["#â‰¥10%max(pair)"] = int((upper >= 0.10 * peak).sum())
    summary["Total(pair)"] = upper.sum()
    return summary

# (label, posterior) pairs to summarise. Only four of the tanh fits are listed here.
models = [
    ("Gaussian tanh",                tanh_fit['Gaussian tanh']['posterior']),
    ("Regularized Horseshoe tanh",   tanh_fit['Regularized Horseshoe tanh']['posterior']),
    ("Dirichlet Horseshoe tanh",     tanh_fit['Dirichlet Horseshoe tanh']['posterior']),
    ("Dirichlet Student T tanh",     tanh_fit['Dirichlet Student T tanh']['posterior']),
]

# One row per model: main-effect summary, pairwise summary and the median node influence.
rows = []
for name, post in models:
    z1, omega_main, omega_pair = nid_single_hidden(post)
    m = summarize_main(omega_main)
    p = summarize_pairs(omega_pair)
    rows.append({
        "Model": name,
        **m,
        **p,
        "Median z1": np.median(z1),
    })

# The column names must match the keys produced by summarize_main / summarize_pairs exactly.
df = pd.DataFrame(rows, columns=[
    "Model",
    "Gini(main)", "Top1(main)", "Top3(main)", "Top5(main)", "#â‰¥10%max(main)", "Total(main)",
    "Gini(pair)", "Top5(pair)", "Top10(pair)", "#â‰¥10%max(pair)", "Total(pair)",
    "Median z1",
])

# Short and tidy LaTeX table
print(df.to_latex(index=False, float_format="%.3f", escape=False))


## Compute effective number of nonzero parameters

In [14]:
import numpy as np

def compute_kappa(fit, q, node_idx, input_idx, model_type='gaussian'):
    """
    Per-draw shrinkage factors for one first-layer weight (hidden node, input).

    Parameters
    ----------
    fit : object exposing ``stan_variable(name)``; not touched when model_type is 'gaussian'
    q : array-like, shape (S, H, P)
        Per-draw q values; the slice [:, node_idx, input_idx] is used.
    node_idx, input_idx : int
        Hidden-node and input indices of the weight.
    model_type : {'gaussian', 'rhs', 'dhs', 'dst'}
        'gaussian' sets every scale to 1. 'rhs' reads "lambda_tilde". 'dhs' and 'dst'
        both read "lambda_tilde_data" and "phi_data".

    Returns
    -------
    kappa_original, b_j, kappa_tilde : arrays of shape (S,)

    Raises
    ------
    ValueError
        If model_type is not one of the four supported names.
    """
    # Validate first. The previous `model_type == 'dhs' or 'dst'` test was always
    # truthy, so unknown names were silently handled as 'dhs'.
    if model_type not in ('gaussian', 'rhs', 'dhs', 'dst'):
        raise ValueError("model_type må være 'gaussian', 'rhs', 'dhs' eller 'dst'.")

    q_hp = np.asarray(q)[:, node_idx, input_idx]  # (S,)

    if model_type == 'gaussian':
        tau = 1.0
        lam = 1.0
        c_sq = 1.0
        phi = 1.0
    else:
        tau = fit.stan_variable("tau")                          # (S,)
        c_sq = fit.stan_variable("c_sq")[:, node_idx]           # (S,)

        if model_type == 'rhs':
            lam = fit.stan_variable("lambda_tilde")[:, node_idx, input_idx]       # (S,)
            phi = 1.0
        else:  # 'dhs' or 'dst' share the same parameter names
            lam = fit.stan_variable("lambda_tilde_data")[:, node_idx, input_idx]  # (S,)
            phi = fit.stan_variable("phi_data")[:, node_idx, input_idx]           # (S,)

    kappa_original = 1.0 / (1.0 + q_hp * (tau**2) * (lam**2) * (phi))  # (S,)
    b_j            = 1.0 / (1.0 + q_hp * (tau**2) * c_sq * (phi))      # (S,)
    kappa_tilde    = (1.0 - b_j) * kappa_original + b_j                # (S,)

    return kappa_original, b_j, kappa_tilde


def compute_kappa_per_input(fit, q, node_idx, model_type='gaussian'):
    """
    Expected number of non-zero incoming weights for one hidden node.

    For every input p the posterior means E[1 - kappa_tilde] and
    E[(1 - kappa)(1 - b_j)] are taken over the draws, then summed over the inputs.

    Returns
    -------
    (total_meff_tilde, total_meff_check) : two scalars, the sums over inputs of the
    two per-input means.
    """
    _, _, n_inputs = np.asarray(q).shape

    tilde_terms = []
    identity_terms = []
    for input_idx in range(n_inputs):
        kappa, b_j, kappa_tilde = compute_kappa(
            fit, q, node_idx=node_idx, input_idx=input_idx, model_type=model_type
        )
        # Average the expression per draw, not the expression of the averages.
        tilde_terms.append(np.mean(1.0 - kappa_tilde))
        identity_terms.append(np.mean((1.0 - kappa) * (1.0 - b_j)))

    # Sum over inputs: expected count of non-zero weights feeding this node.
    total_meff_tilde = np.sum(np.array(tilde_terms, dtype=float))
    total_meff_check = np.sum(np.array(identity_terms, dtype=float))
    return total_meff_tilde, total_meff_check



In [15]:
import numpy as np

# ---------- 1) Simuler X ----------
def simulate_X(n, P, seed=42):
    """Draw an (n, P) matrix of independent Uniform[0, 1) values from a seeded generator."""
    generator = np.random.default_rng(seed)
    shape = (n, P)
    return generator.uniform(low=0.0, high=1.0, size=shape)

# ---------- 2) Aktivering og derivert ----------
def get_activation(activation="tanh"):
    """Return the pair (phi, dphi): an activation function and its derivative.

    Parameters
    ----------
    activation : {"tanh", "relu"}

    Raises
    ------
    ValueError
        For any other activation name.
    """
    if activation == "tanh":
        phi = np.tanh
        def dphi(a): return 1.0 - np.tanh(a)**2
    elif activation == "relu":
        def phi(a): return np.maximum(0.0, a)
        def dphi(a):
            # Coerce first: DataFrames and lists have no `.dtype`, which made the
            # derivative fail on such inputs. This matches the later redefinition
            # of this helper in the notebook.
            a = np.asarray(a)
            return (a > 0.0).astype(a.dtype)
    else:
        raise ValueError(f"Unsupported activation: {activation}")
    return phi, dphi

# ---------- 3) Hovedfunksjon: q for alle trekk ----------
def compute_q_for_fit(cmdstan_mcmc, N=1000, activation="tanh", seed=1, output_index=0, X=None):
    """
    Compute q_{ell, j} for the first-layer weights, separately for every posterior draw.

    For each draw the network is evaluated on X, the derivative of the selected
    output with respect to the first-layer pre-activations a^(1) is back-propagated,
    and

        q[ell, j] = (1 / sigma^2) * sum_i (d f_i / d a^(1)_{i, ell})^2 * X[i, j]^2

    Parameters
    ----------
    cmdstan_mcmc : object exposing ``stan_variable(name)``
        Must provide "W_1", "W_L", "hidden_bias", "sigma" and "W_internal".
    N : int
        Number of rows to simulate; only used when X is None.
    activation : str
        Forwarded to get_activation ("tanh" or "relu").
    seed : int
        Seed for the simulated X; only used when X is None.
    output_index : int
        Output node whose derivative is taken.
    X : array-like of shape (N, P), optional
        Inputs to evaluate on; anything np.asarray can turn into a float array
        (for example a DataFrame) is accepted.

    Returns
    -------
    q_draws : ndarray, (n_draws, H, P)
    q_mean : ndarray, (H, P), the mean of q_draws over draws
    X : the inputs that were used (the caller's own object when X was supplied)

    Raises
    ------
    ValueError
        If W_L has no output node or output_index is out of range.
    """
    # Pull the variables out of the Stan fit
    W1_all = np.asarray(cmdstan_mcmc.stan_variable("W_1"))            # (draws, P, H)
    WL_all = np.asarray(cmdstan_mcmc.stan_variable("W_L"))            # (draws, H, O) or (draws, H)
    hb_all = np.asarray(cmdstan_mcmc.stan_variable("hidden_bias"))    # (draws, L, H)
    sigma_all = np.asarray(cmdstan_mcmc.stan_variable("sigma"))       # (draws,)
    Wint_all = np.asarray(cmdstan_mcmc.stan_variable("W_internal"))   # (draws, max(L-1,1), H, H)

    if WL_all.ndim == 2:
        # A single-output last layer stored as (draws, H): add the output axis.
        WL_all = WL_all[:, :, None]

    draws, P, H = W1_all.shape
    O = WL_all.shape[2]
    L = hb_all.shape[1]

    if O == 0:
        raise ValueError("W_L has zero output nodes. Expected at least 1.")
    if output_index < 0 or output_index >= O:
        raise ValueError(f"output_index {output_index} out of range 0..{O-1}")

    if X is None:
        X = simulate_X(N, P, seed=seed)

    # Plain float array for the numerics; the caller's X is returned untouched.
    X_arr = np.asarray(X, dtype=float)
    X_sq = X_arr**2
    phi, dphi = get_activation(activation)

    q_draws = np.empty((draws, H, P), dtype=float)

    for s in range(draws):
        W1 = W1_all[s]            # (P, H)
        WL = WL_all[s]            # (H, O)
        hb = hb_all[s]            # (L, H)
        Wints = Wint_all[s]       # (max(L-1,1), H, H)
        sigma = float(sigma_all[s])

        # ----- Forward pass: keep the pre-activations of every hidden layer -----
        a = X_arr @ W1 + hb[0]    # (N, H)
        a_list = [a]
        h = phi(a)
        for l in range(1, L):
            a = h @ Wints[l-1] + hb[l]    # (N, H)
            a_list.append(a)
            h = phi(a)

        # ----- Backward pass: delta_L = d f / d a^(L) -----
        # Linear output layer: d f / d h^(L) = WL[:, output_index]
        delta = dphi(a_list[-1]) * WL[:, output_index]      # (N, H), broadcast over N

        # Back through the hidden layers; the loop body only runs when L > 1.
        for l in range(L-2, -1, -1):
            delta = (delta @ Wints[l].T) * dphi(a_list[l])

        # delta is now d f / d a^(1), shape (N, H)
        # ----- q: (1/sigma^2) * sum_i (delta[i, ell]^2 * X[i, j]^2) -----
        Q = (X_sq.T @ (delta**2)) / (sigma**2)              # (P, H)
        q_draws[s] = Q.T                                    # (H, P)

    q_mean = q_draws.mean(axis=0)         # (H, P)
    return q_draws, q_mean, X

# ---------- 4) Example usage: q for the tanh fits ----------
# X_train is supplied in every call, so compute_q_for_fit does not simulate inputs
# and the N and seed arguments have no effect here.


posterior_q_dhs, _, _ = compute_q_for_fit(
    tanh_fit['Dirichlet Horseshoe tanh']['posterior'],
    N=1000,             
    activation='tanh',  
    seed=123,
    output_index=0,
    X = X_train     
)

posterior_q_dst, _, _ = compute_q_for_fit(
    tanh_fit['Dirichlet Student T tanh']['posterior'],
    N=1000,             
    activation='tanh',  
    seed=123,
    output_index=0,
    X = X_train      
)

posterior_q_rhs, _, _ = compute_q_for_fit(
    tanh_fit['Regularized Horseshoe tanh']['posterior'],
    N=1000,             
    activation='tanh',  
    seed=123,
    output_index=0,
    X = X_train      
)

posterior_q_gauss, _, _ = compute_q_for_fit(
    tanh_fit['Gaussian tanh']['posterior'],
    N=1000,             
    activation='tanh',  
    seed=123,
    output_index=0,
    X = X_train      
)


In [None]:
gauss_fit = tanh_fit['Gaussian tanh']['posterior']
rhs_fit = tanh_fit['Regularized Horseshoe tanh']['posterior']
dhs_fit = tanh_fit['Dirichlet Horseshoe tanh']['posterior']
dst_fit = tanh_fit['Dirichlet Student T tanh']['posterior']

# Read the network size from the q array, shaped (draws, H, P), instead of
# hard-coding 16 hidden nodes and 128 first-layer weights.
_, nodes, n_inputs = posterior_q_gauss.shape

# Rows: Gaussian, RHS, DHS, DST. Columns: hidden nodes.
nonzero = np.zeros((4, nodes))

for i in range(nodes):
    meff_tilde_g, _ = compute_kappa_per_input(
        gauss_fit, posterior_q_gauss, node_idx=i, model_type='gaussian'
    )
    meff_tilde_rhs, _ = compute_kappa_per_input(
        rhs_fit, posterior_q_rhs, node_idx=i, model_type='rhs'
    )
    meff_tilde_dhs, _ = compute_kappa_per_input(
        dhs_fit, posterior_q_dhs, node_idx=i, model_type='dhs'
    )
    meff_tilde_dst, _ = compute_kappa_per_input(
        dst_fit, posterior_q_dst, node_idx=i, model_type='dst'
    )
    nonzero[0, i] = meff_tilde_g
    nonzero[1, i] = meff_tilde_rhs
    nonzero[2, i] = meff_tilde_dhs
    nonzero[3, i] = meff_tilde_dst

print("Posterior number of nonzero weights: ", np.sum(nonzero, axis=1))
# The second line prints a fraction of all first-layer weights (a value in [0, 1]),
# not a number scaled to 100.
print("Posterior percentage of nonzero weights: ", np.sum(nonzero, axis=1)/(nodes * n_inputs))

In [11]:
import numpy as np

# ---------- 1) Simuler X ----------
def simulate_X(n, P, seed=42):
    """Draw an (n, P) matrix of independent Uniform[0, 1) values from a seeded generator."""
    generator = np.random.default_rng(seed)
    shape = (n, P)
    return generator.uniform(low=0.0, high=1.0, size=shape)

# ---------- 2) Aktivering og derivert ----------
def get_activation(activation="tanh"):
    """Return (phi, dphi) for "tanh" or "relu"; any other name raises ValueError."""
    if activation not in ("tanh", "relu"):
        raise ValueError(f"Unsupported activation: {activation}")

    if activation == "relu":
        def phi(a):
            return np.maximum(0.0, a)

        def dphi(a):
            # Step function, returned in the dtype of the coerced input.
            arr = np.asarray(a)
            return (arr > 0.0).astype(arr.dtype)
    else:
        phi = np.tanh

        def dphi(a):
            return 1.0 - np.tanh(a)**2

    return phi, dphi

# ---------- 3) Hovedfunksjon: q for alle trekk ----------
def compute_q_for_fit(cmdstan_mcmc, N=1000, activation="tanh", seed=1, output_index=0, X=None):
    """
    Compute q_{ell, j} for the first-layer weights, separately for every posterior draw.

    For each draw the network is evaluated on X, the derivative of the selected
    output with respect to the first-layer pre-activations a^(1) is back-propagated,
    and

        q[ell, j] = (1 / sigma^2) * sum_i (d f_i / d a^(1)_{i, ell})^2 * X[i, j]^2

    Parameters
    ----------
    cmdstan_mcmc : object exposing ``stan_variable(name)``
        Must provide "W_1", "W_L", "hidden_bias", "sigma" and "W_internal".
    N : int
        Number of rows to simulate; only used when X is None.
    activation : str
        Forwarded to get_activation ("tanh" or "relu").
    seed : int
        Seed for the simulated X; only used when X is None.
    output_index : int
        Output node whose derivative is taken.
    X : array-like of shape (N, P), optional
        Inputs to evaluate on; anything np.asarray can turn into a float array
        (for example a DataFrame) is accepted.

    Returns
    -------
    q_draws : ndarray, (n_draws, H, P)
    q_mean : ndarray, (H, P), the mean of q_draws over draws
    X : the inputs that were used (the caller's own object when X was supplied)

    Raises
    ------
    ValueError
        If W_L has no output node or output_index is out of range.
    """
    # Pull the variables out of the Stan fit
    W1_all = np.asarray(cmdstan_mcmc.stan_variable("W_1"))            # (draws, P, H)
    WL_all = np.asarray(cmdstan_mcmc.stan_variable("W_L"))            # (draws, H, O) or (draws, H)
    hb_all = np.asarray(cmdstan_mcmc.stan_variable("hidden_bias"))    # (draws, L, H)
    sigma_all = np.asarray(cmdstan_mcmc.stan_variable("sigma"))       # (draws,)
    Wint_all = np.asarray(cmdstan_mcmc.stan_variable("W_internal"))   # (draws, max(L-1,1), H, H)

    if WL_all.ndim == 2:
        # A single-output last layer stored as (draws, H): add the output axis.
        WL_all = WL_all[:, :, None]

    draws, P, H = W1_all.shape
    O = WL_all.shape[2]
    L = hb_all.shape[1]

    if O == 0:
        raise ValueError("W_L has zero output nodes. Expected at least 1.")
    if output_index < 0 or output_index >= O:
        raise ValueError(f"output_index {output_index} out of range 0..{O-1}")

    if X is None:
        X = simulate_X(N, P, seed=seed)

    # Plain float array for the numerics; the caller's X is returned untouched.
    X_arr = np.asarray(X, dtype=float)
    X_sq = X_arr**2
    phi, dphi = get_activation(activation)

    q_draws = np.empty((draws, H, P), dtype=float)

    for s in range(draws):
        W1 = W1_all[s]            # (P, H)
        WL = WL_all[s]            # (H, O)
        hb = hb_all[s]            # (L, H)
        Wints = Wint_all[s]       # (max(L-1,1), H, H)
        sigma = float(sigma_all[s])

        # ----- Forward pass: keep the pre-activations of every hidden layer -----
        a = X_arr @ W1 + hb[0]    # (N, H)
        a_list = [a]
        h = phi(a)
        for l in range(1, L):
            a = h @ Wints[l-1] + hb[l]    # (N, H)
            a_list.append(a)
            h = phi(a)

        # ----- Backward pass: delta_L = d f / d a^(L) -----
        # Linear output layer: d f / d h^(L) = WL[:, output_index]
        delta = dphi(a_list[-1]) * WL[:, output_index]      # (N, H), broadcast over N

        # Back through the hidden layers; the loop body only runs when L > 1.
        for l in range(L-2, -1, -1):
            delta = (delta @ Wints[l].T) * dphi(a_list[l])

        # delta is now d f / d a^(1), shape (N, H)
        # ----- q: (1/sigma^2) * sum_i (delta[i, ell]^2 * X[i, j]^2) -----
        Q = (X_sq.T @ (delta**2)) / (sigma**2)              # (P, H)
        q_draws[s] = Q.T                                    # (H, P)

    q_mean = q_draws.mean(axis=0)         # (H, P)
    return q_draws, q_mean, X

# ---------- 4) Example usage: q for the ReLU fits ----------
# X_train is supplied in every call, so compute_q_for_fit does not simulate inputs
# and the N and seed arguments have no effect here.
# NOTE: the posterior_q_* names are the same ones the tanh cell assigns, so whichever
# of the two cells ran last determines what the following cell reads.


posterior_q_dhs, _, _ = compute_q_for_fit(
    relu_fit['Dirichlet Horseshoe']['posterior'],
    N=1000,             
    activation='relu',  
    seed=123,
    output_index=0,
    X = X_train     
)

posterior_q_dst, _, _ = compute_q_for_fit(
    relu_fit['Dirichlet Student T']['posterior'],
    N=1000,             
    activation='relu',  
    seed=123,
    output_index=0,
    X = X_train      
)

posterior_q_rhs, _, _ = compute_q_for_fit(
    relu_fit['Regularized Horseshoe']['posterior'],
    N=1000,             
    activation='relu',  
    seed=123,
    output_index=0,
    X = X_train      
)

posterior_q_gauss, _, _ = compute_q_for_fit(
    relu_fit['Gaussian']['posterior'],
    N=1000,             
    activation='relu',  
    seed=123,
    output_index=0,
    X = X_train      
)


In [None]:
gauss_fit = relu_fit['Gaussian']['posterior']
rhs_fit = relu_fit['Regularized Horseshoe']['posterior']
dhs_fit = relu_fit['Dirichlet Horseshoe']['posterior']
dst_fit = relu_fit['Dirichlet Student T']['posterior']

# Read the network size from the q array, shaped (draws, H, P), instead of
# hard-coding 16 hidden nodes and 128 first-layer weights.
_, nodes, n_inputs = posterior_q_gauss.shape

# Rows: Gaussian, RHS, DHS, DST. Columns: hidden nodes.
nonzero = np.zeros((4, nodes))

for i in range(nodes):
    meff_tilde_g, _ = compute_kappa_per_input(
        gauss_fit, posterior_q_gauss, node_idx=i, model_type='gaussian'
    )
    meff_tilde_rhs, _ = compute_kappa_per_input(
        rhs_fit, posterior_q_rhs, node_idx=i, model_type='rhs'
    )
    meff_tilde_dhs, _ = compute_kappa_per_input(
        dhs_fit, posterior_q_dhs, node_idx=i, model_type='dhs'
    )
    meff_tilde_dst, _ = compute_kappa_per_input(
        dst_fit, posterior_q_dst, node_idx=i, model_type='dst'
    )
    nonzero[0, i] = meff_tilde_g
    nonzero[1, i] = meff_tilde_rhs
    nonzero[2, i] = meff_tilde_dhs
    nonzero[3, i] = meff_tilde_dst

print("Posterior number of nonzero weights: ", np.sum(nonzero, axis=1))
# The second line prints a fraction of all first-layer weights (a value in [0, 1]),
# not a number scaled to 100.
print("Posterior percentage of nonzero weights: ", np.sum(nonzero, axis=1)/(nodes * n_inputs))

## TEST SHAPLEY VALUES

In [None]:
from utils.generate_data import load_abalone_regression_data
# NOTE: this rebinds X_train / X_test / y_train / y_test, now with frac=1.0 (the scoring
# cell near the top of the notebook loads them with frac=0.5). Cells that read X_train
# therefore see different data depending on whether they run before or after this one.
X_train, X_test, y_train, y_test = load_abalone_regression_data(standardized=False, frac=1.0)
print(X_train.shape, X_test.shape)

In [None]:
from utils.robust_utils import build_pytorch_model_from_stan_sample
import torch, numpy as np, shap

fit = tanh_fit['Dirichlet Student T tanh']['posterior']

# NOTE: rebinds the notebook-level names P and H.
P = X_train.shape[1]
H=16

# 1) Build the torch model from one Stan draw
model = build_pytorch_model_from_stan_sample(
    fit, sample_idx=69, input_dim=P, hidden_dim=H,
    output_dim=1, task="regression", activation=torch.tanh
)
model.eval()

# 2) Wrap a predict function that takes a numpy array and returns numpy
def predict_numpy(X_np):
    """Run `model` on a numpy batch without gradient tracking and return a numpy array."""
    with torch.no_grad():
        X_t = torch.tensor(X_np, dtype=torch.float32)
        y = model(X_t).cpu().numpy()
    return y

# 3) Background and evaluation sets for SHAP's reference distribution.
# NOTE(review): the earlier note here recommended a small background (50-200 rows),
# but n=3341 rows are sampled -- presumably the whole training set; KernelSHAP cost
# grows with the background size, so confirm this is intended.
# DataFrame.sample raises if n exceeds the number of rows, so this cell relies on the
# data having been loaded with enough rows for n=3341 / n=836.
feature_names = list(X_train.columns)
X_bg   = X_train.sample(n=3341, random_state=0).to_numpy(dtype=float)
X_eval = X_test.sample(n=836, random_state=1).to_numpy(dtype=float)  # what you want SHAP for

# 4) KernelSHAP
explainer = shap.KernelExplainer(predict_numpy, X_bg)
shap_vals = explainer.shap_values(X_eval)   # expected shape (n_eval, P) -- TODO confirm; depends on the model's output shape

# 5) Global importance (mean |SHAP|)
# NOTE: `mean_abs` is also the name of the helper function defined in the NID cell;
# after this assignment the name refers to an array instead.
mean_abs = np.abs(shap_vals).mean(axis=0)
order = np.argsort(mean_abs)[::-1]
for j in order:
    print(f"{feature_names[j]:16s}  {mean_abs[j]:.4f}")
    
shap.summary_plot(shap_vals, X_eval, feature_names=feature_names, show=False)
plt.tight_layout(); plt.show()

In [None]:
from utils.robust_utils import build_pytorch_model_from_stan_sample
import torch, numpy as np, shap

fit = relu_fit['Dirichlet Student T']['posterior']

# NOTE: rebinds the notebook-level names P and H.
P = X_train.shape[1]
H=16

# 1) Build the torch model from one Stan draw
model = build_pytorch_model_from_stan_sample(
    fit, sample_idx=69, input_dim=P, hidden_dim=H,
    output_dim=1, task="regression", activation=torch.relu
)
model.eval()

# 2) Wrap a predict function that takes a numpy array and returns numpy
def predict_numpy(X_np):
    """Run `model` on a numpy batch without gradient tracking and return a numpy array."""
    with torch.no_grad():
        X_t = torch.tensor(X_np, dtype=torch.float32)
        y = model(X_t).cpu().numpy()
    return y

# 3) Background and evaluation sets for SHAP's reference distribution.
# NOTE(review): the earlier note here recommended a small background (50-200 rows),
# but n=3341 rows are sampled -- presumably the whole training set; KernelSHAP cost
# grows with the background size, so confirm this is intended.
# DataFrame.sample raises if n exceeds the number of rows, so this cell relies on the
# data having been loaded with enough rows for n=3341 / n=836.
feature_names = list(X_train.columns)
X_bg   = X_train.sample(n=3341, random_state=0).to_numpy(dtype=float)
X_eval = X_test.sample(n=836, random_state=1).to_numpy(dtype=float)  # what you want SHAP for

# 4) KernelSHAP
explainer = shap.KernelExplainer(predict_numpy, X_bg)
shap_vals = explainer.shap_values(X_eval)   # expected shape (n_eval, P) -- TODO confirm; depends on the model's output shape

# 5) Global importance (mean |SHAP|)
# NOTE: `mean_abs` is also the name of the helper function defined in the NID cell;
# after this assignment the name refers to an array instead.
mean_abs = np.abs(shap_vals).mean(axis=0)
order = np.argsort(mean_abs)[::-1]
for j in order:
    print(f"{feature_names[j]:16s}  {mean_abs[j]:.4f}")

shap.summary_plot(shap_vals, X_eval, feature_names=feature_names, show=False)
plt.tight_layout(); plt.show()

In [None]:
import pandas as pd
# Pairwise correlations of the training features, with the "Sex" column left out.
corr = pd.DataFrame(X_train, columns=X_train.columns).drop(columns=["Sex"]).corr()
# center=0.92 moves the midpoint of the diverging colormap to 0.92
# NOTE(review): presumably chosen because the correlations cluster near that value -- confirm.
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0.92)
plt.show()


## TESTING ALIGNMENT