In [None]:
import numpy as np
import os
import sys, os; sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__) if '__file__' in globals() else os.getcwd(), '..')))
from utils.model_loader import get_model_fits
import numpy as np
import pandas as pd
import re
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt


# Fix the legacy global NumPy RNG so every draw below is reproducible.
np.random.seed(123)

# Problem size.
N = 250   # number of observations
P = 10    # number of covariates

# Ground-truth coefficients: four non-zero entries, six exact zeros.
beta_true = np.array([3.0, -2.0, 0.0, 0.0, 1.5, 0.0, 0.8, 0.0, 0.0, 0.0])

# Correlated predictors: Sigma[i, j] = rho ** |i - j|, sampled through the
# lower Cholesky factor of Sigma.
rho = 0.7
col_idx = np.arange(P)
Sigma = rho ** np.abs(col_idx[:, None] - col_idx[None, :])
L = np.linalg.cholesky(Sigma)
X = np.random.normal(size=(N, P)) @ L.T

# Contaminate a random 5% of the rows with large (sd = 8) Gaussian shifts.
outlier_fraction = 0.05
num_outliers = int(outlier_fraction * N)
outlier_rows = np.random.choice(N, size=num_outliers, replace=False)
X[outlier_rows] = X[outlier_rows] + np.random.normal(
    loc=0, scale=8.0, size=(num_outliers, P)
)

# Heavy-tailed response noise: Student-t draws with df = 3, rescaled by
# sigma_true.
df = 3
sigma_true = 0.7
noise = sigma_true * np.random.standard_t(df, size=N)

# Response: linear signal plus the scaled noise.
y = X @ beta_true + noise




In [None]:
import numpy as np
from utils.generate_data import load_linreg_dataset

# Load the train/test split (plus a metadata object) from the project helper.
X_train, X_test, y_train, y_test, meta = load_linreg_dataset()

# Report the feature and target shapes of each split.
for split_label, split_X, split_y in (
    ("Training shape:", X_train, y_train),
    ("Test shape:", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)


In [None]:
# Directory names for the dataset and for the saved model fits.
# (Plain string literals: no interpolation is needed here.)
data_dir = "datasets/linreg"
results_dir_relu = "results/regression/linreg"

# Names of the fitted models to load; later cells index the loaded fits by
# exactly these strings.
model_names_relu = [
    "Linreg Gaussian",
    "Linreg Regularized Horseshoe",
    "Linreg Dirichlet Horseshoe",
    "Linreg Dirichlet Student T",
]

# Configuration identifier handed to the loader.
full_config_path = "linreg_N200_p10"

# Load the fits for the requested models, without prior draws
# (include_prior=False).
linreg_fit = get_model_fits(
    config=full_config_path,
    results_dir=results_dir_relu,
    models=model_names_relu,
    include_prior=False,
)


In [5]:
# Pull the "beta" variable out of each model's posterior fit, in the order
# Gaussian, Regularized Horseshoe, Dirichlet Horseshoe, Dirichlet Student T.
beta_gauss, beta_RHS, beta_DHS, beta_DST = (
    linreg_fit[fit_key]["posterior"].stan_variable("beta")
    for fit_key in (
        "Linreg Gaussian",
        "Linreg Regularized Horseshoe",
        "Linreg Dirichlet Horseshoe",
        "Linreg Dirichlet Student T",
    )
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Flatten the training targets into a 1-D array so they broadcast cleanly
# against per-draw predictions.
y_vec = np.asarray(y_train).ravel()

# Number of training rows.
N = len(X_train)

def rmse_per_sample(beta_samples, X, y):
    """
    Root-mean-squared error of the linear predictor X @ beta for every draw.

    beta_samples: (S, P) array of coefficient draws; a single draw of shape
        (P,) is also accepted and treated as S = 1.
    X: (N, P) design matrix.
    y: targets with N entries; any shape that flattens to (N,) is accepted,
        e.g. (N,) or (N, 1).
    Returns: array (S,) of RMSEs, one per posterior draw.
    Raises: ValueError if X and y do not have the same number of rows.
    """
    X = np.asarray(X)
    # Flatten y: a column vector (N, 1) would otherwise broadcast against the
    # (N, S) predictions into a 3-D array and yield wrong-shaped results.
    y = np.asarray(y).reshape(-1)
    # Promote a single draw (P,) to (1, P): with a 1-D beta the predictions
    # are (N,) and subtracting y[:, None] would silently produce an (N, N)
    # matrix of meaningless "errors".
    beta_samples = np.atleast_2d(np.asarray(beta_samples))

    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"X has {X.shape[0]} rows but y has {y.shape[0]} entries"
        )

    # Predictions for all draws at once: (N, S).
    preds = X @ beta_samples.T
    # Broadcast y down the draw axis: (N, 1) against (N, S).
    errors = preds - y[:, None]
    # Average the squared errors over observations, leaving one value per draw.
    return np.sqrt(np.mean(errors ** 2, axis=0))

# Per-draw training RMSE for each of the four fitted models.
rmse_gauss_samps, rmse_RHS_samps, rmse_DHS_samps, rmse_DST_samps = (
    rmse_per_sample(model_draws, X_train, y_vec)
    for model_draws in (beta_gauss, beta_RHS, beta_DHS, beta_DST)
)

# Summarise each model by the average of its per-draw RMSEs.
print("Posterior mean RMSEs:")
for row_label, rmse_draws in (
    ("  Gaussian:               ", rmse_gauss_samps),
    ("  Regularized Horseshoe:  ", rmse_RHS_samps),
    ("  Dirichlet Horseshoe:    ", rmse_DHS_samps),
    ("  Dirichlet Student-t:    ", rmse_DST_samps),
):
    print(row_label, rmse_draws.mean())

# Open the figure that the RMSE density curves are drawn on.
plt.figure(figsize=(8, 5))

def plot_rmse_kde(rmse_samples, label, ax=None):
    """
    Draw a Gaussian kernel density estimate of per-draw RMSE values.

    rmse_samples: 1-D collection of RMSE values, one per posterior draw.
    label: legend label for the curve.
    ax: matplotlib axes to draw on. When omitted, the current pyplot axes are
        used, which matches the previous two-argument behaviour.

    The curve is evaluated on 200 evenly spaced points between the 1st and
    99th percentile of the samples, so extreme tails do not stretch the axis.
    """
    if ax is None:
        # Fall back to the implicit pyplot state for existing callers.
        ax = plt.gca()

    kde = gaussian_kde(rmse_samples)
    lower, upper = np.percentile(rmse_samples, [1, 99])
    xs = np.linspace(lower, upper, 200)
    ax.plot(xs, kde(xs), label=label)

# One density curve per model, all on the current axes.
for rmse_draws, curve_label in (
    (rmse_gauss_samps, "Gaussian"),
    (rmse_RHS_samps, "Regularized Horseshoe"),
    (rmse_DHS_samps, "Dirichlet Horseshoe"),
    (rmse_DST_samps, "Dirichlet Student-t"),
):
    plot_rmse_kde(rmse_draws, curve_label)

# Label the axes the curves were drawn on, then render the figure.
rmse_ax = plt.gca()
rmse_ax.set_xlabel("RMSE on training data")
rmse_ax.set_ylabel("Posterior density")
rmse_ax.set_title("Posterior distribution of RMSE per model")
rmse_ax.legend()
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sanity check: print the shape of each model's coefficient-draw array.
for shape_tag, model_draws in (
    ("Gauss:", beta_gauss),
    ("RHS:", beta_RHS),
    ("DHS:", beta_DHS),
    ("DST:", beta_DST),
):
    print(shape_tag, model_draws.shape)

# Number of draws (S) and number of coefficients (P), read off the Gaussian fit.
S, P = beta_gauss.shape

# Reshape a (draws x coefficients) array into long/tidy format.
def beta_to_long_df(beta_array, model_name):
    """
    Convert a matrix of coefficient draws into a long DataFrame.

    beta_array: (S, P) array, one row per draw and one column per coefficient.
    model_name: label written into every row of the "model" column.

    Returns a DataFrame with S * P rows and columns, in this order:
    beta, draw, coeff, model. Rows are laid out draw by draw, so "draw"
    repeats each draw index P times while "coeff" cycles through 0..P-1.
    """
    n_draws, n_coeffs = beta_array.shape
    flat_values = beta_array.reshape(n_draws * n_coeffs)
    long_df = pd.DataFrame(flat_values, columns=["beta"])
    return long_df.assign(
        draw=np.repeat(np.arange(n_draws), n_coeffs),
        coeff=np.tile(np.arange(n_coeffs), n_draws),
        model=model_name,
    )

# Long-format draws for each model, under the display names used in the plots.
df_gauss, df_RHS, df_DHS, df_DST = (
    beta_to_long_df(model_draws, display_name)
    for model_draws, display_name in (
        (beta_gauss, "Gaussian"),
        (beta_RHS, "Regularized Horseshoe"),
        (beta_DHS, "Dirichlet Horseshoe"),
        (beta_DST, "Dirichlet Student-t"),
    )
)

# Stack the four frames into one with a fresh 0..n-1 index.
beta_df = pd.concat([df_gauss, df_RHS, df_DHS, df_DST], ignore_index=True)

# The true coefficients are attached only when a global named `beta_true`
# already exists in the session; otherwise downstream plots skip them.
beta_true_series = None
if "beta_true" in globals():
    beta_true_series = pd.Series(beta_true, index=np.arange(len(beta_true)))
    beta_df["beta_true"] = beta_df["coeff"].map(beta_true_series)

beta_df.head()


In [None]:
# Grid of boxplots: one panel per coefficient, one box per model.
n_box_cols = int(np.ceil(P / 2))
fig, axes = plt.subplots(2, n_box_cols, figsize=(16, 6), sharey=True)
axes = axes.flatten()

box_models = ["Gaussian", "Regularized Horseshoe", "Dirichlet Horseshoe", "Dirichlet Student-t"]
box_tick_labels = ["Gauss", "RHS", "DHS", "DST"]

for j, ax in zip(range(P), axes):
    coeff_rows = beta_df.loc[beta_df["coeff"] == j]
    # Posterior draws of beta_j, one array per model, in box_models order.
    draws_by_model = [
        coeff_rows.loc[coeff_rows["model"] == m, "beta"].values
        for m in box_models
    ]
    ax.boxplot(draws_by_model, showfliers=False)
    ax.set_xticks([1, 2, 3, 4])
    ax.set_xticklabels(box_tick_labels, rotation=30)
    ax.set_title(f"β_{j+1}")

    # Mark the true coefficient with a dashed horizontal line when it is known.
    if beta_true_series is not None:
        ax.axhline(beta_true_series[j], linestyle="--", linewidth=1)

# With an odd P the 2-row grid has one spare panel; blank it out.
for spare_ax in axes[P:]:
    spare_ax.axis("off")

fig.suptitle("Posterior distributions of β_j by prior (boxplots)", fontsize=14)
fig.tight_layout()
plt.show()


In [None]:
from scipy.stats import gaussian_kde

def plot_beta_kde_for_coeff(j, ax):
    """
    Overlay the four models' marginal posterior densities for coefficient j.

    j: coefficient index (0-based) as stored in the "coeff" column of the
       global beta_df.
    ax: matplotlib axis to draw on.

    Each curve is a Gaussian KDE evaluated on 200 points between the 1st and
    99th percentile of that model's draws. When the global beta_true_series is
    not None, a dashed vertical line marks its j-th entry.
    """
    coeff_rows = beta_df.loc[beta_df["coeff"] == j]

    # Model name in beta_df -> short legend label (insertion order = plot order).
    legend_labels = {
        "Gaussian": "Gauss",
        "Regularized Horseshoe": "RHS",
        "Dirichlet Horseshoe": "DHS",
        "Dirichlet Student-t": "DST",
    }

    for model_name, label in legend_labels.items():
        draws = coeff_rows.loc[coeff_rows["model"] == model_name, "beta"].values
        density = gaussian_kde(draws)
        lower, upper = np.percentile(draws, [1, 99])
        grid = np.linspace(lower, upper, 200)
        ax.plot(grid, density(grid), label=label, alpha=0.8)

    if beta_true_series is not None:
        ax.axvline(beta_true_series[j], linestyle="--", linewidth=1)

    ax.set_title(f"β_{j+1}")
    ax.legend(fontsize=8)

# Coefficients to inspect in detail: the first four.
# NOTE(review): in the beta_true vector defined in the simulation cell,
# indices 0 and 1 are non-zero while indices 2 and 3 are exactly zero, so this
# selection shows two signal and two null coefficients.
coeffs_to_plot = [0, 1, 2, 3]

# 2 x 2 grid, flattened so panels pair up with the coefficients in order.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.ravel()

for panel, j in zip(axes, coeffs_to_plot):
    plot_beta_kde_for_coeff(j, panel)

fig.suptitle("Marginal posterior densities of selected β_j", fontsize=14)
fig.tight_layout()
plt.show()


In [None]:
# Posterior mean of every coefficient under every model, arranged as a table
# with one row per coefficient and one column per model.
coeff_means_long = beta_df.groupby(["model", "coeff"], as_index=False)["beta"].mean()
mean_df = coeff_means_long.pivot(index="coeff", columns="model", values="beta")

plt.figure(figsize=(8, 5))

# Coefficients are shown 1-based on the x-axis.
coeff_numbers = mean_df.index + 1
for model_name in (
    "Gaussian",
    "Regularized Horseshoe",
    "Dirichlet Horseshoe",
    "Dirichlet Student-t",
):
    plt.plot(coeff_numbers, mean_df[model_name], marker="o", label=model_name)

# Overlay the true coefficients as a dashed black line when they are known.
if beta_true_series is not None:
    plt.plot(np.arange(1, P+1), beta_true_series.values, "k--", label="True β")

plt.xlabel("Coefficient index j")
plt.ylabel("Posterior mean of β_j")
plt.title("Posterior means per coefficient and prior")
plt.legend()
plt.tight_layout()
plt.show()
