In [None]:
import numpy as np
import os
import sys, os; sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__) if '__file__' in globals() else os.getcwd(), '..')))
from utils.model_loader import get_model_fits
import numpy as np
import pandas as pd
import re
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt


In [14]:
from utils.generate_data import load_linreg_dataset
def generate_linreg_simple_data(
    N=250,
    p=3,
    rho=0.0,
    sigma=1.0,
    seed=123
):
    """
    Simulate a sparse linear regression problem y = X @ beta + noise.

    N:     number of observations (rows of X)
    p:     number of predictors (columns of X)
    rho:   0.0 gives independent standard-normal predictors; any other value
           gives predictors with covariance Sigma[i, j] = rho ** |i - j|
    sigma: standard deviation of the additive Gaussian noise
    seed:  value passed to np.random.seed before any draw

    Returns: (X, y, beta_true) where beta_true is 0.2 in its first entry and
             0.0 everywhere else.
    """
    # Seeds NumPy's global generator: reproducible, but it also resets the
    # random state shared with any other code using np.random.
    np.random.seed(seed)

    # Sparse truth: only the first coefficient carries signal.
    coefficients = np.zeros(p)
    coefficients[:1] = 0.2

    if rho != 0.0:
        # Correlated design: colour white noise with the Cholesky factor of Sigma.
        positions = np.arange(p)
        lags = np.abs(positions[:, None] - positions[None, :])
        chol = np.linalg.cholesky(rho ** lags)
        X = np.random.normal(size=(N, p)) @ chol.T
    else:
        # Independent design.
        X = np.random.normal(0, 1, size=(N, p))

    # The noise is drawn after X, so the random stream is consumed in a fixed order.
    noise = np.random.normal(0.0, sigma, size=N)

    return X, X @ coefficients + noise, coefficients

# --- Simulate one correlated-design dataset and persist it to disk ---
N = 250
seed = 1
X, y, beta_true = generate_linreg_simple_data(
    N=N,
    p=3,
    rho=0.9,
    sigma=1.0,
    seed=seed,
)

# `save_dir` keeps its original name and value: it is the dataset path
# *without* extension. The archive itself is written to "<save_dir>.npz".
save_dir = "datasets/linreg/linreg_data_beta_dirichlet_2"
dataset_path = f"{save_dir}.npz"

# Only the folder that will contain the archive has to exist. Calling
# makedirs on `save_dir` itself would leave an empty directory named like the
# dataset next to the .npz file.
os.makedirs(os.path.dirname(dataset_path), exist_ok=True)

# beta_true is not stored in the archive; only X, y and the N / seed metadata are.
np.savez(
    dataset_path,
    X=X,
    y=y,
    N=N,
    seed=seed,
)

# Read back exactly the file written above (one path variable, so the save and
# load locations cannot drift apart). X and y are rebound to what the loader
# returns. NOTE(review): presumably the loader performs the train/test split
# using test_fraction and its own seed -- confirm in utils.generate_data.
X_train, X_test, y_train, y_test, meta, X, y = load_linreg_dataset(
    path=dataset_path,
    test_fraction=0.2,
    seed=123,
)

In [None]:
# Locations of the simulated datasets and of the saved model fits.
data_dir = "datasets/linreg"
results_dir = "results/regression/linreg/beta_dirichlet_v2"

# Models whose fits are requested below. "Linreg Gaussian" and
# "Linreg Regularized Horseshoe" are currently left out of the comparison.
model_names = [
    "Linreg Dirichlet Horseshoe",
    "Linreg Dirichlet Student T",
    "Linreg Beta Horseshoe",
    "Linreg Beta Student T",
]

# NOTE(review): the config tag reads N200 / rho_0.0, while the dataset cell in
# this notebook simulates N=250 with rho=0.9 -- confirm the fits and the
# evaluation data belong together.
full_config_path = "linreg_N200_p3_rho_0.0"

fits = get_model_fits(
    config=full_config_path,
    results_dir=results_dir,
    models=model_names,
    include_prior=False,
)


In [3]:
from utils.generate_data import make_grouped_duplicates_data, load_linreg_dataset
# Simulation settings kept for reference in this cell.
N, sigma, seed = 250, 1, 1

# Alternative design (disabled):
#   X, y, beta_true, signal_groups = make_grouped_duplicates_data(
#       n=N, G=20, m=20, sigma=1.0, test_size=0.2, seed=seed
#   )

# Reload the stored dataset; only the four train/test arrays are kept, the
# remaining three return values are discarded.
linreg_split = load_linreg_dataset(
    path="datasets/linreg/linreg_data_beta_dirichlet_2.npz",
    test_fraction=0.2,
    seed=123,
)
X_train, X_test, y_train, y_test, _, _, _ = linreg_split

In [4]:
import numpy as np

def concentration_ratio(beta_samples, eps=1e-12):
    """
    Share of each draw's L1 mass that sits on its largest coefficient.

    beta_samples: (S, p)
    eps: constant added to the denominator, so an all-zero draw yields 0
         rather than a division by zero
    Returns: array (S,) of max_j |beta_j| / (sum_j |beta_j| + eps)
    """
    magnitudes = np.abs(beta_samples)
    largest = magnitudes.max(axis=1)
    l1_mass = magnitudes.sum(axis=1)
    return largest / (l1_mass + eps)


def effective_active_count(beta_samples, threshold=0.1):
    """
    Per-draw count of coefficients whose magnitude is strictly above
    `threshold` (one count per row of `beta_samples`).
    """
    is_active = np.abs(beta_samples) > threshold
    return is_active.sum(axis=1)


def posterior_mean_abs(beta_samples):
    """
    Average of |beta_j| over the draws (rows), one value per coefficient (column).
    """
    magnitudes = np.abs(beta_samples)
    return magnitudes.mean(axis=0)


def posterior_pairwise_corr(beta_samples):
    """
    Correlation matrix of the coefficients, with columns treated as variables
    and rows (posterior draws) as observations.
    """
    corr_matrix = np.corrcoef(beta_samples, rowvar=False)
    return corr_matrix


In [5]:
import numpy as np

# Display label -> key used to index `fits`.
# The "Gaussian" ("Linreg Gaussian") and "Regularized HS"
# ("Linreg Regularized Horseshoe") entries are not extracted here.
_fit_keys = {
    "Dirichlet HS": "Linreg Dirichlet Horseshoe",
    "Dirichlet ST": "Linreg Dirichlet Student T",
    "Beta HS": "Linreg Beta Horseshoe",
    "Beta ST": "Linreg Beta Student T",
}

# Posterior draws of `beta` for every model, keyed by display label.
models = {
    label: fits[fit_key]["posterior"].stan_variable("beta")
    for label, fit_key in _fit_keys.items()
}

# Per-model aliases.
beta_DHS = models["Dirichlet HS"]
beta_DST = models["Dirichlet ST"]
beta_BHS = models["Beta HS"]
beta_BST = models["Beta ST"]

# Number of coefficients, read off the second axis of the draws.
p = beta_DST.shape[1]


In [None]:
def rmse(y, yhat):
    """Root-mean-square error between observations `y` and predictions `yhat`."""
    residual = y - yhat
    return np.sqrt(np.mean(residual ** 2))

print("\nPosterior mean RMSE:")
for name in models:
    beta = models[name]
    # Point prediction: the posterior-mean coefficients applied to the test design.
    beta_mean = beta.mean(axis=0)
    yhat = X_test @ beta_mean
    score = rmse(y_test, yhat)
    print(f"{name:15s}: {score:.4f}")


In [None]:
def concentration_ratio(beta_samples, eps=1e-12):
    """Row-wise max |beta_j| divided by (sum_j |beta_j| + eps)."""
    # NOTE: a function with this name is also defined in an earlier cell; this
    # definition replaces it once the cell has run.
    row_abs = np.abs(beta_samples)
    peak = np.max(row_abs, axis=1)
    total = np.sum(row_abs, axis=1) + eps
    return peak / total

print("\nConcentration ratio (max / sum):")
for name, beta in models.items():
    C = concentration_ratio(beta)
    # One comma-separated summary line per model.
    stats = (
        f"mean={np.mean(C):.3f}",
        f"median={np.median(C):.3f}",
        f"q10={np.quantile(C,0.1):.3f}",
        f"q90={np.quantile(C,0.9):.3f}",
    )
    print(f"{name:15s}: " + ", ".join(stats))


In [None]:
def effective_active_count(beta_samples, threshold=0.1):
    """Row-wise number of entries with |beta_j| strictly greater than `threshold`."""
    # NOTE: a function with this name is also defined in an earlier cell; this
    # definition replaces it once the cell has run.
    above = np.abs(beta_samples) > threshold
    return np.sum(above, axis=1)

print("\nEffective number of active coefficients:")
for name, beta in models.items():
    k_eff = effective_active_count(beta, threshold=0.1)
    # Mean and median of the per-draw active counts.
    line = f"{name:15s}: " + f"mean={np.mean(k_eff):.1f}, " + f"median={np.median(k_eff):.0f}"
    print(line)


In [None]:
print("\nTop 10 posterior mean |beta_j|:")

for name, beta in models.items():
    mean_abs = np.abs(beta).mean(axis=0)
    # Sort ascending, flip to descending, keep at most the ten largest.
    top = np.sort(mean_abs)[::-1][:10]
    print(f"{name:15s}: {np.round(top, 3)}")


In [None]:
def top_k_indices(x, k):
    """
    Indices of the `k` entries of `x` with the largest absolute value.

    x: 1-D array-like of coefficients
    k: number of indices to return; 0 yields an empty set, and a value larger
       than len(x) yields every index
    Returns: set of integer indices
    Raises: ValueError if k is negative
    """
    k = int(k)
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k == 0:
        # A plain `[-k:]` slice would be `[-0:]`, i.e. the whole array, which
        # would report every index as selected when nothing should be.
        return set()
    order = np.argsort(np.abs(x))
    return set(order[-k:])

# Reference coefficient vector: 0.2 in the first slot, zeros in the remaining p - 1.
beta_true = np.array([0.2, *([0.0] * (p - 1))])

# Size of the true support, i.e. the number of non-zero entries.
k = (beta_true != 0).sum()

true_support = top_k_indices(beta_true, k)

print("\nSupport recovery (top-k overlap):")
for name, beta in models.items():
    mean_beta = beta.mean(axis=0)
    est_support = top_k_indices(mean_beta, k)
    # How many of the k largest posterior-mean coefficients are truly non-zero.
    overlap = len(est_support.intersection(true_support))
    print(f"{name:15s}: {overlap}/{k}")


In [None]:
def offdiag_mean_abs_corr(beta_samples):
    """
    Mean absolute correlation between distinct coefficients, with columns of
    `beta_samples` as variables and rows as observations.
    """
    corr = np.corrcoef(beta_samples, rowvar=False)
    # True everywhere except on the diagonal (the trivial self-correlations).
    off_diagonal = np.eye(corr.shape[0]) == 0
    return np.abs(corr[off_diagonal]).mean()

print("\nMean absolute off-diagonal posterior correlation:")
for name in models:
    beta = models[name]
    print(f"{name:15s}: {offdiag_mean_abs_corr(beta):.3f}")


In [13]:
def make_groups_contiguous(p, m):
    """
    Split the indices 0..p-1 into consecutive blocks of length m.

    Asserts that m divides p. Returns a list of p // m index arrays, the g-th
    one holding g*m, ..., (g+1)*m - 1.
    """
    assert p % m == 0
    blocks = []
    start = 0
    for _ in range(p // m):
        blocks.append(np.arange(start, start + m))
        start += m
    return blocks

# Illustrative call: 400 coefficient indices cut into blocks of 20, i.e. 20 groups.
groups = make_groups_contiguous(m=20, p=400)

import numpy as np

def group_concentration_ratio(beta_samples, groups, eps=1e-12):
    """
    Within-group concentration, per posterior draw s and group g:
      C_{s,g} = max_{j in g} |beta_{s,j}| / (sum_{j in g} |beta_{s,j}| + eps)

    beta_samples: (S, p) draws
    groups: sequence of index arrays selecting the columns of each group
    Returns: (S, G)
    """
    n_draws = beta_samples.shape[0]
    magnitudes = np.abs(beta_samples)
    ratios = np.zeros((n_draws, len(groups)))
    for col, members in enumerate(groups):
        block = magnitudes[:, members]
        ratios[:, col] = block.max(axis=1) / (block.sum(axis=1) + eps)
    return ratios

def group_effective_active(beta_samples, groups, threshold=0.1):
    """
    Number of coefficients with |beta| strictly above `threshold`, counted
    separately for every draw and every group.
    Returns: (S, G) integer array
    """
    is_active = np.abs(beta_samples) > threshold
    counts = np.zeros((beta_samples.shape[0], len(groups)), dtype=int)
    for col, members in enumerate(groups):
        counts[:, col] = is_active[:, members].sum(axis=1)
    return counts

def group_top1_share_of_global_l1(beta_samples, groups, eps=1e-12):
    """
    One number per draw: how much of the total L1 norm is carried by the
    single largest coefficient of each group.

      numerator   = sum over groups of max_{j in g} |beta_j|
      denominator = sum over all coefficients of |beta_j|, plus eps

    Returns: (S,)
    """
    magnitudes = np.abs(beta_samples)

    # Accumulate the per-group maxima one group at a time.
    representative_l1 = np.zeros(beta_samples.shape[0])
    for members in groups:
        representative_l1 = representative_l1 + magnitudes[:, members].max(axis=1)

    return representative_l1 / (magnitudes.sum(axis=1) + eps)


In [None]:
def summarize(x):
    """
    Four-number summary of `x` as plain Python floats:
    mean, median, and the 10% / 90% quantiles.
    """
    q10, q90 = (float(np.quantile(x, level)) for level in (0.1, 0.9))
    return dict(
        mean=float(np.mean(x)),
        median=float(np.median(x)),
        q10=q10,
        q90=q90,
    )

# Requested group size (Design A: e.g. 10, 20, 25).
m = 20
# Number of coefficients, taken from the first model's draws.
p = next(iter(models.values())).shape[1]

# make_groups_contiguous asserts that m divides p. When it does not (for
# instance a 3-coefficient model with m = 20), fall back to the largest
# divisor of p that does not exceed the requested size instead of stopping
# the notebook, and say so in the output.
if p % m != 0:
    requested_m = m
    m = max(d for d in range(1, min(requested_m, p) + 1) if p % d == 0)
    print(f"Requested group size m={requested_m} does not divide p={p}; using m={m} instead.")

groups = make_groups_contiguous(p=p, m=m)

threshold = 0.1  # adjust if needed

print(f"\nGroup diagnostics with p={p}, m={m}, G={len(groups)}, threshold={threshold}\n")

for name, beta in models.items():
    C = group_concentration_ratio(beta, groups)           # (S, G)
    K = group_effective_active(beta, groups, threshold)   # (S, G)
    top1share = group_top1_share_of_global_l1(beta, groups)  # (S,)

    # Pool every (draw, group) cell before summarising.
    C_all = C.reshape(-1)
    K_all = K.reshape(-1)

    print(f"== {name} ==")
    print("Within-group concentration max/sum:", summarize(C_all))
    print("Within-group #active coefficients:", summarize(K_all))
    print("Top-1-per-group share of global L1:", summarize(top1share))
    print()


In [None]:
def check_group_correlations(X, groups):
    """
    For every group of columns in `X`, the mean absolute correlation between
    distinct columns of that group.

    Returns: array with one value per group.
    """
    per_group = []
    for cols in groups:
        corr = np.corrcoef(X[:, cols], rowvar=False)
        # Ignore the diagonal: each column correlates perfectly with itself.
        off_diagonal = np.eye(corr.shape[0]) == 0
        per_group.append(np.abs(corr[off_diagonal]).mean())
    return np.array(per_group)

# Within-group correlation of the training design, summarised across groups.
avg_abs_corr = check_group_correlations(X_train, groups)
print("Mean |corr| within each group (train):")
for label, value in (
    ("  mean:", avg_abs_corr.mean()),
    ("  min :", avg_abs_corr.min()),
    ("  max :", avg_abs_corr.max()),
):
    print(label, value)
