\noindent
We consider the latent-factor model of Hastie--Montanari--Rosset--Tibshirani (§5.4). Let $z_i \in \mathbb{R}^d$ be latent factors, $u_i \in \mathbb{R}^p$ feature noise, and $\xi_i$ response noise. The observed feature vectors and responses are
\begin{equation}
x_i = W z_i + u_i, \qquad y_i = z_i^\top \theta + \xi_i,
\end{equation}
where $W \in \mathbb{R}^{p \times d}$ has row-wise unit norm $\|w_j\|=1$, the latent signal direction $\theta \in \mathbb{R}^d$ satisfies $\|\theta\| = r_\theta$, and
\begin{equation}
z_i \sim \mathcal{N}(0,I_d), \qquad u_i \sim \mathcal{N}(0,I_p), \qquad \xi_i \sim \mathcal{N}(0,\sigma_\xi^2).
\end{equation}
Stacking rows gives
\begin{equation}
X = Z W^\top + U, \qquad y = Z\theta + \xi.
\end{equation}
Although the response is generated from latent variables, there exists an equivalent population linear model in the observed coordinates. Define the population covariance
\begin{equation}
\Sigma = \mathbb{E}[x_i x_i^\top] = I_p + W W^\top,
\end{equation}
and the corresponding population regression coefficient
\begin{equation}
\beta_{\ast} = W \bigl(I_d + W^\top W\bigr)^{-1}\theta.
\end{equation}
Then
\begin{equation}
\mathbb{E}[y \mid x] = x^\top \beta_{\ast},
\end{equation}
so $\beta_{\ast}$ is the unique population-optimal linear predictor. Importantly, $\beta_{\ast}$ lies entirely in the $d$-dimensional signal subspace $\mathcal{S} = \mathrm{span}(W) \subset \mathbb{R}^p$.

In the fitted model, we estimate $\beta$ by the minimum-norm least squares estimator
\begin{equation}
\widehat{\beta} = X^{+} y,
\end{equation}
where $X^{+}$ is the Moore--Penrose pseudoinverse. When $p<n$, this reduces to ordinary least squares
\begin{equation}
\widehat{\beta} = (X^\top X)^{-1}X^\top y,
\end{equation}
while when $p\ge n$ and $\mathrm{rank}(X)=n$, it interpolates the data:
\begin{equation}
\widehat{\beta} = X^\top (X X^\top)^{-1} y.
\end{equation}

The performance of the estimator is measured using the population risk
\begin{equation}
R = (\widehat{\beta} - \beta_{\ast})^\top \Sigma\,(\widehat{\beta} - \beta_{\ast}).
\end{equation}

The signal $\beta_{\ast}$ lies in the low-dimensional subspace $\mathcal{S}$, while $X$ includes noise directions orthogonal to this subspace. The minimum-norm estimator does not impose knowledge of $\mathcal{S}$, and therefore its alignment with $\mathcal{S}$ depends critically on the aspect ratio
\begin{equation}
\gamma = \frac{p}{n}.
\end{equation}
When $\gamma < 1$, the estimator behaves like classical least squares and its variance grows as $\gamma \to 1$. At $\gamma = 1$, the smallest singular value of $X$ tends to zero, causing the risk $R$ to diverge. For $\gamma > 1$, the minimum-norm constraint forces $\widehat{\beta}$ to place progressively less mass in the noise subspace $\mathcal{S}^\perp$, improving alignment with $\mathcal{S}$ and decreasing the population risk. Consequently, the risk exhibits the characteristic double-descent shape: an initial increase near $\gamma = 1$, followed by monotone decrease as $\gamma$ grows and the estimator aligns with the latent signal subspace.


We now replace the direct linear regression on $X$ with a single-hidden-layer neural network feature map. For each choice of $p$ and $n$, we draw a random first-layer weight matrix $W_1 \in \mathbb{R}^{p \times H}$ from a specified prior, and define the hidden-unit representation
\begin{equation}
\Phi = \tanh(X W_1).
\end{equation}
No likelihood or data is used in sampling $W_1$; the hidden layer is drawn purely from the prior. The second-layer coefficients are then estimated by minimum-norm least squares:
\begin{equation}
\widehat{w} = \Phi^{+} y.
\end{equation}
Thus, the fitted model is
\begin{equation}
\widehat{f}(x) = \sum_{h=1}^H \widehat{w}_h \tanh(w_{1,h}^\top x),
\end{equation}
where $w_{1,h}$ denotes the $h$-th column of $W_1$. We evaluate performance using the same population risk as before,
\begin{equation}
R = \mathbb{E}\bigl[(\widehat{f}(x) - x^\top \beta_{\ast})^2\bigr]
    = (\widehat{\beta} - \beta_{\ast})^\top \Sigma (\widehat{\beta} - \beta_{\ast}),
\end{equation}
except that the estimator now belongs to the nonlinear feature space spanned by $\{\tanh(w_{1,h}^\top x)\}_{h=1}^H$. The quantity $\widehat{\beta}$ refers to the effective linear predictor induced by $\widehat{f}$ in the observed coordinates.



In [1]:
import numpy as np

def make_latent_data_sec54(n, p, d=20, r_theta=1.0, sigma_xi=0.0, rng=None):
    """
    Draw one data set from the Section 5.4 latent-factor model
    (Hastie–Montanari–Rosset–Tibshirani):

      X = Z W^T + U,   y = Z θ + ξ
      z_i ~ N(0, I_d), u_ij ~ N(0, 1), ξ_i ~ N(0, σ_ξ^2)

    Every row w_j of W is rescaled to unit Euclidean norm and θ is rescaled
    to norm r_theta.  The population quantities of the equivalent linear
    model in the observed coordinates are returned alongside the sample:

      Σ = I_p + W W^T,   β = W (I_d + W^T W)^{-1} θ.   [eqs. (26)-(27)]

    Parameters
    ----------
    n, p, d  : number of samples, observed features and latent factors
    r_theta  : Euclidean norm given to θ
    sigma_xi : standard deviation of the response noise ξ
    rng      : random generator providing ``normal``; when None a fresh
               ``np.random.default_rng()`` is created

    Returns
    -------
    X (n×p), Z (n×d), y (n,), W (p×d), theta (d,), beta_true (p,), Sigma (p×p)
    """
    if rng is None:
        rng = np.random.default_rng()
    tiny = 1e-12  # guards both normalisations against a zero norm

    # The order of the rng calls below determines the random stream.
    loadings = rng.normal(size=(p, d))
    row_norms = np.linalg.norm(loadings, axis=1, keepdims=True)
    loadings = loadings / (row_norms + tiny)  # rows now have (almost exactly) unit norm

    latent = rng.normal(size=(n, d))
    feature_noise = rng.normal(size=(n, p))
    label_noise = rng.normal(scale=sigma_xi, size=n)

    direction = rng.normal(size=d)
    direction = direction * (r_theta / (np.linalg.norm(direction) + tiny))

    # Observed sample
    features = latent @ loadings.T + feature_noise
    response = latent @ direction + label_noise

    # Population covariance and regression coefficient used for the risk
    cov = np.eye(p) + loadings @ loadings.T
    coef = loadings @ np.linalg.solve(np.eye(d) + loadings.T @ loadings, direction)

    return features, latent, response, loadings, direction, coef, cov

def fit_min_norm(X, y):
    """
    Return the minimum-ℓ2-norm least-squares solution β̂ = X⁺ y, where X⁺ is
    the Moore–Penrose pseudoinverse computed by ``np.linalg.pinv``
    (the ridgeless limit of ridge regression).
    """
    pseudo_inverse = np.linalg.pinv(X)
    return pseudo_inverse @ y


In [2]:
import numpy as np

def sample_hidden_features_gauss(X, rng, H):
    """
    Draw first-layer weights with i.i.d. N(0, 1) entries and return the
    resulting tanh hidden features.

    Parameters
    ----------
    X   : 2-D array (n × P) of inputs
    rng : random generator providing ``normal``
    H   : number of hidden units

    Returns
    -------
    (features, W1): features = tanh(X @ W1) of shape (n × H), W1 of shape (P × H)
    """
    _, n_inputs = X.shape
    first_layer = rng.normal(0.0, 1.0, size=(n_inputs, H))
    return np.tanh(X @ first_layer), first_layer

def sample_hidden_features_RHS(X, rng, H, p_0=3, a=2.0, b=2.0):
    """
    Draw first-layer weights from a regularized-horseshoe-style scale mixture
    and return the resulting tanh hidden features.

    Scales are drawn in this order (which fixes the random stream):
      τ    = |Cauchy(0,1)| · p_0 / (P − p_0)        one global scale
      c²_h = 1 / Gamma(shape=a, scale=1/b)           one per hidden unit
      λ_hj = |Cauchy(0,1)|                           one per (unit, input)
    Each weight is an N(0,1) draw times τ · sqrt(λ̃²_hj), where
      λ̃²_hj = max(c²_h λ²_hj / (c²_h + τ² λ²_hj), 1e-12).

    Parameters
    ----------
    X    : 2-D array (n × P) of inputs
    rng  : random generator providing standard_cauchy, gamma and normal
    H    : number of hidden units
    p_0  : enters only through the global-scale factor p_0 / (P − p_0)
    a, b : shape and rate of the Gamma draw whose reciprocal is c²

    Returns
    -------
    (features, W1): features = tanh(X @ W1) of shape (n × H), W1 of shape (P × H)
    """
    _, n_inputs = X.shape
    tau_ratio = p_0 / (n_inputs - p_0)

    global_scale = np.abs(rng.standard_cauchy()) * tau_ratio
    slab_var = (1.0 / rng.gamma(shape=a, scale=1.0 / b, size=H))[:, None]  # (H,1)
    local_var = np.abs(rng.standard_cauchy(size=(H, n_inputs))) ** 2       # (H,P)

    # Shrunk local variance, floored away from zero
    shrunk_var = (slab_var * local_var) / (slab_var + local_var * (global_scale**2))
    shrunk_var = np.maximum(shrunk_var, 1e-12)

    noise = rng.normal(0.0, 1.0, size=(n_inputs, H))
    W1 = noise * (global_scale * np.sqrt(shrunk_var.T))  # per-weight std, (P,H)

    return np.tanh(X @ W1), W1

def sample_hidden_features_DHS(X, rng, H, p_0=3, a=2.0, b=2.0, alpha_scale=0.1):
    """
    Draw first-layer weights from a horseshoe-style scale mixture with an
    additional Dirichlet allocation over the inputs of each hidden unit, and
    return the resulting tanh hidden features.

    Scales are drawn in this order (which fixes the random stream):
      τ    = sqrt(10) · |Cauchy(0,1)| · p_0 / (P − p_0)   one global scale
      c²_h = 1 / Gamma(shape=a, scale=1/b)                 one per hidden unit
      λ_hj = |Cauchy(0,1)|                                 one per (unit, input)
      φ_h  ~ Dirichlet(alpha_scale, …, alpha_scale)        one P-vector per unit
    Each weight is an N(0,1) draw times τ · sqrt(λ̃²_hj) · sqrt(φ_hj), where
      λ̃²_hj = max(c²_h λ²_hj / (c²_h + τ² λ²_hj), 1e-12).

    Parameters
    ----------
    X           : 2-D array (n × P) of inputs
    rng         : random generator providing standard_cauchy, gamma,
                  dirichlet and normal
    H           : number of hidden units
    p_0         : enters only through the global-scale factor p_0 / (P − p_0)
    a, b        : shape and rate of the Gamma draw whose reciprocal is c²
    alpha_scale : common Dirichlet concentration parameter

    Returns
    -------
    (features, W1): features = tanh(X @ W1) of shape (n × H), W1 of shape (P × H)
    """
    _, n_inputs = X.shape
    tau_ratio = p_0 / (n_inputs - p_0)
    concentration = np.full(n_inputs, alpha_scale)

    global_scale = np.sqrt(10) * np.abs(rng.standard_cauchy()) * tau_ratio
    slab_var = (1.0 / rng.gamma(shape=a, scale=1.0 / b, size=H))[:, None]  # (H,1)
    local_var = np.abs(rng.standard_cauchy(size=(H, n_inputs))) ** 2       # (H,P)
    allocation = rng.dirichlet(concentration, size=H)                      # (H,P)

    # Shrunk local variance, floored away from zero
    shrunk_var = (slab_var * local_var) / (slab_var + local_var * (global_scale**2))
    shrunk_var = np.maximum(shrunk_var, 1e-12)

    noise = rng.normal(0.0, 1.0, size=(n_inputs, H))
    weight_std = global_scale * np.sqrt(shrunk_var.T) * np.sqrt(allocation.T)  # (P,H)
    W1 = noise * weight_std

    return np.tanh(X @ W1), W1

def sample_hidden_features_DST(X, rng, H, p_0=3, a=2.0, b=2.0, alpha_scale=0.1):
    """
    Same construction as the Dirichlet/horseshoe sampler, except that the
    local scales are |Student-t(df=3)| draws instead of |Cauchy| draws.
    Returns the resulting tanh hidden features and the sampled weights.

    Scales are drawn in this order (which fixes the random stream):
      τ    = sqrt(10) · |Cauchy(0,1)| · p_0 / (P − p_0)   one global scale
      c²_h = 1 / Gamma(shape=a, scale=1/b)                 one per hidden unit
      λ_hj = |t_3|                                         one per (unit, input)
      φ_h  ~ Dirichlet(alpha_scale, …, alpha_scale)        one P-vector per unit
    Each weight is an N(0,1) draw times τ · sqrt(λ̃²_hj) · sqrt(φ_hj), where
      λ̃²_hj = max(c²_h λ²_hj / (c²_h + τ² λ²_hj), 1e-12).

    Parameters
    ----------
    X           : 2-D array (n × P) of inputs
    rng         : random generator providing standard_cauchy, gamma,
                  standard_t, dirichlet and normal
    H           : number of hidden units
    p_0         : enters only through the global-scale factor p_0 / (P − p_0)
    a, b        : shape and rate of the Gamma draw whose reciprocal is c²
    alpha_scale : common Dirichlet concentration parameter

    Returns
    -------
    (features, W1): features = tanh(X @ W1) of shape (n × H), W1 of shape (P × H)
    """
    _, n_inputs = X.shape
    tau_ratio = p_0 / (n_inputs - p_0)
    concentration = np.full(n_inputs, alpha_scale)

    global_scale = np.sqrt(10) * np.abs(rng.standard_cauchy()) * tau_ratio
    slab_var = (1.0 / rng.gamma(shape=a, scale=1.0 / b, size=H))[:, None]  # (H,1)
    local_var = np.abs(rng.standard_t(df=3, size=(H, n_inputs))) ** 2      # (H,P)
    allocation = rng.dirichlet(concentration, size=H)                      # (H,P)

    # Shrunk local variance, floored away from zero
    shrunk_var = (slab_var * local_var) / (slab_var + local_var * (global_scale**2))
    shrunk_var = np.maximum(shrunk_var, 1e-12)

    noise = rng.normal(0.0, 1.0, size=(n_inputs, H))
    weight_std = global_scale * np.sqrt(shrunk_var.T) * np.sqrt(allocation.T)  # (P,H)
    W1 = noise * weight_std

    return np.tanh(X @ W1), W1

def plot_risk_curve_hidden_units(
    n=400,
    gammas=(0.7, 0.9, 1.2, 1.5, 2, 3, 5, 8, 12, 20),
    model="DHS",
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    reps=50,
    risk_mc_samples=1000,
    seed=0,
    # prior hyperparams
    p_0=3,
    a=2.0,
    b=2.0,
    alpha_scale=0.5,
    # schedule H as a function of (p, n)
    H_of_p=lambda p, n: p,   # <- default ties H to p (so H grows with γ)
):
    """
    Monte Carlo risk curve for min-norm regression on random tanh features of
    the observed inputs X.  Despite the name, nothing is plotted: the curve is
    returned as arrays.

    For every γ in ``gammas`` the feature count is p = max(1, round(γ n)) and
    the hidden width is H = max(1, int(H_of_p(p, n))).  Each of the ``reps``
    repetitions draws a data set, samples one first-layer matrix W1 from the
    selected prior, fits the second layer by minimum-norm least squares and
    estimates the population risk E[(f̂(x) − xᵀβ)²] from ``risk_mc_samples``
    fresh draws x ~ N(0, Σ) pushed through the SAME W1.

    Parameters
    ----------
    n                : training sample size
    gammas           : iterable of aspect ratios γ = p/n
    model            : first-layer prior, one of "gauss", "RHS", "DHS", "DST"
    d, r_theta, sigma_xi : forwarded to ``make_latent_data_sec54``
    reps             : repetitions per γ
    risk_mc_samples  : Monte Carlo sample size for the population risk
    seed             : seed of the single generator shared by all draws
    p_0, a, b        : forwarded to the RHS/DHS/DST samplers
    alpha_scale      : forwarded to the DHS/DST samplers
    H_of_p           : callable (p, n) -> hidden width

    Returns
    -------
    (G, M, S): arrays of the γ values, the mean risk and the sample standard
    deviation (ddof=1) of the risk across repetitions.

    Raises
    ------
    ValueError if ``model`` is not one of the supported names (previously any
    unrecognised name silently ran the DST prior).
    """
    # Resolve the sampler once, before any work is done, so that a misspelled
    # model name fails immediately instead of producing a mislabeled curve.
    if model == "gauss":
        sampler, prior_kwargs = sample_hidden_features_gauss, {}
    elif model == "RHS":
        sampler, prior_kwargs = sample_hidden_features_RHS, dict(p_0=p_0, a=a, b=b)
    elif model == "DHS":
        sampler = sample_hidden_features_DHS
        prior_kwargs = dict(p_0=p_0, a=a, b=b, alpha_scale=alpha_scale)
    elif model == "DST":
        sampler = sample_hidden_features_DST
        prior_kwargs = dict(p_0=p_0, a=a, b=b, alpha_scale=alpha_scale)
    else:
        raise ValueError(
            f"Unknown model {model!r}; expected one of 'gauss', 'RHS', 'DHS', 'DST'."
        )

    rng = np.random.default_rng(seed)
    G, M, S = [], [], []

    for gamma in gammas:
        p = max(1, int(round(gamma * n)))
        H = max(1, int(H_of_p(p, n)))
        risks = []

        for _ in range(reps):
            X, _Z_latent, y, _W, _theta, beta_true, Sigma = make_latent_data_sec54(
                n=n, p=p, d=d, r_theta=r_theta, sigma_xi=sigma_xi, rng=rng
            )

            # One sampled hidden map W1 per repetition; reused for the population risk
            Phi, W1 = sampler(X, rng, H=H, **prior_kwargs)

            # Min-norm fit of the second layer on the hidden units
            w_hat = fit_min_norm(Phi, y)
            if w_hat.ndim > 1 and w_hat.shape[1] == 1:
                w_hat = w_hat.ravel()

            # Monte Carlo population risk with the SAME W1, measured against xᵀβ
            X_pop = rng.multivariate_normal(mean=np.zeros(p), cov=Sigma, size=risk_mc_samples)
            y_true_pop = X_pop @ beta_true
            y_pred_pop = np.tanh(X_pop @ W1) @ w_hat
            risks.append(float(np.mean((y_pred_pop - y_true_pop) ** 2)))

        G.append(gamma)
        M.append(np.mean(risks))
        S.append(np.std(risks, ddof=1))

    return np.array(G), np.array(M), np.array(S)

# Risk curve for the Gaussian first-layer prior on the observed features.
G_h_gauss, M_h_gauss, S_h_gauss = plot_risk_curve_hidden_units(
    model="gauss",
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.16, 0.3, 0.7, 0.9, 1.5, 2, 3, 5, 8, 10, 20],
    reps=50,
    risk_mc_samples=1000,
    seed=123,
    H_of_p=lambda p, n: p,  # hidden width follows p (and therefore γ)
)


In [3]:
# Risk curve for the RHS first-layer prior on the observed features.
G_h_RHS, M_h_RHS, S_h_RHS = plot_risk_curve_hidden_units(
    model="RHS",
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.16, 0.3, 0.7, 0.9, 1.5, 2, 3, 5, 8, 10, 20],
    reps=50,
    risk_mc_samples=1000,
    seed=123,
    H_of_p=lambda p, n: p,  # hidden width follows p (and therefore γ)
)

In [4]:
# Risk curve for the DHS first-layer prior on the observed features.
G_h_DHS, M_h_DHS, S_h_DHS = plot_risk_curve_hidden_units(
    model="DHS",
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.16, 0.3, 0.7, 0.9, 1.5, 2, 3, 5, 8, 10, 20],
    reps=50,
    risk_mc_samples=1000,
    seed=123,
    H_of_p=lambda p, n: p,  # hidden width follows p (and therefore γ)
)

In [5]:
# Risk curve for the DST first-layer prior on the observed features.
G_h_DST, M_h_DST, S_h_DST = plot_risk_curve_hidden_units(
    model="DST",
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.16, 0.3, 0.7, 0.9, 1.5, 2, 3, 5, 8, 10, 20],
    reps=50,
    risk_mc_samples=1000,
    seed=123,
    H_of_p=lambda p, n: p,  # hidden width follows p (and therefore γ)
)

In [6]:

def plot_risk_curve_sec54(
    n=400,
    gammas=(0.7, 0.9, 1.2, 1.5, 2, 3, 5, 8, 12, 20),
    d=20,
    r_theta=1.0,          # "r = 1" in the captions
    sigma_xi=0.0,         # use 0 for Fig. 5 behavior; try 0, 0.25, 0.5 like Fig. 6
    reps=50,
    seed=0
):
    """
    Simulates the latent-space risk curve of §5.4 (Figs. 5–6).  Despite the
    name, nothing is plotted: the curve is returned as arrays.

    For each γ = p/n (with p = max(1, round(γ n))), ``reps`` data sets (X, y)
    are simulated, the min-norm β̂ is fitted and the population risk
    R_X = (β̂−β)^T Σ (β̂−β) is computed.
    Expectation from §5.4: spike near γ≈1 and then *monotone decrease* for γ>1,
    reaching a global minimum as γ→∞ when β aligns with top eigenspace of Σ.

    Parameters
    ----------
    n       : training sample size
    gammas  : iterable of aspect ratios γ = p/n
    d, r_theta, sigma_xi : forwarded to ``make_latent_data_sec54``
    reps    : repetitions per γ
    seed    : seed of the generator shared by all draws

    Returns
    -------
    (G, M, S): arrays of the γ values, the mean risk and the sample standard
    deviation (ddof=1) of the risk across repetitions.  A 95% normal
    interval for the mean can be formed by the caller as
    M ± 1.96 · S / sqrt(reps).
    """
    rng = np.random.default_rng(seed)
    G, M, S = [], [], []

    for gamma in gammas:
        p = max(1, int(round(gamma * n)))
        risks = []

        for _ in range(reps):
            X, _Z, y, _W, _theta, beta_true, Sigma = make_latent_data_sec54(
                n=n, p=p, d=d, r_theta=r_theta, sigma_xi=sigma_xi, rng=rng
            )
            diff = fit_min_norm(X, y) - beta_true
            risks.append(float(diff @ (Sigma @ diff)))

        G.append(gamma)
        M.append(np.mean(risks))
        S.append(np.std(risks, ddof=1))

    # The previously computed (and discarded) confidence half-width was dead
    # code and has been removed.
    return np.array(G), np.array(M), np.array(S)

# Baseline: min-norm regression directly on the observed features X.
G, M, S = plot_risk_curve_sec54(
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.16, 0.3, 0.7, 0.9, 1.5, 2, 3, 5, 8, 10, 20],
    reps=50,
)


In [7]:
import numpy as np
from math import isfinite
from typing import Iterable, Tuple, Dict, Any

def _c0_closed_form(gamma, psi):
    """
    Closed-form solution of Eq. (35) for c0 in Corollary 4.
    Parameters assume gamma > 1 and 0 < psi < 1 (as in the latent model).
    """
    A = float(gamma)
    L = 1.0 - 1.0/A                  # LHS constant
    t = 1.0 + 1.0/psi                # 1 + psi^{-1}

    # Quadratic in y = c0:
    # (L t A^2) y^2 + [A (L(1+t) - ((1-psi) t + psi))] y + (L - 1) = 0
    Acoef = L * t * A**2
    Bcoef = A * (L*(1.0 + t) - ((1.0 - psi)*t + psi))
    Ccoef = L - 1.0                  # = -1/A

    disc = Bcoef*Bcoef - 4.0*Acoef*Ccoef
    if disc < 0:
        # tiny negative from FP roundoff
        disc = 0.0
    # unique nonnegative root
    return (-Bcoef + np.sqrt(disc)) / (2.0*Acoef)

def corollary4_continuous(
    gammas,
    n,
    d,
    r_theta=1.0,
    sigma_xi=0.0,
):
    """
    Returns analytic test risk from Corollary 4 for the latent-space model (Sec. 5.4).
    For γ<1, uses the underparametrized variance formula R = σ^2 * γ/(1-γ) (Prop. 2).
    For γ>1, uses Corollary 4 with the closed-form c0.
    At γ=1 both formulas diverge; the risk is reported as +inf (as long as
    σ^2 > 0) instead of the NaN that the γ>1 formula produces there.

    Inputs
    ------
    gammas : array-like (1-D) of γ = p/n (positive)
    n      : sample size
    d      : latent dimension
    r_theta: ||θ||
    sigma_xi: σ_ξ

    Output
    ------
    dict with keys 'gamma', 'risk', 'bias', 'var'; each value is a float
    np.ndarray aligned with ``gammas`` and risk = bias + var.

    Raises
    ------
    ValueError if any γ is <= 0.
    """
    g = np.asarray(gammas, dtype=float)
    risk = np.empty_like(g)
    bias = np.empty_like(g)
    var = np.empty_like(g)

    for i, gamma in enumerate(g):
        if gamma <= 0:
            raise ValueError("All gamma must be > 0.")

        # finite-sample plug-in for ψ = d/p with p = γ n
        psi = d / (gamma * n)
        # σ^2 = σ_ξ^2 + ψ r_θ^2 / (1+ψ)  (Eq. (27) and Cor. 4 text)
        sigma2 = sigma_xi**2 + psi * (r_theta**2) / (1.0 + psi)

        if gamma < 1.0:
            # Underparametrized: pure variance (Prop. 2)
            bias[i] = 0.0
            var[i]  = sigma2 * gamma / (1.0 - gamma)
            risk[i] = var[i]
        elif gamma == 1.0:
            # Interpolation threshold: the variance term diverges (it is the
            # common limit of the γ<1 and γ>1 expressions) while the bias
            # term of the γ>1 formula vanishes as c0 → ∞.
            bias[i] = 0.0
            var[i]  = np.inf if sigma2 > 0 else 0.0
            risk[i] = var[i]
        else:
            # Overparametrized: Cor. 4 Eqs. (30)–(35)
            c0 = _c0_closed_form(gamma, psi)
            t  = 1.0 + 1.0/psi
            d1 = (1.0 + c0 * gamma)**2
            d2 = (1.0 + c0 * t     * gamma)**2

            # Eqs. (33)–(34)
            E1 = (1.0 - psi)/d1 + psi*(t**2)/d2
            E2 = (1.0 - psi)/d1 + (1.0 + psi)/d2

            # Eqs. (31)–(32)
            bias_i = (1.0 + gamma * c0 * (E1/E2)) * (r_theta**2) / ((1.0 + psi) * d2)
            var_i  = sigma2 * gamma * c0 * (E1/E2)

            # store
            bias[i] = max(bias_i, 0.0)  # clip tiny negatives from FP
            var[i]  = max(var_i,  0.0)
            risk[i] = bias[i] + var[i]

    return {"gamma": g, "risk": risk, "bias": bias, "var": var}


# Analytic curve on log-spaced grids on either side of γ = 1; the band
# 0.9 < γ < 1.2 around the divergence is deliberately left out.
g_under = np.geomspace(0.2, 0.9, num=200)
g_over = np.geomspace(1.2, 20.0, num=400)
g_all = np.concatenate((g_under, g_over))

out = corollary4_continuous(
    gammas=g_all,
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
)

In [None]:
import matplotlib.pyplot as plt
# Comparison figure: simulated risk curves of the four hidden-layer priors and
# of the min-norm baseline, overlaid with the analytic Corollary 4 curve.
# Relies on notebook globals produced by the earlier cells (G_h_*, M_h_*, G, M,
# S, out).

# Half-width of a 95% normal interval for the baseline mean risk.
# NOTE(review): reps must equal the reps used when S was computed (50 in the
# baseline cell), and ci is not used by any statement in this cell.
reps = 50
ci = 1.96 * S / np.sqrt(reps)

gamma = out["gamma"]
risk  = out["risk"]

# Split the analytic curve into the two segments on either side of γ = 1
mask_left  = gamma <= 0.9
mask_right = gamma >= 1.2

gamma_left,  risk_left  = gamma[mask_left],  risk[mask_left]
gamma_right, risk_right = gamma[mask_right], risk[mask_right]

plt.figure(figsize=(6.4, 4.4))
# Simulated curves (log-log axes)
plt.loglog(G_h_gauss, M_h_gauss, marker="o", linewidth=2, label="Gauss")
plt.loglog(G_h_RHS,   M_h_RHS,   marker="o", linewidth=2, label="RHS")
plt.loglog(G_h_DHS,   M_h_DHS,   marker="o", linewidth=2, label="DHS")
plt.loglog(G_h_DST,   M_h_DST,   marker="o", linewidth=2, label="DST")
plt.loglog(G,         M,         marker="o", linewidth=2, label="Frequentist")

# Plot the left and right analytic segments separately so that no line is
# drawn across the divergence; only the first carries a legend label.
plt.loglog(gamma_left,  risk_left,  linewidth=2, label="Analytic", color="magenta")
plt.loglog(gamma_right, risk_right, linewidth=2, color="magenta")

# Optionally shade the divergence gap between the two analytic segments
plt.axvspan(0.9, 1.2, color="grey", alpha=0.15)

# Interpolation threshold γ = 1
plt.axvline(1.0, linestyle="--", linewidth=1)
plt.xlabel(r"Aspect ratio  $\gamma = p/n$")
plt.ylabel(r"Population risk  $R_X=(\hat\beta-\beta)^\top \Sigma (\hat\beta-\beta)$")
plt.title(r"Latent space (§5.4) — min-norm risk vs $\gamma$  (n=100, d=20, r=1, $\sigma_\xi=0$)")
plt.tight_layout()
plt.legend()
plt.show()


## Repeat the experiment using the low-dimensional latent input Z instead of the observed features X

In [1]:
import numpy as np

def make_latent_data_sec54(n, p, d=20, r_theta=1.0, sigma_xi=0.0, rng=None):
    """
    Draw one data set from the Section 5.4 latent-factor model
    (Hastie–Montanari–Rosset–Tibshirani):

      X = Z W^T + U,   y = Z θ + ξ
      z_i ~ N(0, I_d), u_ij ~ N(0, 1), ξ_i ~ N(0, σ_ξ^2)

    Every row w_j of W is rescaled to unit Euclidean norm and θ is rescaled
    to norm r_theta.  The population quantities of the equivalent linear
    model in the observed coordinates are returned alongside the sample:

      Σ = I_p + W W^T,   β = W (I_d + W^T W)^{-1} θ.   [eqs. (26)-(27)]

    Parameters
    ----------
    n, p, d  : number of samples, observed features and latent factors
    r_theta  : Euclidean norm given to θ
    sigma_xi : standard deviation of the response noise ξ
    rng      : random generator providing ``normal``; when None a fresh
               ``np.random.default_rng()`` is created

    Returns
    -------
    X (n×p), Z (n×d), y (n,), W (p×d), theta (d,), beta_true (p,), Sigma (p×p)
    """
    if rng is None:
        rng = np.random.default_rng()
    tiny = 1e-12  # guards both normalisations against a zero norm

    # The order of the rng calls below determines the random stream.
    loadings = rng.normal(size=(p, d))
    row_norms = np.linalg.norm(loadings, axis=1, keepdims=True)
    loadings = loadings / (row_norms + tiny)  # rows now have (almost exactly) unit norm

    latent = rng.normal(size=(n, d))
    feature_noise = rng.normal(size=(n, p))
    label_noise = rng.normal(scale=sigma_xi, size=n)

    direction = rng.normal(size=d)
    direction = direction * (r_theta / (np.linalg.norm(direction) + tiny))

    # Observed sample
    features = latent @ loadings.T + feature_noise
    response = latent @ direction + label_noise

    # Population covariance and regression coefficient used for the risk
    cov = np.eye(p) + loadings @ loadings.T
    coef = loadings @ np.linalg.solve(np.eye(d) + loadings.T @ loadings, direction)

    return features, latent, response, loadings, direction, coef, cov

def fit_min_norm(X, y):
    """
    Return the minimum-ℓ2-norm least-squares solution β̂ = X⁺ y, where X⁺ is
    the Moore–Penrose pseudoinverse computed by ``np.linalg.pinv``
    (the ridgeless limit of ridge regression).
    """
    pseudo_inverse = np.linalg.pinv(X)
    return pseudo_inverse @ y


In [4]:
import numpy as np

def sample_hidden_features_gauss(Z, rng, H):
    """
    Draw first-layer weights with i.i.d. N(0, 1) entries and return the
    resulting tanh hidden features of the input Z.

    Parameters
    ----------
    Z   : 2-D array (n × P) of inputs
    rng : random generator providing ``normal``
    H   : number of hidden units

    Returns
    -------
    (post_acts, W1): post_acts = tanh(Z @ W1) of shape (n × H), W1 of shape (P × H)
    """
    _, n_inputs = Z.shape
    first_layer = rng.normal(0.0, 1.0, size=(n_inputs, H))
    return np.tanh(Z @ first_layer), first_layer

def sample_hidden_features_RHS(Z, rng, H, p_0=3, a=2.0, b=2.0):
    """
    Draw first-layer weights from a regularized-horseshoe-style scale mixture
    and return the resulting tanh hidden features of the input Z.

    Scales are drawn in this order (which fixes the random stream):
      τ    = |Cauchy(0,1)| · p_0 / (P − p_0)        one global scale
      c²_h = 1 / Gamma(shape=a, scale=1/b)           one per hidden unit
      λ_hj = |Cauchy(0,1)|                           one per (unit, input)
    Each weight is an N(0,1) draw times τ · sqrt(λ̃²_hj), where
      λ̃²_hj = max(c²_h λ²_hj / (c²_h + τ² λ²_hj), 1e-12).

    Parameters
    ----------
    Z    : 2-D array (n × P) of inputs
    rng  : random generator providing standard_cauchy, gamma and normal
    H    : number of hidden units
    p_0  : enters only through the global-scale factor p_0 / (P − p_0)
    a, b : shape and rate of the Gamma draw whose reciprocal is c²

    Returns
    -------
    (post_acts, W1): post_acts = tanh(Z @ W1) of shape (n × H), W1 of shape (P × H)
    """
    _, n_inputs = Z.shape
    tau_ratio = p_0 / (n_inputs - p_0)

    global_scale = np.abs(rng.standard_cauchy()) * tau_ratio
    slab_var = (1.0 / rng.gamma(shape=a, scale=1.0 / b, size=H))[:, None]  # (H,1)
    local_var = np.abs(rng.standard_cauchy(size=(H, n_inputs))) ** 2       # (H,P)

    # Shrunk local variance, floored away from zero
    shrunk_var = (slab_var * local_var) / (slab_var + local_var * (global_scale**2))
    shrunk_var = np.maximum(shrunk_var, 1e-12)

    noise = rng.normal(0.0, 1.0, size=(n_inputs, H))
    W1 = noise * (global_scale * np.sqrt(shrunk_var.T))  # per-weight std, (P,H)

    return np.tanh(Z @ W1), W1

def sample_hidden_features_DHS(Z, rng, H, p_0=3, a=2.0, b=2.0, alpha_scale=0.1):
    """
    Draw first-layer weights from a horseshoe-style scale mixture with an
    additional Dirichlet allocation over the inputs of each hidden unit, and
    return the resulting tanh hidden features of the input Z.

    Scales are drawn in this order (which fixes the random stream):
      τ    = sqrt(10) · |Cauchy(0,1)| · p_0 / (P − p_0)   one global scale
      c²_h = 1 / Gamma(shape=a, scale=1/b)                 one per hidden unit
      λ_hj = |Cauchy(0,1)|                                 one per (unit, input)
      φ_h  ~ Dirichlet(alpha_scale, …, alpha_scale)        one P-vector per unit
    Each weight is an N(0,1) draw times τ · sqrt(λ̃²_hj) · sqrt(φ_hj), where
      λ̃²_hj = max(c²_h λ²_hj / (c²_h + τ² λ²_hj), 1e-12).

    Parameters
    ----------
    Z           : 2-D array (n × P) of inputs
    rng         : random generator providing standard_cauchy, gamma,
                  dirichlet and normal
    H           : number of hidden units
    p_0         : enters only through the global-scale factor p_0 / (P − p_0)
    a, b        : shape and rate of the Gamma draw whose reciprocal is c²
    alpha_scale : common Dirichlet concentration parameter

    Returns
    -------
    (post_acts, W1): post_acts = tanh(Z @ W1) of shape (n × H), W1 of shape (P × H)
    """
    _, n_inputs = Z.shape
    tau_ratio = p_0 / (n_inputs - p_0)
    concentration = np.full(n_inputs, alpha_scale)

    global_scale = np.sqrt(10) * np.abs(rng.standard_cauchy()) * tau_ratio
    slab_var = (1.0 / rng.gamma(shape=a, scale=1.0 / b, size=H))[:, None]  # (H,1)
    local_var = np.abs(rng.standard_cauchy(size=(H, n_inputs))) ** 2       # (H,P)
    allocation = rng.dirichlet(concentration, size=H)                      # (H,P)

    # Shrunk local variance, floored away from zero
    shrunk_var = (slab_var * local_var) / (slab_var + local_var * (global_scale**2))
    shrunk_var = np.maximum(shrunk_var, 1e-12)

    noise = rng.normal(0.0, 1.0, size=(n_inputs, H))
    weight_std = global_scale * np.sqrt(shrunk_var.T) * np.sqrt(allocation.T)  # (P,H)
    W1 = noise * weight_std

    return np.tanh(Z @ W1), W1

def sample_hidden_features_DST(Z, rng, H, p_0=3, a=2.0, b=2.0, alpha_scale=0.1):
    """
    Same construction as the Dirichlet/horseshoe sampler, except that the
    local scales are |Student-t(df=3)| draws instead of |Cauchy| draws.
    Returns the resulting tanh hidden features of Z and the sampled weights.

    Scales are drawn in this order (which fixes the random stream):
      τ    = sqrt(10) · |Cauchy(0,1)| · p_0 / (P − p_0)   one global scale
      c²_h = 1 / Gamma(shape=a, scale=1/b)                 one per hidden unit
      λ_hj = |t_3|                                         one per (unit, input)
      φ_h  ~ Dirichlet(alpha_scale, …, alpha_scale)        one P-vector per unit
    Each weight is an N(0,1) draw times τ · sqrt(λ̃²_hj) · sqrt(φ_hj), where
      λ̃²_hj = max(c²_h λ²_hj / (c²_h + τ² λ²_hj), 1e-12).

    Parameters
    ----------
    Z           : 2-D array (n × P) of inputs
    rng         : random generator providing standard_cauchy, gamma,
                  standard_t, dirichlet and normal
    H           : number of hidden units
    p_0         : enters only through the global-scale factor p_0 / (P − p_0)
    a, b        : shape and rate of the Gamma draw whose reciprocal is c²
    alpha_scale : common Dirichlet concentration parameter

    Returns
    -------
    (post_acts, W1): post_acts = tanh(Z @ W1) of shape (n × H), W1 of shape (P × H)
    """
    _, n_inputs = Z.shape
    tau_ratio = p_0 / (n_inputs - p_0)
    concentration = np.full(n_inputs, alpha_scale)

    global_scale = np.sqrt(10) * np.abs(rng.standard_cauchy()) * tau_ratio
    slab_var = (1.0 / rng.gamma(shape=a, scale=1.0 / b, size=H))[:, None]  # (H,1)
    local_var = np.abs(rng.standard_t(df=3, size=(H, n_inputs))) ** 2      # (H,P)
    allocation = rng.dirichlet(concentration, size=H)                      # (H,P)

    # Shrunk local variance, floored away from zero
    shrunk_var = (slab_var * local_var) / (slab_var + local_var * (global_scale**2))
    shrunk_var = np.maximum(shrunk_var, 1e-12)

    noise = rng.normal(0.0, 1.0, size=(n_inputs, H))
    weight_std = global_scale * np.sqrt(shrunk_var.T) * np.sqrt(allocation.T)  # (P,H)
    W1 = noise * weight_std

    return np.tanh(Z @ W1), W1

def sample_hidden_features_BHS(
    Z,
    rng,
    H,
    p_0=3,
    a=2.0,
    b=2.0,
    alpha_scale=0.1,
):
    """
    Draw first-layer weights from a horseshoe-style scale mixture whose
    per-weight allocation φ_hj is an independent Beta(α, (P−1)α) draw, and
    return the resulting tanh hidden features of the input Z.

    Beta(α, (P−1)α) is the marginal law of one coordinate of a symmetric
    P-dimensional Dirichlet(α), so φ is drawn with shape (H, P): one value
    per (hidden unit, input) pair.  The previous ``size=H`` draw produced a
    single φ per hidden unit that was broadcast over all of its P inputs
    (its ``.T`` was a no-op on a 1-D array).
    NOTE(review): confirm that per-weight Beta marginals are the intended prior.

    Scales are drawn in this order (which fixes the random stream):
      τ    = sqrt(10) · |Cauchy(0,1)| · p_0 / (P − p_0)   one global scale
      c²_h = 1 / Gamma(shape=a, scale=1/b)                 one per hidden unit
      λ_hj = |Cauchy(0,1)|                                 one per (unit, input)
      φ_hj ~ Beta(alpha_scale, (P−1)·alpha_scale)          one per (unit, input)
    Each weight is an N(0,1) draw times τ · sqrt(λ̃²_hj) · sqrt(φ_hj), where
      λ̃²_hj = max(c²_h λ²_hj / (c²_h + τ² λ²_hj), 1e-12).

    Parameters
    ----------
    Z           : 2-D array (n × P) of inputs
    rng         : random generator providing standard_cauchy, gamma, beta
                  and normal
    H           : number of hidden units
    p_0         : enters only through the global-scale factor p_0 / (P − p_0)
    a, b        : shape and rate of the Gamma draw whose reciprocal is c²
    alpha_scale : first Beta parameter α (the second is (P−1)·α)

    Returns
    -------
    (post_acts, W1): post_acts = tanh(Z @ W1) of shape (n × H), W1 of shape (P × H)
    """
    n, P = Z.shape
    tau0 = p_0 / (P - p_0)

    tau = np.sqrt(10)*np.abs(rng.standard_cauchy()) * tau0
    c_sq = 1.0 / rng.gamma(shape=a, scale=1.0 / b, size=H)
    lambda_data = np.abs(rng.standard_cauchy(size=(H, P)))
    # one allocation per (hidden unit, input), matching lambda_data's shape
    phi_data = rng.beta(a=alpha_scale, b=(P - 1) * alpha_scale, size=(H, P))

    lam_sq = lambda_data**2
    denom = c_sq[:, None] + lam_sq * (tau**2)
    lambda_tilde = (c_sq[:, None] * lam_sq) / denom   # shrunk local variance
    lambda_tilde = np.maximum(lambda_tilde, 1e-12)

    W1_raw = rng.normal(0.0, 1.0, size=(P, H))
    stddev = tau * np.sqrt(lambda_tilde.T) * np.sqrt(phi_data.T)  # (P,H)
    W1 = W1_raw * stddev

    post_acts = np.tanh(Z @ W1)
    return post_acts, W1

def sample_hidden_features_BST(
    Z,
    rng,
    H,
    p_0=3,
    a=2.0,
    b=2.0,
    alpha_scale=0.1,
):
    """
    Draw first-layer weights from a scale mixture with |Student-t(df=3)| local
    scales and independent Beta(α, (P−1)α) per-weight allocations φ_hj, and
    return the resulting tanh hidden features of the input Z.

    Beta(α, (P−1)α) is the marginal law of one coordinate of a symmetric
    P-dimensional Dirichlet(α), so φ is drawn with shape (H, P): one value
    per (hidden unit, input) pair.  The previous ``size=H`` draw produced a
    single φ per hidden unit that was broadcast over all of its P inputs
    (its ``.T`` was a no-op on a 1-D array).
    NOTE(review): confirm that per-weight Beta marginals are the intended prior.

    Scales are drawn in this order (which fixes the random stream):
      τ    = sqrt(10) · |Cauchy(0,1)| · p_0 / (P − p_0)   one global scale
      c²_h = 1 / Gamma(shape=a, scale=1/b)                 one per hidden unit
      λ_hj = |t_3|                                         one per (unit, input)
      φ_hj ~ Beta(alpha_scale, (P−1)·alpha_scale)          one per (unit, input)
    Each weight is an N(0,1) draw times τ · sqrt(λ̃²_hj) · sqrt(φ_hj), where
      λ̃²_hj = max(c²_h λ²_hj / (c²_h + τ² λ²_hj), 1e-12).

    Parameters
    ----------
    Z           : 2-D array (n × P) of inputs
    rng         : random generator providing standard_cauchy, gamma,
                  standard_t, beta and normal
    H           : number of hidden units
    p_0         : enters only through the global-scale factor p_0 / (P − p_0)
    a, b        : shape and rate of the Gamma draw whose reciprocal is c²
    alpha_scale : first Beta parameter α (the second is (P−1)·α)

    Returns
    -------
    (post_acts, W1): post_acts = tanh(Z @ W1) of shape (n × H), W1 of shape (P × H)
    """
    n, P = Z.shape
    tau0 = p_0 / (P - p_0)

    tau = np.sqrt(10)*np.abs(rng.standard_cauchy()) * tau0
    c_sq = 1.0 / rng.gamma(shape=a, scale=1.0 / b, size=H)
    lambda_data = np.abs(rng.standard_t(df=3, size=(H, P)))
    # one allocation per (hidden unit, input), matching lambda_data's shape
    phi_data = rng.beta(a=alpha_scale, b=(P - 1) * alpha_scale, size=(H, P))

    lam_sq = lambda_data**2
    denom = c_sq[:, None] + lam_sq * (tau**2)
    lambda_tilde = (c_sq[:, None] * lam_sq) / denom   # shrunk local variance
    lambda_tilde = np.maximum(lambda_tilde, 1e-12)

    W1_raw = rng.normal(0.0, 1.0, size=(P, H))
    stddev = tau * np.sqrt(lambda_tilde.T) * np.sqrt(phi_data.T)  # (P,H)
    W1 = W1_raw * stddev

    post_acts = np.tanh(Z @ W1)
    return post_acts, W1


In [5]:
def plot_risk_curve_hidden_units_alternative(
    n=400,
    gammas=(0.7, 0.9, 1.2, 1.5, 2, 3, 5, 8, 12, 20),
    model="DHS",
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    reps=50,
    risk_mc_samples=1000,
    seed=0,
    # prior hyperparams
    p_0=3,
    a=2.0,
    b=2.0,
    alpha_scale=0.5,
    # schedule H as a function of (p, n)
    H_of_p=lambda p, n: p,   # <- default ties H to p (so H grows with γ)
):
    """
    Monte Carlo risk curve for min-norm regression on random tanh features of
    the LATENT input Z (d-dimensional) rather than the observed features X.
    Despite the name, nothing is plotted: the curve is returned as arrays.

    For every γ in ``gammas`` the observed dimension is p = max(1, round(γ n))
    and the hidden width is H = max(1, int(H_of_p(p, n))).  Each of the
    ``reps`` repetitions draws a data set, samples one first-layer matrix W1
    (d × H) from the selected prior, fits the second layer by minimum-norm
    least squares and estimates the population risk E[(f̂(z) − zᵀθ)²] from
    ``risk_mc_samples`` fresh draws z ~ N(0, I_d) pushed through the SAME W1.

    Parameters
    ----------
    n                : training sample size
    gammas           : iterable of aspect ratios γ = p/n
    model            : first-layer prior, one of "gauss", "RHS", "DHS",
                       "DST", "BHS", "BST"
    d, r_theta, sigma_xi : forwarded to ``make_latent_data_sec54``
    reps             : repetitions per γ
    risk_mc_samples  : Monte Carlo sample size for the population risk
    seed             : seed of the single generator shared by all draws
    p_0, a, b        : forwarded to every sampler except "gauss"
    alpha_scale      : forwarded to the DHS/DST/BHS/BST samplers
    H_of_p           : callable (p, n) -> hidden width

    Returns
    -------
    (G, M, S): arrays of the γ values, the mean risk and the sample standard
    deviation (ddof=1) of the risk across repetitions.

    Raises
    ------
    ValueError if ``model`` is not one of the supported names (previously any
    unrecognised name silently ran the BST prior).
    """
    # Resolve the sampler once, before any work is done, so that a misspelled
    # model name fails immediately instead of producing a mislabeled curve.
    if model == "gauss":
        sampler, prior_kwargs = sample_hidden_features_gauss, {}
    elif model == "RHS":
        sampler, prior_kwargs = sample_hidden_features_RHS, dict(p_0=p_0, a=a, b=b)
    elif model in ("DHS", "DST", "BHS", "BST"):
        if model == "DHS":
            sampler = sample_hidden_features_DHS
        elif model == "DST":
            sampler = sample_hidden_features_DST
        elif model == "BHS":
            sampler = sample_hidden_features_BHS
        else:
            sampler = sample_hidden_features_BST
        prior_kwargs = dict(p_0=p_0, a=a, b=b, alpha_scale=alpha_scale)
    else:
        raise ValueError(
            f"Unknown model {model!r}; expected one of "
            "'gauss', 'RHS', 'DHS', 'DST', 'BHS', 'BST'."
        )

    rng = np.random.default_rng(seed)
    G, M, S = [], [], []

    for gamma in gammas:
        p = max(1, int(round(gamma * n)))
        H = max(1, int(H_of_p(p, n)))
        risks = []

        for _ in range(reps):
            _X, Z, y, _W, theta, _beta_true, _Sigma = make_latent_data_sec54(
                n=n, p=p, d=d, r_theta=r_theta, sigma_xi=sigma_xi, rng=rng
            )

            # One sampled hidden map W1 per repetition; reused for the population risk
            post_acts, W1 = sampler(Z, rng, H=H, **prior_kwargs)

            # Min-norm fit of the second layer on the hidden units
            w_hat = fit_min_norm(post_acts, y)
            if w_hat.ndim > 1 and w_hat.shape[1] == 1:
                w_hat = w_hat.ravel()

            # Monte Carlo population risk with the SAME W1, measured against zᵀθ
            Z_pop = rng.multivariate_normal(mean=np.zeros(d), cov=np.eye(d), size=risk_mc_samples)
            y_true_pop = Z_pop @ theta
            y_pred_pop = np.tanh(Z_pop @ W1) @ w_hat
            risks.append(float(np.mean((y_pred_pop - y_true_pop) ** 2)))

        G.append(gamma)
        M.append(np.mean(risks))
        S.append(np.std(risks, ddof=1))

    return np.array(G), np.array(M), np.array(S)



In [6]:

# Latent-input experiment, Gaussian first-layer prior.
G_h_gauss, M_h_gauss, S_h_gauss = plot_risk_curve_hidden_units_alternative(
    model="gauss",
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.2, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100],
    reps=100,
    risk_mc_samples=1000,
    seed=123,
    H_of_p=lambda p, n: p,  # hidden width follows p (and therefore γ)
)



In [7]:
# Latent-input experiment, RHS first-layer prior.
G_h_RHS, M_h_RHS, S_h_RHS = plot_risk_curve_hidden_units_alternative(
    model="RHS",
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.2, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100],
    reps=100,
    risk_mc_samples=1000,
    seed=123,
    H_of_p=lambda p, n: p,  # hidden width follows p (and therefore γ)
)

In [8]:
# Latent-input experiment, DHS first-layer prior.
G_h_DHS, M_h_DHS, S_h_DHS = plot_risk_curve_hidden_units_alternative(
    model="DHS",
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.2, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100],
    reps=100,
    risk_mc_samples=1000,
    seed=123,
    H_of_p=lambda p, n: p,  # hidden width follows p (and therefore γ)
)

In [9]:
# Latent-input experiment, DST first-layer prior.
G_h_DST, M_h_DST, S_h_DST = plot_risk_curve_hidden_units_alternative(
    model="DST",
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.2, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100],
    reps=100,
    risk_mc_samples=1000,
    seed=123,
    H_of_p=lambda p, n: p,  # hidden width follows p (and therefore γ)
)

In [10]:
# Latent-input experiment, BHS first-layer prior.
G_h_BHS, M_h_BHS, S_h_BHS = plot_risk_curve_hidden_units_alternative(
    model="BHS",
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.2, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100],
    reps=100,
    risk_mc_samples=1000,
    seed=123,
    H_of_p=lambda p, n: p,  # hidden width follows p (and therefore γ)
)

In [11]:
# Latent-input experiment, BST first-layer prior.
G_h_BST, M_h_BST, S_h_BST = plot_risk_curve_hidden_units_alternative(
    model="BST",
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.2, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100],
    reps=100,
    risk_mc_samples=1000,
    seed=123,
    H_of_p=lambda p, n: p,  # hidden width follows p (and therefore γ)
)

In [12]:

def plot_risk_curve_sec54(
    n=400,
    gammas=(0.7, 0.9, 1.2, 1.5, 2, 3, 5, 8, 12, 20),
    d=20,
    r_theta=1.0,          # "r = 1" in the captions
    sigma_xi=0.0,         # use 0 for Fig. 5 behavior; try 0, 0.25, 0.5 like Fig. 6
    reps=50,
    seed=0
):
    """
    Simulates the latent-space risk curve of §5.4 (Figs. 5–6).  Despite the
    name, nothing is plotted: the curve is returned as arrays.

    For each γ = p/n (with p = max(1, round(γ n))), ``reps`` data sets (X, y)
    are simulated, the min-norm β̂ is fitted and the population risk
    R_X = (β̂−β)^T Σ (β̂−β) is computed.
    Expectation from §5.4: spike near γ≈1 and then *monotone decrease* for γ>1,
    reaching a global minimum as γ→∞ when β aligns with top eigenspace of Σ.

    Parameters
    ----------
    n       : training sample size
    gammas  : iterable of aspect ratios γ = p/n
    d, r_theta, sigma_xi : forwarded to ``make_latent_data_sec54``
    reps    : repetitions per γ
    seed    : seed of the generator shared by all draws

    Returns
    -------
    (G, M, S): arrays of the γ values, the mean risk and the sample standard
    deviation (ddof=1) of the risk across repetitions.  A 95% normal
    interval for the mean can be formed by the caller as
    M ± 1.96 · S / sqrt(reps).
    """
    rng = np.random.default_rng(seed)
    G, M, S = [], [], []

    for gamma in gammas:
        p = max(1, int(round(gamma * n)))
        risks = []

        for _ in range(reps):
            X, _Z, y, _W, _theta, beta_true, Sigma = make_latent_data_sec54(
                n=n, p=p, d=d, r_theta=r_theta, sigma_xi=sigma_xi, rng=rng
            )
            diff = fit_min_norm(X, y) - beta_true
            risks.append(float(diff @ (Sigma @ diff)))

        G.append(gamma)
        M.append(np.mean(risks))
        S.append(np.std(risks, ddof=1))

    # The previously computed (and discarded) confidence half-width was dead
    # code and has been removed.
    return np.array(G), np.array(M), np.array(S)

# Baseline on the extended γ grid: min-norm regression directly on X.
G, M, S = plot_risk_curve_sec54(
    n=100,
    d=20,
    r_theta=1.0,
    sigma_xi=0.0,
    gammas=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.2, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100],
    reps=100,
)


In [13]:
import numpy as np
from math import isfinite
from typing import Iterable, Tuple, Dict, Any

def _c0_closed_form(gamma, psi):
    """
    Closed-form solution of Eq. (35) for c0 in Corollary 4.
    Parameters assume gamma > 1 and 0 < psi < 1 (as in the latent model).
    """
    A = float(gamma)
    L = 1.0 - 1.0/A                  # LHS constant
    t = 1.0 + 1.0/psi                # 1 + psi^{-1}

    # Quadratic in y = c0:
    # (L t A^2) y^2 + [A (L(1+t) - ((1-psi) t + psi))] y + (L - 1) = 0
    Acoef = L * t * A**2
    Bcoef = A * (L*(1.0 + t) - ((1.0 - psi)*t + psi))
    Ccoef = L - 1.0                  # = -1/A

    disc = Bcoef*Bcoef - 4.0*Acoef*Ccoef
    if disc < 0:
        # tiny negative from FP roundoff
        disc = 0.0
    # unique nonnegative root
    return (-Bcoef + np.sqrt(disc)) / (2.0*Acoef)

def corollary4_continuous(
    gammas,
    n,
    d,
    r_theta=1.0,
    sigma_xi=0.0,
):
    """
    Analytic test risk of the min-norm estimator in the latent-space model (Sec. 5.4).

    For γ<1, uses the underparametrized variance formula R = σ^2 * γ/(1-γ) (Prop. 2).
    For γ>1, uses Corollary 4 with the closed-form c0.
    For γ=1 (interpolation threshold) the quadratic for c0 degenerates
    (c0 → ∞); the limiting values are returned directly: bias 0 and
    variance +inf whenever σ^2 > 0 (0 if σ^2 == 0).

    Inputs
    ------
    gammas : 1-D array-like of γ = p/n (all strictly positive)
    n      : sample size
    d      : latent dimension
    r_theta: ||θ||
    sigma_xi: σ_ξ

    Returns
    -------
    dict with keys 'gamma', 'risk', 'bias', 'var'; each value is a float
    np.ndarray with the same shape as `gammas`.

    Raises
    ------
    ValueError if any γ is <= 0.
    """
    g = np.asarray(gammas, dtype=float)
    risk = np.empty_like(g)
    bias = np.empty_like(g)
    var = np.empty_like(g)

    for i, gamma in enumerate(g):
        if gamma <= 0:
            raise ValueError("All gamma must be > 0.")

        # finite-sample plug-in for ψ = d/p with p = γ n
        psi = d / (gamma * n)
        # σ^2 = σ_ξ^2 + ψ r_θ^2 / (1+ψ)  (Eq. (27) and Cor. 4 text)
        sigma2 = sigma_xi**2 + psi * (r_theta**2) / (1.0 + psi)

        if gamma < 1.0:
            # Underparametrized: pure variance (Prop. 2)
            bias[i] = 0.0
            var[i]  = sigma2 * gamma / (1.0 - gamma)
            risk[i] = var[i]
        elif gamma == 1.0:
            # Interpolation threshold. The quadratic for c0 loses its leading
            # term here, so evaluating the γ>1 formulas would divide by zero
            # and produce NaN. Both one-sided limits agree: the bias term
            # vanishes (it scales like 1/c0) and the variance diverges
            # (it scales like c0) unless σ^2 is exactly zero.
            bias[i] = 0.0
            var[i]  = np.inf if sigma2 > 0.0 else 0.0
            risk[i] = var[i]
        else:
            # Overparametrized: Cor. 4 Eqs. (30)–(35)
            c0 = _c0_closed_form(gamma, psi)
            t  = 1.0 + 1.0/psi
            d1 = (1.0 + c0 * gamma)**2
            d2 = (1.0 + c0 * t     * gamma)**2

            # Eqs. (33)–(34)
            E1 = (1.0 - psi)/d1 + psi*(t**2)/d2
            E2 = (1.0 - psi)/d1 + (1.0 + psi)/d2

            # Eqs. (31)–(32)
            bias_i = (1.0 + gamma * c0 * (E1/E2)) * (r_theta**2) / ((1.0 + psi) * d2)
            var_i  = sigma2 * gamma * c0 * (E1/E2)

            # store
            bias[i] = max(bias_i, 0.0)  # clip tiny negatives from FP
            var[i]  = max(var_i,  0.0)
            risk[i] = bias[i] + var[i]

    return {"gamma": g, "risk": risk, "bias": bias, "var": var}


# Log-spaced gamma grids on each side of the interpolation threshold; the
# interval (0.9, 1.2) around gamma = 1 is left out on purpose.
g_under = np.geomspace(0.1, 0.9, 200)
g_over  = np.geomspace(1.2, 100.0, 400)
g_all   = np.concatenate([g_under, g_over])

# Analytic risk on the combined grid, using the same n, d, r_theta and
# sigma_xi as the Monte-Carlo example call earlier in the notebook.
out = corollary4_continuous(
    gammas=g_all, n=100, d=20, r_theta=1.0, sigma_xi=0.0
)

In [None]:
import matplotlib.pyplot as plt
# Half-width of the 95% normal-approximation confidence interval for the
# Monte-Carlo mean risk. `reps` must equal the number of replications that
# produced S; the frequentist simulation above was run with reps=100.
reps = 100
ci = 1.96 * S / np.sqrt(reps)

gamma = out["gamma"]
risk  = out["risk"]

# Split the analytic curve into the under- and over-parametrized segments
mask_left  = gamma <= 0.9
mask_right = gamma >= 1.2

gamma_left,  risk_left  = gamma[mask_left],  risk[mask_left]
gamma_right, risk_right = gamma[mask_right], risk[mask_right]

plt.figure(figsize=(6.4, 4.4))
# Simulated curves (G_h_* / M_h_* are produced by earlier cells of the notebook)
plt.loglog(G_h_gauss, M_h_gauss, marker="o", linewidth=2, label="Gauss")
plt.loglog(G_h_RHS,   M_h_RHS,   marker="o", linewidth=2, label="RHS")
plt.loglog(G_h_DHS,   M_h_DHS,   marker="o", linewidth=2, label="DHS")
plt.loglog(G_h_DST,   M_h_DST,   marker="o", linewidth=2, label="DST")
plt.loglog(G_h_BHS,   M_h_BHS,   marker="o", linewidth=2, label="BHS")
plt.loglog(G_h_BST,   M_h_BST,   marker="o", linewidth=2, label="BST")
plt.loglog(G,         M,         marker="o", linewidth=2, label="Frequentist")

# Draw the analytic curve as two separate segments so that no line is drawn
# across the divergence at gamma = 1; only the first segment gets a legend entry.
plt.loglog(gamma_left,  risk_left,  linewidth=2, label="Analytic", color="magenta")
plt.loglog(gamma_right, risk_right, linewidth=2, color="magenta")

# Shade the gap around the threshold that the analytic grid does not cover
plt.axvspan(0.9, 1.2, color="grey", alpha=0.15)

plt.axvline(1.0, linestyle="--", linewidth=1)
plt.xlabel(r"Aspect ratio  $\gamma = p/n$")
plt.ylabel(r"Population risk  $R_X=(\hat\beta-\beta)^\top \Sigma (\hat\beta-\beta)$")
plt.title(r"Latent space (§5.4) — min-norm risk vs $\gamma$  (n=100, d=20, r=1, $\sigma_\xi=0$)")
plt.tight_layout()
plt.legend()
plt.show()


In [15]:
# Draw a single dataset from the latent-factor model with a fixed seed
# (n=100 samples, p=200 observed features, d=5 latent factors).
# sigma_xi is not passed here, so whatever default the generator defines applies.
rng = np.random.default_rng(1)
X, Z, y, W, theta, beta_true, Sigma = make_latent_data_sec54(
    n=100, p=200, d=5, r_theta=1, rng=rng
)

In [16]:
# Number of hidden units requested from every sampler below.
H_units = 1000

# Each sampler is fed the latent factors Z (not the observed X) and the
# shared generator `rng`, so the six draws consume the random stream in this
# order. Presumably each returns (post-activations, first-layer weights) —
# confirm against the sampler definitions.
# NOTE: W1 is overwritten by every call; after this cell only the BST
# weights remain bound to W1.
post_acts_gauss, W1 = sample_hidden_features_gauss(
    Z, rng, H=H_units
)

post_acts_RHS, W1 = sample_hidden_features_RHS(
    Z, rng, H=H_units
)

post_acts_DHS, W1 = sample_hidden_features_DHS(
    Z, rng, H=H_units
)

post_acts_DST, W1 = sample_hidden_features_DST(
    Z, rng, H=H_units
)

post_acts_BHS, W1 = sample_hidden_features_BHS(
    Z, rng, H=H_units
)

post_acts_BST, W1 = sample_hidden_features_BST(
    Z, rng, H=H_units
)

In [17]:
import numpy as np

def effective_rank(A, tol=1e-12):
    """
    Compute the effective rank of a matrix A, as defined in
    Roy & Vetterli (2007), "The Effective Rank: A Measure of Effective Dimensionality".

    erank(A) = exp( H(p) ),  where
        p_k = σ_k / sum(σ_i)
        H(p) = -sum_k p_k * log(p_k)

    Parameters
    ----------
    A : ndarray
        Input matrix (M×N).
    tol : float, optional
        Absolute threshold; singular values <= tol are ignored (default 1e-12).

    Returns
    -------
    erank : float
        The effective rank of A. It is 0.0 when no singular value exceeds
        `tol`, and exactly 1.0 when a single singular value survives.
    """
    # Singular values, keeping only those above the threshold
    s = np.linalg.svd(A, compute_uv=False)
    s = s[s > tol]
    if s.size == 0:
        return 0.0

    # Normalised singular-value distribution
    p = s / np.sum(s)
    # Guard against a weight underflowing to exactly 0 during normalisation;
    # such a term contributes 0 to the entropy (0 * log 0 := 0).
    p = p[p > 0]

    # Shannon entropy. All remaining p_k are strictly positive, so no epsilon
    # is added inside the log: an epsilon would bias the entropy and make a
    # rank-1 matrix report an effective rank slightly below 1.
    H = -np.sum(p * np.log(p))

    # Effective rank, always returned as a plain Python float
    return float(np.exp(H))


In [18]:
import numpy as np

def compute_S(Phi, center=True):
    """
    Return Phi^T Phi / n for a feature matrix whose rows are samples.

    Phi is converted with np.asarray. When `center` is true the column means
    are subtracted first. The divisor n is the number of rows, floored at 1
    so an empty input does not divide by zero.
    """
    feats = np.asarray(Phi)
    n_rows = feats.shape[0]
    centered = feats - feats.mean(axis=0, keepdims=True) if center else feats
    return (centered.T @ centered) / max(n_rows, 1)

def spectral_stats(Phi, k_list=(1,3,5,10), tol=1e-12):
    """
    Spectral summary statistics of the (centered) feature covariance of Phi.

    Returns
    -------
    stats : dict with
        "erank"     effective_rank applied to S = compute_S(Phi)
        "CEV@k"     sum of the k largest eigenvalues of S over their total
        "kappa@k"   largest eigenvalue over the k-th largest
        "mu"        largest absolute normalised inner product between two
                    distinct columns of the raw (uncentered) Phi
    evals : eigenvalues of S in descending order

    Each k is capped at the number of eigenvalues, and the capped value is
    the one that appears in the dictionary key. `tol` floors the
    denominators (eigenvalue total, k-th eigenvalue, squared column norms).
    """
    cov = compute_S(Phi)
    # eigvalsh yields eigenvalues only; sort and flip to get descending order
    evals = np.sort(np.linalg.eigvalsh(cov))[::-1]
    total = max(evals.sum(), tol)

    stats = {"erank": effective_rank(cov)}

    # Cumulative explained variance and leading condition number per k
    for k in k_list:
        k = min(k, len(evals))
        stats[f"CEV@{k}"] = float(evals[:k].sum() / total)
        stats[f"kappa@{k}"] = float(evals[0] / max(evals[k-1], tol))

    # Mutual coherence: Gram matrix of the columns, normalised by column norms,
    # with the trivial self-correlations on the diagonal zeroed out.
    gram = Phi.T @ Phi
    col_norms = np.sqrt(np.clip(np.diag(gram), tol, None))
    coherence = (gram / col_norms[:, None]) / col_norms[None, :]
    np.fill_diagonal(coherence, 0.0)
    stats["mu"] = float(np.abs(coherence).max())

    return stats, evals

# --- Example aggregation across priors ---
def summarize_priors(post_acts_dict, k_list=(1,3,5,10)):
    """
    Run spectral_stats on every entry of a {name: Phi} mapping.

    Returns a list with one dict per entry, in the mapping's iteration order:
    the key "prior" holds the entry's name, followed by the statistics
    returned by spectral_stats. The eigenvalues are discarded.
    """
    return [
        {"prior": label, **spectral_stats(feats, k_list=k_list)[0]}
        for label, feats in post_acts_dict.items()
    ]


In [None]:
# Collect the six post-activation matrices under the labels used in the plots,
# then print one spectral-statistics row (a dict) per prior.
post_acts = {
    "Gauss": post_acts_gauss,
    "RHS":   post_acts_RHS,
    "DHS":   post_acts_DHS,
    "DST":   post_acts_DST,
    "BHS":   post_acts_BHS,
    "BST":   post_acts_BST,
}
rows = summarize_priors(post_acts, k_list=(1,3,5,10))
for r in rows: print(r)


In [None]:
import numpy as np
import pandas as pd

def summarize_anisotropy(Phi: np.ndarray, center: bool = True, k_leading: int = 5) -> pd.DataFrame:
    """
    Anisotropy metrics of the feature covariance S = compute_S(Phi, center).

    Phi: post-activation matrix with samples in rows.
    center: forwarded to compute_S (column-centering before forming S).
    k_leading: k used for kappa_k; capped at the dimension of S.

    The eigenvalues of the symmetrized S are floored at 1e-12 before any
    metric is computed. The result is a DataFrame indexed by 'metric' with a
    single column 'value' and the rows, in order:
      - kappa_k : largest eigenvalue divided by the k-th largest
      - CV      : standard deviation of the eigenvalues divided by their mean
      - r_eff   : exp of the Shannon entropy of the normalised eigenvalues
      - mu_mean : mean eigenvalue
    """
    floor = 1e-12

    cov = compute_S(Phi, center=center)
    cov = 0.5 * (cov + cov.T)  # enforce exact symmetry before eigvalsh
    dim = cov.shape[0]

    # Floor the eigenvalues, then order them from largest to smallest
    lam = np.sort(np.clip(np.linalg.eigvalsh(cov), floor, None))[::-1]

    mean_eig = float(lam.mean())
    coeff_var = float(lam.std() / mean_eig)

    # Entropy-based effective rank of the normalised spectrum
    weights = lam / lam.sum()
    eff_rank = float(np.exp(-np.sum(weights * np.log(weights))))

    # Ratio of the first to the k-th eigenvalue
    last = min(k_leading, dim) - 1
    cond_k = float(lam[0] / lam[last])

    metrics = {
        "kappa_k": cond_k,
        "CV": coeff_var,
        "r_eff": eff_rank,
        "mu_mean": mean_eig,
    }
    table = pd.DataFrame(
        [{"metric": label, "value": val} for label, val in metrics.items()]
    )
    return table.set_index("metric")


# Anisotropy tables for the six priors, with the defaults center=True and
# k_leading=5, printed in the order Gauss, RHS, DHS, DST, BHS, BST.
# The printed tables carry no label, so the order is the only identifier.
summary_gauss = summarize_anisotropy(post_acts_gauss)
summary_RHS = summarize_anisotropy(post_acts_RHS)
summary_DHS = summarize_anisotropy(post_acts_DHS)
summary_DST = summarize_anisotropy(post_acts_DST)
summary_BHS = summarize_anisotropy(post_acts_BHS)
summary_BST = summarize_anisotropy(post_acts_BST)
print(summary_gauss)
print(summary_RHS)
print(summary_DHS)
print(summary_DST)
print(summary_BHS)
print(summary_BST)


In [23]:
# Centered feature covariance matrices (compute_S defaults) for each prior.
S_gauss = compute_S(post_acts_gauss)
S_RHS = compute_S(post_acts_RHS)
S_DHS = compute_S(post_acts_DHS)
S_DST = compute_S(post_acts_DST)
S_BHS = compute_S(post_acts_BHS)
S_BST = compute_S(post_acts_BST)

In [None]:
import seaborn as sns
# 3x2 grid of heatmaps, one per prior, sharing both axes.
fig, axes = plt.subplots(3, 2, figsize=(12, 12), sharex=True, sharey=True)

# NOTE(review): np.corrcoef is applied to the covariance matrices S_* (with
# rowvar=False the rows of S are treated as observations and its columns as
# variables), not to the post-activation matrices. If the intent is the
# feature correlation matrix, this should probably be
# np.corrcoef(post_acts_*, rowvar=False) — confirm.
corr_gauss = np.corrcoef(S_gauss, rowvar=False)
corr_RHS   = np.corrcoef(S_RHS,   rowvar=False)
corr_DHS   = np.corrcoef(S_DHS,   rowvar=False)
corr_DST   = np.corrcoef(S_DST,   rowvar=False)
corr_BHS   = np.corrcoef(S_BHS,   rowvar=False)
corr_BST   = np.corrcoef(S_BST,   rowvar=False)

# Fixed colour range [-1, 1] so the six panels are directly comparable.
sns.heatmap(corr_gauss, ax=axes[0, 0], vmin=-1, vmax=1, cmap='coolwarm')
sns.heatmap(corr_RHS,   ax=axes[0, 1], vmin=-1, vmax=1, cmap='coolwarm')
sns.heatmap(corr_DHS,   ax=axes[1, 0], vmin=-1, vmax=1, cmap='coolwarm')
sns.heatmap(corr_DST,   ax=axes[1, 1], vmin=-1, vmax=1, cmap='coolwarm')
sns.heatmap(corr_BHS,   ax=axes[2, 0], vmin=-1, vmax=1, cmap='coolwarm')
sns.heatmap(corr_BST,   ax=axes[2, 1], vmin=-1, vmax=1, cmap='coolwarm')

axes[0, 0].set_title("Gauss")
axes[0, 1].set_title("RHS")
axes[1, 0].set_title("DHS")
axes[1, 1].set_title("DST")
axes[2, 0].set_title("BHS")
axes[2, 1].set_title("BST")

# Same axis labels on every panel; grid lines are switched off.
for ax in axes.flat:
    ax.set_xlabel("$\\Phi$")
    ax.set_ylabel("$\\Phi$")
    ax.grid(False)

plt.tight_layout()
plt.show()


In [None]:
# Eigenvalues of the corr_* matrices built in the heatmap cell.
# NOTE(review): the plot title below says "feature covariances", but the
# inputs here are the corr_* matrices, not the S_* covariances — confirm
# which one is intended.
eig_gauss = np.linalg.eigvalsh(corr_gauss)
eig_RHS   = np.linalg.eigvalsh(corr_RHS)
eig_DHS   = np.linalg.eigvalsh(corr_DHS)
eig_DST   = np.linalg.eigvalsh(corr_DST)
eig_BHS   = np.linalg.eigvalsh(corr_BHS)
eig_BST   = np.linalg.eigvalsh(corr_BST)

# eigvalsh returns ascending eigenvalues; [::-1] plots them largest first.
plt.figure(figsize=(10,6))
plt.plot(eig_gauss[::-1], label="Gauss")
plt.plot(eig_RHS[::-1],   label="RHS")
plt.plot(eig_DHS[::-1],   label="DHS")
plt.plot(eig_DST[::-1],   label="DST")
plt.plot(eig_BHS[::-1],   label="BHS")
plt.plot(eig_BST[::-1],   label="BST")

# Log y-axis: non-positive eigenvalues (possible from round-off) cannot be shown.
plt.yscale("log")
plt.title("Eigenvalue spectra of feature covariances")
plt.xlabel("Index (sorted)")
plt.ylabel("Eigenvalue")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
def frob(A, B):
    """Return the Frobenius norm of the difference A - B of two matrices."""
    gap = A - B
    return np.linalg.norm(gap, ord="fro")

# Frobenius distance of each prior's corr_* matrix (from the heatmap cell)
# to the Gaussian one, which serves as the reference.
print("Frobenius distances from Gaussian:")
print("RHS:", frob(corr_gauss, corr_RHS))
print("DHS:", frob(corr_gauss, corr_DHS))
print("DST:", frob(corr_gauss, corr_DST))
print("BHS:", frob(corr_gauss, corr_BHS))
print("BST:", frob(corr_gauss, corr_BST))


In [None]:
def offdiag_energy(C):
    """
    Return the Frobenius norm of C - I for a square matrix C.

    When C has a unit diagonal (e.g. a correlation matrix) this equals the
    Frobenius norm of its off-diagonal entries.
    """
    identity = np.eye(C.shape[0])
    return np.linalg.norm(C - identity, ord="fro")

# Frobenius norm of (corr - I) for each prior's corr_* matrix.
print("Off-diagonal energy:")
print("Gauss:", offdiag_energy(corr_gauss))
print("RHS:",   offdiag_energy(corr_RHS))
print("DHS:",   offdiag_energy(corr_DHS))
print("DST:",   offdiag_energy(corr_DST))
print("BHS:",   offdiag_energy(corr_BHS))
print("BST:",   offdiag_energy(corr_BST))


## Some more theoretical stuff

In [None]:
# Effective rank computed directly on the post-activation matrices (their
# singular values), not on the covariance S as spectral_stats does.
# NOTE(review): BHS and BST are not included here — confirm whether that is intended.
print("Gauss:", effective_rank(post_acts_gauss), "\n", "RHS:", effective_rank(post_acts_RHS), "\n", "DHS:", effective_rank(post_acts_DHS), "\n", "DST:", effective_rank(post_acts_DST))

In [None]:
import matplotlib.pyplot as plt
# Overlaid histograms of column 0 of four post-activation matrices.
# NOTE(review): the histograms are drawn opaque and without labels or a
# legend, so later ones can hide earlier ones; BHS and BST are not plotted.
plt.figure()
plt.hist(post_acts_gauss[:, 0])
plt.hist(post_acts_RHS[:, 0])
plt.hist(post_acts_DHS[:, 0])
plt.hist(post_acts_DST[:, 0])
plt.show()

In [15]:
import numpy as np

def compute_S(Phi, center=True, scale_by_n=True):
    """
    Gram / covariance matrix of the feature columns of Phi.

    Phi: (n, H) matrix of post-activations (converted with np.asarray).
    center: subtract the column means before forming the product.
    scale_by_n: divide Phi^T Phi by the number of rows n (floored at 1);
                when False the unscaled product Phi^T Phi is returned.
    """
    feats = np.asarray(Phi)
    n_rows = feats.shape[0]
    if center:
        feats = feats - feats.mean(axis=0, keepdims=True)
    gram = feats.T @ feats
    return gram / max(n_rows, 1) if scale_by_n else gram

def alignment_scores(Phi, y, k_list=(1, 3, 5), ridge=1e-10, center=True):
    """
    Measure how much of y lies in the span of Phi and of its top-k principal directions.

    Parameters
    ----------
    Phi    : (n, H) feature matrix, rows are samples.
    y      : response vector of length n (flattened with ravel).
    k_list : iterable of k values; each is capped at H for the computation,
             but the result is keyed by the k originally requested.
    ridge  : small constant added to the eigenvalues of S before inversion,
             for numerical stability when an eigenvalue is (near) zero.
    center : if True, column-center Phi and center y first.

    Returns
    -------
      - S:     (H,H) feature covariance
      - evals: eigenvalues of S (descending)
      - evecs: eigenvectors (columns) of S, ordered like evals
      - frac_span: fraction of ||y||^2 captured by the span of Phi (projection energy)
      - frac_topk: dict k -> fraction of ||y||^2 captured by the top-k eigenspace of S

    All fractions are 0.0 when ||y||^2 == 0 after centering.

    Method
    ------
      Let S = Phi^T Phi / n, eigendecomp S = U Λ U^T (Λ diag with descending evals).
      Full projection: y_hat = Phi (Phi^+ y)  (min-norm fit in feature space).
      Top-k projection onto span(Phi U_k): since U_k^T Phi^T Phi U_k = n Λ_k,
        y_hat_k = Phi U_k (n Λ_k)^{-1} U_k^T Phi^T y
      (ridge added to Λ_k for numerical stability). The factor n is required:
      without it y_hat_k is n times the projection and the reported fraction
      is inflated by n^2.
    """
    Phi = np.asarray(Phi)
    y = np.asarray(y).ravel()
    if center:
        Phi = Phi - Phi.mean(axis=0, keepdims=True)
        y = y - y.mean()

    n, H = Phi.shape
    n_eff = max(n, 1)

    # S and eigendecomposition (eigh is ascending; reorder to descending)
    S = (Phi.T @ Phi) / n_eff
    evals, evecs = np.linalg.eigh(S)
    idx = np.argsort(evals)[::-1]
    evals, evecs = evals[idx], evecs[:, idx]

    y_energy = float(np.dot(y, y))

    # Full-span projection (min-norm). Multiply pinv by y first so that the
    # n×n projector matrix is never formed.
    y_hat = Phi @ (np.linalg.pinv(Phi) @ y)
    frac_span = float(np.dot(y_hat, y_hat) / y_energy) if y_energy > 0 else 0.0

    # Top-k projections via eigenspace of S
    g = Phi.T @ y                                 # (H,)
    frac_topk = {}
    for k in k_list:
        k_eff = min(k, H)
        Uk = evecs[:, :k_eff]                     # (H, k)
        Lk = evals[:k_eff]                        # (k,)
        # Inverse of U_k^T Phi^T Phi U_k = n Λ_k (ridge-stabilised)
        inv_Lk = 1.0 / (n_eff * (Lk + ridge))
        # y_hat_k = Phi Uk (n Λ_k)^{-1} Uk^T Phi^T y
        y_hat_k = Phi @ ((Uk * inv_Lk) @ (Uk.T @ g))
        frac_topk[k] = float(np.dot(y_hat_k, y_hat_k) / y_energy) if y_energy > 0 else 0.0

    return S, evals, evecs, frac_span, frac_topk


In [None]:
# Regenerate the dataset with the same seed and settings as the earlier
# data-generation cell (n=100, p=200, d=5).
rng = np.random.default_rng(1)
X, Z, y, W, theta, beta_true, Sigma = make_latent_data_sec54(
    n=100, p=200, d=5, r_theta=1, rng=rng
)

# Sample hidden features with the Gaussian sampler (H=100 units) from the latent Z
Phi, W1 = sample_hidden_features_gauss(Z, rng, H=100)

# Build S and test alignment of S to y (defaults: center=True, ridge=1e-10)
S, evals, evecs, frac_span, frac_topk = alignment_scores(Phi, y, k_list=(1,3,5,10))

print("||projection of y onto span(Phi)||^2 / ||y||^2 =", frac_span)
print("Top-k alignment (fraction of ||y||^2):", frac_topk)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Correlation heatmap built from the leading 10x10 block of the most recently
# computed S (whichever alignment cell ran last).
# NOTE(review): np.corrcoef is applied to a block of the covariance matrix S
# (its rows are treated as observations), not to the features themselves —
# confirm this is the intended quantity.
corr = np.corrcoef(S[:10, :10], rowvar=False)

plt.figure()
ax = sns.heatmap(corr, vmin=-1, vmax=1, cmap='coolwarm')

# Number of features shown (note: rebinds the module-level name `n`)
n = corr.shape[0]

# Set tick labels to 1..n, centred on the heatmap cells
ax.set_xticks(np.arange(n) + 0.5)
ax.set_yticks(np.arange(n) + 0.5)
ax.set_xticklabels(np.arange(1, n+1))
ax.set_yticklabels(np.arange(1, n+1))

plt.xlabel("Feature")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()


In [None]:
# Sample hidden features with the RHS sampler (H=100 units), reusing Z, y and
# rng from the preceding data-generation cell.
Phi, W1 = sample_hidden_features_RHS(Z, rng, H=100)

# Build S and test alignment of S to y
S, evals, evecs, frac_span, frac_topk = alignment_scores(Phi, y, k_list=(1,3,5,10))

print("||projection of y onto span(Phi)||^2 / ||y||^2 =", frac_span)
print("Top-k alignment (fraction of ||y||^2):", frac_topk)

In [None]:
# Sample hidden features with the DHS sampler (H=100 units), reusing Z, y and
# rng from the preceding data-generation cell.
Phi, W1 = sample_hidden_features_DHS(Z, rng, H=100)

# Build S and test alignment of S to y
S, evals, evecs, frac_span, frac_topk = alignment_scores(Phi, y, k_list=(1,3,5,10))

print("||projection of y onto span(Phi)||^2 / ||y||^2 =", frac_span)
print("Top-k alignment (fraction of ||y||^2):", frac_topk)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Correlation heatmap built from the leading 10x10 block of the most recently
# computed S (whichever alignment cell ran last).
# NOTE(review): np.corrcoef is applied to a block of the covariance matrix S
# (its rows are treated as observations), not to the features themselves —
# confirm this is the intended quantity.
corr = np.corrcoef(S[:10, :10], rowvar=False)

plt.figure()
ax = sns.heatmap(corr, vmin=-1, vmax=1, cmap='coolwarm')

# Number of features shown (note: rebinds the module-level name `n`)
n = corr.shape[0]

# Set tick labels to 1..n, centred on the heatmap cells
ax.set_xticks(np.arange(n) + 0.5)
ax.set_yticks(np.arange(n) + 0.5)
ax.set_xticklabels(np.arange(1, n+1))
ax.set_yticklabels(np.arange(1, n+1))

plt.xlabel("Feature")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()


In [None]:
# Sample hidden features with the DST sampler (H=100 units), reusing Z, y and
# rng from the preceding data-generation cell.
Phi, W1 = sample_hidden_features_DST(Z, rng, H=100)

# Build S and test alignment of S to y
S, evals, evecs, frac_span, frac_topk = alignment_scores(Phi, y, k_list=(1,3,5,10))

print("||projection of y onto span(Phi)||^2 / ||y||^2 =", frac_span)
print("Top-k alignment (fraction of ||y||^2):", frac_topk)