In [5]:
######## VERSION 1.0.0 ########
# After these modifications, this version faithfully reproduces the original algorithm.

import torch
from qpth.qp import QPFunction
import matplotlib.pyplot as plt

def build_qp_matrices(U, V, S, T, tau, X_train, y_train, lambda_reg):
    """
    Build the batched matrices (Q, p, G, h) of the inner QP.

    Decision vector layout (length d + L*n + 2*H*n):
        z = [beta (d) | pi (L*n) | theta (H*n) | sigma (H*n)]
    Inside the pi, theta and sigma groups, the entry for piece k and
    sample i is stored at offset k*n + i (piece-major).

    Objective assembled here: 0.5 * z^T Q z + p^T z, where Q is diagonal
    with lambda_reg on beta, 0 on pi and 1 on theta and sigma (plus a 1e-4
    ridge on every variable so Q is strictly positive definite), and p
    holds tau_h on each sigma_hi and 0 elsewhere.

    Inequalities G z <= h, in row order:
        1. U_l * x_i^T beta - pi_li <= U_l * y_i + V_l          (n*L rows)
        2. -pi_li <= 0                                          (n*L rows)
        3. S_h * x_i^T beta - theta_hi - sigma_hi
               <= S_h * y_i + T_h                               (n*H rows)
        4. -sigma_hi <= 0                                       (n*H rows)
        5. -beta_j <= 0                                         (d rows)

    NOTE(review): rows 1 and 3 as coded are equivalent to
    pi_li >= U_l * (x_i^T beta - y_i) - V_l, i.e. the opposite sign of
    (U, V) / (S, T) compared with the "U_l * (y_i - x_i^T beta) + V_l"
    wording used elsewhere in this file. Confirm which convention is
    intended.
    NOTE(review): pi has no cost in Q or p besides the 1e-4 ridge, so the
    pi constraints barely influence beta. Confirm against the intended
    objective.

    Args:
        U, V: 1-D tensors of length L.
        S, T: 1-D tensors of length H.
        tau: 1-D tensor of length H, linear cost of sigma_h.
        X_train: 2-D tensor (n, d); its dtype and device are used for all outputs.
        y_train: 1-D tensor of length n.
        lambda_reg: value written on the beta part of the diagonal of Q.

    Returns:
        tuple: (Q, p, G, h) with a leading batch dimension of size 1.
    """
    n, d = X_train.shape
    L = U.shape[0]
    H = S.shape[0]
    dtype, device = X_train.dtype, X_train.device

    # Offsets of each variable group inside z.
    pi_start = d
    theta_start = pi_start + L * n
    sigma_start = theta_start + H * n
    total_vars = sigma_start + H * n

    # Diagonal quadratic term.
    Q_diag = torch.zeros(total_vars, dtype=dtype, device=device)
    Q_diag[:d] = lambda_reg
    Q_diag[theta_start:sigma_start] = 1.0   # theta
    Q_diag[sigma_start:] = 1.0              # sigma
    Q = torch.diag(Q_diag).unsqueeze(0)

    # Linear term. sigma is laid out piece-major (h*n + i), so every tau_h
    # must be repeated n times consecutively. `tau.repeat(n)` would produce
    # the sample-major order and attach tau to the wrong sigma entries.
    p = torch.zeros(total_vars, dtype=dtype, device=device)
    p[sigma_start:] = tau.repeat_interleave(n)
    p = p.unsqueeze(0)

    # Inequality constraints G z <= h.
    G_rows = 2 * L * n + 2 * H * n + d
    G = torch.zeros(G_rows, total_vars, dtype=dtype, device=device)
    h_val = torch.zeros(G_rows, dtype=dtype, device=device)

    row_idx = 0

    # 1. U_l * x_i^T beta - pi_li <= U_l * y_i + V_l
    for i in range(n):
        for l in range(L):
            G[row_idx, :d] = U[l] * X_train[i]
            G[row_idx, pi_start + l * n + i] = -1.0
            h_val[row_idx] = U[l] * y_train[i] + V[l]
            row_idx += 1

    # 2. pi_li >= 0
    for i in range(n):
        for l in range(L):
            G[row_idx, pi_start + l * n + i] = -1.0
            h_val[row_idx] = 0.0
            row_idx += 1

    # 3. S_h * x_i^T beta - theta_hi - sigma_hi <= S_h * y_i + T_h
    for i in range(n):
        for h_ in range(H):
            G[row_idx, :d] = S[h_] * X_train[i]
            G[row_idx, theta_start + h_ * n + i] = -1.0
            G[row_idx, sigma_start + h_ * n + i] = -1.0
            h_val[row_idx] = S[h_] * y_train[i] + T[h_]
            row_idx += 1

    # 4. sigma_hi >= 0
    for i in range(n):
        for h_ in range(H):
            G[row_idx, sigma_start + h_ * n + i] = -1.0
            h_val[row_idx] = 0.0
            row_idx += 1

    # 5. beta_j >= 0
    for j in range(d):
        G[row_idx, j] = -1.0
        h_val[row_idx] = 0.0
        row_idx += 1

    G = G.unsqueeze(0)
    h = h_val.unsqueeze(0)

    # Small ridge so that the zero-cost pi block does not make Q singular.
    eps = 1e-4
    Q = Q + eps * torch.eye(total_vars, dtype=dtype, device=device).unsqueeze(0)

    return Q, p, G, h

def solve_inner_qpth(U, V, S, T, tau, X_train, y_train, lambda_reg):
    """
    Solve the inner QP with qpth and return the beta part of the solution.

    The QP matrices come from build_qp_matrices; two empty tensors are
    passed in the equality-constraint slots of QPFunction. The first
    X_train.shape[1] entries of the solution are beta.

    Returns:
        torch.Tensor: beta with the batch dimension squeezed out.
    """
    n_features = X_train.shape[1]
    qp_inputs = build_qp_matrices(U, V, S, T, tau, X_train, y_train, lambda_reg)

    # Empty placeholders for the equality-constraint arguments.
    eq_lhs = torch.empty(0, device=X_train.device)
    eq_rhs = torch.empty(0, device=X_train.device)

    solver = QPFunction(verbose=False)
    solution = solver(*qp_inputs, eq_lhs, eq_rhs)
    return solution[:, :n_features].squeeze(0)


def compute_outer_gradients(X_train, y_train, X_val, y_val , U, V, S, T, tau, lambda_reg):
    """
    Solve the inner problem, then backpropagate the validation MSE to U, V, S, T.

    Returns:
        dict: "beta_opt" (detached copy of the inner solution), "loss"
        (validation MSE as a float) and "U_grad", "V_grad", "S_grad",
        "T_grad" (cloned gradients, or zeros when .grad is None).

    The .grad buffers of U, V, S, T are zeroed after being copied so that
    gradients do not accumulate across calls.
    """
    beta_opt = solve_inner_qpth(U, V, S, T, tau, X_train, y_train, lambda_reg)

    residual = y_val - X_val @ beta_opt
    loss_outer = (1.0 / X_val.shape[0]) * residual.pow(2).sum()
    loss_outer.backward()

    tracked = (("U_grad", U), ("V_grad", V), ("S_grad", S), ("T_grad", T))

    # First pass: snapshot every gradient.
    snapshots = {}
    for key, param in tracked:
        if param.grad is None:
            snapshots[key] = torch.zeros_like(param)
        else:
            snapshots[key] = param.grad.clone()

    # Second pass: reset the accumulated gradients.
    for _, param in tracked:
        if param.grad is not None:
            param.grad.zero_()

    outputs = {
        "beta_opt": beta_opt.detach().clone(),
        "loss": loss_outer.item(),
    }
    outputs.update(snapshots)
    return outputs

def train_hyperparams(X_train, y_train, X_val, y_val, U, V, S, T, tau, lambda_reg,
                      lr=1e-2, outer_steps=50):
    """
    Run `outer_steps` steps of plain gradient descent on U, V, S, T.

    Each step calls compute_outer_gradients and then updates the four
    tensors in place with step size `lr`. The loss is printed every 10th step.

    Args:
        X_train, y_train: training data forwarded to the inner solver.
        X_val, y_val: validation data used for the outer MSE loss.
        U, V, S, T: hyperparameter tensors, updated in place.
        tau: forwarded unchanged to the inner problem.
        lambda_reg: forwarded unchanged to the inner problem.
        lr: gradient-descent step size.
        outer_steps: number of update steps.

    Returns:
        tuple: (U, V, S, T, loss_history), where loss_history is the list
        of per-step losses (empty when outer_steps is 0).
    """
    loss_history = []
    for step in range(outer_steps):
        results = compute_outer_gradients(X_train, y_train, X_val, y_val, U, V, S, T, tau, lambda_reg)
        loss_val = results["loss"]
        U_grad, V_grad = results["U_grad"], results["V_grad"]
        S_grad, T_grad = results["S_grad"], results["T_grad"]

        # In-place update outside of autograd tracking.
        with torch.no_grad():
            U -= lr * U_grad
            V -= lr * V_grad
            S -= lr * S_grad
            T -= lr * T_grad

        U.requires_grad_(True)
        V.requires_grad_(True)
        S.requires_grad_(True)
        T.requires_grad_(True)

        loss_history.append(loss_val)
        if (step+1) % 10 == 0:
            print(f"[outer step {step+1}/{outer_steps}] MSE loss = {loss_val:.6f}")

    # The original line ended with a stray '.', which is a SyntaxError.
    return U, V, S, T, loss_history

def evaluate_and_print(X, y, beta_est, beta_true, label=""):
    """
    Print the MSE and MAE of the predictions X @ beta_est against y.

    When beta_true is not None, also print the MSE and MAE between
    beta_est and beta_true. Every line is prefixed with `label`.
    Nothing is returned; all computations run without gradient tracking.
    """
    with torch.no_grad():
        residual = X.matmul(beta_est) - y
        mse = residual.pow(2).mean().item()
        mae = residual.abs().mean().item()
        print(f"{label} MSE: {mse:.6f}")
        print(f"{label} MAE: {mae:.6f}")

        if beta_true is None:
            return

        beta_gap = beta_est - beta_true
        beta_mse = beta_gap.pow(2).mean().item()
        beta_mae = beta_gap.abs().mean().item()
        print(f"{label} Beta MSE: {beta_mse:.6f}, Beta MAE: {beta_mae:.6f}")

# ========== Data generation: Laplace noise
# Experiment-level settings (sizes, regularization, optimizer configuration)
n, d = 200, 5
L, H = 2, 2
lambda_reg = 0.1
outer_steps = 10
lr = 1e-2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# (a) Laplace data: y = X @ beta_true + Laplace(0, 1) noise
print("\n==== Laplace Noise Data ====")
torch.manual_seed(42)
from torch.distributions import Laplace
beta_true_lap = torch.rand(d, device=device) * 10
X_lap = torch.randn(n, d, device=device)
eps_lap = Laplace(0.0, 1.0).sample((n,)).to(device)  # scale=1
y_lap = X_lap @ beta_true_lap + eps_lap

# 1) Initialize the learnable hyperparameters for the Laplace run
print("\n-- Train Autoloss on Laplace data --")
U_lap = torch.randn(L, device=device, requires_grad=True)
V_lap = torch.randn(L, device=device, requires_grad=True)
S_lap = torch.randn(H, device=device, requires_grad=True)
T_lap = torch.randn(H, device=device, requires_grad=True)
tau_lap = torch.ones(H, device=device, requires_grad=False)

# 2) Train-Val Split (first 150 rows for training, the rest for validation)
N_train = 150
X_lap_train, y_lap_train = X_lap[:N_train], y_lap[:N_train]
X_lap_val, y_lap_val = X_lap[N_train:], y_lap[N_train:]

# 20 rounds of `outer_steps` hyperparameter updates, each followed by a
# re-solve of the inner problem and a train/validation report.
# NOTE: the loop variable `iter` shadows the builtin of the same name.
for iter in range(20):
    print(f"\n--- Laplace Iteration {iter+1} ---")
    U_lap, V_lap, S_lap, T_lap, lap_loss_hist = train_hyperparams(
        X_lap_train, y_lap_train, X_lap_val, y_lap_val,
        U_lap, V_lap, S_lap, T_lap, tau_lap,
        lambda_reg=lambda_reg, lr=lr, outer_steps=outer_steps
    )

    beta_autoloss_lap = solve_inner_qpth(U_lap, V_lap, S_lap, T_lap, tau_lap, 
                                            X_lap_train, y_lap_train, lambda_reg)
    
    evaluate_and_print(X_lap_val, y_lap_val, beta_autoloss_lap, beta_true_lap, label="Val Autoloss(Laplace)")
    evaluate_and_print(X_lap_train, y_lap_train, beta_autoloss_lap, beta_true_lap, label="Train Autoloss(Laplace)")





    

Using device: cuda

==== Laplace Noise Data ====

-- Train Autoloss on Laplace data --

--- Laplace Iteration 1 ---
[outer step 10/10] MSE loss = 1.147810
Val Autoloss(Laplace) MSE: 1.147461
Val Autoloss(Laplace) MAE: 0.761749
Val Autoloss(Laplace) Beta MSE: 0.041236, Beta MAE: 0.165974
Train Autoloss(Laplace) MSE: 2.374356
Train Autoloss(Laplace) MAE: 1.123015
Train Autoloss(Laplace) Beta MSE: 0.041236, Beta MAE: 0.165974

--- Laplace Iteration 2 ---
[outer step 10/10] MSE loss = 1.144650
Val Autoloss(Laplace) MSE: 1.144608
Val Autoloss(Laplace) MAE: 0.760095
Val Autoloss(Laplace) Beta MSE: 0.040721, Beta MAE: 0.166360
Train Autoloss(Laplace) MSE: 2.371023
Train Autoloss(Laplace) MAE: 1.121967
Train Autoloss(Laplace) Beta MSE: 0.040721, Beta MAE: 0.166360

--- Laplace Iteration 3 ---
[outer step 10/10] MSE loss = 1.142888
Val Autoloss(Laplace) MSE: 1.142743
Val Autoloss(Laplace) MAE: 0.758937
Val Autoloss(Laplace) Beta MSE: 0.040346, Beta MAE: 0.166572
Train Autoloss(Laplace) MSE: 2.3

In [9]:
import torch
from qpth.qp import QPFunction
import matplotlib.pyplot as plt

def build_qp_matrices(U, V, S, T, tau, X_train, y_train, lambda_reg):
    """
    Build the batched matrices (Q, p, G, h) of the inner QP.

    Decision vector layout (length d + L*n + 2*H*n):
        z = [beta (d) | pi (L*n) | theta (H*n) | sigma (H*n)]
    Inside the pi, theta and sigma groups, the entry for piece k and
    sample i is stored at offset k*n + i (piece-major).

    Objective assembled here: 0.5 * z^T Q z + p^T z, where Q is diagonal
    with lambda_reg on beta, 0 on pi and 1 on theta and sigma (plus a 1e-4
    ridge on every variable so Q is strictly positive definite), and p
    holds tau_h on each sigma_hi and 0 elsewhere.

    Inequalities G z <= h, in row order:
        1. U_l * x_i^T beta - pi_li <= U_l * y_i + V_l          (n*L rows)
        2. -pi_li <= 0                                          (n*L rows)
        3. S_h * x_i^T beta - theta_hi - sigma_hi
               <= S_h * y_i + T_h                               (n*H rows)
        4. -sigma_hi <= 0                                       (n*H rows)
        5. -beta_j <= 0                                         (d rows)

    NOTE(review): rows 1 and 3 as coded are equivalent to
    pi_li >= U_l * (x_i^T beta - y_i) - V_l, i.e. the opposite sign of
    (U, V) / (S, T) compared with the "U_l * (y_i - x_i^T beta) + V_l"
    wording used elsewhere in this file. Confirm which convention is
    intended.
    NOTE(review): pi has no cost in Q or p besides the 1e-4 ridge, so the
    pi constraints barely influence beta. Confirm against the intended
    objective.

    Args:
        U (torch.Tensor): 1-D tensor of length L.
        V (torch.Tensor): 1-D tensor of length L.
        S (torch.Tensor): 1-D tensor of length H.
        T (torch.Tensor): 1-D tensor of length H.
        tau (torch.Tensor): 1-D tensor of length H, linear cost of sigma_h.
        X_train (torch.Tensor): 2-D tensor (n, d); its dtype and device are
            used for all outputs.
        y_train (torch.Tensor): 1-D tensor of length n.
        lambda_reg: Value written on the beta part of the diagonal of Q.

    Returns:
        tuple: (Q, p, G, h) with a leading batch dimension of size 1.
    """
    n, d = X_train.shape
    L = U.shape[0]
    H = S.shape[0]
    dtype, device = X_train.dtype, X_train.device

    # Offsets of each variable group inside z.
    pi_start = d
    theta_start = pi_start + L * n
    sigma_start = theta_start + H * n
    total_vars = sigma_start + H * n

    # Diagonal quadratic term.
    Q_diag = torch.zeros(total_vars, dtype=dtype, device=device)
    Q_diag[:d] = lambda_reg
    Q_diag[theta_start:sigma_start] = 1.0   # theta
    Q_diag[sigma_start:] = 1.0              # sigma
    Q = torch.diag(Q_diag).unsqueeze(0)

    # Linear term. sigma is laid out piece-major (h*n + i), so every tau_h
    # must be repeated n times consecutively. `tau.repeat(n)` would produce
    # the sample-major order and attach tau to the wrong sigma entries.
    p = torch.zeros(total_vars, dtype=dtype, device=device)
    p[sigma_start:] = tau.repeat_interleave(n)
    p = p.unsqueeze(0)

    # Inequality constraints G z <= h.
    G_rows = 2 * L * n + 2 * H * n + d
    G = torch.zeros(G_rows, total_vars, dtype=dtype, device=device)
    h_val = torch.zeros(G_rows, dtype=dtype, device=device)

    row_idx = 0

    # 1. U_l * x_i^T beta - pi_li <= U_l * y_i + V_l
    for i in range(n):
        for l in range(L):
            G[row_idx, :d] = U[l] * X_train[i]
            G[row_idx, pi_start + l * n + i] = -1.0
            h_val[row_idx] = U[l] * y_train[i] + V[l]
            row_idx += 1

    # 2. pi_li >= 0
    for i in range(n):
        for l in range(L):
            G[row_idx, pi_start + l * n + i] = -1.0
            h_val[row_idx] = 0.0
            row_idx += 1

    # 3. S_h * x_i^T beta - theta_hi - sigma_hi <= S_h * y_i + T_h
    for i in range(n):
        for h_ in range(H):
            G[row_idx, :d] = S[h_] * X_train[i]
            G[row_idx, theta_start + h_ * n + i] = -1.0
            G[row_idx, sigma_start + h_ * n + i] = -1.0
            h_val[row_idx] = S[h_] * y_train[i] + T[h_]
            row_idx += 1

    # 4. sigma_hi >= 0
    for i in range(n):
        for h_ in range(H):
            G[row_idx, sigma_start + h_ * n + i] = -1.0
            h_val[row_idx] = 0.0
            row_idx += 1

    # 5. beta_j >= 0
    for j in range(d):
        G[row_idx, j] = -1.0
        h_val[row_idx] = 0.0
        row_idx += 1

    G = G.unsqueeze(0)
    h = h_val.unsqueeze(0)

    # Small ridge so that the zero-cost pi block does not make Q singular.
    eps = 1e-4
    Q = Q + eps * torch.eye(total_vars, dtype=dtype, device=device).unsqueeze(0)

    return Q, p, G, h

def solve_inner_qpth(U, V, S, T, tau, X_train, y_train, lambda_reg):
    """
    Solve the inner QP with qpth and return the beta part of the solution.

    The QP matrices are produced by build_qp_matrices from the given
    hyperparameters and training data. Two empty tensors are passed in the
    equality-constraint slots of QPFunction.

    Args:
        U (torch.Tensor): Forwarded to build_qp_matrices, length L.
        V (torch.Tensor): Forwarded to build_qp_matrices, length L.
        S (torch.Tensor): Forwarded to build_qp_matrices, length H.
        T (torch.Tensor): Forwarded to build_qp_matrices, length H.
        tau (torch.Tensor): Forwarded to build_qp_matrices, length H.
        X_train (torch.Tensor): Training features, shape (n_samples, n_features).
        y_train (torch.Tensor): Training targets.
        lambda_reg (float): Regularization weight on beta.

    Returns:
        torch.Tensor: The first n_features entries of the QP solution, with
        the batch dimension squeezed out.
    """
    n_features = X_train.shape[1]
    qp_inputs = build_qp_matrices(U, V, S, T, tau, X_train, y_train, lambda_reg)

    # Empty placeholders for the equality-constraint arguments.
    eq_lhs = torch.empty(0, device=X_train.device)
    eq_rhs = torch.empty(0, device=X_train.device)

    solver = QPFunction(verbose=False)
    solution = solver(*qp_inputs, eq_lhs, eq_rhs)
    return solution[:, :n_features].squeeze(0)


def compute_outer_gradients(X_train, y_train, X_val, y_val , U, V, S, T, tau, lambda_reg):
    """
    Backpropagate the validation MSE through the inner QP to U, V, S, T.

    The inner problem is solved with solve_inner_qpth, the outer loss
    (1/n_val) * ||y_val - X_val @ beta_opt||^2 is formed, and backward()
    is called on it.

    Args:
        X_train (torch.Tensor): Training features.
        y_train (torch.Tensor): Training targets.
        X_val (torch.Tensor): Validation features.
        y_val (torch.Tensor): Validation targets.
        U, V, S, T (torch.Tensor): Hyperparameters whose .grad is read.
        tau (torch.Tensor): Forwarded to the inner problem.
        lambda_reg (float): Forwarded to the inner problem.

    Returns:
        dict: "beta_opt" (detached copy of the inner solution), "loss"
        (float), and "U_grad", "V_grad", "S_grad", "T_grad" (cloned
        gradients, or zero tensors when .grad is None).

    Notes:
        - The .grad buffers are zeroed after being copied so that repeated
          calls do not accumulate gradients.
    """
    beta_opt = solve_inner_qpth(U, V, S, T, tau, X_train, y_train, lambda_reg)

    residual = y_val - X_val @ beta_opt
    loss_outer = (1.0 / X_val.shape[0]) * residual.pow(2).sum()
    loss_outer.backward()

    tracked = (("U_grad", U), ("V_grad", V), ("S_grad", S), ("T_grad", T))

    # First pass: snapshot every gradient.
    snapshots = {}
    for key, param in tracked:
        if param.grad is None:
            snapshots[key] = torch.zeros_like(param)
        else:
            snapshots[key] = param.grad.clone()

    # Second pass: reset the accumulated gradients.
    for _, param in tracked:
        if param.grad is not None:
            param.grad.zero_()

    outputs = {
        "beta_opt": beta_opt.detach().clone(),
        "loss": loss_outer.item(),
    }
    outputs.update(snapshots)
    return outputs


def train_hyperparams(X_train, y_train, X_val, y_val, U, V, S, T, tau, lambda_reg,
                      lr=1e-2, outer_steps=50):
    """
    Optimize U, V, S, T with plain gradient descent on the outer loss.

    Every step obtains the loss and gradients from compute_outer_gradients
    and then updates the four tensors in place.

    Args:
        X_train (torch.Tensor): Training features.
        y_train (torch.Tensor): Training targets.
        X_val (torch.Tensor): Validation features.
        y_val (torch.Tensor): Validation targets.
        U, V, S, T (torch.Tensor): Hyperparameters, updated in place.
        tau (torch.Tensor): Forwarded to the inner problem.
        lambda_reg (float): Forwarded to the inner problem.
        lr (float, optional): Step size. Defaults to 1e-2.
        outer_steps (int, optional): Number of update steps. Defaults to 50.

    Returns:
        tuple: (U, V, S, T, loss_history), where loss_history is the list
        of losses recorded at each step.

    Notes:
        - The loss is printed on every 10th step.
    """
    targets = ((U, "U_grad"), (V, "V_grad"), (S, "S_grad"), (T, "T_grad"))
    loss_history = []

    for step in range(outer_steps):
        results = compute_outer_gradients(X_train, y_train, X_val, y_val, U, V, S, T, tau, lambda_reg)
        loss_val = results["loss"]

        # Descent step, performed in place and outside of autograd tracking.
        with torch.no_grad():
            for param, grad_key in targets:
                param.sub_(lr * results[grad_key])

        for param, _ in targets:
            param.requires_grad_(True)

        loss_history.append(loss_val)
        if (step+1) % 10 == 0:
            print(f"[outer step {step+1}/{outer_steps}] MSE loss = {loss_val:.6f}")

    return U, V, S, T, loss_history

def evaluate_and_print(X, y, beta_est, beta_true, label=""):
    """
    Print prediction and coefficient error metrics for beta_est.

    Args:
        X (torch.Tensor): Feature matrix.
        y (torch.Tensor): Target vector.
        beta_est (torch.Tensor): Estimated coefficients.
        beta_true (torch.Tensor): True coefficients, or None to skip the
            coefficient comparison.
        label (str, optional): Prefix of every printed line. Defaults to "".

    Notes:
        - Prints the MSE and MAE of X @ beta_est against y, then the MSE
          and MAE between beta_est and beta_true when it is given.
        - Runs under torch.no_grad() and returns nothing.
    """
    with torch.no_grad():
        residual = X.matmul(beta_est) - y
        mse = residual.pow(2).mean().item()
        mae = residual.abs().mean().item()
        print(f"{label} MSE: {mse:.6f}")
        print(f"{label} MAE: {mae:.6f}")

        if beta_true is None:
            return

        beta_gap = beta_est - beta_true
        beta_mse = beta_gap.pow(2).mean().item()
        beta_mae = beta_gap.abs().mean().item()
        print(f"{label} Beta MSE: {beta_mse:.6f}, Beta MAE: {beta_mae:.6f}")
            
def setup_experiment(n=200, d=5, L=2, H=2, lambda_reg=0.1, outer_steps=10, lr=1e-2, seed=42):
    """
    Seed PyTorch, pick the compute device and bundle the experiment settings.

    Args:
        n (int, optional): Number of samples. Defaults to 200.
        d (int, optional): Number of features. Defaults to 5.
        L (int, optional): Length of U and V. Defaults to 2.
        H (int, optional): Length of S, T and tau. Defaults to 2.
        lambda_reg (float, optional): Regularization weight. Defaults to 0.1.
        outer_steps (int, optional): Outer steps per iteration. Defaults to 10.
        lr (float, optional): Learning rate. Defaults to 1e-2.
        seed (int, optional): Value passed to torch.manual_seed. Defaults to 42.

    Returns:
        dict: All arguments plus "device", keyed by name.

    Notes:
        - The device is CUDA when torch.cuda.is_available() is true, else CPU.
        - The selected device is printed.
    """
    torch.manual_seed(seed)

    backend = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(backend)
    print("Using device:", device)

    keys = ("n", "d", "L", "H", "lambda_reg", "outer_steps", "lr", "device", "seed")
    values = (n, d, L, H, lambda_reg, outer_steps, lr, device, seed)
    return dict(zip(keys, values))

def generate_synthetic_data(config, noise_type="laplace", scale=1.0, df=3.0):
    """
    Draw a linear-regression data set y = X @ beta_true + noise and split it.

    Args:
        config (dict): Must provide "n", "d" and "device".
        noise_type (str, optional): "laplace" or "student" (case-insensitive).
            Defaults to "laplace".
        scale (float, optional): Laplace scale, or multiplier applied to the
            Student-t samples. Defaults to 1.0.
        df (float, optional): Degrees of freedom of the Student-t noise.
            Defaults to 3.0.

    Returns:
        dict: Keys X, y, beta_true, X_train, y_train, X_val, y_val, noise_type.

    Raises:
        ValueError: If noise_type is neither "laplace" nor "student".

    Notes:
        - beta_true is uniform on [0, 10); X is standard normal.
        - The first int(0.75 * n) rows form the training set, the rest the
          validation set.
        - A short summary of the data set is printed.
    """
    from torch.distributions import Laplace, StudentT

    device = config["device"]
    n, d = config["n"], config["d"]

    # Random draws happen in a fixed order: beta, features, then noise.
    beta_true = torch.rand(d, device=device) * 10
    X = torch.randn(n, d, device=device)

    kind = noise_type.lower()
    if kind not in ("laplace", "student"):
        raise ValueError(f"Unsupported noise type: {noise_type}")
    if kind == "laplace":
        eps = Laplace(0.0, scale).sample((n,)).to(device)
    else:
        eps = StudentT(df=df).sample((n,)).to(device) * scale

    y = X.matmul(beta_true) + eps

    # Deterministic 75% / 25% split by row order.
    N_train = int(0.75 * n)
    train_rows, val_rows = slice(None, N_train), slice(N_train, None)

    print(f"\n==== {noise_type.capitalize()} Noise Data ====")
    print(f"Number of samples: {n} (Train: {N_train}, Val: {n-N_train})")
    print(f"Number of features: {d}")

    dataset = {"X": X, "y": y, "beta_true": beta_true}
    dataset["X_train"] = X[train_rows]
    dataset["y_train"] = y[train_rows]
    dataset["X_val"] = X[val_rows]
    dataset["y_val"] = y[val_rows]
    dataset["noise_type"] = noise_type
    return dataset

def initialize_hyperparameters(config):
    """
    Create the initial U, V, S, T and tau tensors on the configured device.

    Args:
        config (dict): Must provide "L", "H" and "device".

    Returns:
        dict: Keys U, V, S, T, tau.

    Notes:
        - U and V (length L) and S and T (length H) are standard-normal
          draws with requires_grad=True, sampled in that order.
        - tau is a length-H tensor of ones with requires_grad=False.
    """
    device = config["device"]
    n_pi_pieces, n_theta_pieces = config["L"], config["H"]

    def trainable(size):
        # Standard-normal leaf tensor that participates in autograd.
        return torch.randn(size, device=device, requires_grad=True)

    # Dict literals evaluate in order, which keeps the RNG draw order U, V, S, T.
    return {
        "U": trainable(n_pi_pieces),
        "V": trainable(n_pi_pieces),
        "S": trainable(n_theta_pieces),
        "T": trainable(n_theta_pieces),
        "tau": torch.ones(n_theta_pieces, device=device, requires_grad=False),
    }

def train_autoloss(data, hyperparams, config, max_iterations=20, eval_every=1):
    """
    Run the full AutoLoss training loop and collect evaluation metrics.

    Each iteration calls train_hyperparams for config["outer_steps"] steps,
    re-solves the inner problem with solve_inner_qpth, and evaluates the
    resulting beta on the training and validation sets every `eval_every`
    iterations (and always on the last one).

    Args:
        data (dict): Keys X_train, y_train, X_val, y_val, beta_true, noise_type.
        hyperparams (dict): Keys U, V, S, T, tau.
        config (dict): Keys lambda_reg, lr, outer_steps.
        max_iterations (int, optional): Number of iterations. Defaults to 20.
        eval_every (int, optional): Evaluation period in iterations. Defaults to 1.

    Returns:
        dict:
            - "hyperparams": dict with the final U, V, S, T, tau.
            - "beta": detached beta from the last iteration, or None when
              max_iterations is 0 (previously this raised UnboundLocalError).
            - "metrics": dict with lists train_losses, val_losses, beta_mses
              (one entry per evaluated iteration; beta_mses stays empty when
              beta_true is None).

    Notes:
        - The returned beta is detached so that keeping the result dict does
          not keep the autograd graph of the last QP solve alive.
    """
    X_train, y_train = data["X_train"], data["y_train"]
    X_val, y_val = data["X_val"], data["y_val"]
    beta_true = data["beta_true"]
    noise_type = data["noise_type"]

    U, V, S, T, tau = (hyperparams[key] for key in ("U", "V", "S", "T", "tau"))

    lambda_reg = config["lambda_reg"]
    lr = config["lr"]
    outer_steps = config["outer_steps"]

    print(f"\n-- Training AutoLoss on {noise_type} data --")

    train_losses = []
    val_losses = []
    beta_mses = []
    beta_autoloss = None  # stays None if the loop body never runs

    for iteration in range(max_iterations):
        print(f"\n--- {noise_type} Iteration {iteration+1}/{max_iterations} ---")

        # Update the hyperparameters.
        U, V, S, T, _ = train_hyperparams(
            X_train, y_train, X_val, y_val,
            U, V, S, T, tau,
            lambda_reg=lambda_reg, lr=lr, outer_steps=outer_steps
        )

        # Beta for the updated hyperparameters; only used for evaluation.
        beta_autoloss = solve_inner_qpth(
            U, V, S, T, tau,
            X_train, y_train, lambda_reg
        ).detach()

        is_last = iteration == max_iterations - 1
        if (iteration + 1) % eval_every == 0 or is_last:
            val_metrics = evaluate_model(X_val, y_val, beta_autoloss, beta_true)
            train_metrics = evaluate_model(X_train, y_train, beta_autoloss, beta_true)

            evaluate_and_print(X_val, y_val, beta_autoloss, beta_true,
                               label=f"Val AutoLoss({noise_type})")
            evaluate_and_print(X_train, y_train, beta_autoloss, beta_true,
                               label=f"Train AutoLoss({noise_type})")

            train_losses.append(train_metrics["mse"])
            val_losses.append(val_metrics["mse"])
            # evaluate_model only reports beta_mse when beta_true is given.
            if "beta_mse" in val_metrics:
                beta_mses.append(val_metrics["beta_mse"])

    return {
        "hyperparams": {
            "U": U, "V": V, "S": S, "T": T, "tau": tau
        },
        "beta": beta_autoloss,
        "metrics": {
            "train_losses": train_losses,
            "val_losses": val_losses,
            "beta_mses": beta_mses
        }
    }

def evaluate_model(X, y, beta_est, beta_true=None):
    """
    Compute prediction metrics (and optionally coefficient errors) for beta_est.

    Args:
        X (torch.Tensor): Feature matrix.
        y (torch.Tensor): Target vector.
        beta_est (torch.Tensor): Estimated coefficients.
        beta_true (torch.Tensor, optional): True coefficients. Defaults to None.

    Returns:
        dict: "mse" and "mae" (floats), "y_pred" (tensor of predictions),
        plus "beta_mse" and "beta_mae" (floats) when beta_true is given.

    Notes:
        - Runs under torch.no_grad().
    """
    with torch.no_grad():
        y_pred = X @ beta_est
        residual = y_pred - y

        metrics = {}
        metrics["mse"] = residual.pow(2).mean().item()
        metrics["mae"] = residual.abs().mean().item()
        metrics["y_pred"] = y_pred

        if beta_true is None:
            return metrics

        beta_gap = beta_est - beta_true
        metrics["beta_mse"] = beta_gap.pow(2).mean().item()
        metrics["beta_mae"] = beta_gap.abs().mean().item()
        return metrics

def plot_results(results, title="AutoLoss Training Results"):
    """
    Plot the training curves and the final hyperparameters in one figure.

    Draws a 1x3 grid: train/validation MSE curves, the beta MSE curve, and
    a bar chart of every entry of U, V, S and T. The figure is shown with
    plt.show().

    Args:
        results (dict): Must provide
            - "metrics": dict with lists "train_losses", "val_losses"
              and "beta_mses".
            - "hyperparams": dict with 1-D tensors "U", "V", "S", "T".
        title (str, optional): Figure-level title. Defaults to
            "AutoLoss Training Results".

    Returns:
        None
    """
    metrics = results["metrics"]
    hyperparams = results["hyperparams"]

    plt.figure(figsize=(15, 5))

    # Panel 1: training and validation loss.
    plt.subplot(1, 3, 1)
    plt.plot(metrics["train_losses"], label="Train Loss")
    plt.plot(metrics["val_losses"], label="Val Loss")
    plt.xlabel("Iterations")
    plt.ylabel("MSE Loss")
    plt.legend()
    plt.title("Training and Validation Loss")

    # Panel 2: coefficient estimation error.
    plt.subplot(1, 3, 2)
    plt.plot(metrics["beta_mses"], marker='o')
    plt.xlabel("Iterations")
    plt.ylabel("Beta MSE")
    plt.title("Beta Estimation Error")

    # Panel 3: one bar per hyperparameter entry, grouped as U, V, S, T.
    plt.subplot(1, 3, 3)
    param_names = []
    param_values = []
    for group in ("U", "V", "S", "T"):
        values = hyperparams[group].detach().cpu().tolist()
        for i, value in enumerate(values):
            param_names.append(f"{group}_{i}")
            param_values.append(value)

    plt.bar(param_names, param_values)
    plt.xlabel("Parameter")
    plt.ylabel("Value")
    plt.title("Final Hyperparameters")
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.suptitle(title, fontsize=16)
    # Leave room at the top for the figure-level title.
    plt.subplots_adjust(top=0.85)
    plt.show()

def run_experiment(noise_type="laplace", **kwargs):
    """
    Execute one end-to-end AutoLoss run and plot its outcome.

    The pipeline is strictly sequential: build the configuration, generate
    synthetic data, initialise the hyperparameters, train, then plot.

    Args:
        noise_type (str, optional): Noise label forwarded to
            generate_synthetic_data and used (capitalised) in the plot title.
            Defaults to "laplace".
        **kwargs: Forwarded unchanged to setup_experiment; any keyword that
            function does not accept raises TypeError there.

    Returns:
        dict: Bundle with keys:
            - "config": Value returned by setup_experiment.
            - "data": Value returned by generate_synthetic_data.
            - "results": Value returned by train_autoloss.
    """
    # Configuration -> data -> initial hyperparameters -> training.
    config = setup_experiment(**kwargs)
    data = generate_synthetic_data(config, noise_type=noise_type)
    initial_hyperparams = initialize_hyperparameters(config)
    results = train_autoloss(data, initial_hyperparams, config)

    # Plot once training has finished; the title names the noise type.
    plot_title = f"AutoLoss on {noise_type.capitalize()} Noise"
    plot_results(results, title=plot_title)

    # Hand back everything a caller needs to inspect the run.
    return {"config": config, "data": data, "results": results}


In [10]:
# Run the Laplace-noise experiment.
# NOTE(review): the traceback recorded for this cell shows this call failing
# with "TypeError: setup_experiment() got an unexpected keyword argument
# 'max_iterations'". run_experiment forwards every extra keyword to
# setup_experiment, so the keyword name must match that function's
# signature — TODO confirm the intended parameter name.
laplace_results = run_experiment(
    noise_type="laplace",
    n=200, 
    d=5,
    lambda_reg=0.1,
    outer_steps=10,
    lr=1e-2,
    max_iterations=20
)

# Optionally, also run the Student-t noise experiment.
# NOTE(review): passes the same 'max_iterations' keyword as the call above,
# so it presumably fails the same way — verify.
student_results = run_experiment(
    noise_type="student",
    n=200, 
    d=5,
    lambda_reg=0.1,
    outer_steps=10,
    lr=5e-3,
    max_iterations=20
)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\xinby\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\xinby\AppData\Local\Temp\ipykernel_18120\2738433465.py", line 2, in <module>
    laplace_results = run_experiment(
                      ^^^^^^^^^^^^^^^
  File "C:\Users\xinby\AppData\Local\Temp\ipykernel_18120\1819228730.py", line 637, in run_experiment
    config = setup_experiment(**kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: setup_experiment() got an unexpected keyword argument 'max_iterations'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\xinby\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 2105, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C

In [None]:
######## VERSION 1.1.0 ########
# 在 v1.0.0 基础上，封装数据生成与训练逻辑，使代码结构更清晰

import torch
from qpth.qp import QPFunction
import matplotlib.pyplot as plt

################################################
#            1)   内层 QP 构造与求解
################################################

def build_qp_matrices(U, V, S, T, tau, X_train, y_train, lambda_reg):
    """
    Build the batched matrices (Q, p, G, h) of the inner QP.

    The decision vector is z = [beta, pi, theta, sigma] with sizes
    d, L*n, H*n and H*n. Inside the pi block entry (l, i) lives at offset
    l*n + i; inside the theta and sigma blocks entry (h, i) lives at
    offset h*n + i.

    Objective:
        minimize 0.5 * z^T Q z + p^T z
        where Q is diagonal (lambda_reg on beta, 0 on pi, 1 on theta and
        sigma, plus a 1e-4 ridge on every entry so Q is positive definite)
        and p is tau_h on each sigma_hi and 0 elsewhere.

    Inequality rows (G z <= h), in this order:
        1. U_l * x_i^T beta - pi_li <= U_l * y_i + V_l         (n*L rows)
        2. -pi_li <= 0                                        (n*L rows)
        3. S_h * x_i^T beta - theta_hi - sigma_hi
               <= S_h * y_i + T_h                             (n*H rows)
        4. -sigma_hi <= 0                                     (n*H rows)
        5. -beta_j <= 0                                       (d rows)

    NOTE(review): rows 1 and 3 as encoded are equivalent to
    pi_li >= U_l * (x_i^T beta - y_i) - V_l, i.e. the sign-flipped form of
    the "pi_li >= U_l * (y_i - x_i^T beta) + V_l" wording used in the
    original comments. Since U, V, S, T are free parameters this is a
    reparametrisation, but confirm which convention is intended.

    Args:
        U, V (torch.Tensor): Per-piece slope / offset, shape (L,).
        S, T (torch.Tensor): Per-piece slope / offset, shape (H,).
        tau (torch.Tensor): Linear penalty on sigma, shape (H,).
        X_train (torch.Tensor): Features, shape (n, d). Its dtype and
            device are used for every matrix built here.
        y_train (torch.Tensor): Targets, shape (n,).
        lambda_reg (float): Quadratic weight on beta.

    Returns:
        tuple: (Q, p, G, h), each with a leading batch dimension of 1:
            Q (1, m, m), p (1, m), G (1, r, m), h (1, r) where
            m = d + L*n + 2*H*n and r = 2*L*n + 2*H*n + d.
    """
    n, d = X_train.shape
    L = U.shape[0]
    H = S.shape[0]

    # Start offset of each variable block inside z.
    pi_off = d
    theta_off = pi_off + L * n
    sigma_off = theta_off + H * n
    total_vars = sigma_off + H * n  # [beta, pi, theta, sigma]

    tensor_opts = {"dtype": X_train.dtype, "device": X_train.device}

    # Diagonal Q; the small ridge keeps it positive definite even on the
    # pi block, which has no quadratic term of its own.
    eps = 1e-4
    Q_diag = torch.zeros(total_vars, **tensor_opts)
    Q_diag[:d] = lambda_reg      # regularisation on beta
    Q_diag[theta_off:] = 1.0     # theta and sigma
    Q = torch.diag(Q_diag + eps).unsqueeze(0)

    # Linear term. sigma_hi sits at offset h*n + i, so every tau_h must be
    # repeated n times consecutively (repeat_interleave). The previous
    # tau.repeat(n) tiled tau as tau[k % H] and therefore attached the
    # wrong penalty to sigma whenever tau was not constant.
    p = torch.zeros(total_vars, **tensor_opts)
    p[sigma_off:] = tau.repeat_interleave(n)
    p = p.unsqueeze(0)

    # Inequality constraints G z <= h. Both start at zero, so only the
    # non-zero entries are written below.
    G_rows = 2 * L * n + 2 * H * n + d
    G = torch.zeros(G_rows, total_vars, **tensor_opts)
    h_val = torch.zeros(G_rows, **tensor_opts)

    row_idx = 0

    # (1) U_l * x_i^T beta - pi_li <= U_l * y_i + V_l
    for i in range(n):
        for l in range(L):
            G[row_idx, :d] = U[l] * X_train[i]
            G[row_idx, pi_off + l * n + i] = -1.0
            h_val[row_idx] = U[l] * y_train[i] + V[l]
            row_idx += 1

    # (2) pi_li >= 0
    for i in range(n):
        for l in range(L):
            G[row_idx, pi_off + l * n + i] = -1.0
            row_idx += 1

    # (3) S_h * x_i^T beta - theta_hi - sigma_hi <= S_h * y_i + T_h
    for i in range(n):
        for h_ in range(H):
            G[row_idx, :d] = S[h_] * X_train[i]
            G[row_idx, theta_off + h_ * n + i] = -1.0
            G[row_idx, sigma_off + h_ * n + i] = -1.0
            h_val[row_idx] = S[h_] * y_train[i] + T[h_]
            row_idx += 1

    # (4) sigma_hi >= 0
    for i in range(n):
        for h_ in range(H):
            G[row_idx, sigma_off + h_ * n + i] = -1.0
            row_idx += 1

    # (5) beta_j >= 0
    for j in range(d):
        G[row_idx, j] = -1.0
        row_idx += 1

    G = G.unsqueeze(0)
    h = h_val.unsqueeze(0)

    return Q, p, G, h

def solve_inner_qpth(U, V, S, T, tau, X_train, y_train, lambda_reg):
    """
    Solve the inner QP and return its beta block.

    The QP matrices come from build_qp_matrices; the problem is then handed
    to qpth's QPFunction and the first n_features entries of the solution
    vector are returned.

    Args:
        U (torch.Tensor): Constraint slopes, shape (L,).
        V (torch.Tensor): Constraint offsets, shape (L,).
        S (torch.Tensor): Constraint slopes, shape (H,).
        T (torch.Tensor): Constraint offsets, shape (H,).
        tau (torch.Tensor): Linear penalties on the sigma slacks, shape (H,).
        X_train (torch.Tensor): Training features, shape (n_samples, n_features).
        y_train (torch.Tensor): Training targets, shape (n_samples,).
        lambda_reg (float): Quadratic weight on beta.

    Returns:
        torch.Tensor: beta part of the QP solution, shape (n_features,).

    Notes:
        - Only inequality constraints are used; empty tensors are passed in
          the fifth and sixth QPFunction arguments.
        - The empty tensors are created on X_train's device.
    """
    n_features = X_train.shape[1]
    Q, p, G, h = build_qp_matrices(U, V, S, T, tau, X_train, y_train, lambda_reg)

    # Placeholders for the (absent) equality constraints.
    eq_lhs = torch.empty(0, device=X_train.device)
    eq_rhs = torch.empty(0, device=X_train.device)

    qp_layer = QPFunction(verbose=False)
    solution = qp_layer(Q, p, G, h, eq_lhs, eq_rhs)

    # Batch size is 1: keep the leading n_features columns, drop the batch dim.
    return solution[:, :n_features].squeeze(0)

################################################
#   2) 外层训练 (带Train/Val) + 梯度计算
################################################

def compute_outer_gradients(X_train, y_train,
                            X_val,   y_val,
                            U, V, S, T, tau,
                            lambda_reg):
    """
    Evaluate the validation loss and its gradients w.r.t. U, V, S, T.

    beta is obtained from solve_inner_qpth on the training data, the outer
    loss is the validation MSE of X_val @ beta, and a backward pass
    populates the .grad fields of the hyperparameters.

    Args:
        X_train (torch.Tensor): Training features, shape (n_train_samples, n_features).
        y_train (torch.Tensor): Training targets, shape (n_train_samples,).
        X_val (torch.Tensor): Validation features, shape (n_val_samples, n_features).
        y_val (torch.Tensor): Validation targets, shape (n_val_samples,).
        U, V (torch.Tensor): Hyperparameters of shape (L,), expected to require grad.
        S, T (torch.Tensor): Hyperparameters of shape (H,), expected to require grad.
        tau (torch.Tensor): Slack penalties, shape (H,).
        lambda_reg (float): Regularization strength passed to the inner QP.

    Returns:
        dict: Keys, in this order:
            - "beta_opt": Detached copy of the inner solution, shape (n_features,).
            - "loss": Validation MSE as a Python float.
            - "U_grad", "V_grad", "S_grad", "T_grad": Copies of the gradients;
              a zero tensor of matching shape when .grad is None.

    Notes:
        - Outer loss: (1 / n_val) * sum((y_val - X_val @ beta_opt) ** 2).
        - Side effect: every non-None .grad of U, V, S, T is zeroed in place
          after it has been copied.
    """
    # Inner problem.
    beta_opt = solve_inner_qpth(U, V, S, T, tau, X_train, y_train, lambda_reg)

    # Outer (validation) loss.
    residual = y_val - X_val @ beta_opt
    inv_n_val = 1.0 / X_val.shape[0]
    loss_outer = inv_n_val * residual.pow(2).sum()

    loss_outer.backward()

    tracked = (("U_grad", U), ("V_grad", V), ("S_grad", S), ("T_grad", T))

    # First pass: snapshot every gradient (zeros when autograd produced none).
    grads = {}
    for key, param in tracked:
        if param.grad is None:
            grads[key] = torch.zeros_like(param)
        else:
            grads[key] = param.grad.clone()

    # Second pass: reset the accumulated gradients for the next call.
    for _, param in tracked:
        if param.grad is not None:
            param.grad.zero_()

    outcome = {
        "beta_opt": beta_opt.detach().clone(),
        "loss": loss_outer.item(),
    }
    outcome.update(grads)
    return outcome

def train_hyperparams(X_train, y_train,
                      X_val,   y_val,
                      U, V, S, T, tau,
                      lambda_reg,
                      lr=1e-2,
                      outer_steps=50):
    """
    Run plain gradient descent on U, V, S, T against the validation MSE.

    Each step asks compute_outer_gradients for the current loss and
    gradients, then updates the four tensors in place.

    Args:
        X_train (torch.Tensor): Training features, shape (n_train_samples, n_features).
        y_train (torch.Tensor): Training targets, shape (n_train_samples,).
        X_val (torch.Tensor): Validation features, shape (n_val_samples, n_features).
        y_val (torch.Tensor): Validation targets, shape (n_val_samples,).
        U, V (torch.Tensor): Hyperparameters of shape (L,), updated in place.
        S, T (torch.Tensor): Hyperparameters of shape (H,), updated in place.
        tau (torch.Tensor): Slack penalties, shape (H,); not updated here.
        lambda_reg (float): Regularization parameter for the inner QP.
        lr (float, optional): Step size. Defaults to 1e-2.
        outer_steps (int, optional): Number of descent steps. Defaults to 50.

    Returns:
        tuple: (U, V, S, T, loss_history)
            - U, V, S, T (torch.Tensor): The same tensors, after the updates.
            - loss_history (list): Loss reported at each step, measured
              before that step's update is applied.

    Notes:
        - Prints the loss on every 10th step.
    """
    params = (U, V, S, T)
    grad_keys = ("U_grad", "V_grad", "S_grad", "T_grad")
    loss_history = []

    for step in range(outer_steps):
        outcome = compute_outer_gradients(X_train, y_train,
                                          X_val,   y_val,
                                          U, V, S, T, tau,
                                          lambda_reg)
        loss_val = outcome["loss"]

        # In-place descent step, kept out of the autograd graph.
        with torch.no_grad():
            for param, key in zip(params, grad_keys):
                param.sub_(lr * outcome[key])

        # Keep the hyperparameters differentiable for the next step.
        for param in params:
            param.requires_grad_(True)

        loss_history.append(loss_val)
        if (step+1) % 10 == 0:
            print(f"[outer step {step+1}/{outer_steps}] Val MSE loss = {loss_val:.6f}")

    return U, V, S, T, loss_history

################################################
#    3) 辅助: 评估/打印
################################################

def evaluate_and_print(X, y, beta_est, beta_true=None, label=""):
    """
    Print MSE/MAE of the predictions X @ beta_est against y.

    When beta_true is given, the MSE and MAE between beta_est and beta_true
    are printed as well. Nothing is returned.

    Args:
        X (torch.Tensor): Feature matrix, shape (n_samples, n_features).
        y (torch.Tensor): Target vector, shape (n_samples,).
        beta_est (torch.Tensor): Estimated coefficients, shape (n_features,).
        beta_true (torch.Tensor, optional): Reference coefficients, shape
            (n_features,). Defaults to None, in which case the coefficient
            error line is skipped. (Previously this argument had no default
            even though it was documented and handled as optional.)
        label (str, optional): Prefix for every printed line. Defaults to "".

    Notes:
        - All computations run under torch.no_grad().
        - The label is always followed by a space, so an empty label yields
          lines that start with a space.
    """
    with torch.no_grad():
        residual = X @ beta_est - y
        mse = (residual**2).mean().item()
        mae = residual.abs().mean().item()
        print(f"{label} MSE: {mse:.6f}")
        print(f"{label} MAE: {mae:.6f}")
        if beta_true is not None:
            beta_diff = beta_est - beta_true
            beta_mse = (beta_diff**2).mean().item()
            beta_mae = beta_diff.abs().mean().item()
            print(f"{label} Beta MSE: {beta_mse:.6f}, Beta MAE: {beta_mae:.6f}")

################################################
#    4) 核心功能：生成 Laplace 数据 & 训练循环
################################################

def generate_laplace_data(n, d, scale=1.0, seed=42, device="cpu"):
    """
    Generate a synthetic linear-regression dataset with Laplace noise.

    Draws, in this order after seeding the global torch RNG:
      - beta_true = 10 * Uniform(0, 1) samples, shape (d,)
      - X from a standard normal, shape (n, d)
      - eps from Laplace(0, scale), shape (n,)
    and sets y = X @ beta_true + eps.

    Args:
        n (int): Number of samples.
        d (int): Number of features.
        scale (float, optional): Scale of the Laplace noise. Defaults to 1.0.
        seed (int, optional): Value passed to torch.manual_seed. Defaults to 42.
        device (str or torch.device, optional): Device for beta_true and X;
            the noise is sampled first and then moved there. Defaults to "cpu".

    Returns:
        tuple: (X, y, beta_true).
    """
    from torch.distributions import Laplace

    # Seeding the global generator makes the three draws below reproducible.
    torch.manual_seed(seed)

    beta_true = torch.rand(d, device=device).mul(10)
    X = torch.randn(n, d, device=device)
    eps = Laplace(0.0, scale).sample((n,)).to(device)

    y = torch.matmul(X, beta_true) + eps
    return X, y, beta_true

def run_laplace_experiment(n=200, d=5,
                           L=2, H=2,
                           lambda_reg=0.1,
                           outer_steps=10,
                           lr=1e-2,
                           n_iter=20,
                           N_train=150,
                           seed=42,
                           device=None):
    """
    Run a complete experiment on Laplace-noise data.

    Steps:
      1) Generate the data with generate_laplace_data.
      2) Split it: the first N_train rows train, the remaining rows validate.
      3) Draw initial U, V, S, T from a standard normal; tau is fixed to ones.
      4) Repeat n_iter times: run train_hyperparams for outer_steps steps,
         re-solve the inner QP for beta, and print validation and training
         metrics.

    Args:
        n (int, optional): Total number of samples. Defaults to 200.
        d (int, optional): Number of features. Defaults to 5.
        L (int, optional): Length of U and V. Defaults to 2.
        H (int, optional): Length of S, T and tau. Defaults to 2.
        lambda_reg (float, optional): Inner-QP weight on beta. Defaults to 0.1.
        outer_steps (int, optional): Gradient steps per round. Defaults to 10.
        lr (float, optional): Learning rate. Defaults to 1e-2.
        n_iter (int, optional): Number of training rounds. Defaults to 20.
        N_train (int, optional): Split index between train and validation
            rows. Defaults to 150.
        seed (int, optional): Seed passed to generate_laplace_data. Defaults to 42.
        device (torch.device, optional): Target device; when None, CUDA is
            used if available, otherwise CPU.

    Returns:
        tuple: (U, V, S, T, beta_opt). beta_opt is the inner-QP solution for
        the returned hyperparameters. When n_iter <= 0 no training round
        runs and beta_opt is solved once for the initial hyperparameters
        (previously this case raised UnboundLocalError).
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # 1) Generate the data.
    print("\n==== Generating Laplace Noise Data ====")
    X, y, beta_true = generate_laplace_data(n, d, scale=1.0, seed=seed, device=device)

    # 2) Train / validation split.
    X_train, y_train = X[:N_train], y[:N_train]
    X_val,   y_val   = X[N_train:], y[N_train:]

    # 3) Initialise the hyperparameters.
    U = torch.randn(L, device=device, requires_grad=True)
    V = torch.randn(L, device=device, requires_grad=True)
    S = torch.randn(H, device=device, requires_grad=True)
    T = torch.randn(H, device=device, requires_grad=True)
    tau = torch.ones(H, device=device, requires_grad=False)  # Usually fixed

    # 4) Training rounds; each one runs train_hyperparams again.
    beta_opt = None
    for it in range(n_iter):
        print(f"\n--- Laplace Iteration {it+1}/{n_iter} ---")
        U, V, S, T, loss_hist = train_hyperparams(
            X_train, y_train,
            X_val,   y_val,
            U, V, S, T, tau,
            lambda_reg=lambda_reg,
            lr=lr,
            outer_steps=outer_steps
        )
        # After each round, solve for beta under the updated hyperparameters.
        beta_opt = solve_inner_qpth(U, V, S, T, tau, X_train, y_train, lambda_reg)

        print(f"[Validation set] ", end="")
        evaluate_and_print(X_val, y_val, beta_opt, beta_true, label="Val Autoloss")

        print(f"[Train set] ", end="")
        evaluate_and_print(X_train, y_train, beta_opt, beta_true, label="Train Autoloss")

    if beta_opt is None:
        # No round ran (n_iter <= 0): still return a beta that matches the
        # returned hyperparameters instead of failing on an unbound name.
        beta_opt = solve_inner_qpth(U, V, S, T, tau, X_train, y_train, lambda_reg)

    return U, V, S, T, beta_opt

################################################
#  5) 主函数 (可选), 演示
################################################

def main():
    """Demo entry point: run one Laplace experiment with fixed settings."""
    experiment_settings = {
        "n": 200,
        "d": 5,
        "L": 2,
        "H": 2,
        "lambda_reg": 0.1,
        "outer_steps": 10,
        "lr": 1e-2,
        "n_iter": 5,  # 5 rounds, each running outer_steps=10 gradient steps
        "N_train": 150,
        "seed": 42,
    }
    # The returned hyperparameters and beta are not used by this demo.
    run_laplace_experiment(**experiment_settings)
    print("\nDone. Final hyperparams and beta have been obtained.")


if __name__ == "__main__":
    main()


Using device: cuda

==== Generating Laplace Noise Data ====

--- Laplace Iteration 1/5 ---
[outer step 10/10] Val MSE loss = 1.147810
[Validation set] Val Autoloss MSE: 1.147461
Val Autoloss MAE: 0.761749
Val Autoloss Beta MSE: 0.041236, Beta MAE: 0.165974
[Train set] Train Autoloss MSE: 2.374356
Train Autoloss MAE: 1.123015
Train Autoloss Beta MSE: 0.041236, Beta MAE: 0.165974

--- Laplace Iteration 2/5 ---
[outer step 10/10] Val MSE loss = 1.144650
[Validation set] Val Autoloss MSE: 1.144608
Val Autoloss MAE: 0.760095
Val Autoloss Beta MSE: 0.040721, Beta MAE: 0.166360
[Train set] Train Autoloss MSE: 2.371023
Train Autoloss MAE: 1.121967
Train Autoloss Beta MSE: 0.040721, Beta MAE: 0.166360

--- Laplace Iteration 3/5 ---
[outer step 10/10] Val MSE loss = 1.142888
[Validation set] Val Autoloss MSE: 1.142743
Val Autoloss MAE: 0.758937
Val Autoloss Beta MSE: 0.040346, Beta MAE: 0.166572
[Train set] Train Autoloss MSE: 2.368653
Train Autoloss MAE: 1.121201
Train Autoloss Beta MSE: 0.0403