In [1]:
import torch
from qpth.qp import QPFunction

# 参数定义
n, d = 10, 3  # 数据点数和模型参数维度
L, H = 2, 2   # ReLU 和 ReHU 项数
lambda_reg = 0.1  # Ridge 正则化参数

# 生成示例数据
X = torch.randn(n, d)  # 输入数据，形状 (n, d)
y = torch.randn(n)     # 目标值，形状 (n,)

# 固定 beta（假设已通过内层优化计算）
beta_star = torch.abs(torch.randn(d))  # 确保 beta >= 0

# 初始化超参数（u_l, v_l, s_h, t_h）
U = torch.randn(L, requires_grad=True)  # u_l，形状 (L,)
V = torch.randn(L, requires_grad=True)  # v_l，形状 (L,)
S = torch.randn(H, requires_grad=True)  # s_h，形状 (H,)
T = torch.randn(H, requires_grad=True)  # t_h，形状 (H,)
tau = torch.ones(H, requires_grad=False)  # T_h 固定为 1，形状 (H,)

def solve_inner_qpth(U, V, S, T, tau, X, y, lambda_reg):
    """
    重建内层 QP，用于计算 beta 对超参数的隐式导数
    """
    # QP 参数
    total_vars = d + L * n + H * n + H * n  # [beta, pi, theta, sigma]
    Q = torch.zeros(total_vars)
    Q[:d] = lambda_reg  # beta 的 Ridge 正则化
    Q[d + L * n:d + L * n + H * n] = 1.0  # theta 的二次项
    Q = torch.diag(Q).unsqueeze(0)
    
    p = torch.zeros(total_vars)
    tau_expanded = tau.repeat(n)  # 形状 (H * n,)
    p[d + L * n + H * n:] = tau_expanded  # sigma 的线性项
    p = p.unsqueeze(0)
    
    # 约束 G z <= h
    G_rows = 2 * L * n + 2 * H * n + d  # 每样本 2L + 2H + d 个约束
    G = torch.zeros(G_rows, total_vars)
    h_values = torch.zeros(G_rows)  # 临时存储 h 的值
    
    # pi_li >= u_l (y_i - x_i^T beta) + v_l
    idx = 0
    for i in range(n):
        for l in range(L):
            G[idx, :d] = U[l] * X[i]  # u_l x_i^T beta，保持计算图
            G[idx, d + l * n + i] = -1.0  # -pi_li
            h_values[idx] = U[l] * y[i] + V[l]  # u_l y_i + v_l，保持计算图
            idx += 1
    
    # pi_li >= 0
    for i in range(n):
        for l in range(L):
            G[idx, d + l * n + i] = -1.0  # -pi_li <= 0
            h_values[idx] = 0.0
            idx += 1
    
    # theta_hi + sigma_hi >= s_h (y_i - x_i^T beta) + t_h
    for i in range(n):
        for h in range(H):
            G[idx, :d] = S[h] * X[i]  # s_h x_i^T beta，保持计算图
            G[idx, d + L * n + h * n + i] = -1.0  # -theta_hi
            G[idx, d + L * n + H * n + h * n + i] = -1.0  # -sigma_hi
            h_values[idx] = S[h] * y[i] + T[h]  # s_h y_i + t_h，保持计算图
            idx += 1
    
    # sigma_hi >= 0
    for i in range(n):
        for h in range(H):
            G[idx, d + L * n + H * n + h * n + i] = -1.0  # -sigma_hi <= 0
            h_values[idx] = 0.0
            idx += 1
    
    # beta_j >= 0
    for j in range(d):
        G[idx, j] = -1.0  # -beta_j <= 0
        h_values[idx] = 0.0
        idx += 1
    
    G = G.unsqueeze(0)
    h = h_values.unsqueeze(0)  # 转换为 (1, G_rows) 形状
    
    # 确保 Q 是 SPD
    Q += 1e-4 * torch.eye(total_vars).unsqueeze(0)
    
    # 使用 qpth 求解
    z = QPFunction(verbose=False)(Q, p, G, h, torch.Tensor(), torch.Tensor())
    beta = z[:, :d].squeeze(0)  # 提取 beta
    return beta

def compute_outer_gradients(beta_star, X, y, U, V, S, T, tau, lambda_reg):
    """
    固定 beta_star，计算外层损失对超参数的导数
    返回：U, V, S, T 的梯度和外层损失值
    """
    # 重建内层 QP，确保 beta 对超参数的依赖
    beta_opt = solve_inner_qpth(U, V, S, T, tau, X, y, lambda_reg)
    
    # 外层损失（MSE）
    y_pred = X @ beta_opt
    L_outer = (1/n) * (y - y_pred).pow(2).sum()
    
    # 计算外层损失的梯度（隐式微分）
    L_outer.backward()
    
    # 提取超参数梯度
    U_grad = U.grad.clone() if U.grad is not None else torch.zeros_like(U)
    V_grad = V.grad.clone() if V.grad is not None else torch.zeros_like(V)
    S_grad = S.grad.clone() if S.grad is not None else torch.zeros_like(S)
    T_grad = T.grad.clone() if T.grad is not None else torch.zeros_like(T)
    
    # 清零梯度（添加保护）
    if U.grad is not None:
        U.grad.zero_()
    if V.grad is not None:
        V.grad.zero_()
    if S.grad is not None:
        S.grad.zero_()
    if T.grad is not None:
        T.grad.zero_()
    
    return U_grad, V_grad, S_grad, T_grad, L_outer.item()

# 简单实验：初始化超参数，计算导数，进行一步梯度下降
if __name__ == "__main__":
    # 初始超参数
    print("初始超参数：")
    print(f"U: {U}")
    print(f"V: {V}")
    print(f"S: {S}")
    print(f"T: {T}")
    
    # 计算外层损失对超参数的导数
    U_grad, V_grad, S_grad, T_grad, outer_loss = compute_outer_gradients(
        beta_star, X, y, U, V, S, T, tau, lambda_reg
    )
    
    # 打印导数
    print("\n超参数导数：")
    print(f"U_grad: {U_grad}")
    print(f"V_grad: {V_grad}")
    print(f"S_grad: {S_grad}")
    print(f"T_grad: {T_grad}")
    print(f"外层损失 (MSE): {outer_loss:.4f}")
    
    # 进行一步梯度下降
    learning_rate = 0.01
    with torch.no_grad():
        U_new = U - learning_rate * U_grad
        V_new = V - learning_rate * V_grad
        S_new = S - learning_rate * S_grad
        T_new = T - learning_rate * T_grad
    
    # 更新超参数
    U = U_new.clone().requires_grad_(True)
    V = V_new.clone().requires_grad_(True)
    S = S_new.clone().requires_grad_(True)
    T = T_new.clone().requires_grad_(True)
    
    # 打印优化后的超参数
    print("\n优化后超参数（一步梯度下降）：")
    print(f"U: {U}")
    print(f"V: {V}")
    print(f"S: {S}")
    print(f"T: {T}")
    
    # 验证 beta 是否保持不变（重新计算 QP 解）
    beta_opt = solve_inner_qpth(U, V, S, T, tau, X, y, lambda_reg)
    print(f"\n重新计算的 beta: {beta_opt}")
    print(f"原 beta_star: {beta_star}")
    print(f"约束满足情况 (beta >= 0): {torch.all(beta_opt >= 0)}")

初始超参数：
U: tensor([-2.6461,  1.8845], requires_grad=True)
V: tensor([-0.5992, -0.2085], requires_grad=True)
S: tensor([-1.1804, -1.2868], requires_grad=True)
T: tensor([-0.4024, -0.3870], requires_grad=True)

超参数导数：
U_grad: tensor([ 5.4043e-05, -7.2844e-05])
V_grad: tensor([-1.0317e-05,  9.5653e-06])
S_grad: tensor([-0.0459, -0.0548])
T_grad: tensor([0.0507, 0.0550])
外层损失 (MSE): 1.3661

优化后超参数（一步梯度下降）：
U: tensor([-2.6461,  1.8845], requires_grad=True)
V: tensor([-0.5992, -0.2085], requires_grad=True)
S: tensor([-1.1799, -1.2862], requires_grad=True)
T: tensor([-0.4029, -0.3876], requires_grad=True)

重新计算的 beta: tensor([5.1909e-09, 9.3970e-09, 4.4165e-01], grad_fn=<SqueezeBackward1>)
原 beta_star: tensor([1.1445, 0.4805, 1.3974])
约束满足情况 (beta >= 0): True
