总体

In [None]:
import optuna
import torch
import torch.nn as nn
import numpy as np
from scipy.stats import halfnorm
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.optimize import minimize
import statsmodels.api as sm
import pandas as pd
import math
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Lasso
import seaborn as sns

# Stochastic Frontier Model类
class StochasticFrontierModel(nn.Module):
    """Linear stochastic frontier model with learnable noise parameters.

    Parameters are registered in the order ``beta``, ``log_sigma2``,
    ``log_lambda0``; code that flattens ``model.parameters()`` relies on
    that order.
    """

    def __init__(self, num_features):
        """
        :param num_features: length of the coefficient vector ``beta``
        """
        super().__init__()
        # Every coefficient starts at 1; both log-parameters start at 0.
        self.beta = nn.Parameter(torch.ones(num_features, dtype=torch.float32))
        self.log_sigma2 = nn.Parameter(torch.tensor(0.0))
        self.log_lambda0 = nn.Parameter(torch.tensor(0.0))

    def predict(self, x):
        """Return the linear predictor ``x @ beta``."""
        return x @ self.beta

    def forward(self, x, y):
        """Return the per-sample average of the loss on the batch ``(x, y)``.

        The summed loss is
        ``n/2 * log_sigma2 - sum(log(Phi(-eps * lambda / sigma) + 1e-10))
        + sum(eps**2) / (2 * sigma2)`` with ``eps = y - x @ beta``,
        ``sigma2 = exp(log_sigma2)`` and ``lambda = exp(log_lambda0)``.

        :param x: 2-D tensor with one row per sample
        :param y: 1-D tensor of targets
        :return: scalar tensor (summed loss divided by the number of rows)
        """
        n_rows = x.shape[0]
        variance = torch.exp(self.log_sigma2)
        ratio = torch.exp(self.log_lambda0)
        residual = y - (x * self.beta).sum(dim=1)

        std_normal = torch.distributions.normal.Normal(0, 1)
        z = -residual * ratio / torch.sqrt(variance)
        # The 1e-10 offset keeps the log finite when the CDF underflows to 0.
        log_cdf_total = torch.log(std_normal.cdf(z) + 1e-10).sum()

        total = n_rows * self.log_sigma2 / 2 - log_cdf_total + residual.pow(2).sum() / (2 * variance)
        return total / n_rows

# 数据生成模块
def generate_data(num_features, sigma_u, sigma_v, N, save_path='true_parameters.csv'):
    """
    Simulate a sparse linear model ``y = x @ beta + v - u`` and write it to disk.

    Two CSV files are written: the true parameters go to ``save_path`` and the
    simulated sample always goes to ``data.csv`` in the current directory.

    :param num_features: number of feature columns (six of them get a non-zero coefficient)
    :param sigma_u: scale of the non-negative half-normal term ``u``
    :param sigma_v: standard deviation of the symmetric normal term ``v``
    :param N: number of samples
    :param save_path: CSV path for the true parameters
    :return: x, y, true_beta
    """
    # Sparse coefficient vector: six randomly placed entries equal to -1 or 1.
    n_active = 6
    true_beta = np.zeros(num_features)
    active_positions = np.random.choice(num_features, n_active, replace=False)
    true_beta[active_positions] = np.random.choice([-1, 1], n_active)

    # Standard-normal features, then the two error components.
    features = np.random.normal(loc=0, scale=1, size=(N, num_features))
    symmetric_noise = np.random.normal(loc=0, scale=sigma_v, size=N)
    one_sided_noise = stats.halfnorm.rvs(loc=0, scale=sigma_u, size=N)

    response = features @ true_beta + symmetric_noise - one_sided_noise

    # Persist the ground truth: lambda = sigma_u / sigma_v and
    # sigma_sq = sigma_u^2 + sigma_v^2 (scalars are repeated on every row).
    pd.DataFrame({
        'beta': true_beta,
        'lambda': sigma_u / sigma_v,
        'sigma_sq': sigma_u**2 + sigma_v**2,
    }).to_csv(save_path, index=False)

    # Persist the sample with columns x0..x{p-1}, y.
    header = [f'x{i}' for i in range(num_features)]
    header.append('y')
    sample = np.column_stack([features, response])
    pd.DataFrame(sample, columns=header).to_csv('data.csv', index=False)

    return features, response, true_beta

# MLE 变量选择模块
def stochastic_frontier_mle_lasso_bic(x, y, r_range=np.logspace(-2, 4, 3), threshold=1e-3, max_iter=2):
    """
    Variable selection and estimation for a stochastic frontier model using an
    adaptive-Lasso-penalised maximum likelihood fit, with the penalty level
    chosen by BIC.

    For every ``r`` in ``r_range`` the model is fitted on all columns.
    Coefficients whose absolute value is below ``threshold`` are dropped and the
    model is refitted on the surviving columns, up to ``max_iter`` times, for as
    long as BIC keeps decreasing. The ``r`` with the smallest finite BIC wins.

    Corrections compared with the earlier implementation:
    * the penalty is subtracted from (not added to) the log-likelihood, so
      large coefficients are penalised instead of rewarded;
    * BIC is ``-2 * loglik + k * log(n)`` using the unpenalised log-likelihood;
    * coefficients that the pilot Lasso set to zero get the largest weight
      (``1 / threshold``) instead of a weight of zero;
    * the starting values for sigma^2 and lambda are placed in the slots the
      likelihood reads them from;
    * surviving columns are tracked by their original index, so the returned
      ``beta`` always has one entry per column of ``x`` (zero for dropped
      columns) and the MSE is computed with matching shapes.

    :param x: feature matrix, one row per observation
    :param y: target vector
    :param r_range: candidate Lasso penalty levels
    :param threshold: coefficients smaller than this in absolute value are set to 0
    :param max_iter: maximum number of screening / refit rounds per penalty level
    :return: beta (full length), lambda, sigma^2, MSE, BIC, best r
    :raises RuntimeError: if no penalty level produces a finite BIC
    """
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float).ravel()
    n_obs, n_features = x_arr.shape
    # Floor for |beta_init| when building weights; avoids division by zero.
    weight_floor = max(threshold, 1e-12)

    def log_likelihood(params, x_sub):
        # Log-likelihood without additive constants (they cancel when BICs
        # are compared). Layout of params: [beta..., sigma^2, lambda].
        coef, sigma_sq_, lambda_v = params[:-2], params[-2], params[-1]
        resid = y_arr - x_sub @ coef
        return np.sum(
            norm.logcdf(-resid * lambda_v / np.sqrt(sigma_sq_))
            - 0.5 * np.log(sigma_sq_)
            - 0.5 * resid ** 2 / sigma_sq_
        )

    def fit_columns(r, cols):
        # Fit the penalised model on the columns `cols` of x.
        # Returns (bic, beta for those columns, lambda, sigma^2).
        x_sub = x_arr[:, cols]

        # Pilot Lasso estimate used only to build the adaptive weights.
        beta_init = Lasso(alpha=r).fit(x_sub, y_arr).coef_
        weights = 1.0 / np.maximum(np.abs(beta_init), weight_floor)

        def objective(params):
            # Nelder-Mead is unconstrained: reject sigma^2 <= 0 and lambda < 0.
            if params[-2] <= 0 or params[-1] < 0:
                return np.inf
            penalty = r * np.sum(weights * np.abs(params[:-2]))
            return penalty - log_likelihood(params, x_sub)

        # Least-squares start: coefficients, residual variance, lambda = 0.5.
        ols_beta = np.linalg.lstsq(x_sub, y_arr, rcond=None)[0]
        ols_resid = y_arr - x_sub @ ols_beta
        dof = max(n_obs - len(cols), 1)
        sigma_sq_init = max(float(ols_resid @ ols_resid) / dof, 1e-8)
        init_params = np.concatenate([ols_beta, [sigma_sq_init, 0.5]])

        result = minimize(objective, init_params, method='Nelder-Mead')

        params = np.array(result.x, dtype=float)
        beta_sub = params[:-2]  # view: thresholding below also updates params
        beta_sub[np.abs(beta_sub) < threshold] = 0

        k = np.count_nonzero(beta_sub) + 2  # active coefficients + sigma^2 + lambda
        bic = -2 * log_likelihood(params, x_sub) + k * np.log(n_obs)
        return bic, beta_sub, params[-1], params[-2]

    best = None  # (bic, full-length beta, lambda, sigma^2, r)

    for r in r_range:
        cols = np.arange(n_features)
        bic, beta_sub, lambda_, sigma_sq = fit_columns(r, cols)

        for _ in range(max_iter):
            keep = np.abs(beta_sub) >= threshold
            # Nothing left, or nothing to drop (a refit would be identical).
            if not keep.any() or keep.all():
                break

            new_cols = cols[keep]
            bic_new, beta_new, lambda_new, sigma_sq_new = fit_columns(r, new_cols)

            improved = np.isfinite(bic_new) and (not np.isfinite(bic) or bic_new < bic)
            if not improved:
                break

            cols = new_cols
            bic, beta_sub, lambda_, sigma_sq = bic_new, beta_new, lambda_new, sigma_sq_new

        if np.isfinite(bic) and (best is None or bic < best[0]):
            # Scatter the reduced coefficients back to their original positions.
            beta_full = np.zeros(n_features)
            beta_full[cols] = beta_sub
            best = (bic, beta_full, lambda_, sigma_sq, r)

    if best is None:
        raise RuntimeError("no penalty level in r_range produced a finite BIC")

    best_bic, best_beta, best_lambda, best_sigma_sq, best_r = best

    # In-sample mean squared residual of the selected model.
    mse = np.mean((y_arr - x_arr @ best_beta) ** 2)

    return best_beta, best_lambda, best_sigma_sq, mse, best_bic, best_r

# 非私有和私有方法模块
def train_model_without_private(x, y, num_features, num_iters=3000, constraint=100, minibatch_size=50, threshold=1e-3):
    """
    Train the non-private model with a stochastic Frank-Wolfe-style update.

    Each iteration evaluates the loss on a random minibatch, picks the
    coordinate vertex (+/- ``constraint`` on one parameter) with the smallest
    score from ``compute_alpha`` and moves the flattened parameter vector
    towards it with step ``2 / (i + 2)``. At the end the parameters that
    produced the smallest minibatch loss are restored and small coefficients
    are zeroed.

    Fix: snapshots used to be ``model.state_dict().copy()``, a shallow copy
    whose tensors alias the live parameters. Because the update is in place,
    every stored snapshot ended up equal to the final parameters. The best
    iterate is now cloned, and it is taken before the update so that it matches
    the loss it is paired with.

    :param x: feature matrix (NumPy array); standardised via ``standardize_data``
    :param y: target vector
    :param num_features: length of the model's coefficient vector
    :param num_iters: number of iterations
    :param constraint: magnitude of the candidate vertices
    :param minibatch_size: samples drawn (with replacement) per iteration
    :param threshold: coefficients smaller than this in absolute value are set to 0
    :return: model, mean, std, last minibatch loss, ``calculate_mse(model, x, y)``
    """
    x, mean, std = standardize_data(x)
    x = torch.tensor(x, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.float32)
    model = StochasticFrontierModel(num_features=num_features)
    num_params = sum(p.numel() for p in model.parameters())

    losses = []
    best_loss = float("inf")
    best_state = None

    for i in tqdm(range(1, num_iters + 1), desc="Training without privacy"):
        batch = np.random.choice(x.shape[0], minibatch_size, replace=True)
        loss = model(x[batch], y[batch])
        loss.backward()

        loss_value = loss.item()
        losses.append(loss_value)
        if loss_value < best_loss:
            # Clone: state_dict() tensors share storage with the parameters.
            best_loss = loss_value
            best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}

        gradient = torch.cat([p.grad.flatten() for p in model.parameters()]).detach().numpy()

        # Candidate vertices of both signs; keep the one with the lowest score.
        candidates = compute_alpha(constraint, gradient, model) + compute_alpha(-constraint, gradient, model)
        _, size, corner_num = min(candidates, key=lambda item: item[0])
        corner = np.zeros(num_params)
        corner[corner_num] = size

        mu = 2 / (i + 2)
        with torch.no_grad():
            offset = 0
            for p in model.parameters():
                numel = p.numel()
                p_flat = p.view(-1)
                target = torch.tensor(corner[offset:offset + numel], dtype=torch.float32)
                p_flat.copy_((1 - mu) * p_flat + mu * target)
                offset += numel
        model.zero_grad()

    if best_state is not None:
        model.load_state_dict(best_state)

    # Post-processing: zero out coefficients below the threshold.
    with torch.no_grad():
        model.beta[torch.abs(model.beta) < threshold] = 0

    final_loss = losses[-1] if losses else float("nan")
    # NOTE(review): calculate_mse is not defined in the visible part of this
    # file - confirm it is provided elsewhere.
    return model, mean, std, final_loss, calculate_mse(model, x, y)

def train_model_private(x, y, num_features, num_iters=3000, constraint=100, minibatch_size=50, lipschitz=1, epsilon=0.1, delta=1e-5, threshold=1e-3):
    """
    Train the private model with a noisy stochastic Frank-Wolfe-style update.

    Identical to the non-private loop except that the vertex scores come from
    ``compute_alpha_private``, which adds Laplace noise of scale
    ``lipschitz * constraint * sqrt(8 * num_iters * log(1 / delta)) / (n * epsilon)``
    before the smallest score is selected.

    Fix: snapshots used to be ``model.state_dict().copy()``, a shallow copy
    whose tensors alias the live parameters. Because the update is in place,
    every stored snapshot ended up equal to the final parameters. The best
    iterate is now cloned, and it is taken before the update so that it matches
    the loss it is paired with.

    :param x: feature matrix (NumPy array); standardised via ``standardize_data``
    :param y: target vector
    :param num_features: length of the model's coefficient vector
    :param num_iters: number of iterations
    :param constraint: magnitude of the candidate vertices
    :param minibatch_size: samples drawn (with replacement) per iteration
    :param lipschitz: constant used in the noise scale
    :param epsilon: privacy parameter used in the noise scale
    :param delta: privacy parameter used in the noise scale
    :param threshold: coefficients smaller than this in absolute value are set to 0
    :return: model, mean, std, last minibatch loss, ``calculate_mse(model, x, y)``
    """
    x, mean, std = standardize_data(x)
    x = torch.tensor(x, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.float32)
    model = StochasticFrontierModel(num_features=num_features)
    n = x.shape[0]
    m = sum(p.numel() for p in model.parameters())
    noise_para = lipschitz * constraint * math.sqrt(8 * num_iters * math.log(1 / delta)) / (n * epsilon)

    losses = []
    best_loss = float("inf")
    best_state = None

    for i in tqdm(range(1, num_iters + 1), desc="Training with privacy"):
        batch = np.random.choice(x.shape[0], minibatch_size, replace=True)
        loss = model(x[batch], y[batch])
        loss.backward()

        loss_value = loss.item()
        losses.append(loss_value)
        if loss_value < best_loss:
            # Clone: state_dict() tensors share storage with the parameters.
            best_loss = loss_value
            best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}

        gradient = torch.cat([p.grad.flatten() for p in model.parameters()]).detach().numpy()

        # Noisy scores for vertices of both signs; keep the lowest one.
        candidates = (compute_alpha_private(constraint, gradient, model, noise_para)
                      + compute_alpha_private(-constraint, gradient, model, noise_para))
        _, size, corner_num = min(candidates, key=lambda item: item[0])
        corner = np.zeros(m)
        corner[corner_num] = size

        mu = 2 / (i + 2)
        with torch.no_grad():
            offset = 0
            for p in model.parameters():
                numel = p.numel()
                p_flat = p.view(-1)
                target = torch.tensor(corner[offset:offset + numel], dtype=torch.float32)
                p_flat.copy_((1 - mu) * p_flat + mu * target)
                offset += numel
        model.zero_grad()

    if best_state is not None:
        model.load_state_dict(best_state)

    # Post-processing: zero out coefficients below the threshold.
    with torch.no_grad():
        model.beta[torch.abs(model.beta) < threshold] = 0

    final_loss = losses[-1] if losses else float("nan")
    # NOTE(review): calculate_mse is not defined in the visible part of this
    # file - confirm it is provided elsewhere.
    return model, mean, std, final_loss, calculate_mse(model, x, y)

# 实验模块
def run_experiments(N_values, epsilon_values, num_repeats=2):
    """
    Run the MLE, non-private and private estimators and save one row per run.

    Reads ``num_features``, ``sigma_u`` and ``sigma_v`` from module-level
    globals. For every sample size a data set is generated and written to
    ``data_{N}.csv``; all result rows are written to ``result.csv`` at the end.

    :param N_values: sample sizes to simulate
    :param epsilon_values: privacy budgets passed to the private trainer
    :param num_repeats: accepted but not used by this function
    """
    def build_row(n, eps, method, beta, lam, sigma_sq, mse, loss, truth, hyperparams):
        # One result record; the key order fixes the CSV column order.
        sse = calculate_sse(beta, truth)
        fp = calculate_fp(beta, truth)
        sr = calculate_sr(beta, truth)
        return {
            'N': n,
            'epsilon': eps,
            'method': method,
            'beta': beta,
            'lambda': lam,
            'sigma_sq': sigma_sq,
            'mse': mse,
            'loss': loss,
            'sse': sse,
            'fp': fp,
            'sr': sr,
            'hyperparams': hyperparams,
        }

    rows = []
    for N in N_values:
        # Simulate the data; the true parameters go to a per-N CSV file.
        x, y, true_beta = generate_data(
            num_features, sigma_u, sigma_v, N, save_path=f'true_parameters_N{N}.csv'
        )

        # 80/20 split; only the training part is used below.
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

        column_names = [f'x{i}' for i in range(num_features)] + ['y']
        full_sample = np.hstack([x, y.reshape(-1, 1)])
        pd.DataFrame(full_sample, columns=column_names).to_csv(f'data_{N}.csv', index=False)

        # Penalised MLE baseline.
        beta_mle, lambda_mle, sigma_sq_mle, mse_mle, bic_mle, best_r = stochastic_frontier_mle_lasso_bic(x_train, y_train)
        rows.append(build_row(N, None, 'mle', beta_mle, lambda_mle, sigma_sq_mle,
                              mse_mle, None, true_beta, {'best_r': best_r}))

        # Non-private and private trainers, once per privacy budget.
        for epsilon in epsilon_values:
            for method in ('without_private', 'private'):
                if method != 'private':
                    model, _, _, loss, mse = train_model_without_private(x_train, y_train, num_features)
                else:
                    model, _, _, loss, mse = train_model_private(x_train, y_train, num_features, epsilon=epsilon)

                beta_hat = model.beta.detach().numpy()
                lambda_hat = torch.exp(model.log_lambda0).item()
                sigma_sq_hat = torch.exp(model.log_sigma2).item()

                rows.append(build_row(N, epsilon, method, beta_hat, lambda_hat, sigma_sq_hat,
                                      mse, loss, true_beta, {}))

    # Save all results to CSV.
    pd.DataFrame(rows).to_csv('result.csv', index=False)

# 主程序
if __name__ == "__main__":
    # Simulation settings. run_experiments() reads num_features, sigma_u and
    # sigma_v as module-level globals, so these names must not be changed.
    # num_features: number of features; sigma_u / sigma_v: scales of the
    # non-negative and the symmetric noise terms.
    num_features, sigma_u, sigma_v = 100, 0.3, 0.5

    # Sample sizes and privacy budgets to sweep over.
    N_values, epsilon_values = [500], [1]

    # Run the experiments.
    run_experiments(N_values, epsilon_values)

   

# 模块分割

In [1]:
import optuna
import torch
import torch.nn as nn
import numpy as np
from scipy.stats import halfnorm
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.optimize import minimize
import statsmodels.api as sm
import pandas as pd
import math
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Lasso
import seaborn as sns

# Stochastic Frontier Model类
class StochasticFrontierModel(nn.Module):
    """Linear stochastic frontier model with learnable noise parameters.

    Parameters are registered in the order ``beta``, ``log_sigma2``,
    ``log_lambda0``; code that flattens ``model.parameters()`` relies on
    that order.
    """

    def __init__(self, num_features):
        """
        :param num_features: length of the coefficient vector ``beta``
        """
        super().__init__()
        # Every coefficient starts at 1; both log-parameters start at 0.
        self.beta = nn.Parameter(torch.ones(num_features, dtype=torch.float32))
        self.log_sigma2 = nn.Parameter(torch.tensor(0.0))
        self.log_lambda0 = nn.Parameter(torch.tensor(0.0))

    def predict(self, x):
        """Return the linear predictor ``x @ beta``."""
        return x @ self.beta

    def forward(self, x, y):
        """Return the per-sample average of the loss on the batch ``(x, y)``.

        The summed loss is
        ``n/2 * log_sigma2 - sum(log(Phi(-eps * lambda / sigma) + 1e-10))
        + sum(eps**2) / (2 * sigma2)`` with ``eps = y - x @ beta``,
        ``sigma2 = exp(log_sigma2)`` and ``lambda = exp(log_lambda0)``.

        :param x: 2-D tensor with one row per sample
        :param y: 1-D tensor of targets
        :return: scalar tensor (summed loss divided by the number of rows)
        """
        n_rows = x.shape[0]
        variance = torch.exp(self.log_sigma2)
        ratio = torch.exp(self.log_lambda0)
        residual = y - (x * self.beta).sum(dim=1)

        std_normal = torch.distributions.normal.Normal(0, 1)
        z = -residual * ratio / torch.sqrt(variance)
        # The 1e-10 offset keeps the log finite when the CDF underflows to 0.
        log_cdf_total = torch.log(std_normal.cdf(z) + 1e-10).sum()

        total = n_rows * self.log_sigma2 / 2 - log_cdf_total + residual.pow(2).sum() / (2 * variance)
        return total / n_rows

# 数据生成模块
def generate_data(num_features, sigma_u, sigma_v, N, save_path='true_parameters.csv'):
    """
    Simulate a sparse linear model ``y = x @ beta + v - u`` and write it to disk.

    Two CSV files are written: the true parameters go to ``save_path`` and the
    simulated sample always goes to ``data.csv`` in the current directory.

    :param num_features: number of feature columns (six of them get a non-zero coefficient)
    :param sigma_u: scale of the non-negative half-normal term ``u``
    :param sigma_v: standard deviation of the symmetric normal term ``v``
    :param N: number of samples
    :param save_path: CSV path for the true parameters
    :return: x, y, true_beta
    """
    # Sparse coefficient vector: six randomly placed entries equal to -1 or 1.
    n_active = 6
    true_beta = np.zeros(num_features)
    active_positions = np.random.choice(num_features, n_active, replace=False)
    true_beta[active_positions] = np.random.choice([-1, 1], n_active)

    # Standard-normal features, then the two error components.
    features = np.random.normal(loc=0, scale=1, size=(N, num_features))
    symmetric_noise = np.random.normal(loc=0, scale=sigma_v, size=N)
    one_sided_noise = stats.halfnorm.rvs(loc=0, scale=sigma_u, size=N)

    response = features @ true_beta + symmetric_noise - one_sided_noise

    # Persist the ground truth: lambda = sigma_u / sigma_v and
    # sigma_sq = sigma_u^2 + sigma_v^2 (scalars are repeated on every row).
    pd.DataFrame({
        'beta': true_beta,
        'lambda': sigma_u / sigma_v,
        'sigma_sq': sigma_u**2 + sigma_v**2,
    }).to_csv(save_path, index=False)

    # Persist the sample with columns x0..x{p-1}, y.
    header = [f'x{i}' for i in range(num_features)]
    header.append('y')
    sample = np.column_stack([features, response])
    pd.DataFrame(sample, columns=header).to_csv('data.csv', index=False)

    return features, response, true_beta


In [2]:
# # MLE 变量选择模块
# def stochastic_frontier_mle_lasso_bic(x, y, r_range=np.logspace(-2, 4, 3), threshold=1e-3, max_iter=2):
#     """
#     使用自适应 Lasso 惩罚的随机前沿模型进行变量选择和参数估计。
#     :param x: 特征矩阵
#     :param y: 目标变量
#     :param r_range: Lasso 惩罚参数的范围
#     :param threshold: 系数阈值，小于该值的系数设为 0
#     :param max_iter: 最大迭代次数
#     :return: 估计的 beta, lambda, sigma^2, MSE, BIC, 最优 r
#     """
#     x_df = pd.DataFrame(x)
#     y_series = pd.Series(y)

#     def calculate_bic(r, x_df, y_series):
#         # 第一次 Lasso 估计
#         lasso = Lasso(alpha=r)
#         lasso.fit(x_df, y_series)
#         beta_init = lasso.coef_

#         # 计算自适应 Lasso 权重
#         w = np.zeros_like(beta_init)
#         for j in range(len(beta_init)):
#             if np.abs(beta_init[j]) > threshold:
#                 w[j] = 1 / np.abs(beta_init[j])
#             else:
#                 w[j] = 0

#         def logLikFun(param):
#             parlab = param[:-2]
#             parsigmaSq = param[-2]
#             parlambda = param[-1]
#             epsilon = y_series - np.dot(x_df, parlab)
#             penalty = r * np.sum(w * np.abs(parlab))
#             return -np.sum(0.5 * np.log(parsigmaSq) + 0.5 / parsigmaSq * epsilon**2 -
#                            norm.logcdf(-epsilon * parlambda / np.sqrt(parsigmaSq))) + penalty

#         # 使用最小二乘法获取初始参数
#         ols = sm.OLS(y_series, x_df).fit()
#         init_params = np.append(ols.params.values, [0.5, sum(ols.resid**2) / (len(y_series) - len(ols.params))])

#         # 极大似然估计
#         result = minimize(lambda params: -logLikFun(params), init_params, method='Nelder-Mead')

#         # 提取估计的参数
#         beta = result.x[:-2]
#         sigma_sq = result.x[-2]
#         lambda_ = result.x[-1]

#         # 后处理：将绝对值小于阈值的系数设为 0
#         beta[np.abs(beta) < threshold] = 0

#         # 计算对数似然值
#         log_lik = -logLikFun(result.x)

#         # 计算 BIC
#         n = len(y_series)
#         k = np.sum(beta != 0) + 2
#         bic = -2 * log_lik + k * np.log(n)

#         return bic, beta, lambda_, sigma_sq

#     # 初始化
#     best_bic = np.inf
#     best_r = None
#     best_beta = None
#     best_lambda = None
#     best_sigma_sq = None
#     best_features = None

#     # 遍历 r_range，选择使 BIC 最小的 r
#     for r in r_range:
#         # 初始估计
#         bic, beta, lambda_, sigma_sq = calculate_bic(r, x_df, y_series)

#         # 迭代筛选和重新估计
#         for _ in range(max_iter):
#             # 筛选出非零特征
#             selected_features = np.abs(beta) >= threshold
#             if not np.any(selected_features):
#                 break

#             # 确保 selected_features 的长度与 x_df 的列数一致
#             if len(selected_features) != x_df.shape[1]:
#                 selected_features = np.append(selected_features, [False] * (x_df.shape[1] - len(selected_features)))

#             # 使用筛选后的特征重新估计
#             x_filtered = x_df.loc[:, selected_features]
#             bic_new, beta_new, lambda_new, sigma_sq_new = calculate_bic(r, x_filtered, y_series)

#             # 如果 BIC 没有改善，停止迭代
#             if bic_new >= bic:
#                 break

#             # 更新结果
#             bic, beta, lambda_, sigma_sq = bic_new, beta_new, lambda_new, sigma_sq_new

#         # 记录最优结果
#         if bic < best_bic:
#             best_bic = bic
#             best_r = r
#             best_beta = beta
#             best_lambda = lambda_
#             best_sigma_sq = sigma_sq
#             best_features = selected_features

#     # 确保 best_features 的长度与 best_beta 的长度一致
#     if len(best_features) != len(best_beta):
#         best_features = np.abs(best_beta) >= threshold

#     # 计算 MSE
#     epsilon = y_series - np.dot(x_df.loc[:, best_features], best_beta[best_features])
#     mse = np.mean(epsilon**2)

#     return best_beta, best_lambda, best_sigma_sq, mse, best_bic, best_r


In [16]:
# 非私有和私有方法模块
def train_model_without_private(x, y, num_features, num_iters=3000, constraint=100, minibatch_size=50, threshold=1e-3):
    """
    Train the non-private model with a stochastic Frank-Wolfe-style update.

    Each iteration evaluates the loss on a random minibatch, picks the
    coordinate vertex (+/- ``constraint`` on one parameter) with the smallest
    score from ``compute_alpha`` and moves the flattened parameter vector
    towards it with step ``2 / (i + 2)``. At the end the parameters that
    produced the smallest minibatch loss are restored and small coefficients
    are zeroed.

    Fix: snapshots used to be ``model.state_dict().copy()``, a shallow copy
    whose tensors alias the live parameters. Because the update is in place,
    every stored snapshot ended up equal to the final parameters. The best
    iterate is now cloned, and it is taken before the update so that it matches
    the loss it is paired with.

    :param x: feature matrix (NumPy array); standardised via ``standardize_data``
    :param y: target vector
    :param num_features: length of the model's coefficient vector
    :param num_iters: number of iterations
    :param constraint: magnitude of the candidate vertices
    :param minibatch_size: samples drawn (with replacement) per iteration
    :param threshold: coefficients smaller than this in absolute value are set to 0
    :return: model, mean, std, last minibatch loss, ``calculate_mse(model, x, y)``
    """
    x, mean, std = standardize_data(x)
    x = torch.tensor(x, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.float32)
    model = StochasticFrontierModel(num_features=num_features)
    num_params = sum(p.numel() for p in model.parameters())

    losses = []
    best_loss = float("inf")
    best_state = None

    for i in tqdm(range(1, num_iters + 1), desc="Training without privacy"):
        batch = np.random.choice(x.shape[0], minibatch_size, replace=True)
        loss = model(x[batch], y[batch])
        loss.backward()

        loss_value = loss.item()
        losses.append(loss_value)
        if loss_value < best_loss:
            # Clone: state_dict() tensors share storage with the parameters.
            best_loss = loss_value
            best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}

        gradient = torch.cat([p.grad.flatten() for p in model.parameters()]).detach().numpy()

        # Candidate vertices of both signs; keep the one with the lowest score.
        candidates = compute_alpha(constraint, gradient, model) + compute_alpha(-constraint, gradient, model)
        _, size, corner_num = min(candidates, key=lambda item: item[0])
        corner = np.zeros(num_params)
        corner[corner_num] = size

        mu = 2 / (i + 2)
        with torch.no_grad():
            offset = 0
            for p in model.parameters():
                numel = p.numel()
                p_flat = p.view(-1)
                target = torch.tensor(corner[offset:offset + numel], dtype=torch.float32)
                p_flat.copy_((1 - mu) * p_flat + mu * target)
                offset += numel
        model.zero_grad()

    if best_state is not None:
        model.load_state_dict(best_state)

    # Post-processing: zero out coefficients below the threshold.
    with torch.no_grad():
        model.beta[torch.abs(model.beta) < threshold] = 0

    final_loss = losses[-1] if losses else float("nan")
    # NOTE(review): calculate_mse is not defined in the visible part of this
    # file - confirm it is provided elsewhere.
    return model, mean, std, final_loss, calculate_mse(model, x, y)

def train_model_private(x, y, num_features, num_iters=3000, constraint=100, minibatch_size=50, lipschitz=1, epsilon=0.1, delta=1e-5, threshold=1e-3):
    """
    Train the private model with a noisy stochastic Frank-Wolfe-style update.

    Identical to the non-private loop except that the vertex scores come from
    ``compute_alpha_private``, which adds Laplace noise of scale
    ``lipschitz * constraint * sqrt(8 * num_iters * log(1 / delta)) / (n * epsilon)``
    before the smallest score is selected.

    Fix: snapshots used to be ``model.state_dict().copy()``, a shallow copy
    whose tensors alias the live parameters. Because the update is in place,
    every stored snapshot ended up equal to the final parameters. The best
    iterate is now cloned, and it is taken before the update so that it matches
    the loss it is paired with.

    :param x: feature matrix (NumPy array); standardised via ``standardize_data``
    :param y: target vector
    :param num_features: length of the model's coefficient vector
    :param num_iters: number of iterations
    :param constraint: magnitude of the candidate vertices
    :param minibatch_size: samples drawn (with replacement) per iteration
    :param lipschitz: constant used in the noise scale
    :param epsilon: privacy parameter used in the noise scale
    :param delta: privacy parameter used in the noise scale
    :param threshold: coefficients smaller than this in absolute value are set to 0
    :return: model, mean, std, last minibatch loss, ``calculate_mse(model, x, y)``
    """
    x, mean, std = standardize_data(x)
    x = torch.tensor(x, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.float32)
    model = StochasticFrontierModel(num_features=num_features)
    n = x.shape[0]
    m = sum(p.numel() for p in model.parameters())
    noise_para = lipschitz * constraint * math.sqrt(8 * num_iters * math.log(1 / delta)) / (n * epsilon)

    losses = []
    best_loss = float("inf")
    best_state = None

    for i in tqdm(range(1, num_iters + 1), desc="Training with privacy"):
        batch = np.random.choice(x.shape[0], minibatch_size, replace=True)
        loss = model(x[batch], y[batch])
        loss.backward()

        loss_value = loss.item()
        losses.append(loss_value)
        if loss_value < best_loss:
            # Clone: state_dict() tensors share storage with the parameters.
            best_loss = loss_value
            best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}

        gradient = torch.cat([p.grad.flatten() for p in model.parameters()]).detach().numpy()

        # Noisy scores for vertices of both signs; keep the lowest one.
        candidates = (compute_alpha_private(constraint, gradient, model, noise_para)
                      + compute_alpha_private(-constraint, gradient, model, noise_para))
        _, size, corner_num = min(candidates, key=lambda item: item[0])
        corner = np.zeros(m)
        corner[corner_num] = size

        mu = 2 / (i + 2)
        with torch.no_grad():
            offset = 0
            for p in model.parameters():
                numel = p.numel()
                p_flat = p.view(-1)
                target = torch.tensor(corner[offset:offset + numel], dtype=torch.float32)
                p_flat.copy_((1 - mu) * p_flat + mu * target)
                offset += numel
        model.zero_grad()

    if best_state is not None:
        model.load_state_dict(best_state)

    # Post-processing: zero out coefficients below the threshold.
    with torch.no_grad():
        model.beta[torch.abs(model.beta) < threshold] = 0

    final_loss = losses[-1] if losses else float("nan")
    # NOTE(review): calculate_mse is not defined in the visible part of this
    # file - confirm it is provided elsewhere.
    return model, mean, std, final_loss, calculate_mse(model, x, y)
def compute_alpha(corner_size, gradient, model):
    """
    Score one candidate vertex per model parameter.

    :param corner_size: signed magnitude shared by all candidate vertices
    :param gradient: flattened gradient, one entry per model parameter
    :param model: object whose ``parameters()`` yield items with ``numel()``
    :return: list of ``(gradient[j] * corner_size, corner_size, j)`` tuples
    """
    num_params = sum(p.numel() for p in model.parameters())
    scores = gradient * corner_size
    # Same size repeated for every coordinate, as plain Python floats.
    sizes = np.full(num_params, corner_size, dtype=float).tolist()
    return list(zip(scores, sizes, range(num_params)))

def compute_alpha_private(corner_size, gradient, model, noise_para):
    """
    Score one candidate vertex per model parameter, with Laplace noise added.

    :param corner_size: signed magnitude shared by all candidate vertices
    :param gradient: flattened gradient, one entry per model parameter
    :param model: object whose ``parameters()`` yield items with ``numel()``
    :param noise_para: scale of the Laplace noise added to every score
    :return: list of ``(noisy score, corner_size, j)`` tuples
    """
    num_params = sum(p.numel() for p in model.parameters())
    laplace_noise = np.random.laplace(scale=noise_para, size=num_params)
    noisy_scores = gradient * corner_size + laplace_noise
    # Same size repeated for every coordinate, as plain Python floats.
    sizes = np.full(num_params, corner_size, dtype=float).tolist()
    return list(zip(noisy_scores, sizes, range(num_params)))


In [4]:
# 实验模块
def run_experiments(N_values, epsilon_values, num_repeats=2):
    """
    Run the MLE, non-private and private estimators and save one row per run.

    Reads ``num_features``, ``sigma_u`` and ``sigma_v`` from module-level
    globals. For every sample size a data set is generated and written to
    ``data_{N}.csv``; all result rows are written to ``result.csv`` at the end.

    :param N_values: sample sizes to simulate
    :param epsilon_values: privacy budgets passed to the private trainer
    :param num_repeats: accepted but not used by this function
    """
    def build_row(n, eps, method, beta, lam, sigma_sq, mse, loss, truth, hyperparams):
        # One result record; the key order fixes the CSV column order.
        sse = calculate_sse(beta, truth)
        fp = calculate_fp(beta, truth)
        sr = calculate_sr(beta, truth)
        return {
            'N': n,
            'epsilon': eps,
            'method': method,
            'beta': beta,
            'lambda': lam,
            'sigma_sq': sigma_sq,
            'mse': mse,
            'loss': loss,
            'sse': sse,
            'fp': fp,
            'sr': sr,
            'hyperparams': hyperparams,
        }

    rows = []
    for N in N_values:
        # Simulate the data; the true parameters go to a per-N CSV file.
        x, y, true_beta = generate_data(
            num_features, sigma_u, sigma_v, N, save_path=f'true_parameters_N{N}.csv'
        )

        # 80/20 split; only the training part is used below.
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

        column_names = [f'x{i}' for i in range(num_features)] + ['y']
        full_sample = np.hstack([x, y.reshape(-1, 1)])
        pd.DataFrame(full_sample, columns=column_names).to_csv(f'data_{N}.csv', index=False)

        # Penalised MLE baseline.
        beta_mle, lambda_mle, sigma_sq_mle, mse_mle, bic_mle, best_r = stochastic_frontier_mle_lasso_bic(x_train, y_train)
        rows.append(build_row(N, None, 'mle', beta_mle, lambda_mle, sigma_sq_mle,
                              mse_mle, None, true_beta, {'best_r': best_r}))

        # Non-private and private trainers, once per privacy budget.
        for epsilon in epsilon_values:
            for method in ('without_private', 'private'):
                if method != 'private':
                    model, _, _, loss, mse = train_model_without_private(x_train, y_train, num_features)
                else:
                    model, _, _, loss, mse = train_model_private(x_train, y_train, num_features, epsilon=epsilon)

                beta_hat = model.beta.detach().numpy()
                lambda_hat = torch.exp(model.log_lambda0).item()
                sigma_sq_hat = torch.exp(model.log_sigma2).item()

                rows.append(build_row(N, epsilon, method, beta_hat, lambda_hat, sigma_sq_hat,
                                      mse, loss, true_beta, {}))

    # Save all results to CSV.
    pd.DataFrame(rows).to_csv('result.csv', index=False)


   

In [8]:
def stochastic_frontier_mle_lasso_bic(x, y, r_range=np.logspace(-2, 4, 3), threshold=1e-3, max_iter=2):
    """
    Variable selection and estimation for a stochastic frontier model using an
    adaptive-Lasso-penalised maximum likelihood fit, with the penalty level
    chosen by BIC.

    For every ``r`` in ``r_range`` the model is fitted on all columns.
    Coefficients whose absolute value is below ``threshold`` are dropped and the
    model is refitted on the surviving columns, up to ``max_iter`` times, for as
    long as BIC keeps decreasing. The ``r`` with the smallest finite BIC wins.

    Corrections compared with the earlier implementation:
    * the penalty is subtracted from (not added to) the log-likelihood, so
      large coefficients are penalised instead of rewarded;
    * BIC is ``-2 * loglik + k * log(n)`` using the unpenalised log-likelihood;
    * coefficients that the pilot Lasso set to zero get the largest weight
      (``1 / threshold``) instead of a weight of zero;
    * the starting values for sigma^2 and lambda are placed in the slots the
      likelihood reads them from;
    * surviving columns are tracked by their original index, so the returned
      ``beta`` always has one entry per column of ``x`` (zero for dropped
      columns) and the MSE is computed with matching shapes.

    :param x: feature matrix, one row per observation
    :param y: target vector
    :param r_range: candidate Lasso penalty levels
    :param threshold: coefficients smaller than this in absolute value are set to 0
    :param max_iter: maximum number of screening / refit rounds per penalty level
    :return: beta (full length), lambda, sigma^2, MSE, BIC, best r
    :raises RuntimeError: if no penalty level produces a finite BIC
    """
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float).ravel()
    n_obs, n_features = x_arr.shape
    # Floor for |beta_init| when building weights; avoids division by zero.
    weight_floor = max(threshold, 1e-12)

    def log_likelihood(params, x_sub):
        # Log-likelihood without additive constants (they cancel when BICs
        # are compared). Layout of params: [beta..., sigma^2, lambda].
        coef, sigma_sq_, lambda_v = params[:-2], params[-2], params[-1]
        resid = y_arr - x_sub @ coef
        return np.sum(
            norm.logcdf(-resid * lambda_v / np.sqrt(sigma_sq_))
            - 0.5 * np.log(sigma_sq_)
            - 0.5 * resid ** 2 / sigma_sq_
        )

    def fit_columns(r, cols):
        # Fit the penalised model on the columns `cols` of x.
        # Returns (bic, beta for those columns, lambda, sigma^2).
        x_sub = x_arr[:, cols]

        # Pilot Lasso estimate used only to build the adaptive weights.
        beta_init = Lasso(alpha=r).fit(x_sub, y_arr).coef_
        weights = 1.0 / np.maximum(np.abs(beta_init), weight_floor)

        def objective(params):
            # Nelder-Mead is unconstrained: reject sigma^2 <= 0 and lambda < 0.
            if params[-2] <= 0 or params[-1] < 0:
                return np.inf
            penalty = r * np.sum(weights * np.abs(params[:-2]))
            return penalty - log_likelihood(params, x_sub)

        # Least-squares start: coefficients, residual variance, lambda = 0.5.
        ols_beta = np.linalg.lstsq(x_sub, y_arr, rcond=None)[0]
        ols_resid = y_arr - x_sub @ ols_beta
        dof = max(n_obs - len(cols), 1)
        sigma_sq_init = max(float(ols_resid @ ols_resid) / dof, 1e-8)
        init_params = np.concatenate([ols_beta, [sigma_sq_init, 0.5]])

        result = minimize(objective, init_params, method='Nelder-Mead')

        params = np.array(result.x, dtype=float)
        beta_sub = params[:-2]  # view: thresholding below also updates params
        beta_sub[np.abs(beta_sub) < threshold] = 0

        k = np.count_nonzero(beta_sub) + 2  # active coefficients + sigma^2 + lambda
        bic = -2 * log_likelihood(params, x_sub) + k * np.log(n_obs)
        return bic, beta_sub, params[-1], params[-2]

    best = None  # (bic, full-length beta, lambda, sigma^2, r)

    for r in r_range:
        cols = np.arange(n_features)
        bic, beta_sub, lambda_, sigma_sq = fit_columns(r, cols)

        for _ in range(max_iter):
            keep = np.abs(beta_sub) >= threshold
            # Nothing left, or nothing to drop (a refit would be identical).
            if not keep.any() or keep.all():
                break

            new_cols = cols[keep]
            bic_new, beta_new, lambda_new, sigma_sq_new = fit_columns(r, new_cols)

            improved = np.isfinite(bic_new) and (not np.isfinite(bic) or bic_new < bic)
            if not improved:
                break

            cols = new_cols
            bic, beta_sub, lambda_, sigma_sq = bic_new, beta_new, lambda_new, sigma_sq_new

        if np.isfinite(bic) and (best is None or bic < best[0]):
            # Scatter the reduced coefficients back to their original positions.
            beta_full = np.zeros(n_features)
            beta_full[cols] = beta_sub
            best = (bic, beta_full, lambda_, sigma_sq, r)

    if best is None:
        raise RuntimeError("no penalty level in r_range produced a finite BIC")

    best_bic, best_beta, best_lambda, best_sigma_sq, best_r = best

    # In-sample mean squared residual of the selected model.
    mse = np.mean((y_arr - x_arr @ best_beta) ** 2)

    return best_beta, best_lambda, best_sigma_sq, mse, best_bic, best_r

In [12]:
# def calculate_sse( beta_hat, beta_true):
#     return np.sum((beta_hat - beta_true) ** 2)

# def calculate_fp( beta_hat, beta_true):
#     return np.sum((beta_true == 0) & (beta_hat != 0))

# def calculate_sr( beta_hat, beta_true, threshold=1e-3):
#     return np.linalg.norm(beta_hat - beta_true) <= threshold
def calculate_sse(beta_hat, beta_true):
    """
    Sum of squared errors between estimated and true coefficients.

    :param beta_hat: estimated beta; zero-padded at the end if shorter than ``beta_true``
    :param beta_true: true beta
    :return: SSE
    """
    missing = len(beta_true) - len(beta_hat)
    if missing > 0:
        # Right-pad with zeros so both vectors have the same length.
        beta_hat = np.pad(beta_hat, (0, missing), mode='constant')
    deviation = beta_hat - beta_true
    return np.sum(deviation ** 2)

def calculate_fp(beta_hat, beta_true):
    """
    Count false positives: positions where the true coefficient is 0 but the
    estimate is not.

    :param beta_hat: estimated beta; zero-padded at the end if shorter than ``beta_true``
    :param beta_true: true beta
    :return: FP
    """
    missing = len(beta_true) - len(beta_hat)
    if missing > 0:
        # Right-pad with zeros so both vectors have the same length.
        beta_hat = np.pad(beta_hat, (0, missing), mode='constant')
    truly_zero = beta_true == 0
    estimated_nonzero = beta_hat != 0
    return np.logical_and(truly_zero, estimated_nonzero).sum()

def calculate_sr(beta_hat, beta_true, threshold=1e-3):
    """
    SR indicator: whether the Euclidean distance between the estimate and the
    truth is at most ``threshold``.

    :param beta_hat: estimated beta; zero-padded at the end if shorter than ``beta_true``
    :param beta_true: true beta
    :param threshold: largest distance that still counts as a success
    :return: boolean SR flag
    """
    missing = len(beta_true) - len(beta_hat)
    if missing > 0:
        # Right-pad with zeros so both vectors have the same length.
        beta_hat = np.pad(beta_hat, (0, missing), mode='constant')
    distance = np.linalg.norm(beta_hat - beta_true)
    return distance <= threshold
def standardize_data(x):
    """
    Standardise every column of ``x`` except the first one.

    Columns 1.. are centred and divided by their (population) standard
    deviation; column 0 is passed through unchanged. A column with zero
    standard deviation is divided by 1 instead, so a constant column becomes
    all zeros rather than NaN.

    NOTE(review): column 0 is treated as if it were an intercept; callers in
    this file pass a matrix whose first column is an ordinary feature - confirm
    that leaving it unscaled is intended.

    :param x: 2-D array, one row per sample
    :return: (standardised x, mean, std) where mean and std have one entry per
        column of ``x`` with 0 and 1 inserted for column 0; std holds the
        divisors actually used
    """
    x = np.asarray(x, dtype=float)
    tail = x[:, 1:]
    mean = np.mean(tail, axis=0)
    std = np.std(tail, axis=0)
    # Avoid 0/0 for constant columns.
    safe_std = np.where(std > 0, std, 1.0)
    x_standardized = np.column_stack([x[:, 0], (tail - mean) / safe_std])
    return x_standardized, np.insert(mean, 0, 0), np.insert(safe_std, 0, 1)


In [None]:
# 主程序
if __name__ == "__main__":
    # Simulation settings. run_experiments() reads num_features, sigma_u and
    # sigma_v as module-level globals, so these names must not be changed.
    # num_features: number of features; sigma_u / sigma_v: scales of the
    # non-negative and the symmetric noise terms.
    num_features, sigma_u, sigma_v = 100, 0.3, 0.5

    # Sample sizes and privacy budgets to sweep over.
    N_values, epsilon_values = [500], [1]

    # Run the experiments.
    run_experiments(N_values, epsilon_values)
