In [20]:
import sys
import os

# 设置运行路径
sys.path.append(os.path.abspath('../../'))

import math
import numpy as np
from algorithm.sum_dp_module import SumDP
from algorithm.laplace_module import LaplaceMechanism
from algorithm.error_evaluation.SumDP_eva import SumDPExperiment
from algorithm.error_evaluation.laplace_eva import LaplaceExperiment
import pandas as pd

In [16]:
import os

# Directory that holds the CSV datasets, resolved relative to the notebook.
DATASETS_DIR = os.path.abspath('../../datasets')

# Retained from the original cell for backward compatibility. Note that
# sys.path only affects module imports; it has no influence on where
# pd.read_csv looks for files, so the CSV paths are resolved explicitly below.
sys.path.append(DATASETS_DIR)


def _dataset_path(filename):
    """Return the path from which a dataset CSV should be read.

    Prefers ``DATASETS_DIR``. If the file is not found there, the bare
    filename is returned so that it is resolved against the current working
    directory, exactly as the original code did.
    """
    candidate = os.path.join(DATASETS_DIR, filename)
    return candidate if os.path.exists(candidate) else filename


def _int_column(frame, column):
    """Return the non-null values of ``column`` cast to int, as a plain list."""
    return frame[column].dropna().astype(int).tolist()


# Load the datasets
loan_df = pd.read_csv(_dataset_path('loan_data.csv'))
insurance_df = pd.read_csv(_dataset_path('insurance.csv'))
students_df = pd.read_csv(_dataset_path('StudentsPerformance.csv'))

# Show the first few rows of each dataset
print("Loan Dataset:")
print(loan_df.head())

print("\nInsurance Dataset:")
print(insurance_df.head())

print("\nStudents Performance Dataset:")
print(students_df.head())

# Extract the columns of interest as lists of ints (missing values dropped)
loan_data = _int_column(loan_df, 'LoanAmount')
insurance_data = _int_column(insurance_df, 'charges')
students_data = _int_column(students_df, 'math score')

# Print the first few values as a sanity check
print("LoanAmount sample:", loan_data[:5])
print("Charges sample:", insurance_data[:5])
print("Math score sample:", students_data[:5])


# Name -> list of ints; iterated by the experiment cells further down
datasets = {
    "loan": loan_data,
    "insurance": insurance_data,
    "students": students_data
}



Loan Dataset:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001003   Male     Yes          1      Graduate            No   
1  LP001005   Male     Yes          0      Graduate           Yes   
2  LP001006   Male     Yes          0  Not Graduate            No   
3  LP001008   Male      No          0      Graduate            No   
4  LP001013   Male     Yes          0  Not Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             4583             1508.0       128.0             360.0   
1             3000                0.0        66.0             360.0   
2             2583             2358.0       120.0             360.0   
3             6000                0.0       141.0             360.0   
4             2333             1516.0        95.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Rural           N  
1             1.0         Urban           Y  
2     

In [23]:
# Differential-privacy parameters
epsilon = 1.0      # privacy budget
beta = 1e-5        # probability parameter passed to SumDP (author's note: usually 1e-5)

# Per-dataset upper bounds U; every value is clipped into [0, U] in the loop below
U_loan = 512       # author's note: chosen to be >= max(LoanAmount)
# NOTE(review): the original comment quoted a maximum charge of "16884~21984";
# that cannot be confirmed from this notebook. Any charge above 2**15 is clipped
# below, which changes the reported "True Sum" — confirm this bound is intended.
U_charge = 2**15
U_math = 128       # math scores are expected in 0~100; rounded up to 128 (a power of two)

# Dataset name -> upper bound, keyed the same way as `datasets`
datasets_U = {
    "loan": U_loan,
    "insurance": U_charge,
    "students": U_math
}

# One record per dataset; turned into a DataFrame in the next cell
results = []

for name, data in datasets.items():
    U = datasets_U[name]
    # Clip into [0, U] so the data respects the bound handed to both mechanisms
    clipped_data = [min(max(int(x), 0), U) for x in data]
    true_sum = sum(clipped_data)

    # A single instance of each mechanism is enough; the original cell built a
    # second, never-used pair (`sumdp` / `lap`) and an unused `max_val`.
    sumdp_instance = SumDP(epsilon=epsilon, beta=beta, U=U)
    laplace_instance = LaplaceMechanism(epsilon=epsilon, sensitivity=U)

    sumdp_eval = SumDPExperiment(sumdp_instance, clipped_data)
    laplace_eval = LaplaceExperiment(laplace_instance, clipped_data)

    # Empirical error as returned by each experiment's run_experiment()
    sumdp_mean = sumdp_eval.run_experiment()
    laplace_mean = laplace_eval.run_experiment()

    results.append({
        'Dataset': name,
        'True Sum': true_sum,
        'SumDP Error': sumdp_mean,
        'Laplace Error': laplace_mean,
        'SumDP Bound': sumdp_eval.theoretical_error_bound(),
        'Laplace Bound': laplace_eval.theoretical_error_bound()
    })

In [24]:
# Tabulate the per-dataset records collected in `results`; the bare name on the
# last line makes the notebook render the table.
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Dataset,True Sum,SumDP Error,Laplace Error,SumDP Bound,Laplace Bound
0,loan,40000,325.976396,512.891763,5658.377,724.077344
1,insurance,16443554,39435.67966,31481.784635,1276134.0,46340.950012
2,students,66089,156.38308,130.700866,3714.234,181.019336


In [None]:


def _clip_to_bound(values, upper):
    """Clip every value into [0, upper], using the same rule as the main experiment loop."""
    return [min(max(int(v), 0), upper) for v in values]


# Initialise one SumDP instance per dataset, each with its own upper bound
loan_dp = SumDP(epsilon=epsilon, beta=beta, U=U_loan)
insurance_dp = SumDP(epsilon=epsilon, beta=beta, U=U_charge)
student_dp = SumDP(epsilon=epsilon, beta=beta, U=U_math)

# Initialise the experiment runners.
# The data is clipped to [0, U] first so that it is consistent with the bound
# given to SumDP and with the main experiment loop, which also clips before
# building its SumDPExperiment objects. The original cell passed the raw lists.
loan_exp = SumDPExperiment(loan_dp, _clip_to_bound(loan_data, U_loan))
insurance_exp = SumDPExperiment(insurance_dp, _clip_to_bound(insurance_data, U_charge))
student_exp = SumDPExperiment(student_dp, _clip_to_bound(students_data, U_math))

# Run the experiments and report empirical error next to the theoretical bound.
# Headers are reproduced exactly (only the first one has no leading newline).
for header, experiment in (
    ("Loan Dataset:", loan_exp),
    ("\nInsurance Dataset:", insurance_exp),
    ("\nStudents Dataset:", student_exp),
):
    print(header)
    print("  实验平均误差:", experiment.run_experiment(n_trials=500))
    print("  理论误差上界:", experiment.theoretical_error_bound())

import matplotlib.pyplot as plt

def plot_error_distribution(experimenter, title, n_trials=1000):
    """Plot a histogram of absolute errors over repeated runs of an experiment.

    Args:
        experimenter: object exposing ``algorithm`` (with a ``run`` method),
            ``x_list`` (the input handed to ``run``) and ``true_sum``.
            ``algorithm.run(x_list)`` must return a 3-tuple whose first item is
            the estimated sum.
        title: text placed in front of the fixed suffix in the figure title.
        n_trials: number of times the algorithm is run.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """

    def _single_trial_error():
        # Only the first of the three returned values (the estimate) is used.
        estimate, _second, _third = experimenter.algorithm.run(experimenter.x_list)
        return abs(estimate - experimenter.true_sum)

    abs_errors = [_single_trial_error() for _trial in range(n_trials)]
    mean_error = np.mean(abs_errors)

    plt.figure(figsize=(8, 5))
    plt.hist(abs_errors, bins=40, color='skyblue', edgecolor='black')
    # Dashed vertical line marks the mean absolute error
    plt.axvline(mean_error, color='red', linestyle='dashed', linewidth=2, label='Mean Error')
    plt.title(f'{title} - 实验误差分布')
    plt.xlabel('绝对误差')
    plt.ylabel('出现次数')
    plt.legend()
    plt.grid(True)
    plt.show()

# Draw one error-distribution figure per dataset
for experiment, figure_title in (
    (loan_exp, 'Loan Dataset'),
    (insurance_exp, 'Insurance Dataset'),
    (student_exp, 'Students Dataset'),
):
    plot_error_distribution(experiment, figure_title)

def plot_error_vs_epsilon(x_list, dataset_name, U=None, beta=0.01,
                          epsilons=(0.1, 0.5, 1, 2, 5), n_trials=100):
    """Plot the empirical SumDP error as a function of the privacy budget.

    Args:
        x_list: input values handed to ``SumDPExperiment``.
        dataset_name: text placed in front of the fixed suffix in the title.
        U: upper bound passed to ``SumDP``. When given, ``x_list`` is first
            clipped into ``[0, U]`` (same rule as the main experiment loop).
            When ``None`` (default, original behavior) the bound is
            ``max(x_list)`` and the data is used unclipped.
        beta: probability parameter passed to ``SumDP`` (default 0.01, the
            value the original code hard-coded).
        epsilons: privacy budgets to evaluate, in plotting order.
        n_trials: trials per epsilon passed to ``run_experiment``.

    Returns:
        None. The figure is shown via ``plt.show()``.

    Raises:
        ValueError: from ``max`` if ``x_list`` is empty and ``U`` is ``None``.
    """
    if U is None:
        # NOTE(review): this bound is derived from the data itself; pass an
        # explicit, data-independent U (e.g. U_loan) to stay consistent with
        # the bounds used elsewhere in the notebook.
        bound = max(x_list)
        samples = x_list
    else:
        bound = U
        samples = [min(max(int(x), 0), bound) for x in x_list]

    eps_values = list(epsilons)
    errors = []
    for eps in eps_values:
        dp = SumDP(epsilon=eps, beta=beta, U=bound)
        exp = SumDPExperiment(dp, samples)
        errors.append(exp.run_experiment(n_trials=n_trials))

    plt.figure()
    plt.plot(eps_values, errors, marker='o')
    plt.title(f"{dataset_name} - 实验 ε 敏感性分析")
    plt.xlabel("ε (隐私预算)")
    plt.ylabel("实验误差")
    plt.grid(True)
    plt.show()
# Run the epsilon-sensitivity sweep once per dataset
for series, series_label in (
    (loan_data, "Loan Dataset"),
    (insurance_data, "Insurance Dataset"),
    (students_data, "Students Dataset"),
):
    plot_error_vs_epsilon(series, series_label)