In [5]:
import os
import random

import numpy as np
import pandas as pd

import utilities
from a3mV2 import OptVariance
from synthetic_generate import generate_synthetic_data
np.set_printoptions(precision=3)

def generate_perturbed_pool(M, a_grid, N):
    """Pre-sample a pool of perturbed outputs, one pool column per column of M.

    Args:
        M: 2-D array of non-negative weights. Column ``j`` is normalised to
            sum to one and used as the sampling distribution over ``a_grid``.
        a_grid: 1-D collection of output values; needs one entry per row of M.
        N: number of draws per column.

    Returns:
        Float array of shape ``(N, M.shape[1])``; column ``j`` holds ``N``
        independent draws from ``a_grid`` weighted by column ``j`` of ``M``.
    """
    _, n_columns = M.shape
    pool = np.empty((N, n_columns))
    # Columns are sampled in order so the global NumPy random stream is
    # consumed in a fixed, reproducible sequence.
    for col in range(n_columns):
        weights = M[:, col]
        probabilities = weights / np.sum(weights)
        pool[:, col] = np.random.choice(a=a_grid, p=probabilities, size=N)
    return pool

def _count_on_grid(samples, grid):
    """Count occurrences of every value of the sorted array ``grid`` in ``samples``.

    Grid values that never occur get an explicit zero, so the result always
    has ``grid.size`` entries.
    """
    values, counts = np.unique(samples, return_counts=True)
    slots = np.searchsorted(grid, values)
    if np.any(slots >= grid.size) or not np.array_equal(grid[slots], values):
        raise ValueError("perturbed samples contain values outside the output grid")
    hist = np.zeros(grid.size, dtype=counts.dtype)
    hist[slots] = counts
    return hist


def estimate_distribution(eps,est_type,M,histo_true,pool,repeat,range,bins,a_grid=None):
    """Estimate the bin distribution from perturbed samples, averaged over repeats.

    For each repetition, ``histo_true[i]`` values are drawn (with replacement)
    from column ``i`` of ``pool``. The draws are histogrammed over the output
    grid and turned into an estimate, and the estimates are averaged.

    Args:
        eps: privacy parameter, only forwarded to ``utilities.EM``.
        est_type: ``"aaa"`` uses the normalised histogram of the perturbed
            samples directly; any other value passes the histogram through
            ``utilities.EM(M, hist, eps)``.
        M: perturbation matrix, only forwarded to ``utilities.EM``.
        histo_true: per-bin sample counts of the true data.
        pool: 2-D array of pre-sampled perturbed outputs, one column per bin.
        repeat: number of repetitions to average over.
        range: unused; kept for interface compatibility. It shadows the
            builtin ``range``, which therefore must not be called in here.
        bins: number of bins; the result has shape ``(1, bins)``.
        a_grid: optional full set of output values. Defaults to the distinct
            values present in ``pool``. Pass it when some outputs may be so
            unlikely that they never made it into the pool.

    Returns:
        Array of shape ``(1, bins)`` holding the averaged estimate.

    Raises:
        ValueError: if a sampled value is not part of the output grid.
    """
    # Histogram over a FIXED, ascending grid. Taking np.unique of each
    # repetition's samples drops values that happened not to be drawn, which
    # shortens the histogram and misaligns it with the bins / rows of M.
    grid = np.unique(pool if a_grid is None else a_grid)

    q_est = np.zeros((1,bins))
    for _ in np.arange(repeat):
        # Collect the per-bin draws and concatenate once; growing an array
        # inside the loop copies everything accumulated so far each time.
        pieces = [np.random.choice(a=pool[:,i], size=count)
                  for i, count in enumerate(histo_true)]
        data_perturbed = np.concatenate(pieces) if pieces else np.empty(0)
        hist_perturbed = _count_on_grid(data_perturbed, grid)
        if est_type!="aaa":
            temp = utilities.EM(M,hist_perturbed,eps)
        else:
            temp = hist_perturbed/data_perturbed.size
        q_est += temp
    return q_est/repeat

def DP_dist_estimation(data, bins, range, est_type, eps, est_repeat, test_repeat):
    """Compare a baseline estimator against the AAA mechanism for one epsilon.

    Steps: histogram the data, build the baseline mechanism (``'sw'`` or
    ``'grr'``), estimate the distribution with it, feed that estimate to
    ``OptVariance`` to obtain the AAA mechanism, then evaluate both mechanisms
    by variance (``utilities.M2Var``) and Wasserstein distance
    (``utilities.WassDist``) against the true histogram.

    Args:
        data: 1-D array of samples.
        bins: number of histogram bins.
        range: ``(low, high)`` histogram range. Shadows the builtin ``range``,
            which therefore must not be called inside this function.
        est_type: baseline estimator, ``'sw'`` or ``'grr'``.
        eps: privacy parameter.
        est_repeat: repetitions used for the first-stage estimate fed to
            ``OptVariance``.
        test_repeat: repetitions used for the final evaluation.

    Returns:
        Tuple ``(var_aaa, var_est, wass_aaa, wass_est)``.

    Raises:
        NotImplementedError: if ``est_type`` is neither ``'sw'`` nor ``'grr'``.
    """
    print('eps=',eps)
    histo_true,_ = np.histogram(a=data, range=range, bins=bins)
    # Normalise by the size of the data actually passed in rather than by a
    # module-level sample count that may be out of sync with `data`.
    q_true = histo_true/np.size(data)
    print('true histo-q is:',q_true)

    # Bin centres derived from the arguments instead of a module-level grid,
    # so they always agree with the `bins` and `range` used for the histogram.
    low, high = range
    bin_width = (high - low)/bins
    bin_centres = np.linspace(low + bin_width/2, high - bin_width/2, bins)

    if est_type == 'sw':
        a_grid, M_est = utilities.SquareWave(eps,bin_centres)
    elif est_type == 'grr':
        a_grid, M_est = utilities.GenRandResp(eps,bin_centres)
    else:
        raise NotImplementedError()
    perturbed_pool_est = generate_perturbed_pool(M=M_est, a_grid=a_grid, N=10000)
    print("perturbation pools generated for %s estimator." %(est_type))

    # First-stage estimate with the baseline mechanism; it seeds the AAA
    # optimisation below.
    q_est = estimate_distribution(eps=eps,est_type=est_type,
                                M=M_est,
                                histo_true=histo_true,
                                pool=perturbed_pool_est,
                                repeat=est_repeat,
                                range = range,
                                bins = bins)
    print('estimate histo-q is:', q_est)

    _, M_aaa = OptVariance(eps, bin_centres, q_est)
    print('AAA solution found.')

    perturbed_pool_aaa = generate_perturbed_pool(M=M_aaa, a_grid=bin_centres, N=10000)
    print("perturbation pools generated for aaa estimator.")

    var_aaa = utilities.M2Var(M_aaa,bin_centres,bin_centres,q_true)
    var_est = utilities.M2Var(M_est,a_grid,bin_centres,q_true)

    q_aaa = estimate_distribution(eps=eps,est_type="aaa",
                                M=M_aaa,
                                histo_true=histo_true,
                                pool=perturbed_pool_aaa,
                                repeat=test_repeat,
                                range = range,
                                bins = bins)
    # NOTE(review): estimate_distribution already averages over `test_repeat`
    # repetitions, so this extra division rescales the distance of the averaged
    # estimate. Confirm against utilities.WassDist that this is intended.
    wass_aaa = utilities.WassDist(q_aaa,q_true)/test_repeat

    q_est = estimate_distribution(eps=eps,est_type=est_type,
                                M=M_est,
                                histo_true=histo_true,
                                pool=perturbed_pool_est,
                                repeat=test_repeat,
                                range = range,
                                bins = bins)
    wass_est = utilities.WassDist(q_est,q_true)/test_repeat
    print('estimation complete, repeat %d times.\n' %(test_repeat))
    return var_aaa, var_est, wass_aaa, wass_est

# Driver: sweep epsilon, compare the baseline estimator with AAA on synthetic
# data, and store one CSV row per epsilon.

# set key parameters
d = 10 # num of bins in [low,high]
beta = 1
data_type = "GAUSSIAN"
est_type = "sw"
N=d*10000
est_repeat = 5
test_repeat = 20

# set index of bins (centres of the d equal-width bins covering [0, beta])
bin_width = beta/d
bin_idxs = np.linspace(bin_width/2, beta-bin_width/2,d)

# generate data
data = generate_synthetic_data(data_type=data_type,
                                low=-3,
                                high=3,
                                n=N,
                                beta=beta)
print("synthetic data generated, N=%d." %(data.size))

# set epsilon
eps_grid = 0.5*np.arange(1,6)

# one row per epsilon: var_aaa, var_est, wass_aaa, wass_est
result = np.zeros((eps_grid.size,4))
for i, eps in enumerate(eps_grid):
    # Pass the configured est_type (not a hard-coded literal) so the run
    # always matches the estimator named in the output filename below.
    result[i,:] = DP_dist_estimation(data,
                                    bins=d,
                                    range=(0,beta),
                                    est_type=est_type,
                                    eps=eps,
                                    est_repeat=est_repeat,
                                    test_repeat=test_repeat)

filename = 'data/result_%s_%s_%d_(%d,%d).csv' %(data_type,est_type,d,est_repeat,test_repeat)
temp = {'eps_grid': eps_grid,
        'var_aaa':result[:,0],
        'var_est':result[:,1],
        'wass_aaa':result[:,2],
        'wass_est':result[:,3]}
# Create the output directory up front so the finished sweep is not lost to a
# missing-folder error at save time.
os.makedirs(os.path.dirname(filename), exist_ok=True)
pd.DataFrame(temp).to_csv(filename)
print('task complete!')

synthetic data generated, N=100000.
eps= 0.5
true histo-q is: [0.009 0.028 0.08  0.161 0.226 0.225 0.158 0.077 0.028 0.009]
[0.08  0.08  0.08  0.08  0.08  0.08  0.08  0.049 0.049 0.049 0.049 0.049
 0.049 0.049 0.049 0.049]
[0.049 0.08  0.08  0.08  0.08  0.08  0.08  0.08  0.049 0.049 0.049 0.049
 0.049 0.049 0.049 0.049]
[0.049 0.049 0.08  0.08  0.08  0.08  0.08  0.08  0.08  0.049 0.049 0.049
 0.049 0.049 0.049 0.049]
[0.049 0.049 0.049 0.08  0.08  0.08  0.08  0.08  0.08  0.08  0.049 0.049
 0.049 0.049 0.049 0.049]
[0.049 0.049 0.049 0.049 0.08  0.08  0.08  0.08  0.08  0.08  0.08  0.049
 0.049 0.049 0.049 0.049]
[0.049 0.049 0.049 0.049 0.049 0.08  0.08  0.08  0.08  0.08  0.08  0.08
 0.049 0.049 0.049 0.049]
[0.049 0.049 0.049 0.049 0.049 0.049 0.08  0.08  0.08  0.08  0.08  0.08
 0.08  0.049 0.049 0.049]
[0.049 0.049 0.049 0.049 0.049 0.049 0.049 0.08  0.08  0.08  0.08  0.08
 0.08  0.08  0.049 0.049]
[0.049 0.049 0.049 0.049 0.049 0.049 0.049 0.049 0.08  0.08  0.08  0.08
 0.08  0.08  0.

ValueError: probabilities do not sum to 1