In [3]:
import numpy as np
import pandas as pd
import random
import estimator as est

# Show three decimals whenever a numpy array is printed.
np.set_printoptions(precision=3)

# --- experiment configuration -------------------------------------------
N = 100000          # number of data samples drawn from the input distribution
d = 10              # size of the index range sampled below; also part of the file names
dist_type = "uni"   # distribution tag used to build the data file names
est_type = "sw"     # estimator tag used to build the data file names
total_repeat = 1    # number of repetitions averaged for every eps value

# --- precomputed inputs ---------------------------------------------------
# Grid of eps values to evaluate (column 'eps_grid' of the CSV).
filename = 'data/eps_grid.csv'
eps_grid = np.asarray(pd.read_csv(filename)['eps_grid'])

# Input grid and the weight attached to each grid point
# (columns 'x_grid' and 'x_q' of the same CSV).
filename = 'data/%s_q_%d.csv' %(dist_type, d)
q_table = pd.read_csv(filename)
x_grid = np.asarray(q_table['x_grid'])
x_q = np.asarray(q_table['x_q'])

# --- generated data samples ----------------------------------------------
# Draw N grid indices with weights x_q, then map them to the grid values.
# The indices are kept because they are reused later to select columns.
grid_indices = np.arange(d)
idx_original = random.choices(grid_indices, weights=x_q, k=N)
sample_original = x_grid[idx_original]
print('%d data samples generated with %s distribution.\n' %(N, dist_type))

def _grid_frequencies(samples, grid):
    """Count how often every value of `grid` occurs in `samples`.

    The result always has ``len(grid)`` entries, in the order of `grid`.
    A grid value that was never drawn contributes 0 instead of being dropped
    (which is what ``np.unique(..., return_counts=True)`` would do), so the
    counts stay aligned with ``x_q`` and with the rows of the matrices whose
    columns were used as sampling weights over that grid.

    Assumes the values in `grid` are distinct.
    """
    return np.array([np.count_nonzero(samples == value) for value in grid])


# Per-eps results, indexed like eps_grid.
var_aaa = np.zeros(len(eps_grid))
var_est = np.zeros(len(eps_grid))

distance_aaa = np.zeros(len(eps_grid))
distance_est = np.zeros(len(eps_grid))

for i, eps in enumerate(eps_grid):
    print('eps=%.2f'%(eps))

    # Matrix for the "aaa" scheme; the first CSV column is dropped
    # (presumably a saved index column -- TODO confirm).
    filename = 'data/%s_%s_M_%.2f_%d.csv'%(est_type,dist_type,eps,d)
    temp = pd.read_csv(filename)
    M_aaa = np.asarray(temp)[:,1:]

    # Matrix for the `est_type` estimator, same layout.
    filename = 'data/%s_M_%.2f_%d.csv'%(est_type,eps,d)
    temp = pd.read_csv(filename)
    M_est = np.asarray(temp)[:,1:]

    # Output grid of the `est_type` estimator.
    filename = 'data/%s_a_%.2f_%d.csv'%(est_type, eps,d)
    temp = pd.read_csv(filename)
    a_grid = np.asarray(temp['a_grid'])

    # compute the variances
    var_aaa[i] = est.M2Var(M_aaa,x_grid,x_grid,x_q)
    var_est[i] = est.M2Var(M_est,a_grid,x_grid,x_q)
    print('compute variances for eps=%.2f.'%(eps))

    # generate random pools for each value:
    # column j holds N outputs drawn with column j of the matrix as weights,
    # i.e. N perturbed reports for the j-th input grid index.
    rand_num_aaa = np.zeros((N,d))
    rand_num_est = np.zeros((N,d))

    for j in range(d):
        rand_num_aaa[:,j] = random.choices(x_grid,M_aaa[:,j],k=N)
        rand_num_est[:,j] = random.choices(a_grid,M_est[:,j],k=N)
    print("random num pools generated.")

    dist_sum_aaa, dist_sum_est = 0.0, 0.0
    for _ in range(total_repeat):
        # For every original sample pick a random row of the pool; the
        # sample's own grid index selects the column. The same (row, column)
        # pairs are used for both schemes.
        idx_noise = random.choices(range(N),k=N)

        sample_perturbed = rand_num_aaa[idx_noise, idx_original]
        # Empirical frequencies over x_grid, aligned with x_q even when
        # some grid value was never drawn.
        x_q_aaa = _grid_frequencies(sample_perturbed, x_grid)/N
        dist_sum_aaa = dist_sum_aaa + est.WassDist(x_q_aaa,x_q)

        sample_perturbed = rand_num_est[idx_noise, idx_original]
        # Counts over a_grid, one entry per row of M_est.
        counts = _grid_frequencies(sample_perturbed, a_grid)
        x_q_est = est.EM(M_est,counts,eps)
        x_q_est = x_q_est.ravel()
        dist_sum_est = dist_sum_est + est.WassDist(x_q_est,x_q)

    distance_aaa[i] = dist_sum_aaa/total_repeat
    distance_est[i] = dist_sum_est/total_repeat
    print('compute average WassDist, total repeat=%d.\n' %(total_repeat))

# Collect the per-eps results; insertion order fixes the CSV column order.
results = {}
results['eps_grid'] = eps_grid
results['aaa_dist'] = distance_aaa
results['%s_dist'%(est_type)] = distance_est
results['aaa_var'] = var_aaa
results['%s_var'%(est_type)] = var_est

# NOTE(review): unlike the input files, this path has no '.csv' extension.
# It is left as is because other code may read this exact path.
filename = 'data/results_%s_%s_%d' %(est_type,dist_type,d)
summary = pd.DataFrame(results)
summary.to_csv(filename)
print('complete.')

100000 data samples generated with uni distribution.

eps=0.50
compute variances for eps=0.50.
random num pools generated.
compute average WassDist, total repeat=1.

eps=1.00
compute variances for eps=1.00.
random num pools generated.
compute average WassDist, total repeat=1.

eps=1.50
compute variances for eps=1.50.
random num pools generated.
compute average WassDist, total repeat=1.

eps=2.00
compute variances for eps=2.00.
random num pools generated.
compute average WassDist, total repeat=1.

eps=2.50
compute variances for eps=2.50.
random num pools generated.
compute average WassDist, total repeat=1.

complete.
