In [37]:
import numpy as np
import pandas as pd
import random
import utilities as est
from a3mV2 import OptVariance
from synthetic_generate import generate_synthetic_data
np.set_printoptions(precision=3)

# Configuration and input data for the synthetic-data experiment.
# Defines the grid, the per-sample bin indices and the epsilon grid that the
# per-epsilon loop further down relies on, so the cell does not depend on
# variables left over in the kernel from another cell.

# set key parameters
d = 10 # num of bins in [low,high]
beta = 1
data_type = "GAUSSIAN"
est_type = "sw"
N=d*10000
total_repeat = 10

# set index of bins: centres of d equal-width bins spanning [0, beta]
bin_width = beta/d
bin_idxs = np.linspace(bin_width/2, beta-bin_width/2,d)
# The loop below refers to the input grid as `x_grid`, which was never assigned
# in this cell. NOTE(review): assumes the bin centres are the intended grid --
# confirm against what est.SquareWave / OptVariance expect.
x_grid = bin_idxs


# generate data
synth_data = generate_synthetic_data(data_type=data_type,low=-3,high=3, n=N, beta=beta)
print("synthetic data generated, N=%d." %(synth_data.size))

# Keep the bin edges so every sample can be mapped to the bin it was counted in.
hist_true, bin_edges = np.histogram(synth_data, bins=d)
q_true = hist_true/N

# Bin index (0 .. d-1) of each sample. Digitizing against the d-1 interior edges
# reproduces np.histogram's binning, including its closed right-most bin.
# The loop below uses this as the column index into the random-number pool.
idx_original = np.digitize(np.ravel(synth_data), bin_edges[1:-1])

# set epsilon: 0.5, 1.0, ..., 2.5; saved so other cells can reload the same grid
eps_grid = 0.5*np.arange(1,6)
filename = 'data/eps_grid.csv'
temp = {'eps_grid': eps_grid}
pd.DataFrame(temp).to_csv(filename)

# For every privacy budget in `eps_grid`:
#   1. build the baseline mechanism (output grid `a_grid`, matrix `M`) and save it,
#   2. estimate the data distribution from perturbed samples via est.EM,
#   3. pass that estimate to OptVariance and save the resulting matrix.
# Requires `x_grid` (the input grid) and `idx_original` (one bin index in
# [0, d) per sample, N in total) to be defined before this loop runs.
for eps in eps_grid:
    print('eps=%.2f'%(eps))

    if est_type == 'sw':
        a_grid, M = est.SquareWave(eps,x_grid)
    elif est_type == 'grr':
        a_grid, M = est.GenRandResp(eps, x_grid)
    else:
        # Previously this only printed a warning and carried on with an
        # undefined (or stale) a_grid/M; fail loudly instead.
        raise ValueError('invalid estimator type: %r' % (est_type,))
    print('Found matrix M for %s estimator!' %(est_type))

    filename = 'data/%s_M_%.2f_%d.csv'%(est_type,eps,d)
    pd.DataFrame(M).to_csv(filename)
    filename = 'data/%s_a_%.2f_%d.csv'%(est_type,eps,d)
    temp = {'a_grid':a_grid}
    pd.DataFrame(temp).to_csv(filename)

    # Column j holds N draws from a_grid weighted by M[:, j], i.e. a pool of
    # perturbed outputs for inputs that fall in bin j.
    print("generating random num pools.")
    rand_num = np.zeros((N,d))
    for j in range(d):
        rand_num[:,j] = random.choices(a_grid,M[:,j],k=N)

    print('estimating data distribution with %s estimator, repeat %d times.'
           %(est_type, total_repeat))
    x_q_noisy = np.zeros((1,d))
    for k in range(total_repeat):
        # Pick a random pool row for each sample and read the column of its bin.
        idx_noise = random.choices(range(N),k=N)
        sample_perturbed = rand_num[idx_noise, idx_original]
        # Count per a_grid entry (in a_grid order) so outputs that were never
        # drawn contribute 0; np.unique would drop them and shorten the vector.
        counts = np.array([np.count_nonzero(sample_perturbed == a) for a in a_grid])
        x_q_est = est.EM(M,counts,eps)
        # Flatten before accumulating so the in-place add into the (1, d)
        # buffer does not depend on the exact shape EM returns.
        x_q_noisy += np.asarray(x_q_est).ravel()
    x_q_noisy = x_q_noisy/np.sum(x_q_noisy)
    print('q_est(x)=', x_q_noisy)

    # Find the AAA transition matrix
    _, M_aaa = OptVariance(eps, x_grid, x_q_noisy)
    filename = 'data/%s_%s_M_%.2f_%d.csv'%(est_type,data_type,eps,d)
    pd.DataFrame(M_aaa).to_csv(filename)
    print("AAA Solution Found!\n")
print('task complete!')


synthetic data generated, N=100000.
[0.008 0.027 0.078 0.159 0.23  0.224 0.159 0.079 0.028 0.008]


In [None]:
import numpy as np
import pandas as pd
import random
import utilities

np.set_printoptions(precision=3)

# --- experiment configuration ---
d = 20
dist_type = "exp"
est_type = "sw"
N = 10000 * d
total_repeat = 50

# --- inputs saved on disk ---
# privacy budgets to sweep over
eps_table = pd.read_csv('data/eps_grid.csv')
eps_grid = np.asarray(eps_table['eps_grid'])

# grid points and their probabilities for the chosen distribution
dist_table = pd.read_csv('data/%s_q_%d.csv' %(dist_type, d))
x_grid = np.asarray(dist_table['x_grid'])
x_q = np.asarray(dist_table['x_q'])

# --- draw the original samples ---
# N grid indices drawn with weights x_q, then mapped to their grid values
idx_original = random.choices(np.arange(d), x_q, k=N)
sample_original = x_grid[idx_original]
print('generating %d data samples with %s distribution.\n' %(N, dist_type))

# --- per-epsilon result buffers (one slot per entry of eps_grid) ---
var_aaa, var_est, distance_aaa, distance_est = (
    np.zeros(len(eps_grid)) for _ in range(4)
)

# For every privacy budget: load the saved AAA and baseline matrices, record
# utilities.M2Var for each, then average utilities.WassDist between the
# recovered and the true distribution over `total_repeat` resamplings.
for i, eps in enumerate(eps_grid):
    print('eps=%.2f'%(eps))

    # The CSVs were written with the DataFrame index, hence the [:, 1:] slice
    # that drops the first column.
    filename = 'data/%s_%s_M_%.2f_%d.csv'%(est_type,dist_type,eps,d)
    temp = pd.read_csv(filename)
    M_aaa = np.asarray(temp)[:,1:]

    filename = 'data/%s_M_%.2f_%d.csv'%(est_type,eps,d)
    temp = pd.read_csv(filename)
    M_est = np.asarray(temp)[:,1:]

    filename = 'data/%s_a_%.2f_%d.csv'%(est_type, eps,d)
    temp = pd.read_csv(filename)
    a_grid = np.asarray(temp['a_grid'])

    print('computing variances.')
    var_aaa[i] = utilities.M2Var(M_aaa,x_grid,x_grid,x_q)
    var_est[i] = utilities.M2Var(M_est,a_grid,x_grid,x_q)

    # Column j of each pool holds N draws from the mechanism's output grid,
    # weighted by column j of its matrix.
    print("generating random num pools.")
    rand_num_aaa = np.zeros((N,d))
    rand_num_est = np.zeros((N,d))

    for j in range(d):
        rand_num_aaa[:,j] = random.choices(x_grid,M_aaa[:,j],k=N)
        rand_num_est[:,j] = random.choices(a_grid,M_est[:,j],k=N)

    print('computing average WassDist, repeat %d times.\n' %(total_repeat))
    temp1,temp2 = 0.0,0.0
    for k in range(total_repeat):
        # One random pool row per sample; the column is the sample's own index.
        # The same row choice is used for both mechanisms.
        idx_noise = random.choices(range(N),k=N)

        sample_perturbed = rand_num_aaa[idx_noise, idx_original]
        # Count per grid value, in grid order, so the vector always has one
        # entry per x_grid point and lines up with x_q. np.unique would drop
        # values that never occur (e.g. outputs with zero probability).
        counts = np.array([np.count_nonzero(sample_perturbed == x) for x in x_grid])
        x_q_aaa = counts/N
        temp1 = temp1 + utilities.WassDist(x_q_aaa,x_q)

        sample_perturbed = rand_num_est[idx_noise, idx_original]
        # Same reasoning: one count per a_grid entry, in a_grid order.
        counts = np.array([np.count_nonzero(sample_perturbed == a) for a in a_grid])
        x_q_est = utilities.EM(M_est,counts,eps)
        x_q_est = x_q_est.ravel()
        temp2 = temp2 + utilities.WassDist(x_q_est,x_q)

    distance_aaa[i] = temp1/total_repeat
    distance_est[i] = temp2/total_repeat
    

# Gather the per-epsilon metrics into one table, one column per metric,
# and write it to disk. Column order: eps grid, distances, then variances.
results = {}
results['eps_grid'] = eps_grid
results['aaa_dist'] = distance_aaa
results['%s_dist'%(est_type)] = distance_est
results['aaa_var'] = var_aaa
results['%s_var'%(est_type)] = var_est

# NOTE(review): this path has no '.csv' extension, unlike the other data/ files
# read in this cell -- confirm whether downstream readers expect this exact name.
filename = 'data/results_%s_%s_%d' %(est_type,dist_type,d)
pd.DataFrame(results).to_csv(filename)
print('task complete!')