In [5]:
import math
import os
import pickle as pk
import random as rd
from distutils.spawn import find_executable
from itertools import cycle

import matplotlib
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import scipy
from matplotlib.ticker import MaxNLocator
from scipy.stats import bootstrap
from sklearn.linear_model import LinearRegression

from file_helper import load_all_files, create_prefix_from_list
from plotting import plot_stat, put_plot_rank, get_color
from statistic.generate_statistics import genSstat, get_ranking_results, reject_if_bad_test
from table_helper import build_latex_table
from test_distribution import float_to_print

In [14]:
# Experiment configuration cell.
# Seed both NumPy's and the stdlib's global RNGs so runs are repeatable.
# The stdlib module is imported as `rd`, so the bare name `random` is not
# bound in this notebook and must not be used here.
np.random.seed(3)
rd.seed(3)

experiment = "GEN"  # either SYNTH or GEN
TYPE = "SHARP"  # TAIL, SHARP, FLAT
test_epsilon = None
delta = 0.5
compute_random = False
list_of_binning = ['algo']

if experiment == "SYNTH":  # if we generate q ourselves
    print('You are running the synthetic experiment...')

    power_base = 10
    U = power_base**power_base  # size of the universe
    m_per_splits = 20000
    init_e = 0.05
    init_b = 0.5
    splits = 10
    S = 4
    ratio = 5
    distribution_type = 'STAIRS'  # UNIFORM or STAIRS
    # One q per epsilon; the titles encode the epsilon after a single space,
    # which is what format_name() later splits on.
    list_of_espilon_q = [0, init_e, init_e*1.5, init_e*2]
    list_of_title_q = [TYPE+':q ' +
                       float_to_print(e) for e in list_of_espilon_q]

else:  # if we take q as the generative models we have, we load the samples.
    print('You are running the generative model experiment...')
    power_base = 6
    U = power_base**power_base  # size of the universe
    m_per_splits = 10000
    S = 2
    ratio = 3
    splits = 10

# Total number of samples across all splits, and the universe size.
print("for this round m is ", m_per_splits*splits)
print("and U is ", U)
# Load the ground truth and the samples for the selected experiment, then
# create the empty containers that later cells fill with results.
# NOTE(review): prob_array_to_dict, makeUniProbArr, make_stair_prob,
# load_samples and load_generative_model_samples are not imported in any
# visible cell - confirm where they come from so Restart & Run All works.
metrics = ['S', 'test', 'binning', 'A', 'nll', 'e', 'std_nll', 'l1']

if experiment == "SYNTH":
    if distribution_type == 'UNIFORM':
        ground_truth_p = prob_array_to_dict(makeUniProbArr(U))

    elif distribution_type == 'STAIRS':
        # alternative setting: posU = math.factorial(power_base)/U
        posU = 0.9
        ground_truth_p = make_stair_prob(
            U, posU=posU, ratio=ratio,  S=S)

    else:
        # `NotImplemented` is a comparison sentinel, not an exception; raising
        # it produces a TypeError. NotImplementedError is the exception class.
        raise NotImplementedError(
            "unsupported distribution_type: {!r}".format(distribution_type))
    list_of_samples, list_of_pmf_q = load_samples(
        list_of_espilon_q, init_b, ground_truth_p, splits, U, m_per_splits, S, ratio, TYPE)
else:
    dict_of_samples, ground_truth_p = load_generative_model_samples(
        power_base, num_files=1, using_max=True)
    # Samples and titles are taken from the same dict so they share one order.
    list_of_samples = [val for _, val in dict_of_samples.items()]
    list_of_title_q = [key for key, _ in dict_of_samples.items()]
    list_of_pmf_q = None

# Result containers are identical for both experiments, so build them once:
#   store_results[metric][title] -> {}   and   store_results_ranking[algo] -> []
store_results = {}
store_results_ranking = {}
for algo in list_of_binning:
    store_results_ranking[algo] = []
for metric in metrics:
    store_results[metric] = {}
    for title in list_of_title_q:
        store_results[metric][title] = {}

trials = 50

You are running the generative model experiment...
for this round m is  100000
and U is  46656
PERCENT NULL 0.1
shuffling process within the samples
shuffling process within the samples
should_be_one 1.0000000000000022
should_be_one 1.000000000000002
should_be_one 1.000000000000002
should_be_one 1.000000000000002


In [15]:
# Debug dump of the loaded samples: a dict mapping each model name to a list
# of per-trial dicts (outcome index -> value).
# NOTE(review): this prints every entry of every trial, which floods the cell
# output; consider printing a summary (keys, number of trials, support size).
# NOTE(review): dict_of_samples is only assigned in the non-SYNTH branch of
# the loading cell, so this cell fails when experiment == "SYNTH".
print(dict_of_samples)

{'argmaxAR': [{165: 0.0011500000000000024, 301: 0.0017000000000000038, 281: 0.0016800000000000038, 676: 0.0003700000000000003, 278: 0.001410000000000003, 216: 0.0024400000000000055, 678: 0.0008900000000000017, 79: 0.0018900000000000043, 57: 0.0015600000000000034, 60: 0.002160000000000005, 396: 0.0015100000000000033, 19: 0.0021000000000000046, 7: 0.002610000000000006, 138: 0.0019000000000000043, 72: 0.0027700000000000064, 12321: 2e-05, 61: 0.001430000000000003, 344: 0.0014700000000000032, 460: 0.0007500000000000013, 176: 0.0019500000000000045, 413: 0.0007500000000000013, 414: 0.0016300000000000036, 35: 0.0024000000000000054, 321: 0.0021500000000000048, 643: 0.0004900000000000006, 77: 0.002250000000000005, 17: 0.002970000000000007, 247: 0.0019500000000000045, 140: 0.002570000000000006, 180: 0.002670000000000006, 62: 0.002200000000000005, 555: 0.000640000000000001, 191: 0.001740000000000004, 1783: 1e-05, 326: 0.0015700000000000035, 404: 0.0015700000000000035, 626: 0.000620000000000001, 24

In [20]:
# Half of power_base!; used below as the width of each ground-truth step.
# Floor division keeps the computation in exact integer arithmetic instead of
# going through a float and back.
C = math.factorial(power_base) // 2

# Replace the model titles with the keys stored under metric 'A'.
# NOTE(review): `data` is not defined in any visible cell - presumably loaded
# results from an earlier session; confirm, and confirm that its key order
# matches the order of list_of_samples, since titles are looked up by index.
list_of_title_q = list(data['A'].keys())
def format_name(q_name):
    """Return the legend label for the model titled ``q_name``.

    Reads the module-level ``experiment`` flag.

    * Outside the synthetic experiment the title is returned unchanged,
      except that ``'FCDM'`` is displayed as ``'GMCD'``.
    * In the synthetic experiment the second space-separated token of the
      title is parsed as epsilon: zero yields ``'$p$'``, anything else
      yields ``'$q_{<epsilon with two decimals>}$'``.
    """
    if experiment != 'SYNTH':
        return 'GMCD' if q_name == 'FCDM' else q_name

    eps_value = float(q_name.split(' ')[1])
    if eps_value == 0:
        return '$p$'
    # Doubled braces are literal braces; the inner pair is the format field.
    return '$q_{{{:.2f}}}$'.format(eps_value)
# Scatter the first-trial values of each model against the outcome index,
# overlay the two-step ground truth, and save the figure to 'unsorted.pdf'.

# Loop-invariant: only outcomes with index below power_base! are drawn.
n_positive = math.factorial(power_base)

# Draw on an explicit figure so nothing depends on pyplot's current figure.
fig, ax = plt.subplots()

# NOTE(review): the `- 1` skips the last entry of list_of_samples; kept as in
# the original - confirm this is intended.
for model_idx in range(len(list_of_samples) - 1):
    first_trial_of_model = list_of_samples[model_idx][0]

    # Filter keys and values together so each x stays paired with its own y.
    kept_points = [(x, y) for x, y in first_trial_of_model.items()
                   if x < n_positive]
    only_pos_x = [x for x, _ in kept_points]
    only_pos_y = [y for _, y in kept_points]

    ax.plot(only_pos_x, only_pos_y, 'o', markersize=1,
            label=format_name(list_of_title_q[model_idx]))

# Ground truth: the first C outcomes get 3/(2*power_base!) each and the next
# C outcomes get 1/(2*power_base!) each.
gt_y1 = [3 / (2 * n_positive)] * C
gt_y2 = [1 / (2 * n_positive)] * C

ax.plot(range(C * 2), gt_y1 + gt_y2, label='ground truth')
ax.legend()
ax.set_ylabel('p(x)')
ax.set_xlabel('x')
fig.savefig('unsorted.pdf')
plt.close(fig)