## This code simulates the outcome of many rounds of top-k sampling.
We generate a table of outcomes (z-scores) once, and then simply multiply those z-scores across for whatever distribution we wish to apply them to.

In [18]:
import numpy as np
import bisect

# Largest list length simulated; every length from 1 through n gets its own table.
n = 100
# Number of simulation repetitions. More repetitions mean more samples behind
# each estimate, and therefore a tighter estimate of the z-scores.
precision = 1000
'''
Dictionary has the following structure:
list_length (1 through n) : {1 : [associated values] 
                through n : [associated values]}
'''
# values_lists[list_length][k] -> list of every value observed at position k
# (1-based, counted from the smallest) of an ascending list of that length.
values_lists = {}
# final_scores mirrors the layout of values_lists, but each innermost entry is a
# summary dict with the keys 'mean', 'std' and 'num_samples' instead of a list.
final_scores = {}

In [19]:
# Monte Carlo tabulation of order statistics for standard-normal draws.
# Each repetition grows one ascending list a draw at a time; after every draw
# the value at each position of the current list is recorded under
# values_lists[current_length][position], positions counted from 1 (smallest).
for _ in range(precision):
    ascending_draws = []
    for size in range(1, n + 1):
        per_position = values_lists.get(size)
        if not per_position:
            # No data recorded yet for this list length: create its tables.
            per_position = values_lists[size] = {}
            final_scores[size] = {}

        # insort keeps the list sorted; the insertion itself is O(len(list)).
        bisect.insort_left(ascending_draws, np.random.standard_normal())

        # The list now holds exactly `size` values, so enumerate visits
        # positions 1..size.
        for position, value in enumerate(ascending_draws, start=1):
            recorded = per_position.get(position)
            if recorded:
                recorded.append(value)
            else:
                per_position[position] = [value]
                # Placeholder summary (all zeros) until statistics are computed.
                final_scores[size][position] = {'mean': 0, 'std': 0, 'num_samples': 0}



In [23]:
# Collapse every list of simulated values into summary statistics.
# For each list length and each position k, final_scores[list_len][k] is
# updated in place with the mean, standard deviation and count of the values
# collected in values_lists[list_len][k].
for list_len, k_ths in values_lists.items():
    for k_th, observations in k_ths.items():
        samples = np.asarray(observations)  # convert once, reuse for both statistics
        final_scores[list_len][k_th].update(
            mean=np.mean(samples),
            std=np.std(samples),  # np.std default (ddof=0), i.e. population std
            num_samples=len(observations),
        )

# Uncomment to print the summary table, one list length per paragraph:
# for list_len in final_scores:
#     print(final_scores[list_len])
#     print()

{1: {'mean': -0.03368072199135788, 'std': 0.9838961671815148, 'num_samples': 1000}}

{1: {'mean': -0.5766882288051718, 'std': 0.8382815444258518, 'num_samples': 1000}, 2: {'mean': 0.5333677953079043, 'std': 0.8414702661191852, 'num_samples': 1000}}

{1: {'mean': -0.8867885827160082, 'std': 0.7593317655392515, 'num_samples': 1000}, 2: {'mean': -0.035703870028768084, 'std': 0.6641409362912024, 'num_samples': 1000}, 3: {'mean': 0.8186699268884139, 'std': 0.7524899753617975, 'num_samples': 1000}}

{1: {'mean': -1.0693544486531972, 'std': 0.7142582373425914, 'num_samples': 1000}, 2: {'mean': -0.3274670226507743, 'std': 0.5822688276086269, 'num_samples': 1000}, 3: {'mean': 0.2537288481225415, 'std': 0.5819472351242333, 'num_samples': 1000}, 4: {'mean': 1.0067338253543126, 'std': 0.7104692788121695, 'num_samples': 1000}}

{1: {'mean': -1.1990827611282797, 'std': 0.6778217551879349, 'num_samples': 1000}, 2: {'mean': -0.5157063134092102, 'std': 0.5489936793779562, 'num_samples': 1000}, 3: {'mea

In [None]:
# Let's observe the shape of a particular list to see a fuller distribution. We're going to need to retrieve said shape from a full list.
# Right now