Paper:

http://www.jmlr.org/proceedings/papers/v40/Kamath15.pdf

k = number of symbols 
n = number of samples

Distributions:

Zipf distribution (with different parameters: 1, 1.5, 0.5)

Uniform distribution

Option to vary k

Step distribution

Estimators: Add-constant (beta = 0, 0.5, 1, sqrt(n))

Error metrics (loss functions): squared L2, L1, chi-squared, KL divergence

In [72]:
# Include libraries
import numpy as np

In [73]:
# Parameter settings
# These module-level values are read by the sampling, estimator and error
# cells further down; change them here to rerun the experiment.
k = 5      # number of symbols (alphabet size); equals len(p) below
n = 10000   # number of samples drawn
beta = 1   # additive constant of the add-constant estimator: (count + beta) / (n + k * beta)
p = [0.2, 0.3, 0.2, 0.2, 0.1] # true probability vector over the k symbols (entries sum to 1)
maxIterations = 10 # NOTE(review): not referenced in the cells shown here; presumably a repeat count for trials, confirm

In [74]:
# Integrated loss function
def loss(p, q, funcs):
    """Evaluate one or more loss functions between two distributions.

    Args:
        p: True probability vector (any array-like of numbers).
        q: Estimated probability vector, broadcastable against ``p``.
        funcs: Iterable of metric names. Recognized names are ``"L1"``,
            ``"L2_sq"``, ``"chi_sq"`` and ``"KL_div"``.

    Returns:
        A list with one value per entry of ``funcs``, in the same order.
        An unrecognized name yields ``-1`` in its position.
    """
    # Coerce once so plain Python lists work for either argument.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    diff = p - q

    ret_val = []
    for func in funcs:
        if func == "L1":
            # L1 distance
            ret_val.append(np.sum(np.absolute(diff)))
        elif func == "L2_sq":
            # L2 squared distance
            ret_val.append(np.sum(np.power(diff, 2)))
        elif func == "chi_sq":
            # chi-squared divergence
            ret_val.append(np.sum(np.power(diff, 2) / q))
        elif func == "KL_div":
            # KL-divergence. Terms with p_i == 0 contribute 0 (the usual
            # 0 * log(0) = 0 convention); evaluated naively they are nan.
            # The discarded entries are still computed inside np.where, so
            # their floating-point warnings are silenced.
            with np.errstate(divide="ignore", invalid="ignore"):
                terms = np.where(p > 0, p * np.log(p / q), 0.0)
            ret_val.append(np.sum(terms))
        else:
            # Unknown metric name: keep the historical -1 sentinel.
            ret_val.append(-1)
    return ret_val

In [75]:
# Draw n symbols from p by inverse-CDF sampling of Uniform(0, 1) variates
samples = np.sort(np.random.uniform(0,1,n))

# Assign symbol & count symbols
# cdf[i] is the upper edge of symbol i's interval: symbol i owns
# [cdf[i-1], cdf[i]) with cdf[-1] taken as 0.
cdf = np.cumsum(p)

# Locate the interval of every sample in one vectorized pass.
# side="right" keeps the convention that a sample exactly equal to an edge
# belongs to the next symbol. Unlike a single "advance by one" step per
# sample, this stays correct when some symbol receives no samples at all.
# The clip guards against cdf[-1] landing slightly below 1 through
# floating-point rounding, which would otherwise yield the invalid index k.
symbol_index = np.minimum(np.searchsorted(cdf, samples, side="right"), k - 1)

# minlength=k keeps zero-count symbols in the result.
symbol_cnt = np.bincount(symbol_index, minlength=k) # count for each symbol

In [76]:
# Add-constant estimator
# Each symbol's count is raised by beta and the total by k * beta, so the
# estimates still sum to 1. Computed for all symbols at once instead of
# filling a preallocated array element by element.
smoothed_total = n + k * beta
p_est = (symbol_cnt + beta) / smoothed_total # probability vector for each symbol

In [77]:
# Error calculation
# Evaluate the losses between the true vector p and the add-constant
# estimate p_est; the printed values follow the order of the names below.
metrics = ["L1", "L2_sq", "chi_sq", "KL_div"]
error = loss(p, p_est, metrics)
print(error)

[0.012093953023488216, 3.539959155854344e-05, 0.00019913221714025936, 9.9418032047736849e-05]
