Paper:

http://www.jmlr.org/proceedings/papers/v40/Kamath15.pdf

k = number of symbols 
n = number of samples

Distributions:

Zipf distribution (with different parameters: 1, 1.5, 0.5)

Uniform distribution

Option to vary k

Step distribution

Estimators: Add-constant (beta = 0, 0.5, 1, sqrt(n))

Error metrics (loss function): L2, L1, chi squared, KL divergence

https://github.com/ipython/ipywidgets

In [54]:
import ipywidgets as widgets
import numpy as np
import plotly
from IPython.display import display
from ipywidgets import Output, FloatSlider
from ipywidgets import *
from plotly.graph_objs import *

# Initialise plotly's offline notebook mode; plot() below renders through plotly.offline.iplot.
plotly.offline.init_notebook_mode()
# Output widget handle; in this notebook it is only referenced by commented-out display code.
widget_output = Output()

In [55]:
def loss(p, q, funcs):
    """Evaluate one or more loss functions between two probability vectors.

    Parameters
    ----------
    p : array-like of float
        The observed (reference) distribution.
    q : array-like of float
        The predicted (estimated) distribution, same length as ``p``.
    funcs : iterable of str
        Which losses to compute. Supported names:
        ``"L1"`` (L1 distance), ``"L2_sq"`` (squared L2 distance),
        ``"chi_sq"`` (chi-squared divergence) and ``"KL_div"`` (KL divergence).

    Returns
    -------
    list
        One value per entry of ``funcs``, in the same order.

    Raises
    ------
    ValueError
        If ``funcs`` contains an unsupported name.
    """
    # Coerce up front so plain Python lists work for both arguments
    # (list - list would otherwise raise TypeError).
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    diff = p - q

    ret_val = []
    for func in funcs:
        if func == "L1":
            # L1 distance
            ret_val.append(np.sum(np.absolute(diff)))
        elif func == "L2_sq":
            # L2 squared distance
            ret_val.append(np.sum(np.square(diff)))
        elif func == "chi_sq":
            # chi-squared divergence
            ret_val.append(np.sum(np.square(diff) / q))
        elif func == "KL_div":
            # KL-divergence. Terms with p_i == 0 contribute 0 by convention;
            # evaluating them directly would give 0 * log(0) = nan.
            support = p > 0
            ret_val.append(np.sum(p[support] * np.log(p[support] / q[support])))
        else:
            raise ValueError("Please verify your loss function is valid")
    return ret_val

In [56]:
def sample_gen(n, p):
    """Draw ``n`` i.i.d. symbols from distribution ``p`` and count them.

    Each sample is a uniform draw on [0, 1) that is mapped to the symbol whose
    cumulative-probability interval contains it.

    Parameters
    ----------
    n : int
        Number of samples to draw.
    p : sequence of float
        Probability of each symbol.

    Returns
    -------
    numpy.ndarray of int
        ``symbol_cnt[i]`` is how many of the ``n`` samples were symbol ``i``.

    Raises
    ------
    ValueError
        If ``p`` is empty.
    """
    if len(p) == 0:
        raise ValueError("p must contain at least one symbol")

    # Generate random numbers in sample space
    samples = np.random.uniform(0, 1, n)

    # Upper boundary of each symbol's interval on [0, 1).
    boundaries = np.cumsum(np.asarray(p, dtype=float))

    # side="right" sends a sample that equals a boundary to the next symbol.
    # Unlike a single "advance by one" step per sample, the lookup also skips
    # over symbols whose interval received no samples (or has zero width).
    symbol_index = np.searchsorted(boundaries, samples, side="right")

    # Guard against the cumulative sum falling slightly short of 1.
    symbol_index = np.minimum(symbol_index, len(p) - 1)

    symbol_cnt = np.bincount(symbol_index, minlength=len(p)).astype(int)

    # `debug` is a notebook-level flag; treat it as off if it is not defined yet.
    if globals().get("debug"): print ("[DEBUG] Symbol distribution:", symbol_cnt)
    return symbol_cnt


def estimate(n, p, symbol_cnt, estimator, params):
    """Run a selected estimator on observed symbol counts.

    Supports the add-constant, Braess-Sauer and Good-Turing estimators.

    Parameters
    ----------
    n : int
        Number of samples.
    p : sequence of float
        The distribution/probability vector; only its length (the number of
        symbols ``k``) is used here.
    symbol_cnt : sequence of int
        How many times each symbol appears; expected to have ``k`` entries.
    estimator : str
        One of ``"Add-constant"``, ``"Braess-Sauer"`` or ``"Good-Turing"``.
    params
        Estimator-specific parameter. For the add-constant estimator this is
        the beta value; the other estimators ignore it.

    Returns
    -------
    numpy.ndarray of float
        The estimated distribution ``q``.

    Raises
    ------
    ValueError
        If ``estimator`` is not one of the supported names.
    """
    k = len(p) # number of symbols (must be known before anything is sized by it)
    counts = np.asarray(symbol_cnt)

    # Add-constant estimator
    if estimator == "Add-constant":
        beta = params
        q = (counts + beta) / (n + k * beta)

    # Braess and Sauer estimator: the added constant depends on the count
    # (0.5 for unseen symbols, 1 for singletons, 0.75 otherwise).
    elif estimator == "Braess-Sauer":
        beta = np.where(counts == 0, 0.5, np.where(counts == 1, 1.0, 0.75))
        weights = counts + beta
        # Because beta varies per symbol, normalise by the total weight so the
        # estimate sums to 1.
        q = weights / weights.sum()

    # Good-Turing estimator
    elif estimator == "Good-Turing":
        t = counts.astype(int)
        # Vector of phi's where phi_t denotes number of elements appearing t times.
        # t can vary from 0 to n+1, as phi_{t+1} is also used by this estimator.
        # Must start from zeros: the entries are incremented below.
        phi = np.zeros(n + 2, dtype=int)
        np.add.at(phi, t, 1)

        # This step is modified from the original formula, as it does not divide q
        # by the normalization factor N, which is done afterwards.
        phi_next = phi[t + 1]
        # phi[t] >= 1 for every observed count t, so the division is safe.
        smoothed = (phi_next + 1) * (t + 1) / phi[t]
        q = np.where(t > phi_next, t, smoothed).astype(float)

        # Divide all probabilities by normalization factor to ensure they add up to 1
        q = q / q.sum()

    else:
        raise ValueError("Please verify your estimator is valid")

    return q


def run (n, p, beta_arr, maxIterations):
    """Repeat the add-constant experiment and summarise the estimation error.

    For each iteration a fresh sample set is drawn with ``sample_gen``; for
    every beta in ``beta_arr`` the add-constant estimate is formed and scored
    with ``loss`` under the L1, squared-L2, chi-squared and KL losses.

    Parameters
    ----------
    n : int
        Number of samples per iteration.
    p : sequence of float
        The distribution/probability vector.
    beta_arr : iterable of float
        Beta values for the add-constant estimator.
    maxIterations : int
        Number of independent repetitions; must be at least 1.

    Returns
    -------
    list
        ``[mean, std_dev]``. In both matrices each row corresponds to a loss
        function and each column to a beta value. ``std_dev`` is the
        population standard deviation (divides by ``maxIterations``).

    Raises
    ------
    ValueError
        If ``maxIterations`` is less than 1.
    """
    if maxIterations < 1:
        raise ValueError("maxIterations must be at least 1")

    k = len(p) # number of symbols
    loss_names = ["L1", "L2_sq", "chi_sq", "KL_div"]
    error_list = [] # one (beta x loss) error matrix per iteration

    for iteration in range(0, maxIterations):
        symbol_cnt = sample_gen(n, p)

        # Add-constant estimator for each beta value
        error_matrix = []
        for beta in beta_arr:
            # probability vector for each symbol
            q = (symbol_cnt + beta) / (n + k * beta)
            error_matrix.append(loss(p, q, loss_names))

        error_list.append(np.asarray(error_matrix))

    # Stack to (iteration, beta, loss) and reduce over the iteration axis.
    errors = np.stack(error_list)
    error_matrix_mean = errors.mean(axis=0)
    error_matrix_std_dev = errors.std(axis=0)

    # `debug` is a notebook-level flag; treat it as off if it is not defined yet.
    if globals().get("debug"): print ("[DEBUG] Error_matrix dimensions:", error_matrix_mean.shape)

    # returns a list consists of error matrix mean and its standard deviation
    # each row corresponds to a loss function, each col corresponds to a beta value
    return [error_matrix_mean.T, error_matrix_std_dev.T]

In [57]:
def _error_band_traces(label, short_label, rgb, x, mean, std_dev):
    """Build the lower / mean / upper traces that draw one loss curve with its band.

    The traces are returned in the order they must be added to the figure:
    the mean and upper traces use ``fill="tonexty"``, i.e. they fill towards
    the trace that precedes them.
    """
    line_color = "rgb({}, {}, {})".format(*rgb)
    band_color = "rgba({}, {}, {}, 0.2)".format(*rgb)

    lower = Scatter(
        name = short_label + " Lower",
        showlegend = False,
        x = x,
        y = mean - std_dev,
        marker = dict(color = "#444"),
        line = dict(width = 0),
        mode = "lines")

    center = Scatter(
        name = label,
        x = x,
        y = mean,
        mode = "lines",
        line = dict(color = line_color),
        fillcolor = band_color,
        fill = "tonexty")

    upper = Scatter(
        name = short_label + " Upper",
        showlegend = False,
        x = x,
        y = mean + std_dev,
        mode = "lines",
        marker = dict(color = "#444"),
        line = dict(width = 0),
        fillcolor = band_color,
        fill = "tonexty")

    return [lower, center, upper]


def plot(beta_arr, error_matrix_range, dist=None):
    """Plot mean error versus beta for each loss, with a +/- one std-dev band.

    Parameters
    ----------
    beta_arr : array-like
        Beta values, used as the x axis.
    error_matrix_range : list
        ``[mean, std_dev]`` where both entries are indexable by loss row in
        the order L1, L2 squared, chi squared, KL divergence, and each row
        holds one value per beta.
    dist : sequence, optional
        Distribution shown in the figure title. When omitted, the
        notebook-level variable ``p`` is used.

    Returns
    -------
    None
        The figure is rendered with ``plotly.offline.iplot``.
    """
    # `debug` is a notebook-level flag; treat it as off if it is not defined yet.
    if globals().get("debug"): print(error_matrix_range[1])

    if dist is None:
        dist = p  # fall back to the notebook-level distribution

    mean, std_dev = error_matrix_range[0], error_matrix_range[1]

    # (legend label, prefix for the band traces, RGB colour), one per loss row.
    loss_styles = [
        ("L1", "L1", (231, 107, 243)),
        ("L2 squared", "L2", (0, 176, 246)),
        ("chi squared", "chi", (0, 100, 80)),
        ("KL divergence", "KL", (117, 175, 150)),
    ]

    data = []
    for row, (label, short_label, rgb) in enumerate(loss_styles):
        data.extend(_error_band_traces(label, short_label, rgb,
                                       beta_arr, mean[row], std_dev[row]))

    title = "Distribution over " + str(len(dist)) + " elements: " + str(dist)
    # Plain dicts / lists are used instead of the XAxis, YAxis and Data wrapper
    # classes, which newer plotly releases deprecate.
    layout = Layout(
        xaxis = dict(
            title='beta',
            zeroline=False),
        yaxis = dict(
            title='Error',
            zeroline = False),
        title = title,
        height = 550
    )

    fig = Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)

In [58]:
# Set to 1 to enable the [DEBUG] prints inside sample_gen, run and plot.
debug = 0

# Number of samples drawn per iteration.
n = 10
# True distribution over the symbols (alternatives kept for quick switching).
#p = [0.2, 0.3, 0.2, 0.2, 0.1]
#p = [0.1, 0.1, 0.1, 0.1, 0.1,0.1,0.1,0.1,0.1,0.1]
p = [0.9, 0.1]
# Beta grid for the add-constant estimator: beta_count points on [beta_min, beta_max).
beta_min = 0.1
beta_max = 10
beta_count = 1000
beta_step = (beta_max - beta_min) / beta_count
# Number of independent repetitions averaged by run().
maxIterations = 10
# linspace fixes the number of grid points; arange with a fractional step can
# produce one point more or fewer depending on floating-point rounding.
beta_arr = np.linspace(beta_min, beta_max, beta_count, endpoint=False)

# NOTE: no random seed is set, so the curves differ from run to run.
error_matrix_range = run (n, p, beta_arr, maxIterations)

# checkbox debug
# checkbox envelope
# sliders control beta range

# The slider is displayed but its value is not read by anything in this cell.
lambd_widget = FloatSlider(min=0.00, max=1.00, step=0.01, value=0.50)
display(lambd_widget)
#display(widget_output)
#with widget_output:
plot(beta_arr, error_matrix_range)

print("Distribution over", len(p),"elements:", p)
print("n =",n,", repetition:", maxIterations)

Distribution over 2 elements: [0.9, 0.1]
n = 10 , repetition: 10


Input:

Given a distribution

Given raw data

Given a file (.csv, and possibly .mat)

Find the max of the L1 and L2 errors, and make the range twice that

In [6]:
# Environment check: print the installed IPython version.
import IPython
print(IPython.__version__)

4.1.2
