# Probability distributions

## Let's define a few things


In [1]:
# %matplotlib notebook
import numpy as np
from numpy import log
import plotly.offline as py
import plotly.graph_objs as go

# Figure layout shared by every plot: a fixed 600x600 canvas without margins
# whose 3D scene axes are titled target (x), prediction (y) and measure (z).
layout = go.Layout(
    autosize=False,
    width=600,
    height=600,
    margin=go.layout.Margin(l=0, r=0, b=0, t=0),
    scene={
        'xaxis': {'title': 'target'},
        'yaxis': {'title': 'prediction'},
        'zaxis': {'title': 'measure'},
    },
)

# Machine epsilon of Python floats: the offset used to move probabilities
# away from the exact values 0 and 1.
EPSILON = np.finfo(float).eps


def clip(value):
    """
    Move probabilities that are exactly 0 or 1 strictly inside (0, 1).

    `value` can be a scalar or any array-like of numbers; an exact 0 becomes
    `EPSILON`, an exact 1 becomes `1 - EPSILON` and every other value is
    returned unchanged. Scalars give back a scalar, array-likes give back a
    numpy array of the same shape.
    """
    probabilities = np.asarray(value, dtype=float)
    nudged = np.where(probabilities == 0, EPSILON, probabilities)
    nudged = np.where(probabilities == 1, 1 - EPSILON, nudged)
    # indexing with an empty tuple unwraps a 0-d result into a scalar and
    # leaves arrays with one or more dimensions as they are
    return nudged[()]

In [2]:
def plot_divergence(divergence, colors, plotting=True, axis=None):
    """
    Evaluate every divergence on a set of (target, prediction) points and
    optionally draw the results as a 3D scatter plot.

    `divergence` must be a list of callables taking prediction and target (in
    this order) and returning a value. The list is updated in place: each
    callable is replaced by the numpy array of the values it produced, which
    is the only way to read them back when `plotting` is False.
    `colors` must be a list of strings, one marker color per divergence.
    `plotting` tells whether the figure must be built and shown.
    `axis` can be a tuple `(x, y)` of equally long sequences, where `x` holds
    the targets and `y` the predictions in which the functions will be
    evaluated; when omitted, every combination of targets and predictions
    taken from `np.arange(0, 1.05, 0.05)` is used.
    """

    # creating points for axis: doing this here is convenient if multiple
    # divergences are passed as argument
    if axis is None:
        grid = np.arange(0, 1.05, 0.05)
        # 'ij' indexing keeps the target as the slowest-varying coordinate
        x, y = (points.ravel() for points in np.meshgrid(grid, grid, indexing='ij'))
    else:
        x, y = axis

    for j, fun in enumerate(divergence):
        # x is the target axis and y the prediction axis, while the callables
        # expect the prediction as their first argument
        values = [fun(prediction, target) for target, prediction in zip(x, y)]
        # storing values in place of the callable
        divergence[j] = np.array(values)

    if plotting:
        data = [
            go.Scatter3d(
                x=x, y=y, z=z,
                mode='markers',
                marker=dict(
                    size=3,
                    color=colors[i],
                    opacity=0.5
                )
            )
            for i, z in enumerate(divergence)
        ]
        fig = go.Figure(data=data, layout=layout)
        py.iplot(fig)


## Ok, now we'll look into Kullback-Leibler divergence.
Note that the actual Kullback-Leibler divergence is a sum, while we are looking at the argument of that sum: Kullback-Leibler sums over all the predictions that the model makes; we, instead, are looking at how each single prediction is scored by the Kullback-Leibler divergence.

In [3]:
# Kullback-Leibler divergence

def kullbach_leibler(y_pred, y_true):
    """
    Pointwise Kullback-Leibler term `P * log(P / Q)`, where `P` is the true
    probability and `Q` the predicted one.

    Both probabilities go through `clip` first, so that an exact 0 never
    reaches the division or the logarithm.
    """
    true_prob = clip(y_true)
    predicted_prob = clip(y_pred)
    ratio = true_prob / predicted_prob
    return true_prob * np.log(ratio)

# scatter plot of the term on the default grid
plot_divergence(divergence=[kullbach_leibler], colors=['brown'])

ModuleNotFoundError: No module named 'retrying'

## Ok, this is strange...

Kullback-Leibler does not have a clear meaning here and goes completely crazy if the true value is $0$. Let's see what happens for the other values by removing the extremes $0$ and $1$ from the grid...

In [5]:
# Points that leave out the extremes: every combination of target (x) and
# prediction (y) taken from np.arange(0.05, 1.0, 0.05), with the target as
# the slowest-varying coordinate.
x, y = np.meshgrid(
    np.arange(0.05, 1.0, 0.05),
    np.arange(0.05, 1.0, 0.05),
    indexing='ij',
)
x = x.ravel()
y = y.ravel()

plot_divergence(divergence=[kullbach_leibler], colors=['brown'], axis=(x, y))


# REEEEEEEEEALLY STRANGE!
Kullback-Leibler seems to favour incorrect predictions at some points... but let's see what happens if we use Kullback-Leibler by summing the divergence for the probability of the event being true and the divergence for the probability of it being false:


In [6]:
def kullbach_leibler_corrected(y_pred, y_true):
    """
    Sum of the `kullbach_leibler` term of the probabilities and of the
    `kullbach_leibler` term of their complements.
    """
    event_term = kullbach_leibler(y_pred, y_true)
    complement_term = kullbach_leibler(1 - y_pred, 1 - y_true)
    return event_term + complement_term

# first on the default grid, then on the points stored in x and y
plot_divergence(divergence=[kullbach_leibler_corrected], colors=['brown'])
plot_divergence(divergence=[kullbach_leibler_corrected], colors=['brown'], axis=(x, y))


Basically, KL can be used successfully in binary classification problems just by adding the divergence of the other class. The idea comes from binary cross-entropy, which is used in machine learning. It takes its minimum values for correct predictions when the true value is $0$ or $1$, but it does not behave as well for true values such as $0.5$; because of this, binary cross-entropy is used for binary classification tasks:

## Binary cross-entropy

In [8]:
# binary cross entropy
def binary_crossentropy(y_pred, y_true):
    """
    Pointwise binary cross-entropy
    `-(y_true * log(P) + (1 - y_true) * log(1 - P))`, where `P` is the
    prediction after `clip`.

    The leading minus sign makes this a loss: it is close to 0 when the
    prediction matches a true value of 0 or 1 and grows as the prediction
    moves away from it.
    """
    # clip keeps both log(P) and log(1 - P) finite for predictions of 0 or 1
    P = clip(y_pred)

    return -(y_true * np.log(P) + (1 - y_true) * np.log(1 - P))

plot_divergence([binary_crossentropy], ['brown'])
plot_divergence([binary_crossentropy], ['brown'], axis=(x, y))

## Jensen–Shannon divergence
A quite common divergence to compare probability distributions is the Jensen–Shannon divergence (Information Radius, IRad). It is similar to KL but it's symmetric and finite (WOW).

In [9]:
def jensen_shannon(y_pred, y_true):
    """
    Pointwise Jensen-Shannon term: the mean of the `kullbach_leibler` terms of
    the target and of the prediction, each measured against their midpoint.
    """
    midpoint = 0.5 * (y_pred + y_true)
    target_term = kullbach_leibler(midpoint, y_true)
    prediction_term = kullbach_leibler(midpoint, y_pred)
    return 0.5 * target_term + 0.5 * prediction_term

def jensen_shannon_kl_corrected(y_pred, y_true):
    """
    Jensen-Shannon term built on `kullbach_leibler_corrected`: the mean of the
    corrected terms of the target and of the prediction, each measured
    against their midpoint.
    """
    midpoint = 0.5 * (y_pred + y_true)
    target_term = kullbach_leibler_corrected(midpoint, y_true)
    prediction_term = kullbach_leibler_corrected(midpoint, y_pred)
    return 0.5 * target_term + 0.5 * prediction_term

def jensen_shannon_corrected(y_pred, y_true):
    """
    Sum of the `jensen_shannon` term of the probabilities and of the
    `jensen_shannon` term of their complements.
    """
    event_term = jensen_shannon(y_pred, y_true)
    complement_term = jensen_shannon(1 - y_pred, 1 - y_true)
    return event_term + complement_term

def difference(y_pred, y_true):
    """
    Gap between the two corrected Jensen-Shannon variants:
    `jensen_shannon_corrected` minus `jensen_shannon_kl_corrected`.
    """
    corrected_outside = jensen_shannon_corrected(y_pred, y_true)
    corrected_inside = jensen_shannon_kl_corrected(y_pred, y_true)
    return corrected_outside - corrected_inside

# plain Jensen-Shannon term
plot_divergence(divergence=[jensen_shannon], colors=['brown'])
# both corrected variants together with their difference
plot_divergence(
    divergence=[jensen_shannon_corrected, jensen_shannon_kl_corrected, difference],
    colors=['yellow', 'green', 'blue'],
)
# the difference alone
plot_divergence(divergence=[difference], colors=['blue'])


Jensen–Shannon divergence seems much, much better, and it is even better in the binary classification case. However, applying the correction to the KL terms or to IRad itself seems to produce an almost negligible but nonzero difference (the two forms are algebraically identical, so the gap comes from floating-point rounding).

## Squared error and Absolute error
We all know them; let's just plot them and see that they are not that bad, after all.


In [81]:
def squared_error(y_pred, y_true):
    """Square of the gap between prediction and target."""
    residual = y_pred - y_true
    return residual ** 2
def absolute_error(y_pred, y_true):
    """Absolute value of the gap between prediction and target."""
    residual = y_pred - y_true
    return abs(residual)

# both errors next to the corrected Jensen-Shannon term, on the default grid
plot_divergence(
    divergence=[squared_error, absolute_error, jensen_shannon_corrected],
    colors=['red', 'green', 'blue'],
)


## Kononenko-Bratko Information Score
This one is reported by Japkowicz and Shah (2011), but it seems very strange...


In [85]:
def kononenko_bratko(y_pred, y_true):
    """
    Difference between the logarithms of the prediction and of the target
    when the prediction is at least as large as the target; otherwise the
    same difference computed on their complements.

    Every probability goes through `clip` before its logarithm is taken.
    """
    if y_pred >= y_true:
        predicted, expected = y_pred, y_true
    else:
        # NOTE(review): this branch is positive whenever y_pred < y_true,
        # while Kononenko & Bratko's score is presumably negative in that
        # case - confirm the sign against the reference.
        predicted, expected = 1 - y_pred, 1 - y_true
    return np.log(clip(predicted)) - np.log(clip(expected))

# first on the default grid, then on the points stored in x and y
plot_divergence(divergence=[kononenko_bratko], colors=['brown'])
plot_divergence(divergence=[kononenko_bratko], colors=['brown'], axis=(x, y))