### Distance Metrics for Probability Distributions

We'll look at 3 different distance metrics and see how different probability distributions compare under each of them.

### Creating probability distributions

In [None]:
import pymc3 as pm
import numpy as np

In [None]:
import matplotlib.pyplot as plt

# Seed NumPy's global random state so the simulated data set is reproducible
np.random.seed(123)

# Ground-truth regression parameters used to generate the data
alpha = 1
sigma = 1
beta = [1, 2.5]

# Number of simulated observations
size = 100

# Two predictors drawn from a standard normal; the second is scaled by 0.2
X1 = np.random.randn(size)
X2 = 0.2 * np.random.randn(size)

# Outcome: linear predictor plus standard-normal noise scaled by sigma
Y = alpha + beta[0] * X1 + beta[1] * X2 + sigma * np.random.randn(size)

### Create Models

Let's create traces based on different sampling methods.

In [None]:
# Linear regression model: Y_obs ~ Normal(alpha + beta[0]*X1 + beta[1]*X2, sigma)
with pm.Model() as basic_model:

    # Priors: Normal(0, 10) on the intercept and on both slopes,
    # HalfNormal(1) on the noise scale.
    # NOTE: these assignments rebind the names alpha, beta and sigma to
    # model variables, replacing whatever those names held before.
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=10, shape=2)
    sigma = pm.HalfNormal('sigma', sd=1)

    # Linear predictor built from the two predictor arrays
    mu = alpha + beta[0] * X1 + beta[1] * X2

    # Normal likelihood tied to the observed outcomes Y
    Y_obs = pm.Normal('Y_obs', mu=mu, sd=sigma, observed=Y)

In [None]:
from scipy import optimize

with basic_model:
    # Draw 500 posterior samples with whatever step method pm.sample selects
    # when none is given. The draw count is passed explicitly (as in the other
    # sampling cells) so the trace length matches this comment instead of
    # depending on the installed library's default for `draws`.
    trace_default = pm.sample(draws=500)

In [None]:
with basic_model:
    # Starting point: the MAP estimate, located with SciPy's Powell optimizer
    start = pm.find_MAP(fmin=optimize.fmin_powell)
    # Step method: slice sampling
    step = pm.Slice()
    # 5000 posterior draws, starting the chain at the MAP estimate
    trace_slice = pm.sample(draws=5000, step=step, start=start)

In [None]:
with basic_model:
    # Step method: Hamiltonian Monte Carlo with its default settings
    step = pm.HamiltonianMC()
    # 5000 posterior draws
    trace_HMC = pm.sample(draws=5000, step=step)

In [None]:
with basic_model:
    # Step method: NUTS with its default settings
    step = pm.NUTS()
    # 5000 posterior draws
    trace_NUTS = pm.sample(draws=5000, step=step)

In [None]:
with basic_model:
    # Step method: Metropolis with its default settings
    step = pm.Metropolis()
    # 5000 posterior draws
    trace_metropolis = pm.sample(draws=5000, step=step)

In [None]:
# SMC is still an experimental method.
# with basic_model:
#     step = pm.SMC()
#     # draw 5000 posterior samples
#     trace_SMC = pm.sample(5000, step=step)

### Creating Manifolds

Torus. Sphere?

### Kullback–Leibler divergence

In [None]:
def KLdivergence(dist_1, dist_2):
    """Return the Kullback-Leibler divergence of dist_1 from dist_2.

    Computes ``sum(p * log(p / q))`` (natural logarithm) over all entries,
    where ``p`` is ``dist_1`` and ``q`` is ``dist_2``.

    Parameters
    ----------
    dist_1, dist_2 : array_like
        Probabilities of the two distributions on a common support. Lists,
        tuples and NumPy arrays are accepted; the inputs are broadcast
        against each other. They are used as given (no normalization).

    Returns
    -------
    float
        The divergence. Entries where ``dist_1`` is exactly 0 contribute 0
        (the usual ``0 * log 0 = 0`` convention) instead of NaN. The result
        is ``inf`` when ``dist_2`` is 0 somewhere ``dist_1`` is positive.
    """
    p = np.asarray(dist_1, dtype=float)
    q = np.asarray(dist_2, dtype=float)
    # The masked-out entries still get evaluated by NumPy, so silence the
    # divide/invalid warnings that log(0) and 0/0 would emit for them.
    with np.errstate(divide="ignore", invalid="ignore"):
        terms = np.where(p == 0, 0.0, p * np.log(p / q))
    distance = np.sum(terms)
    return distance

### Hellinger Distance

In [None]:
def hellinger(dist_1, dist_2):
    """Return the Hellinger distance between two discrete distributions.

    The value is ``sqrt(0.5 * sum((sqrt(dist_1) - sqrt(dist_2)) ** 2))``,
    summed over every entry of the inputs.
    """
    # Element-wise difference of the square roots of the two inputs
    root_gap = np.sqrt(dist_1) - np.sqrt(dist_2)
    half_squared_norm = 0.5 * np.square(root_gap).sum()
    return np.sqrt(half_squared_norm)

### Fisher-Rao Metric

The Fisher-Rao metric is a particular Riemannian metric. Normally we would have a statistical manifold with coordinates at each point; in this small snippet we will make do with pseudo code.

In [None]:
def fischer_rao(distribution, coordinate_1, coordinate_2):
    """Pseudo-code sketch of a Fisher-Rao style quantity.

    Multiplies the log of ``distribution`` evaluated at ``coordinate_1`` by
    the log of ``distribution`` evaluated at ``coordinate_2``, weights the
    product by ``distribution`` and reduces the result with ``np.sum``.

    NOTE(review): ``distribution`` is called like a function in the first two
    factors but used as a plain multiplicand in the third. With an ordinary
    callable that multiplication presumably fails -- confirm which weight
    (e.g. the density values themselves) was intended before relying on this.
    """
    distance = np.sum(np.log(distribution(coordinate_1)) * np.log(distribution(coordinate_2))*distribution)
    return distance

### SoftAbs Metric

The SoftAbs metric is based on an exponential map.
To use it we need to compute the gradient of the quadratic form and of the log determinant.
Here $p$ is the momentum vector and $\pi(q)$ is the N-dimensional target density.

$H = Q \Lambda Q^T$

$\Lambda = \mathrm{Diag}(\lambda_i)$

$\Lambda$ is the diagonal matrix of eigenvalues of $H$ and $Q$ is the corresponding matrix of eigenvectors.

In [None]:
def grad_quad(H_ij, p):
    """PSEUDO CODE: gradient of the quadratic form for the SoftAbs metric.

    This is mathematical notation written in function form, not runnable
    Python: ``decompose``, ``diag``, ``coth``, ``d`` and ``Trace`` are not
    defined, and `` . `` stands for a matrix product.

    H_ij -- the matrix that is eigendecomposed into eigenvectors ``Q`` and
            eigenvalues ``lambda_i``.
    p    -- the momenta.

    ``Q_t`` denotes the transpose of ``Q``. ``alpha`` is a free parameter of
    the expression and is not defined in this function. ``d(...)``
    presumably denotes a derivative -- TODO confirm against the reference
    this was transcribed from.
    """
    Q, lambda_i = decompose(H_ij)
    # Closing parenthesis of diag(...) restored; the line was unbalanced.
    D = diag(Q_t . p / (lambda_i . coth(alpha . lambda_i)))
    J = d(lambda_i . coth(alpha . lambda_i))
    grad = - Trace(Q . D . J . D . Q_t . d(H))
    return grad

In [None]:
def grad_log(H_ij):
    Q, lambda_i = decompose(H_ij)
    J = d(lambda_i . coth(alpha . lambda_i))
    R = diag(1 / lambda_i . coth(alpha . lambda_i)
    grad = Trace(Q . (R ◦ J). Q_t . dH)
    return grad