In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import ezmc

# Testing MCMC Samplers

We can validate an MCMC sampler as follows:

- Choose a multivariate distribution which we can sample from directly, e.g. a multivariate Normal.
- Draw N samples...
    - Directly.
    - Using the MCMC samplers to be tested (after burn-in, thinning)
    - Using pymc3
- Compare the results
    - Visually using density plots. ✓
    - By looking at quantiles. ✗
    - Kolmogorov-Smirnov test? ✗
         - This shows a significant difference even for pymc3, so maybe not.
         

Let's use a bivariate Normal as our reference distribution,

$$
[\theta_1, \theta_2] \sim Normal(\mu, \Sigma); \\
\mu = [\mu_1, \mu_2];\\
\Sigma = 
\begin{bmatrix}
 \sigma_1^2 & \rho \sigma_1 \sigma_2 \\
 \rho \sigma_2 \sigma_1 & \sigma_2^2
\end{bmatrix};\\
$$

with 
$$
\mu_1 = 0, \mu_2 = 0, \sigma_1 = 1, \sigma_2 = 1, \rho = .9
$$

In [None]:
# Parameters of the bivariate Normal reference distribution, in the order
# (mean 1, mean 2, sd 1, sd 2, correlation).
# Alternative setting kept for experimentation:
# reference_pars = [0, 1, 4, 2, .8]
reference_pars = [0, 0, 1, 1, .9]
mu1, mu2, s1, s2, r = reference_pars

# Mean vector and covariance matrix assembled from the scalars above.
MU = [mu1, mu2]
SIGMA = [
    [s1 ** 2, r * s1 * s2],
    [r * s2 * s1, s2 ** 2],
]
reference_dist = stats.multivariate_normal(cov=SIGMA, mean=MU)

def posterior_density(x):
    """Evaluate the reference distribution at ``x`` on the log scale.

    Despite the name, the returned value is a log-density: it is whatever
    ``reference_dist.logpdf`` returns for ``x``.
    """
    log_density = reference_dist.logpdf(x)
    return log_density

# Seed NumPy's global random state so that a fresh "Restart & Run All"
# reproduces the same draws (scipy falls back to this global state when no
# random_state is given).
SEED = 0
np.random.seed(SEED)

# Number of draws to compare across all sampling methods.
n_samples = 100000
# rvs gives one draw per row; transpose so that each row holds one parameter.
direct_samples = reference_dist.rvs(n_samples).T

In [None]:
## Some plotting tools
def setup_axes(newfigure=True):
    """Label the current axes as θ1 vs θ2 and fix both ranges to (-5, 5).

    Parameters
    ----------
    newfigure : bool, optional
        When true, a new 4x4 inch figure is created first and returned.
        Otherwise the current figure's axes are configured in place.

    Returns
    -------
    matplotlib.figure.Figure or None
        The newly created figure, or ``None`` when ``newfigure`` is false.
    """
    fig = plt.figure(figsize=(4, 4)) if newfigure else None
    # The original referenced an undefined ``ax``; fetch the current axes
    # explicitly (this creates them on the new figure if none exist yet).
    ax = plt.gca()
    ax.set_xlabel('$θ_1$')
    ax.set_ylabel('$θ_2$')
    ax.set_xlim(-5, 5)
    ax.set_ylim(-5, 5)
    return fig
    
def density2d(x, y, dpi=10, xlim=None, ylim=None):
    """Draw a 2-D scatter-density plot of ``y`` against ``x`` on a new figure.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points to plot.
    dpi : int, optional
        Forwarded to ``scatter_density``.
    xlim, ylim : sequence of two numbers, optional
        Axis limits. When omitted, (-5, 5) is used so that plots of different
        samplers share the same frame (this was previously the only
        behaviour; explicit limits were silently ignored).
    """
    # Imported for its side effect: it makes the 'scatter_density'
    # projection used below available to matplotlib.
    import mpl_scatter_density  # noqa: F401
    if xlim is None:
        xlim = (-5, 5)
    if ylim is None:
        ylim = (-5, 5)
    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_subplot(1, 1, 1, projection='scatter_density')
    ax.scatter_density(x, y, dpi=dpi)
    ax.set_xlabel('$θ_1$')
    ax.set_ylabel('$θ_2$')
    ax.set_xlim(*xlim)
    ax.set_ylim(*ylim)
    
# Raw scatter of the direct draws: first row on the x-axis, second on the y-axis.
setup_axes()
plt.scatter(*direct_samples, alpha=.5)
plt.title('Direct samples')
plt.show()

In [None]:
# Density view of the same direct draws.
density2d(x=direct_samples[0], y=direct_samples[1])

## Metropolis Sampler

Next, we use an MCMC sampler (Metropolis in this case) to sample from this same distribution.

In [None]:
def initialise():
    """Draw a random 2-element starting point from a Normal(0, 20)."""
    start = np.random.normal(loc=0, scale=20, size=2)
    return start

# Metropolis sampler targeting the reference log-density, started from the
# random points produced by `initialise`.
proposal_sd = .5
metropolis = ezmc.MetropolisSampler(
    func=posterior_density,
    par_names=['θ1', 'θ2'],
    init_func=initialise,
    proposal_sd=proposal_sd,
    noisy=False,
)

## Work out how many steps each chain needs so that, once the burn-in is
## dropped and the rest is thinned, the right number of samples remains.
burn_in = 5000
thin = 5
n_chains = metropolis.n_chains
post_burn_in_steps = (n_samples * thin) / n_chains
steps_to_take = int(post_burn_in_steps + burn_in)
print('Taking %i steps' % steps_to_take)
metropolis.sample_chains(n=steps_to_take, verbose=1)

In [None]:
# Trace plot of the chains as returned by the sampler
# (presumably the full chains, before burn-in removal/thinning — confirm against ezmc).
chains = metropolis.get_chains()
ezmc.viz.traceplot(chains)
plt.show()

In [None]:
# Results with the burn-in and thinning settings from the sampling cell applied.
metropolis_results = metropolis.get_results(burn_in=burn_in, thin=thin)
# Select the two parameter columns and transpose, so that (assuming one column
# per parameter — confirm) each row of metropolis_samples holds one parameter.
metropolis_samples = metropolis_results[['θ1', 'θ2']].values.T
ezmc.viz.traceplot(metropolis_results, pars=['θ1', 'θ2']);

In [None]:
# Sanity check: the step-count arithmetic should leave exactly n_samples draws.
print(len(metropolis_results), n_samples)
assert len(metropolis_results) == n_samples 

In [None]:
# Density view of the Metropolis draws, for comparison with the direct samples.
density2d(x=metropolis_samples[0], y=metropolis_samples[1])

## Differential Evolution Sampler

We do the same with the Differential Evolution (DEMC) sampler.

In [None]:
# DEMC sampler on the same log-density; the same bounds are given for both
# parameters.
init_bounds = [[-30, 30], [-30, 30]]
demc = ezmc.DifferentialEvolutionSampler(
    func=posterior_density,
    par_names=['θ1', 'θ2'],
    init_bounds=init_bounds,
)

# Same step-count arithmetic as for the Metropolis sampler.
burn_in = 5000
thin = 5
n_chains = demc.n_chains
post_burn_in_steps = (n_samples * thin) / n_chains
steps_to_take = int(post_burn_in_steps + burn_in)
print('Taking %i steps' % steps_to_take)
demc.sample_chains(n=steps_to_take, verbose=1)

In [None]:
# Trace plot of the DEMC chains as returned by the sampler
# (presumably before burn-in removal/thinning — confirm against ezmc).
# Note: this rebinds the `chains` name used for the Metropolis chains earlier.
chains = demc.get_chains()
ezmc.viz.traceplot(chains)
plt.show()

In [None]:
# Results with the burn-in and thinning settings from the sampling cell applied.
demc_results = demc.get_results(burn_in=burn_in, thin=thin)
# Select the two parameter columns and transpose, so that (assuming one column
# per parameter — confirm) each row of demc_samples holds one parameter.
demc_samples = demc_results[['θ1', 'θ2']].values.T
ezmc.viz.traceplot(demc_results, pars=['θ1', 'θ2']);

In [None]:
# Sanity check: the step-count arithmetic should leave exactly n_samples draws.
print(len(demc_results), n_samples)
assert len(demc_results) == n_samples 

## Reference pymc3 (NUTS) Sampler

As an additional reference, we use `pymc3` to sample from the same distribution.

In [None]:
import pymc3 as pm

# Sample the same bivariate Normal (MU, SIGMA) with pymc3 as an independent check.
with pm.Model():
    theta = pm.MvNormal('theta', mu=np.array(MU), cov=np.array(SIGMA), shape=2)
    # NOTE(review): the section heading mentions NUTS, but an explicit
    # Metropolis step is passed here — confirm which sampler is intended.
    step = pm.Metropolis()
    # NOTE(review): n_samples/4 presumably assumes 4 chains so that the total
    # number of draws matches n_samples — confirm.
    trace = pm.sample(int(n_samples/4), step)
pm.traceplot(trace)
# Transposed so it can be indexed by parameter like the other *_samples arrays.
pm_samples = trace.get_values('theta').T

In [None]:
def density1d(x, xlim=None, **kwargs):
    """Plot a Gaussian kernel density estimate of ``x`` on the current axes.

    The density is drawn as a line with a faint filled area underneath.

    Parameters
    ----------
    x : numpy.ndarray
        1-D sample to estimate the density from.
    xlim : sequence of two numbers, optional
        Range over which the density is evaluated. Defaults to the range of
        ``x``.
    **kwargs
        Forwarded to ``plt.plot`` (e.g. ``label``, ``color``). The same
        arguments are reused for the fill, except that the fill is never
        labelled and always uses ``alpha=.1``.
    """
    if xlim is None:
        xlim = [x.min(), x.max()]
    # Use the public scipy.stats entry point rather than the deprecated
    # ``stats.kde`` submodule.
    kde = stats.gaussian_kde(x)
    xi = np.linspace(*xlim, num=100)
    y = kde(xi)
    plt.plot(xi, y, **kwargs)
    # Build the fill arguments separately so a caller-supplied ``alpha`` styles
    # the line only, instead of colliding with the fixed fill alpha.
    fill_kwargs = dict(kwargs, label=None, alpha=.1)
    plt.fill_between(xi, y, 0, **fill_kwargs)
    
fig, axes = plt.subplots(1, 2, figsize=(12, 3))

# (samples, legend label) pairs, in plotting order.
sample_sets = [
    (direct_samples, 'Direct samples'),
    (metropolis_samples, 'ezmc - Metropolis'),
    (demc_samples, 'ezmc - DEMC'),
    (pm_samples, 'pymc3'),
]

# One panel per parameter, overlaying the marginal density from every method.
for i, panel in enumerate(axes):
    plt.sca(panel)
    for samples, label in sample_sets:
        density1d(samples[i], label=label)
    plt.legend()
    plt.title(metropolis.par_names[i])
plt.tight_layout()
plt.show()

-----

# Kitchen Scraps

Everything below this line is a work in progress.

In [None]:
## KS-test vs. direct samples

In [None]:
# Fraction of KS tests that reject at the 5% level when the sample really is
# drawn from the distribution being tested against.
d = stats.norm(0, 1)
p = np.array([stats.kstest(d.rvs(10000), d.cdf).pvalue for _ in range(1000)])
np.mean(p < .05)

In [None]:
# Same rejection-rate check, using the first coordinate of small batches
# drawn from the reference distribution against a standard Normal.
reference_1d = stats.norm(0, 1)
p = np.array([
    stats.kstest(reference_dist.rvs(10)[:, 0], reference_1d.cdf).pvalue
    for _ in range(100)
])
np.mean(p < .05)

In [None]:
# kstest compares a 1-D sample with a 1-D CDF, so the 2-D samples cannot be
# tested against the joint CDF in one call. Test each marginal of the direct
# samples against its own Normal(mean, sd) instead.
marginal_ks = [
    stats.kstest(direct_samples[i], stats.norm(MU[i], np.sqrt(SIGMA[i][i])).cdf)
    for i in range(2)
]
marginal_ks

In [None]:
# My likelihood method
def _total_loglik(draws):
    """Sum the reference log-density over ``draws`` (one draw per row)."""
    return np.sum(reference_dist.logpdf(draws))


metro_loglik = _total_loglik(metropolis_samples.T)
demc_loglik = _total_loglik(demc_samples.T)
pymc3_loglik = _total_loglik(pm_samples.T)
# Reference spread: the same statistic for 1000 fresh direct samples of equal size.
ref_loglik = np.array([
    _total_loglik(reference_dist.rvs(n_samples)) for _ in range(1000)
])

In [None]:
# Histogram of the reference statistic, with one vertical marker per sampler.
plt.hist(ref_loglik, bins=20, density=True, color='grey', alpha=.5)
markers = [
    (metro_loglik, 'ezmc - Metro', 'red'),
    (demc_loglik, 'ezmc - DEMC', 'green'),
    (pymc3_loglik, 'pymc3', 'blue'),
]
for loglik, label, colour in markers:
    # The y-limits are re-read for every marker, as each draw may move them.
    plt.vlines(loglik, *plt.ylim(), label=label, color=colour)
plt.legend()

In [None]:
# Deliberate stop: halts "Run All" here so the unfinished scratch cells below
# are not executed automatically.
raise Exception

In [None]:
## Cook method #1
v = np.linspace(start=0, stop=1, num=20)
# Fraction of direct draws of the first parameter that fall below mu1.
(direct_samples[0] < mu1).mean()

In [None]:
# Two-sample KS test per parameter: MCMC draws vs. direct draws.
# NOTE(review): `mcmc_samples` is not defined anywhere in the visible notebook —
# presumably a stale name for one of the *_samples arrays (e.g.
# `metropolis_samples`); confirm before running.
for i in range(2):
    print(stats.ks_2samp(mcmc_samples[i], direct_samples[i]))

In [None]:
# NOTE(review): `f` is only defined in a later cell and `mcmc_samples` is not
# defined in the visible notebook, so this cell fails on a fresh kernel.
plt.scatter(f(direct_samples[1]), f(mcmc_samples[1]))

In [None]:
def qf(x):
    """Return a boolean mask marking the entries of ``x`` above its median."""
    return x > np.median(x)

In [None]:
# Histogram of the reference CDF evaluated at the MCMC draws.
# NOTE(review): `mcmc_samples` is not defined in the visible notebook —
# presumably a stale name for one of the *_samples arrays; confirm.
plt.hist(reference_dist.cdf(mcmc_samples.T))

In [None]:
# Quantile levels, as fractions between 0.05 and 0.95, at which the two
# samples are compared.
q = np.linspace(.05, 1-.05, 10)
# np.percentile expects levels on a 0-100 scale; passing the fractions directly
# would compare only the extreme lower tail (0.05%-0.95%), so rescale them.
f = lambda x: np.percentile(x, 100 * q)
plt.figure(figsize=(12, 3))
for i in range(2):
    plt.subplot(1, 2, i+1)
    # NOTE(review): `mcmc_samples` is not defined in the visible notebook —
    # presumably a stale name for one of the *_samples arrays; confirm.
    plt.scatter(f(direct_samples[i]), f(mcmc_samples[i]))