In [110]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt

from mcmc import mcmc
import densities
import my_problemLR

%matplotlib notebook

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Parameter for the problem

In [111]:
# parameters for the problem
Npoint = 100

# data abscissa {Xi}
my_problemLR.Xi = np.random.rand(Npoint)
my_problemLR.sigmad = 0.1 # sec

## Linear regression coefficients to be found

In [112]:
slope = 2.
intercept = 1.

## Create "observed data"

In [113]:
#  Data value = {Yi}
my_problemLR.Yi = slope * my_problemLR.Xi + intercept + np.random.randn(Npoint) * my_problemLR.sigmad

## Trial solution

In [114]:
X0 = np.array([10., 10.])

## Run MCMC with Gaussian ('Gaus') and exponential ('Expo') residual distributions

In [115]:
step = np.array([0.01, 0.01])# *2 or *5 or *10

niter = 100000


mout_gauss, mMAP_gauss, accrate_gauss = mcmc(my_problemLR.logprior, my_problemLR.loglikelyhood,
                                             densities.generate, densities.logproposal, X0, niter, step, 'Gaus')

mout_expo, mMAP_expo, accrate_expo = mcmc(my_problemLR.logprior, my_problemLR.loglikelyhood,
                                          densities.generate, densities.logproposal, X0, niter, step, 'Expo')

## Plot solutions

In [116]:
x_gauss = mout_gauss[:, 0]
y_gauss = mout_gauss[:, 1]

x_expo = mout_expo[:, 0]
y_expo = mout_expo[:, 1]

plt.figure(figsize=(9, 4))
plt.subplot(121)
plt.title('Gaussian')
print('acceptanceRate',accrate)
plt.scatter(x_gauss, y_gauss,cmap= plt.get_cmap('seismic'), c = np.arange(len(x_gauss))/len(x_gauss))
print('Solution with highest probability is ',mMAP)
plt.xlabel('Slope')
plt.ylabel('Intercept')
plt.subplot(122)
plt.title('Exponential')
print('acceptanceRate',accrate)
plt.scatter(x_expo, y_expo,cmap= plt.get_cmap('seismic'), c = np.arange(len(x_expo))/len(x_expo))
print('Solution with highest probability is ',mMAP)
plt.xlabel('Slope')
plt.ylabel('Intercept')
cb = plt.colorbar()
cb.set_label('Percentage of values (%)')
plt.subplots_adjust(wspace=0.3)

<IPython.core.display.Javascript object>

acceptanceRate 0.64371
Solution with highest probability is  [2.0179488 1.0004095]
acceptanceRate 0.64371
Solution with highest probability is  [2.0179488 1.0004095]


The chain starts at point (10, 10) and ends around point (2, 2). The color illustrates the density of points. Important is to note the low percentage values until the chain approaches (2, 2). This part is known as the burn-in chain where the chain has not converged to the dense part of the probability distribution yet and is oversampling low probability area. Then, we can see a high density area around the point (2, 2) known as the posterior chain where the chain settled into the equilibrium solution, each step is a sample of the target distribution. We will have to determine the burn-in chain and delete it from the overall chain.

The two chains with Gaussian and exponential residual distributions have very similar paths.

### Defining the burning period

In [117]:
iter=np.arange(0.,niter)

plt.figure(figsize=(9, 4))
plt.subplot(121)
plt.title('Gaussian')
plt.plot(iter/niter, x_gauss,iter/niter,y_gauss)
plt.xlabel('step number along markov chain %')
plt.subplot(122)
plt.title('Exponential')
plt.plot(iter/niter, x_expo,iter/niter,y_expo)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7f94d0ea5b20>,
 <matplotlib.lines.Line2D at 0x7f94d0ea5bb0>]

We detect the posterior chain where the above curve stabilises. We chose a threshold of 0.04 to remove the burn-in chain (see below). The same threshold is used for Gaussian and exponential residual distributions.

In [118]:
burning_period = int(0.04*niter)

a_gauss = mout_gauss[burning_period:-1,0]
b_gauss = mout_gauss[burning_period:-1,1]

a_expo = mout_expo[burning_period:-1,0]
b_expo = mout_expo[burning_period:-1,1]

### Defining approximately the correlation length

In [119]:
import scipy.signal as sig

plt.figure(figsize=(9, 4))
plt.subplot(221)
plt.title('Gaussian')
plt.plot(sig.correlate(a_gauss-np.mean(a_gauss),a_gauss-np.mean(a_gauss)))
plt.subplot(223)
plt.plot(sig.correlate(a_gauss-np.mean(a_gauss),a_gauss-np.mean(a_gauss)))
plt.xlim([96000-400, 96000+400])
plt.subplot(222)
plt.title('Exponential')
plt.plot(sig.correlate(a_expo-np.mean(a_expo),a_expo-np.mean(a_expo)))
plt.subplot(224)
plt.plot(sig.correlate(a_expo-np.mean(a_expo),a_expo-np.mean(a_expo)))
plt.xlim([96000-400, 96000+400])

<IPython.core.display.Javascript object>

(95600.0, 96400.0)

The central peak is approximatively $\pm$200 values wide (correlation length). We wil therefore only take every 200th value. Again, the same value is used for Gaussian and exponential residual distributions.

### Create a "true" Markov-chain after removal of the burning period and after getting rid of correlation

In [121]:
a_gauss_true=a_gauss[0:-1:200]
b_gauss_true=b_gauss[0:-1:200]

a_expo_true=a_expo[0:-1:200]
b_expo_true=b_expo[0:-1:200]

## Represent histogram for slope and intercept

In [128]:
plt.figure(figsize=(9, 7))
plt.subplot(221)
plt.title('Gaussian slope')
ha = plt.hist(a_gauss_true, int(np.sqrt(np.shape(a_gauss_true)[0])))
plt.subplot(223)
plt.title('Gaussian intercept')
hb = plt.hist(b_gauss_true, int(np.sqrt(np.shape(b_gauss_true)[0])))
plt.subplot(222)
plt.title('Exponential slope')
ha = plt.hist(a_expo_true, int(np.sqrt(np.shape(a_expo_true)[0])))
plt.subplot(224)
plt.title('Exponential intercept')
hb = plt.hist(b_expo_true, int(np.sqrt(np.shape(b_expo_true)[0])))
plt.subplots_adjust(wspace=0.5, hspace=0.3)

<IPython.core.display.Javascript object>

# Distribution of posterior density function

Here we test different distributions for the posterior density function using the Kolmogorov-Smirnov test.

In [129]:
import scipy.stats as st

def get_best_distribution(data):
    
    dist_names = ['weibull_min','norm','weibull_max','beta',
              'invgauss','uniform','gamma','expon',   
              'lognorm','pearson3','triang']
    
    dist_results = []
    
    params = {}
    
    for dist_name in dist_names:
        dist = getattr(st, dist_name)
        param = dist.fit(data)

        params[dist_name] = param
        
        # Applying the Kolmogorov-Smirnov test
        D, p = st.kstest(data, dist_name, args=param)
        # print("p value for "+dist_name+" = "+str(p))
        # print('D=%.3f' % D)
        dist_results.append((dist_name, p))

    # select the best fitted distribution
    best_dist, best_p = (max(dist_results, key=lambda item: item[1]))
    # store the name of the best fit and its p value

    print("Best fitting distribution: "+str(best_dist))
    print("Best p value: "+ str(best_p))
    print("Parameters for the best fit: "+ str(params[best_dist]))

    return best_dist, best_p, params[best_dist]

In [133]:
print('GAUSSIAN SLOPE:')
gs = get_best_distribution(a_gauss_true)
print('GAUSSIAN INTERCEPT:')
gi = get_best_distribution(b_gauss_true)

GAUSSIAN SLOPE:
Best fitting distribution: norm
Best p value: 0.9038459540133402
Parameters for the best fit: (1.9558441998948448, 0.03332677329747833)
GAUSSIAN INTERCEPT:
Best fitting distribution: norm
Best p value: 0.936677225456873
Parameters for the best fit: (1.0245232799465624, 0.018607689255177643)


In [134]:
print('EXPONENTIAL SLOPE:')
es = get_best_distribution(a_expo_true)
print('EXPONENTIAL INTERCEPT:')
ei = get_best_distribution(b_expo_true)

EXPONENTIAL SLOPE:
Best fitting distribution: norm
Best p value: 0.8491782318319895
Parameters for the best fit: (1.960519727868721, 0.03315932576142809)
EXPONENTIAL INTERCEPT:
Best fitting distribution: lognorm
Best p value: 0.9713309881859188
Parameters for the best fit: (0.012912086944308732, -0.49413145586915996, 1.5247647866117524)
