# Usage demo for BernoulliLikelihoodVariableSelector

In [1]:
import numpy as np
from scipy.special import expit
import pandas as pd
from millipede import BernoulliLikelihoodVariableSelector

## First we create a demo dataset with 3 causal and 97 spurious features

In [2]:
# A single binary-valued observation carries relatively little information, so
# a fair number of observations is needed to pin down small effects.
num_datapoints = 2500
num_covariates = 100

# Draw the covariates as standard normals from a fixed seed. Requesting the
# 2-d shape directly fills the array in row-major order, i.e. in the same order
# as drawing a flat vector of the same total size and reshaping it.
X = np.random.RandomState(0).randn(num_datapoints, num_covariates)

# Specify the true coefficients: the first three covariates are causal and all
# remaining ones are spurious (zero effect). The zero padding is derived from
# num_covariates so the vector keeps one entry per column of X.
causal_coefficients = [1.0, -0.5, 0.25]
num_spurious = num_covariates - len(causal_coefficients)
true_coefficients = np.array(causal_coefficients + [0.0] * num_spurious)
print("true_coefficients:\n", true_coefficients)

true_coefficients:
 [ 1.   -0.5   0.25  0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.  ]


In [3]:
# compute responses using the true linear model with logistic link function
bernoulli_probs = expit(X @ true_coefficients)

# each response is a single Bernoulli trial, so the number of trials is passed
# as the integer 1 rather than a float that would have to be converted
Y = np.random.RandomState(1).binomial(1, bernoulli_probs)
print("Observed counts Y[:100]:\n", Y[:100])

# put the responses (as the first column) and the covariates into a single
# numpy array
YX = np.concatenate([Y[:, None], X], axis=-1)
print("\nX.shape: ", X.shape, "  Y.shape: ", Y.shape, "  YX.shape: ", YX.shape)

Observed counts Y[:100]:
 [1 1 1 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 1 1 1 1 0 0 0 0
 1 0 1 1 0 0 0 1 1 0 0 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 1 1 1 1 0 1 0 1 1 0 1
 1 0 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 1 1 1 1 0 0 1 1 0]

X.shape:  (2500, 100)   Y.shape:  (2500,)   YX.shape:  (2500, 101)


## Then we package the data as a Pandas DataFrame, giving each covariate a unique name

In [4]:
# The first column of YX holds the responses and the next three hold the causal
# covariates; every remaining column is named as a spurious covariate. The
# number of spurious names is derived from the width of YX instead of being
# hard-coded, so the names always match the number of columns.
columns = ['Response', 'Causal1', 'Causal2', 'Causal3']
num_spurious_columns = YX.shape[1] - len(columns)
columns += ['Spurious{}'.format(k) for k in range(1, num_spurious_columns + 1)]
dataframe = pd.DataFrame(YX, columns=columns)
dataframe.head(5)

Unnamed: 0,Response,Causal1,Causal2,Causal3,Spurious1,Spurious2,Spurious3,Spurious4,Spurious5,Spurious6,...,Spurious88,Spurious89,Spurious90,Spurious91,Spurious92,Spurious93,Spurious94,Spurious95,Spurious96,Spurious97
0,1.0,1.764052,0.400157,0.978738,2.240893,1.867558,-0.977278,0.950088,-0.151357,-0.103219,...,-0.403177,1.222445,0.208275,0.976639,0.356366,0.706573,0.0105,1.78587,0.126912,0.401989
1,1.0,1.883151,-1.347759,-1.270485,0.969397,-1.173123,1.943621,-0.413619,-0.747455,1.922942,...,-1.292857,0.267051,-0.039283,-1.168093,0.523277,-0.171546,0.771791,0.823504,2.163236,1.336528
2,1.0,-0.369182,-0.239379,1.09966,0.655264,0.640132,-1.616956,-0.024326,-0.738031,0.279925,...,-0.628088,-0.481027,2.303917,-1.060016,-0.13595,1.136891,0.097725,0.582954,-0.399449,0.370056
3,0.0,-1.306527,1.658131,-0.118164,-0.680178,0.666383,-0.46072,-1.334258,-1.346718,0.693773,...,0.56729,-0.222675,-0.353432,-1.616474,-0.291837,-0.761492,0.857924,1.141102,1.466579,0.852552
4,1.0,-0.598654,-1.115897,0.766663,0.356293,-1.768538,0.355482,0.81452,0.058926,-0.185054,...,-1.029935,-0.349943,1.100284,1.298022,2.696224,-0.073925,-0.658553,-0.514234,-1.018042,-0.077855


## Next we create a VariableSelector object appropriate for our binary-valued responses

In [5]:
# Build the selector from the full dataframe. 'Response' indicates the column
# of responses, and S specifies the expected number of covariates to include
# a priori.
selector = BernoulliLikelihoodVariableSelector(
    dataframe,
    'Response',
    S=1,
)

## Finally we run the MCMC algorithm to compute posterior inclusion probabilities (PIPs) and other posterior quantities of interest

In [6]:
# T and T_burnin correspond to the numbers of retained and burn-in samples
# reported in selector.stats; the seed is fixed, presumably for reproducibility.
selector.run(
    T=2000,
    T_burnin=1000,
    verbose=False,
    seed=2,
)

## The results are available in the selector.summary DataFrame

- As expected, only the 3 causal covariates have large PIPs.
- In addition, the true coefficients are identified correctly (up to noise).
- Note that the intercept term does not have a corresponding PIP, since it is always included in the model by assumption.

In [7]:
# display the summary DataFrame of PIPs and coefficient estimates
selector.summary

Unnamed: 0,PIP,Coefficient,Coefficient StdDev,Conditional Coefficient,Conditional Coefficient StdDev
Causal1,1.000000,9.571969e-01,0.052702,0.957197,0.052702
Causal2,1.000000,-4.559088e-01,0.044594,-0.455909,0.044594
Causal3,0.627126,1.281106e-01,0.104126,0.203756,0.042791
Spurious1,0.000054,2.507360e-06,0.000415,0.040871,0.033750
Spurious2,0.000091,3.876521e-06,0.000542,0.049418,0.036170
...,...,...,...,...,...
Spurious94,0.000048,-1.435914e-06,0.000531,-0.024503,0.064883
Spurious95,0.000053,-2.765018e-06,0.000398,-0.043345,0.024582
Spurious96,0.000048,4.934875e-08,0.000101,0.001349,0.016610
Spurious97,0.000081,1.317045e-06,0.000247,0.031267,0.021698


For example, the largest spurious PIP is given by:

In [8]:
# Select the spurious covariates by their row labels rather than by position,
# so the result does not depend on the row order of the summary or on how many
# non-covariate rows follow the covariates.
selector.summary.PIP.filter(like='Spurious').max()

0.012637395683250953

Some additional stats about the MCMC run are available in `selector.stats`:

In [9]:
# display the statistics recorded for the MCMC run
selector.stats

{'Weight quantiles': '5/10/20/50/90/95:  1.92e-16  1.63e-01  1.91e-01  1.51e+01  1.59e+01  1.63e+01',
 'Weight moments': 'mean/std/min/max:  9.82e+00  7.05e+00  1.92e-16  1.64e+01',
 'Elapsed MCMC time': '9.3 seconds',
 'Mean iteration time': '3.085 ms',
 'Number of retained samples': 2000,
 'Number of burn-in samples': 1000,
 'Adapted xi value': '2.634',
 'Polya-Gamma MH stats': 'Mean acc. prob.: 0.885  Accepted/Attempted: 475/513'}