# Usage demo for BinomialLikelihoodVariableSelector

In [1]:
import numpy as np
from scipy.special import expit
import pandas as pd
from millipede import BinomialLikelihoodVariableSelector

## First we create a demo dataset with 3 causal and 97 spurious features

In [2]:
num_datapoints = 100
num_covariates = 100

# create covariates: a (num_datapoints, num_covariates) matrix of standard normal
# draws from a generator seeded with 0, so the demo data is reproducible
X = np.random.RandomState(0).randn(num_datapoints, num_covariates)

# specify the true causal coefficients: the leading covariates are causal and every
# remaining covariate is spurious (zero effect). The vector length is derived from
# num_covariates so the two cannot drift apart if the dataset size is changed.
causal_effects = [1.0, -0.5, 0.25]
true_coefficients = np.zeros(num_covariates)
true_coefficients[:len(causal_effects)] = causal_effects
print("true_coefficients:\n", true_coefficients)

true_coefficients:
 [ 1.   -0.5   0.25  0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.  ]


In [3]:
# every data point is a Binomial observation with the same total count of 20
total_counts = np.full(num_datapoints, 20, dtype=np.int64)

# success probabilities follow the true (generalized) linear model: the linear
# predictor X @ true_coefficients is mapped through the logistic function
binomial_probs = expit(X @ true_coefficients)

# sample the observed counts from a generator seeded with 1
Y = np.random.RandomState(1).binomial(total_counts, binomial_probs)
print("Observed counts Y:\n", Y)

# stack responses, total counts, and covariates side by side in one numpy array
# (column order: Y, total_counts, then the columns of X)
YTCX = np.column_stack([Y, total_counts, X])
print(
    "\nX.shape: ", X.shape,
    "  Y.shape: ", Y.shape,
    "  total_counts.shape: ", total_counts.shape,
    "  YTCX.shape: ", YTCX.shape,
)

Observed counts Y:
 [18 17 18  1 13 16  1  4 15  5 10  8 14  7  1 10 12  5 18  8  9 14  7 10
  9  6 16  2  7  7 18 16  6  3  4  7 11  7  3 10 14  3  7  8 13 16 15  2
 16 18 16 11  8 16 12  2 13  2 10  8 20  4 13 15 18 12 16 10  9  8  7 16
  4  7 16  4  8  6 11  9 13 14 12 13 19 16 16 14 10  2 18 10 18  9 18  2
 10 10 17  1]

X.shape:  (100, 100)   Y.shape:  (100,)   total_counts.shape:  (100,)   YTCX.shape:  (100, 102)


## Then we package the data as a Pandas DataFrame, giving each covariate a unique name

In [4]:
# The first two columns of YTCX hold the responses and the total counts; the
# remaining columns are covariates, of which the leading ones are causal.
causal_names = ['Causal1', 'Causal2', 'Causal3']
columns = ['Response', 'TotalCount'] + causal_names

# Name every remaining covariate column Spurious1, Spurious2, ... The count is
# derived from the array width so the names always match the data.
num_spurious = YTCX.shape[1] - len(columns)
columns += ['Spurious{}'.format(k) for k in range(1, num_spurious + 1)]

dataframe = pd.DataFrame(YTCX, columns=columns)
dataframe.head(5)

Unnamed: 0,Response,TotalCount,Causal1,Causal2,Causal3,Spurious1,Spurious2,Spurious3,Spurious4,Spurious5,...,Spurious88,Spurious89,Spurious90,Spurious91,Spurious92,Spurious93,Spurious94,Spurious95,Spurious96,Spurious97
0,18.0,20.0,1.764052,0.400157,0.978738,2.240893,1.867558,-0.977278,0.950088,-0.151357,...,-0.403177,1.222445,0.208275,0.976639,0.356366,0.706573,0.0105,1.78587,0.126912,0.401989
1,17.0,20.0,1.883151,-1.347759,-1.270485,0.969397,-1.173123,1.943621,-0.413619,-0.747455,...,-1.292857,0.267051,-0.039283,-1.168093,0.523277,-0.171546,0.771791,0.823504,2.163236,1.336528
2,18.0,20.0,-0.369182,-0.239379,1.09966,0.655264,0.640132,-1.616956,-0.024326,-0.738031,...,-0.628088,-0.481027,2.303917,-1.060016,-0.13595,1.136891,0.097725,0.582954,-0.399449,0.370056
3,1.0,20.0,-1.306527,1.658131,-0.118164,-0.680178,0.666383,-0.46072,-1.334258,-1.346718,...,0.56729,-0.222675,-0.353432,-1.616474,-0.291837,-0.761492,0.857924,1.141102,1.466579,0.852552
4,13.0,20.0,-0.598654,-1.115897,0.766663,0.356293,-1.768538,0.355482,0.81452,0.058926,...,-1.029935,-0.349943,1.100284,1.298022,2.696224,-0.073925,-0.658553,-0.514234,-1.018042,-0.077855


## Next we create a VariableSelector object appropriate for our count-valued responses

In [5]:
# Build the selector from the packaged data. Positional arguments, in order:
# the DataFrame, the name of the response column, and the name of the
# total-count column. S is the expected number of covariates to include a priori.
selector = BinomialLikelihoodVariableSelector(
    dataframe,
    'Response',
    'TotalCount',
    S=1,
)

## Finally we run the MCMC algorithm to compute posterior inclusion probabilities (PIPs) and other posterior quantities of interest

In [6]:
# Run the sampler quietly with a fixed seed so the results are reproducible.
# The run stats shown later in this notebook report 1000 retained samples and
# 500 burn-in samples for these settings.
selector.run(
    T=1000,
    T_burnin=500,
    verbose=False,
    seed=2,
)

## The results are available in the selector.summary DataFrame

- As expected, only the 3 causal covariates have large PIPs.
- In addition, the true coefficients are identified correctly (up to noise).
- Note that the intercept term does not have a corresponding PIP, since it is always included in the model by assumption.

In [7]:
# Display the summary table: one row per covariate with its PIP, coefficient
# estimate and standard deviation, and the conditional coefficient estimate and
# standard deviation.
selector.summary

Unnamed: 0,PIP,Coefficient,Coefficient StdDev,Conditional Coefficient,Conditional Coefficient StdDev
Causal1,1.000000,1.016604e+00,0.054659,1.016604,5.465915e-02
Causal2,1.000000,-5.744908e-01,0.059060,-0.574491,5.906017e-02
Causal3,0.988069,2.728325e-01,0.063074,0.277914,5.138318e-02
Spurious1,0.000079,0.000000e+00,0.000000,0.000000,0.000000e+00
Spurious2,0.000051,5.896582e-07,0.000133,0.030001,1.357624e-09
...,...,...,...,...,...
Spurious94,0.000058,-1.645650e-06,0.000343,-0.022653,3.329725e-02
Spurious95,0.000120,0.000000e+00,0.000000,0.000000,0.000000e+00
Spurious96,0.000077,5.546312e-06,0.000544,0.041449,2.227512e-02
Spurious97,0.000058,5.773681e-07,0.000327,0.012308,4.613108e-02


For example, the largest spurious PIP is given by:

In [8]:
# Skip the first three rows (the causal covariates) and the final row.
# NOTE(review): the final row is presumably the intercept, which has no PIP;
# confirm, since otherwise this slice drops the last spurious covariate.
spurious_pips = selector.summary.PIP.values[3:-1]
spurious_pips.max()

0.005362170242442198

Some additional stats about the MCMC run are available in `selector.stats`:

In [9]:
# Display diagnostics of the MCMC run, e.g. weight quantiles and moments,
# timings, sample counts, and Polya-Gamma MH acceptance statistics.
selector.stats

{'Weight quantiles': '5/10/20/50/90/95:  1.92e-16  1.63e-02  2.17e-01  1.45e+01  1.45e+01  1.45e+01',
 'Weight moments': 'mean/std/min/max:  9.26e+00  6.77e+00  1.92e-16  1.49e+01',
 'Elapsed MCMC time': '2.1 seconds',
 'Mean iteration time': '1.384 ms',
 'Number of retained samples': 1000,
 'Number of burn-in samples': 500,
 'Adapted xi value': '2.960',
 'Polya-Gamma MH stats': 'Mean acc. prob.: 0.881  Accepted/Attempted: 257/282'}