# Usage demo for NormalLikelihoodVariableSelector

In [2]:
import numpy as np
import pandas as pd
from millipede import NormalLikelihoodVariableSelector

## First we create a demo dataset with 3 causal and 97 spurious features

In [7]:
# Problem size: 50 datapoints and 100 covariates (more covariates than datapoints).
num_datapoints = 50
num_covariates = 100

# Draw the design matrix from a fixed-seed generator so the demo is reproducible,
# then fold the flat sample vector into a (num_datapoints, num_covariates) matrix.
X = np.random.RandomState(0).randn(num_datapoints * num_covariates)
X = X.reshape((num_datapoints, num_covariates))

# Only the leading covariates are causal; every remaining covariate gets a zero
# coefficient. The number of zeros is derived from num_covariates so that the
# coefficient vector always has one entry per column of X.
causal_coefficients = [1.0, -0.5, 0.25]
num_spurious = num_covariates - len(causal_coefficients)
true_coefficient = np.array(causal_coefficients + [0.0] * num_spurious)
print("true_coefficient:\n", true_coefficient)

true_coefficient:
 [ 1.   -0.5   0.25  0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.  ]


In [17]:
# Scale factor applied to the standard-normal noise added to the responses.
observation_noise = 0.3
# The noise uses its own fixed-seed generator (seed 1), separate from the one used for X.
noise = observation_noise * np.random.RandomState(1).randn(num_datapoints)
Y = X @ true_coefficient + noise
# Append Y as a single trailing column so covariates and responses share one array.
XY = np.concatenate([X, Y.reshape(-1, 1)], axis=1)
print("X.shape: ", X.shape, "  Y.shape: ", Y.shape, "  XY.shape: ", XY.shape)

X.shape:  (50, 100)   Y.shape:  (50,)   XY.shape:  (50, 101)


## Then we package it as a Pandas DataFrame

In [23]:
# Column labels follow the column order of XY: causal covariates first,
# then the spurious ones, and the response in the last column.
causal_names = ['Causal1', 'Causal2', 'Causal3']
spurious_names = list(map('Spurious{}'.format, range(1, 98)))
columns = [*causal_names, *spurious_names, 'Response']
dataframe = pd.DataFrame(XY, columns=columns)
# Preview the first five rows as the cell output.
dataframe.head(5)

Unnamed: 0,Causal1,Causal2,Causal3,Spurious1,Spurious2,Spurious3,Spurious4,Spurious5,Spurious6,Spurious7,...,Spurious89,Spurious90,Spurious91,Spurious92,Spurious93,Spurious94,Spurious95,Spurious96,Spurious97,Response
0,1.764052,0.400157,0.978738,2.240893,1.867558,-0.977278,0.950088,-0.151357,-0.103219,0.410599,...,1.222445,0.208275,0.976639,0.356366,0.706573,0.0105,1.78587,0.126912,0.401989,2.295962
1,1.883151,-1.347759,-1.270485,0.969397,-1.173123,1.943621,-0.413619,-0.747455,1.922942,1.480515,...,0.267051,-0.039283,-1.168093,0.523277,-0.171546,0.771791,0.823504,2.163236,1.336528,2.055882
2,-0.369182,-0.239379,1.09966,0.655264,0.640132,-1.616956,-0.024326,-0.738031,0.279925,-0.09815,...,-0.481027,2.303917,-1.060016,-0.13595,1.136891,0.097725,0.582954,-0.399449,0.370056,-0.133029
3,-1.306527,1.658131,-0.118164,-0.680178,0.666383,-0.46072,-1.334258,-1.346718,0.693773,-0.159573,...,-0.222675,-0.353432,-1.616474,-0.291837,-0.761492,0.857924,1.141102,1.466579,0.852552,-2.487024
4,-0.598654,-1.115897,0.766663,0.356293,-1.768538,0.355482,0.81452,0.058926,-0.185054,-0.807648,...,-0.349943,1.100284,1.298022,2.696224,-0.073925,-0.658553,-0.514234,-1.018042,-0.077855,0.410583


## Next we create a VariableSelector object appropriate for our continuous-valued responses

In [24]:
# Build the selector from the packaged data:
#   - first argument: the DataFrame holding covariates and responses
#   - second argument: the name of the column that holds the responses
selector = NormalLikelihoodVariableSelector(
    dataframe,
    'Response',
    S=1,                # expected number of causal covariates a priori
    prior="isotropic",  # prior over the coefficients
)

Initialized NormalLikelihoodSampler with isotropic prior and (N, P, S, tau) = (50, 100, 1.0, 0.010)


## Finally we run the MCMC algorithm to compute posterior inclusion probabilities (PIPs) and other quantities of interest

In [25]:
# Run the sampler: 500 burn-in samples followed by 1000 retained samples.
selector.run(
    T=1000,         # number of retained samples
    T_burnin=500,   # number of burn-in samples
    verbose=False,
    seed=2,         # presumably fixes the sampler's random stream -- confirm in the millipede docs
)

[Iteration 1499]	# of active features: 4   mean iteration time: 0.86 ms


## The results are available in the selector.summary DataFrame

- As expected, only the 3 causal covariates were assigned large PIPs.
- In addition, the true coefficients were identified correctly (up to noise).
- Note that the intercept term does not have a corresponding PIP, since it is always included in the model by assumption.

In [27]:
# Display the per-covariate results table (PIPs and coefficient estimates) as the cell output.
selector.summary

Unnamed: 0,PIP,Coefficient,Coefficient StdDev,Conditional Coefficient
Causal1,1.000000,1.034942,0.009100,1.034942
Causal2,0.999996,-0.451010,0.002860,-0.451011
Causal3,0.820852,0.181283,0.083550,0.219711
Spurious1,0.000298,0.000013,0.000804,0.048916
Spurious2,0.000780,0.000126,0.003371,0.089460
...,...,...,...,...
Spurious94,0.000181,0.000005,0.000459,0.040419
Spurious95,0.000149,-0.000001,0.000097,-0.004919
Spurious96,0.000192,0.000001,0.000354,0.006722
Spurious97,0.000198,0.000001,0.000273,0.007652


Some additional stats about the MCMC run are available in selector.stats

In [29]:
# Display run diagnostics as the cell output: weight statistics, timings, and sample counts.
selector.stats

{'Weight quantiles': '5/10/20/50/90/95:  9.52e-17  3.54e-06  3.03e-01  8.48e+00  1.24e+01  1.24e+01',
 'Weight moments': 'mean/std/min/max:  6.34e+00  5.63e+00  9.52e-17  1.24e+01',
 'Elapsed MCMC time': '1.3 seconds',
 'Mean iteration time': '0.870 ms',
 'Number of retained samples': 1000,
 'Number of burn-in samples': 500}