# Usage demo for NormalLikelihoodVariableSelector

In [1]:
import numpy as np
import pandas as pd
from millipede import NormalLikelihoodVariableSelector

## First we create a demo dataset with 3 causal and 97 spurious features

In [2]:
num_datapoints = 50
num_covariates = 100

# create covariates: draw num_datapoints * num_covariates standard normal values
# from a fixed-seed generator and arrange them as a (num_datapoints, num_covariates) matrix
X = np.random.RandomState(0).randn(num_datapoints * num_covariates)
X = X.reshape((num_datapoints, num_covariates))

# specify the true causal coefficients: the first 3 covariates are causal and all
# remaining covariates get a zero coefficient. The zero padding is derived from
# num_covariates so the vector length always matches the number of columns of X.
causal_coefficients = [1.0, -0.5, 0.25]
num_spurious = num_covariates - len(causal_coefficients)
true_coefficients = np.array(causal_coefficients + [0.0] * num_spurious)
print("true_coefficients:\n", true_coefficients)

true_coefficients:
 [ 1.   -0.5   0.25  0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.  ]


In [3]:
observation_noise = 0.3

# draw one noise value per datapoint from a separately seeded generator
noise = observation_noise * np.random.RandomState(1).randn(num_datapoints)

# responses follow the true linear model plus the scaled observation noise
Y = X @ true_coefficients + noise

# stack the responses as the first column in front of the covariates
YX = np.column_stack((Y, X))
print("X.shape: ", X.shape, "  Y.shape: ", Y.shape, "  YX.shape: ", YX.shape)

X.shape:  (50, 100)   Y.shape:  (50,)   YX.shape:  (50, 101)


## Then we package the data as a Pandas DataFrame, giving each covariate a unique name

In [4]:
# column 0 of YX holds the responses, followed by the 3 causal covariates
# and then the 97 spurious covariates
causal_names = ['Causal1', 'Causal2', 'Causal3']
spurious_names = [f'Spurious{k}' for k in range(1, 98)]
columns = ['Response'] + causal_names + spurious_names
dataframe = pd.DataFrame(YX, columns=columns)
dataframe.head(5)

Unnamed: 0,Response,Causal1,Causal2,Causal3,Spurious1,Spurious2,Spurious3,Spurious4,Spurious5,Spurious6,...,Spurious88,Spurious89,Spurious90,Spurious91,Spurious92,Spurious93,Spurious94,Spurious95,Spurious96,Spurious97
0,2.295962,1.764052,0.400157,0.978738,2.240893,1.867558,-0.977278,0.950088,-0.151357,-0.103219,...,-0.403177,1.222445,0.208275,0.976639,0.356366,0.706573,0.0105,1.78587,0.126912,0.401989
1,2.055882,1.883151,-1.347759,-1.270485,0.969397,-1.173123,1.943621,-0.413619,-0.747455,1.922942,...,-1.292857,0.267051,-0.039283,-1.168093,0.523277,-0.171546,0.771791,0.823504,2.163236,1.336528
2,-0.133029,-0.369182,-0.239379,1.09966,0.655264,0.640132,-1.616956,-0.024326,-0.738031,0.279925,...,-0.628088,-0.481027,2.303917,-1.060016,-0.13595,1.136891,0.097725,0.582954,-0.399449,0.370056
3,-2.487024,-1.306527,1.658131,-0.118164,-0.680178,0.666383,-0.46072,-1.334258,-1.346718,0.693773,...,0.56729,-0.222675,-0.353432,-1.616474,-0.291837,-0.761492,0.857924,1.141102,1.466579,0.852552
4,0.410583,-0.598654,-1.115897,0.766663,0.356293,-1.768538,0.355482,0.81452,0.058926,-0.185054,...,-1.029935,-0.349943,1.100284,1.298022,2.696224,-0.073925,-0.658553,-0.514234,-1.018042,-0.077855


## Next we create a VariableSelector object appropriate for our continuous-valued responses

In [5]:
selector = NormalLikelihoodVariableSelector(
    dataframe,          # the data: responses and covariates in one DataFrame
    'Response',         # name of the column holding the responses
    S=1.0,              # expected number of covariates included a priori
    prior="isotropic",  # prior over the coefficients
)

## Finally we run the MCMC algorithm to compute posterior inclusion probabilities (PIPs) and other posterior quantities of interest

In [6]:
# 1000 burn-in iterations followed by 2000 retained samples (3000 iterations in total);
# verbosity='bar' displays a progress bar, and seed=2 fixes the random seed
# (presumably so the run is reproducible)
selector.run(T=2000, T_burnin=1000, verbosity='bar', seed=2)

  0%|          | 0/3000 [00:00<?, ?it/s]

## The results are available in the selector.summary DataFrame

- As expected, only the 3 causal covariates have large PIPs.
- In addition the true coefficients are identified correctly (up to noise).
- Note that the intercept term does not have a corresponding PIP, since it is always included in the model by assumption.

In [7]:
# one row per covariate: PIP plus unconditional and conditional coefficient estimates
selector.summary

Unnamed: 0,PIP,Coefficient,Coefficient StdDev,Conditional Coefficient,Conditional Coefficient StdDev
Causal1,1.000000,1.044535e+00,0.121924,1.044535,0.121924
Causal2,0.999996,-4.460963e-01,0.166167,-0.446098,0.166164
Causal3,0.819722,1.777110e-01,0.162826,0.216471,0.154611
Spurious1,0.000296,2.396770e-05,0.002901,0.074107,0.143295
Spurious2,0.000775,8.255881e-05,0.005091,0.218468,0.144459
...,...,...,...,...,...
Spurious94,0.000181,-6.315729e-07,0.001819,-0.003903,0.142950
Spurious95,0.000149,2.219246e-06,0.000779,0.049486,0.105354
Spurious96,0.000192,2.733288e-06,0.002367,0.009579,0.139787
Spurious97,0.000198,-1.404630e-05,0.003142,-0.060836,0.197614


For example, the largest spurious PIP is given by:

In [8]:
# rows 0-2 of the summary are the causal covariates, so the slice starts at 3;
# the trailing -1 drops the last row (presumably the intercept row, which has
# no PIP — confirm against the full summary)
selector.summary.PIP.values[3:-1].max()

0.002819352120226041

Some additional stats about the MCMC run are available in `selector.stats`:

In [9]:
# dictionary of run diagnostics: weight statistics, timing, and sample counts
selector.stats

{'Weight quantiles': '5/10/20/50/90/95:  1.92e-16  7.15e-06  6.12e-01  1.71e+01  2.51e+01  2.51e+01',
 'Weight moments': 'mean/std/min/max:  1.31e+01  1.14e+01  1.92e-16  2.51e+01',
 'Elapsed MCMC time': '3.5 seconds',
 'Mean iteration time': '1.152 ms',
 'Number of retained samples': 2000,
 'Number of burn-in samples': 1000}

## Using per-covariate prior inclusion probabilities

If we have additional prior information about which covariates are more or less likely a priori, 
we can provide this information by setting the `S` argument to a P-dimensional numpy array of covariate-specific
prior inclusion probabilities.

In [10]:
# start from a uniform prior inclusion probability of 1 / num_covariates for
# every covariate, then make the 3rd covariate (index 2) much *less likely* a priori
S = np.full(num_covariates, 1.0 / num_covariates)
S[2] = 1.0e-6
selector = NormalLikelihoodVariableSelector(
    dataframe,
    'Response',
    S=S,
    prior="isotropic",
)

In [11]:
# same MCMC settings as the first run: T_burnin=1000 burn-in iterations, T=2000, seed=2
selector.run(T=2000, T_burnin=1000, verbosity='bar', seed=2)

  0%|          | 0/3000 [00:00<?, ?it/s]

### As expected, the PIP of the 3rd covariate is now very small

In [12]:
# summary for the run with per-covariate prior inclusion probabilities; see the Causal3 row
selector.summary

Unnamed: 0,PIP,Coefficient,Coefficient StdDev,Conditional Coefficient,Conditional Coefficient StdDev
Causal1,1.000000,1.052418,0.124988,1.052418,0.124988
Causal2,0.999977,-0.431317,0.171944,-0.431326,0.171935
Causal3,0.000450,0.000081,0.005243,0.197150,0.168357
Spurious1,0.000202,0.000015,0.001955,0.140766,0.123704
Spurious2,0.000840,0.000005,0.004574,0.005766,0.150168
...,...,...,...,...,...
Spurious94,0.000216,0.000002,0.002704,0.009496,0.183155
Spurious95,0.000147,0.000002,0.001764,0.014872,0.160307
Spurious96,0.000189,-0.000020,0.001912,-0.101424,0.090521
Spurious97,0.000156,-0.000014,0.001897,-0.111273,0.125865
