# Usage demo for NormalLikelihoodVariableSelector

In [None]:
import numpy as np
import pandas as pd
from millipede import NormalLikelihoodVariableSelector

## First we create a demo dataset with 3 causal and 97 spurious features

In [None]:
# dataset dimensions
num_datapoints = 50
num_covariates = 100

# create covariates: i.i.d. standard normal draws from a seeded generator,
# so the demo dataset is reproducible
X = np.random.RandomState(0).randn(num_datapoints, num_covariates)

# specify the true causal coefficients: the leading covariates are causal and
# every remaining covariate gets a zero coefficient (i.e. it is spurious).
# The number of zeros is derived from num_covariates so the coefficient vector
# always matches the width of X.
causal_coefficients = [1.0, -0.5, 0.25]
num_spurious = num_covariates - len(causal_coefficients)
true_coefficients = np.array(causal_coefficients + [0.0] * num_spurious)
print("true_coefficients:\n", true_coefficients)

In [None]:
# scale applied to the standard normal noise that is added to the responses
observation_noise = 0.3

# responses follow the true linear model, perturbed by scaled noise drawn
# from a separately seeded generator
noise = np.random.RandomState(1).randn(num_datapoints)
Y = X @ true_coefficients + observation_noise * noise

# stack the responses (as the first column) next to the covariates
# so that everything lives in a single numpy array
YX = np.concatenate([Y.reshape(-1, 1), X], axis=1)
print("X.shape: ", X.shape, "  Y.shape: ", Y.shape, "  YX.shape: ", YX.shape)

## Then we package the data as a Pandas DataFrame, giving each covariate a unique name

In [None]:
# column names: the response first, then the three causal covariates
columns = ['Response', 'Causal1', 'Causal2', 'Causal3']
# every remaining column of YX is a spurious covariate; derive how many there
# are from the array itself so the names always match the data width
num_spurious = YX.shape[1] - len(columns)
columns += ['Spurious{}'.format(k) for k in range(1, num_spurious + 1)]
dataframe = pd.DataFrame(YX, columns=columns)
# show the first few rows as a sanity check
dataframe.head(5)

## Next we create a VariableSelector object appropriate for our continuous-valued responses

In [None]:
# Build the variable selector. Arguments, in order:
#   - the DataFrame holding the data
#   - the name of the column that contains the responses
#   - S: the expected number of covariates included a priori
#   - prior: the prior over the coefficients
selector = NormalLikelihoodVariableSelector(
    dataframe,
    'Response',
    S=1,
    prior="isotropic",
)

## Finally we run the MCMC algorithm to compute posterior inclusion probabilities (PIPs) and other posterior quantities of interest

In [None]:
# run the MCMC algorithm with a fixed seed (reproducible) and verbose output off
# NOTE(review): T / T_burnin presumably set the number of sampling and burn-in
# iterations -- confirm against the millipede documentation
selector.run(
    T=2000,
    T_burnin=1000,
    verbose=False,
    seed=2,
)

## The results are available in the selector.summary DataFrame

- As expected, only the 3 causal covariates are assigned large PIPs.
- In addition, the true coefficients are identified correctly (up to noise).
- Note that the intercept term does not have a corresponding PIP, since it is always included in the model by assumption.

In [None]:
# display the posterior summary DataFrame (PIPs and coefficient estimates)
selector.summary

For example, the largest spurious PIP is given by:

In [None]:
# drop the first three rows (the causal covariates) and the final row
# (presumably the intercept, which has no PIP -- confirm), leaving only
# the PIPs of the spurious covariates
spurious_pips = selector.summary.PIP.values[3:-1]
spurious_pips.max()

Some additional stats about the MCMC run are available in `selector.stats`:

In [None]:
# display additional statistics about the MCMC run
selector.stats