## Parameter recovery CRDM

We simulated CRDM data from a design set and a range of ground-truth parameters. Now we will try to recover those parameters from the simulated data.

In [1]:
# Built-in/Generic Imports
import os,sys
import glob
import time

# Libs
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az

import logging
# Raise the "pymc" logger threshold to ERROR so lower-severity messages
# emitted through that logger are not shown in the notebook output.
logger = logging.getLogger("pymc")
logger.setLevel(logging.ERROR)

In [2]:
def estimate_bhm(subj_id=[],design_df=[],choices=[],type='single'):
    """Fit the hierarchical Bayesian CRDM choice model and return posterior means.

    Each trial offers a sure option and a lottery. The model computes

        SV_lott = lott_amt**alpha * (lott_p - beta * ambig / 2)
        SV_sure = sure_amt**alpha * sure_p
        P(y = 1) = 1 / (1 + exp(-gamma * (SV_lott - SV_sure)))

    with one (alpha, beta, gamma) triple per subject. ``alpha`` is LogNormal,
    ``beta`` is Normal and ``gamma`` is HalfNormal; their location/scale
    hyperparameters get the narrow Normal hyperpriors hard-coded below.

    Parameters
    ----------
    subj_id : sequence of int
        Subject index for every trial. Values are used directly as positions
        into the per-subject parameter vectors, so they are expected to be
        0..n_subjects-1.
    design_df : pandas.DataFrame
        One row per trial, aligned with ``subj_id`` and ``choices``. Must
        provide the columns ``crdm_sure_p``, ``crdm_lott_p``,
        ``crdm_sure_amt``, ``crdm_lott_amt`` and ``crdm_amb_lev``.
    choices : sequence
        Observed binary choices, one per trial, passed as the observed data of
        a Bernoulli likelihood whose success probability grows with
        SV_lott - SV_sure.
    type : {'single', 'aggregate'}
        'single' returns the estimates of subject index 0 only;
        'aggregate' returns one estimate per subject.

    Returns
    -------
    alpha_hat, beta_hat, gamma_hat
        Posterior means. Scalars for ``type='single'``; for
        ``type='aggregate'`` lists ordered by ascending subject index.

    Raises
    ------
    ValueError
        If ``type`` is neither 'single' nor 'aggregate'.
    """
    # Reject a bad mode before any work is done; otherwise the failure would
    # only show up as an unbound variable after the whole sampling run.
    if type not in ('single', 'aggregate'):
        raise ValueError("type must be 'single' or 'aggregate', got {!r}".format(type))

    sure_p = design_df['crdm_sure_p'].values
    lott_p = design_df['crdm_lott_p'].values
    sure_amt = design_df['crdm_sure_amt'].values
    lott_amt = design_df['crdm_lott_amt'].values
    ambig = design_df['crdm_amb_lev'].values

    # Integer array for indexing the per-subject parameter vectors; np.unique
    # also gives the subject labels in a deterministic (ascending) order.
    subj_idx = np.asarray(subj_id, dtype=int)
    subj_labels = np.unique(subj_idx)
    n_subj = subj_labels.size

    with pm.Model() as model_simple:

        # Hyperpriors on the group-level location/scale of alpha, beta, gamma.
        # NOTE(review): the sd_* hyperpriors are unbounded Normals; they are
        # tightly centred on positive values but are not constrained positive.
        mu_alpha_hyper = pm.Normal('mu_alpha_hyper',mu=-0.635,sigma=0.05)
        sd_alpha_hyper = pm.Normal('sd_alpha_hyper',mu=0.418,sigma=0.01)

        mu_beta_hyper = pm.Normal('mu_beta_hyper',mu=0.254,sigma=0.05)
        sd_beta_hyper = pm.Normal('sd_beta_hyper',mu=0.575,sigma=0.01)

        # gamma is HalfNormal, so it only has a scale hyperparameter.
        sd_gamma_hyper = pm.Normal('sd_gamma_hyper',mu=2.046,sigma=0.01)

        # Subject-level parameters, one entry per unique subject.
        alpha = pm.LogNormal('alpha',mu=mu_alpha_hyper,sigma=sd_alpha_hyper,shape=n_subj)
        beta = pm.Normal('beta',mu=mu_beta_hyper,sigma=sd_beta_hyper,shape=n_subj)
        gamma = pm.HalfNormal('gamma',sigma=sd_gamma_hyper,shape=n_subj)

        # Subjective values of the lottery and the sure option on every trial.
        sv_lott = (lott_amt**alpha[subj_idx])*(lott_p-beta[subj_idx]*ambig/2)
        sv_sure = (sure_amt**alpha[subj_idx])*sure_p

        # Logistic choice rule. Deliberately not wrapped in pm.Deterministic:
        # the trace is discarded after summarising, and storing one value per
        # trial per draw only costs memory and summary time.
        prob = 1 / (1 + pm.math.exp(-gamma[subj_idx] * (sv_lott - sv_sure)))

        y_1 = pm.Bernoulli('y_1',p=prob,observed=choices)

        trace = pm.sample(10000, tune=1000, cores=5,target_accept=0.99,progressbar=False)

    # az.summary returns a pandas DataFrame indexed by labels such as
    # 'alpha[3]'. Only the subject-level parameters are summarised, since
    # they are the only rows read below.
    summary = az.summary(trace,var_names=['alpha','beta','gamma'],round_to=10)
    posterior_mean = summary['mean']

    def _subject_mean(name, idx):
        # Posterior mean of parameter `name` for subject position `idx`.
        return posterior_mean.loc['{}[{}]'.format(name, idx)]

    if type=='single':
        alpha_hat = _subject_mean('alpha', 0)
        beta_hat = _subject_mean('beta', 0)
        gamma_hat = _subject_mean('gamma', 0)
    else:  # 'aggregate'
        alpha_hat = [_subject_mean('alpha', x) for x in subj_labels]
        beta_hat = [_subject_mean('beta', x) for x in subj_labels]
        gamma_hat = [_subject_mean('gamma', x) for x in subj_labels]
    return alpha_hat,beta_hat,gamma_hat


In [3]:
# Ground-truth parameters used for the simulation; the estimates are added as
# new columns and written back out at the end of this cell.
fn = os.path.join('simul','ground_truth.csv')
params_df = pd.read_csv(fn,index_col=0)

# Design set; the same design frame is reused for every simulated subject.
fn = os.path.join('simul','design_set.csv')
design_df_single = pd.read_csv(fn,index_col=0)

# One CSV of simulated choices per subject, in sorted path order.
simulated_data = sorted(glob.glob(os.path.join('simul','split','*/crdm/*.csv')))

choice_col = 'crdm_choice'

# Fail fast: every check below would otherwise only surface inside or after
# the (long) sampling run.
if not simulated_data:
    raise FileNotFoundError('No simulated CRDM files found under {}'.format(
        os.path.join('simul','split')))
if len(simulated_data) != len(params_df):
    raise ValueError('Found {} simulated files but ground truth has {} rows'.format(
        len(simulated_data),len(params_df)))

# Aggregate fit: stack all subjects' trials and fit one hierarchical model.
# (To fit each subject on its own instead, call
#  estimate_bhm(subj_id=[0]*len(df[choice_col]), design_df=design_df_single,
#               choices=df[choice_col], type='single')
#  inside the loop and collect the three returned values per subject.)
subj_id,choices,design_list = [],[],[]
for index,fn in enumerate(simulated_data):
    print(fn)
    df = pd.read_csv(fn,index_col=0)
    # The design frame is appended once per subject, so each subject must
    # contribute exactly one choice per design row.
    if len(df) != len(design_df_single):
        raise ValueError('{} has {} trials but the design set has {} rows'.format(
            fn,len(df),len(design_df_single)))

    choices.extend(df[choice_col].values.tolist())
    subj_id.extend([index]*len(df[choice_col]))
    design_list.append(design_df_single)

tStep1 = time.time()
design_df_agg = pd.concat(design_list,axis=0)
# Estimates come back ordered by subject index, i.e. by sorted file order.
# NOTE(review): this assumes the rows of ground_truth.csv are in that same
# order - confirm against the simulation script.
params_df['alpha_bhm_agg'],params_df['beta_bhm_agg'],params_df['gamma_bhm_agg'] = estimate_bhm(
    subj_id=subj_id,design_df=design_df_agg,choices=choices,type='aggregate')

print('Time to complete {} aggregate BHM : {} minutes'.format(len(simulated_data),(time.time() - tStep1)/60.0))


fn = os.path.join('simul','parameter_estimate_bhm_v002.csv')
print('Saving estimates to : {}'.format(fn))
params_df.to_csv(fn)


simul/split/p0000/crdm/p0000_crdm.csv
simul/split/p0001/crdm/p0001_crdm.csv
simul/split/p0002/crdm/p0002_crdm.csv
simul/split/p0003/crdm/p0003_crdm.csv
simul/split/p0004/crdm/p0004_crdm.csv
simul/split/p0005/crdm/p0005_crdm.csv
simul/split/p0006/crdm/p0006_crdm.csv
simul/split/p0007/crdm/p0007_crdm.csv
simul/split/p0008/crdm/p0008_crdm.csv
simul/split/p0009/crdm/p0009_crdm.csv
simul/split/p0010/crdm/p0010_crdm.csv
simul/split/p0011/crdm/p0011_crdm.csv
simul/split/p0012/crdm/p0012_crdm.csv
simul/split/p0013/crdm/p0013_crdm.csv
simul/split/p0014/crdm/p0014_crdm.csv
simul/split/p0015/crdm/p0015_crdm.csv
simul/split/p0016/crdm/p0016_crdm.csv
simul/split/p0017/crdm/p0017_crdm.csv
simul/split/p0018/crdm/p0018_crdm.csv
simul/split/p0019/crdm/p0019_crdm.csv
simul/split/p0020/crdm/p0020_crdm.csv
simul/split/p0021/crdm/p0021_crdm.csv
simul/split/p0022/crdm/p0022_crdm.csv
simul/split/p0023/crdm/p0023_crdm.csv
simul/split/p0024/crdm/p0024_crdm.csv
simul/split/p0025/crdm/p0025_crdm.csv
simul/split/

  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples -

Time to complete 125 aggregate BHM : 69.62915831804276 minutes
Saving estimates to : simul/parameter_estimate_bhm_v001.csv
