# Bayesian parameter estimation 

Written for the CPDM task, which is part of the IDM dataset collected online with MTurk. Here we use the CASANDRE model (instead of the utility model) to analyze the CPDM data.

Extended to work for all datasets

### Import modules, libraries, etc

In [55]:
# Built-in/Generic Imports
import os,sys
import glob,time

# Libs :: all are part of idm_env, except for arviz and pymc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import pymc as pm
import arviz as az
from sklearn.preprocessing import LabelBinarizer
# Standard-normal inverse CDF (probit); a lognormal inverse CDF can be built from it as exp(mu + sigma*ndtri(p))
from scipy.special import ndtri

# This reduces the amount of pymc output, can comment or change level to see more information
import logging
logger = logging.getLogger("pymc")
logger.setLevel(logging.ERROR)

# explicitly use a path to let the script know where the IDM_model is located to import model_functions
parent = '/Users/pizarror/IDM'
# adding the parent directory to the sys.path.
sys.path.append(parent)
from IDM_model.src import model_functions as mf


### Diagnostic plots

Run `diganostic_plots()` once per subject to save the trace, posterior, bivariate-density, and rank plots.

In [7]:

def diganostic_plots(trace,experiment='experiment',utility_dir='/tmp/',subject='23_IDM_0001',task='cdd_nlh',coords=None,var_names=None,figsize=(10,10)):
    """Save the MCMC diagnostic figures for one subject as EPS files.

    Five figures are written to ``<utility_dir>/<subject>/<task>/bh``: trace
    plot, bivariate densities (pair plot), posterior, rank plot (bars) and
    rank plot (lines). Each file is named
    ``<subject>_<task>_<plot name>.<experiment>.eps``.

    Args:
        trace: sampling result handed unchanged to the ArviZ plotting functions.
        experiment: label embedded in every output file name.
        utility_dir: root directory under which the figures are stored.
        subject: subject identifier, used in the path, file names and titles.
        task: task name, used in the path and file names.
        coords: coordinate selection forwarded to ArviZ; ``None`` means ``{}``.
        var_names: variables to plot; ``None`` means ``['kappa','gamma']``.
            The pair plot labels its axes with the first two names, so at
            least two names are needed for that figure.
        figsize: figure size used for the two rank plots.
    """
    # None sentinels instead of mutable default arguments shared across calls
    coords = {} if coords is None else coords
    var_names = ['kappa','gamma'] if var_names is None else var_names

    bh_dir = os.path.join(utility_dir,subject,task,'bh')
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(bh_dir,exist_ok=True)
    print('Saving diagnostic plots to bh_dir : {}'.format(bh_dir))

    title_dict = {'fontsize':15}

    def _save_and_close(plot_name):
        # Write the current matplotlib figure as EPS, then release it
        fig_fn = os.path.join(bh_dir,'{}_{}_{}.{}.eps'.format(subject,task,plot_name,experiment))
        plt.savefig(fig_fn,format='eps')
        plt.close()

    def _title_panels(panels):
        # atleast_1d lets this handle a bare Axes as well as an array of Axes
        for idx,panel in enumerate(np.atleast_1d(panels)):
            panel.set_title('{}: {}'.format(subject,var_names[idx]),fontdict=title_dict)

    # 2by2 : rows 2 variables, cols 2 for distribution and sampled values
    axes = az.plot_trace(trace, var_names=var_names,coords=coords,compact=False)
    for r,row in enumerate(axes):
        for panel in row:
            panel.set_title('{}: {}'.format(subject,var_names[r]))
    plt.tight_layout()
    _save_and_close('trace_plot')

    axes = az.plot_pair(trace,kind='kde', coords=coords,var_names=var_names,marginals=True)
    axes[0,0].set_title(subject,fontdict=title_dict)
    axes[1,0].set_ylabel(var_names[1])
    axes[1,0].set_xlabel(var_names[0])
    plt.tight_layout()
    _save_and_close('bivariate_densities')

    axes = az.plot_posterior(trace,var_names=var_names,coords=coords)
    _title_panels(axes)
    _save_and_close('posterior')

    fig, axes = plt.subplots(1,len(var_names), figsize=figsize)
    az.plot_rank(trace,var_names=var_names,coords=coords,ax=axes)
    _title_panels(axes)
    fig.tight_layout()
    _save_and_close('rank_plot_bars')

    fig, axes = plt.subplots(1,len(var_names), figsize=figsize)
    axes = az.plot_rank(trace,var_names=var_names, coords=coords,kind="vlines",vlines_kwargs={'lw':0}, marker_vlines_kwargs={'lw':3},ax=axes)
    _title_panels(axes)
    fig.tight_layout()
    _save_and_close('rank_plot_lines')




### Load data

We will load the data from all participants so we can run the modeling schema

In [8]:
def read_load_data(subject='23_IDM_0144',fn='/tmp',cols=[]):
    """Load one subject's CPDM csv and return the cleaned trial table.

    The file at ``fn`` is read with pandas and passed through the
    ``model_functions`` helpers in this order: ``remap_response``,
    ``drop_pract``, ``drop_non_responses`` and ``get_data`` (first element of
    its return value). The result gets a ``subject`` column set to ``subject``.

    Args:
        subject: identifier stored in the ``subject`` column.
        fn: path of the csv file to read.
        cols: column names forwarded to ``mf.get_data``.

    Returns:
        The object produced by ``mf.get_data(...)[0]`` with the added
        ``subject`` column.
    """
    task_name = 'cpdm'
    raw_trials = pd.read_csv(fn)
    remapped = mf.remap_response(raw_trials,task=task_name)
    without_practice = mf.drop_pract(remapped,task=task_name)
    # second return value (response rate) is not used here
    responded,_ = mf.drop_non_responses(without_practice,task=task_name,verbose=False)
    subj_frame = mf.get_data(responded,cols,alpha_hat=1)[0]
    subj_frame['subject'] = subject
    return subj_frame

def dirs_and_data(root_dir='/tmp',dataset='IDM'):
    """Create the output directories and load every subject's CPDM trials.

    Subject folders are discovered under ``<root_dir>/<dataset>/split``; for
    each one the file ``<subject>/cpdm/<subject>_cpdm.csv`` is loaded with
    ``read_load_data``. Subjects without that file, or whose cleaned table is
    empty, are skipped. Kept subjects receive consecutive integer
    ``subject_id`` values starting at 0.

    Args:
        root_dir: directory that contains the dataset folder.
        dataset: name of the dataset folder inside ``root_dir``.

    Returns:
        tuple ``(utility_dir, save_dir, data)`` where ``utility_dir`` is
        ``<root_dir>/<dataset>/utility``, ``save_dir`` is
        ``<utility_dir>/BHM/cpdm`` (created if missing) and ``data`` is one
        DataFrame with all kept subjects stacked. When no subject is kept,
        ``data`` is an empty DataFrame with the ``subject`` + CPDM columns.
    """
    dataset_dir = os.path.join(root_dir,dataset)
    split_dir = os.path.join(dataset_dir,'split')
    utility_dir = os.path.join(dataset_dir,'utility')
    save_dir = os.path.join(utility_dir,'BHM/cpdm')
    os.makedirs(save_dir,exist_ok=True)

    # Takes about 10 seconds for 149 subjects from Mturk data
    task = 'cpdm'
    cols = ['cpdm_choice','cpdm_gabor_orient', 'cpdm_gabor_contrast', 'cpdm_run_dimension', 'cpdm_trial_resp.keys','cpdm_trial_resp.rt']
    lead_cols = ['subject']+cols

    # Collect per-subject frames and concatenate once: concatenating onto an
    # empty seed frame inside the loop is quadratic and, because the seed has
    # no 'subject_id' column, upcasts the integer ids to float.
    frames = []
    for s in sorted(glob.glob(os.path.join(split_dir,'*'))):
        subject = os.path.basename(s)
        fn = os.path.join(s,task,'{}_{}.csv'.format(subject,task))
        if not os.path.exists(fn):
            continue
        subj_data = read_load_data(subject=subject,fn=fn,cols=cols)
        if subj_data.empty:
            continue
        # ids count only the subjects that are actually kept
        subj_data['subject_id'] = len(frames)
        frames.append(subj_data)

    if not frames:
        return utility_dir,save_dir,pd.DataFrame(columns=lead_cols)

    data = pd.concat(frames,ignore_index=True)
    # keep 'subject' + cols first, followed by any remaining columns
    ordered = lead_cols + [c for c in data.columns if c not in lead_cols]
    data = data.reindex(columns=ordered)

    return utility_dir,save_dir,data

def extract_cols(data):
    """Split the stacked trial table into per-column arrays.

    Args:
        data: DataFrame with the columns ``subject``, ``subject_id`` and the
            six ``cpdm_*`` trial columns read below.

    Returns:
        tuple of eight items, in this order: unique subject labels, list of
        per-trial subject ids cast to ``int``, then the ``.values`` of
        ``cpdm_gabor_orient``, ``cpdm_gabor_contrast``, ``cpdm_run_dimension``,
        ``cpdm_trial_resp.keys``, ``cpdm_trial_resp.rt`` and ``cpdm_choice``.
    """
    subjects = data['subject'].unique()
    # ids may be stored as floats; cast each one to a plain int
    subj_id = list(map(int,data['subject_id'].to_list()))

    trial_columns = (
        'cpdm_gabor_orient',
        'cpdm_gabor_contrast',
        'cpdm_run_dimension',
        'cpdm_trial_resp.keys',
        'cpdm_trial_resp.rt',
        'cpdm_choice',
    )
    per_trial = [data[name].values for name in trial_columns]

    return (subjects,subj_id,*per_trial)


def get_respslong(choices):
    """Binarize the choice labels and return the encoded matrix transposed.

    Args:
        choices: array-like of choice labels, one per trial.

    Returns:
        Transpose of ``LabelBinarizer().fit_transform(choices)``, so trials
        run along the last axis.
    """
    binarizer = LabelBinarizer()
    return binarizer.fit_transform(choices).T

In [49]:
def get_nt(data,experiment=''):
    """Return the largest number of trials contributed by any single subject.

    Args:
        data: DataFrame with a ``subject`` column (and ``cpdm_run_dimension``
            when ``experiment`` is given).
        experiment: when non-empty, only rows whose ``cpdm_run_dimension``
            equals this value are counted.

    Returns:
        int, the maximum per-subject row count; 0 when there are no subjects.
    """
    def _trial_count(subject):
        # rows of this subject, optionally narrowed to one run dimension
        rows = data['subject']==subject
        if len(experiment)>0:
            rows = rows & (data['cpdm_run_dimension']==experiment)
        return data.loc[rows].shape[0]

    return max((_trial_count(s) for s in data['subject'].unique()),default=0)

### Bayesian Hierarchical Model

Developing choice of prior distribution and parameters.


In [9]:
def get_llhC(guess_rate,sds,se,confidence_criterion):
    """Placeholder for the choice/confidence likelihood used by ``run_BHM``.

    The body has not been written yet. A ``def`` without a body is a syntax
    error that prevents the whole file from being parsed, so this stub raises
    explicitly until the likelihood is implemented.

    Args:
        guess_rate: guess-rate parameter.
        sds: sigma values for the likelihood.
        se: difference between rescaled stimulus and rescaled criterion.
        confidence_criterion: confidence-criterion parameter.

    Raises:
        NotImplementedError: always, until the likelihood is implemented.
    """
    raise NotImplementedError('get_llhC is not implemented yet')


# This is the meat of the script that is used to estimate the parameters of the BHM 

def run_BHM(save_dir,experiment,data):
    """Build and sample the hierarchical model, then save summary and trace.

    Inside one ``pm.Model`` context this declares hyperpriors and per-subject
    parameters, samples with ``pm.sample``, writes
    ``BHM_model_summary_<experiment>.csv`` and
    ``BHM_model_trace_<experiment>.pkl`` into ``save_dir`` and prints the
    elapsed time.

    NOTE(review): the model section is unfinished; see the review notes
    inline (dead assignments, names that are never defined, library calls
    whose arguments look wrong). As written it is not expected to run.

    Args:
        save_dir: directory that receives the summary csv and the pickle.
        experiment: label embedded in both output file names.
        data: stacked trial table, unpacked with ``extract_cols``.

    Returns:
        tuple ``(trace_prior, subjects, subj_id)``.
    """
    
    subjects,subj_id,gabor_orient,gabor_contrast,run_dimension,trial_resp_keys,trial_resp_rt,choices = extract_cols(data)
    # number of samples/subjects
    ns = np.unique(subj_id).shape[0]
    # number of trials (largest per-subject row count, see get_nt)
    nt = get_nt(data)
    # 30 evenly spaced points from 0.5/30 to 1-0.5/30
    sampx = np.linspace(0.5/30,1-0.5/30,30)

    orislong = gabor_orient
    # NOTE(review): respslong is computed but not used anywhere below
    respslong = get_respslong(choices)

    tStep1 = time.time()

    # One model holding a parameter per subject (shape = number of unique subject ids)
    with pm.Model() as model_simple:

        # Hyperparameters for stimulus sensitivity (sens), decision criterion (deci), 
        # meta-uncertainty (meta), and confidence criterion (conf)
        mu_sens_hyper = pm.Normal('mu_sens_hyper',mu=0,sigma=1.0)
        sd_sens_hyper = pm.LogNormal('sd_sens_hyper',mu=0,sigma=1.0)
        mu_deci_hyper = pm.Normal('mu_deci_hyper',mu=0,sigma=1.0)
        sd_deci_hyper = pm.LogNormal('sd_deci_hyper',mu=0,sigma=1.0)
        mu_meta_hyper = pm.Normal('mu_meta_hyper',mu=0,sigma=1.0)
        sd_meta_hyper = pm.LogNormal('sd_meta_hyper',mu=0,sigma=1.0)
        mu_conf_hyper = pm.Normal('mu_conf_hyper',mu=0,sigma=1.0)
        sd_conf_hyper = pm.LogNormal('sd_conf_hyper',mu=0,sigma=1.0)

        # Priors
        # where does Corey get these values? mu, sigma or alpha beta?
        # NOTE(review): a Beta mean of exactly 1 looks invalid (the mean should lie
        # strictly between 0 and 1); these two numbers are presumably alpha/beta
        # shape parameters — confirm against the reference implementation.
        guess_rate = pm.Beta('guess_rate',mu=1,sigma=193.0/3.0)
        stimulus_sensitivity = pm.LogNormal('stimulus_sensitivity',mu=mu_sens_hyper,sigma=sd_sens_hyper,shape=np.size(np.unique(subj_id)))
        decision_criterion = pm.Normal('decision_criterion',mu=mu_deci_hyper,sigma=sd_deci_hyper,shape=np.size(np.unique(subj_id)))
        meta_uncertainty = pm.LogNormal('meta_uncertainty',mu=mu_meta_hyper,sigma=sd_meta_hyper,shape=np.size(np.unique(subj_id)))
        confidence_criterion = pm.LogNormal('confidence_criterion',mu=mu_conf_hyper,sigma=sd_conf_hyper,shape=np.size(np.unique(subj_id)))

        # rescaled stimulus sensitivity
        # sm = np.multiply(orislong,np.array(stimulus_sensitivity).reshape(orislong.shape))
        # NOTE(review): this broadcast result is overwritten by the next line
        sm = np.broadcast_to(orislong,(ns,len(orislong)))
        # NOTE(review): orislong has one entry per trial while stimulus_sensitivity
        # has one per subject; presumably stimulus_sensitivity[subj_id] is needed — TODO confirm
        sm = orislong*stimulus_sensitivity
        # NOTE(review): this se is overwritten below by sm - sc before it is read
        se = 0.5*np.ones([nt,5])
        # rescaled confidence criterion
        sc = decision_criterion*stimulus_sensitivity
        # xtrans = logncdfinv(sampx,log(1/sqrt(meta.^2+1)),sqrt(log(meta.^2+1)));
        # NOTE(review): meta_uncertainty is a PyMC variable; confirm that numpy's
        # log/sqrt are appropriate here rather than the pm.math equivalents
        muLogN = np.log(1/np.sqrt(meta_uncertainty**2 + 1.0))
        sigmaLogN = np.sqrt(np.log(meta_uncertainty**2 + 1.0))
        # transformed xaxis per subject, lognormal inverse CDF
        # NOTE(review): scipy.special.ndtri is the standard-normal inverse CDF and,
        # as far as its documentation shows, takes no loc/scale keywords — confirm;
        # the lognormal inverse CDF would be exp(muLogN + sigmaLogN*ndtri(sampx))
        xtrans = ndtri(sampx, loc=muLogN, scale=sigmaLogN)

        # sigma parameters for likelihood
        sds = 1.0/xtrans
        # difference matrix
        se = sm - sc

        # NOTE(review): llhC is assigned but never used by the statements below
        llhC = get_llhC(guess_rate,sds,se,confidence_criterion)


        # NOTE(review): from here to y_1 the code uses mu_kappa_hyper, sd_kappa_hyper,
        # sd_gamma_hyper, alpha, delay_amt, delay_wait, immed_amt and immed_wait, none of
        # which are defined in this function or in the visible part of the file; this
        # appears to be carried over from a different (kappa/gamma) model — TODO replace
        kappa = pm.LogNormal('kappa',mu=mu_kappa_hyper,sigma=sd_kappa_hyper,shape=np.size(np.unique(subj_id)))
        gamma = pm.HalfNormal('gamma',sigma=sd_gamma_hyper,shape=np.size(np.unique(subj_id)))
        
        prob = pm.Deterministic('prob', 1 / (1 + pm.math.exp(-gamma[subj_id] * ( (delay_amt**alpha[subj_id])/(1+(kappa[subj_id]*delay_wait)) 
                                                                                - (immed_amt**alpha[subj_id])/(1+(kappa[subj_id]*immed_wait)) ))))

        y_1 = pm.Bernoulli('y_1',p=prob,observed=choices)

        # 10000 draws after 1000 tuning steps, on 5 cores
        trace_prior = pm.sample(10000, tune=1000, cores=5,target_accept=0.99,progressbar=False)


    # This is how you get a nice array. Note that this returns a pandas DataFrame, not a numpy array. Indexing is totally different.
    summary= az.summary(trace_prior,round_to=10)
    fn = os.path.join(save_dir,'BHM_model_summary_{}.csv'.format(experiment))
    print('Saving to : {}'.format(fn))
    summary.to_csv(fn)

    # The trace is stored with pickle inside a dict under the key 'trace'
    fn = os.path.join(save_dir,'BHM_model_trace_{}.pkl'.format(experiment))
    print('Saving to : {}'.format(fn))
    with open(fn,'wb') as buff:
        pickle.dump({'trace':trace_prior},buff)
        # pm.save_trace(trace_prior,fn)

    print('Time to complete {} aggregate BHM : {} minutes'.format(len(subjects),(time.time() - tStep1)/60.0))
    return trace_prior,subjects,subj_id





In [10]:
# Set the paths needed to find and load the data

# The user of the script can edit the root_dir and dataset variables to make it work for their dataset.
# This script will work provided the data is stored in the appropriate BIDS format in the split directory
root_dir = '/Volumes/UCDN/datasets/'
dataset = 'IDM'

# log the BHM experiment version, in case we change it in the future
experiment='v001'

# Create the output directories and load all subjects' CPDM trials into one DataFrame
utility_dir,save_dir,data = dirs_and_data(root_dir=root_dir,dataset=dataset)
# Unpack the per-trial columns into module-level names used by the cells below
subjects,subj_id,gabor_orient,gabor_contrast,run_dimension,trial_resp_keys,trial_resp_rt,choices = extract_cols(data)


"\n\ntrace_prior,subjects,subj_id = run_BHM(save_dir,experiment,data)\ndf_bhm = extract_mean(save_dir=save_dir,var_names=['kappa','gamma'],\n                        subjects=subjects,experiment=experiment)\nfor s in set(subj_id):\n    coords={'kappa_dim_0': [s],'gamma_dim_0':[s]}\n    diganostic_plots(trace_prior,experiment=experiment,utility_dir=utility_dir,subject=subjects[s],\n                        coords=coords,var_names=['kappa','gamma'],figsize=(10,10))\n\n"

In [56]:
# Largest per-subject trial count among rows whose cpdm_run_dimension is 'low_vol_low_risk'
nt = get_nt(data,experiment = 'low_vol_low_risk')
display(nt)

200

(200, 5)

In [70]:
# Scratch check of array shapes: scalar multiplication keeps the 1-D shape,
# while np.broadcast_to repeats the orientation vector along a new leading axis of length 5
orislong = gabor_orient
sc = 3*orislong
display(sc.shape)
sc = np.broadcast_to(orislong,(5,len(orislong)))
display(sc.shape)

(118791,)

(5, 118791)

In [None]:

# Driver code kept inside a bare string literal, so none of it is executed.
# NOTE(review): it calls `extract_mean`, which is not defined in the visible part of this
# file, and selects the 'kappa'/'gamma' variables — confirm both before re-enabling.
'''

trace_prior,subjects,subj_id = run_BHM(save_dir,experiment,data)
df_bhm = extract_mean(save_dir=save_dir,var_names=['kappa','gamma'],
                        subjects=subjects,experiment=experiment)
for s in set(subj_id):
    coords={'kappa_dim_0': [s],'gamma_dim_0':[s]}
    diganostic_plots(trace_prior,experiment=experiment,utility_dir=utility_dir,subject=subjects[s],
                        coords=coords,var_names=['kappa','gamma'],figsize=(10,10))

'''


In [13]:
data.head(10)

Unnamed: 0,subject,cpdm_choice,cpdm_gabor_orient,cpdm_gabor_contrast,cpdm_run_dimension,cpdm_trial_resp.keys,cpdm_trial_resp.rt,subject_id
0,23_IDM_0001,-2.0,-4.0,0.05,low_vol_low_risk,q,0.4953,0.0
1,23_IDM_0001,2.0,3.0,0.2,low_vol_low_risk,p,0.2921,0.0
2,23_IDM_0001,2.0,1.5,0.05,low_vol_low_risk,p,0.2786,0.0
3,23_IDM_0001,-2.0,-2.0,0.05,low_vol_low_risk,q,0.2839,0.0
4,23_IDM_0001,-2.0,-5.0,0.2,low_vol_low_risk,q,0.2509,0.0
5,23_IDM_0001,-2.0,-5.0,0.2,low_vol_low_risk,q,0.2336,0.0
6,23_IDM_0001,2.0,5.0,0.2,low_vol_low_risk,p,0.1418,0.0
7,23_IDM_0001,2.0,5.0,0.05,low_vol_low_risk,p,0.2769,0.0
8,23_IDM_0001,-2.0,-1.0,0.2,low_vol_low_risk,q,0.2343,0.0
9,23_IDM_0001,-2.0,-2.5,0.05,low_vol_low_risk,q,0.2338,0.0


In [11]:
# # Takes about 10 seconds

# # we will change this when we change utility to 1st level analysis (or split)
# split_dir = '/Volumes/UCDN/datasets/IDM/split/'
# utility_dir = '/Volumes/UCDN/datasets/IDM/utility/'
# save_dir = '/Volumes/UCDN/datasets/IDM/utility/BHM/cpdm/'
# subjs = sorted(glob.glob(os.path.join(split_dir,'23_IDM_*')))
# task = 'cpdm'
# cols = ['cpdm_choice','cpdm_gabor_orient', 'cpdm_gabor_contrast', 'cpdm_run_dimension', 'cpdm_trial_resp.keys','cpdm_trial_resp.rt']
# data = pd.DataFrame(columns=['subject']+cols)

# for subj_id,s in enumerate(subjs):
#     subject = os.path.basename(s)
#     fn  = os.path.join(s,task,'{}_{}.csv'.format(os.path.basename(s),task))
#     if os.path.exists(fn):
#         subj_data = read_load_data(subject=subject,fn=fn,cols=cols)
#         # for c in cols:
#             # subj_data[c] = subj_data[c].astype(float)        
#         subj_data['subject_id'] = int(subj_id)
#         data = pd.concat([data,subj_data],ignore_index=True)

# data.head(10)


In [12]:


# parameters to model
# nParams = 2 + numTasks + totRel + numTasks*nConfCrit; % [Guess rate, stimulus criterion], [meta-uncertainty], [stimulus sensitivity], [confidence criteria]


# Required order for getLlhChoice: [guess rate, stim sens, stim crit, meta-uncertainty, conf criteria]


# 2 :: [Guess rate, stimulus criterion]
# numTasks :: [meta-uncertainty]
# totRel :: [stimulus sensitivity]
# numTasks*nConfCrit :: [confidence criteria]

