# Bayesian parameter estimation 

Written for the CPDM task, which is part of the IDM dataset collected online with MTurk. Here we use the CASANDRE model (instead of the utility model) to analyze the CPDM data.

In [29]:
# Built-in/Generic Imports
import os,sys
import glob
import time

# Libs
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
import pickle
# Raise the pandas display limits so wide/long frames are shown in full
# (up to 999 rows and 99 columns) when a cell displays a DataFrame.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 99

import logging
# Only let messages of level ERROR or above through on the "pymc" logger.
logger = logging.getLogger("pymc")
logger.setLevel(logging.ERROR)


In [30]:
# Make the IDM project root importable so that ``IDM_model`` can be found.

# Parent of the notebook's working directory (``__file__`` is not available
# in a notebook, hence the getcwd() variant). Not used further in this cell.
# current = os.path.dirname(os.path.realpath(__file__))
current = os.path.dirname(os.getcwd())

# Project root. Pinned to the local checkout; the derived alternatives are
# kept below for reference.
# parent = os.path.dirname(os.path.dirname(current))
# parent = current
parent = '/Users/pizarror/IDM'
# print(parent)
#/Users/pizarror/IDM

# Add the project root to sys.path only once, so that re-running this cell
# does not keep appending duplicate entries.
if parent not in sys.path:
    sys.path.append(parent)

from IDM_model.src import model_functions as mf


In [31]:
def read_load_data(subject='23_IDM_0144',fn='/tmp',cols=None):
    """Load one participant's CPDM csv and return it as a model-ready table.

    The file is read with pandas and then passed, in order, through
    ``mf.remap_response``, ``mf.drop_pract`` and ``mf.drop_non_responses``
    (all with ``task='cpdm'``) before ``mf.get_data`` selects ``cols``.
    What each helper does is defined in ``IDM_model.src.model_functions``;
    presumably they recode responses, drop practice trials and drop trials
    without a response -- verify against that module.

    Parameters
    ----------
    subject : str
        Participant identifier written into the ``subject`` column.
    fn : str
        Path of the participant's csv file.
    cols : list of str, optional
        Columns handed to ``mf.get_data``. ``None`` (default) means an
        empty list.

    Returns
    -------
    The first element returned by ``mf.get_data`` with an added ``subject``
    column.
    """
    # Build a fresh list per call instead of sharing one mutable default.
    cols = [] if cols is None else cols
    task = 'cpdm'

    cpdm_df = pd.read_csv(fn)
    cpdm_df = mf.remap_response(cpdm_df,task=task)
    cpdm_df = mf.drop_pract(cpdm_df,task=task)
    # The second return value (response rate) is not needed here.
    cpdm_df,_ = mf.drop_non_responses(cpdm_df,task=task,verbose=False)

    data = mf.get_data(cpdm_df,cols,alpha_hat=1)[0]
    data['subject'] = subject
    return data

def _save_current_figure(bh_dir,subject,task,plot_name,experiment):
    """Write the current matplotlib figure into bh_dir as EPS, then close it."""
    fig_fn = os.path.join(bh_dir,'{}_{}_{}.{}.eps'.format(subject,task,plot_name,experiment))
    plt.savefig(fig_fn,format='eps')
    plt.close()


def diganostic_plots(trace,experiment='experiment',utility_dir='/tmp/',subject='23_IDM_0001',task='cdd_nlh',coords=None,var_names=None,figsize=(10,10)):
    """Save trace, pair, posterior and rank plots for one subject.

    Five EPS files are written to ``<utility_dir>/<subject>/<task>/bh``
    (created if missing), named
    ``<subject>_<task>_<plot>.<experiment>.eps`` with ``<plot>`` one of
    ``trace_plot``, ``bivariate_densities``, ``posterior``,
    ``rank_plot_bars`` and ``rank_plot_lines``.

    Parameters
    ----------
    trace
        Sampling result passed straight to the arviz plotting functions.
    experiment : str
        Label inserted into every file name.
    utility_dir : str
        Root directory under which the figures are stored.
    subject : str
        Subject identifier; used in the output path and the plot titles.
    task : str
        Task name; used in the output path and the file names.
    coords : dict, optional
        Coordinate selection forwarded to arviz. ``None`` (default) means
        no selection (``{}``).
    var_names : list of str, optional
        Variables to plot. ``None`` (default) means ``['kappa','gamma']``.
        The pair plot labels its axes with the first two entries.
    figsize : tuple
        Figure size of the two rank-plot figures.

    Returns
    -------
    None
    """
    # Fresh objects per call instead of shared mutable defaults.
    coords = {} if coords is None else coords
    var_names = ['kappa','gamma'] if var_names is None else var_names

    bh_dir = os.path.join(utility_dir,subject,task,'bh')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(bh_dir,exist_ok=True)
    print('Saving diagnostic plots to bh_dir : {}'.format(bh_dir))

    title_dict = {'fontsize':15}

    # Trace plot. One row per variable, two columns: distribution and sampled values.
    axes = az.plot_trace(trace, var_names=var_names,coords=coords,compact=False)
    for r,row in enumerate(axes):
        for ax in row:
            ax.set_title('{}: {}'.format(subject,var_names[r]))
    plt.tight_layout()
    _save_current_figure(bh_dir,subject,task,'trace_plot',experiment)

    # Bivariate density of the first two variables, with marginals.
    axes = az.plot_pair(trace,kind='kde', coords=coords,var_names=var_names,marginals=True)
    axes[0,0].set_title(subject,fontdict=title_dict)
    axes[1,0].set_ylabel(var_names[1])
    axes[1,0].set_xlabel(var_names[0])
    plt.tight_layout()
    _save_current_figure(bh_dir,subject,task,'bivariate_densities',experiment)

    # Posterior distribution, one panel per variable.
    axes = az.plot_posterior(trace,var_names=var_names,coords=coords)
    for c,ax in enumerate(axes):
        ax.set_title('{}: {}'.format(subject,var_names[c]),fontdict=title_dict)
    _save_current_figure(bh_dir,subject,task,'posterior',experiment)

    # Rank plot drawn as bars.
    fig, axes = plt.subplots(1,len(var_names), figsize=figsize)
    az.plot_rank(trace,var_names=var_names,coords=coords,ax=axes)
    for c,ax in enumerate(axes):
        ax.set_title('{}: {}'.format(subject,var_names[c]),fontdict=title_dict)
    fig.tight_layout()
    _save_current_figure(bh_dir,subject,task,'rank_plot_bars',experiment)

    # Rank plot drawn as vertical-line markers.
    fig, axes = plt.subplots(1,len(var_names), figsize=figsize)
    axes = az.plot_rank(trace,var_names=var_names, coords=coords,kind="vlines",vlines_kwargs={'lw':0}, marker_vlines_kwargs={'lw':3},ax=axes)
    for c,ax in enumerate(axes):
        ax.set_title('{}: {}'.format(subject,var_names[c]),fontdict=title_dict)
    fig.tight_layout()
    _save_current_figure(bh_dir,subject,task,'rank_plot_lines',experiment)




## Bayesian Hierarchical Model (BHM)

We pooled all data together and ran a simple BH model in `complete_pool_as_prior`.

We are using complete pooling to generate priors for when we implement a hierarchical Bayesian model.

### Load data

We will load the data from all participants so we can run the modeling schema

In [32]:
# Takes about 10 seconds

# we will change this when we change utility to 1st level analysis (or split)
split_dir = '/Volumes/UCDN/datasets/IDM/split/'
utility_dir = '/Volumes/UCDN/datasets/IDM/utility/'
save_dir = '/Volumes/UCDN/datasets/IDM/utility/BHM/cpdm/'
subjs = sorted(glob.glob(os.path.join(split_dir,'23_IDM_*')))
task = 'cpdm'
cols = ['cpdm_choice','cpdm_gabor_orient', 'cpdm_gabor_contrast', 'cpdm_run_dimension', 'cpdm_trial_resp.keys','cpdm_trial_resp.rt']

# Start from an empty template so the column order ('subject' first) is the
# same whether or not any participant file is found.
frames = [pd.DataFrame(columns=['subject']+cols)]

for s in subjs:
    subject = os.path.basename(s)
    fn  = os.path.join(s,task,'{}_{}.csv'.format(subject,task))
    if not os.path.exists(fn):
        # No CPDM file for this participant: skip without using up an id.
        continue
    subj_data = read_load_data(subject=subject,fn=fn,cols=cols)
    # for c in cols:
        # subj_data[c] = subj_data[c].astype(float)
    # Ids are contiguous over the participants actually loaded (0, 1, 2, ...),
    # because they are later used as positions into per-subject parameter
    # vectors and into the array of unique subjects.
    subj_data['subject_id'] = len(frames) - 1
    frames.append(subj_data)

# Concatenate once at the end rather than growing the frame inside the loop.
data = pd.concat(frames,ignore_index=True)

data.head(10)


Unnamed: 0,subject,cpdm_choice,cpdm_gabor_orient,cpdm_gabor_contrast,cpdm_run_dimension,cpdm_trial_resp.keys,cpdm_trial_resp.rt,subject_id
0,23_IDM_0001,-2.0,-4.0,0.05,low_vol_low_risk,q,0.4953,0.0
1,23_IDM_0001,2.0,3.0,0.2,low_vol_low_risk,p,0.2921,0.0
2,23_IDM_0001,2.0,1.5,0.05,low_vol_low_risk,p,0.2786,0.0
3,23_IDM_0001,-2.0,-2.0,0.05,low_vol_low_risk,q,0.2839,0.0
4,23_IDM_0001,-2.0,-5.0,0.2,low_vol_low_risk,q,0.2509,0.0
5,23_IDM_0001,-2.0,-5.0,0.2,low_vol_low_risk,q,0.2336,0.0
6,23_IDM_0001,2.0,5.0,0.2,low_vol_low_risk,p,0.1418,0.0
7,23_IDM_0001,2.0,5.0,0.05,low_vol_low_risk,p,0.2769,0.0
8,23_IDM_0001,-2.0,-1.0,0.2,low_vol_low_risk,q,0.2343,0.0
9,23_IDM_0001,-2.0,-2.5,0.05,low_vol_low_risk,q,0.2338,0.0


In [None]:

# Choose which participants to analyse. Every participant is kept by default;
# enable the slice below to work on a small subset.
subjects = data['subject'].unique()
# subjects = subjects[:10]
# print(subjects)
data = data[data['subject'].isin(subjects)]


In [None]:

# Per-trial inputs for the model cell below.
subjects = data['subject'].unique()
nb_subj = subjects.shape[0]
# Rows per participant by integer division of the total row count.
# NOTE(review): raises ZeroDivisionError when `data` is empty.
nb_trials = data.shape[0]//nb_subj
# One integer subject id per trial; the explicit int() cast is needed because
# the ids are displayed as floats (0.0) in the loaded table.
subj_id_list = data['subject_id'].to_list()
subj_id = [int(s) for s in subj_id_list]
# subj_id = data['subject'].to_list()
# Ids rebuilt as nb_trials repeats of 0..nb_subj-1; not referenced again in
# the visible part of this notebook.
old_id = np.array([ [s]*nb_trials for s in range(nb_subj) ]).flatten()



# NOTE(review): the columns read below ('cdd_*', 'alpha') are not among the
# 'cpdm_*' columns loaded into `data` in this notebook, and they do not appear
# in the table printed after loading, so these lookups look like leftovers
# from the CDD analysis and would raise KeyError -- confirm before running.
delay_amt = data['cdd_delay_amt'].values
delay_wait = data['cdd_delay_wait'].values
immed_amt = data['cdd_immed_amt'].values
immed_wait = data['cdd_immed_wait'].values
alpha = data['alpha'].values
choices = data['cdd_choice'].values




# parameters to model
# nParams = 2 + numTasks + totRel + numTasks*nConfCrit; % [Guess rate, stimulus criterion], [meta-uncertainty], [stimulus sensitivity], [confidence criteria]


# Required order for getLlhChoice: [guess rate, stim sens, stim crit, meta-uncertainty, conf criteria]


# 2 :: [Guess rate, stimulus criterion]
# numTasks :: [meta-uncertainty]
# totRel :: [stimulus sensitivity]
# numTasks*nConfCrit :: [confidence criteria]



### Bayesian Hierarchical Model

Developed under parameter recovery


In [None]:
tStep1 = time.time()

# Create the output folder before the long sampling run, so the results
# cannot be lost to a missing directory when they are written.
os.makedirs(save_dir,exist_ok=True)

# Number of distinct participants = length of the per-subject parameter vectors
nb_subj_params = np.size(np.unique(subj_id))

# One model over all participants, with one kappa and one gamma per subject
with pm.Model() as model_simple:

    # Hyperparameters for kappa and gamma
    # estimated from MLE approximations : np.exp(-3.60) = 0.0273, np.sqrt(1.71)=1.308
    # NOTE(review): mu_kappa_hyper (about 0.027) is passed as `mu` of the
    # LogNormal below, i.e. on the log scale -- confirm this is intended
    # rather than -3.60.
    mu_kappa_hyper = pm.Beta('mu_kappa_hyper',mu=np.exp(-3.60),sigma=0.01)
    sd_kappa_hyper = pm.Normal('sd_kappa_hyper',mu=np.sqrt(1.71),sigma=0.1)
    # estimated from MLE approximations : np.sqrt(2.30) = 1.517
    # NOTE(review): registered as 'sd_hyper' although the Python name is
    # sd_gamma_hyper; the label is left unchanged because it is the name
    # under which the variable appears in the trace and the saved summary.
    sd_gamma_hyper = pm.Normal('sd_hyper',mu=np.sqrt(2.30),sigma=0.1)

    kappa = pm.LogNormal('kappa',mu=mu_kappa_hyper,sigma=sd_kappa_hyper,shape=nb_subj_params)
    gamma = pm.HalfNormal('gamma',sigma=sd_gamma_hyper,shape=nb_subj_params)

    # Logistic choice rule on the difference between the discounted value of
    # the delayed option and that of the immediate option.
    # NOTE(review): if `alpha` holds one value per trial, alpha[subj_id] picks
    # the value stored at row s for subject s rather than each trial's own
    # value -- confirm.
    prob = pm.Deterministic('prob', 1 / (1 + pm.math.exp(-gamma[subj_id] * ( (delay_amt**alpha[subj_id])/(1+(kappa[subj_id]*delay_wait)) 
                                                                            - (immed_amt**alpha[subj_id])/(1+(kappa[subj_id]*immed_wait)) ))))

    y_1 = pm.Bernoulli('y_1',p=prob,observed=choices)

    trace_prior = pm.sample(10000, tune=1000, cores=5,target_accept=0.99,progressbar=False)



# This is how you get a nice array. Note that this returns a pandas DataFrame, not a numpy array. Indexing is totally different.
summary= az.summary(trace_prior,round_to=10)
fn = os.path.join(save_dir,'BHM_model_summary_v004.csv')
print('Saving to : {}'.format(fn))
summary.to_csv(fn)

fn = os.path.join(save_dir,'BHM_model_trace_v004.pkl')
print('Saving to : {}'.format(fn))
with open(fn,'wb') as buff:
    pickle.dump({'trace':trace_prior},buff)
    # pm.save_trace(trace_prior,fn)

print('Time to complete {} aggregate BHM : {} minutes'.format(len(subjects),(time.time() - tStep1)/60.0))


### Extract parameter estimates and save

We can incorporate this into the script above

In [None]:
def extract_mean(fn,var_names=None,subjects=None):
    """Collect per-subject posterior means from a saved summary csv.

    The csv is read with its first column as index. For every variable in
    ``var_names`` the rows labelled ``'<var>[0]'`` ... ``'<var>[n-1]'`` are
    looked up by label and their ``mean`` value is assigned to the subject
    at the same position in ``subjects``. Looking rows up by label keeps
    subjects aligned even if the summary rows are stored in another order;
    a label missing from the summary yields NaN for that subject.

    Parameters
    ----------
    fn : str
        Path of the summary csv (must contain a ``mean`` column).
    var_names : list of str, optional
        Per-subject variables to extract. ``None`` (default) means
        ``['kappa','gamma']``.
    subjects : sequence, optional
        Subject identifiers, ordered by subject index. ``None`` (default)
        means no subjects.

    Returns
    -------
    pandas.DataFrame
        One row per subject, with one column per variable followed by a
        ``subject`` column.
    """
    # Fresh objects per call instead of shared mutable defaults.
    var_names = ['kappa','gamma'] if var_names is None else var_names
    subjects = [] if subjects is None else subjects
    nb_subjects = len(subjects)

    means = pd.read_csv(fn,index_col=0)['mean']

    columns = {}
    for var in var_names:
        ind_list = ['{}[{}]'.format(var,sub_id) for sub_id in range(nb_subjects)]
        # reindex returns the values in ind_list order, NaN where absent.
        columns[var] = means.reindex(ind_list).to_numpy()
    columns['subject'] = subjects
    return pd.DataFrame(columns)

# Gather the per-subject posterior means from a saved BHM summary and write
# them out as a single table.
# NOTE(review): the summary is read from the 'cdd_nlh' folder and the table
# is named 'split_CDD_nlh_BHM.csv', while the model cell in this notebook
# saves under '.../BHM/cpdm/' -- confirm which summary is meant here.
experiment='v004'
bhm_dir = '/Volumes/UCDN/datasets/IDM/utility/BHM/'
split_CDD_fn = os.path.join(bhm_dir,'split_CDD_nlh_BHM.csv')
bhm_fn = os.path.join(bhm_dir,'cdd_nlh','BHM_model_summary_{}.csv'.format(experiment))

df_bhm = extract_mean(bhm_fn,var_names=['kappa','gamma'],subjects=subjects)
df_bhm.to_csv(split_CDD_fn)
df_bhm

### Diagnostic plots

There are too many subjects to inspect the output of `diganostic_plots()` all at once, but the plots can be run for subjects individually.
Trace, posterior, and rank plots.

... still need to figure out how to run `plot_pair()`, which plots the bivariate distributions, for each subject


In [None]:
# One set of diagnostic figures per participant: the coordinates select that
# participant's entry of each per-subject parameter.
for s in set(subj_id):
    coords = {'kappa_dim_0': [s], 'gamma_dim_0': [s]}
    diganostic_plots(
        trace_prior,
        experiment=experiment,
        utility_dir=utility_dir,
        subject=subjects[s],
        task='cdd_nlh',
        coords=coords,
        var_names=['kappa','gamma'],
        figsize=(10,10),
    )
