# Bayesian parameter estimation 

Written for CDD nonlinear hyperbolic (NLH, alpha estimated from CRDM) task as part of the IDM dataset collected online with Mturk

Extended to work for all datasets

### Import modules, libraries, etc

In [31]:
# Built-in/Generic Imports
import os,sys
import glob
import time

# Libs
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
import pickle
pd.options.display.max_rows = 999
pd.options.display.max_columns = 99

import logging
logger = logging.getLogger("pymc")
logger.setLevel(logging.ERROR)

# explicitly use a path to let the script know where the IDM_model is located to import model_functions
parent = '/Users/pizarror/IDM'
# adding the parent directory to the sys.path.
sys.path.append(parent)
from IDM_model.src import model_functions as mf

### Diagnostic plots

Run `diagnistic_plots()` individually by subject :: trace, posterior, bivariate densities, rank plots

In [32]:
def diganostic_plots(trace,experiment='experiment',utility_dir='/tmp/',subject='23_IDM_0001',task='cdd_nlh',coords={},var_names=['kappa','gamma'],figsize=(10,10)):

    bh_dir = os.path.join(utility_dir,subject,task,'bh')
    if not os.path.exists(bh_dir):
        os.makedirs(bh_dir)
    print('Saving diagnostic plots to bh_dir : {}'.format(bh_dir))

    title_dict = {'fontsize':15}

    # 2by2 : rows 2 varialbes, cols 2 for distribution and sampled values
    axes = az.plot_trace(trace, var_names=var_names,coords=coords,compact=False)
    for r in range(axes.shape[0]):
        for c in range(axes.shape[1]):
            axes[r,c].set_title('{}: {}'.format(subject,var_names[r]))
    plt.tight_layout()
    fig_fn = os.path.join(bh_dir,'{}_{}_trace_plot.{}.eps'.format(subject,task,experiment))
    plt.savefig(fig_fn,format='eps')
    plt.close()
    
    axes = az.plot_pair(trace,kind='kde', coords=coords,var_names=var_names,marginals=True)
    axes[0,0].set_title(subject,fontdict=title_dict)
    axes[1,0].set_ylabel(var_names[1])
    axes[1,0].set_xlabel(var_names[0])
    plt.tight_layout()
    fig_fn = os.path.join(bh_dir,'{}_{}_bivariate_densities.{}.eps'.format(subject,task,experiment))
    plt.savefig(fig_fn,format='eps')
    plt.close()
    
    axes = az.plot_posterior(trace,var_names=var_names,coords=coords)
    # print(axes.shape)
    for c in range(axes.shape[0]):
        axes[c].set_title('{}: {}'.format(subject,var_names[c]),fontdict=title_dict)
    fig_fn = os.path.join(bh_dir,'{}_{}_posterior.{}.eps'.format(subject,task,experiment))
    plt.savefig(fig_fn,format='eps')
    plt.close()

    fig, axes = plt.subplots(1,len(var_names), figsize=figsize)
    az.plot_rank(trace,var_names=var_names,coords=coords,ax=axes)
    for c in range(axes.shape[0]):
        axes[c].set_title('{}: {}'.format(subject,var_names[c]),fontdict=title_dict)
    fig.tight_layout()
    fig_fn = os.path.join(bh_dir,'{}_{}_rank_plot_bars.{}.eps'.format(subject,task,experiment))
    plt.savefig(fig_fn,format='eps')
    plt.close()

    fig, axes = plt.subplots(1,len(var_names), figsize=figsize)
    axes = az.plot_rank(trace,var_names=var_names, coords=coords,kind="vlines",vlines_kwargs={'lw':0}, marker_vlines_kwargs={'lw':3},ax=axes)
    for c in range(axes.shape[0]):
        axes[c].set_title('{}: {}'.format(subject,var_names[c]),fontdict=title_dict)
    fig.tight_layout()
    fig_fn = os.path.join(bh_dir,'{}_{}_rank_plot_lines.{}.eps'.format(subject,task,experiment))
    plt.savefig(fig_fn,format='eps')
    plt.close()




### Load data

We will load the data from all participants so we can run the modeling schema

In [33]:
def read_load_data(subject='23_IDM_0144',fn='/tmp',alpha0=1.0,cols=[]):
    cdd_df = pd.read_csv(fn)
    task='cdd'
    cdd_df,response_rate = mf.drop_non_responses(cdd_df,task=task,verbose=False) 
    cdd_df = mf.drop_pract(cdd_df,task=task)
    data = mf.get_data(cdd_df,cols,alpha_hat=alpha0)[0]
    data['subject'] = subject
    return data 

def get_alpha(utility_dir,subject=''):
    split_crdm_bhm_fn = os.path.join(utility_dir,'BHM/split_CRDM_BHM_gain.csv')
    crdm_bhm_df = pd.read_csv(split_crdm_bhm_fn)
    if subject in crdm_bhm_df['subject'].values:
        try:
            display()
            alpha = crdm_bhm_df.loc[crdm_bhm_df['subject']==subject,'alpha'].iloc[0]
        except ValueError:
            print('ValueError: likely subject not found under CRDM, will use alpha = 1.0')
            alpha = 1.0
    else:
        print('Could not find alpha for subject : {}'.format(subject))
        alpha = 1.0
    return alpha

def dirs_and_data(root_dir='/tmp',dataset='IDM'):

    dataset_dir = os.path.join(root_dir,dataset)
    split_dir = os.path.join(dataset_dir,'split')
    utility_dir = os.path.join(dataset_dir,'utility')
    save_dir = os.path.join(utility_dir,'BHM/cdd_nlh')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Takes about 10 seconds for 149 subjects form Mturk data
    task = 'cdd'
    subjs = sorted(glob.glob(os.path.join(split_dir,'*')))
    cols = ['cdd_choice','cdd_immed_amt','cdd_delay_amt','cdd_immed_wait','cdd_delay_wait','alpha']
    data = pd.DataFrame(columns=['subject']+cols)
    subj_id = 0
    for s in subjs:
        subject = os.path.basename(s)
        fn  = os.path.join(s,task,'{}_{}.csv'.format(os.path.basename(s),task))
        if os.path.exists(fn):
            alpha = get_alpha(utility_dir,subject=subject)
            subj_data = read_load_data(subject=subject,fn=fn,alpha0=alpha,cols=cols)
            if subj_data.empty:
                continue
            for c in cols:
                subj_data[c] = subj_data[c].astype(float)        
            subj_data['subject_id'] = int(subj_id)
            subj_id = subj_id+1
            data = pd.concat([data,subj_data],ignore_index=True)

    return utility_dir,save_dir,data

def extract_cols(data):
    subjects = data['subject'].unique()
    # nb_subj = subjects.shape[0]
    # nb_trials = data.shape[0]//nb_subj
    subj_id_list = data['subject_id'].to_list()
    subj_id = [int(s) for s in subj_id_list]
    # old_id = np.array([ [s]*nb_trials for s in range(nb_subj) ]).flatten()

    delay_amt = data['cdd_delay_amt'].values
    delay_wait = data['cdd_delay_wait'].values
    immed_amt = data['cdd_immed_amt'].values
    immed_wait = data['cdd_immed_wait'].values
    alpha = data['alpha'].values
    choices = data['cdd_choice'].values

    return subjects,subj_id,delay_amt,delay_wait,immed_amt,immed_wait,alpha,choices

In [34]:
# Takes about 10 seconds

# we will change this when we change utility to 1st level analysis (or split)
# split_dir = '/Volumes/UCDN/datasets/IDM/split/'
# utility_dir = '/Volumes/UCDN/datasets/IDM/utility/'
# save_dir = '/Volumes/UCDN/datasets/IDM/utility/BHM/cdd_nlh/'
# subjs = sorted(glob.glob(os.path.join(split_dir,'23_IDM_*')))
# task = 'cdd'
# cols = ['cdd_choice','cdd_immed_amt','cdd_delay_amt','cdd_immed_wait','cdd_delay_wait','alpha']
# data = pd.DataFrame(columns=['subject']+cols)

# for subj_id,s in enumerate(subjs):
#     subject = os.path.basename(s)
#     fn  = os.path.join(s,task,'{}_{}.csv'.format(os.path.basename(s),task))
#     if os.path.exists(fn):
#         alpha = get_alpha(utility_dir,subject=subject)
#         subj_data = read_load_data(subject=subject,fn=fn,alpha0=alpha,cols=cols)
#         for c in cols:
#             subj_data[c] = subj_data[c].astype(float)        
#         subj_data['subject_id'] = int(subj_id)
#         data = pd.concat([data,subj_data],ignore_index=True)

# data.head(10)


# subjects = data['subject'].unique()
# nb_subj = subjects.shape[0]
# nb_trials = data.shape[0]//nb_subj
# subj_id_list = data['subject_id'].to_list()
# subj_id = [int(s) for s in subj_id_list]

# # subj_id = data['subject'].to_list()

# old_id = np.array([ [s]*nb_trials for s in range(nb_subj) ]).flatten()

# delay_amt = data['cdd_delay_amt'].values
# delay_wait = data['cdd_delay_wait'].values
# immed_amt = data['cdd_immed_amt'].values
# immed_wait = data['cdd_immed_wait'].values
# alpha = data['alpha'].values
# choices = data['cdd_choice'].values


### Bayesian Hierarchical Model

Choice of prior distribution and parameters used to define were developed under cdd :: `parameter_receovery.ipynb`


In [35]:
# This is the meat of the script that is used to estimate the parameters of the BHM 

def run_BHM(save_dir,experiment,data):

    subjects,subj_id,delay_amt,delay_wait,immed_amt,immed_wait,alpha,choices =  extract_cols(data)

    tStep1 = time.time()

    # We will fit a model for each subject
    with pm.Model() as model_simple:

        # Hyperparameters for kappa and gamma
        # estimated from MLE approximations : np.exp(-3.60) = 0.0273, np.sqrt(1.71)=1.308
        mu_kappa_hyper = pm.Beta('mu_kappa_hyper',mu=np.exp(-3.60),sigma=0.01)
        sd_kappa_hyper = pm.Normal('sd_kappa_hyper',mu=np.sqrt(1.71),sigma=0.1)
        # estimated from MLE approximations : np.sqrt(2.30) = 1.517
        sd_gamma_hyper = pm.Normal('sd_hyper',mu=np.sqrt(2.30),sigma=0.1)

        kappa = pm.LogNormal('kappa',mu=mu_kappa_hyper,sigma=sd_kappa_hyper,shape=np.size(np.unique(subj_id)))
        gamma = pm.HalfNormal('gamma',sigma=sd_gamma_hyper,shape=np.size(np.unique(subj_id)))
        
        prob = pm.Deterministic('prob', 1 / (1 + pm.math.exp(-gamma[subj_id] * ( (delay_amt**alpha[subj_id])/(1+(kappa[subj_id]*delay_wait)) 
                                                                                - (immed_amt**alpha[subj_id])/(1+(kappa[subj_id]*immed_wait)) ))))

        y_1 = pm.Bernoulli('y_1',p=prob,observed=choices)

        trace_prior = pm.sample(10000, tune=1000, cores=5,target_accept=0.99,progressbar=True)


    # This is how you get a nice array. Note that this returns a pandas DataFrame, not a numpy array. Indexing is totally different.
    summary= az.summary(trace_prior,round_to=10)
    fn = os.path.join(save_dir,'BHM_model_summary_{}.csv'.format(experiment))
    print('Saving to : {}'.format(fn))
    summary.to_csv(fn)

    fn = os.path.join(save_dir,'BHM_model_trace_{}.pkl'.format(experiment))
    print('Saving to : {}'.format(fn))
    with open(fn,'wb') as buff:
        pickle.dump({'trace':trace_prior},buff)
        # pm.save_trace(trace_prior,fn)

    print('Time to complete {} aggregate BHM : {} minutes'.format(len(subjects),(time.time() - tStep1)/60.0))
    return trace_prior,subjects,subj_id

# Extract parameter estimates and save
def extract_mean(save_dir,var_names=['kappa','gamma'],subjects=[],experiment='v004'):
    fn = os.path.join(save_dir,'BHM_model_summary_{}.csv'.format(experiment))
    nb_subjects = len(subjects)
    df = pd.read_csv(fn,index_col=0)
    df_bhm = pd.DataFrame([],columns=var_names)
    df_bhm['subject'] = subjects
    for var in var_names:
        ind_list = ['{}[{}]'.format(var,sub_id) for sub_id in range(nb_subjects)]
        df_bhm[var] = df.loc[df.index.isin(ind_list)]['mean'].reset_index(drop=True)

    split_CDD_fn = os.path.join(os.path.dirname(save_dir),'split_CDD_nlh_BHM.csv')
    print('Saving split BHM : {}'.format(split_CDD_fn))
    df_bhm.to_csv(split_CDD_fn)
    display(df_bhm)
    
    return df_bhm


In [36]:
# Set the paths needed to find the data and load

# The user of the script can edit root_dir and dataset variables to get it to work for their dataset.
# This script will work given the data is stored in the appropriate BIDS format in the split directory
root_dir = '/Volumes/UCDN/datasets/'
dataset = 'SDM'

# log the BHM experiment version, in case we cahnge it in the future
experiment='v004'

utility_dir,save_dir,data = dirs_and_data(root_dir=root_dir,dataset=dataset)
trace_prior,subjects,subj_id = run_BHM(save_dir,experiment,data)
df_bhm = extract_mean(save_dir=save_dir,var_names=['kappa','gamma'],
                        subjects=subjects,experiment=experiment)
for s in set(subj_id):
    coords={'kappa_dim_0': [s],'gamma_dim_0':[s]}
    diganostic_plots(trace_prior,experiment=experiment,utility_dir=utility_dir,subject=subjects[s], 
                        task='cdd_nlh',coords=coords,var_names=['kappa','gamma'],figsize=(10,10))



Could not find alpha for subject : 23_SDM_0006_E
Could not find alpha for subject : 23_SDM_0012_N
Could not find alpha for subject : 23_SDM_0014_E


Saving to : /Volumes/UCDN/datasets/SDM/utility/BHM/cdd_nlh/BHM_model_summary_v004.csv
Saving to : /Volumes/UCDN/datasets/SDM/utility/BHM/cdd_nlh/BHM_model_trace_v004.pkl
Time to complete 16 aggregate BHM : 2.109278452396393 minutes
Saving split BHM : /Volumes/UCDN/datasets/SDM/utility/BHM/split_CDD_nlh_BHM.csv


Unnamed: 0,kappa,gamma,subject
0,0.004878,0.614883,23_SDM_0001_N
1,0.002902,0.211473,23_SDM_0002_E
2,0.005634,0.24477,23_SDM_0002_N
3,0.016741,0.38313,23_SDM_0004_E
4,0.012072,0.526142,23_SDM_0004_N
5,0.016348,0.481437,23_SDM_0005_E
6,0.015996,0.348708,23_SDM_0005_N
7,0.024382,0.406561,23_SDM_0006_E
8,0.026856,0.353819,23_SDM_0006_N
9,0.001025,2.025326,23_SDM_0009_E


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0001_N/cdd_nlh/bh


The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0002_E/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0002_N/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0004_E/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0004_N/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0005_E/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0005_N/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0006_E/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0006_N/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0009_E/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0009_N/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0011_E/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0011_N/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0012_E/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0012_N/cdd_nlh/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0014_E/cdd_nlh/bh


  plt.tight_layout()
