# Bayesian parameter estimation 

Written for the CDD task, as part of the IDM dataset collected online with MTurk.

In [16]:
# Built-in/Generic Imports
import os,sys
import glob
import time

# Libs
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
import pickle

import logging
logger = logging.getLogger("pymc")
logger.setLevel(logging.ERROR)


In [17]:
# Make the IDM project root importable so that the `IDM_model` package is found.
# `__file__` is not defined inside a notebook, so the working directory is used instead.
# current = os.path.dirname(os.path.realpath(__file__))
current = os.path.dirname(os.getcwd())

# Location of the IDM checkout. The default is the original author's path;
# set the IDM_ROOT environment variable to point somewhere else without editing the notebook.
# parent = os.path.dirname(os.path.dirname(current))
# parent = current
parent = os.environ.get('IDM_ROOT','/Users/pizarror/IDM')
# print(parent)
#/Users/pizarror/IDM

# Add the project root to sys.path only once, so re-running this cell
# does not keep appending duplicate entries.
if parent not in sys.path:
    sys.path.append(parent)

from IDM_model.src import model_functions as mf


In [18]:
def read_load_data(subject='23_IDM_0144',fn='/tmp',alpha0=1.0,cols=[]):
    """Read one subject's CDD csv file and return it prepared for modelling.

    The raw table is passed through ``mf.drop_pract`` and
    ``mf.drop_non_responses`` (presumably removing practice trials and
    unanswered trials -- confirm in ``model_functions``), after which
    ``mf.get_data`` is called with ``cols``.

    Args:
        subject: identifier written into the ``subject`` column of the result.
        fn: path of the csv file to read.
        alpha0: value forwarded to ``mf.get_data`` as ``alpha_hat``.
        cols: column names forwarded to ``mf.get_data``.

    Returns:
        The first element of what ``mf.get_data`` returns, with an added
        ``subject`` column.
    """
    cdd_task = 'cdd'
    raw_df = pd.read_csv(fn)
    trials_df = mf.drop_pract(raw_df,task=cdd_task)
    # the second return value (named response_rate in earlier versions) is not used here
    trials_df,_ = mf.drop_non_responses(trials_df,task=cdd_task,verbose=False)
    subject_data = mf.get_data(trials_df,cols,alpha_hat=alpha0)[0]
    subject_data['subject'] = subject
    return subject_data

def _save_eps_and_close(bh_dir,subject,task,plot_name,experiment):
    """Save the current pyplot figure as ``<subject>_<task>_<plot_name>.<experiment>.eps`` in bh_dir and close it."""
    fig_fn = os.path.join(bh_dir,'{}_{}_{}.{}.eps'.format(subject,task,plot_name,experiment))
    plt.savefig(fig_fn,format='eps')
    plt.close()
    return fig_fn


def diganostic_plots(trace,experiment='experiment',utility_dir='/tmp/',subject='23_IDM_0001',task='cdd',coords={},var_names=['kappa','gamma'],figsize=(10,10)):
    """Write trace, pair, posterior and rank diagnostic plots for one subject.

    Five EPS files are written to ``<utility_dir>/<subject>/<task>/bh``
    (the directory is created when missing): trace plot, bivariate
    densities, posterior, rank plot (bars) and rank plot (lines).

    Args:
        trace: sampling result passed unchanged to the arviz plotting functions.
        experiment: label embedded in every output file name.
        utility_dir: root directory under which the per-subject folder lives.
        subject: subject label, used for the folder, file names and plot titles.
        task: task label, used for the folder and file names.
        coords: arviz ``coords`` selection forwarded to every plot.
        var_names: variables to plot. The pair plot labels use
            ``var_names[0]`` and ``var_names[1]``, so two names are expected.
        figsize: figure size for the two rank plots.

    Returns:
        None. The function is called for its side effect of writing figures.
    """
    bh_dir = os.path.join(utility_dir,subject,task,'bh')
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(bh_dir,exist_ok=True)
    print('Saving diagnostic plots to bh_dir : {}'.format(bh_dir))

    title_dict = {'fontsize':15}

    # Trace plot: one row per variable; both panels of a row get the same title
    axes = az.plot_trace(trace, var_names=var_names,coords=coords,compact=False)
    for r in range(axes.shape[0]):
        for c in range(axes.shape[1]):
            axes[r,c].set_title('{}: {}'.format(subject,var_names[r]))
    plt.tight_layout()
    _save_eps_and_close(bh_dir,subject,task,'trace_plot',experiment)

    # Bivariate KDE of the two variables with marginals
    axes = az.plot_pair(trace,kind='kde', coords=coords,var_names=var_names,marginals=True)
    axes[0,0].set_title(subject,fontdict=title_dict)
    axes[1,0].set_ylabel(var_names[1])
    axes[1,0].set_xlabel(var_names[0])
    plt.tight_layout()
    _save_eps_and_close(bh_dir,subject,task,'bivariate_densities',experiment)

    # Posterior densities, one panel per variable
    axes = az.plot_posterior(trace,var_names=var_names,coords=coords)
    for c in range(axes.shape[0]):
        axes[c].set_title('{}: {}'.format(subject,var_names[c]),fontdict=title_dict)
    _save_eps_and_close(bh_dir,subject,task,'posterior',experiment)

    # Rank plot drawn as bars
    fig, axes = plt.subplots(1,len(var_names), figsize=figsize)
    az.plot_rank(trace,var_names=var_names,coords=coords,ax=axes)
    for c in range(axes.shape[0]):
        axes[c].set_title('{}: {}'.format(subject,var_names[c]),fontdict=title_dict)
    fig.tight_layout()
    _save_eps_and_close(bh_dir,subject,task,'rank_plot_bars',experiment)

    # Rank plot drawn as vertical lines
    fig, axes = plt.subplots(1,len(var_names), figsize=figsize)
    axes = az.plot_rank(trace,var_names=var_names, coords=coords,kind="vlines",vlines_kwargs={'lw':0}, marker_vlines_kwargs={'lw':3},ax=axes)
    for c in range(axes.shape[0]):
        axes[c].set_title('{}: {}'.format(subject,var_names[c]),fontdict=title_dict)
    fig.tight_layout()
    _save_eps_and_close(bh_dir,subject,task,'rank_plot_lines',experiment)




## Bayesian Hierarchical Model (BHM)

We pooled all data together and ran a simple BH model in `complete_pool_as_prior`.

We use complete pooling to generate the priors for the hierarchical Bayesian model.

### Load data

We will load the data from all participants so we can run the modeling schema

In [19]:
# Takes about 10 seconds

# we will change this when we change utility to 1st level analysis (or split)
root_dir = '/Volumes/UCDN/datasets/'
dataset = 'SDM'
dataset_dir = os.path.join(root_dir,dataset)

split_dir = os.path.join(dataset_dir,'split')
utility_dir = os.path.join(dataset_dir,'utility')
save_dir = os.path.join(utility_dir,'BHM/cdd/')
# exist_ok makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(save_dir,exist_ok=True)
subjs = sorted(glob.glob(os.path.join(split_dir,'*')))
task = 'cdd'
cols = ['cdd_choice','cdd_immed_amt','cdd_delay_amt','cdd_immed_wait','cdd_delay_wait','alpha']

# Collect one frame per subject and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop is quadratic, and seeding it with
# an empty frame relies on deprecated pandas behaviour and turned subject_id
# into a float column.
subj_frames = []
# running index assigned to each subject whose csv file exists
subj_id = 0

for s in subjs:
    subject = os.path.basename(s)
    fn  = os.path.join(s,task,'{}_{}.csv'.format(subject,task))
    if not os.path.exists(fn):
        continue
    subj_data = read_load_data(subject=subject,fn=fn,alpha0=1.0,cols=cols)
    subj_data[cols] = subj_data[cols].astype(float)
    subj_data['subject_id'] = int(subj_id)
    subj_frames.append(subj_data)
    subj_id = subj_id+1

if subj_frames:
    data = pd.concat(subj_frames,ignore_index=True)
    # keep the previous column layout: subject first, then the task columns, then anything else
    lead_cols = ['subject']+cols
    data = data[lead_cols+[c for c in data.columns if c not in lead_cols]]
else:
    # no subject files found: keep an empty frame with the expected columns
    data = pd.DataFrame(columns=['subject']+cols+['subject_id'])

data.head(100)


Unnamed: 0,subject,cdd_choice,cdd_immed_amt,cdd_delay_amt,cdd_immed_wait,cdd_delay_wait,alpha,subject_id
0,23_SDM_0001_N,1.0,2.0,65.0,0.0,29.0,1.0,0.0
1,23_SDM_0001_N,0.0,10.0,21.0,0.0,150.0,1.0,0.0
2,23_SDM_0001_N,0.0,20.0,39.0,0.0,150.0,1.0,0.0
3,23_SDM_0001_N,0.0,2.0,11.0,0.0,151.0,1.0,0.0
4,23_SDM_0001_N,1.0,10.0,31.0,0.0,29.0,1.0,0.0
...,...,...,...,...,...,...,...,...
95,23_SDM_0001_N,1.0,20.0,65.0,0.0,5.0,1.0,0.0
96,23_SDM_0002_E,0.0,20.0,18.0,0.0,90.0,1.0,1.0
97,23_SDM_0002_E,1.0,20.0,26.0,0.0,31.0,1.0,1.0
98,23_SDM_0002_E,1.0,2.0,7.0,0.0,5.0,1.0,1.0


In [20]:
# Optional subject filter, e.g. keep only the first N=10 subjects.
# While the slice below stays commented out, every subject is kept.
subjects = data['subject'].unique()
# subjects = subjects[:10]
print(subjects)
keep_rows = data['subject'].isin(subjects)
data = data.loc[keep_rows]


['23_SDM_0001_N' '23_SDM_0002_E' '23_SDM_0002_N' '23_SDM_0004_E'
 '23_SDM_0004_N' '23_SDM_0005_E' '23_SDM_0005_N']


In [21]:
# Subject bookkeeping derived from the (possibly filtered) data
subjects = data['subject'].unique()
nb_subj = len(subjects)
# integer number of rows per subject (total rows // number of subjects)
nb_trials = data.shape[0]//nb_subj
subj_id_list = data['subject_id'].to_list()
# plain Python ints, used to index the per-subject parameters in the model
subj_id = list(map(int,subj_id_list))
# old_id = np.array([ [s]*nb_trials for s in range(nb_subj) ]).flatten()

# Per-trial arrays fed to the model, one value per row of `data`
delay_amt,delay_wait,immed_amt,immed_wait,choices = (
    data[col].values
    for col in ['cdd_delay_amt','cdd_delay_wait','cdd_immed_amt','cdd_immed_wait','cdd_choice']
)

### Bayesian Hierarchical Model

Developed under parameter recovery.


In [22]:
tStep1 = time.time()

# Fixed seed so that "Restart & Run All" reproduces the same posterior draws.
RANDOM_SEED = 42

# Number of distinct subject indices; kappa and gamma get one entry per subject.
nb_subj_ids = np.size(np.unique(subj_id))

# One hierarchical model is fitted jointly to all subjects, with per-subject kappa and gamma
with pm.Model() as model_simple:

    # Hyperparameters for kappa and gamma
    # estimated from MLE approximations : np.exp(-3.60) = 0.0273, np.sqrt(1.71)=1.308
    mu_kappa_hyper = pm.Beta('mu_kappa_hyper',mu=np.exp(-3.60),sigma=0.01)
    sd_kappa_hyper = pm.Normal('sd_kappa_hyper',mu=np.sqrt(1.71),sigma=0.1)
    # estimated from MLE approximations : np.sqrt(2.30) = 1.517
    # NOTE(review): this variable is registered as 'sd_hyper', not 'sd_gamma_hyper';
    # the label is kept because it appears in the saved summary/trace files.
    sd_gamma_hyper = pm.Normal('sd_hyper',mu=np.sqrt(2.30),sigma=0.1)

    # NOTE(review): LogNormal's `mu` is the mean of log(kappa). Here it receives a Beta
    # variable centred on exp(-3.60) ~ 0.027 rather than -3.60 itself, which puts the prior
    # median of kappa near exp(0.027) ~ 1. Confirm this parameterisation is intended.
    kappa = pm.LogNormal('kappa',mu=mu_kappa_hyper,sigma=sd_kappa_hyper,shape=nb_subj_ids)
    gamma = pm.HalfNormal('gamma',sigma=sd_gamma_hyper,shape=nb_subj_ids)

    # Choice probability: logistic function of gamma * (SV_delay - SV_immed), where each
    # subjective value is amount / (1 + kappa * wait), using the subject's own kappa and gamma.
    # NOTE(review): being a Deterministic, 'prob' is stored for every trial and every draw,
    # which makes the trace and the summary table large.
    prob = pm.Deterministic('prob', 1 / (1 + pm.math.exp(-gamma[subj_id] * ( delay_amt/(1+(kappa[subj_id]*delay_wait))
                                                                            - immed_amt/(1+(kappa[subj_id]*immed_wait)) ))))

    y_1 = pm.Bernoulli('y_1',p=prob,observed=choices)

    trace_prior = pm.sample(10000, tune=1000, cores=5,target_accept=0.99,progressbar=True,random_seed=RANDOM_SEED)



# This is how you get a nice array. Note that this returns a pandas DataFrame, not a numpy array. Indexing is totally different.
summary= az.summary(trace_prior,round_to=10)
fn = os.path.join(save_dir,'BHM_model_summary_v004.csv')
print('Saving to : {}'.format(fn))
summary.to_csv(fn)

# NOTE(review): the trace is pickled; only unpickle files from a trusted source.
fn = os.path.join(save_dir,'BHM_model_trace_v004.pkl')
print('Saving to : {}'.format(fn))
with open(fn,'wb') as buff:
    pickle.dump({'trace':trace_prior},buff)
    # pm.save_trace(trace_prior,fn)

print('Time to complete {} aggregate BHM : {} minutes'.format(len(subjects),(time.time() - tStep1)/60.0))


Saving to : /Volumes/UCDN/datasets/SDM/utility/BHM/cdd/BHM_model_summary_v004.csv
Saving to : /Volumes/UCDN/datasets/SDM/utility/BHM/cdd/BHM_model_trace_v004.pkl
Time to complete 7 aggregate BHM : 1.123501431941986 minutes


### Extract parameter estimates and save

We can incorporate this into the script above

In [23]:
# Preview the first 20 rows of the posterior summary table computed above
summary.head(20)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
sd_kappa_hyper,1.635241,0.077997,1.487813,1.779871,0.0003450194,0.0002447217,51201.150818,37724.361471,1.000092
sd_hyper,1.472263,0.099932,1.289415,1.66503,0.0004330875,0.0003062408,53197.547916,37391.396544,1.000315
mu_kappa_hyper,0.026161,0.009522,0.009885,0.044107,3.9192e-05,2.78981e-05,53683.271714,31423.659637,0.999998
kappa[0],0.00507,0.00105,0.003093,0.007063,5.0695e-06,3.5847e-06,41651.987696,25539.561544,1.000242
kappa[1],0.00351,0.001698,0.000611,0.006558,7.6253e-06,5.3919e-06,40383.423544,25307.898957,1.000004
kappa[2],0.006088,0.001916,0.00248,0.009609,8.4056e-06,5.9437e-06,47131.30225,25843.536524,1.000069
kappa[3],0.017216,0.002852,0.011991,0.022568,1.31447e-05,9.3721e-06,46734.625379,31275.757574,1.000121
kappa[4],0.01241,0.00177,0.009101,0.015737,8.588e-06,6.0726e-06,42012.659847,29571.425418,1.000373
kappa[5],0.016783,0.002534,0.012057,0.021468,1.16088e-05,8.2826e-06,47656.496801,32170.130689,1.000125
kappa[6],0.016527,0.003091,0.0108,0.022297,1.38009e-05,9.8428e-06,49470.401365,31349.186618,1.000073


In [24]:
def extract_mean(fn,var_names=['kappa','gamma'],subjects=[]):
    """Collect per-subject posterior means from a saved summary csv.

    The csv is expected to be indexed by variable label (first column) and to
    contain a ``mean`` column, with per-subject rows labelled ``<var>[<i>]``
    where ``i`` is the subject's position in ``subjects``.

    Each label is looked up explicitly, so the result does not depend on the
    row order of the file; a label that is absent yields NaN for that subject
    only instead of shifting the remaining values.

    Args:
        fn: path of the summary csv file.
        var_names: parameter names to extract, one output column each.
        subjects: subject labels; position ``i`` maps to ``<var>[i]``.

    Returns:
        DataFrame with one row per subject: one column per entry of
        ``var_names`` followed by a ``subject`` column.
    """
    nb_subjects = len(subjects)
    posterior_means = pd.read_csv(fn,index_col=0)['mean']
    columns = {}
    for var in var_names:
        ind_list = ['{}[{}]'.format(var,sub_id) for sub_id in range(nb_subjects)]
        # reindex keeps the values in subject order whatever the file order is
        columns[var] = posterior_means.reindex(ind_list).to_numpy()
    columns['subject'] = subjects
    return pd.DataFrame(columns)

# Read the saved summary for this experiment, pull out the per-subject
# posterior means and write them next to the summary file.
experiment = 'v004'
bhm_dir = save_dir
summary_name = 'BHM_model_summary_{}.csv'.format(experiment)
bhm_fn = os.path.join(bhm_dir,summary_name)
df_bhm = extract_mean(bhm_fn,subjects=subjects,var_names=['kappa','gamma'])
split_CDD_fn = os.path.join(bhm_dir,'split_CDD_BHM.csv')
df_bhm.to_csv(split_CDD_fn)
df_bhm

Unnamed: 0,kappa,gamma,subject
0,0.00507,0.595353,23_SDM_0001_N
1,0.00351,0.206604,23_SDM_0002_E
2,0.006088,0.238005,23_SDM_0002_N
3,0.017216,0.370271,23_SDM_0004_E
4,0.01241,0.507587,23_SDM_0004_N
5,0.016783,0.464131,23_SDM_0005_E
6,0.016527,0.336708,23_SDM_0005_N


### Diagnostic plots

Too many subjects to run `diganostic_plots()` on all of them at once, but it can be run for each subject individually.
Trace, posterior, rank plots

... need to figure out for each subject how to plot_pair() which plots the bivariate distributions


In [25]:
# Map each integer subject index to its label straight from the data, instead of
# assuming that the position of a label in `subjects` equals its subject_id.
id_to_subject = dict(zip(data['subject_id'].astype(int),data['subject']))

# sorted() gives a deterministic plotting order; set iteration order is not guaranteed
for s in sorted(set(subj_id)):
    # select this subject's entry of the per-subject kappa and gamma vectors
    coords={'kappa_dim_0': [s],'gamma_dim_0':[s]}
    diganostic_plots(trace_prior,experiment=experiment,utility_dir=utility_dir,subject=id_to_subject[s],task=task,coords=coords,var_names=['kappa','gamma'],figsize=(10,10))


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0001_N/cdd/bh


The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
  plt.tight_layout()


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0002_E/cdd/bh


The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0002_N/cdd/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0004_E/cdd/bh


  plt.tight_layout()


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0004_N/cdd/bh


The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0005_E/cdd/bh


  plt.tight_layout()
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.


Saving diagnostic plots to bh_dir : /Volumes/UCDN/datasets/SDM/utility/23_SDM_0005_N/cdd/bh


  plt.tight_layout()
