In [1]:
import numpy as np
import math
import scipy as sp
from scipy import optimize
from numpy import genfromtxt
import pandas as pd 
import glob
import pymc as pm
import arviz as az
import statistics as stats
import matplotlib.pyplot as plt
import os,sys

In [2]:
# Make the directory that contains the `IDM_model` package importable.
# `__file__` is not available inside a notebook, so the kernel's working
# directory is used instead of the script-style lookup kept below.
# current = os.path.dirname(os.path.realpath(__file__))
# NOTE: `current` is the directory one level ABOVE os.getcwd().
current = os.path.dirname(os.getcwd())

# The script version climbed two more levels from the file location; in the
# notebook the directory computed above is already the one to add.
# parent = os.path.dirname(os.path.dirname(current))
parent = current
# print(parent)
#/Users/pizarror/IDM

# Append only once so that re-running this cell does not accumulate
# duplicate entries in sys.path.
if parent not in sys.path:
    sys.path.append(parent)

from IDM_model.src import model_functions as mf


/Users/pizarror/IDM


In [3]:
def read_load_data(subject='23_IDM_0144',fn='/tmp',alpha0=1.0,cols=None):
    """Read one subject's CDD csv file and return its model-ready trial data.

    Parameters
    ----------
    subject : str
        Subject label written into the ``'subject'`` column of the result.
    fn : str
        Path of the csv file to read with ``pd.read_csv``.
    alpha0 : float
        Forwarded to ``mf.get_data`` as ``alpha_hat``.
    cols : list of str, optional
        Column names forwarded to ``mf.get_data``. Defaults to an empty list.

    Returns
    -------
    The first element returned by ``mf.get_data``, with a ``'subject'`` column
    added (presumably a DataFrame holding the columns named in ``cols`` --
    confirm against ``model_functions.get_data``).
    """
    # Avoid a mutable default argument: build a fresh list on every call.
    if cols is None:
        cols = []
    task = 'cdd'
    cdd_df = pd.read_csv(fn)
    # Filtering is delegated to model_functions (by their names: practice
    # trials first, then trials without a response).
    cdd_df = mf.drop_pract(cdd_df,task=task)
    # The second return value (response rate) is not needed here.
    cdd_df,_response_rate = mf.drop_non_responses(cdd_df,task=task,verbose=False)
    data = mf.get_data(cdd_df,cols,alpha_hat=alpha0)[0]
    data['subject'] = subject
    return data # vn,vr,tn,tr,choice

## Complete Pooling

Complete pooling ignores the group-level information and considers all data as belonging to the same category. All groups are described with the same model. 

We are using complete pooling to generate priors for when we implement a hierarchical Bayesian model.

### Load data

We will load the data from all participants so we can run the modeling schema

In [11]:
# Takes about 10 seconds

# we will change this when we change utility to 1st level analysis (or split)
split_dir = '/Volumes/UCDN/datasets/IDM/split/'
save_dir = '/Volumes/UCDN/datasets/IDM/BH/csv'
subjs = sorted(glob.glob(os.path.join(split_dir,'23_IDM_*')))
task = 'cdd'
cols = ['cdd_choice','cdd_immed_amt','cdd_delay_amt','cdd_immed_wait','cdd_delay_wait','alpha']

# Collect one frame per subject and concatenate once at the end, instead of
# growing a DataFrame inside the loop.
subj_frames = []
for s in subjs:
    subject = os.path.basename(s)
    fn = os.path.join(s,task,'{}_{}.csv'.format(subject,task))
    if not os.path.exists(fn):
        # Subjects without a task file are skipped.
        continue
    subj_data = read_load_data(subject=subject,fn=fn,alpha0=1.0,cols=cols)
    subj_data = subj_data.astype({c: float for c in cols})
    # Number only the subjects that were actually loaded, so ids are always
    # 0..n_loaded-1. The models index per-subject parameters with these ids and
    # size them by the number of unique ids, so gaps would index out of range.
    subj_data['subject_id'] = len(subj_frames)
    subj_frames.append(subj_data)

lead_cols = ['subject']+cols
if subj_frames:
    data = pd.concat(subj_frames,ignore_index=True)
    # Keep the column layout: subject, task columns, then anything else.
    data = data[lead_cols+[c for c in data.columns if c not in lead_cols]]
else:
    data = pd.DataFrame(columns=lead_cols+['subject_id'])

data.head(10)


Unnamed: 0,subject,cdd_choice,cdd_immed_amt,cdd_delay_amt,cdd_immed_wait,cdd_delay_wait,alpha,subject_id
0,23_IDM_0001,1.0,5.0,26.0,0.0,29.0,1.0,0.0
1,23_IDM_0001,1.0,2.0,22.0,0.0,90.0,1.0,0.0
2,23_IDM_0001,0.0,2.0,6.0,0.0,90.0,1.0,0.0
3,23_IDM_0001,0.0,15.0,15.0,0.0,151.0,1.0,0.0
4,23_IDM_0001,1.0,15.0,55.0,0.0,90.0,1.0,0.0
5,23_IDM_0001,1.0,2.0,65.0,0.0,29.0,1.0,0.0
6,23_IDM_0001,0.0,15.0,20.0,0.0,59.0,1.0,0.0
7,23_IDM_0001,1.0,15.0,26.0,0.0,6.0,1.0,0.0
8,23_IDM_0001,0.0,5.0,10.0,0.0,149.0,1.0,0.0
9,23_IDM_0001,0.0,5.0,4.0,0.0,31.0,1.0,0.0


### 1. Simple Hierarchical Model

We pooled all data together and ran simple BH model in complete_pool_as_prior.

Here we can select the number of subjects we want to analyze


In [13]:

# Pick the subjects to analyse. Every subject is kept by default; enable the
# slice below to work on a smaller subset.
subjects = data['subject'].unique()
# subjects = subjects[:10]
# print(subjects)
data = data[data['subject'].isin(subjects)]


In [14]:

# Bookkeeping for the selected subjects.
subjects = data['subject'].unique()
nb_subj = len(subjects)
nb_trials = len(data)//nb_subj
# Per-trial subject index, as plain Python ints, used to index the
# per-subject parameters in the models.
subj_id_list = data['subject_id'].to_list()
subj_id = list(map(int, subj_id_list))
# Legacy index: each subject number repeated nb_trials times.
old_id = np.repeat(np.arange(nb_subj), nb_trials)

# Per-trial task variables as arrays.
delay_amt = data['cdd_delay_amt'].to_numpy()
delay_wait = data['cdd_delay_wait'].to_numpy()
immed_amt = data['cdd_immed_amt'].to_numpy()
immed_wait = data['cdd_immed_wait'].to_numpy()
choices = data['cdd_choice'].to_numpy()


### Load results from completely_pooled_model

We use the kappa and gamma estimates from the previously run completely pooled model.

In [7]:

# Read the summary table written for the completely pooled model; its first
# column is used as the row index.
fn = os.path.join(save_dir,'completely_pooled_model.csv')
pool_model = pd.read_csv(fn,index_col=0)

# Mean and SD reported for kappa and gamma in that summary.
mu_kappa_hat = pool_model.loc['kappa[0]','mean']
std_kappa_hat = pool_model.loc['kappa[0]','sd']
mu_gamma_hat = pool_model.loc['gamma[0]','mean']
std_gamma_hat = pool_model.loc['gamma[0]','sd']

print('For kappa, use the following (mu,sigma) : ({},{})'.format(mu_kappa_hat, std_kappa_hat))
print('For gamma, use the following (mu,sigma) : ({},{})'.format(mu_gamma_hat, std_gamma_hat))

For kappa, use the following (mu,sigma) : (0.0199494192,0.0004967748)
For gamma, use the following (mu,sigma) : (0.1290928433,0.0027119263)


In [8]:
# One (kappa, gamma) pair is estimated per subject.
n_subjects = np.size(np.unique(subj_id))

with pm.Model() as model_simple:

    # Hyperprior on the scale of kappa, centred on the SD from the pooled fit.
    # (A hyperprior on the mean is not used in this variant.)
    # mu_kappa_hyper = pm.Beta('mu_kappa_hyper',mu=mu_kappa_hat,sigma=0.1)
    sd_kappa_hyper = pm.Beta('sd_kappa_hyper',mu=std_kappa_hat,sigma=0.01)

    # Subject-level parameters, both constrained to be positive.
    # kappa = pm.Beta('kappa',mu=mu_kappa_hyper,sigma=std_kappa_hat,shape=np.size(np.unique(subj_id)))
    kappa = pm.HalfNormal('kappa',sigma=sd_kappa_hyper,shape=n_subjects)
    gamma = pm.HalfNormal('gamma',sigma=0.02,shape=n_subjects)

    # Hyperbolically discounted value of each option on every trial.
    sv_delay = delay_amt/(1+(kappa[subj_id]*delay_wait))
    sv_immed = immed_amt/(1+(kappa[subj_id]*immed_wait))

    # Logistic choice rule on the value difference, scaled by gamma.
    prob = pm.Deterministic('prob', 1 / (1 + pm.math.exp(-gamma[subj_id] * ( sv_delay - sv_immed ))))

    y_1 = pm.Bernoulli('y_1',p=prob,observed=choices)

    trace_prior = pm.sample(10000, tune=10000, cores=5,target_accept=0.95)

# Allow enough subplots for one trace panel per subject and parameter.
az.rcParams["plot.max_subplots"] = 600
az.plot_trace(trace_prior, var_names=["kappa","gamma"],compact=False)

# az.summary returns a pandas DataFrame (not a numpy array); write it to disk
# right away so the results survive a closed or refreshed notebook.
summary = az.summary(trace_prior,round_to=10)
fn = os.path.join(save_dir,"simple_BH_model.csv")
print('Saving to : {}'.format(fn))
summary.to_csv(fn)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (5 chains in 5 jobs)
NUTS: [sd_kappa_hyper, kappa, gamma]
Process worker_chain_0:
Traceback (most recent call last):
  File "/Users/pizarror/opt/anaconda3/envs/idm_jupy/lib/python3.9/site-packages/pymc/sampling/parallel.py", line 122, in run
    self._start_loop()
  File "/Users/pizarror/opt/anaconda3/envs/idm_jupy/lib/python3.9/site-packages/pymc/sampling/parallel.py", line 161, in _start_loop
    msg = self._recv_msg()
  File "/Users/pizarror/opt/anaconda3/envs/idm_jupy/lib/python3.9/site-packages/pymc/sampling/parallel.py", line 153, in _recv_msg
    return self._msg_pipe.recv()
  File "/Users/pizarror/opt/anaconda3/envs/idm_jupy/lib/python3.9/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/Users/pizarror/opt/anaconda3/envs/idm_jupy/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/Users/pizarr

KeyboardInterrupt: 

## Working set
The sets of parameters below seem to generate posteriors for kappa and gamma that have some variability... is it correct? Not sure; we have to plot against the MLE to find out, or we can use the estimates to generate a softmax.

In [None]:


# One (kappa, gamma) pair is estimated per subject.
n_subjects = np.size(np.unique(subj_id))

with pm.Model() as model_simple:

    # Hyperpriors on the mean and SD of kappa, centred on the pooled estimates.
    mu_kappa_hyper = pm.Beta('mu_kappa_hyper',mu=mu_kappa_hat,sigma=0.1)
    sd_kappa_hyper = pm.Beta('sd_kappa_hyper',mu=std_kappa_hat,sigma=0.01)

    # Subject-level kappa is Beta distributed around the group mean;
    # gamma is a positive HalfNormal.
    kappa = pm.Beta('kappa',mu=mu_kappa_hyper,sigma=sd_kappa_hyper,shape=n_subjects)
    # kappa = pm.HalfNormal('kappa',sigma=sd_kappa_hyper,shape=np.size(np.unique(subj_id)))
    gamma = pm.HalfNormal('gamma',sigma=0.05,shape=n_subjects)

    # Hyperbolically discounted value of each option on every trial.
    sv_delay = delay_amt/(1+(kappa[subj_id]*delay_wait))
    sv_immed = immed_amt/(1+(kappa[subj_id]*immed_wait))

    # Logistic choice rule on the value difference, scaled by gamma.
    prob = pm.Deterministic('prob', 1 / (1 + pm.math.exp(-gamma[subj_id] * ( sv_delay - sv_immed ))))

    y_1 = pm.Bernoulli('y_1',p=prob,observed=choices)

    trace_prior = pm.sample(10000, tune=10000, cores=2,target_accept=0.98)


# Allow enough subplots for one trace panel per subject and parameter.
az.rcParams["plot.max_subplots"] = 600
az.plot_trace(trace_prior, var_names=["kappa","gamma"],compact=False)

# az.summary returns a pandas DataFrame (not a numpy array); write it to disk
# right away so the results survive a closed or refreshed notebook.
summary = az.summary(trace_prior,round_to=10)
fn = os.path.join(save_dir,"simple_BH_model.csv")
print('Saving to : {}'.format(fn))
summary.to_csv(fn)

## Exploratory

Let's find out when the hyperparameters break down by increasing each one slowly.

In [None]:
# Candidate prior widths for each sweep, with the point at which sampling was
# observed to fail.
# NOTE(review): the two kappa break points look consistent with the
# Beta(mu, sigma) requirement sigma**2 < mu*(1-mu) applied to the pooled
# estimates (roughly sigma < 0.14 and sigma < 0.022) -- confirm.

# SD of the Beta prior on mu_kappa_hyper: 0.12 worked, 0.15 broke
muk_sigma_prior = [0.12, 0.15, 0.17, 0.2]

# SD of the Beta prior on sd_kappa_hyper: 0.02 worked, 0.03 broke
sdk_sigma_prior = [0.01, 0.02, 0.03, 0.04, 0.05]

# SD of the HalfNormal prior on gamma: 15.0 worked, 20.0 broke
g_sigma_prior = [15.0, 20.0, 30.0]


In [None]:
# Sweep the HalfNormal scale of gamma while the two kappa hyperprior widths
# stay fixed, and stop at the first value for which the model fails.
# sigma (SD) for gamma HalfNormal worked for 10.0, broke at 50.0
g_sigma_prior = [15.0, 20.0, 30.0]

for igs in g_sigma_prior:

    imks = 0.10 # worked for 0.12 but we will keep it just below
    print('We are setting SD (sigma) on the prior of the mean of kappa to : {}'.format(imks))
    isks = 0.02 # worked for 0.02 we will try to keep it here, could move down to 0.01
    print('We are setting SD (sigma) on the prior of the SD of kappa to : {}'.format(isks))
    print('We are setting SD (sigma) on the HalfNormal of gamma to : {}'.format(igs))

    try:
        with pm.Model() as model_simple:

            # Hyperparameters for k
            mu_kappa_hyper = pm.Beta('mu_kappa_hyper',mu=mu_kappa_hat,sigma=imks)
            sd_kappa_hyper = pm.Beta('sd_kappa_hyper',mu=std_kappa_hat,sigma=isks)

            # use above mean and stdev to define kappa and gamma, the posterior from the pooled is now our prior
            kappa = pm.Beta('kappa',mu=mu_kappa_hyper,sigma=sd_kappa_hyper,shape=np.size(np.unique(subj_id)))
            # kappa = pm.HalfNormal('kappa',sigma=sd_kappa_hyper,shape=np.size(np.unique(subj_id)))
            gamma = pm.HalfNormal('gamma',sigma=igs,shape=np.size(np.unique(subj_id)))

            # Logistic choice rule on the difference of hyperbolically discounted values
            prob = pm.Deterministic('prob', 1 / (1 + pm.math.exp(-gamma[subj_id] * ( delay_amt/(1+(kappa[subj_id]*delay_wait))
                                                                                    - immed_amt/(1+(kappa[subj_id]*immed_wait)) ))))

            y_1 = pm.Bernoulli('y_1',p=prob,observed=choices)

            trace_prior = pm.sample(10000, tune=10000, cores=5,target_accept=0.95)
            # trace_prior = pm.sample(100, tune=100, cores=1,target_accept=0.5)

    except Exception as e:
        # Only ordinary exceptions are handled here. KeyboardInterrupt and
        # SystemExit are deliberately NOT caught, so the sweep can still be
        # interrupted by hand instead of silently moving to the next value.
        print(e)
        # Model.debug() prints its own report and returns None, so it is called
        # directly rather than wrapped in print() (which would print "None").
        model_simple.debug(verbose=True)
        print("SamplingError : lets continue to the next parameter")
        # Stop this sweep at the first failing value.
        break



print('we are ready to continue!')
