In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sn
import gc

from joblib import Parallel, delayed
from tqdm import tqdm
from easydict import EasyDict as edict
from sklearn.linear_model import LogisticRegression

from samplers import *
from datgen import *

import scipy.stats as sp



# Continuous Outcome

In [2]:
def parallel_unit(i,
                  D_lst,
                  summaryDs_lst,
                  method='BLR',
                  bal_method='NearMatch',
                  cov_adj=True,
                  y_type='C',
                  random_state=2021):
    """Fit one replication with the requested sampler and return a compact result.

    Parameters
    ----------
    i : int
        Index of the replication to fit; used to pick ``D_lst[i]`` and
        ``summaryDs_lst[i]``.
    D_lst : sequence
        Current-study data sets, one per replication.
    summaryDs_lst : sequence
        Historical summaries, one per replication (only passed on to the
        sampler when ``method == 'UIP'``).
    method : {'BLR', 'UIP'}
        'BLR' calls ``BLR``; 'UIP' calls ``UIP_Dirichlet``.
    bal_method : str
        Forwarded to ``UIP_Dirichlet`` only.
    cov_adj, y_type, random_state
        Forwarded unchanged to the selected sampler.

    Returns
    -------
    EasyDict
        Keys ``theta1``, ``M`` and ``pis``. For 'BLR' the last two are ``None``;
        for 'UIP' they are copied from the sampler output. Only these three
        entries are kept so that the object sent back from the worker (and
        later pickled) stays small.

    Raises
    ------
    ValueError
        If ``method`` is neither 'BLR' nor 'UIP'.
    """
    # Reject unknown methods up front instead of failing later on an
    # unassigned local variable.
    if method not in ('BLR', 'UIP'):
        raise ValueError(
            "method must be 'BLR' or 'UIP', got {!r}".format(method))

    D = D_lst[i]
    summaryDs = summaryDs_lst[i]
    result_dict = edict()

    if method == 'BLR':
        result = BLR(D,cov_adj=cov_adj,
                     y_type=y_type,
                     random_state=random_state)
        result_dict.theta1 = result['theta1']
        # BLR has no borrowing-related outputs; keep the keys for a uniform schema.
        result_dict.M = None
        result_dict.pis = None

    else:
        result = UIP_Dirichlet(D,summaryDs,
                               bal_method=bal_method,
                               cov_adj=cov_adj,
                               y_type=y_type,
                               gammas_ps=False,
                               random_state=random_state)
        result_dict.theta1 = result['theta1']
        result_dict.M = result['M']
        result_dict.pis = result['pis']

    # Return the compact dict that was assembled above (the original returned
    # the raw sampler output and silently discarded result_dict).
    return result_dict


def method_eval(theta_pred_mat,theta,trim_rate=0.005):
    """Summarise point estimates and intervals across replications.

    Parameters
    ----------
    theta_pred_mat : 2-D array
        One row per replication; column 0 is the point estimate, column 1 the
        lower interval bound and column 2 the upper interval bound.
    theta : indexable
        Only ``theta[1]`` is read; it is the reference value the estimates are
        compared against.
    trim_rate : float
        Proportion handed to ``scipy.stats.trim_mean`` for every average
        (including the coverage indicator). ``0`` switches to plain means.

    Returns
    -------
    numpy.ndarray
        ``[bias, rmse, ci_width, ci_coverage]``.
    """
    target = theta[1]
    estimate = theta_pred_mat[:,0]
    lower = theta_pred_mat[:,1]
    upper = theta_pred_mat[:,2]

    def average(values):
        # Plain mean when trimming is disabled, otherwise a trimmed mean that
        # guards against a few rare outlying replications.
        if trim_rate==0:
            return np.mean(values)
        return sp.trim_mean(values,trim_rate)

    covered = (upper>=target)*(lower<=target)

    bias = average(estimate)-target
    rmse = np.sqrt(average((estimate-target)**2))
    ci_width = average(upper-lower)
    ci_coverage = average(covered)

    return np.array([bias, rmse, ci_width, ci_coverage])

In [3]:
# Run tag (labelled as a date); it is embedded in the result folder paths.
date = '0705'

# Scenario ids to simulate; each id is also used (as a string) to look up the
# per-scenario settings.
scenarios = [4, 5]

# Number of replications, worker processes for joblib, and the seed handed to
# the samplers.
reps, num_cores, random_state = 200, 10, 2021

# Covariate adjustment switches: one for building the historical (RWD)
# summaries, one for fitting the current-study (RCT) models.
cov_adj_datgen = cov_adj_pred = True

In [4]:
# ---- Design constants for the continuous-outcome simulation ----
nH = 500           # sample size of each historical data set
n = 200            # sample size of the current data set
K = 3              # number of historical data sets
rho = 0.1          # correlation coefficient
d = 6              # dimension of the covariates
y_type = 'C'       # outcome type: 'C' => continuous, 'B' => binary
m_threshold = 0.1  # balance threshold

# Response-surface coefficients (length d + 2). Slot 1 is the value that
# method_eval uses as the true treatment effect; slots 2.. are the ones that
# get jittered per historical data set. Slot 0 is presumably the intercept.
theta = np.array([1.0, 1.0] + [1.0] * d)

# Covariate covariance of the current study: 1 on the diagonal, rho elsewhere.
sigmat = np.full((d, d), rho) + (1 - rho) * np.eye(d)

# Treatment effect of each of the K historical data sets, keyed by scenario.
theta1 = {
    '1': [1, 1, 1],
    '2': [0.8, 1.1, 1.3],
    '3': [0.8, 1.1, 1.3],
    '4': [0.1, 0.5, 0.9],
    '5': [2, 3, 4],
}

# Covariate-generation settings of each historical data set, keyed by scenario.
X_means = {
    '1': [0, 0, 0],
    '2': [0, 0, 0],
    '3': [0.5, 1, 1.5],
    '4': [0.5, 1, 1.5],
    '5': [0.5, 1, 1.5],
}

X_stds = {
    '1': [1, 1, 1],
    '2': [1, 1, 1],
    '3': [0.5, 1.5, 2],
    '4': [0.5, 1.5, 2],
    '5': [0.5, 1.5, 2],
}

# Error scale passed to y_gen: one entry per historical data set, plus the
# value for the current study.
sig_err_k = [0.5] * K
sig_err = 0.5

# Coefficient vectors handed to y_gen together with the historical covariates
# (one vector per historical data set).
betas = [
    np.array([0, 0, 0.2, 0.2, -0.2, -0.2]),
    np.array([1, 1, 1, 1, -1, -1]),
    np.array([2, 2, 2, 2, -2, -2]),
]

In [5]:
# Continuous-outcome simulation driver.
# For every scenario: build (or load from the on-disk cache) the historical
# data sets, the current-study data sets and the historical summaries; then,
# for each balance method, fit NIP (BLR) and UIP on every replication and
# tabulate bias / RMSE / CI width / CI coverage.
#
# NOTE: the caches are read with pickle (directly or through np.load with
# allow_pickle=True), which can execute arbitrary code. Only load cache files
# that were written by this notebook.
metrics_dfs = []  # one metrics table per (scenario, balance method)
pis_lst = []      # axis-0 means of the UIP 'pis' output, per replication
M_lst = []        # axis-0 means of the UIP 'M' output, per replication

for scenario in scenarios:
    print('Data Generation\n')
    print('--------------- Scenario',scenario,'---------------')

    save_folder = './results/{}C_outcome/scenario{}/'.format(date,scenario)
    hist_path = save_folder + 'scenario{}_histDs.npy'.format(scenario)
    rct_path = save_folder + 'scenario{}_Ds.npy'.format(scenario)
    summary_path = save_folder + 'scenario{}_summaryDs.bin'.format(scenario)
    blr_path = save_folder + 'scenario{}_blr_results.bin'.format(scenario)

    if not os.path.exists(save_folder):
        print('Creat the folder!')
        os.makedirs(save_folder)

    # Generate the historical data
    if not os.path.exists(hist_path):
        print('Generate Historical Data!')

        # Fixed seed so that a regenerated cache matches the previous one.
        np.random.seed(2021)
        histDs_lst = []

        for i in tqdm(range(reps)):
            histDs = edict()
            histDs.RWDs = []
            histDs.betas = []
            histDs.thetas = []
            histDs.ps_true = []

            for k in range(K):
                betak = betas[k]

                # Data-set-specific coefficients: the scenario's treatment
                # effect goes in slot 1, and for scenario >= 2 the remaining
                # covariate coefficients receive N(0, 0.1^2) jitter.
                thetak = theta.copy().astype('float')
                thetak[1] = theta1[str(scenario)][k]
                thetak[2:] = theta[2:].copy()+0.1*np.random.randn(d) if scenario>=2 else theta[2:].copy()

                # Covariance: X_stds entry on the diagonal,
                # sqrt(X_stds entry) * rho off the diagonal.
                sigmatk = np.sqrt(X_stds[str(scenario)][k])*rho*np.ones((d,d))+ \
                         (X_stds[str(scenario)][k]-np.sqrt(X_stds[str(scenario)][k])*rho)*np.eye(d)

                Xk = np.random.multivariate_normal(X_means[str(scenario)][k]*np.ones(d),sigmatk,size=nH)
                # Dichotomise the first int(0.4*d) covariates at the data set's mean.
                Xk[:,:int(0.4*d)] = np.array(Xk[:,:int(0.4*d)]>X_means[str(scenario)][k],dtype=float)

                Tk, yk, psk_true = y_gen(Xk,thetak,betak,y_type=y_type,sig_err=sig_err_k[k])

                histDs.RWDs.append((Xk,Tk,yk))
                histDs.thetas.append(thetak)
                histDs.betas.append(betak)
                histDs.ps_true.append(psk_true)

            histDs_lst.append(histDs)

        np.save(hist_path, np.array(histDs_lst))

    else:

        print('Load Saved Historical Data!')
        histDs_lst = np.load(hist_path,allow_pickle=True)


    # Generate current study
    if not os.path.exists(rct_path):
        print('Generate Current RCT Data!')

        np.random.seed(2021)
        Ds_lst = []

        for i in tqdm(range(reps)):
            D = edict()
            D.X = np.random.multivariate_normal(np.zeros(d),sigmat,size=n)
            # Same dichotomisation as for the historical data, with cut-off 0.
            D.X[:,:int(0.4*d)] = np.array(D.X[:,:int(0.4*d)]>0,dtype=float)
            # All-zero coefficients are passed in the slot where the historical
            # data use betak (presumably covariate-independent assignment, as
            # in an RCT; confirm against y_gen).
            D.T, D.y, D.ps_true = y_gen(D.X,theta,
                                        np.zeros(d),
                                        y_type=y_type,
                                        sig_err=sig_err)
            Ds_lst.append(D)

        np.save(rct_path, np.array(Ds_lst))

    else:

        print('Load Saved RCT Data!')
        Ds_lst = np.load(rct_path,allow_pickle=True)


    # Generate the aggregate data based on historical data
    if not os.path.exists(summary_path):
        print('Generate Current Summary Data!')
        # Imported only when the summaries must be rebuilt; presumably this is
        # what provides numpy2ri / pandas2ri and the matchit / weightit wrappers.
        from balance_methods import *

        numpy2ri.activate()
        pandas2ri.activate()

        # try/finally guarantees the converters are switched off again even if
        # matching or weighting fails part-way through.
        try:
            summaryDs_lst = []

            for i in tqdm(range(reps)):
                summaryDs = edict()
                summaryDs.ps_pred = []
                summaryDs.beta_hat = []
                summaryDs.NearMatch = []
                summaryDs.IPW = []

                histDs = histDs_lst[i]

                for k in range(K):
                    Xk,Tk,yk = histDs.RWDs[k]
                    # Logistic propensity model fitted on the historical data set.
                    clf = LogisticRegression()
                    clf.fit(Xk,Tk)
                    psk_pred = clf.predict_proba(Xk)[:,1]
                    betak_hat = np.array([clf.intercept_[0]]+clf.coef_.flatten().tolist())

                    summaryDs.ps_pred.append(psk_pred)
                    summaryDs.beta_hat.append(betak_hat)

                    # nearest matching
                    mdata, summary_fit, bal_out = matchit_wrapper(yk,Tk,Xk,
                                                                  y_type=y_type,
                                                                  cov_adj=cov_adj_datgen,
                                                                  method='nearest',
                                                                  estimand='ATT',
                                                                  replace=False)
                    # Stored tuple: (number of rows with positive weight,
                    # share of 'Diff.Adj' entries below m_threshold in absolute
                    # value, model summary, matched data).
                    summaryDs.NearMatch.append(((mdata['weights']>0).sum(),
                                                (np.sum(np.abs(bal_out[0]['Diff.Adj'][1:]) < m_threshold))/(d),
                                                summary_fit, mdata))

                    # inverse probability weighting
                    wdata, summary_fit, bal_out = weightit_wrapper(yk,Tk,Xk,
                                                                   y_type=y_type,
                                                                   cov_adj=cov_adj_datgen,
                                                                   method='ps',
                                                                   estimand='ATT',
                                                                   link='logit')
                    summaryDs.IPW.append(((wdata['weights']>0).sum(),
                                          (np.sum(np.abs(bal_out[0]['Diff.Adj'][1:]) < m_threshold))/(d),
                                          summary_fit, wdata))

                summaryDs_lst.append(summaryDs)

            with open(summary_path,"wb") as fh:
                pickle.dump(summaryDs_lst, fh)
        finally:
            numpy2ri.deactivate()
            pandas2ri.deactivate()

    else:

        print('Load Saved Summary Data!')
        with open(summary_path,"rb") as fh:
            summaryDs_lst = pickle.load(fh)

    for bal_method in ['NearMatch','IPW']:

        print('Balance Method:',bal_method)

        # NIP (the cache is shared by both balance methods: the file name does
        # not contain bal_method)
        if not os.path.exists(blr_path):
            trace_blr_lst = Parallel(n_jobs=num_cores)(
                delayed(parallel_unit)(i, Ds_lst,
                                       summaryDs_lst,
                                       method='BLR',
                                       bal_method=bal_method,
                                       cov_adj=cov_adj_pred,
                                       y_type=y_type,
                                       random_state=random_state)
                for i in tqdm(range(reps)))
            with open(blr_path,"wb") as fh:
                pickle.dump(trace_blr_lst, fh)

        else:
            with open(blr_path,"rb") as fh:
                trace_blr_lst = pickle.load(fh)

        # Per replication: mean and 2.5% / 97.5% quantiles of the theta1 draws.
        theta_pred_blr_mat = np.array([np.array([(trace_blr_lst[i]['theta1']).mean(),
                                                 np.quantile((trace_blr_lst[i]['theta1']),0.025),
                                                 np.quantile((trace_blr_lst[i]['theta1']),0.975)]) for i in range(reps)])

        # Free the draws before the UIP results are brought into memory.
        del trace_blr_lst
        gc.collect()
        print('NIP Complete!')


        # UIP
        uip_path = save_folder + bal_method +'_scenario{}_uip_results.bin'.format(scenario)
        if not os.path.exists(uip_path):
            try:
                trace_uip_lst = Parallel(n_jobs=num_cores)(
                    delayed(parallel_unit)(i, Ds_lst,
                                           summaryDs_lst,
                                           method='UIP',
                                           bal_method=bal_method,
                                           cov_adj=cov_adj_pred,
                                           y_type=y_type,
                                           random_state=random_state)
                    for i in tqdm(range(reps)))
            except Exception as err:
                # Best-effort fallback: rerun serially if the parallel run
                # fails. Catching Exception (not a bare except) lets Ctrl-C
                # still interrupt the notebook, and the reason is reported.
                print('Parallel UIP run failed ({!r}); retrying with n_jobs=1.'.format(err))
                trace_uip_lst = Parallel(n_jobs=1)(
                    delayed(parallel_unit)(i, Ds_lst,
                                           summaryDs_lst,
                                           method='UIP',
                                           bal_method=bal_method,
                                           cov_adj=cov_adj_pred,
                                           y_type=y_type,
                                           random_state=random_state)
                    for i in tqdm(range(reps)))
            with open(uip_path,"wb") as fh:
                pickle.dump(trace_uip_lst, fh)

        else:

            with open(uip_path,"rb") as fh:
                trace_uip_lst = pickle.load(fh)


        theta_pred_uip_mat = np.array([np.array([(trace_uip_lst[i]['theta1']).mean(),
                                                 np.quantile((trace_uip_lst[i]['theta1']),0.025),
                                                 np.quantile((trace_uip_lst[i]['theta1']),0.975)]) for i in range(reps)])
        uip_pis_array = np.array([trace_uip_lst[i]['pis'].mean(axis=0) for i in range(reps)])
        uip_M_array = np.array([trace_uip_lst[i]['M'].mean(axis=0) for i in range(reps)])

        del trace_uip_lst
        gc.collect()
        print('UIP Complete!')

        pis_lst.append((uip_pis_array))
        M_lst.append((uip_M_array))

        # saving
        # evaluation metrics
        # Row 0 belongs to NIP (no weights / M, hence NaN), row 1 to UIP.
        uip_df = pd.DataFrame(np.array([np.nan*np.ones(K),
                           uip_pis_array.mean(axis=0)]),columns=['$w_{}$'.format(i+1) for i in range(K)])
        uip_df['$M$'] = [np.nan,
                         uip_M_array.mean(axis=0)]


        metrics_array = np.array([method_eval(theta_pred_blr_mat,theta),
                                  method_eval(theta_pred_uip_mat,theta)])
        metrics_df = pd.DataFrame(metrics_array,columns=['Bias','RMSE','CI Width','CI Coverage'])
        metrics_df['Method'] = ['NIP','UIP']
        metrics_df['Case'] = 'Scenario {}'.format(scenario)
        metrics_df['Bal_Method'] = bal_method
        metrics_df = pd.concat([metrics_df,uip_df],axis=1)

        print(metrics_df.set_index(['Case','Bal_Method','Method']))

        metrics_df.set_index(['Case','Bal_Method','Method']).to_csv(save_folder+bal_method+'_metric_df.csv')

        metrics_dfs.append(metrics_df)

Data Generation

--------------- Scenario 4 ---------------
Load Saved Historical Data!
Load Saved RCT Data!
Load Saved Summary Data!
Balance Method: NearMatch
NIP Complete!
UIP Complete!
                                  Bias      RMSE  CI Width  CI Coverage  \
Case       Bal_Method Method                                              
Scenario 4 NearMatch  NIP     0.011511  0.072667  0.283409     0.939394   
                      UIP    -0.021914  0.074391  0.273485     0.924242   

                                $w_1$     $w_2$     $w_3$        $M$  
Case       Bal_Method Method                                          
Scenario 4 NearMatch  NIP         NaN       NaN       NaN        NaN  
                      UIP     0.13983  0.292796  0.567374  71.699573  
Balance Method: IPW
NIP Complete!
UIP Complete!
                                  Bias      RMSE  CI Width  CI Coverage  \
Case       Bal_Method Method                                              
Scenario 4 IPW        NIP    

In [6]:
# Stack the per-scenario / per-balance-method tables row-wise, then display
# the combined table indexed by case, method and balance method.
metrics_all = pd.concat(metrics_dfs, axis=0)
metrics_all.set_index(keys=['Case','Method','Bal_Method'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Bias,RMSE,CI Width,CI Coverage,$w_1$,$w_2$,$w_3$,$M$
Case,Method,Bal_Method,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Scenario 4,NIP,NearMatch,0.011511,0.072667,0.283409,0.939394,,,,
Scenario 4,UIP,NearMatch,-0.021914,0.074391,0.273485,0.924242,0.13983,0.292796,0.567374,71.699573
Scenario 4,NIP,IPW,0.011511,0.072667,0.283409,0.939394,,,,
Scenario 4,UIP,IPW,-0.020251,0.074071,0.274169,0.939394,0.144377,0.303507,0.552116,73.020328
Scenario 5,NIP,NearMatch,0.011511,0.072667,0.283409,0.939394,,,,
Scenario 5,UIP,NearMatch,0.016967,0.074524,0.275486,0.914141,0.398443,0.348554,0.253003,1.827237
Scenario 5,NIP,IPW,0.011511,0.072667,0.283409,0.939394,,,,
Scenario 5,UIP,IPW,0.016597,0.074056,0.274857,0.929293,0.394889,0.347019,0.258092,1.858178


In [7]:
# Export the combined continuous-outcome metrics, indexed by case, method and
# balance method, to a single CSV file in the run's result folder.
(metrics_all
 .set_index(['Case','Method','Bal_Method'])
 .to_csv('./results/{}C_outcome/metric_df_extreme.csv'.format(date)))

# Binary Outcome

In [8]:
# ---- Design constants for the binary-outcome simulation ----
nH = 500           # sample size of each historical data set
n = 200            # sample size of the current data set
K = 3              # number of historical data sets
rho = 0.1          # correlation coefficient
d = 6              # dimension of the covariates
y_type = 'B'       # response type ('B' = binary)
m_threshold = 0.1  # balance threshold

# Response-surface coefficients (length d + 2). Slot 1 is the value that
# method_eval uses as the true treatment effect; slots 2.. are the ones that
# get jittered per historical data set. Slot 0 is presumably the intercept.
theta = np.array([-1.0, 1.0] + [0.1] * d)

# Covariate covariance of the current study: 1 on the diagonal, rho elsewhere.
sigmat = np.full((d, d), rho) + (1 - rho) * np.eye(d)

# Treatment effect of each of the K historical data sets, keyed by scenario.
theta1 = {
    '1': [1, 1, 1],
    '2': [0.8, 1.1, 1.3],
    '3': [0.8, 1.1, 1.3],
    '4': [0.5, 2.5, 4.5],
    '5': [5, 7, 9],
}

# Covariate-generation settings of each historical data set, keyed by scenario.
X_means = {
    '1': [0, 0, 0],
    '2': [0, 0, 0],
    '3': [0.5, 1, 1.5],
    '4': [0.5, 1, 1.5],
    '5': [0.5, 1, 1.5],
}

X_stds = {
    '1': [1, 1, 1],
    '2': [1, 1, 1],
    '3': [0.5, 1.5, 2],
    '4': [0.5, 1.5, 2],
    '5': [0.5, 1.5, 2],
}

# Coefficients used to generate the propensity scores (one vector per
# historical data set).
betas = [
    np.array([0, 0, 0.2, 0.2, -0.2, -0.2]),
    np.array([1, 1, 1, 1, -1, -1]),
    np.array([2, 2, 2, 2, -2, -2]),
]

# Number of joblib worker processes.
num_cores = 10

In [9]:
# Binary-outcome simulation driver.
# For every scenario: build (or load from the on-disk cache) the historical
# data sets, the current-study data sets and the historical summaries; then,
# for each balance method, fit NIP (BLR) and UIP on every replication and
# tabulate bias / RMSE / CI width / CI coverage.
#
# NOTE: the caches are read with pickle (directly or through np.load with
# allow_pickle=True), which can execute arbitrary code. Only load cache files
# that were written by this notebook.
metrics_dfs = []  # one metrics table per (scenario, balance method)
pis_lst = []      # axis-0 means of the UIP 'pis' output, per replication
M_lst = []        # axis-0 means of the UIP 'M' output, per replication

for scenario in scenarios:
    print('Data Generation\n')
    print('--------------- Scenario',scenario,'---------------')

    save_folder = './results/{}B_outcome/scenario{}/'.format(date,scenario)
    hist_path = save_folder + 'scenario{}_histDs.npy'.format(scenario)
    rct_path = save_folder + 'scenario{}_Ds.npy'.format(scenario)
    summary_path = save_folder + 'scenario{}_summaryDs.bin'.format(scenario)
    blr_path = save_folder + 'scenario{}_blr_results.bin'.format(scenario)

    if not os.path.exists(save_folder):
        print('Creat the folder!')
        os.makedirs(save_folder)

    # Generate the historical data
    if not os.path.exists(hist_path):
        print('Generate Historical Data!')

        # Fixed seed so that a regenerated cache matches the previous one.
        np.random.seed(2021)
        histDs_lst = []

        for i in tqdm(range(reps)):
            histDs = edict()
            histDs.RWDs = []
            histDs.betas = []
            histDs.thetas = []
            histDs.ps_true = []

            for k in range(K):
                betak = betas[k]

                # Data-set-specific coefficients: the scenario's treatment
                # effect goes in slot 1, and for scenario >= 2 the remaining
                # covariate coefficients receive N(0, 0.1^2) jitter.
                thetak = theta.copy().astype('float')
                thetak[1] = theta1[str(scenario)][k]
                thetak[2:] = theta[2:].copy()+0.1*np.random.randn(d) if scenario>=2 else theta[2:].copy()

                # Covariance: X_stds entry on the diagonal,
                # sqrt(X_stds entry) * rho off the diagonal.
                sigmatk = np.sqrt(X_stds[str(scenario)][k])*rho*np.ones((d,d))+(X_stds[str(scenario)][k]-np.sqrt(X_stds[str(scenario)][k])*rho)*np.eye(d)

                Xk = np.random.multivariate_normal(X_means[str(scenario)][k]*np.ones(d),sigmatk,size=nH)
                # Dichotomise the first int(0.4*d) covariates at the data set's mean.
                Xk[:,:int(0.4*d)] = np.array(Xk[:,:int(0.4*d)]>X_means[str(scenario)][k],dtype=float)

                Tk, yk, psk_true = y_gen(Xk,thetak,betak,y_type=y_type)

                histDs.RWDs.append((Xk,Tk,yk))
                histDs.thetas.append(thetak)
                histDs.betas.append(betak)
                histDs.ps_true.append(psk_true)

            histDs_lst.append(histDs)

        np.save(hist_path, np.array(histDs_lst))

    else:

        print('Load Saved Historical Data!')
        histDs_lst = np.load(hist_path,allow_pickle=True)


    # Generate current study
    if not os.path.exists(rct_path):
        print('Generate Current RCT Data!')

        np.random.seed(2021)
        Ds_lst = []

        for i in tqdm(range(reps)):
            D = edict()
            D.X = np.random.multivariate_normal(np.zeros(d),sigmat,size=n)
            # Same dichotomisation as for the historical data, with cut-off 0.
            D.X[:,:int(0.4*d)] = np.array(D.X[:,:int(0.4*d)]>0,dtype=float)
            # All-zero coefficients are passed in the slot where the historical
            # data use betak (presumably covariate-independent assignment, as
            # in an RCT; confirm against y_gen).
            D.T, D.y, D.ps_true = y_gen(D.X,theta,np.zeros(d),y_type=y_type)
            Ds_lst.append(D)

        np.save(rct_path, np.array(Ds_lst))

    else:

        print('Load Saved RCT Data!')
        Ds_lst = np.load(rct_path,allow_pickle=True)


    # Generate the summary info based on historical data
    if not os.path.exists(summary_path):
        print('Generate Current Summary Data!')
        # Imported only when the summaries must be rebuilt; presumably this is
        # what provides numpy2ri / pandas2ri and the matchit / weightit wrappers.
        from balance_methods import *

        numpy2ri.activate()
        pandas2ri.activate()

        # try/finally guarantees the converters are switched off again even if
        # matching or weighting fails part-way through.
        try:
            summaryDs_lst = []

            for i in tqdm(range(reps)):
                summaryDs = edict()
                summaryDs.ps_pred = []
                summaryDs.beta_hat = []
                summaryDs.NearMatch = []
                summaryDs.IPW = []

                histDs = histDs_lst[i]

                for k in range(K):
                    Xk,Tk,yk = histDs.RWDs[k]
                    # Logistic propensity model fitted on the historical data set.
                    clf = LogisticRegression()
                    clf.fit(Xk,Tk)
                    psk_pred = clf.predict_proba(Xk)[:,1]
                    betak_hat = np.array([clf.intercept_[0]]+clf.coef_.flatten().tolist())

                    summaryDs.ps_pred.append(psk_pred)
                    summaryDs.beta_hat.append(betak_hat)

                    # nearest matching
                    mdata, summary_fit, bal_out = matchit_wrapper(yk,Tk,Xk,
                                                                  y_type=y_type,
                                                                  cov_adj=cov_adj_datgen,
                                                                  method='nearest',
                                                                  estimand='ATT',
                                                                  replace=False)
                    # Stored tuple: (number of rows with positive weight,
                    # share of 'Diff.Adj' entries below m_threshold in absolute
                    # value, model summary, matched data).
                    summaryDs.NearMatch.append(((mdata['weights']>0).sum(),
                                                (np.sum(np.abs(bal_out[0]['Diff.Adj'][1:]) < m_threshold))/(d),
                                                summary_fit, mdata))

                    # inverse probability weighting
                    wdata, summary_fit, bal_out = weightit_wrapper(yk,Tk,Xk,
                                                                   y_type=y_type,
                                                                   cov_adj=cov_adj_datgen,
                                                                   method='ps',
                                                                   estimand='ATT',
                                                                   link='logit')
                    summaryDs.IPW.append(((wdata['weights']>0).sum(),
                                          (np.sum(np.abs(bal_out[0]['Diff.Adj'][1:]) < m_threshold))/(d),
                                          summary_fit, wdata))

                summaryDs_lst.append(summaryDs)

            with open(summary_path,"wb") as fh:
                pickle.dump(summaryDs_lst, fh)
        finally:
            numpy2ri.deactivate()
            pandas2ri.deactivate()

    else:

        print('Load Saved Summary Data!')
        with open(summary_path,"rb") as fh:
            summaryDs_lst = pickle.load(fh)

    for bal_method in ['NearMatch','IPW']:

        print('Balance Method:',bal_method)

        # NIP (the cache is shared by both balance methods: the file name does
        # not contain bal_method)
        if not os.path.exists(blr_path):
            trace_blr_lst = Parallel(n_jobs=num_cores)(
                delayed(parallel_unit)(i, Ds_lst,
                                       summaryDs_lst,
                                       method='BLR',
                                       bal_method=bal_method,
                                       cov_adj=cov_adj_pred,
                                       y_type=y_type,
                                       random_state=random_state)
                for i in tqdm(range(reps)))
            with open(blr_path,"wb") as fh:
                pickle.dump(trace_blr_lst, fh)

        else:
            with open(blr_path,"rb") as fh:
                trace_blr_lst = pickle.load(fh)

        # Per replication: mean and 2.5% / 97.5% quantiles of the theta1 draws.
        theta_pred_blr_mat = np.array([np.array([(trace_blr_lst[i]['theta1']).mean(),
                                                 np.quantile((trace_blr_lst[i]['theta1']),0.025),
                                                 np.quantile((trace_blr_lst[i]['theta1']),0.975)]) for i in range(reps)])

        # Free the draws before the UIP results are brought into memory.
        del trace_blr_lst
        gc.collect()
        print('NIP Complete!')


        # UIP
        uip_path = save_folder + bal_method +'_scenario{}_uip_results.bin'.format(scenario)
        if not os.path.exists(uip_path):
            try:
                trace_uip_lst = Parallel(n_jobs=num_cores)(
                    delayed(parallel_unit)(i, Ds_lst,
                                           summaryDs_lst,
                                           method='UIP',
                                           bal_method=bal_method,
                                           cov_adj=cov_adj_pred,
                                           y_type=y_type,
                                           random_state=random_state)
                    for i in tqdm(range(reps)))
            except Exception as err:
                # Best-effort fallback: rerun serially if the parallel run
                # fails. Catching Exception (not a bare except) lets Ctrl-C
                # still interrupt the notebook, and the reason is reported.
                print('Parallel UIP run failed ({!r}); retrying with n_jobs=1.'.format(err))
                trace_uip_lst = Parallel(n_jobs=1)(
                    delayed(parallel_unit)(i, Ds_lst,
                                           summaryDs_lst,
                                           method='UIP',
                                           bal_method=bal_method,
                                           cov_adj=cov_adj_pred,
                                           y_type=y_type,
                                           random_state=random_state)
                    for i in tqdm(range(reps)))
            with open(uip_path,"wb") as fh:
                pickle.dump(trace_uip_lst, fh)

        else:

            with open(uip_path,"rb") as fh:
                trace_uip_lst = pickle.load(fh)


        theta_pred_uip_mat = np.array([np.array([(trace_uip_lst[i]['theta1']).mean(),
                                                 np.quantile((trace_uip_lst[i]['theta1']),0.025),
                                                 np.quantile((trace_uip_lst[i]['theta1']),0.975)]) for i in range(reps)])
        uip_pis_array = np.array([trace_uip_lst[i]['pis'].mean(axis=0) for i in range(reps)])
        uip_M_array = np.array([trace_uip_lst[i]['M'].mean(axis=0) for i in range(reps)])

        del trace_uip_lst
        gc.collect()
        print('UIP Complete!')

        pis_lst.append((uip_pis_array))
        M_lst.append((uip_M_array))

        # saving
        # evaluation metrics
        # Row 0 belongs to NIP (no weights / M, hence NaN), row 1 to UIP.
        uip_df = pd.DataFrame(np.array([np.nan*np.ones(K),
                           uip_pis_array.mean(axis=0)]),columns=['$w_{}$'.format(i+1) for i in range(K)])
        uip_df['$M$'] = [np.nan,
                         uip_M_array.mean(axis=0)]


        metrics_array = np.array([method_eval(theta_pred_blr_mat,theta),
                                  method_eval(theta_pred_uip_mat,theta)])
        metrics_df = pd.DataFrame(metrics_array,columns=['Bias','RMSE','CI Width','CI Coverage'])
        metrics_df['Method'] = ['NIP','UIP']
        metrics_df['Case'] = 'Scenario {}'.format(scenario)
        metrics_df['Bal_Method'] = bal_method
        metrics_df = pd.concat([metrics_df,uip_df],axis=1)

        print(metrics_df.set_index(['Case','Bal_Method','Method']))

        metrics_df.set_index(['Case','Bal_Method','Method']).to_csv(save_folder+bal_method+'_metric_df.csv')

        metrics_dfs.append(metrics_df)

Data Generation

--------------- Scenario 4 ---------------
Load Saved Historical Data!
Load Saved RCT Data!
Load Saved Summary Data!
Balance Method: NearMatch
NIP Complete!
UIP Complete!
                                  Bias      RMSE  CI Width  CI Coverage  \
Case       Bal_Method Method                                              
Scenario 4 NearMatch  NIP     0.060925  0.345514  1.238014     0.929293   
                      UIP     0.207158  0.376190  1.233114     0.893939   

                                 $w_1$     $w_2$     $w_3$        $M$  
Case       Bal_Method Method                                           
Scenario 4 NearMatch  NIP          NaN       NaN       NaN        NaN  
                      UIP     0.527001  0.303782  0.169218  95.718702  
Balance Method: IPW
NIP Complete!
UIP Complete!
                                  Bias      RMSE  CI Width  CI Coverage  \
Case       Bal_Method Method                                              
Scenario 4 IPW        NIP

In [10]:
# Stack the per-scenario / per-balance-method tables row-wise, then display
# the combined table indexed by case, method and balance method.
metrics_all = pd.concat(metrics_dfs, axis=0)
metrics_all.set_index(keys=['Case','Method','Bal_Method'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Bias,RMSE,CI Width,CI Coverage,$w_1$,$w_2$,$w_3$,$M$
Case,Method,Bal_Method,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Scenario 4,NIP,NearMatch,0.060925,0.345514,1.238014,0.929293,,,,
Scenario 4,UIP,NearMatch,0.207158,0.37619,1.233114,0.893939,0.527001,0.303782,0.169218,95.718702
Scenario 4,NIP,IPW,0.060925,0.345514,1.238014,0.929293,,,,
Scenario 4,UIP,IPW,0.207905,0.378133,1.236464,0.878788,0.544429,0.299113,0.156458,93.798194
Scenario 5,NIP,NearMatch,0.060925,0.345514,1.238014,0.929293,,,,
Scenario 5,UIP,NearMatch,0.13633,0.375055,1.290145,0.929293,0.547831,0.260926,0.191242,11.97123
Scenario 5,NIP,IPW,0.060925,0.345514,1.238014,0.929293,,,,
Scenario 5,UIP,IPW,0.131011,0.374231,1.288353,0.939394,0.466399,0.312096,0.221505,14.140089


In [11]:
# Export the combined binary-outcome metrics. The table is indexed by case,
# method and balance method before writing, matching the continuous-outcome
# export; otherwise the CSV would start with the meaningless repeated 0/1
# positional index left over from concatenating the per-run tables.
metrics_all.set_index(['Case','Method','Bal_Method']).to_csv('./results/{}B_outcome/metric_df_extreme.csv'.format(date))