In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sn
import gc

from joblib import Parallel, delayed
from tqdm import tqdm
from easydict import EasyDict as edict
from sklearn.linear_model import LogisticRegression

from samplers import *
from datgen import *



# Continuous Outcome

In [2]:
def parallel_unit(i,
                  D_lst,
                  summaryDs_lst,
                  method='BLR',
                  bal_method='NearMatch',
                  cov_adj=True,
                  y_type='C',
                  random_state=2021):
    """Fit one replication and return only the quantities the evaluation needs.

    Intended to be dispatched through ``joblib.Parallel`` once per replication.

    Parameters
    ----------
    i : int
        Index of the replication; selects ``D_lst[i]`` and ``summaryDs_lst[i]``.
    D_lst : sequence
        Current-study datasets, one per replication.
    summaryDs_lst : sequence
        Summaries of the historical datasets, one per replication
        (only passed on to the sampler when ``method == 'UIP'``).
    method : {'BLR', 'UIP'}
        ``'BLR'`` calls ``BLR`` on the current data only; ``'UIP'`` calls
        ``UIP_Dirichlet`` on the current data plus the historical summaries.
        Both samplers are expected to be in scope via the notebook's
        star-imports (presumably ``samplers`` -- confirm).
    bal_method : str
        Forwarded to ``UIP_Dirichlet``; ignored for ``'BLR'``.
    cov_adj, y_type, random_state
        Forwarded unchanged to the selected sampler.

    Returns
    -------
    EasyDict
        Keys ``theta1``, ``M`` and ``pis``. ``M`` and ``pis`` are ``None``
        for ``'BLR'``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'BLR'`` nor ``'UIP'``.
    """
    D = D_lst[i]
    summaryDs = summaryDs_lst[i]
    result_dict = edict()

    if method == 'BLR':
        result = BLR(D,cov_adj=cov_adj,
                     y_type=y_type,
                     random_state=random_state)
        result_dict.theta1 = result['theta1']
        result_dict.M = None
        result_dict.pis = None

    elif method == 'UIP':
        result = UIP_Dirichlet(D,summaryDs,
                               bal_method=bal_method,
                               cov_adj=cov_adj,
                               y_type=y_type,
                               gammas_ps=False,
                               random_state=random_state)
        result_dict.theta1 = result['theta1']
        result_dict.M = result['M']
        result_dict.pis = result['pis']

    else:
        # Previously an unknown method surfaced as an UnboundLocalError on `result`.
        raise ValueError("method must be 'BLR' or 'UIP', got {!r}".format(method))

    # Return the reduced dict (not the raw sampler output): this is what the
    # survival-outcome version of this function does, and it keeps the payload
    # sent back from worker processes (and pickled to disk) small.
    return result_dict


def method_eval(theta_pred_mat,theta):
    """Summarise estimator performance over replications.

    Parameters
    ----------
    theta_pred_mat : 2-D array, one row per replication
        Column 0 is the point estimate, column 1 the lower interval bound and
        column 2 the upper interval bound.
    theta : array-like
        The true value being estimated is read from ``theta[1]``.

    Returns
    -------
    numpy.ndarray
        ``[bias, rmse, mean interval width, interval coverage]``.
    """
    truth = theta[1]
    point_est = theta_pred_mat[:,0]
    lower = theta_pred_mat[:,1]
    upper = theta_pred_mat[:,2]

    bias = point_est.mean()-truth
    rmse = np.sqrt(np.mean((point_est-truth)**2))
    ci_width = np.mean(upper-lower)
    # An interval covers the truth when it lies between both bounds (inclusive).
    covered = (upper>=truth)*(lower<=truth)
    ci_coverage = np.mean(covered)

    return np.array([bias, rmse, ci_width, ci_coverage])

In [3]:
# Tag embedded in every results folder name (./results/<date>...).
date = '1123'

# Scenario ids iterated over by the simulation loops.
scenarios = [1,2,3]

# Monte Carlo / execution settings
reps = 500           # number of replicated datasets per scenario
num_cores = 10       # n_jobs handed to joblib.Parallel
random_state = 2021  # seed forwarded to the samplers

# Covariate-adjustment switches: one for building the historical summaries
# (matching / weighting wrappers), one for fitting on the current data.
cov_adj_datgen, cov_adj_pred = True, True

In [4]:
# --- Simulation design: continuous outcome ---
nH = 500           # sample size of each historical dataset
n = 200            # sample size of the current dataset
K = 3              # number of historical datasets
rho = 0.1          # off-diagonal entry of the covariate covariance matrices
d = 10             # number of covariates
y_type = 'C'       # outcome type: 'C' => continuous / 'B' => binary
m_threshold = 0.1  # balance threshold applied to |Diff.Adj|

# Coefficients of the response surface: d + 2 ones. Entry 1 is the one that is
# overridden per historical dataset by theta1 and evaluated by method_eval.
theta = np.ones(d+2, dtype=float)
# Covariance of the current-study covariates: unit diagonal, rho elsewhere.
sigmat = np.full((d,d), rho)+(1-rho)*np.eye(d)

# Treatment effect of each of the K historical datasets, keyed by scenario.
theta1 = {
    '1': [1,1,1],
    '2': [0.8,1.1,1.3],
    '3': [0.8,1.1,1.3],
}

# Covariate means of each historical dataset, keyed by scenario.
X_means = {
    '1': [0,0,0],
    '2': [0.5,1,1.5],
    '3': [0,0,0],
}

# Diagonal of each historical covariance matrix, keyed by scenario.
# NOTE: despite the name, the simulation loop places these values directly on
# the covariance diagonal, so they act as variances rather than std devs.
X_stds = {
    '1': [1,1,1],
    '2': [1,1,1],
    '3': [0.5,1.5,2],
}

# Propensity-score coefficients, one vector per historical dataset:
# four entries of 0.1, then three of +b and three of -b.
betas = [np.array([0.1]*4+[b]*3+[-b]*3) for b in (0.3, 0.6, 1)]

In [5]:
# Accumulators filled once per (scenario, balance method); a later cell
# concatenates metrics_dfs into the overall table.
metrics_dfs = []
pis_lst = []
M_lst = []

for scenario in scenarios:
    print('Data Generation\n')
    print('--------------- Scenario',scenario,'---------------')

    save_folder = './results/{}C_outcome/scenario{}/'.format(date,scenario)

    if not os.path.exists(save_folder):
        print('Creat the folder!')
        os.makedirs(save_folder)

    # On-disk caches for this scenario. Every stage below is recomputed only
    # when its file is missing; delete a file to force that stage to rerun.
    # The files are produced by this notebook itself -- do not point these
    # paths at untrusted data, since loading them unpickles arbitrary objects.
    hist_path = save_folder + 'scenario{}_histDs.npy'.format(scenario)
    cur_path = save_folder + 'scenario{}_Ds.npy'.format(scenario)
    summary_path = save_folder + 'scenario{}_summaryDs.bin'.format(scenario)
    blr_path = save_folder + 'scenario{}_blr_results.bin'.format(scenario)

    # Generate the historical data
    if not os.path.exists(hist_path):
        print('Generate Historical Data!')

        np.random.seed(2021)
        histDs_lst = []

        for i in tqdm(range(reps)):
            histDs = edict()
            histDs.RWDs = []
            histDs.betas = []
            histDs.thetas = []
            histDs.ps_true = []

            for k in range(K):
                betak = betas[k]

                # Dataset-specific response coefficients: entry 1 comes from
                # theta1; the covariate coefficients are perturbed with
                # N(0, 0.1^2) noise only in scenarios 2 and 3.
                thetak = theta.copy().astype('float')
                thetak[1] = theta1[str(scenario)][k]
                thetak[2:] = theta[2:].copy()+0.1*np.random.randn(d) if scenario>=2 else theta[2:].copy()

                # Covariance with rho off the diagonal and X_stds[...] on it.
                sigmatk = rho*np.ones((d,d))+(X_stds[str(scenario)][k]-rho)*np.eye(d)

                Xk = np.random.multivariate_normal(X_means[str(scenario)][k]*np.ones(d),sigmatk,size=nH)
                # The first four covariates are dichotomised at their mean.
                Xk[:,:4] = np.array(Xk[:,:4]>X_means[str(scenario)][k],dtype=float)

                Tk, yk, psk_true = y_gen(Xk,thetak,betak,y_type=y_type)

                histDs.RWDs.append((Xk,Tk,yk))
                histDs.thetas.append(thetak)
                histDs.betas.append(betak)
                histDs.ps_true.append(psk_true)

            histDs_lst.append(histDs)

        np.save(hist_path, np.array(histDs_lst))

    else:

        print('Load Saved Historical Data!')
        histDs_lst = np.load(hist_path,allow_pickle=True)


    # Generate current study
    # NOTE: nothing here depends on `scenario` (same seed, same arguments), so
    # the current data -- and therefore the NIP fit -- is the same in every scenario.
    if not os.path.exists(cur_path):
        print('Generate Current RCT Data!')

        np.random.seed(2021)
        Ds_lst = []

        for i in tqdm(range(reps)):
            D = edict()
            D.X = np.random.multivariate_normal(np.zeros(d),sigmat,size=n)
            D.X[:,:4] = np.array(D.X[:,:4]>0,dtype=float)
            # All-zero propensity coefficients are passed for the current study.
            D.T, D.y, D.ps_true = y_gen(D.X,theta,np.zeros(d),y_type=y_type)
            Ds_lst.append(D)

        np.save(cur_path, np.array(Ds_lst))

    else:

        print('Load Saved RCT Data!')
        Ds_lst = np.load(cur_path,allow_pickle=True)


    # Generate the aggregate data based on historical data
    if not os.path.exists(summary_path):
        print('Generate Current Summary Data!')
        # Imported lazily, only when the summaries must be rebuilt. numpy2ri /
        # pandas2ri and the two wrappers below are expected to come from this
        # star-import (presumably rpy2-based -- confirm).
        from balance_methods import *

        numpy2ri.activate()
        pandas2ri.activate()

        summaryDs_lst = []

        for i in tqdm(range(reps)):
            summaryDs = edict()
            summaryDs.ps_pred = []
            summaryDs.beta_hat = []
            summaryDs.NearMatch = []
            summaryDs.IPW = []

            histDs = histDs_lst[i]

            for k in range(K):
                Xk,Tk,yk = histDs.RWDs[k]

                # Estimated propensity scores and coefficients (intercept first).
                clf = LogisticRegression()
                clf.fit(Xk,Tk)
                psk_pred = clf.predict_proba(Xk)[:,1]
                betak_hat = np.array([clf.intercept_[0]]+clf.coef_.flatten().tolist())

                summaryDs.ps_pred.append(psk_pred)
                summaryDs.beta_hat.append(betak_hat)

                # nearest matching
                mdata, summary_fit, bal_out = matchit_wrapper(yk,Tk,Xk,
                                                              y_type=y_type,
                                                              cov_adj=cov_adj_datgen,
                                                              method='nearest',
                                                              estimand='ATT',
                                                              replace=False)
                # Stored tuple: (number of units with positive weight,
                #                share of Diff.Adj entries below m_threshold, relative to d,
                #                fitted summary, matched data).
                # The first Diff.Adj entry is skipped (presumably the distance row -- confirm).
                summaryDs.NearMatch.append(((mdata['weights']>0).sum(),
                                            (np.sum(np.abs(bal_out[0]['Diff.Adj'][1:]) < m_threshold))/(d),
                                            summary_fit, mdata))

                # inverse probability weighting
                wdata, summary_fit, bal_out = weightit_wrapper(yk,Tk,Xk,
                                                               y_type=y_type,
                                                               cov_adj=cov_adj_datgen,
                                                               method='ps',
                                                               estimand='ATE',
                                                               link='logit')
                summaryDs.IPW.append(((wdata['weights']>0).sum(),
                                      (np.sum(np.abs(bal_out[0]['Diff.Adj'][1:]) < m_threshold))/(d),
                                      summary_fit, wdata))

            summaryDs_lst.append(summaryDs)

        with open(summary_path,"wb") as file:
            pickle.dump(summaryDs_lst, file)

        numpy2ri.deactivate()
        pandas2ri.deactivate()

    else:

        print('Load Saved Summary Data!')
        with open(summary_path,"rb") as file:
            summaryDs_lst = pickle.load(file)

    for bal_method in ['NearMatch','IPW']:

        print('Balance Method:',bal_method)

        uip_path = save_folder + bal_method +'_scenario{}_uip_results.bin'.format(scenario)
        metric_path = save_folder+bal_method+'_metric_df.csv'

        # NIP
        # The cache file name does not include bal_method, so the fit is computed
        # for the first balance method and reloaded for the second.
        if not os.path.exists(blr_path):
            trace_blr_lst = Parallel(n_jobs=num_cores)(
                delayed(parallel_unit)(i, Ds_lst,
                                       summaryDs_lst,
                                       method='BLR',
                                       bal_method=bal_method,
                                       cov_adj=cov_adj_pred,
                                       y_type=y_type,
                                       random_state=random_state)
                for i in tqdm(range(reps)))
            with open(blr_path,"wb") as file:
                pickle.dump(trace_blr_lst, file)

        else:
            with open(blr_path,"rb") as file:
                trace_blr_lst = pickle.load(file)

        # Per replication: mean and 2.5% / 97.5% quantiles of exp(theta1 draws).
        theta_pred_blr_mat = np.array([[draws.mean(),
                                        np.quantile(draws,0.025),
                                        np.quantile(draws,0.975)]
                                       for draws in (np.exp(trace_blr_lst[i]['theta1']) for i in range(reps))])

        del trace_blr_lst
        gc.collect()
        print('NIP Complete!')


        # UIP
        if not os.path.exists(uip_path):
            uip_kwargs = dict(method='UIP',
                              bal_method=bal_method,
                              cov_adj=cov_adj_pred,
                              y_type=y_type,
                              random_state=random_state)
            try:
                trace_uip_lst = Parallel(n_jobs=num_cores)(
                    delayed(parallel_unit)(i, Ds_lst, summaryDs_lst, **uip_kwargs)
                    for i in tqdm(range(reps)))
            except Exception as err:
                # Fall back to a serial run, but say so; KeyboardInterrupt and
                # SystemExit are deliberately no longer swallowed.
                print('Parallel UIP run failed ({!r}); retrying with n_jobs=1.'.format(err))
                trace_uip_lst = Parallel(n_jobs=1)(
                    delayed(parallel_unit)(i, Ds_lst, summaryDs_lst, **uip_kwargs)
                    for i in tqdm(range(reps)))
            with open(uip_path,"wb") as file:
                pickle.dump(trace_uip_lst, file)

        else:

            with open(uip_path,"rb") as file:
                trace_uip_lst = pickle.load(file)


        theta_pred_uip_mat = np.array([[draws.mean(),
                                        np.quantile(draws,0.025),
                                        np.quantile(draws,0.975)]
                                       for draws in (np.exp(trace_uip_lst[i]['theta1']) for i in range(reps))])
        # Means over the first axis of the stored 'pis' and 'M' arrays, per replication.
        uip_pis_array = np.array([trace_uip_lst[i]['pis'].mean(axis=0) for i in range(reps)])
        uip_M_array = np.array([trace_uip_lst[i]['M'].mean(axis=0) for i in range(reps)])

        del trace_uip_lst
        gc.collect()
        print('UIP Complete!')

        pis_lst.append((uip_pis_array))
        M_lst.append((uip_M_array))

        # saving
        # evaluation metrics
        # Row 0 is the NIP row (no weights / M, hence NaN), row 1 the UIP row.
        uip_df = pd.DataFrame(np.array([np.nan*np.ones(K),
                                        uip_pis_array.mean(axis=0)]),columns=['$w_{}$'.format(i+1) for i in range(K)])
        uip_df['$M$'] = [np.nan,
                         uip_M_array.mean(axis=0)]


        # Estimates were exponentiated above, so the truth is exponentiated too.
        metrics_array = np.array([method_eval(theta_pred_blr_mat,np.exp(theta)),
                                  method_eval(theta_pred_uip_mat,np.exp(theta))])
        metrics_df = pd.DataFrame(metrics_array,columns=['Bias','RMSE','CI Width','CI Coverage'])
        metrics_df['Method'] = ['NIP','UIP']
        metrics_df['Case'] = 'Scenario {}'.format(scenario)
        metrics_df['Bal_Method'] = bal_method
        metrics_df = pd.concat([metrics_df,uip_df],axis=1)

        print(metrics_df.set_index(['Case','Bal_Method','Method']))

        metrics_df.set_index(['Case','Bal_Method','Method']).to_csv(metric_path)

        metrics_dfs.append(metrics_df)

Data Generation

--------------- Scenario 1 ---------------
Load Saved Historical Data!
Load Saved RCT Data!
Load Saved Summary Data!
Balance Method: NearMatch
NIP Complete!
UIP Complete!
                                  Bias      RMSE  CI Width  CI Coverage  \
Case       Bal_Method Method                                              
Scenario 1 NearMatch  NIP     0.028455  0.193653  0.794168        0.958   
                      UIP     0.002068  0.145616  0.686123        0.974   

                                 $w_1$     $w_2$     $w_3$         $M$  
Case       Bal_Method Method                                            
Scenario 1 NearMatch  NIP          NaN       NaN       NaN         NaN  
                      UIP     0.335161  0.336331  0.328508  113.859113  
Balance Method: IPW
NIP Complete!
UIP Complete!
                                  Bias      RMSE  CI Width  CI Coverage  \
Case       Bal_Method Method                                              
Scenario 1 IPW       

In [6]:
# np.array([[summaryDs_lst[i]['IPW'][k][2][1,1] for k in range(3)] for i in range(200)]).mean(axis=0)

In [7]:
# summaryDs_lst[0]['IPW'][0][2]

In [8]:
# bal_method,scenario

In [9]:
# for i in range(reps):
#     print('i:',i)
#     result = parallel_unit(i, Ds_lst, 
#                 summaryDs_lst, 
#                 method='UIP',
#                 bal_method=bal_method,
#                 cov_adj=cov_adj_pred,
#                 y_type=y_type,
#                 random_state=random_state)

In [10]:
# Stack the per-scenario / per-balance-method metric tables collected in
# metrics_dfs and display the result keyed by (Case, Method, Bal_Method).
# set_index returns a new frame; metrics_all itself keeps its original index.
metrics_all = pd.concat(metrics_dfs)
metrics_all.set_index(['Case','Method','Bal_Method'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Bias,RMSE,CI Width,CI Coverage,$w_1$,$w_2$,$w_3$,$M$
Case,Method,Bal_Method,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Scenario 1,NIP,NearMatch,0.028455,0.193653,0.794168,0.958,,,,
Scenario 1,UIP,NearMatch,0.002068,0.145616,0.686123,0.974,0.335161,0.336331,0.328508,113.859113
Scenario 1,NIP,IPW,0.028455,0.193653,0.794168,0.958,,,,
Scenario 1,UIP,IPW,0.003763,0.143488,0.679818,0.974,0.338109,0.336787,0.325104,113.32799
Scenario 2,NIP,NearMatch,0.028455,0.193653,0.794168,0.958,,,,
Scenario 2,UIP,NearMatch,0.031048,0.1664,0.735868,0.966,0.370313,0.341607,0.28808,110.390848
Scenario 2,NIP,IPW,0.028455,0.193653,0.794168,0.958,,,,
Scenario 2,UIP,IPW,0.031559,0.165766,0.733424,0.966,0.373178,0.341206,0.285616,109.863473
Scenario 3,NIP,NearMatch,0.028455,0.193653,0.794168,0.958,,,,
Scenario 3,UIP,NearMatch,0.029881,0.166685,0.736857,0.964,0.370142,0.340797,0.289061,110.54225


In [11]:
# Write the combined continuous-outcome metrics table next to the per-scenario folders.
metrics_all.set_index(['Case','Method','Bal_Method']).to_csv('./results/{}C_outcome/metric_df.csv'.format(date))

# Binary Outcome

In [12]:
# --- Simulation design: binary outcome ---
nH = 500           # sample size of each historical dataset
n = 200            # sample size of the current dataset
K = 3              # number of historical datasets
rho = 0.1          # off-diagonal entry of the covariate covariance matrices
d = 10             # number of covariates
y_type = 'B'       # response type (binary)
m_threshold = 0.1  # balance threshold applied to |Diff.Adj|

# Coefficients of the response surface: [-1, 1] followed by d entries of 0.1.
# Entry 1 is the one overridden per historical dataset by theta1.
theta = np.concatenate(([-1.0, 1.0], np.full(d, 0.1)))
# Covariance of the current-study covariates: unit diagonal, rho elsewhere.
sigmat = np.full((d,d), rho)+(1-rho)*np.eye(d)

# Treatment effect of each of the K historical datasets, keyed by scenario.
theta1 = {
    '1': [1,1,1],
    '2': [0.8,1.1,1.3],
    '3': [0.8,1.1,1.3],
}

# Covariate means of each historical dataset, keyed by scenario.
X_means = {
    '1': [0,0,0],
    '2': [0.5,1,1.5],
    '3': [0,0,0],
}

# Diagonal of each historical covariance matrix, keyed by scenario.
# NOTE: despite the name, the simulation loop places these values directly on
# the covariance diagonal, so they act as variances rather than std devs.
X_stds = {
    '1': [1,1,1],
    '2': [1,1,1],
    '3': [0.5,1.5,2],
}

# Coefficients of the propensity score models, one vector per historical
# dataset: four entries of 0.1, then three of +b and three of -b.
betas = [np.array([0.1]*4+[b]*3+[-b]*3) for b in (0.3, 0.6, 1)]

# n_jobs handed to joblib.Parallel (same value as in the run-control cell).
num_cores = 10

In [13]:
# Accumulators filled once per (scenario, balance method); a later cell
# concatenates metrics_dfs into the overall table.
metrics_dfs = []
pis_lst = []
M_lst = []

for scenario in scenarios:
    print('Data Generation\n')
    print('--------------- Scenario',scenario,'---------------')

    save_folder = './results/{}B_outcome/scenario{}/'.format(date,scenario)

    if not os.path.exists(save_folder):
        print('Creat the folder!')
        os.makedirs(save_folder)

    # On-disk caches for this scenario. Every stage below is recomputed only
    # when its file is missing; delete a file to force that stage to rerun.
    # The files are produced by this notebook itself -- do not point these
    # paths at untrusted data, since loading them unpickles arbitrary objects.
    hist_path = save_folder + 'scenario{}_histDs.npy'.format(scenario)
    cur_path = save_folder + 'scenario{}_Ds.npy'.format(scenario)
    summary_path = save_folder + 'scenario{}_summaryDs.bin'.format(scenario)
    blr_path = save_folder + 'scenario{}_blr_results.bin'.format(scenario)

    # Generate the historical data
    if not os.path.exists(hist_path):
        print('Generate Historical Data!')

        np.random.seed(2021)
        histDs_lst = []

        for i in tqdm(range(reps)):
            histDs = edict()
            histDs.RWDs = []
            histDs.betas = []
            histDs.thetas = []
            histDs.ps_true = []

            for k in range(K):
                betak = betas[k]

                # Dataset-specific response coefficients: entry 1 comes from
                # theta1; the covariate coefficients are perturbed with
                # N(0, 0.1^2) noise only in scenarios 2 and 3.
                thetak = theta.copy().astype('float')
                thetak[1] = theta1[str(scenario)][k]
                thetak[2:] = theta[2:].copy()+0.1*np.random.randn(d) if scenario>=2 else theta[2:].copy()

                # Covariance with rho off the diagonal and X_stds[...] on it.
                sigmatk = rho*np.ones((d,d))+(X_stds[str(scenario)][k]-rho)*np.eye(d)

                Xk = np.random.multivariate_normal(X_means[str(scenario)][k]*np.ones(d),sigmatk,size=nH)
                # The first four covariates are dichotomised at their mean.
                Xk[:,:4] = np.array(Xk[:,:4]>X_means[str(scenario)][k],dtype=float)

                Tk, yk, psk_true = y_gen(Xk,thetak,betak,y_type=y_type)

                histDs.RWDs.append((Xk,Tk,yk))
                histDs.thetas.append(thetak)
                histDs.betas.append(betak)
                histDs.ps_true.append(psk_true)

            histDs_lst.append(histDs)

        np.save(hist_path, np.array(histDs_lst))

    else:

        print('Load Saved Historical Data!')
        histDs_lst = np.load(hist_path,allow_pickle=True)


    # Generate current study
    # NOTE: nothing here depends on `scenario` (same seed, same arguments), so
    # the current data -- and therefore the NIP fit -- is the same in every scenario.
    if not os.path.exists(cur_path):
        print('Generate Current RCT Data!')

        np.random.seed(2021)
        Ds_lst = []

        for i in tqdm(range(reps)):
            D = edict()
            D.X = np.random.multivariate_normal(np.zeros(d),sigmat,size=n)
            D.X[:,:4] = np.array(D.X[:,:4]>0,dtype=float)
            # All-zero propensity coefficients are passed for the current study.
            D.T, D.y, D.ps_true = y_gen(D.X,theta,np.zeros(d),y_type=y_type)
            Ds_lst.append(D)

        np.save(cur_path, np.array(Ds_lst))

    else:

        print('Load Saved RCT Data!')
        Ds_lst = np.load(cur_path,allow_pickle=True)


    # Generate the summary info based on historical data
    if not os.path.exists(summary_path):
        print('Generate Current Summary Data!')
        # Imported lazily, only when the summaries must be rebuilt. numpy2ri /
        # pandas2ri and the two wrappers below are expected to come from this
        # star-import (presumably rpy2-based -- confirm).
        from balance_methods import *

        numpy2ri.activate()
        pandas2ri.activate()

        summaryDs_lst = []

        for i in tqdm(range(reps)):
            summaryDs = edict()
            summaryDs.ps_pred = []
            summaryDs.beta_hat = []
            summaryDs.NearMatch = []
            summaryDs.IPW = []

            histDs = histDs_lst[i]

            for k in range(K):
                Xk,Tk,yk = histDs.RWDs[k]

                # Estimated propensity scores and coefficients (intercept first).
                clf = LogisticRegression()
                clf.fit(Xk,Tk)
                psk_pred = clf.predict_proba(Xk)[:,1]
                betak_hat = np.array([clf.intercept_[0]]+clf.coef_.flatten().tolist())

                summaryDs.ps_pred.append(psk_pred)
                summaryDs.beta_hat.append(betak_hat)

                # nearest matching
                mdata, summary_fit, bal_out = matchit_wrapper(yk,Tk,Xk,
                                                              y_type=y_type,
                                                              cov_adj=cov_adj_datgen,
                                                              method='nearest',
                                                              estimand='ATT',
                                                              replace=False)
                # Stored tuple: (number of units with positive weight,
                #                share of Diff.Adj entries below m_threshold, relative to d,
                #                fitted summary, matched data).
                # The first Diff.Adj entry is skipped (presumably the distance row -- confirm).
                summaryDs.NearMatch.append(((mdata['weights']>0).sum(),
                                            (np.sum(np.abs(bal_out[0]['Diff.Adj'][1:]) < m_threshold))/(d),
                                            summary_fit, mdata))

                # inverse probability weighting
                wdata, summary_fit, bal_out = weightit_wrapper(yk,Tk,Xk,
                                                               y_type=y_type,
                                                               cov_adj=cov_adj_datgen,
                                                               method='ps',
                                                               estimand='ATE',
                                                               link='logit')
                summaryDs.IPW.append(((wdata['weights']>0).sum(),
                                      (np.sum(np.abs(bal_out[0]['Diff.Adj'][1:]) < m_threshold))/(d),
                                      summary_fit, wdata))

            summaryDs_lst.append(summaryDs)

        with open(summary_path,"wb") as file:
            pickle.dump(summaryDs_lst, file)

        numpy2ri.deactivate()
        pandas2ri.deactivate()

    else:

        print('Load Saved Summary Data!')
        with open(summary_path,"rb") as file:
            summaryDs_lst = pickle.load(file)

    for bal_method in ['NearMatch','IPW']:

        print('Balance Method:',bal_method)

        uip_path = save_folder + bal_method +'_scenario{}_uip_results.bin'.format(scenario)
        metric_path = save_folder+bal_method+'_metric_df.csv'

        # NIP
        # The cache file name does not include bal_method, so the fit is computed
        # for the first balance method and reloaded for the second.
        if not os.path.exists(blr_path):
            trace_blr_lst = Parallel(n_jobs=num_cores)(
                delayed(parallel_unit)(i, Ds_lst,
                                       summaryDs_lst,
                                       method='BLR',
                                       bal_method=bal_method,
                                       cov_adj=cov_adj_pred,
                                       y_type=y_type,
                                       random_state=random_state)
                for i in tqdm(range(reps)))
            with open(blr_path,"wb") as file:
                pickle.dump(trace_blr_lst, file)

        else:
            with open(blr_path,"rb") as file:
                trace_blr_lst = pickle.load(file)

        # Per replication: mean and 2.5% / 97.5% quantiles of exp(theta1 draws).
        theta_pred_blr_mat = np.array([[draws.mean(),
                                        np.quantile(draws,0.025),
                                        np.quantile(draws,0.975)]
                                       for draws in (np.exp(trace_blr_lst[i]['theta1']) for i in range(reps))])

        del trace_blr_lst
        gc.collect()
        print('NIP Complete!')


        # UIP
        if not os.path.exists(uip_path):
            uip_kwargs = dict(method='UIP',
                              bal_method=bal_method,
                              cov_adj=cov_adj_pred,
                              y_type=y_type,
                              random_state=random_state)
            try:
                trace_uip_lst = Parallel(n_jobs=num_cores)(
                    delayed(parallel_unit)(i, Ds_lst, summaryDs_lst, **uip_kwargs)
                    for i in tqdm(range(reps)))
            except Exception as err:
                # Fall back to a serial run, but say so; KeyboardInterrupt and
                # SystemExit are deliberately no longer swallowed.
                print('Parallel UIP run failed ({!r}); retrying with n_jobs=1.'.format(err))
                trace_uip_lst = Parallel(n_jobs=1)(
                    delayed(parallel_unit)(i, Ds_lst, summaryDs_lst, **uip_kwargs)
                    for i in tqdm(range(reps)))
            with open(uip_path,"wb") as file:
                pickle.dump(trace_uip_lst, file)

        else:

            with open(uip_path,"rb") as file:
                trace_uip_lst = pickle.load(file)


        theta_pred_uip_mat = np.array([[draws.mean(),
                                        np.quantile(draws,0.025),
                                        np.quantile(draws,0.975)]
                                       for draws in (np.exp(trace_uip_lst[i]['theta1']) for i in range(reps))])
        # Means over the first axis of the stored 'pis' and 'M' arrays, per replication.
        uip_pis_array = np.array([trace_uip_lst[i]['pis'].mean(axis=0) for i in range(reps)])
        uip_M_array = np.array([trace_uip_lst[i]['M'].mean(axis=0) for i in range(reps)])

        del trace_uip_lst
        gc.collect()
        print('UIP Complete!')

        pis_lst.append((uip_pis_array))
        M_lst.append((uip_M_array))

        # saving
        # evaluation metrics
        # Row 0 is the NIP row (no weights / M, hence NaN), row 1 the UIP row.
        uip_df = pd.DataFrame(np.array([np.nan*np.ones(K),
                                        uip_pis_array.mean(axis=0)]),columns=['$w_{}$'.format(i+1) for i in range(K)])
        uip_df['$M$'] = [np.nan,
                         uip_M_array.mean(axis=0)]


        # Estimates were exponentiated above, so the truth is exponentiated too.
        metrics_array = np.array([method_eval(theta_pred_blr_mat,np.exp(theta)),
                                  method_eval(theta_pred_uip_mat,np.exp(theta))])
        metrics_df = pd.DataFrame(metrics_array,columns=['Bias','RMSE','CI Width','CI Coverage'])
        metrics_df['Method'] = ['NIP','UIP']
        metrics_df['Case'] = 'Scenario {}'.format(scenario)
        metrics_df['Bal_Method'] = bal_method
        metrics_df = pd.concat([metrics_df,uip_df],axis=1)

        print(metrics_df.set_index(['Case','Bal_Method','Method']))

        metrics_df.set_index(['Case','Bal_Method','Method']).to_csv(metric_path)

        metrics_dfs.append(metrics_df)

Data Generation

--------------- Scenario 1 ---------------
Load Saved Historical Data!
Load Saved RCT Data!
Load Saved Summary Data!
Balance Method: NearMatch
NIP Complete!
UIP Complete!
                                  Bias      RMSE  CI Width  CI Coverage  \
Case       Bal_Method Method                                              
Scenario 1 NearMatch  NIP     0.849254  1.580266  4.794541        0.914   
                      UIP     0.601336  1.089288  3.853264        0.956   

                                 $w_1$    $w_2$     $w_3$         $M$  
Case       Bal_Method Method                                           
Scenario 1 NearMatch  NIP          NaN      NaN       NaN         NaN  
                      UIP     0.332144  0.33118  0.336677  111.960214  
Balance Method: IPW
NIP Complete!
UIP Complete!
                                  Bias      RMSE  CI Width  CI Coverage  \
Case       Bal_Method Method                                              
Scenario 1 IPW        NIP

In [14]:
# Stack the binary-outcome metric tables collected in metrics_dfs and display
# the result keyed by (Case, Method, Bal_Method). This rebinds metrics_all,
# replacing the continuous-outcome table built earlier in the notebook.
metrics_all = pd.concat(metrics_dfs)
metrics_all.set_index(['Case','Method','Bal_Method'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Bias,RMSE,CI Width,CI Coverage,$w_1$,$w_2$,$w_3$,$M$
Case,Method,Bal_Method,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Scenario 1,NIP,NearMatch,0.849254,1.580266,4.794541,0.914,,,,
Scenario 1,UIP,NearMatch,0.601336,1.089288,3.853264,0.956,0.332144,0.33118,0.336677,111.960214
Scenario 1,NIP,IPW,0.849254,1.580266,4.794541,0.914,,,,
Scenario 1,UIP,IPW,0.621051,1.100358,3.872055,0.958,0.337491,0.332004,0.330505,111.737504
Scenario 2,NIP,NearMatch,0.849254,1.580266,4.794541,0.914,,,,
Scenario 2,UIP,NearMatch,0.659402,1.150162,3.945326,0.942,0.331925,0.33145,0.336625,112.378552
Scenario 2,NIP,IPW,0.849254,1.580266,4.794541,0.914,,,,
Scenario 2,UIP,IPW,0.673353,1.156132,3.960712,0.944,0.336984,0.333794,0.329222,112.002559
Scenario 3,NIP,NearMatch,0.849254,1.580266,4.794541,0.914,,,,
Scenario 3,UIP,NearMatch,0.654114,1.147045,3.949243,0.942,0.332811,0.332089,0.3351,112.371703


In [15]:
# Write the combined binary-outcome metrics table next to the per-scenario folders.
metrics_all.set_index(['Case','Method','Bal_Method']).to_csv('./results/{}B_outcome/metric_df.csv'.format(date))

# Survival Outcome

In [16]:
def parallel_unit(i,
                  D_lst,
                  summaryDs_lst,
                  n_intervals=5,
                  method='BPH',
                  bal_method='NearMatch',
                  cov_adj=True,
                  random_state=2021):
    """Fit one survival-outcome replication and keep only what is evaluated.

    NOTE: this redefines the ``parallel_unit`` declared earlier in the notebook
    with a different signature (``n_intervals`` instead of ``y_type``); cells
    above this one must not be re-run after it without re-running the earlier
    definition first.

    Parameters
    ----------
    i : int
        Index of the replication; selects ``D_lst[i]`` and ``summaryDs_lst[i]``.
    D_lst : sequence
        Current-study datasets, one per replication.
    summaryDs_lst : sequence
        Historical-data summaries, one per replication (only passed on to the
        sampler when ``method == 'UIP'``).
    n_intervals : int
        Forwarded to the selected sampler.
    method : {'BPH', 'UIP'}
        ``'BPH'`` calls ``BPH`` on the current data only; ``'UIP'`` calls
        ``BPH_UIP_Dirichlet`` with the historical summaries as well.
    bal_method : str
        Forwarded to ``BPH_UIP_Dirichlet``; ignored for ``'BPH'``.
    cov_adj, random_state
        Forwarded unchanged to the selected sampler.

    Returns
    -------
    EasyDict
        Keys ``theta1``, ``M`` and ``pis`` (``M`` and ``pis`` are ``None`` for
        ``'BPH'``). Any other ``method`` value yields an empty EasyDict.
    """
    current_data = D_lst[i]
    hist_summary = summaryDs_lst[i]
    out = edict()

    if method == 'BPH':
        fit = BPH(current_data,
                  n_intervals=n_intervals,
                  cov_adj=cov_adj,
                  random_state=random_state)
        out.theta1 = fit['theta1']
        # No historical borrowing in this branch, hence no M / pis.
        out.M = None
        out.pis = None
    elif method == 'UIP':
        fit = BPH_UIP_Dirichlet(current_data,hist_summary,
                                n_intervals=n_intervals,
                                cov_adj=cov_adj,
                                gammas_ps=False,
                                bal_method=bal_method,
                                random_state=random_state)
        out.theta1 = fit['theta1']
        out.M = fit['M']
        out.pis = fit['pis']

    return out


def method_eval(theta_pred_mat,theta):
    """Summarise estimation quality over replicates.

    `theta_pred_mat` has one row per replicate with columns
    (point estimate, lower interval bound, upper interval bound); `theta` is
    the true value the estimates are compared against.

    Returns a length-4 array: [bias, RMSE, mean interval width, coverage].
    """
    point_est = theta_pred_mat[:,0]
    lower = theta_pred_mat[:,1]
    upper = theta_pred_mat[:,2]

    # An interval covers the truth when lower <= theta <= upper.
    covered = (upper>=theta)*(lower<=theta)

    summary = [
        point_est.mean()-theta,                    # bias
        np.sqrt(np.mean((point_est-theta)**2)),    # root mean squared error
        np.mean(upper-lower),                      # average interval width
        np.mean(covered),                          # empirical coverage
    ]
    return np.array(summary)

In [17]:
# Simulation settings for the survival-outcome experiments.
nH = 500 # sample size of each historical dataset
n = 200 # sample size of the current-study dataset
K = 3 # number of historical datasets
rho = 0.1 # correlation coefficient (off-diagonal entry of sigmat)
d = 10 # number of covariates
m_threshold = 0.1 # balance threshold: a covariate counts as balanced when |Diff.Adj| is below it

# number of Monte Carlo draws used to numerically calibrate the censoring parameter
n_find_cr = int(1e5)
# forwarded as `n_intervals` to the BPH / BPH_UIP_Dirichlet samplers
n_intervals = 5

# coefficients of the response surface for the current study:
# theta[0] multiplies the treatment indicator, theta[1:] the d covariates
theta = np.array([1]+[0.1]*d).astype('float')
# covariate covariance matrix: 1 on the diagonal, rho everywhere else
sigmat = rho*np.ones((d,d))+(1-rho)*np.eye(d)

# passed to cen_fun when calibrating theta_c; presumably the target censoring
# proportion -- confirm against datgen
cen_ratio = 0.1
# passed as `v` to surv_y_gen and used in the event-time formula of the driver cell
surv_v = 4
# values of `alpha` looped over by the driver cell (one full run per value)
surv_alphas = [0.5,1,2]

# treatment coefficient (thetak[0]) of each of the K historical datasets,
# keyed by scenario number as a string
theta1 = {
    '1': [1,1,1],
    '2': [0.8,1.1,1.3],
    '3': [0.8,1.1,1.3]
}

# per-scenario covariate mean of each of the K historical datasets
X_means = {'1':[0,0,0],
           '2':[0.5,1,1.5],
           '3':[0,0,0]}

# per-scenario value placed on the diagonal of sigmatk for each historical
# dataset (despite the name it enters the covariance matrix directly)
X_stds =  {'1':[1,1,1],
           '2':[1,1,1],
           '3':[0.5,1.5,2]}

# coefficients of the propensity-score model, one vector per historical dataset
betas = [np.array([0.1]*4+[0.3]*3+[-0.3]*3),
         np.array([0.1]*4+[0.6]*3+[-0.6]*3),
         np.array([0.1]*4+[1]*3+[-1]*3)]

In [18]:
# Survival-outcome simulation driver.
#
# For every (scenario, surv_alpha) pair this cell
#   1. generates (or loads from cache) the historical datasets,
#   2. generates (or loads) the current-study datasets,
#   3. builds (or loads) the balancing summaries of the historical data,
#   4. fits the NIP baseline (method='BPH') and the UIP model for each balance
#      method, and collects bias / RMSE / interval width / coverage.
#
# Every stage is cached under `save_folder`; delete the matching file to force
# a re-run. The caches are read with pickle / np.load(allow_pickle=True), so
# only load files that this notebook produced itself.


def _exp_theta1_summary(trace):
    """Return [mean, 2.5% quantile, 97.5% quantile] of exp(theta1) for one fit."""
    exp_theta1 = np.exp(trace['theta1'])
    return np.array([exp_theta1.mean(),
                     np.quantile(exp_theta1, 0.025),
                     np.quantile(exp_theta1, 0.975)])


metrics_dfs = []
pis_lst = []
M_lst = []

for scenario in scenarios:
    for surv_alpha in surv_alphas:

        print('Data Generation\n')
        print('--------------- Scenario',scenario,'Surv_alpha',surv_alpha,'---------------')

        save_folder = './results/{}S_outcome/scenario{}/{}/'.format(date,scenario,surv_alpha)

        if not os.path.exists(save_folder):
            print('Creat the folder!')
            os.makedirs(save_folder)

        # Cache files shared by every balance method of this (scenario, alpha) run.
        hist_path = save_folder + 'scenario{}_histDs.npy'.format(scenario)
        rct_path = save_folder + 'scenario{}_Ds.npy'.format(scenario)
        summary_path = save_folder + 'scenario{}_summaryDs.bin'.format(scenario)
        nip_path = save_folder + 'scenario{}_blr_results.bin'.format(scenario)

        # Generate the historical data
        if not os.path.exists(hist_path):
            print('Generate Historical Data!')

            np.random.seed(2021)
            histDs_lst = []

            for i in tqdm(range(reps)):
                histDs = edict()
                histDs.RWDs = []
                histDs.betas = []
                histDs.thetas = []
                histDs.ecs = []
                histDs.theta_cs = []
                histDs.ps_true = []

                for k in range(K):
                    betak = betas[k]

                    thetak = theta.copy().astype('float')
                    thetak[0] = theta1[str(scenario)][k]
                    # scenarios 2 and 3 perturb the covariate coefficients with N(0,1) noise
                    thetak[1:] = theta[1:].copy()+np.random.randn(d) if scenario>=2 else theta[1:].copy()

                    sigmatk = rho*np.ones((d,d))+(X_stds[str(scenario)][k]-rho)*np.eye(d)

                    # numerically determine the population censor ratio
                    X = np.random.multivariate_normal(X_means[str(scenario)][k]*np.ones(d),sigmatk,size=n_find_cr)
                    Xb = np.dot(X,betak)
                    ps = sigmoid(Xb)
                    T = np.random.binomial(1,ps,ps.shape[0])
                    z = thetak[0]*T+X.dot(thetak[1:])
                    S = np.random.rand(n_find_cr)
                    e_t = ((-np.log(S)/np.exp(z))*(surv_v**surv_alpha))**(1/surv_alpha)
                    theta_c = brute(cen_fun, [(0,np.ceil(e_t.max()))],
                              Ns=100,
                              args=(e_t,cen_ratio),
                              finish=fmin)

                    # generate the survival data
                    # NOTE(review): the calibration sample above is drawn with
                    # `sigmatk`, but the stored covariates below use the common
                    # `sigmat`, so X_stds never affects the saved data. Left
                    # as-is to keep existing results reproducible -- confirm
                    # which covariance is intended.
                    Xk = np.random.multivariate_normal(X_means[str(scenario)][k]*np.ones(d),sigmat,size=nH)
                    # the first four covariates are dichotomised at their mean
                    Xk[:,:4] = np.array(Xk[:,:4]>X_means[str(scenario)][k],dtype=float)
                    Tk, deltak, yk, e_t_k, c_t_k, psk_true = surv_y_gen(Xk,thetak,betak,theta_c,
                                                                        alpha=surv_alpha,v=surv_v)

                    histDs.RWDs.append((Xk,Tk,deltak,yk))
                    histDs.thetas.append(thetak)
                    histDs.betas.append(betak)
                    histDs.ecs.append((e_t_k,c_t_k))
                    # previously initialised but never filled
                    histDs.theta_cs.append(theta_c)
                    histDs.ps_true.append(psk_true)

                histDs_lst.append(histDs)

            np.save(hist_path, np.array(histDs_lst))

        else:

            print('Load Saved Historical Data!')
            histDs_lst = np.load(hist_path,allow_pickle=True)


        # Generate current study
        if not os.path.exists(rct_path):
            print('Generate Current RCT Data!')

            np.random.seed(2021)
            Ds_lst = []

            # Calibrate the censoring parameter once for the current study.
            # The propensity coefficients are all zero here, so ps = 0.5.
            X = np.random.multivariate_normal(np.zeros(d),sigmat,size=n_find_cr)
            Xb = np.dot(X,np.zeros(d))
            ps = sigmoid(Xb)
            T = np.random.binomial(1,ps,ps.shape[0])
            z = theta[0]*T+X.dot(theta[1:])
            S = np.random.rand(n_find_cr)
            e_t = ((-np.log(S)/np.exp(z))*(surv_v**surv_alpha))**(1/surv_alpha)
            theta_c = brute(cen_fun, [(0,np.ceil(e_t.max()))],
                              Ns=100,
                              args=(e_t,cen_ratio),
                              finish=fmin)

            for i in tqdm(range(reps)):
                D = edict()
                D.X = np.random.multivariate_normal(np.zeros(d),sigmat,size=n)
                D.X[:,:4] = np.array(D.X[:,:4]>0,dtype=float)
                D.T, D.delta, D.y, D.e_t, D.c_t, D.ps_true = surv_y_gen(D.X,theta,np.zeros(d),theta_c,
                                                                        alpha=surv_alpha,v=surv_v)
                Ds_lst.append(D)

            np.save(rct_path, np.array(Ds_lst))

        else:

            print('Load Saved RCT Data!')
            Ds_lst = np.load(rct_path,allow_pickle=True)


        # Generate the summary info based on historical data
        if not os.path.exists(summary_path):
            print('Generate Current Summary Data!')
            from balance_methods import *

            numpy2ri.activate()
            pandas2ri.activate()

            # try/finally so the converters are switched back off even when a
            # replicate fails part-way through.
            try:
                summaryDs_lst = []

                for i in tqdm(range(reps)):
                    summaryDs = edict()
                    summaryDs.ps_pred = []
                    summaryDs.beta_hat = []
                    summaryDs.NearMatch = []
                    summaryDs.IPW = []

                    histDs = histDs_lst[i]

                    for k in range(K):
                        Xk,Tk,deltak,yk = histDs.RWDs[k]
                        clf = LogisticRegression()
                        clf.fit(Xk,Tk)
                        psk_pred = clf.predict_proba(Xk)[:,1]
                        betak_hat = np.array(clf.coef_.flatten().tolist())

                        summaryDs.ps_pred.append(psk_pred)
                        summaryDs.beta_hat.append(betak_hat)

                        # nearest matching
                        mdata, summary_fit, bal_out = surv_matchit_wrapper(yk,Tk,deltak,Xk,
                                                                      cov_adj=cov_adj_datgen,
                                                                      method='nearest',
                                                                      estimand='ATT',
                                                                      replace=False)

                        # stored tuple: (number of units with positive weight,
                        #                share of covariates under m_threshold,
                        #                fit summary, balanced data)
                        summaryDs.NearMatch.append(((mdata['weights']>0).sum(),
                                                    (np.sum(np.abs(bal_out[0]['Diff.Adj'][1:]) < m_threshold))/(d),
                                                    summary_fit, mdata))

                        # inverse probability weighting
                        wdata, summary_fit, bal_out = surv_weightit_wrapper(yk,Tk,deltak,Xk,
                                                                            cov_adj=cov_adj_datgen,
                                                                            method='ps',
                                                                            estimand='ATE',
                                                                            link='logit')
                        summaryDs.IPW.append(((wdata['weights']>0).sum(),
                                    (np.sum(np.abs(bal_out[0]['Diff.Adj'][1:]) < m_threshold))/(d),
                                    summary_fit, wdata))

                    summaryDs_lst.append(summaryDs)

                with open(summary_path,"wb") as fh:
                    pickle.dump(summaryDs_lst, fh)
            finally:
                numpy2ri.deactivate()
                pandas2ri.deactivate()

        else:

            print('Load Saved Summary Data!')
            with open(summary_path,"rb") as fh:
                summaryDs_lst = pickle.load(fh)

        for bal_method in ['NearMatch','IPW']:

            print('Balance Methods:',bal_method)

            # NIP
            # The NIP cache name does not include bal_method, so the second
            # pass of this loop reloads the results written during the first.
            if not os.path.exists(nip_path):
                trace_blr_lst = Parallel(n_jobs=num_cores)(delayed(parallel_unit)(i, Ds_lst,
                                                                                   summaryDs_lst,
                                                                                   n_intervals=n_intervals,
                                                                                   bal_method=bal_method,
                                                                                   method='BPH',
                                                                                   cov_adj=cov_adj_pred,
                                                                                   random_state=random_state)
                                                                                   for i in tqdm(range(reps)))

                with open(nip_path,"wb") as fh:
                    pickle.dump(trace_blr_lst, fh)

            else:
                with open(nip_path,"rb") as fh:
                    trace_blr_lst = pickle.load(fh)

            theta_pred_blr_mat = np.array([_exp_theta1_summary(trace_blr_lst[i]) for i in range(reps)])

            # the raw traces are large; release them before the next fit
            del trace_blr_lst
            gc.collect()
            print('NIP Complete!')


            # UIP
            uip_path = save_folder + bal_method + '_scenario{}_uip_results.bin'.format(scenario)

            if not os.path.exists(uip_path):
                trace_uip_lst = Parallel(n_jobs=num_cores)(delayed(parallel_unit)(i, Ds_lst,
                                                                                   summaryDs_lst,
                                                                                   n_intervals=n_intervals,
                                                                                   bal_method=bal_method,
                                                                                   method='UIP',
                                                                                   cov_adj=cov_adj_pred,
                                                                                   random_state=random_state)
                                                                                   for i in tqdm(range(reps)))

                with open(uip_path,"wb") as fh:
                    pickle.dump(trace_uip_lst, fh)

            else:
                with open(uip_path,"rb") as fh:
                    trace_uip_lst = pickle.load(fh)


            theta_pred_uip_mat = np.array([_exp_theta1_summary(trace_uip_lst[i]) for i in range(reps)])
            # per-replicate means of pis and M over the first axis of each trace
            uip_pis_array = np.array([trace_uip_lst[i]['pis'].mean(axis=0) for i in range(reps)])
            uip_M_array = np.array([trace_uip_lst[i]['M'].mean(axis=0) for i in range(reps)])

            del trace_uip_lst
            gc.collect()
            print('UIP Complete!')

            pis_lst.append(uip_pis_array)
            M_lst.append(uip_M_array)

            # saving
            # evaluation metrics
            # row 0 = NIP (no weights / M, hence NaN), row 1 = UIP
            uip_df = pd.DataFrame(np.array([np.nan*np.ones(K),
                               uip_pis_array.mean(axis=0)]),columns=['$w_{}$'.format(i+1) for i in range(K)])
            uip_df['$M$'] = [np.nan,
                        uip_M_array.mean(axis=0)]

            # estimates are on the exp(theta1) scale, so compare against exp(theta[0])
            true_effect = np.exp(theta[0])
            metrics_array = np.array([method_eval(theta_pred_blr_mat,true_effect),
                                  method_eval(theta_pred_uip_mat,true_effect)])
            metrics_df = pd.DataFrame(metrics_array,columns=['Bias','RMSE','CI Width','CI Coverage'])
            metrics_df['Method'] = ['NIP','UIP']
            metrics_df['Case'] = 'Scenario {}'.format(scenario)
            metrics_df['Bal_Method'] = bal_method
            metrics_df['\\alpha'] = surv_alpha
            metrics_df = pd.concat([metrics_df,uip_df],axis=1)

            metrics_indexed = metrics_df.set_index(['Case','\\alpha','Method','Bal_Method'])
            print(metrics_indexed)
            metrics_indexed.to_csv(save_folder+bal_method+'_metric_df.csv')

            metrics_dfs.append(metrics_df)

Data Generation

--------------- Scenario 1 Surv_alpha 0.5 ---------------
Load Saved Historical Data!
Load Saved RCT Data!
Load Saved Summary Data!
Balance Methods: NearMatch
NIP Complete!
UIP Complete!
                                         Bias      RMSE  CI Width  \
Case       \alpha Method Bal_Method                                 
Scenario 1 0.5    NIP    NearMatch   0.697987  0.992461  2.332536   
                  UIP    NearMatch   0.515802  0.729699  2.018301   

                                     CI Coverage     $w_1$     $w_2$  \
Case       \alpha Method Bal_Method                                    
Scenario 1 0.5    NIP    NearMatch         0.738       NaN       NaN   
                  UIP    NearMatch         0.838  0.321344  0.336368   

                                        $w_3$         $M$  
Case       \alpha Method Bal_Method                        
Scenario 1 0.5    NIP    NearMatch        NaN         NaN  
                  UIP    NearMatch   0.342288  107

In [19]:
# Stack the per-run metric frames collected by the simulation loop, then
# display them keyed by scenario, alpha, method and balance method.
metrics_all = pd.concat(metrics_dfs)
(
    metrics_all
    .set_index(['Case','\\alpha','Method','Bal_Method'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Bias,RMSE,CI Width,CI Coverage,$w_1$,$w_2$,$w_3$,$M$
Case,\alpha,Method,Bal_Method,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Scenario 1,0.5,NIP,NearMatch,0.697987,0.992461,2.332536,0.738,,,,
Scenario 1,0.5,UIP,NearMatch,0.515802,0.729699,2.018301,0.838,0.321344,0.336368,0.342288,107.450036
Scenario 1,0.5,NIP,IPW,0.697987,0.992461,2.332536,0.738,,,,
Scenario 1,0.5,UIP,IPW,0.528438,0.743102,2.031844,0.832,0.323696,0.338573,0.337731,107.901002
Scenario 1,1.0,NIP,NearMatch,0.321025,0.636225,2.035587,0.912,,,,
Scenario 1,1.0,UIP,NearMatch,0.24897,0.475368,1.776573,0.954,0.324241,0.336134,0.339625,111.480167
Scenario 1,1.0,NIP,IPW,0.321025,0.636225,2.035587,0.912,,,,
Scenario 1,1.0,UIP,IPW,0.262737,0.491579,1.800424,0.958,0.327543,0.3386,0.333857,111.839986
Scenario 1,2.0,NIP,NearMatch,0.0207,0.449316,1.824796,0.958,,,,
Scenario 1,2.0,UIP,NearMatch,0.026638,0.338954,1.624806,0.994,0.32334,0.336632,0.340029,112.481135


In [20]:
# Persist the combined survival-outcome metrics table, keyed by
# (Case, alpha, Method, Bal_Method), under this run's dated results folder.
(
    metrics_all
    .set_index(['Case','\\alpha','Method','Bal_Method'])
    .to_csv('./results/{}S_outcome/metric_df.csv'.format(date))
)