This notebook performs comparisons among different benchmark methods.

## Perform Simulation for Benchmark Methods

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from benchmarks import *
from joblib import Parallel,delayed
from sklearn.preprocessing import StandardScaler

In [2]:
folder_name = '0415ben_simucpr/'

In [3]:
def parallel_unit(i,r,scenario):

    # data path
    data_path = './save/simu_data/'+scenario+'/'
    
    # load the data
    data_full_path = data_path + 'd' + str(i) + '.npy'
    dat = np.load(data_full_path,allow_pickle=True)

    # unzip the data
    x = dat.item()['x']
    z = dat.item()['z']
    y1 = dat.item()['y1']
    y2 = dat.item()['y2']
    y3 = dat.item()['y3']
    
    nt = int(z.sum())
    x = x[:nt*(1+r)]
    z = z[:nt*(1+r)]
    y1 = y1[:nt*(1+r)]
    y2 = y2[:nt*(1+r)]
    y3 = y3[:nt*(1+r)]

    save_folder = './save/'+folder_name+scenario+'/'+'r='+str(r)+'/'+str(i)+'/'

    if not os.path.exists(save_folder):
        print('Creat the folder.')
        os.makedirs(save_folder)
    
    # PSM
    if not os.path.exists(save_folder+'match_psm.csv'):
        print('Perform Matching (PSM).')
        # fit logistic model for propensity scores
        _, ps_score = glm_wrapper(x,z)

        # tau
        tau_match_psm = [] 
        for y in [y1,y2,y3]:
            tau_match_psm.append(match_wrapper(y,z,ps_score,None,False))
        tau_match_psm = np.array(tau_match_psm)

        df_match_psm = pd.DataFrame(tau_match_psm,
                                    columns=['tauhat'])
        df_match_psm['method'] = 'PSM'

        df_match_psm.to_csv(save_folder+'match_psm.csv',index=False)

    else:
        print('Skip! Matching (PSM) has been performed.')
    
    # IPW
    if not os.path.exists(save_folder+'ipw.csv'):
        print('Perform IPW.')
        
        # tau
        tau_ipw = [] 
        for y in [y1,y2,y3]:
            tau_ipw.append(ipw_wrapper(y,z,x))

        df_ipw = pd.DataFrame(tau_ipw,
                               columns=['tauhat'])
        df_ipw['method'] = 'IPW'
        
        df_ipw.to_csv(save_folder+'ipw.csv',index=False)
        
    else:
        print('Skip! IPW has been performed.')

    # EBCW
    if not os.path.exists(save_folder+'ebcw.csv'):
        print('Perform EBCW.')
        
        tau_ate = [] 
        for y in [y1,y2,y3]:
            tau_ate.append(ate_wrapper(y,z,x))

        df_ate = pd.DataFrame(tau_ate,columns=['tauhat'])
        df_ate['method'] = 'EBCW'
        
        df_ate.to_csv(save_folder+'ebcw.csv',index=False)
        
    else:
        print('Skip! EBCW has been performed.')
        
    # EBAL
    if not os.path.exists(save_folder+'ebal.csv'):
        print('Perform EBAL.')
        
        tau_ebal = [] 
        for y in [y1,y2,y3]:
            tau_ebal.append(ebal_wrapper(y,z,x))

        df_ebal = pd.DataFrame(tau_ebal,columns=['tauhat'])
        df_ebal['method'] = 'EBAL'
        
        df_ebal.to_csv(save_folder+'ebal.csv',index=False)
        
    else:
        print('Skip! EBAL has been performed.')
    
    # SBW
    if not os.path.exists(save_folder+'sbw.csv'):
        print('Perform SBW.')
        
        tau_sbw = [] 
        for y in [y1,y2,y3]:
            tau_sbw.append(optweight_wrapper(y,z,x))

        df_sbw = pd.DataFrame(tau_sbw,columns=['tauhat'])
        df_sbw['method'] = 'SBW'
        
        df_sbw.to_csv(save_folder+'sbw.csv',index=False)
    else:
        print('Skip! CBPS has been performed.')
    
    # FM
    if not os.path.exists(save_folder+'fm.csv'):
        tau_fm = [] 
        for y in [y1,y2,y3]:
            tau_fm.append(matchit_wrapper(y,z,x))
            
        df_fm = pd.DataFrame(tau_fm,columns=['tauhat'])
        df_fm['method'] = 'FM'
        
        df_fm.to_csv(save_folder+'fm.csv',index=False)
    else:
        print('Skip! FM has been performed.')

In [4]:
n_kernel = 40
n_data = 200
rs = [1,2]
scenarios = ['scenario1','scenario2','scenario3']

In [5]:
for r in rs:
    for scenario in scenarios:
        print('------------- Simulation Comparisons:',scenario,'r =',r,'-------------')
        Parallel(n_jobs=n_kernel)(delayed(parallel_unit)(i=i,r=r,scenario=scenario) for i in tqdm(range(n_data)))

------------- Simulation Comparisons: scenario1 r = 1 -------------


100%|█████████████████████████████████████████| 200/200 [00:07<00:00, 25.34it/s]


Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been perfo

100%|███████████████████████████████████████| 200/200 [00:00<00:00, 1279.64it/s]


------------- Simulation Comparisons: scenario3 r = 1 -------------


100%|███████████████████████████████████████| 200/200 [00:00<00:00, 1294.14it/s]


------------- Simulation Comparisons: scenario1 r = 2 -------------


100%|███████████████████████████████████████| 200/200 [00:00<00:00, 1311.45it/s]


------------- Simulation Comparisons: scenario2 r = 2 -------------


100%|███████████████████████████████████████| 200/200 [00:00<00:00, 1329.16it/s]

Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been perfo




------------- Simulation Comparisons: scenario3 r = 2 -------------


  0%|                                                   | 0/200 [00:00<?, ?it/s]

Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been performed.
Skip! IPW has been performed.
Skip! EBCW has been performed.
Skip! EBAL has been performed.
Skip! CBPS has been performed.
Skip! FM has been performed.
Skip! Matching (PSM) has been perfo

100%|███████████████████████████████████████| 200/200 [00:00<00:00, 1273.87it/s]


## Generate Tables

In [6]:
def mcse_cal_fun(array_val,tau):
    # array value
    n_sim = array_val.shape[0]

    # bias MCSE
    bias_mcse = np.sqrt(np.var(array_val)/n_sim)

    # RMSE MCSE using asymptotic property
    # using Delta method and the formula in 
    # https://cran.r-project.org/web/packages/simhelpers/vignettes/MCSE.html
    array_sq_val = (array_val-tau)**2
    mse = array_sq_val.mean()
    rmse_mcse = np.sqrt(np.sum((array_sq_val-mse)**2)/((n_sim-1)*4*mse*n_sim))

    # # RMSE MCSE using bootstrapping
    # from sklearn.utils import resample
    # B = 1000 # bootstrapping iteration
    # rmse_boot_array = []
    # for b in range(B):
    #     array_val_boot = resample(array_val,
    #                               n_samples=n_sim,
    #                               replace=1)
    #     rmse_boot_array.append(np.sqrt(((array_val_boot-tau)**2).mean()))
    # rmse_boot_array = np.array(rmse_boot_array)
    # rmse_mcse = np.sqrt(np.var(rmse_boot_array))

    return bias_mcse, rmse_mcse

def summary_table_gen(dat_array,tau,method):
    
    
    bias = dat_array[:,:].mean(axis=0)-tau
    rmse = np.sqrt(((dat_array[:,:]-tau)**2).mean(axis=0))
    mcse = np.apply_along_axis(func1d=mcse_cal_fun,
                        arr=dat_array,
                        axis=0,tau=tau)

    df = pd.DataFrame({'Outcome':['Linear','Nonlinear1','Nonlinear2'],
                   'Bias':bias,
                   'Bias_MCSE':mcse[0,:],
                   'RMSE':rmse,
                   'RMSE_MCSE':mcse[1,:],
                   'Method':method})

    return df

In [7]:
save_file = './save/'
tau = 1

In [8]:
sum_df_lst = []

for r in [1,2]:
    for scenario in ['scenario1','scenario2','scenario3']:

        save_benchmark_files = save_file + folder_name + scenario+'/r='+str(r)+'/'

        ipw_array = []
        fm_array = []
        match_psm_array = []
        ebal_array = []
        sbw_array = []
        ebcb_array = []

        for i in tqdm(range(200)):
            ipw_df = pd.read_csv(save_benchmark_files + str(i) + '/ipw.csv')
            match_psm_df = pd.read_csv(save_benchmark_files + str(i) + '/match_psm.csv')
            fm_df = pd.read_csv(save_benchmark_files + str(i) + '/fm.csv')
            ebcb_df = pd.read_csv(save_benchmark_files + str(i) + '/ebcw.csv')
            ebal_df = pd.read_csv(save_benchmark_files + str(i) + '/ebal.csv')
            sbw_df = pd.read_csv(save_benchmark_files + str(i) + '/sbw.csv')

            ipw = ipw_df.iloc[:3,0].values
            match_psm = match_psm_df.iloc[:3,0].values
            fm = fm_df.iloc[:3,0].values
            ebcb = ebcb_df.iloc[:3,0].values
            ebal = ebal_df.iloc[:3,0].values
            sbw = sbw_df.iloc[:3,0].values

            ipw_array.append(ipw)
            match_psm_array.append(match_psm)
            fm_array.append(fm)
            ebal_array.append(ebal)
            sbw_array.append(sbw)
            ebcb_array.append(ebcb)


        ipw_array = np.array(ipw_array)
        match_psm_array = np.array(match_psm_array)
        fm_array = np.array(fm_array)
        ebal_array = np.array(ebal_array)
        sbw_array = np.array(sbw_array)
        ebcb_array = np.array(ebcb_array)
        
        ipw_sum_df = summary_table_gen(ipw_array,tau,'IPW')
        match_psm_sum_df = summary_table_gen(match_psm_array,tau,'PSM')
        fm_sum_df = summary_table_gen(fm_array,tau,'FM')
        ebal_sum_df = summary_table_gen(ebal_array,tau,'EBAL')
        ebcb_sum_df = summary_table_gen(ebcb_array,tau,'EBCW')
        sbw_sum_df = summary_table_gen(sbw_array,tau,'SBW')
        
        sum_df = pd.concat([ipw_sum_df,
                  match_psm_sum_df,fm_sum_df,ebal_sum_df,sbw_sum_df,ebcb_sum_df])

        sum_df['Scenario'] = scenario
        sum_df['r'] = r

        sum_df = sum_df.reset_index(drop=True)
        
        sum_df_lst.append(sum_df)

100%|████████████████████████████████████████| 200/200 [00:01<00:00, 187.63it/s]
100%|████████████████████████████████████████| 200/200 [00:01<00:00, 194.83it/s]
100%|████████████████████████████████████████| 200/200 [00:01<00:00, 199.16it/s]
100%|████████████████████████████████████████| 200/200 [00:01<00:00, 197.92it/s]
100%|████████████████████████████████████████| 200/200 [00:01<00:00, 198.14it/s]
100%|████████████████████████████████████████| 200/200 [00:01<00:00, 197.66it/s]


In [9]:
sum_df_pivot = pd.concat(sum_df_lst)
sum_df_pivot

Unnamed: 0,Outcome,Bias,Bias_MCSE,RMSE,RMSE_MCSE,Method,Scenario,r
0,Linear,0.039949,0.029915,0.424949,0.032395,IPW,scenario1,1
1,Nonlinear1,0.077329,0.040808,0.582269,0.039945,IPW,scenario1,1
2,Nonlinear2,0.246775,0.250849,3.556107,0.491634,IPW,scenario1,1
3,Linear,0.162419,0.027155,0.416969,0.022757,PSM,scenario1,1
4,Nonlinear1,0.244808,0.039765,0.613333,0.028083,PSM,scenario1,1
...,...,...,...,...,...,...,...,...
13,Nonlinear1,-0.121241,0.013865,0.230532,0.010703,SBW,scenario3,2
14,Nonlinear2,-6.955099,0.377884,8.771124,0.439215,SBW,scenario3,2
15,Linear,0.001152,0.006284,0.088879,0.004873,EBCW,scenario3,2
16,Nonlinear1,-0.031720,0.013753,0.197071,0.008882,EBCW,scenario3,2


## Combine Tables with QReR

In [10]:
sum_df_qrer = pd.read_csv('./save/0415qrer_pate/qrer_pate.csv')
sum_df_qrer = sum_df_qrer.drop(columns=['pa']).sort_values(by=['r','Scenario','Outcome']).reset_index(drop=True)
sum_df_qrer

Unnamed: 0,r,Scenario,Outcome,Bias,Bias_MCSE,RMSE,RMSE_MCSE,Method
0,1,scenario1,Linear,-0.02233,0.008244,0.118707,0.005318,QReR-M
1,1,scenario1,Nonlinear1,-0.033328,0.01348,0.193523,0.008733,QReR-M
2,1,scenario1,Nonlinear2,0.02493,0.144175,2.039099,0.125231,QReR-M
3,1,scenario2,Linear,-0.031358,0.007944,0.116637,0.005488,QReR-M
4,1,scenario2,Nonlinear1,-0.122774,0.014756,0.242114,0.010876,QReR-M
5,1,scenario2,Nonlinear2,4.875077,0.356514,7.013328,0.444806,QReR-M
6,1,scenario3,Linear,-0.028594,0.007887,0.115144,0.005666,QReR-M
7,1,scenario3,Nonlinear1,-0.133249,0.015407,0.2554,0.011644,QReR-M
8,1,scenario3,Nonlinear2,-6.112064,0.315123,7.564246,0.322803,QReR-M
9,2,scenario1,Linear,-0.016963,0.007217,0.103461,0.005018,QReR-M


In [11]:
sum_df_all = pd.concat([sum_df_pivot,sum_df_qrer]).sort_values(by=['r','Scenario','Outcome']).reset_index(drop=True)
sum_df_all = sum_df_all.set_index(['r','Scenario','Outcome','Method'])
sum_df_all.to_csv('./save/pate_cpr.csv')

In [12]:
sum_df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Bias,Bias_MCSE,RMSE,RMSE_MCSE
r,Scenario,Outcome,Method,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,scenario1,Linear,IPW,0.039949,0.029915,0.424949,0.032395
1,scenario1,Linear,PSM,0.162419,0.027155,0.416969,0.022757
1,scenario1,Linear,FM,0.155569,0.026217,0.402081,0.022412
1,scenario1,Linear,EBAL,-0.002284,0.007524,0.106430,0.004827
1,scenario1,Linear,SBW,-0.003231,0.007391,0.104578,0.004755
...,...,...,...,...,...,...,...
2,scenario3,Nonlinear2,FM,-8.278990,0.420735,10.195357,0.498106
2,scenario3,Nonlinear2,EBAL,-8.151318,0.419759,10.083827,0.633938
2,scenario3,Nonlinear2,SBW,-6.955099,0.377884,8.771124,0.439215
2,scenario3,Nonlinear2,EBCW,-8.151585,0.419758,10.084033,0.633966
