- 0329: rerun the simulation and add the summary table generation
- 0330: update the inference method for the matchit wrapper
- 0401: update the results and combine the outcome from QReR
- 0413: update the se calculation of IPW1
- 0430: update the network
- 0509: add stable weighting

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from benchmarks import *
from sklearn.preprocessing import StandardScaler

In [2]:
from joblib import Parallel,delayed

In [3]:
# Sub-folder of ./save/ used for this run's benchmark results; parallel_unit
# writes below it and the summary cells read from it.
folder_name = '0525simucpr/'

In [4]:
def parallel_unit(i,r,scenario):
    """Run every enabled benchmark estimator on one simulated data set.

    The data set is read from ``./save/simu_data/<scenario>/d<i>.npy`` and
    truncated to its first ``nt*(1+r)`` rows, where ``nt = int(z.sum())``.
    Each benchmark is applied to the three outcomes ``y1``, ``y2``, ``y3`` and
    its results are written as a CSV with columns ``tauhat``, ``95CI_lb``,
    ``95CI_ub`` and ``method`` under
    ``./save/<folder_name><scenario>/r=<r>/<i>/``.  A benchmark whose CSV
    already exists is skipped, so an interrupted run can be resumed.

    Parameters
    ----------
    i : int
        Index of the simulated data set (selects ``d<i>.npy`` and the output
        sub-folder).
    r : int
        Controls how many rows are kept: ``nt*(1+r)``.
        NOTE(review): presumably the control-to-treated ratio with treated
        rows stored first -- confirm against the data generator.
    scenario : str
        Name of the scenario sub-folder.

    Returns
    -------
    None
        Results are only written to disk.
    """
    # data path
    data_path = './save/simu_data/'+scenario+'/'

    # load the data
    # NOTE: allow_pickle=True unpickles arbitrary objects; only load files
    # produced by the project's own simulation code.
    data_full_path = data_path + 'd' + str(i) + '.npy'
    dat = np.load(data_full_path,allow_pickle=True).item()  # unwrap the dict once

    # keep the first nt*(1+r) rows of every array
    n_keep = int(dat['z'].sum())*(1+r)
    x = dat['x'][:n_keep]
    z = dat['z'][:n_keep]
    outcomes = [dat[key][:n_keep] for key in ('y1','y2','y3')]

    save_folder = './save/'+folder_name+scenario+'/'+'r='+str(r)+'/'+str(i)+'/'

    if not os.path.exists(save_folder):
        print('Creat the folder.')
    # exist_ok: do not crash if the folder appears between the check and here
    os.makedirs(save_folder,exist_ok=True)

    columns = ['tauhat','95CI_lb','95CI_ub']

    def run_if_missing(file_name,label,method,estimate):
        # Fit one benchmark and save its table unless the CSV already exists.
        # `estimate` is called lazily so skipped benchmarks cost nothing.
        target = save_folder+file_name
        if os.path.exists(target):
            print('Skip! '+label+' has been performed.')
            return
        print('Perform '+label+'.')
        df = pd.DataFrame(estimate(),columns=columns)
        df['method'] = method
        df.to_csv(target,index=False)

    def match_mnm():
        # fit logistic model for propensity scores, then match on them
        _, ps_score = glm_wrapper(x,z)
        return np.array([match_wrapper(y,z,ps_score,None,False) for y in outcomes])

    def per_outcome(wrapper):
        # Apply a (y, z, x) benchmark wrapper to each of the three outcomes.
        return lambda: [wrapper(y,z,x) for y in outcomes]

    # (file name, console label, value of the 'method' column, estimator)
    run_if_missing('match_mnm.csv','Matching(M-N-m)','M-N-m',match_mnm)
    run_if_missing('ipw.csv','IPW','IPW1',per_outcome(ipw1_wrapper))
    run_if_missing('ebcw.csv','EBCW','EBCW',per_outcome(ate_wrapper))
    run_if_missing('ebal.csv','EBAL','EBAL',per_outcome(ebal_wrapper))
    run_if_missing('sbw.csv','SBW','SBW',per_outcome(optweight_wrapper))
    run_if_missing('fm.csv','FM','FM',per_outcome(matchit_wrapper))
    # alternative FM call kept for reference: matchit_wrapper(y,z,x,'full','mahalanobis')

    # Disabled benchmarks, kept for reference (file, method label, call):
    #   match_mcm.csv  'M-C-m'  match_wrapper(y,z,ps_score,x,True)
    #   dr.csv         'DR'     dr_wrapper(y,z,x)
    #   cbps.csv       'CBPS'   cbps_wrapper(y,z,x)
        
#     # Nonparametric CBPS
#     if not os.path.exists(save_folder+'npcbps.csv'):
#         print('Perform NPCBPS.')
        
#         tau_npcbps = [] 
#         for y in [y1,y2,y3]:
#             tau_npcbps.append(cbps_wrapper(y,z,x,cbps_type='npcbps'))

#         df_npcbps = pd.DataFrame(tau_npcbps,columns=['tauhat','95CI_lb','95CI_ub'])
#         df_npcbps['method'] = 'NPCBPS'
        
#         df_npcbps.to_csv(save_folder+'npcbps.csv',index=False)
#     else:
#         print('Skip! NPCBPS has been performed.')

In [5]:
# number of parallel jobs (passed to joblib as n_jobs)
n_kernel = 40
# number of simulated data sets processed for each (r, scenario) setting
n_data = 200

In [6]:
# values of r and scenario folder names swept by the simulation loop
rs = [1,2]
scenarios = ['scenario1','scenario2','scenario3']

In [7]:
# Run all benchmarks for every (r, scenario) pair, one joblib task per data set.
for r in rs:
    for scenario in scenarios:
        print('------------- Simulation Comparisons:',scenario,'r =',r,'-------------')
        # tqdm wraps the task generator, so the bar advances as tasks are dispatched
        tasks = (delayed(parallel_unit)(i=i,r=r,scenario=scenario) for i in tqdm(range(n_data)))
        Parallel(n_jobs=n_kernel)(tasks)

  0%|          | 0/200 [00:00<?, ?it/s]

------------- Simulation Comparisons: scenario1 r = 1 -------------


100%|██████████| 200/200 [00:19<00:00, 10.17it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

------------- Simulation Comparisons: scenario2 r = 1 -------------


100%|██████████| 200/200 [00:09<00:00, 21.17it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

------------- Simulation Comparisons: scenario3 r = 1 -------------


100%|██████████| 200/200 [00:09<00:00, 20.87it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

------------- Simulation Comparisons: scenario1 r = 2 -------------


100%|██████████| 200/200 [00:15<00:00, 12.85it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

------------- Simulation Comparisons: scenario2 r = 2 -------------


100%|██████████| 200/200 [00:16<00:00, 12.39it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

------------- Simulation Comparisons: scenario3 r = 2 -------------


100%|██████████| 200/200 [00:15<00:00, 13.02it/s]


In [8]:
def summary_table_gen(dat_array,tau,method):
    """Summarise replicated estimates and confidence intervals per outcome.

    Parameters
    ----------
    dat_array : array-like, shape (n_rep, 3, >=3)
        Axis 0 indexes replicates and axis 1 the three outcomes.  On the last
        axis, entry 0 is the point estimate, entry 1 the lower CI bound and
        entry 2 the upper CI bound.
    tau : float
        Reference value the estimates and intervals are compared against.
    method : str
        Label written to the ``Method`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per outcome (``Linear``, ``Nonlinear1``, ``Nonlinear2``) with
        columns ``Outcome``, ``Bias``, ``RMSE``, ``CI Covarage``, ``CI Width``
        and ``Method``.  All statistics are averages over the replicates.
    """
    dat_array = np.asarray(dat_array)  # also accept nested lists
    est = dat_array[:,:,0]
    lower = dat_array[:,:,1]
    upper = dat_array[:,:,2]

    bias = est.mean(axis=0)-tau
    rmse = np.sqrt(((est-tau)**2).mean(axis=0))
    # share of replicates whose interval [lower, upper] contains tau
    covarage = ((lower<=tau)&(upper>=tau)).mean(axis=0)
    width = (upper-lower).mean(axis=0)

    # The 'CI Covarage' spelling is kept on purpose: it is the column name
    # used by the tables this one is later concatenated with.
    df = pd.DataFrame({'Outcome':['Linear','Nonlinear1','Nonlinear2'],
                   'Bias':bias,
                   'RMSE':rmse,
                   'CI Covarage':covarage,
                   'CI Width':width,
                   'Method':method})
    return df

In [9]:
# accumulator for the per-(scenario, r) summary tables
sum_df_lst = []
# root folder of all saved results
save_file = './save/'
# reference value passed as `tau` to summary_table_gen
# NOTE(review): presumably the true treatment effect of the simulation -- confirm
tau = 1

In [18]:
# Reload the raw per-replicate benchmark tables for the selected (r, scenario)
# into (200, 3, 3) arrays for inspection.
for r in [1]:
    for scenario in ['scenario3']:
        # Build the folder from the loop variables.  Without this line the
        # cell would silently read whatever folder an earlier cell left in
        # `save_benchmark_files` (or fail with NameError on a fresh kernel).
        save_benchmark_files = save_file + folder_name + scenario+'/r='+str(r)+'/'

        ipw_array = []
        fm_array = []
        match_mnm_array = []
        ebal_array = []
        sbw_array = []
        ebcb_array = []

        for i in tqdm(range(200)):
            ipw_df = pd.read_csv(save_benchmark_files + str(i) + '/ipw.csv')
            match_mnm_df = pd.read_csv(save_benchmark_files + str(i) + '/match_mnm.csv')
            fm_df = pd.read_csv(save_benchmark_files + str(i) + '/fm.csv')
            ebcb_df = pd.read_csv(save_benchmark_files + str(i) + '/ebcw.csv')
            ebal_df = pd.read_csv(save_benchmark_files + str(i) + '/ebal.csv')
            sbw_df = pd.read_csv(save_benchmark_files + str(i) + '/sbw.csv')

            # first three rows (outcomes) x first three columns
            # (tauhat, 95CI_lb, 95CI_ub); the 'method' column is dropped
            ipw = ipw_df.iloc[:3,:3].values
            match_mnm = match_mnm_df.iloc[:3,:3].values
            fm = fm_df.iloc[:3,:3].values
            ebcb = ebcb_df.iloc[:3,:3].values
            ebal = ebal_df.iloc[:3,:3].values
            sbw = sbw_df.iloc[:3,:3].values

            ipw_array.append(ipw)
            match_mnm_array.append(match_mnm)
            fm_array.append(fm)
            ebal_array.append(ebal)
            sbw_array.append(sbw)
            ebcb_array.append(ebcb)


        # stack the replicates: axis 0 = replicate, axis 1 = outcome
        ipw_array = np.array(ipw_array)
        match_mnm_array = np.array(match_mnm_array)
        fm_array = np.array(fm_array)
        ebal_array = np.array(ebal_array)
        sbw_array = np.array(sbw_array)
        ebcb_array = np.array(ebcb_array)

100%|██████████| 200/200 [00:02<00:00, 97.62it/s] 


In [10]:
# For every (r, scenario) pair: load the 200 per-replicate benchmark tables,
# summarise each method with summary_table_gen and append the combined table
# to sum_df_lst.
for r in [1,2]:
    for scenario in ['scenario1','scenario2','scenario3']:

        # folder holding one sub-folder per replicate for this setting
        save_benchmark_files = save_file + folder_name + scenario+'/r='+str(r)+'/'

        # one accumulator per benchmark method
        # (the QRWG results used to be collected here as well; that code is disabled)
        ipw_array, fm_array, match_mnm_array = [], [], []
        ebal_array, sbw_array, ebcb_array = [], [], []

        for i in tqdm(range(200)):
            # Each method: read its CSV, keep the 3x3 block
            # (three outcomes x tauhat/95CI_lb/95CI_ub), store it.

            # IPW
            ipw_df = pd.read_csv(save_benchmark_files + str(i) + '/ipw.csv')
            ipw = ipw_df.iloc[:3,:3].values
            ipw_array.append(ipw)

            # propensity-score matching (M-N-m)
            match_mnm_df = pd.read_csv(save_benchmark_files + str(i) + '/match_mnm.csv')
            match_mnm = match_mnm_df.iloc[:3,:3].values
            match_mnm_array.append(match_mnm)

            # FM
            fm_df = pd.read_csv(save_benchmark_files + str(i) + '/fm.csv')
            fm = fm_df.iloc[:3,:3].values
            fm_array.append(fm)

            # EBCW
            ebcb_df = pd.read_csv(save_benchmark_files + str(i) + '/ebcw.csv')
            ebcb = ebcb_df.iloc[:3,:3].values
            ebcb_array.append(ebcb)

            # EBAL
            ebal_df = pd.read_csv(save_benchmark_files + str(i) + '/ebal.csv')
            ebal = ebal_df.iloc[:3,:3].values
            ebal_array.append(ebal)

            # SBW
            sbw_df = pd.read_csv(save_benchmark_files + str(i) + '/sbw.csv')
            sbw = sbw_df.iloc[:3,:3].values
            sbw_array.append(sbw)

        # stack the replicates and summarise, method by method
        ipw_array = np.array(ipw_array)
        ipw_sum_df = summary_table_gen(ipw_array,tau,'IPW')

        match_mnm_array = np.array(match_mnm_array)
        match_mnm_sum_df = summary_table_gen(match_mnm_array,tau,'PSM')

        fm_array = np.array(fm_array)
        fm_sum_df = summary_table_gen(fm_array,tau,'FM')

        ebal_array = np.array(ebal_array)
        ebal_sum_df = summary_table_gen(ebal_array,tau,'EBAL')

        sbw_array = np.array(sbw_array)
        sbw_sum_df = summary_table_gen(sbw_array,tau,'SBW')

        ebcb_array = np.array(ebcb_array)
        ebcb_sum_df = summary_table_gen(ebcb_array,tau,'EBCW')

        # combine the methods into one table with a fresh 0..n-1 index
        sum_df = pd.concat(
            [ipw_sum_df,match_mnm_sum_df,fm_sum_df,ebal_sum_df,sbw_sum_df,ebcb_sum_df],
            ignore_index=True,
        )
        sum_df['Scenario'] = scenario
        sum_df['r'] = r

        sum_df_lst.append(sum_df)

100%|██████████| 200/200 [00:01<00:00, 108.11it/s]
100%|██████████| 200/200 [00:01<00:00, 107.38it/s]
100%|██████████| 200/200 [00:01<00:00, 114.10it/s]
100%|██████████| 200/200 [00:01<00:00, 102.96it/s]
100%|██████████| 200/200 [00:01<00:00, 109.62it/s]
100%|██████████| 200/200 [00:01<00:00, 110.50it/s]


In [11]:
# Stack the per-(scenario, r) summary tables into one frame; the row labels
# of each table are kept, so they repeat across tables.
sum_df_pivot = pd.concat(sum_df_lst)
sum_df_pivot

Unnamed: 0,Outcome,Bias,RMSE,CI Covarage,CI Width,Method,Scenario,r
0,Linear,0.039949,0.424949,1.00,4.152484,IPW,scenario1,1
1,Nonlinear1,0.077329,0.582269,1.00,5.618629,IPW,scenario1,1
2,Nonlinear2,0.246775,3.556107,0.97,13.108974,IPW,scenario1,1
3,Linear,0.162419,0.416969,1.00,2.512527,PSM,scenario1,1
4,Nonlinear1,0.244808,0.613333,1.00,3.659448,PSM,scenario1,1
...,...,...,...,...,...,...,...,...
13,Nonlinear1,-0.113795,0.223939,1.00,3.871902,SBW,scenario3,2
14,Nonlinear2,-7.064268,8.883050,0.65,18.267944,SBW,scenario3,2
15,Linear,0.002554,0.087754,0.94,0.346819,EBCW,scenario3,2
16,Nonlinear1,-0.022089,0.190161,0.94,0.715940,EBCW,scenario3,2


In [15]:
# Load the QReR summary table, drop its 'pa' column and order the rows by
# (r, Scenario, Outcome) with a fresh 0..n-1 index.
sum_df_qrer = pd.read_csv('./save/0525qrer_sp/qrwg_sp.csv')
sum_df_qrer = sum_df_qrer.drop(columns=['pa'])
sum_df_qrer = sum_df_qrer.sort_values(by=['r','Scenario','Outcome'])
sum_df_qrer = sum_df_qrer.reset_index(drop=True)
sum_df_qrer

Unnamed: 0,r,Scenario,Outcome,Bias,RMSE,CI Covarage,CI Width,Method
0,1,scenario1,Linear,0.000886,0.362929,1.0,2.433858,QReR-S1
1,1,scenario1,Linear,-0.024032,0.117627,1.0,2.205995,QReR-S2
2,1,scenario1,Linear,0.024394,0.278415,1.0,2.409702,QReR-S3
3,1,scenario1,Nonlinear1,-0.014162,0.515248,1.0,3.501385,QReR-S1
4,1,scenario1,Nonlinear1,-0.035941,0.192624,1.0,3.171703,QReR-S2
5,1,scenario1,Nonlinear1,0.02707,0.407236,1.0,3.468821,QReR-S3
6,1,scenario1,Nonlinear2,0.052929,2.336547,0.95,8.186365,QReR-S1
7,1,scenario1,Nonlinear2,0.016747,2.028463,0.96,7.357344,QReR-S2
8,1,scenario1,Nonlinear2,0.062248,2.251676,0.955,8.047511,QReR-S3
9,1,scenario2,Linear,0.006204,0.431706,0.995,2.97281,QReR-S1


In [16]:
# Merge the benchmark and QReR summaries, order the rows by
# (r, Scenario, Outcome), index them by (r, Scenario, Outcome, Method) and save.
sum_df_all = pd.concat([sum_df_pivot,sum_df_qrer])
sum_df_all = sum_df_all.sort_values(by=['r','Scenario','Outcome'])
sum_df_all = sum_df_all.reset_index(drop=True)
sum_df_all = sum_df_all.set_index(['r','Scenario','Outcome','Method'])
sum_df_all.to_csv('./save/sp_cpr.csv')

In [17]:
# display the combined table that was written to ./save/sp_cpr.csv
sum_df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Bias,RMSE,CI Covarage,CI Width
r,Scenario,Outcome,Method,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,scenario1,Linear,IPW,0.039949,0.424949,1.000,4.152484
1,scenario1,Linear,PSM,0.162419,0.416969,1.000,2.512527
1,scenario1,Linear,FM,0.145307,0.397008,0.995,2.397569
1,scenario1,Linear,EBAL,-0.002284,0.106430,1.000,2.498832
1,scenario1,Linear,SBW,-0.003231,0.104578,1.000,2.321471
...,...,...,...,...,...,...,...
2,scenario3,Nonlinear2,SBW,-7.064268,8.883050,0.650,18.267944
2,scenario3,Nonlinear2,EBCW,-8.466055,10.930968,0.560,19.089203
2,scenario3,Nonlinear2,QReR-S1,-4.507679,6.150922,0.750,14.744702
2,scenario3,Nonlinear2,QReR-S2,-4.706129,5.893102,0.690,13.366878
