In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [2]:
import rpy2
from rpy2 import robjects as ro
from rpy2.robjects import Formula
from rpy2.robjects.packages import importr
from rpy2.robjects import numpy2ri, pandas2ri

from joblib import Parallel,delayed

In [3]:
def parallel_unit(i):
    """Run the matching benchmarks on simulated dataset ``i`` and save them.

    Loads ``./save/simu_data/d<i>.npy``, fits a logistic model of ``z`` on
    the covariates with R's ``stats::glm`` and, for each of the outcomes
    ``y1``, ``y2`` and ``y3``, calls R's ``Matching::Match`` (``estimand='ATE'``,
    ``M=1``) twice: once matching on the fitted values of that model and once
    matching on the raw covariates ``x``.  The point estimate and a normal
    95% interval for every fit are written to
    ``./save/benchmarks/<i>/match.csv``.  If that file already exists the
    dataset is skipped, so the function can be re-run safely.

    Parameters
    ----------
    i : int
        Index of the simulated dataset; selects both the input file name and
        the output sub-folder.

    Returns
    -------
    None
        Results are persisted to disk only.
    """
    # Enable the numpy/pandas <-> R conversions for this call.
    numpy2ri.activate()
    pandas2ri.activate()

    # `stats` is a base R package: it ships with R and cannot be installed
    # from CRAN, so there is no meaningful install fallback for it.
    stats = importr("stats")

    try:
        matching = importr("Matching")
    except Exception:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate.
        # NOTE(review): when many workers hit this branch at once they all
        # install into the same R library; prefer installing `Matching`
        # once before launching the parallel run.
        utils = importr('utils')
        utils.install_packages('Matching', repos='http://cran.us.r-project.org')
        matching = importr("Matching")

    # Normal quantile for the two-sided 95% interval; constant, so compute once.
    z_crit = stats.qnorm(0.975)

    def match_wrapper(y, z, X):
        """Return ``(tauhat, lower, upper)`` from ``Matching::Match``."""
        kwargs = {'Y': y,
                  'Tr': z,
                  'X': X,
                  'estimand': 'ATE',
                  'M': 1}
        rr = matching.Match(**kwargs)
        # Positional access into the returned R list: per the Matching
        # documentation the first two components are `est` and `se`.
        tauhat = rr[0]
        se = rr[1]
        lb = tauhat - z_crit * se
        ub = tauhat + z_crit * se
        return tauhat.item(), lb.item(), ub.item()

    # data path
    data_path = './save/simu_data/'

    # load the data
    data_full_path = os.path.join(data_path, 'd' + str(i) + '.npy')
    # NOTE(review): allow_pickle=True executes pickled code on load; only use
    # with files produced by this project.
    dat = np.load(data_full_path, allow_pickle=True)

    # unzip the data (the file stores a single pickled dict)
    payload = dat.item()
    x = payload['x']
    z = payload['z']
    y1 = payload['y1']
    y2 = payload['y2']
    y3 = payload['y3']

    save_folder = os.path.join('./save/benchmarks', str(i))
    result_path = os.path.join(save_folder, 'match.csv')

    if not os.path.exists(save_folder):
        print('Creat the folder.')
        # exist_ok guards against the folder appearing between the check
        # above and this call.
        os.makedirs(save_folder, exist_ok=True)

    if os.path.exists(result_path):
        print('Skip! Matching has been performed.')
        return

    print('Perform Matching.')
    # fit logistic model for propensity scores: z ~ x1 + x2 + ...
    covariate_names = ['x' + str(j + 1) for j in range(x.shape[1])]
    columns = covariate_names + ['z']
    formula = 'z' + '~' + '+'.join(covariate_names)

    kwargs = {"formula": Formula(formula),
              "family": stats.binomial,
              "data": pd.DataFrame(np.hstack([x, z.reshape(-1, 1)]),
                                   columns=columns)}

    glm_ps = stats.glm(**kwargs)
    # Third component of the R glm object (`fitted.values` in R's glm
    # result), accessed positionally.
    ps_score = glm_ps[2]

    # One (tauhat, lb, ub) row per outcome, in the order y1, y2, y3.
    tau_match_ps = []
    tau_match_x = []
    for y in [y1, y2, y3]:
        tau_match_ps.append(match_wrapper(y, z, ps_score))
        tau_match_x.append(match_wrapper(y, z, x))

    result_columns = ['tauhat', '95CI_lb', '95CI_ub']

    df_match_ps = pd.DataFrame(np.array(tau_match_ps), columns=result_columns)
    df_match_ps['method'] = 'Match-PS'

    df_match_x = pd.DataFrame(np.array(tau_match_x), columns=result_columns)
    df_match_x['method'] = 'Match-X'

    df_match = pd.concat([df_match_ps, df_match_x], axis=0)
    df_match.to_csv(result_path, index=False)
        

In [4]:
# Number of parallel workers handed to joblib.Parallel as n_jobs.
n_kernel = 20
# Number of simulated datasets to process (indices 0 .. n_data - 1).
n_data = 1_000

In [5]:
# Build the worker pool first, then a lazy stream of one task per dataset
# index; tqdm advances as tasks are pulled from the generator for dispatch.
runner = Parallel(n_jobs=n_kernel)
jobs = (delayed(parallel_unit)(i=idx) for idx in tqdm(range(n_data)))
runner(jobs)

  4%|▍         | 45/1000 [00:13<05:18,  3.00it/s]

NameError: name 'se' is not defined