In [1]:
import numpy as np
from datagen import *
from benchmarks import *

In [2]:
i = 0

# data path
data_path = './save/simu_data/'

# load the data
data_full_path = data_path + 'd' + str(i) + '.npy'
dat = np.load(data_full_path,allow_pickle=True)

# unzip the data
x = dat.item()['x']
z = dat.item()['z']
y1 = dat.item()['y1']
y2 = dat.item()['y2']
y3 = dat.item()['y3']

In [3]:
y = y1.copy()

In [4]:
def ipw1_wrapper(y,z,x):
    # point estimation
    glm_ps, ps_score = glm_wrapper(x,z)
    tauhat = np.mean(z*y/ps_score)-np.mean((1-z)*y/(1-ps_score))

    # the se calculation follows Lunceford and Davidian (2004)
    H = np.mean((z*y*(1-ps_score)/ps_score-(1-z)*y*ps_score/(1-ps_score)).reshape(-1,1)*x,axis=0)
    ww = ps_score*(1-ps_score)
    E = np.matmul(x.T,ww.reshape(-1,1)*x)/x.shape[0]
    HEinv = np.matmul(H.T,np.linalg.pinv(E))
    xHEinv = np.matmul(x,HEinv)
    se = np.sum((z*y/ps_score-(1-z)*y/(1-ps_score)-tauhat-(z-ps_score)*xHEinv)**2)/(x.shape[0]**2)

    # confidence interval
    qnorm = stats.qnorm(0.975).item()
    lb = tauhat - qnorm*se
    ub = tauhat + qnorm*se
    
    return tauhat, lb, ub

In [5]:
# tau
tau_ipw1 = [] 
for y in [y1,y2,y3]:
    tau_ipw1.append(ipw1_wrapper(y,z,x))
    
df_ipw1 = pd.DataFrame(tau_ipw1,columns=['tauhat','95CI_lb','95CI_ub'])
df_ipw1['method'] = 'IPW1'

#df_ipw1.to_csv(save_folder+'ipw1.csv',index=False)

In [6]:
df_ipw1

Unnamed: 0,tauhat,95CI_lb,95CI_ub,method
0,0.699449,-0.146443,1.54534,IPW1
1,1.051727,-3.715475,5.818929,IPW1
2,3.150288,1.677431,4.623146,IPW1


In [7]:
def dr_wrapper(y,z,x):
    from sklearn.linear_model import LinearRegression
    
    # estimate propensity score
    glm_ps, ps_score = glm_wrapper(x,z)

    # estimate linear model for each treatment group
    lr1 = LinearRegression()
    lr0 = LinearRegression()
    lr1.fit(x[z==1,:],y[z==1])
    lr0.fit(x[z==0,:],y[z==0])

    yhat1 = lr1.predict(x)
    yhat0 = lr0.predict(x)

    # point estimator
    tauhat = np.mean((z*y-(z-ps_score)*yhat1)/ps_score)-np.mean(((1-z)*y+(z-ps_score)*yhat0)/(1-ps_score))

    # compute the se (following Gutman & Rubin SMMR 2015)
    I = (z*y-(z-ps_score)*yhat1)/ps_score-((1-z)*y+(z-ps_score)*yhat0)/(1-ps_score)-tauhat
    se = np.sum(I**2)/(I.shape[0]**2)

    # confidence interval
    qnorm = stats.qnorm(0.975).item()
    lb = tauhat - qnorm*se
    ub = tauhat + qnorm*se
    
    return tauhat, lb, ub

In [8]:
# tau
tau_dr = [] 
for y in [y1,y2,y3]:
    tau_dr.append(dr_wrapper(y,z,x))
    
df_dr = pd.DataFrame(tau_dr,columns=['tauhat','95CI_lb','95CI_ub'])
df_dr['method'] = 'DR'


In [9]:
df_dr

Unnamed: 0,tauhat,95CI_lb,95CI_ub,method
0,0.848572,0.824934,0.872211,DR
1,1.307945,-0.773381,3.38927,DR
2,3.351744,1.250765,5.452724,DR


In [10]:
# M-N-m
# fit logistic model for propensity scores
_, ps_score = glm_wrapper(x,z)

# tau
tau_match_mnm = [] 
for y in [y1,y2,y3]:
    tau_match_mnm.append(match_wrapper(y,z,ps_score,None,False))
tau_match_mnm = np.array(tau_match_mnm)

df_match_mnm = pd.DataFrame(tau_match_mnm,columns=['tauhat','95CI_lb','95CI_ub'])
df_match_mnm['method'] = 'M-N-m'

In [11]:
df_match_mnm

Unnamed: 0,tauhat,95CI_lb,95CI_ub,method
0,0.810902,-0.237457,1.859261,M-N-m
1,0.847664,-1.28467,2.979998,M-N-m
2,2.630363,0.370855,4.889871,M-N-m


In [12]:
# M-C-m
# fit logistic model for propensity scores
_, ps_score = glm_wrapper(x,z)

# tau
tau_match_mcm = [] 
for y in [y1,y2,y3]:
    tau_match_mcm.append(match_wrapper(y,z,ps_score,x,True))
tau_match_mcm = np.array(tau_match_mcm)

df_match_mcm = pd.DataFrame(tau_match_mcm,columns=['tauhat','95CI_lb','95CI_ub'])
df_match_mcm['method'] = 'M-C-m'

In [13]:
df_match_mcm

Unnamed: 0,tauhat,95CI_lb,95CI_ub,method
0,0.839503,0.61628,1.062727,M-C-m
1,0.598294,-0.877749,2.074336,M-C-m
2,2.589516,0.790336,4.388696,M-C-m
