# Use the IRM with weights

In [1]:
import doubleml as dml
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
def group_effect(x):
    """
    Piecewise-constant treatment effect determined by the first two covariates.

    Parameters
    ----------
    x : sequence of float
        Covariate row; only ``x[0]`` and ``x[1]`` are read.

    Returns
    -------
    te : float or int
        ``2.5`` if ``x[0] <= -0.3``; ``1.5`` if ``x[0] >= 0.2`` and
        ``x[1] >= 0.4``; ``-2`` otherwise.
    """
    first = x[0]
    if first <= -0.3:
        return 2.5
    # The second covariate only matters once the first group is ruled out.
    second = x[1]
    if first >= 0.2 and second >= 0.4:
        return 1.5
    return -2

In [3]:
def create_synthetic_group_data(n_samples=200, n_w=10, support_size=5):
    """
    Creates a simple synthetic example for group effects.

    The treatment effect of each observation is piecewise constant and is
    obtained by applying ``group_effect`` to its covariate row. All random
    draws use the global ``np.random`` state, so seed it beforehand to get
    reproducible data.

    Parameters
    ----------
    n_samples : int
        Number of samples.
        Default is ``200``.

    n_w : int
        Dimension of covariates.
        Default is ``10``.

    support_size : int
        Number of relevant covariates.
        Default is ``5``.

    Returns
    -------
    data : pd.DataFrame
        A data frame with the outcome ``y``, the binary treatment ``t`` and
        the covariates ``w_1``, ..., ``w_{n_w}``.

    covariates : list of str
        Names of the covariate columns in ``data``.

    """
    def epsilon_sample(n):
        # Outcome noise, uniform on [-1, 1].
        return np.random.uniform(-1, 1, size=n)

    def eta_sample(n):
        # Treatment (log-odds) noise, uniform on [-1, 1].
        return np.random.uniform(-1, 1, size=n)

    # NOTE: the order of the random draws below is kept exactly as before so
    # that a given seed yields the same data set.

    # Outcome support
    support_w = np.random.choice(np.arange(n_w), size=support_size, replace=False)
    coefs_w = np.random.uniform(0, 1, size=support_size)
    # Treatment support
    # Assuming the matrices gamma and beta have the same number of non-zero components
    support_t = np.random.choice(np.arange(n_w), size=support_size, replace=False)
    coefs_t = np.random.uniform(0, 1, size=support_size)

    # Generate controls, covariates, treatments and outcomes
    w = np.random.normal(0, 1, size=(n_samples, n_w))
    # Group treatment effect (one value per row of w)
    te = np.apply_along_axis(group_effect, axis=1, arr=w)
    # Define treatment: Bernoulli draw with a logistic propensity
    log_odds = np.dot(w[:, support_t], coefs_t) + eta_sample(n_samples)
    t_sigmoid = 1 / (1 + np.exp(-log_odds))
    # Drawn one observation at a time, as before, to leave the random stream untouched.
    t = np.array([np.random.binomial(1, p) for p in t_sigmoid])
    # Define the outcome
    y = te * t + np.dot(w[:, support_w], coefs_w) + epsilon_sample(n_samples)

    # Now we build the dataset
    y_df = pd.DataFrame({'y': y})
    t_df = pd.DataFrame({'t': t})
    w_df = pd.DataFrame(
        data=w,
        index=np.arange(w.shape[0]),
        columns=[f'w_{i}' for i in range(1, w.shape[1] + 1)],
    )

    data = pd.concat([y_df, t_df, w_df], axis=1)
    covariates = list(w_df.columns.values)

    return data, covariates

### Generate Data

In [4]:
# DGP constants
# Seed the global NumPy RNG first: the data generator below draws from it.
np.random.seed(42)
n_samples = 500
n_w = 10
support_size = 5

# Create data
# `data` holds the outcome y, the treatment t and the covariates w_1..w_10;
# `covariates` is the list of covariate column names.
data, covariates = create_synthetic_group_data(n_samples=n_samples, n_w=n_w, support_size=support_size)
# Wrap the frame for DoubleML, declaring the outcome, treatment and covariate columns.
data_dml_base = dml.DoubleMLData(data,
                                 y_col='y',
                                 d_cols='t',
                                 x_cols=covariates)

#### Baseline IRM Model without Weights

In [5]:
# Nuisance learners: a random-forest regressor is passed as ml_g and a
# random-forest classifier as ml_m. The estimator classes are imported in the
# notebook's import cell.
randomForest_reg = RandomForestRegressor(n_estimators=200, random_state=42)
randomForest_class = RandomForestClassifier(n_estimators=200, random_state=42)

# Reseed the global NumPy RNG so the model below starts from a fixed state
# (presumably this makes the cross-fitting sample split reproducible).
np.random.seed(42)

# Baseline: IRM without weights, i.e. the plain ATE.
dml_irm_0 = dml.DoubleMLIRM(data_dml_base,
                            ml_g=randomForest_reg,
                            ml_m=randomForest_class,
                            trimming_threshold=0.01,
                            n_folds=5)
print("Training IRM Model")
print(dml_irm_0.fit(store_predictions=True))

Training IRM Model

------------------ Data summary      ------------------
Outcome variable: y
Treatment variable(s): ['t']
Covariates: ['w_1', 'w_2', 'w_3', 'w_4', 'w_5', 'w_6', 'w_7', 'w_8', 'w_9', 'w_10']
Instrument variable(s): None
No. Observations: 500

------------------ Score & algorithm ------------------
Score function: ATE
DML algorithm: dml2

------------------ Machine learner   ------------------
Learner ml_g: RandomForestRegressor(n_estimators=200, random_state=42)
Learner ml_m: RandomForestClassifier(n_estimators=200, random_state=42)
Out-of-sample Performance:
Learner ml_g0 RMSE: [[0.71640709]]
Learner ml_g1 RMSE: [[0.89641921]]
Learner ml_m RMSE: [[0.48132406]]

------------------ Resampling        ------------------
No. folds: 5
No. repeated sample splits: 1
Apply cross-fitting: True

------------------ Fit summary       ------------------
       coef   std err         t     P>|t|     2.5 %    97.5 %
t  0.208387  0.122443  1.701908  0.088773 -0.031597  0.448371


We create group-indicator weights that match the treatment-effect groups defined in the DGP (see `group_effect`).

In [6]:
# 0/1 indicator arrays (same dtype as the outcome column) for the groups
# used in `group_effect`: group 1 is w_1 <= -0.3, group 2 is
# w_1 >= 0.2 and w_2 >= 0.4.
groups_1 = np.zeros_like(data["y"].values)
groups_1[data["w_1"].values <= -0.3] = 1

groups_2 = np.zeros_like(data["y"].values)
groups_2[(data["w_1"].values >= 0.2) & (data["w_2"].values >= 0.4)] = 1

# Indicator of belonging to either group.
groups_3 = groups_1 + groups_2

# The indicators are passed to the model unnormalised; a normalised variant
# would be groups_k / groups_k.sum().

Compare the weighted ATE (group indicator used as weights) to the GATE estimate obtained from the unweighted baseline model.

In [7]:
# Weighted ATE: refit the IRM with the group-1 indicator (w_1 <= -0.3) as weights.
dml_irm = dml.DoubleMLIRM(data_dml_base,
                          ml_g=randomForest_reg,
                          ml_m=randomForest_class,
                          trimming_threshold=0.01,
                          n_folds=5,
                          weights=groups_1)
print("Training IRM Model")
dml_irm.fit(store_predictions=True)
print(dml_irm.summary)
# GATE for the same group, computed from the unweighted baseline fit (dml_irm_0).
print(dml_irm_0.gate(groups=pd.DataFrame(groups_1)).confint())

Training IRM Model
       coef   std err          t         P>|t|    2.5 %    97.5 %
t  2.539948  0.127012  19.997749  5.761453e-89  2.29101  2.788886
     2.5 %    effect    97.5 %
0  2.20386  2.478536  2.753212


In [8]:
# Weighted ATE: refit the IRM with the group-2 indicator
# (w_1 >= 0.2 and w_2 >= 0.4) as weights.
dml_irm = dml.DoubleMLIRM(data_dml_base,
                          ml_g=randomForest_reg,
                          ml_m=randomForest_class,
                          trimming_threshold=0.01,
                          n_folds=5,
                          weights=groups_2)
print("Training IRM Model")
dml_irm.fit(store_predictions=True)
print(dml_irm.summary)
# GATE for the same group, computed from the unweighted baseline fit (dml_irm_0).
print(dml_irm_0.gate(groups=pd.DataFrame(groups_2)).confint())

Training IRM Model
       coef  std err         t         P>|t|     2.5 %    97.5 %
t  1.559241   0.2014  7.741996  9.786780e-15  1.164503  1.953978
      2.5 %    effect   97.5 %
0  1.069601  1.472045  1.87449


In [9]:
# Weighted ATE: refit the IRM with the combined indicator
# (groups_3 = groups_1 + groups_2) as weights.
dml_irm = dml.DoubleMLIRM(data_dml_base,
                          ml_g=randomForest_reg,
                          ml_m=randomForest_class,
                          trimming_threshold=0.01,
                          n_folds=5,
                          weights=groups_3)
print("Training IRM Model")
dml_irm.fit(store_predictions=True)
print(dml_irm.summary)
# GATE for the combined group, computed from the unweighted baseline fit (dml_irm_0).
print(dml_irm_0.gate(groups=pd.DataFrame(groups_3)).confint())

Training IRM Model
       coef   std err          t          P>|t|     2.5 %    97.5 %
t  2.298257  0.107819  21.315881  8.087073e-101  2.086935  2.509578
      2.5 %   effect    97.5 %
0  1.965647  2.20006  2.434474


Compare the weighted ATE (treatment indicator used as weights) to the ATTE estimate.

In [10]:
# ATTE benchmark: same learners and settings as before, but with score="ATTE".
dml_irm_atte = dml.DoubleMLIRM(data_dml_base,
                          ml_g=randomForest_reg,
                          ml_m=randomForest_class,
                          trimming_threshold=0.01,
                          n_folds=5,
                          score="ATTE")
print("Training IRM Model")
dml_irm_atte.fit(store_predictions=True)
print(dml_irm_atte.summary)
# Weighted ATE using the observed treatment column as weights.
# NOTE(review): in the recorded output the two point estimates differ
# (0.230 for the ATTE score vs 0.070 here) — confirm whether array weights
# that depend on the treatment are expected to reproduce the ATTE score.
dml_irm = dml.DoubleMLIRM(data_dml_base,
                          ml_g=randomForest_reg,
                          ml_m=randomForest_class,
                          trimming_threshold=0.01,
                          n_folds=5,
                          weights=data.t.values)
print("Training IRM Model")
dml_irm.fit(store_predictions=True)
print(dml_irm.summary)

Training IRM Model
       coef  std err         t     P>|t|     2.5 %    97.5 %
t  0.230142   0.1477  1.558172  0.119192 -0.059345  0.519629
Training IRM Model
       coef   std err         t     P>|t|     2.5 %    97.5 %
t  0.069694  0.187706  0.371295  0.710418 -0.298202  0.437591
