In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, Lasso, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
import doubleml as dml

#-------------------------------------------
# Load and preprocess data
#-------------------------------------------
census = pd.read_csv("census2000.csv")
# Strip one leading space from any column name that has one, so the columns
# can be addressed by their plain names below.
census.rename(columns=lambda c: c[1:] if c.startswith(' ') else c, inplace=True)

# The outcome is log(income / hours), which is only finite when both income
# and hours are strictly positive. Keep only those rows so Y never contains
# inf, -inf or NaN (rows with missing income/hours are dropped as well).
census = census[(census['income'] > 0) & (census['hours'] > 0)].copy()

# Treatment indicator: T = 1 if sex == 'M', 0 otherwise
census['T'] = (census['sex'] == 'M').astype(int)

# Outcome: log wage = log(income/hours)
census['wage'] = census['income'] / census['hours']
census['Y'] = np.log(census['wage'])

# Features: age, marital, race, education (categoricals converted to dummies).
# dtype=float keeps every column numeric: with the pandas 2.x default (bool
# dummies) next to a numeric age column, X.values would be an object array.
X = pd.get_dummies(
    census[['age', 'marital', 'race', 'education']],
    drop_first=True,
    dtype=float,
)
Y = census['Y'].values
T = census['T'].values

#-------------------------------------------
# Create DoubleMLData Object
#-------------------------------------------
# DoubleMLData.from_arrays takes the covariates (x), the outcome (y) and the
# treatment (d) as arrays, so hand over the feature frame as a NumPy array.
X_np = X.to_numpy()
obj_dml_data = dml.DoubleMLData.from_arrays(x=X_np, y=Y, d=T)

#-------------------------------------------
# Specify Learners
#-------------------------------------------
# Both nuisance learners are random forests with the same size and seed.
forest_settings = {"n_estimators": 500, "random_state": 42}

# Propensity score model (ml_m): a classifier for the binary treatment
propensity_model = RandomForestClassifier(**forest_settings)

# Outcome model (ml_g): a regressor for the outcome
outcome_model = RandomForestRegressor(**forest_settings)

#-------------------------------------------
# Initialize DoubleMLIRM for Binary Treatment DR Estimation
#-------------------------------------------
# The cross-fitting partition is drawn at random when the model object is
# created. Seed NumPy's global RNG first so the sample split, and therefore
# the printed estimate and standard error, are reproducible across runs.
np.random.seed(42)

dml_irm = dml.DoubleMLIRM(
    obj_dml_data,
    ml_g=outcome_model,
    ml_m=propensity_model,
    n_folds=2,   # number of cross-fitting folds
    score='ATE'  # doubly robust score targeting the average treatment effect
)

# Fit the DoubleML model (nuisance models are fitted with cross-fitting)
dml_irm.fit()

# Extract the estimated treatment effect (ATE) and its standard error;
# there is a single treatment variable, hence index 0.
tau_DR = dml_irm.coef[0]
tau_se = dml_irm.se[0]

print("DoubleML-based Doubly Robust ATE estimate:", tau_DR)
print("Standard Error:", tau_se)

# Optional: try different ML methods for the nuisance functions,
# e.g. logistic regression for the propensity score p(x) and linear regression for the outcome model:
# ml_g_lin = LinearRegression()
# ml_m_logistic = LogisticRegression(max_iter=1000)
# dml_irm_lin = dml.DoubleMLIRM(obj_dml_data, ml_g_lin, ml_m_logistic, n_folds=2)
# dml_irm_lin.fit()
# print("DoubleML with linear/logistic:", dml_irm_lin.coef[0])

DoubleML-based Doubly Robust ATE estimate: 0.12342314601251633
Standard Error: 0.17182105633995212


