# Double/Debiased Machine Learning for the Partially Linear Regression Model

This is a simple implementation of Debiased Machine Learning for the Partially Linear Regression Model, which provides an application of DML inference to determine the causal effect of countries' initial wealth on the rate of economic growth.


Reference:

- https://arxiv.org/abs/1608.00060
- https://www.amazon.com/Business-Data-Science-Combining-Accelerate/dp/1260452778

The code is based on the book.

In [None]:
# Import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV, LinearRegression, Ridge, Lasso, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import patsy
import warnings
from sklearn.base import BaseEstimator, clone
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Silence all warnings for the rest of the session
warnings.simplefilter('ignore')

# Seed NumPy's global random number generator
np.random.seed(123)

In [None]:
# Location of the growth data set (CSV)
file = "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/GrowthData.csv"
# Read the CSV and keep only the columns whose name does not match '^Unnamed'
# (this gets rid of the index column)
data = pd.read_csv(file).loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
data.shape

In [None]:
# Preview the first rows of the data
data.head()

Construct the outcome variable, the treatment variable and the matrix $x$ that includes the control variables.

In [None]:
# Outcome (y) and treatment (d) are single columns of the data
y, d = data["Outcome"], data["gdpsh465"]
# Controls: every column except the treatment, the intercept and the outcome
x = data.loc[:, ~data.columns.isin(['gdpsh465','intercept','Outcome'])]

In [None]:
# some summary statistics
print("The length of y is: ", y.shape[0])
print("The number of features in x is: ", x.shape[1])
# naive OLS
# NOTE(review): iloc[:, 2:] assumes the first two columns of `data` are the
# outcome and the intercept, so the regressors start at the treatment — confirm.
all_columns = "+".join(data.iloc[:,2:].columns)
my_formula = "Outcome ~ " + all_columns
ols_naive = smf.ols(formula = my_formula, data = data).fit()
# Look the treatment coefficient up by its label rather than with the integer
# key 1: integer keys on a label-indexed Series rely on a positional fallback
# that pandas has deprecated.
print("Naive OLS that uses all features w/o cross-fitting Y ~ D+X yields: ", ols_naive.params["gdpsh465"], "(",ols_naive.bse["gdpsh465"],")")


## DML algorithm

Here we perform estimation and inference of the predictive coefficient $\alpha$ in the partially linear statistical model,
$$
Y = D\alpha + g(X) + U, \quad E (U | D, X) = 0.
$$
For $\tilde Y = Y- E(Y|X)$ and $\tilde D= D- E(D|X)$, we can write
$$
\tilde Y = \alpha \tilde D + U, \quad E (U |\tilde D) =0.
$$
Parameter $\alpha$ is then estimated using a cross-fitting approach to obtain the residuals $\tilde D$ and $\tilde Y$.
The algorithm consumes $Y, D, X$, and machine learning methods for learning the residuals $\tilde Y$ and $\tilde D$, where
the residuals are obtained by cross-validation (cross-fitting).

The statistical parameter $\alpha$ has a causal interpretation of being the effect of $D$ on $Y$ in the causal DAG $$ D\to Y, \quad X\to (D,Y)$$ or the counterfactual outcome model with conditionally exogenous (conditionally random) assignment of treatment $D$ given $X$:
$$
Y(d) = d\alpha + g(X) + U(d),\quad  U(d) \text{ indep } D |X, \quad Y = Y(D), \quad U = U(D).
$$


In [None]:
def dml(X, D, y, modely, modeld, *, nfolds, classifier=False, time = None, clu = None, cluster = True):
    '''
    DML for the Partially Linear Model setting with cross-fitting

    Input
    -----
    X: the controls
    D: the treatment
    y: the outcome
    modely: the ML model for predicting the outcome y
    modeld: the ML model for predicting the treatment D
    nfolds: the number of folds in cross-fitting
    classifier: bool, whether the modeld is a classifier or a regressor

    time: array of time indices, eg [0,1,...,T-1,0,1,...,T-1,...,0,1,...,T-1]
          (accepted for interface compatibility; it does not enter the
          final-stage regression)
    clu: array of cluster indices, eg [1073, 1073, 1073, ..., 5055, 5055, 5055, 5055];
         one label per observation, required when cluster is True
    cluster: bool, whether to use clustered standard errors

    Output
    ------
    point: the point estimate of the treatment effect of D on y
    stderr: the standard error of the treatment effect
    yhat: the cross-fitted predictions for the outcome y
    Dhat: the cross-fitted predictions for the treatment D
    resy: the outcome residuals
    resD: the treatment residuals
    epsilon: the final residual-on-residual OLS regression residual

    Raises
    ------
    ValueError: if cluster is True but clu is missing, or if clu does not
                provide exactly one label per observation
    '''
    # Fail early with a clear message instead of an obscure lookup error later
    if cluster and clu is None:
        raise ValueError("cluster=True requires cluster labels in `clu`; "
                         "pass cluster=False for non-clustered standard errors")

    cv = KFold(n_splits=nfolds, shuffle=True, random_state=123) # shuffled k-folds
    yhat = cross_val_predict(modely, X, y, cv=cv, n_jobs=-1) # out-of-fold predictions for y
    # out-of-fold predictions for D
    # use predict or predict_proba dependent on classifier or regressor for D
    if classifier:
        Dhat = cross_val_predict(modeld, X, D, cv=cv, method='predict_proba', n_jobs=-1)[:, 1]
    else:
        Dhat = cross_val_predict(modeld, X, D, cv=cv, n_jobs=-1)
    # calculate outcome and treatment residuals
    resy = y - yhat
    resD = D - Dhat

    # final stage: OLS of the outcome residuals on the treatment residuals
    dml_data = pd.DataFrame({'resy': resy, 'resD': resD})
    final_stage = smf.ols(formula = 'resy ~ 1 + resD', data = dml_data)

    if cluster:
        # Encode the cluster labels as integer codes so that any label type
        # (ints, strings, a named or unnamed Series, ...) can be used, without
        # depending on a particular column name.
        groups = pd.factorize(np.asarray(clu).ravel())[0]
        if len(groups) != len(dml_data):
            raise ValueError("`clu` must contain exactly one cluster label per observation")
        # clustered standard errors
        ols_mod = final_stage.fit(cov_type='cluster', cov_kwds={"groups": groups})
    else:
        # regular ols
        ols_mod = final_stage.fit()

    # Select the slope by label; integer keys on a label-indexed Series rely
    # on a positional fallback that pandas has deprecated.
    point = ols_mod.params['resD']
    stderr = ols_mod.bse['resD']
    epsilon = ols_mod.resid

    return point, stderr, yhat, Dhat, resy, resD, epsilon

In [None]:
def summary(point, stderr, yhat, Dhat, resy, resD, epsilon, X, D, y, *, name):
    '''
    Build a one-row table summarizing the output of the DML function.

    Only point, stderr, resy and resD are used; the remaining positional
    arguments are accepted so that the DML result tuple can be unpacked
    straight into this function.

    Output
    ------
    A DataFrame indexed by `name` with the columns
    'estimate' (point estimate), 'stderr' (standard error),
    'rmse y' (RMSE of the model that predicts the outcome y) and
    'rmse D' (RMSE of the model that predicts the treatment D).
    '''
    def _rmse(residuals):
        # root mean squared error of a residual vector
        return np.sqrt(np.mean(residuals**2))

    row = {
        'estimate': point,
        'stderr': stderr,
        'rmse y': _rmse(resy),
        'rmse D': _rmse(resD),
    }
    return pd.DataFrame(row, index=[name])

We now run through DML using as first stage models:
 1. OLS
 2. (Rigorous) Lasso
 3. Random Forests
 4. Mix of Random Forest and Lasso

Run the following command to install hdmpy for rigorous lasso:

In [None]:
# Install the `multiprocess` package
# NOTE(review): presumably a dependency of hdmpy — confirm
!pip install multiprocess


# Clone the hdmpy repository; the next cell imports `hdmpy`
!git clone https://github.com/maxhuppertz/hdmpy.git

In [None]:
import hdmpy
from sklearn.base import BaseEstimator, clone

class RLasso(BaseEstimator):
    '''
    Estimator wrapper exposing hdmpy.rlasso through fit/predict methods.

    post: forwarded unchanged to hdmpy.rlasso as its `post` argument
          (NOTE(review): presumably toggles post-Lasso refitting — see hdmpy)
    '''

    def __init__(self, *, post=True):
        self.post = post

    def fit(self, X, y):
        '''Run hdmpy.rlasso on (X, y), keep the result in rlasso_ and return self.'''
        self.rlasso_ = hdmpy.rlasso(X, y, post=self.post)
        return self

    def predict(self, X):
        '''Return the linear prediction X @ beta + intercept from the fitted rlasso.'''
        estimates = self.rlasso_.est
        coefficients = np.array(estimates['beta']).flatten()
        intercept = np.array(estimates['intercept'])
        features = np.array(X)
        return features @ coefficients + intercept

def lasso_model():
    '''Return a new, unfitted RLasso estimator created with post=False.'''
    return RLasso(post=False)

In [None]:
def _scaled(estimator):
    # Put a StandardScaler in front of the given first-stage learner
    return make_pipeline(StandardScaler(), estimator)


def _forest():
    # Random forest configuration shared by the RF and Mix runs
    return RandomForestRegressor(n_estimators=100, min_samples_leaf=5, random_state=123)


# Every run below uses 10-fold cross-fitting, a regression model for the
# treatment and non-clustered standard errors in the final stage.

# DML with OLS:
modely = _scaled(LinearRegression())
modeld = _scaled(LinearRegression())
result_OLS = dml(x, d, y, modely, modeld, nfolds=10, classifier=False, cluster=False)
table_OLS = summary(*result_OLS, x, d, y, name='OLS')

# DML with RLasso:
modely = _scaled(RLasso(post=False))
modeld = _scaled(RLasso(post=False))
result_RLasso = dml(x, d, y, modely, modeld, nfolds=10, classifier=False, cluster=False)
table_RLasso = summary(*result_RLasso, x, d, y, name='Lasso')

# DML with Random Forests:
modely = _scaled(_forest())
modeld = _scaled(_forest())
result_RF = dml(x, d, y, modely, modeld, nfolds=10, classifier=False, cluster=False)
table_RF = summary(*result_RF, x, d, y, name='RF')

# DML with Mix (random forest for the outcome, RLasso for the treatment):
modely = _scaled(_forest())
modeld = _scaled(RLasso(post=False))
result_mix = dml(x, d, y, modely, modeld, nfolds=10, classifier=False, cluster=False)
table_mix = summary(*result_mix, x, d, y, name='RF/Lasso Mix')

In [None]:
# Stack the four one-row summaries into a single comparison table
table = pd.concat(
    [table_OLS, table_RLasso, table_RF, table_mix],
    axis=0,
)

In [None]:
# Print the comparison table of the DML runs
print(table)