# Double/Debiased Machine Learning: 401k Data

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.linalg import toeplitz

from sklearn.model_selection import KFold
from sklearn.base import clone

from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

import doubleml as dml
from doubleml.datasets import fetch_bonus

In [None]:
# Set a wide default figure size (14 x 6), then apply seaborn's defaults.
# The order is kept as in the original: rcParams first, sns.set() second.
plt.rcParams['figure.figsize'] = (14, 6)
sns.set()

In [None]:
# Fetch the 401(k) data through doubleml's dataset helper and preview the
# first five rows (the bare expression is what the notebook displays).
raw_data = dml.datasets.fetch_401K()
raw_data.head(n=5)

In [None]:
# Column roles, following the y / d / x naming used by doubleml:
# covariates, treatment column(s), and the outcome column.
x_cols = [
    'age', 'inc', 'educ', 'fsize', 'marr',
    'twoearn', 'db', 'pira', 'hown',
]
d_cols = ['e401']
y_col = 'net_tfa'

# Bundle the frame and the column roles into a doubleml data object.
# The bare name on the last line displays it in the notebook.
dml_data = dml.DoubleMLData(raw_data, y_col, d_cols, x_cols)
dml_data

In [None]:
# Machine learning methods for m & g: one shallow random forest template,
# cloned once per role so 'ml_m' and 'ml_g' get separate unfitted estimators.
learner = RandomForestRegressor(n_estimators=100, max_depth=2)
ml_learners = {role: clone(learner) for role in ('ml_m', 'ml_g')}

# Number of folds and number of repeated cross-fits handed to the model.
n_folds, n_rep_cross_fit = 2, 100

# All arguments are passed positionally. The two trailing strings are
# presumably the score/inference variant and the DML algorithm -- confirm
# against the signature of the installed doubleml version.
dml_plr_obj_rf = dml.DoubleMLPLR(
    dml_data, ml_learners, n_folds, n_rep_cross_fit, 'IV-type', 'dml1',
)

In [None]:
# Fit the random-forest PLR object, then read its `summary` attribute.
# As the last expression in the cell, the summary is what the notebook shows.
dml_plr_obj_rf.fit()
dml_plr_obj_rf.summary

In [None]:
# Expand the covariates to all degree-2 polynomial terms (squares and pairwise
# interactions, no constant column) to give the Lasso a richer feature set.
poly = PolynomialFeatures(2, include_bias=False)
data_transf = poly.fit_transform(raw_data[x_cols])

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when present and fall back on older installs.
if hasattr(poly, 'get_feature_names_out'):
    # The replacement returns an ndarray; convert to a plain list of str.
    x_cols_lasso = list(poly.get_feature_names_out(x_cols))
else:
    x_cols_lasso = poly.get_feature_names(x_cols)

# Reuse raw_data's index so the column-wise concat below pairs rows one-to-one
# even if raw_data does not carry the default 0..n-1 index.
data_transf = pd.DataFrame(data_transf, columns=x_cols_lasso,
                           index=raw_data.index)
data_transf = pd.concat((raw_data[[y_col] + d_cols], data_transf),
                        axis=1, sort=False)

# Wrap outcome, treatment and the expanded covariates in a doubleml data
# object. The bare name on the last line displays it in the notebook.
dml_data_lasso = dml.DoubleMLData(data_transf, y_col, d_cols, x_cols_lasso)
dml_data_lasso

In [None]:
# Machine learning methods for m & g: one Lasso template (alpha=0.1),
# cloned once per role so 'ml_m' and 'ml_g' get separate unfitted estimators.
learner = Lasso(alpha=0.1)
ml_learners = {role: clone(learner) for role in ('ml_m', 'ml_g')}

# Number of folds and number of repeated cross-fits handed to the model.
n_folds, n_rep_cross_fit = 2, 100

# All arguments are passed positionally. The two trailing strings are
# presumably the score/inference variant and the DML algorithm -- confirm
# against the signature of the installed doubleml version.
dml_plr_obj_lasso = dml.DoubleMLPLR(
    dml_data_lasso, ml_learners, n_folds, n_rep_cross_fit, 'DML2018', 'dml2',
)

In [None]:
# Fit the Lasso PLR object, then read its `summary` attribute.
# As the last expression in the cell, the summary is what the notebook shows.
dml_plr_obj_lasso.fit()
dml_plr_obj_lasso.summary

In [None]:
# Machine learning methods for m & g: a classifier under 'ml_m' and a
# regressor under 'ml_g', both shallow random forests with 100 trees.
ml_learners = dict(
    ml_m=RandomForestClassifier(n_estimators=100, max_depth=2),
    ml_g=RandomForestRegressor(n_estimators=100, max_depth=2),
)

# Number of folds and number of repeated cross-fits handed to the model.
n_folds, n_rep_cross_fit = 2, 100

# All arguments are passed positionally. The two trailing strings are
# presumably the target parameter / score ('ATE') and the DML algorithm --
# confirm against the signature of the installed doubleml version.
dml_irm_obj = dml.DoubleMLIRM(
    dml_data, ml_learners, n_folds, n_rep_cross_fit, 'ATE', 'dml2',
)

In [None]:
# Fit the IRM object, then read its `summary` attribute.
# As the last expression in the cell, the summary is what the notebook shows.
dml_irm_obj.fit()
dml_irm_obj.summary