# Econometrics of Big Data Final Problem Part B
*by Christian Stolborg*

*15-07-2022*

## Data preparation

In [48]:
import warnings

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import doubleml as dml

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier, XGBRegressor

import matplotlib.pyplot as plt
import seaborn as sns


# Silence every warning for the rest of the session. NOTE(review): this is a
# blanket filter, so warnings emitted by the estimators fitted further down
# are hidden as well.
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("./data/401ksubs.csv")

# From DoubleML docs - need to convert to float due to sklearn issue https://github.com/scikit-learn/scikit-learn/issues/21997
# Only columns that actually exist in the file are cast: `DataFrame.astype`
# raises a KeyError when the dtype mapping contains a key that is not a column.
dtypes = df.dtypes
for float_col in ('nifa', 'net_tfa', 'tw', 'inc'):
    if float_col in dtypes.index:
        dtypes[float_col] = 'float64'
df = df.astype(dtypes)

# Control variables used in every model below.
X_cols = "age,inc,fsize,educ,db,marr,twoearn,pira,hown".split(",")
df.head(2)

Unnamed: 0,net_tfa,age,inc,fsize,educ,db,marr,twoearn,e401,p401,pira,hown
0,0,47,6765,2,8,0,0,0,0,0,0,1
1,1015,36,28452,1,16,0,0,0,0,0,0,1


### (i)

The problem of estimating the policy effect of 401(k) eligibility ($D$) on net financial assets ($Y$) with only 9 control variables ($X$) becomes a high-dimensional problem when either $Y$ or $D$ is functionally related to $X$ in a way that is not linear in $X$, for example through higher-order terms, interactions or additional unobservable variables. In this model, enrollment into a 401(k) plan is not random, and it is highly likely that net financial assets are affected by households' heterogeneity in saving preferences. Hence, we should believe that both $Y$ and $D$ are partly determined by other factors such as those present in $X$. However, the relationship between $Y$, $D$ and $X$ could very well be non-linear. Whether this is the case can be explored in a data-driven way with double machine learning methods.

### (ii)



In [33]:
# Baseline: linear regression of net financial assets on the eligibility
# dummy and the controls, with an intercept.
X = sm.add_constant(df[["e401", *X_cols]])
y = df["net_tfa"]

model = sm.OLS(endog=y, exog=X).fit()

# Show only the coefficient table of the regression summary.
model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.291e+04,4276.223,-7.695,0.000,-4.13e+04,-2.45e+04
e401,5896.1984,1250.014,4.717,0.000,3445.917,8346.480
age,624.1455,59.521,10.486,0.000,507.472,740.819
inc,0.9357,0.030,30.982,0.000,0.876,0.995
fsize,-1018.7979,449.859,-2.265,0.024,-1900.614,-136.982
educ,-639.7538,228.499,-2.800,0.005,-1087.659,-191.848
db,-4904.5684,1359.098,-3.609,0.000,-7568.677,-2240.460
marr,743.3445,1795.556,0.414,0.679,-2776.310,4262.999
twoearn,-1.923e+04,1576.431,-12.196,0.000,-2.23e+04,-1.61e+04


Estimating $\text{net\_tfa} = \alpha_0 \, \text{e401} + \beta'X + \epsilon$, I find that $\alpha_0 = 5896$ with a standard error of $1250$. Thus, in the linear model, 401(k) eligibility corresponds to an increase in net financial assets of almost \$6,000.

### (iii)

I now repeat the analysis using double machine learning, allowing for non-linear nuisance functions.

In [37]:
# Data backend for the base specification: outcome, treatment and the raw
# controls. `DoubleMLData` is reached through the `dml` alias because the
# notebook only does `import doubleml as dml`; the bare name is undefined.
dml_data_base = dml.DoubleMLData(df,
                                 y_col='net_tfa',
                                 d_cols='e401',
                                 x_cols=X_cols)

In [90]:
def nl_transform(data, features: list, poly_degrees=None) -> pd.DataFrame:
    """Build a flexible (non-linear) design matrix from ``data``.

    The returned frame holds, in this column order:

    1. the outcome ``net_tfa`` and the treatment ``e401``;
    2. the columns named in ``features``, unchanged;
    3. for every column ``x`` in ``poly_degrees``, the powers
       ``x, x^2, ..., x^degree`` (named ``x``, ``x^2``, ...; stored as float64);
    4. the products ``a_b = a * b`` for every ordered pair of columns from
       2. and 3. that do not originate from the same base variable.

    Because ordered pairs are used, each interaction appears twice (``a_b``
    and ``b_a``).

    Parameters
    ----------
    data : pd.DataFrame
        Source data. Must contain ``net_tfa``, ``e401``, the columns in
        ``features`` and the keys of ``poly_degrees``.
    features : list of str
        Columns that enter without a polynomial expansion. A single column
        name given as a string is accepted as well.
    poly_degrees : dict, optional
        Maps a column name to the highest power to generate. Defaults to
        degree 2 for ``age``, ``inc``, ``educ`` and ``fsize``.

    Returns
    -------
    pd.DataFrame
        The expanded data, indexed like ``data``.

    Raises
    ------
    ValueError
        If a column is listed in both ``features`` and ``poly_degrees``
        (it would appear twice among the main terms).
    """
    if isinstance(features, str):
        features = [features]
    features = list(features)

    if poly_degrees is None:
        poly_degrees = {'age': 2,
                        'inc': 2,
                        'educ': 2,
                        'fsize': 2}

    overlap = [col for col in features if col in poly_degrees]
    if overlap:
        raise ValueError(
            f"Columns listed both as features and as polynomial terms: {overlap}"
        )

    # Main terms: the plain features followed by the polynomial expansions.
    # Every piece carries data's own index so the column-wise concat lines
    # rows up by label even when data does not have a default RangeIndex.
    blocks = [data[features].copy()]
    for key, degree in poly_degrees.items():
        base = data[key].astype('float64')
        powers = {
            (key if power == 1 else f"{key}^{power}"): base ** power
            for power in range(1, degree + 1)
        }
        blocks.append(pd.DataFrame(powers, index=data.index))
    main_terms = pd.concat(blocks, axis=1)

    def _base_name(col: str) -> str:
        # "age^2" -> "age"; names without a power suffix are left untouched.
        return col.split("^", 1)[0]

    # Interaction terms between columns that stem from different variables.
    # They are collected first and attached in one concat instead of being
    # inserted into the frame one column at a time.
    interactions = {}
    for col1 in main_terms.columns:
        for col2 in main_terms.columns:
            if _base_name(col1) != _base_name(col2):
                interactions[col1 + "_" + col2] = main_terms[col1] * main_terms[col2]

    model_data = pd.concat(
        [data[['net_tfa', 'e401']],
         main_terms,
         pd.DataFrame(interactions, index=data.index)],
        axis=1,
    )

    return model_data

# Expand the data set; these five columns are passed through as plain
# features.
model_data = nl_transform(
    df,
    features=['marr', 'twoearn', 'db', 'pira', 'hown'],
)
print(f"Shape of flexible data model: {model_data.shape}")

# Data backend for the flexible specification. No x_cols are given here,
# only the outcome and the treatment column.
dml_data_flex = dml.DoubleMLData(
    model_data,
    y_col='net_tfa',
    d_cols='e401',
)

Shape of flexible data model: (9915, 163)


In [91]:
# Learners: both are l1-penalised models fitted on standardised covariates,
# with 5-fold cross-validation inside the estimator.
Cs = 0.0001*np.logspace(0, 4, 10)

lasso = make_pipeline(
    StandardScaler(),
    LassoCV(cv=5, max_iter=10000),
)
lasso_class = make_pipeline(
    StandardScaler(),
    LogisticRegressionCV(
        Cs=Cs,
        cv=5,
        penalty='l1',
        solver='liblinear',
        max_iter=1000,
    ),
)

# Partially linear regression model on the base data set. NumPy's global
# RNG is seeded first so the run can be repeated.
np.random.seed(123)
dml_plr_lasso = dml.DoubleMLPLR(
    dml_data_base,
    ml_l=lasso,
    ml_m=lasso_class,
    n_folds=3,
)

dml_plr_lasso.fit(store_predictions=True)
dml_plr_lasso.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
e401,5722.476521,1380.619334,4.144862,3.4e-05,3016.512351,8428.440692


In [92]:
# Same lasso learners and seed as before, now on the flexible data set.
np.random.seed(123)
dml_plr_lasso = dml.DoubleMLPLR(
    dml_data_flex,
    ml_l=lasso,
    ml_m=lasso_class,
    n_folds=3,
)
dml_plr_lasso.fit(store_predictions=True)

# Keep the coefficient table under its own name and display it.
lasso_summary = dml_plr_lasso.summary
lasso_summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
e401,9295.923313,1367.652263,6.796993,1.06825e-11,6615.374134,11976.472492
