In [1]:
import Pkg
using Pkg

# Install required packages
Pkg.add("CSV")
Pkg.add("DataFrames")
Pkg.add("StatsModels")
Pkg.add("GLM")
Pkg.add("Random")
Pkg.add("MLDataUtils")
Pkg.add("MLBase")
Pkg.add("FixedEffectModels")
Pkg.add("Lasso")
Pkg.add("MLJ")
Pkg.add("DecisionTree")
Pkg.add("RData")
Pkg.add("GLMNet")
Pkg.add("PrettyTables")
Pkg.add("MLJScikitLearnInterface")
Pkg.add("MLJFlux")
Pkg.add("Flux")

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m DataAPI ───────────────────── v1.16.0
[32m[1m   Installed[22m[39m TableTraits ───────────────── v1.0.1
[32m[1m   Installed[22m[39m SentinelArrays ────────────── v1.4.8
[32m[1m   Installed[22m[39m IteratorInterfaceExtensions ─ v1.0.0
[32m[1m   Installed[22m[39m PooledArrays ──────────────── v1.4.3
[32m[1m   Installed[22m[39m InlineStrings ─────────────── v1.4.5
[32m[1m   Installed[22m[39m Tables ────────────────────── v1.12.1
[32m[1m   Installed[22m[39m CSV ───────────────────────── v0.10.15
[32m[1m   Installed[22m[39m DataValueInterfaces ───────── v1.0.0
[32m[1m   Installed[22m[39m WorkerUtilities ───────────── v1.6.1
[32m[1m   Installed[22m[39m OrderedCollections ────────── v1.8.1
[32m[1m   Installed[22m[39m FilePathsBase ─────────────── v0.9.24
[32m[1m   Installed[22m[39m Weak

In [12]:
using CSV, DataFrames, StatsModels, GLM, Random, RData, MLBase, MLJ, PrettyTables, FixedEffectModels
using MLDataUtils, FixedEffectModels, DecisionTree, Lasso, GLMNet, MLJScikitLearnInterface

In [16]:
using Downloads, CSV, DataFrames

# URL de tu dataset
url = "https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat"
Downloads.download(url, "penn_jae.dat")
# Leer el archivo (delimitado por espacios)
data = CSV.read("penn_jae.dat", DataFrame; delim=' ', ignorerepeated=true)

# Mostrar información básica
println("Número de filas: ", size(data, 1))
println("Número de columnas: ", size(data, 2))
println("Nombres de columnas:")
println(names(data))

first(data, 5)
  

Número de filas: 13913
Número de columnas: 23
Nombres de columnas:
["abdt", "tg", "inuidur1", "inuidur2", "female", "black", "hispanic", "othrace", "dep", "q1", "q2", "q3", "q4", "q5", "q6", "recall", "agelt35", "agegt54", "durable", "nondurable", "lusd", "husd", "muld"]


Row,abdt,tg,inuidur1,inuidur2,female,black,hispanic,othrace,dep,q1,q2,q3,q4,q5,q6,recall,agelt35,agegt54,durable,nondurable,lusd,husd,muld
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,10824,0,18,18,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,1,0
2,10635,2,7,3,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0
3,10551,5,18,6,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0
4,10824,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
5,10747,0,27,27,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [None]:
# Cleaning and set-up
filter!(row -> row.tg == 0 || row.tg == 4, data)
data.T4 = [row.tg == 4 ? 1 : 0 for row in eachrow(data)]
filter!(row -> row.inuidur1 > 0, data)
data.y = log.(data.inuidur1)
data.dep_0 = [row.dep == 0 ? 1 : 0 for row in eachrow(data)]
data.dep_1 = [row.dep == 1 ? 1 : 0 for row in eachrow(data)]
data.dep_2 = [row.dep == 2 ? 1 : 0 for row in eachrow(data)]

xvars = [
    :female, :black, :othrace,
    :dep_1, :dep_2,
    :q2, :q3, :q4, :q5, :q6,
    :recall, :agelt35, :agegt54,
    :durable, :nondurable, :lusd, :husd
]

# Definir Y, D y X para la estimación
y = data.y
d = data.T4
x = data[:, xvars]

println("\nVariables listas para DML:")
println("Outcome (Y): log(inuidur1)")
println("Treatment (D): T4 (tg==4)")
println("Número de controles (X): ", length(xvars))
println("Dimensiones de X: ", size(x))

# Vista rápida de los datos
first(data, 5)


Variables listas para DML:
Outcome (Y): log(inuidur1)
Treatment (D): T4 (tg==4)
Número de controles (X): 17
Dimensiones de X: (5099, 17)


Row,abdt,tg,inuidur1,inuidur2,female,black,hispanic,othrace,dep,q1,q2,q3,q4,q5,q6,recall,agelt35,agegt54,durable,nondurable,lusd,husd,muld,T4,y,dep_0,dep_1,dep_2
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Float64,Int64,Int64,Int64
1,10824,0,18,18,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2.89037,0,0,1
2,10824,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0.0,1,0,0
3,10747,0,27,27,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,3.29584,1,0,0
4,10607,4,9,9,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,2.19722,1,0,0
5,10831,0,27,27,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,3.29584,0,1,0


In [9]:
function training_sample_append(cv_split, test_sample_index)
        training_indices = []
        for vector in cv_split[Not(test_sample_index)]
                training_indices = [training_indices; vector]
        end
        return training_indices, cv_split[test_sample_index]
end

function dml(x, d, y, dreg, yreg, nfold)
        n = length(y)
        cv = [partition(eachindex(y), fill(1/nfold, nfold-1)..., shuffle = true, rng = 1234)...]
        machine_y = machine(yreg, x, y, scitype_check_level=0)
        machine_d = machine(dreg, x, d, scitype_check_level=0)
        y_hat = zeros(n)
        d_hat = zeros(n)

        for fold in 1:nfold
                training_fold, test_fold = training_sample_append(cv, fold)
                y_hat[test_fold] = MLJ.predict(MLJ.fit!(machine_y, rows = training_fold), x[test_fold, :])
                d_hat[test_fold] = MLJ.predict(MLJ.fit!(machine_d, rows = training_fold), x[test_fold, :])
        end

        resy = y .- y_hat
        resd = reshape(d .- d_hat, (n, 1))
        estimate = lm(resd, resy)
        coef_est = GLM.coef(estimate)[1]
        se = GLM.coeftable(estimate).cols[2][1]
        println(" coef (se) = ", coef_est ,"(",se,")")
        return coef_est, se, resy, resd;
end

function summarize(point, stderr, resy, resd, name)
        return DataFrame(
                model = [name],
                estimate = [point], stderr = [stderr], 
                rmse_y = [sqrt(mean(resy .^ 2))], 
                rmse_d = [sqrt(mean(resd .^ 2))]
        )
end

summarize (generic function with 1 method)

In [13]:
LinearRegressor = @load LinearRegressor pkg=MLJScikitLearnInterface verbosity=0
dreg = Standardizer() |> LinearRegressor()
yreg = Standardizer() |> LinearRegressor()
result_ols = dml(x, d, y, dreg, yreg, 10)
table_ols = summarize(result_ols..., "OLS")

LassoCVRegressor = @load LassoCVRegressor pkg=MLJScikitLearnInterface verbosity=0
dreg = Standardizer() |> LassoCVRegressor(max_iter=200000)
yreg = Standardizer() |> LassoCVRegressor(max_iter=200000)
results_lasso = dml(x, d, y, dreg, yreg, 10)
table_lasso = summarize(results_lasso..., "LassoCV")

RandomForestRegressor = @load RandomForestRegressor pkg=MLJScikitLearnInterface verbosity=0
dreg = RandomForestRegressor()
yreg = RandomForestRegressor()
results_rf = dml(x, d, y, dreg, yreg, 10)
table_rf = summarize(results_rf..., "RF")

dreg = Standardizer() |> LassoCVRegressor(max_iter=200000)
results_mix = dml(x, d, y, dreg , yreg, 10)
table_mix = summarize(results_mix..., "RF/LassoCV")

NeuralNetworkRegressor = @load NeuralNetworkRegressor pkg=MLJFlux verbosity=0 
nn_model = NeuralNetworkRegressor(
    builder = MLJFlux.MLP(; hidden=(20,20), σ=relu), 
    epochs = 100,
    batch_size = 32,
    optimiser = Flux.ADAM(0.001),
    rng = 1234
)
dreg = Standardizer() |> nn_model
yreg = Standardizer() |> nn_model
results_nn = dml(x, d, y, dreg, yreg, 10)
table_nn = summarize(results_nn..., "NeuralNet")

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:linear_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:linear_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m

 coef (se) = -0.06980700224406336

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:linear_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:linear_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziRe

(0.03522987047951291)


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[

 coef (se) = -0.07297073580257903(0.035281673581847965)


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m

 coef (se) = -0.1022458661062878(0.034858329213297366)


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(RandomForestRegressor(n_estimators = 100, …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[

 coef (se) = -0.09068804606958437(0.03737404519272649)


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:lasso_cv_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[33m[1m└ [22m[39m[90m@ MLJModels ~/.julia/packages/MLJModels/ziReN/src/builtins/Transformers.jl:637[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:neural_network_regressor, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mMLJFlux: converting input data to Float32
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining

 coef (se) = -0.07574458421704361(0.03458495423037937)


Row,model,estimate,stderr,rmse_y,rmse_d
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64
1,NeuralNet,-0.0757446,0.034585,1.21663,0.492455


In [14]:
pretty_table([table_ols; table_lasso; table_rf; table_mix; table_nn])

┌────────────┬────────────┬───────────┬─────────┬──────────┐
│[1m      model [0m│[1m   estimate [0m│[1m    stderr [0m│[1m  rmse_y [0m│[1m   rmse_d [0m│
│[90m     String [0m│[90m    Float64 [0m│[90m   Float64 [0m│[90m Float64 [0m│[90m  Float64 [0m│
├────────────┼────────────┼───────────┼─────────┼──────────┤
│        OLS │  -0.069807 │ 0.0352299 │ 1.19593 │ 0.475255 │
│    LassoCV │ -0.0729707 │ 0.0352817 │ 1.19601 │ 0.474574 │
│         RF │  -0.102246 │ 0.0348583 │ 1.26704 │ 0.508647 │
│ RF/LassoCV │  -0.090688 │  0.037374 │ 1.26714 │ 0.474574 │
│  NeuralNet │ -0.0757446 │  0.034585 │ 1.21663 │ 0.492455 │
└────────────┴────────────┴───────────┴─────────┴──────────┘


In [None]:
vcat(table_ols, table_lasso, table_rf, table_mix, table_nn)

Row,model,estimate,stderr,rmse_y,rmse_d
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64
1,OLS,-0.069807,0.0352299,1.19593,0.475255
2,LassoCV,-0.0729707,0.0352817,1.19601,0.474574
3,RF,-0.102246,0.0348583,1.26704,0.508647
4,RF/LassoCV,-0.090688,0.037374,1.26714,0.474574
5,NeuralNet,-0.0757446,0.034585,1.21663,0.492455
