# 0) Setup e imports

In [1]:
# Packages this notebook depends on. Any that are not found among the
# installed packages are fetched from the CRAN cloud mirror.
pkgs <- c("data.table", "glmnet", "ranger", "nnet", "Matrix", "stats", "utils")
to_install <- pkgs[is.na(match(pkgs, installed.packages()[, 1]))]
if (length(to_install)) {
  install.packages(to_install, repos = "https://cloud.r-project.org")
}

library(data.table)
library(glmnet)
library(ranger)
library(nnet)
library(Matrix)

# Seed the random number generator so the notebook is reproducible
# (run_block re-seeds before each learner it evaluates).
set.seed(12345)
# Keep strings as character when building data frames; this is already the
# default from R 4.0 onwards, so the call only matters on older versions.
options(stringsAsFactors = FALSE)


Loading required package: Matrix

Loaded glmnet 4.1-7



In [9]:
getwd()  # check that the working directory is C:/Users/ARIANA
list.files()  # should list the data file (data_path below points to penn_jae.dat.txt)

In [11]:
# Absolute, machine-specific location of the raw data file; adjust it when
# running the notebook on another computer.
data_path <- "C:/Users/ARIANA/penn_jae.dat.txt"
# Read the file into a data.table using fread's default settings.
DT <- data.table::fread(data_path)

# 1) Cleaning & set-up (robusto sin 'age')

In [17]:
# Normalise column names to lower case so the checks below are case-insensitive.
setnames(DT, old = names(DT), new = tolower(names(DT)))

# Columns that must be present: treatment group, outcome, dependents ...
req_base <- c("tg","inuidur1","dep")
# ... and the core covariates (age dummies are handled separately below).
req_x_core <- c("female","black","othrace","q2","q3","q4","q5","q6",
                "recall","durable","nondurable","lusd","husd")

# Stop early, listing every required column that is missing.
miss <- setdiff(c(req_base, req_x_core), names(DT))
if(length(miss)) stop(paste("Faltan columnas:", paste(miss, collapse=", ")))

# Age handling: use agelt35/agegt54 when both exist; otherwise derive them from 'age'.
has_agelt <- "agelt35" %in% names(DT)
has_agegt <- "agegt54" %in% names(DT)
has_age   <- "age"     %in% names(DT)

# If either dummy is missing, both are (re)built from 'age'; without 'age' we abort.
if(!(has_agelt && has_agegt)) {
  if(has_age) {
    DT[, agelt35 := as.integer(age < 35)]
    DT[, agegt54 := as.integer(age > 54)]
  } else {
    stop("No existen 'agelt35'/'agegt54' ni 'age' para derivarlas.")
  }
}

# Keep only rows with tg == 0 or tg == 4.
DT <- DT[tg %in% c(0,4)]

# Treatment indicator (1 when tg == 4) and outcome (log of inuidur1).
DT[, T4 := as.integer(tg == 4)]
DT[, y  := log(inuidur1)]

# Dummies for 'dep'; dep_0 is the baseline and is left out of x_vars below.
DT[, dep := as.integer(dep)]
DT[, dep_0 := as.integer(dep == 0)]
DT[, dep_1 := as.integer(dep == 1)]
DT[, dep_2 := as.integer(dep == 2)]

# Covariate set X used by every learner.
x_vars <- c("female","black","othrace",
            "dep_1","dep_2",
            "q2","q3","q4","q5","q6",
            "recall","agelt35","agegt54",
            "durable","nondurable","lusd","husd")

# Check that every covariate in x_vars is available.
miss_x <- setdiff(x_vars, names(DT))
if(length(miss_x)) stop(paste("Faltan columnas de X:", paste(miss_x, collapse=", ")))

# Final dataset: outcome, treatment and covariates, dropping rows with any NA.
use_cols <- c("y","T4", x_vars)
DT <- na.omit(DT[, ..use_cols])

# Plain vectors / matrix handed to the estimators.
y <- DT$y
d <- DT$T4
X <- as.matrix(DT[, ..x_vars])
n <- nrow(DT)

# Report the final sample size and number of predictors.
cat(sprintf("Muestra final: %d filas, %d predictores.\n", n, ncol(X)))


Muestra final: 5099 filas, 17 predictores.


# 2) Utilidades

In [19]:
# Root mean squared error between two numeric vectors `a` and `b`.
rmse <- function(a, b) {
  residual <- a - b
  sqrt(mean(residual^2))
}

# Residual-on-residual estimate of theta and its standard error.
#
# y_tilde, d_tilde: residualised outcome and treatment (same length).
# Returns list(theta, se), where theta = sum(d~ * y~) / sum(d~^2) and the
# standard error is built from the score psi = (y~ - d~ * theta) * d~ as
# sqrt(mean(psi^2) / (n * mean(d~^2)^2)).
plm_theta_se <- function(y_tilde, d_tilde) {
  n_obs <- length(y_tilde)
  theta_hat <- sum(d_tilde * y_tilde) / sum(d_tilde * d_tilde)
  score <- d_tilde * (y_tilde - d_tilde * theta_hat)
  second_moment <- mean(d_tilde^2)
  std_err <- sqrt(mean(score^2) / (n_obs * second_moment^2))
  list(theta = theta_hat, se = std_err)
}


# 3) Learners (OLS, Lasso, RF, NN)

In [21]:
library(glmnet)
library(ranger)
library(nnet)

# OLS y LOGIT
# Outcome learner: linear regression of y on every column of X.
fit_y_ols <- function(X, y) {
  train_df <- data.frame(y = y, X)
  lm(y ~ ., data = train_df)
}
# Predictions of the fitted linear model on new covariates X.
pred_y_ols <- function(fit, X) {
  new_df <- data.frame(X)
  predict(fit, newdata = new_df)
}

# Treatment learner: logistic regression of d on every column of X.
fit_d_logit <- function(X, d) {
  train_df <- data.frame(d = d, X)
  glm(d ~ ., data = train_df, family = binomial())
}
# Predicted probabilities (response scale) as a plain numeric vector.
pred_d_logit <- function(fit, X) {
  probs <- predict(fit, newdata = data.frame(X), type = "response")
  as.numeric(probs)
}

# LASSO
# Outcome learner: cross-validated lasso (alpha = 1) with a gaussian family.
fit_y_lasso <- function(X, y) {
  cv.glmnet(x = X, y = y, family = "gaussian", alpha = 1)
}
# Predictions at the penalty selected as "lambda.min", as a numeric vector.
pred_y_lasso <- function(fit, X) {
  fitted_vals <- predict(fit, newx = X, s = "lambda.min")
  as.numeric(fitted_vals)
}

# Treatment learner: cross-validated lasso (alpha = 1) with a binomial family.
fit_d_lasso <- function(X, d) {
  cv.glmnet(x = X, y = d, family = "binomial", alpha = 1)
}
# Predicted probabilities (response scale) at "lambda.min", as a numeric vector.
pred_d_lasso <- function(fit, X) {
  probs <- predict(fit, newx = X, s = "lambda.min", type = "response")
  as.numeric(probs)
}

# Random Forest
# Outcome learner: regression forest with 1000 trees, mtry = floor(sqrt(p)),
# minimum node size 5 and a fixed seed of 1.
fit_y_rf <- function(X, y) {
  train_df <- data.frame(y = y, X)
  ranger(y ~ ., data = train_df,
         num.trees = 1000, mtry = floor(sqrt(ncol(X))), min.node.size = 5, seed = 1)
}
# Forest predictions on new covariates, as a numeric vector.
pred_y_rf <- function(fit, X) {
  forest_pred <- predict(fit, data = data.frame(X))
  as.numeric(forest_pred$predictions)
}

# Treatment learner: probability forest on factor(d) with the same settings.
fit_d_rf <- function(X, d) {
  train_df <- data.frame(d = factor(d), X)
  ranger(d ~ ., data = train_df, probability = TRUE,
         num.trees = 1000, mtry = floor(sqrt(ncol(X))), min.node.size = 5, seed = 1)
}
# Predicted probability of class "1"; a plain vector of predictions is
# passed through unchanged.
pred_d_rf <- function(fit, X) {
  probs <- predict(fit, data = data.frame(X))$predictions
  if (!is.vector(probs)) {
    probs <- probs[, "1"]
  }
  as.numeric(probs)
}

# Neural net
# Outcome learner: single-hidden-layer network with a linear output unit.
# size, decay and maxit are forwarded to nnet(); tracing is switched off.
fit_y_nn <- function(X, y, size=4, decay=1e-4, maxit=500) {
  train_df <- data.frame(y = y, X)
  nnet(y ~ ., data = train_df,
       size = size, decay = decay, maxit = maxit, linout = TRUE, trace = FALSE)
}
# Network predictions on new covariates, as a numeric vector.
pred_y_nn <- function(fit, X) {
  net_out <- predict(fit, newdata = data.frame(X))
  as.numeric(net_out)
}

# Treatment learner: single-hidden-layer network fit on factor(d).
fit_d_nn <- function(X, d, size=3, decay=1e-4, maxit=500) {
  train_df <- data.frame(d = factor(d), X)
  nnet(d ~ ., data = train_df,
       size = size, decay = decay, maxit = maxit, trace = FALSE)
}
# Raw network output as a numeric vector; when the output is a two-column
# matrix, the second column is returned.
pred_d_nn <- function(fit, X) {
  raw_out <- predict(fit, newdata = data.frame(X), type = "raw")
  has_two_cols <- is.matrix(raw_out) && ncol(raw_out) == 2
  if (has_two_cols) {
    raw_out <- raw_out[, 2]
  }
  as.numeric(raw_out)
}


# 4) DML con cross-fitting

In [23]:
# Double/debiased ML estimate of theta in the partially linear model, using
# K-fold cross-fitting.
#
# For each fold k, both nuisance learners are fit on the observations outside
# the fold and used to predict the observations inside it, so every residual
# is out-of-sample. theta and its standard error then come from
# plm_theta_se() applied to the residualised outcome and treatment.
#
# Arguments:
#   y, d  numeric vectors (outcome, treatment) of equal length.
#   X     covariates with one row per observation; must support
#         X[rows, , drop = FALSE].
#   K     number of folds; a whole number with 2 <= K <= length(y).
#   ml_y  list(fit = function(X, y), pred = function(fit, X)) predicting y from X.
#   ml_d  same layout, predicting d from X.
#   return_nuisance_rmse  if TRUE, also return the fold-averaged held-out RMSE
#         of each nuisance learner.
#
# Returns list(theta, se) plus rmse_y and rmse_d when requested.
# Stops with an error on mismatched input lengths or an invalid K.
dml_plm <- function(y, d, X, K=2,
                    ml_y = list(fit=fit_y_lasso, pred=pred_y_lasso),
                    ml_d = list(fit=fit_d_lasso, pred=pred_d_lasso),
                    return_nuisance_rmse = TRUE) {
  n <- length(y)
  if (length(d) != n || NROW(X) != n) {
    stop("y, d and X must have the same number of observations.", call. = FALSE)
  }
  # K = 1 would leave an empty training set, and 1:K with K < 1 counts
  # backwards, so reject anything that is not a usable fold count.
  if (!is.numeric(K) || length(K) != 1L || is.na(K) ||
      K != floor(K) || K < 2 || K > n) {
    stop("K must be a whole number with 2 <= K <= length(y) for cross-fitting.",
         call. = FALSE)
  }

  # Random, (almost) balanced fold labels.
  folds <- sample(rep(seq_len(K), length.out = n))
  # m_hat holds predictions of y given X, g_hat predictions of d given X.
  m_hat <- g_hat <- rep(NA_real_, n)
  rmse_y_folds <- rmse_d_folds <- numeric(K)

  for (k in seq_len(K)) {
    I_tr <- which(folds != k)
    I_te <- which(folds == k)
    # Fit order (outcome, then treatment) is kept fixed so that learners
    # which draw random numbers stay reproducible under a given seed.
    fit_m <- ml_y$fit(X[I_tr, , drop = FALSE], y[I_tr])
    fit_g <- ml_d$fit(X[I_tr, , drop = FALSE], d[I_tr])
    m_hat[I_te] <- ml_y$pred(fit_m, X[I_te, , drop = FALSE])
    g_hat[I_te] <- ml_d$pred(fit_g, X[I_te, , drop = FALSE])
    if (return_nuisance_rmse) {
      rmse_y_folds[k] <- rmse(y[I_te], m_hat[I_te])
      rmse_d_folds[k] <- rmse(d[I_te], g_hat[I_te])
    }
  }

  # Residual-on-residual regression.
  y_tilde <- y - m_hat
  d_tilde <- d - g_hat
  est <- plm_theta_se(y_tilde, d_tilde)

  out <- list(theta = est$theta, se = est$se)
  if (return_nuisance_rmse) {
    out$rmse_y <- mean(rmse_y_folds)
    out$rmse_d <- mean(rmse_d_folds)
  }
  out
}


# 5) DML SIN cross-fitting

In [25]:
# Residual-on-residual estimate of theta WITHOUT cross-fitting.
#
# The sample is split into K random folds, but within each fold the nuisance
# learners are fit and evaluated on the same observations, so all residuals
# (and the reported RMSEs) are in-sample. K = 1 uses the full sample at once.
#
# Arguments:
#   y, d  numeric vectors (outcome, treatment) of equal length.
#   X     covariates with one row per observation; must support
#         X[rows, , drop = FALSE].
#   K     number of folds; a whole number with 1 <= K <= length(y).
#   ml_y  list(fit = function(X, y), pred = function(fit, X)) predicting y from X.
#   ml_d  same layout, predicting d from X.
#   return_nuisance_rmse  if TRUE, also return the fold-averaged in-sample RMSE
#         of each nuisance learner.
#
# Returns list(theta, se) plus rmse_y and rmse_d when requested.
# Stops with an error on mismatched input lengths or an invalid K.
dml_plm_no_cf <- function(y, d, X, K=2,
                          ml_y = list(fit=fit_y_lasso, pred=pred_y_lasso),
                          ml_d = list(fit=fit_d_lasso, pred=pred_d_lasso),
                          return_nuisance_rmse = TRUE) {
  n <- length(y)
  if (length(d) != n || NROW(X) != n) {
    stop("y, d and X must have the same number of observations.", call. = FALSE)
  }
  # 1:K with K < 1 counts backwards (K = 0 would silently act like two folds),
  # so reject anything that is not a usable fold count.
  if (!is.numeric(K) || length(K) != 1L || is.na(K) ||
      K != floor(K) || K < 1 || K > n) {
    stop("K must be a whole number with 1 <= K <= length(y).", call. = FALSE)
  }

  # Random, (almost) balanced fold labels.
  folds <- sample(rep(seq_len(K), length.out = n))
  # m_hat holds predictions of y given X, g_hat predictions of d given X.
  m_hat <- g_hat <- rep(NA_real_, n)
  rmse_y_folds <- rmse_d_folds <- numeric(K)

  for (k in seq_len(K)) {
    I_k <- which(folds == k)
    # Fit and predict on the same fold: deliberately in-sample.
    fit_m <- ml_y$fit(X[I_k, , drop = FALSE], y[I_k])
    fit_g <- ml_d$fit(X[I_k, , drop = FALSE], d[I_k])
    m_hat[I_k] <- ml_y$pred(fit_m, X[I_k, , drop = FALSE])
    g_hat[I_k] <- ml_d$pred(fit_g, X[I_k, , drop = FALSE])
    if (return_nuisance_rmse) {
      rmse_y_folds[k] <- rmse(y[I_k], m_hat[I_k])
      rmse_d_folds[k] <- rmse(d[I_k], g_hat[I_k])
    }
  }

  # Residual-on-residual regression.
  y_tilde <- y - m_hat
  d_tilde <- d - g_hat
  est <- plm_theta_se(y_tilde, d_tilde)

  out <- list(theta = est$theta, se = est$se)
  if (return_nuisance_rmse) {
    out$rmse_y <- mean(rmse_y_folds)
    out$rmse_d <- mean(rmse_d_folds)
  }
  out
}


# 6) Ejecuta: CF y No-CF con 4 modelos

In [27]:
# Registry of learner pairs, keyed by the method label shown in the result
# tables. Each entry carries an outcome learner (ml_y) and a treatment learner
# (ml_d), both in the list(fit = , pred = ) layout the estimators expect.
learners <- local({
  bundle <- function(fit_y, pred_y, fit_d, pred_d) {
    list(ml_y = list(fit = fit_y, pred = pred_y),
         ml_d = list(fit = fit_d, pred = pred_d))
  }
  list(
    "OLS+LOGIT" = bundle(fit_y_ols,   pred_y_ols,   fit_d_logit, pred_d_logit),
    "LASSO"     = bundle(fit_y_lasso, pred_y_lasso, fit_d_lasso, pred_d_lasso),
    "RF"        = bundle(fit_y_rf,    pred_y_rf,    fit_d_rf,    pred_d_rf),
    "NN"        = bundle(fit_y_nn,    pred_y_nn,    fit_d_nn,    pred_d_nn)
  )
})

# Run one estimator (`fun`) for every entry of `learners` and stack the results.
#
# The RNG is reset to seed 42 before each learner, so every method starts from
# the same random state. Returns a data.table with one row per learner:
# Method, theta, se, two-sided normal p-value, rmse_y and rmse_d.
run_block <- function(fun, y, d, X, K, learners) {
  evaluate_one <- function(name) {
    set.seed(42)
    spec <- learners[[name]]
    fit <- fun(y = y, d = d, X = X, K = K, ml_y = spec$ml_y, ml_d = spec$ml_d)
    z_stat <- fit$theta / fit$se
    data.table(
      Method = name,
      theta  = fit$theta,
      se     = fit$se,
      pval   = 2 * pnorm(-abs(z_stat)),
      rmse_y = fit$rmse_y,
      rmse_d = fit$rmse_d
    )
  }
  rows <- lapply(names(learners), evaluate_one)
  rbindlist(rows)
}

# Number of folds handed to both estimators.
K <- 2
# Evaluate every learner with and without cross-fitting, tagging each table
# with the scheme that produced it.
tab_cf   <- run_block(dml_plm,       y, d, X, K, learners)[, CrossFitting:="Yes"]
tab_nocf <- run_block(dml_plm_no_cf, y, d, X, K, learners)[, CrossFitting:="No"]
# Stack both tables, ordered by scheme and then by method name.
results_all <- rbind(tab_cf, tab_nocf)[order(CrossFitting, Method)]
# Display the combined table.
results_all


Method,theta,se,pval,rmse_y,rmse_d,CrossFitting
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
LASSO,-0.0730846,0.03506389,0.03713045,1.188764,0.4743361,No
NN,-0.06428574,0.03488759,0.06537964,1.150431,0.4621387,No
OLS+LOGIT,-0.07199566,0.03509784,0.04023922,1.188087,0.4727075,No
RF,-0.06690366,0.03557079,0.05999101,1.098698,0.4384868,No
LASSO,-0.07535457,0.03529125,0.03274307,1.198176,0.4745012,Yes
NN,-0.06656165,0.03537802,0.05991199,1.22649,0.4818913,Yes
OLS+LOGIT,-0.0744631,0.03517661,0.0342738,1.197637,0.4753144,Yes
RF,-0.08434902,0.03485202,0.01551179,1.205164,0.4807615,Yes


# 7) OLS con controles como benchmark

In [29]:
# Benchmark: OLS of y on the treatment d and all controls X.
df_full <- data.frame(y = y, d = d, X)
ols_full <- lm(y ~ d + ., data = df_full)
ols_sum  <- summary(ols_full)

# Coefficient on d with its classical standard error and t-test p-value.
theta_ols_controls <- coef(ols_full)["d"]
se_ols_controls    <- coef(ols_sum)["d", "Std. Error"]
pval_ols_controls  <- coef(ols_sum)["d", "Pr(>|t|)"]

ols_row <- data.table(
  CrossFitting = "N/A", Method = "OLS with controls",
  theta = theta_ols_controls, se = se_ols_controls, pval = pval_ols_controls,
  # In-sample RMSE of the full OLS fit (which includes d as a regressor).
  rmse_y = rmse(y, predict(ols_full, newdata = df_full)),
  # In-sample RMSE of a logit of d on X only. The data frame is rebuilt without
  # y on purpose: with `d ~ .` on df_full the outcome would enter the
  # treatment model as a regressor.
  rmse_d = rmse(d, fitted(glm(d ~ ., data = data.frame(d = d, X),
                              family = binomial())))
)

# Append the benchmark row (fill = TRUE aligns columns by name) and display
# everything ordered by scheme and method.
results_all <- rbind(results_all, ols_row, fill = TRUE)
results_all[order(CrossFitting, Method)]


Method,theta,se,pval,rmse_y,rmse_d,CrossFitting
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
OLS with controls,-0.07257562,0.03527046,0.03967099,1.189979,0.4731577,
LASSO,-0.0730846,0.03506389,0.03713045,1.188764,0.4743361,No
NN,-0.06428574,0.03488759,0.06537964,1.150431,0.4621387,No
OLS+LOGIT,-0.07199566,0.03509784,0.04023922,1.188087,0.4727075,No
RF,-0.06690366,0.03557079,0.05999101,1.098698,0.4384868,No
LASSO,-0.07535457,0.03529125,0.03274307,1.198176,0.4745012,Yes
NN,-0.06656165,0.03537802,0.05991199,1.22649,0.4818913,Yes
OLS+LOGIT,-0.0744631,0.03517661,0.0342738,1.197637,0.4753144,Yes
RF,-0.08434902,0.03485202,0.01551179,1.205164,0.4807615,Yes


# 8) Selección de modelo (CF) y estimación final

In [31]:
# Rank the cross-fitted results by standard error and keep the top row.
# NOTE(review): this picks the learner with the smallest SE; selecting on
# held-out nuisance RMSE (rmse_y / rmse_d) may be the intended criterion -
# confirm against the assignment.
tab_cf_sorted <- tab_cf[order(se), ]
best_cf <- tab_cf_sorted[1L, ]
best_cf

# Re-run cross-fitted DML with a single learner and print a short summary.
#
# method: label of an entry in the global `learners` list; defaults to the
#         method stored in `best_cf`.
# Reads y, d, X and K from the enclosing environment. Prints theta, its
# standard error and the two-sided normal p-value, and invisibly returns the
# list produced by dml_plm().
run_final <- function(method = best_cf$Method){
  spec <- learners[[method]]
  out <- dml_plm(y = y, d = d, X = X, K = K, ml_y = spec$ml_y, ml_d = spec$ml_d)
  z_stat <- out$theta / out$se
  p_two_sided <- 2 * pnorm(-abs(z_stat))
  cat(sprintf("\nFinal DML (CF) con %s\n", method))
  cat(sprintf("theta=%.4f, se=%.4f, pval=%.4g\n", out$theta, out$se, p_two_sided))
  invisible(out)
}
# Example:
# final_fit <- run_final()


Method,theta,se,pval,rmse_y,rmse_d,CrossFitting
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
RF,-0.08434902,0.03485202,0.01551179,1.205164,0.4807615,Yes


# 9) Tablas legibles

In [33]:
# Print a results table under a title, framed by two horizontal rules.
#
# TAB:   data.table with columns CrossFitting, Method, theta, se, pval,
#        rmse_y and rmse_d.
# title: heading printed above the table.
# Estimates and RMSEs are rounded to 4 decimals, p-values to 3 significant
# digits; TAB itself is not modified.
print_table <- function(TAB, title="Resultados") {
  rule <- "-------------------------------------------------------------\n"
  cat("\n", title, "\n")
  cat(rule)
  shown <- TAB[, .(CrossFitting, Method,
                   theta  = round(theta, 4),
                   se     = round(se, 4),
                   pval   = signif(pval, 3),
                   rmse_y = round(rmse_y, 4),
                   rmse_d = round(rmse_d, 4))]
  print(shown)
  cat(rule)
}

# Print the three report tables: cross-fitted results, non-cross-fitted
# results, and the combined table that also holds the OLS benchmark row.
print_table(tab_cf,   "Table A. DML con cross-fitting")
print_table(tab_nocf, "Table B. DML sin cross-fitting")
print_table(results_all, "Appendix. Todos los modelos (incluye OLS con controles)")



 Table A. DML con cross-fitting 
-------------------------------------------------------------
   CrossFitting    Method   theta     se   pval rmse_y rmse_d
1:          Yes OLS+LOGIT -0.0745 0.0352 0.0343 1.1976 0.4753
2:          Yes     LASSO -0.0754 0.0353 0.0327 1.1982 0.4745
3:          Yes        RF -0.0843 0.0349 0.0155 1.2052 0.4808
4:          Yes        NN -0.0666 0.0354 0.0599 1.2265 0.4819
-------------------------------------------------------------

 Table B. DML sin cross-fitting 
-------------------------------------------------------------
   CrossFitting    Method   theta     se   pval rmse_y rmse_d
1:           No OLS+LOGIT -0.0720 0.0351 0.0402 1.1881 0.4727
2:           No     LASSO -0.0731 0.0351 0.0371 1.1888 0.4743
3:           No        RF -0.0669 0.0356 0.0600 1.0987 0.4385
4:           No        NN -0.0643 0.0349 0.0654 1.1504 0.4621
-------------------------------------------------------------

 Appendix. Todos los modelos (incluye OLS con controles) 
-----

# 10) Respuestas en Markdown
# Emit the written answers as Markdown text. Backslashes are doubled so that
# cat() prints single backslashes for the LaTeX commands.
# NOTE(review): the text writes the outcome residual as y - g_hat(X) with g_0
# the structural function, while dml_plm residualises y on a learner fit of y
# on X alone - confirm which notation is intended.
# NOTE(review): the third bullet under "Cross-fitting vs no cross-fitting" is
# in Spanish while the rest is in English - confirm whether that is intended.
cat("
# Answers

## PLM and DML
We estimate the partially linear model
$ y = \\theta d + g_0(X) + \\varepsilon, \\quad d = m_0(X) + \\nu.$
DML uses cross-fitting to build out-of-sample residuals
$\\tilde y = y - \\hat g(X),\\; \\tilde d = d - \\hat m(X)$
and
$\\hat\\theta = \\frac{\\sum_i \\tilde d_i\\tilde y_i}{\\sum_i \\tilde d_i^2}$,
with IF-based standard errors.

## Cross-fitting vs no cross-fitting
- RMSE for predicting $y$ and $d$ is usually **smaller** without cross-fitting due to in-sample optimism.
- Lower RMSE there does **not** mean better causal inference; it reflects **overfitting** of nuisances.
- Sin cross-fitting, el sesgo de regularización se filtra al estimando y genera **sesgo** y **inferencias no conservadoras**.

## Selected model
Choose the CF method with the smallest SE in Table A and report its $\\hat\\theta$ as the final effect.
")
