In [20]:
using Random, DataFrames, Distributions, StatsModels, GLM

In [21]:
using Random, DataFrames, Distributions

"""
    generar_datos(n::Int=1000; seed::Union{Int,Nothing}=nothing)

Simulate `n` observations of a randomized-treatment experiment.

The draws are made in this order: `X1`, `X2` from `Normal()`, `X3` from
`Bernoulli(0.5)`, `X4` from `Normal()`, the treatment `D` from `Bernoulli(0.5)`,
and a noise term from `Normal()`. The outcome is then

    Y = 2·D + 0.5·X1 − 0.3·X2 + 0.2·X3 + noise

`X4` is generated but does not enter `Y`.

# Keywords
- `seed`: when not `nothing`, `Random.seed!(seed)` is called before any draw, which
  reseeds the global RNG and makes the sample reproducible.

# Returns
A `DataFrame` with columns `Y`, `D`, `X1`, `X2`, `X3`, `X4`.
"""
function generar_datos(n::Int=1000; seed::Union{Int,Nothing}=nothing)
    # Seed only on request; with `seed === nothing` every call gives a new sample
    isnothing(seed) || Random.seed!(seed)

    gauss = Normal()
    coin = Bernoulli(0.5)

    # Keep this draw order: it determines which random numbers each column receives
    X1 = rand(gauss, n)
    X2 = rand(gauss, n)
    X3 = rand(coin, n)
    X4 = rand(gauss, n)
    D = rand(coin, n)
    noise = rand(gauss, n)

    # Outcome equation; X4 is deliberately absent
    Y = @. 2 * D + 0.5 * X1 - 0.3 * X2 + 0.2 * X3 + noise

    return DataFrame(:Y => Y, :D => D, :X1 => X1, :X2 => X2, :X3 => X3, :X4 => X4)
end

# Usage example: the seed is left unset so each run draws a new sample and the
# change in the LASSO-selected predictors can be observed across runs
df = generar_datos(1000; seed=nothing)
first(df, 5)


Row,Y,D,X1,X2,X3,X4
Unnamed: 0_level_1,Float64,Bool,Float64,Float64,Bool,Float64
1,-1.10063,False,-1.37881,-0.557993,False,-0.466615
2,2.57531,True,1.23118,-0.385044,False,0.01503
3,3.53803,True,0.888268,-1.27134,False,-0.91043
4,0.75127,True,-0.40482,-0.675685,False,-1.34471
5,0.411425,False,0.125845,1.30583,True,0.490851


In [22]:

# Two OLS fits of the outcome on the treatment D:
#   1) simple regression, Y ~ D
#   2) regression with controls, Y ~ D + X1 + X2 + X3 + X4
model_simple = lm(@formula(Y ~ D), df)
model_controls = lm(@formula(Y ~ D + X1 + X2 + X3 + X4), df)

# Coefficient tables (estimate, std. error, t, p-value, 95% confidence bounds)
ct_simple = coeftable(model_simple)
ct_controls = coeftable(model_controls)

println("---- Regresión simple: Y ~ D ----")
println(ct_simple, "\n")  # the extra "\n" leaves one blank line between the tables
println("---- Regresión con controles: Y ~ D + X1 + X2 + X3 + X4 ----")
println(ct_controls)



---- Regresión simple: Y ~ D ----
──────────────────────────────────────────────────────────────────────────
                 Coef.  Std. Error      t  Pr(>|t|)   Lower 95%  Upper 95%
──────────────────────────────────────────────────────────────────────────
(Intercept)  0.0656232   0.051531    1.27    0.2031  -0.0354983   0.166745
D            2.01306     0.0731691  27.51    <1e-99   1.86948     2.15664
──────────────────────────────────────────────────────────────────────────

---- Regresión con controles: Y ~ D + X1 + X2 + X3 + X4 ----
────────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)   Lower 95%   Upper 95%
────────────────────────────────────────────────────────────────────────────
(Intercept)   0.0109008   0.0544711   0.20    0.8414  -0.0959908   0.117792
D             2.06043     0.0632887  32.56    <1e-99   1.93623     2.18462
X1            0.51269     0.0316582  16.19    <1e-51   0.450565    0.5748

In [23]:
using DataFrames, HypothesisTests, Statistics

"""
    balance_check(df, covariables; treatment=:D)

Compare the mean of each covariate between the treated and control rows of `df`.

Rows where the `treatment` column equals `1` (or `true`) form the treated group and
rows where it equals `0` (or `false`) form the control group. For every covariate a
two-sample t-test with equal variances (`HypothesisTests.EqualVarianceTTest`) is run
on the two groups.

# Arguments
- `df`: data frame holding the treatment indicator and the covariates.
- `covariables`: names of the covariate columns to check.

# Keywords
- `treatment`: name of the treatment-indicator column (default `:D`).

# Returns
A `DataFrame` with one row per covariate and columns `variable`, `mean_treated`,
`mean_control` and `p_value`.
"""
function balance_check(
    df::AbstractDataFrame, covariables::AbstractVector{Symbol}; treatment::Symbol=:D
)
    results = DataFrame(
        variable = String[],
        mean_treated = Float64[],
        mean_control = Float64[],
        p_value = Float64[]
    )

    # The group masks do not depend on the covariate, so build them once
    is_treated = df[!, treatment] .== 1
    is_control = df[!, treatment] .== 0

    for var in covariables
        treated = df[is_treated, var]
        control = df[is_control, var]

        # Two-sample t-test assuming equal variances in both groups
        test = HypothesisTests.EqualVarianceTTest(treated, control)

        push!(results, (
            variable = String(var),
            mean_treated = mean(treated),
            mean_control = mean(control),
            p_value = pvalue(test)
        ))
    end

    return results
end

# Usage example: balance table for the covariates X1..X4
balance_df = balance_check(df, [Symbol("X", i) for i in 1:4])
println("Balance check (t-test por covariable):")
println(balance_df)


Balance check (t-test por covariable):
[1m4×4 DataFrame[0m
[1m Row [0m│[1m variable [0m[1m mean_treated [0m[1m mean_control [0m[1m p_value  [0m
     │[90m String   [0m[90m Float64      [0m[90m Float64      [0m[90m Float64  [0m
─────┼────────────────────────────────────────────────
   1 │ X1          -0.0601658    0.038604    0.118605
   2 │ X2          -0.0195868    0.00175376  0.734122
   3 │ X3           0.479839     0.498016    0.565791
   4 │ X4          -0.0753231    0.0146817   0.151505


In [24]:
using DataFrames, GLMNet, Random, Statistics

# Post-LASSO estimate of the effect of D: a cross-validated LASSO of Y on the
# candidate controls selects the controls, then Y ~ D + <selected controls> is
# re-fit by OLS. Uses the `df` built in an earlier cell.
#
# No seed is fixed here, so results vary between runs. For a reproducible run,
# regenerate the data with `generar_datos(1000, seed=123)` and call
# `Random.seed!(123)` before `glmnetcv` (presumably the CV folds are drawn from the
# global RNG — confirm against the GLMNet documentation).

# Candidate controls, named once: this list both builds the design matrix and labels
# the LASSO coefficients, so the two cannot get out of sync.
vars = ["X1", "X2", "X3", "X4"]

# Design matrix and response. D is not offered to the LASSO; it always enters the
# final OLS regression below.
X = Matrix(df[:, Symbol.(vars)])
y = df.Y

# LASSO (alpha=1) with 5-fold cross-validation
cvfit = glmnetcv(X, y; alpha=1, nfolds=5)

# λ with the lowest mean cross-validated loss
best_idx = argmin(cvfit.meanloss)
best_lambda = cvfit.lambda[best_idx]

println("Mejor λ (lambda_min): ", best_lambda)

# Coefficients at that λ. The vector has one entry per column of X, in the order of
# `vars`; the intercept is not part of it, so no entry has to be skipped.
coefs = GLMNet.coef(cvfit)

for (v, c) in zip(vars, coefs)
    println(rpad(v, 12), " -> ", round(c, digits=4))
end

# Controls kept by the LASSO (non-zero coefficient)
selected = [v for (v, c) in zip(vars, coefs) if !iszero(c)]
println("Variables seleccionadas: ", selected)

# Build the formula Y ~ D + <selected controls> programmatically:
# first term is the response, the remaining terms are the regressors.
terms = [Term(:Y), Term(:D), [Term(Symbol(v)) for v in selected]...]
formula = FormulaTerm(terms[1], tuple(terms[2:end]...))

# OLS on the selected specification.
# NOTE: this rebinds `model_controls` / `ct_controls`, which the earlier cell used
# for the regression with all four controls.
model_controls = GLM.lm(formula, df)
ct_controls = GLM.coeftable(model_controls)
println(ct_controls)

# In-sample fit of the post-LASSO OLS model (same rows used for estimation)
y_pred = GLM.predict(model_controls, df)
y_true = df[!, :Y]
mse = mean((y_true .- y_pred).^2)
println("MSE en el conjunto completo: ", mse)
r2 = cor(y_true, y_pred)^2
println("R² en el conjunto completo: ", r2)

Mejor λ (lambda_min): 0.025731704335141693
X1           -> 0.4377
X2           -> -0.2744
X3           -> 0.0
X4           -> -0.0035
Variables seleccionadas: ["X1", "X2", "X4"]
────────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)   Lower 95%   Upper 95%
────────────────────────────────────────────────────────────────────────────
(Intercept)   0.0459905   0.0444897   1.03    0.3015  -0.0413139   0.133295
D             2.05937     0.0632895  32.54    <1e-99   1.93518     2.18357
X1            0.514653    0.0316132  16.28    <1e-52   0.452617    0.576689
X2           -0.290878    0.0318425  -9.13    <1e-18  -0.353364   -0.228392
X4            0.0187461   0.0318962   0.59    0.5569  -0.0438454   0.0813376
────────────────────────────────────────────────────────────────────────────
MSE en el conjunto completo: 0.9917345986949503
R² en el conjunto completo: 0.5777528368017952


# Conclusion
---
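LASSO vs. OLS: In this run the cross-validated LASSO kept X1, X2 and X4 and excluded X3, and the post-LASSO OLS model reaches R² ≈ 0.5778. The selection should be read with care: in the simulation X3 does enter the outcome equation (true coefficient 0.2), while X4 does not, so this particular run dropped a weak but real predictor and retained a pure-noise one. Including X3 in the OLS model with all controls is therefore not unnecessary. Because no seed is fixed, the selected set changes from run to run. The coefficients for D, X1, and X2 are consistent across both models (D ≈ 2.06 in each, close to the true effect of 2), as expected when treatment is randomly assigned.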
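Performance: The post-LASSO OLS model's in-sample R² (0.5778) and MSE (0.9917) suggest a good fit; the MSE is close to 1, the variance of the simulated noise term. R² and MSE were not computed for the earlier models, so these figures alone do not show whether this model fits better than they do; a comparison requires the same metrics on the same sample.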

