In [4]:
# Verificar e instalar librerías si es necesario
try:
    import numpy as np
    print("✓ numpy instalado")
except ImportError:
    !pip install numpy
    import numpy as np

try:
    import pandas as pd
    print("✓ pandas instalado")
except ImportError:
    !pip install pandas
    import pandas as pd

try:
    from scipy import stats
    print("✓ scipy instalado")
except ImportError:
    !pip install scipy
    from scipy import stats

try:
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    print("✓ statsmodels instalado")
except ImportError:
    !pip install statsmodels
    import statsmodels.api as sm
    import statsmodels.formula.api as smf

try:
    from sklearn.linear_model import LassoCV
    from sklearn.preprocessing import StandardScaler
    print("✓ scikit-learn instalado")
except ImportError:
    !pip install scikit-learn
    from sklearn.linear_model import LassoCV
    from sklearn.preprocessing import StandardScaler

print("¡Todas las librerías están listas!")

✓ numpy instalado
✓ pandas instalado
✓ scipy instalado


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyterlab 4.4.5 requires setuptools>=41.1.0, which is not installed.
jupyterlab-server 2.27.3 requires requests>=2.31, which is not installed.
jupyter-console 6.6.3 requires pygments, which is not installed.
nbconvert 7.16.6 requires pygments>=2.4.1, which is not installed.


Defaulting to user installation because normal site-packages is not writeable
Collecting statsmodels
  Downloading statsmodels-0.14.5-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting packaging>=21.3 (from statsmodels)
  Downloading packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Downloading statsmodels-0.14.5-cp313-cp313-win_amd64.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   --------- ------------------------------ 2.4/9.6 MB 10.9 MB/s eta 0:00:01
   --------------- ------------------------ 3.7/9.6 MB 8.6 MB/s eta 0:00:01
   -------------------- ------------------- 5.0/9.6 MB 7.9 MB/s eta 0:00:01
   ----------------------------- ---------- 7.1/9.6 MB 8.4 MB/s eta 0:00:01
   ------------------------------------ --- 8.7/9.6 MB 8.2 MB/s eta 0:00:01
   ---------------------------------------  9.4/9.6 MB 7.8 MB/s eta 0:00:01
   ---------

# 3.1 DATA SIMULATION

In [27]:
# Seed NumPy's global generator so every run draws the same sample.
np.random.seed(42)

# --- 1. Covariates for n = 1000 individuals ---------------------------------
n = 1000
# All draws share the seeded global stream, so the order of the calls below
# (X1, X3, X2, X4, D, epsilon) fixes which values each variable receives.
# X1 and X3: continuous, N(0, 1).
X1 = np.random.normal(loc=0, scale=1, size=n)
X3 = np.random.normal(loc=0, scale=1, size=n)
# X2 and X4: binary, Bernoulli(0.5) drawn as a one-trial binomial.
X2 = np.random.binomial(n=1, p=0.5, size=n)
X4 = np.random.binomial(n=1, p=0.5, size=n)

# --- 2. Treatment and noise --------------------------------------------------
# Treatment indicator D ~ Bernoulli(0.5).
D = np.random.binomial(n=1, p=0.5, size=n)
# Random error epsilon ~ N(0, 1).
epsilon = np.random.normal(loc=0, scale=1, size=n)

# --- 3. Outcome --------------------------------------------------------------
# Data-generating process from lab 3. Y does not depend on X4, which
# suggests X4 is not a relevant variable.
Y = 2 * D + 0.5 * X1 - 0.3 * X2 + 0.2 * X3 + epsilon

In [28]:
# 4. Collect the simulated arrays into one DataFrame; the keyword order
# sets the column order (Y, D, X1, X2, X3, X4).
df = pd.DataFrame(dict(Y=Y, D=D, X1=X1, X2=X2, X3=X3, X4=X4))

# Banner: a 60-character rule above and below the section title.
print("=" * 60, "3.1 DATA SIMULATION - COMPLETADO", "=" * 60, sep="\n")
# Preview of the first five rows, then the frame's (rows, columns) shape.
print("Primeras 5 filas del DataFrame:", df.head(), sep="\n")
print(f"\nDimensión del DataFrame: {df.shape}")

3.1 DATA SIMULATION - COMPLETADO
Primeras 5 filas del DataFrame:
          Y  D        X1  X2        X3  X4
0  1.920756  1  0.496714   0  1.399355   0
1  1.711433  1 -0.138264   0  0.924634   1
2  1.649491  1  0.647689   0  0.059630   1
3 -0.800544  0  1.523030   0 -0.646937   0
4 -0.131596  0 -0.234153   1  0.698223   1

Dimensión del DataFrame: (1000, 6)


In [29]:
# Balance check (comparison of means): split the rows by treatment status.
# Rows with D == 1 form the treatment group, rows with D == 0 the control group.
tratamiento = df.loc[df['D'] == 1]
control = df.loc[df['D'] == 0]

In [30]:
def balance_check(variable):
    """Print a comparison of one covariate between the two treatment arms.

    Reads the module-level DataFrames ``tratamiento`` and ``control``, runs
    ``stats.ttest_ind`` on the requested column, and prints both group means,
    their difference (treatment minus control) and the test's p-value,
    followed by a 40-character rule.

    Parameters
    ----------
    variable
        Label of the column to compare; it must exist in both DataFrames.

    Returns
    -------
    None
        The results are only printed.
    """
    # Only the p-value is reported; the t statistic itself is discarded.
    _, p_value = stats.ttest_ind(tratamiento[variable], control[variable])
    # Compute each group mean once and reuse it for the difference.
    media_tratamiento = tratamiento[variable].mean()
    media_control = control[variable].mean()
    print(f"{variable}:")
    print(f"  Media tratamiento: {media_tratamiento:.3f}")
    print(f"  Media control: {media_control:.3f}")
    print(f"  Diferencia: {media_tratamiento - media_control:.3f}")
    print(f"  p-valor: {p_value:.3f}")
    print("-" * 40)


# Print the section header once, before the per-covariate reports. It was
# previously indented into balance_check, so it appeared after every covariate.
print("\nBALANCE CHECK - Comparación de medias")
for cov in ['X1', 'X2', 'X3', 'X4']:
    balance_check(cov)

X1:
  Media tratamiento: 0.049
  Media control: -0.009
  Diferencia: 0.058
  p-valor: 0.350
----------------------------------------

BALANCE CHECK - Comparación de medias
X2:
  Media tratamiento: 0.510
  Media control: 0.496
  Diferencia: 0.014
  p-valor: 0.655
----------------------------------------

BALANCE CHECK - Comparación de medias
X3:
  Media tratamiento: 0.077
  Media control: 0.065
  Diferencia: 0.013
  p-valor: 0.843
----------------------------------------

BALANCE CHECK - Comparación de medias
X4:
  Media tratamiento: 0.465
  Media control: 0.500
  Diferencia: -0.035
  p-valor: 0.271
----------------------------------------

BALANCE CHECK - Comparación de medias


# 3.2 ESTIMATING THE AVERAGE TREATMENT EFFECT

In [31]:
# 1. Simple regression: OLS of Y on the treatment indicator D alone.
modelo_simple = smf.ols(formula='Y ~ D', data=df).fit()
print("=== REGRESIÓN SIMPLE (Y ~ D) ===")
# The coefficient on D is reported as the ATE, rounded to three decimals,
# followed by its standard error.
print("Coeficiente de D (ATE):", round(modelo_simple.params.loc['D'], 3))
# end="\n\n" leaves the trailing blank line after the report.
print("Error estándar:", round(modelo_simple.bse.loc['D'], 3), end="\n\n")

=== REGRESIÓN SIMPLE (Y ~ D) ===
Coeficiente de D (ATE): 2.087
Error estándar: 0.072



In [32]:
# 2. Regression with controls: OLS of Y on D plus the four covariates.
modelo_completo = smf.ols(formula='Y ~ D + X1 + X2 + X3 + X4', data=df).fit()
print("=== REGRESIÓN CON CONTROLES (Y ~ D + X1 + X2 + X3 + X4) ===")
# The coefficient on D is reported as the ATE, rounded to three decimals,
# followed by its standard error.
print("Coeficiente de D (ATE):", round(modelo_completo.params.loc['D'], 3))
# end="\n\n" leaves the trailing blank line after the report.
print("Error estándar:", round(modelo_completo.bse.loc['D'], 3), end="\n\n")

=== REGRESIÓN CON CONTROLES (Y ~ D + X1 + X2 + X3 + X4) ===
Coeficiente de D (ATE): 2.059
Error estándar: 0.062



In [33]:
# 3. Comparison of results: change in the D coefficient and in its standard
# error when moving from the simple model to the model with controls
# (model with controls minus simple model).
print(
    "=== COMPARACIÓN ===",
    f"Diferencia en ATE: {modelo_completo.params['D'] - modelo_simple.params['D']:.3f}",
    f"Diferencia en SE: {modelo_completo.bse['D'] - modelo_simple.bse['D']:.3f}",
    sep="\n",
)

=== COMPARACIÓN ===
Diferencia en ATE: -0.028
Diferencia en SE: -0.010


# 3.3 LASSO AND VARIABLE SELECTION

In [34]:
# Prepare the LASSO inputs: the four covariates only (D is not included)
# and the outcome variable.
X_cov = df.loc[:, ['X1', 'X2', 'X3', 'X4']]
y = df['Y']

# Standardise the covariates for LASSO. The fitted scaler is kept in
# `scaler`; fitting and transforming are done on the same data.
scaler = StandardScaler()
X_scaled = scaler.fit(X_cov).transform(X_cov)

In [35]:
# 1. Fit LASSO with 5-fold cross-validation on the standardised covariates.
lasso = LassoCV(cv=5, random_state=42).fit(X_scaled, y)

# Pair each covariate name with its fitted LASSO coefficient.
coeficientes = pd.DataFrame(
    dict(Variable=X_cov.columns, Coeficiente=lasso.coef_)
)

# Selected variables are those whose coefficient is not exactly zero.
variables_seleccionadas = coeficientes.loc[
    coeficientes['Coeficiente'] != 0, 'Variable'
].tolist()

# Report the chosen penalty, the coefficient table and the selection,
# followed by one blank line.
print(
    "=== RESULTADOS LASSO ===",
    f"Lambda óptimo: {lasso.alpha_:.4f}",
    "Coeficientes LASSO:",
    coeficientes,
    f"Variables seleccionadas: {variables_seleccionadas}",
    sep="\n",
    end="\n\n",
)

=== RESULTADOS LASSO ===
Lambda óptimo: 0.0005
Coeficientes LASSO:
  Variable  Coeficiente
0       X1     0.541932
1       X2    -0.183172
2       X3     0.258659
3       X4    -0.034346
Variables seleccionadas: ['X1', 'X2', 'X3', 'X4']



In [36]:
# 2. Re-estimate the ATE using the variables selected by LASSO.
# D is always the first regressor and the selected covariates are appended
# after it, so an empty selection yields the plain formula 'Y ~ D'.
formula_lasso = 'Y ~ ' + ' + '.join(['D', *variables_seleccionadas])

modelo_lasso = smf.ols(formula=formula_lasso, data=df).fit()
print("=== REGRESIÓN CON VARIABLES SELECCIONADAS POR LASSO ===")
# The coefficient on D is reported as the ATE, rounded to three decimals,
# followed by its standard error.
print("Coeficiente de D (ATE):", round(modelo_lasso.params.loc['D'], 3))
# end="\n\n" leaves the trailing blank line after the report.
print("Error estándar:", round(modelo_lasso.bse.loc['D'], 3), end="\n\n")


=== REGRESIÓN CON VARIABLES SELECCIONADAS POR LASSO ===
Coeficiente de D (ATE): 2.059
Error estándar: 0.062



In [37]:
# 3. Final comparison: one line per fitted model with the D coefficient
# (reported as the ATE) and its standard error, then the reference value.
print("=== COMPARACIÓN FINAL ===")
print(
    *(
        f"{nombre_modelo} - ATE: {ajuste.params['D']:.3f}, SE: {ajuste.bse['D']:.3f}"
        for nombre_modelo, ajuste in (
            ("Modelo simple", modelo_simple),
            ("Modelo completo", modelo_completo),
            ("Modelo LASSO", modelo_lasso),
        )
    ),
    sep="\n",
)
# Reference value printed as a fixed string.
print("ATE verdadero: 2.000")

=== COMPARACIÓN FINAL ===
Modelo simple - ATE: 2.087, SE: 0.072
Modelo completo - ATE: 2.059, SE: 0.062
Modelo LASSO - ATE: 2.059, SE: 0.062
ATE verdadero: 2.000
