In [1]:
import numpy as np
import pandas as pd
from scipy import stats

# Fix the seed so the simulated sample is reproducible
np.random.seed(5)

n = 100

# Potential outcome without treatment: y0 ~ N(100, 30^2)
y0 = np.random.normal(loc=100, scale=30, size=n)

# Constant treatment effect
te = 20

# Potential outcome with treatment: y1 = y0 + te + N(0, 10^2)
y1 = y0 + te + np.random.normal(loc=0, scale=10, size=n)

# Random assignment: treated when a standard-normal draw is positive
# (mirrors Stata's `drawnorm random` followed by `random > 0`)
random_normal = np.random.normal(loc=0, scale=1, size=n)
D = (random_normal > 0).astype(int)  # 1 if treated, 0 if control

# Observed outcome: y1 for the treated, y0 for the controls
y = D * y1 + (1 - D) * y0

# Collect everything in a DataFrame, as the dataset would look in Stata.
# `te` is stored as well (appended last so existing column positions are
# unchanged): later cells read df['te'], and the second simulation in this
# notebook also keeps its treatment-effect column.
df = pd.DataFrame({
    'y0': y0,
    'y1': y1,
    'D': D,
    'y': y,
    'te': np.full(n, te, dtype=float),
})

In [3]:
import os

import pandas as pd
import pyreadstat as st

pd.set_option('display.max_columns', None)

# Folder that holds the problem-set .dta files. Set the PS5_DATA_DIR
# environment variable to point somewhere else; when it is unset the
# original author's local folder is used, so existing runs are unaffected.
DATA_DIR = os.environ.get(
    "PS5_DATA_DIR",
    r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 2T\[MT08-MT13] Microeconometría II\Clases prácticas\PS\PS5-20251103",
)
path = os.path.join(DATA_DIR, "ps5_ex2.dta")

# read_dta returns the data together with the file's metadata
df, meta = st.read_dta(path)
df.head(1)

Unnamed: 0,y0,te,y1,random,D,y,U
0,89.203667,20.0,102.014397,0.775717,1.0,102.014397,0.0


In [4]:
# Split the observed outcome by treatment status
is_treated = df['D'].eq(1)
is_control = df['D'].eq(0)
y_treated = df.loc[is_treated, 'y']
y_control = df.loc[is_control, 'y']

# Difference in sample means, treated minus control
ate_sample = y_treated.mean() - y_control.mean()
ate_sample

np.float64(19.67476337727264)

In [5]:
# Two-sample t-test of treated vs. control outcomes, pooling the variance
ttest_result = stats.ttest_ind(y_treated, y_control, equal_var=True)
t_stat, p_value = ttest_result.statistic, ttest_result.pvalue
t_stat, p_value

(np.float64(3.226834906864113), np.float64(0.001702316698383185))

In [6]:
# Assemble the Stata-style report first, then emit it in a single write
report_lines = [
    "Two-sample t test with equal variances",
    f"Mean treated (D=1): {y_treated.mean():.3f}",
    f"Mean control (D=0): {y_control.mean():.3f}",
    f"ATE sample (treated - control): {ate_sample:.3f}",
    f"t statistic: {t_stat:.3f}",
    f"p-value: {p_value:.3f}",
]
print("\n".join(report_lines))


Two-sample t test with equal variances
Mean treated (D=1): 116.619
Mean control (D=0): 96.944
ATE sample (treated - control): 19.675
t statistic: 3.227
p-value: 0.002


In [7]:
# Mean of the `te` column over the whole sample
ATE_true = df.loc[:, 'te'].mean()
ATE_true

np.float64(20.0)

In [8]:
# Mean of the `te` column among treated units (D == 1)
treated_rows = df['D'].eq(1)
ATT_true = df.loc[treated_rows, 'te'].mean()
ATT_true


np.float64(20.0)

In [9]:
# Mean of the `te` column among untreated units (D == 0)
untreated_rows = df['D'].eq(0)
ATU_true = df.loc[untreated_rows, 'te'].mean()
ATU_true


np.float64(20.0)

In [10]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# =========================================
# 1) Setup: sample size and seed
# =========================================
np.random.seed(2703)
n = 100

# =========================================
# 2) Potential outcomes
# =========================================
# y0 ~ N(100, 30^2)
y0 = np.random.normal(loc=100, scale=30, size=n)

# Heterogeneous treatment effect: te ~ N(20, 10^2)
te = np.random.normal(loc=20, scale=10, size=n)

# y1 = y0 + te + N(0, 10^2)
y1 = y0 + te + np.random.normal(loc=0, scale=10, size=n)

# =========================================
# 3) Random assignment A (the "random > 0" rule)
# =========================================
random_normal = np.random.normal(loc=0, scale=1, size=n)
A = (random_normal > 0).astype(int)

# =========================================
# 4) Compliance types (always-taker, never-taker, defier, complier)
# =========================================
# A single uniform draw is cut into four bands: 30% / 20% / 20% / 30%.
random2 = np.random.uniform(0, 1, size=n)

alwaystaker = (random2 < 0.3)
nevertaker  = (random2 >= 0.3) & (random2 < 0.5)
defier      = (random2 >= 0.5) & (random2 < 0.7)
# `>=` (not `>`) so that a draw of exactly 0.7 is not left without a type:
# the four masks must partition the sample.
complier    = (random2 >= 0.7)

# =========================================
# 5) Treatment actually received, D
# =========================================
D = np.zeros(n, dtype=int)

# Always-takers take the treatment regardless of assignment
D[alwaystaker] = 1

# Never-takers stay at 0 (already the default; kept for explicitness)
D[nevertaker] = 0

# Compliers follow their assignment A
D[complier] = A[complier]

# Defiers do the opposite of their assignment A
D[defier] = 1 - A[defier]

# =========================================
# 6) Observed outcome y
# =========================================
y = D * y1 + (1 - D) * y0

# Stata-like dataset with one row per individual
df = pd.DataFrame({
    "y0": y0,
    "y1": y1,
    "te": te,
    "A": A,
    "D": D,
    "y": y,
    "alwaystaker": alwaystaker.astype(int),
    "nevertaker": nevertaker.astype(int),
    "defier": defier.astype(int),
    "complier": complier.astype(int),
})

# =========================================
# 7) OLS of y on D (Stata equivalent: regress y D)
# =========================================
# add_constant prepends the intercept column, which Stata includes by default
X = sm.add_constant(df["D"])
ols_spec = sm.OLS(endog=df["y"], exog=X)
model = ols_spec.fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     1.268
Date:                Sun, 16 Nov 2025   Prob (F-statistic):              0.263
Time:                        00:32:08   Log-Likelihood:                -482.33
No. Observations:                 100   AIC:                             968.7
Df Residuals:                      98   BIC:                             973.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        106.4359      5.213     20.416      0.0

In [11]:
import os

import pandas as pd
import pyreadstat as st

pd.set_option('display.max_columns', None)

# Folder that holds the problem-set .dta files. Set the PS5_DATA_DIR
# environment variable to point somewhere else; when it is unset the
# original author's local folder is used, so existing runs are unaffected.
DATA_DIR = os.environ.get(
    "PS5_DATA_DIR",
    r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 2T\[MT08-MT13] Microeconometría II\Clases prácticas\PS\PS5-20251103",
)
path = os.path.join(DATA_DIR, "ps5_ex2_2.dta")

# read_dta returns the data together with the file's metadata
df, meta = st.read_dta(path)
df.head(1)

Unnamed: 0,y0,te,y1,random,A,random2,alwaystaker,nevertaker,defier,complier,D,y
0,89.203667,12.81073,109.771561,-0.209102,0.0,0.492404,0.0,1.0,0.0,0.0,0.0,89.203667


In [12]:
# OLS of y on D with an intercept, as Stata's `regress y D` would fit it
X = sm.add_constant(df["D"])
ols_spec = sm.OLS(endog=df["y"], exog=X)
model = ols_spec.fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.036
Method:                 Least Squares   F-statistic:                     4.645
Date:                Sun, 16 Nov 2025   Prob (F-statistic):             0.0336
Time:                        00:32:31   Log-Likelihood:                -488.65
No. Observations:                 100   AIC:                             981.3
Df Residuals:                      98   BIC:                             986.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         98.4163      4.775     20.612      0.0

In [13]:
from linearmodels.iv import IV2SLS

# y on a constant and D, with D instrumented by the assignment A
iv_spec = IV2SLS.from_formula("y ~ 1 + [D ~ A]", data=df)

# "unadjusted" covariance, chosen to be comparable with Stata's default
iv_model = iv_spec.fit(cov_type="unadjusted")

print("\n=== IV 2SLS: ivregress 2sls y (D=A) ===")
print(iv_model.summary)


=== IV 2SLS: ivregress 2sls y (D=A) ===
                          IV-2SLS Estimation Summary                          
Dep. Variable:                      y   R-squared:                     -0.1423
Estimator:                    IV-2SLS   Adj. R-squared:                -0.1539
No. Observations:                 100   F-statistic:                    0.0946
Date:                Sun, Nov 16 2025   P-value (F-stat)                0.7584
Time:                        00:37:01   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      113.81     25.700     4.4284     0.0000      63.439      164.18
D          

In [1]:
import numpy as np
from scipy.spatial.distance import cdist
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS
from statsmodels.discrete.discrete_model import Logit


def generate_dgp_exercise3(n, seed=None, weak_instrument=False):
    """
    Simulate data for Exercise 3: instrumental variables (IV).

    Data-generating process with endogeneity:
    - Z ~ Bernoulli(0.5) (instrument)
    - X ~ N(0, 1) (exogenous covariate)
    - v ~ N(0, 1) (first-stage error)
    - D = 0.2 + gamma*Z + 0.5*X + v, with gamma = 0.3 (strong) or 0.05 (weak)
    - epsilon ~ N(0, 1) (idiosyncratic error)
    - u = 0.8*v + epsilon (source of endogeneity: u is correlated with v)
    - Y = 5 + 2*D + X + u
    - True effect of D on Y = 2

    Parameters
    ----------
    n : int
        Sample size.
    seed : int, optional
        Seed for reproducibility. When given, the draws come from a private
        ``np.random.RandomState(seed)``, which yields exactly the same numbers
        as calling ``np.random.seed(seed)`` first but leaves NumPy's global
        generator untouched. When ``None``, the global generator is used.
    weak_instrument : bool, default=False
        If True, use gamma=0.05 (weak instrument);
        if False, use gamma=0.3 (strong instrument).

    Returns
    -------
    dict
        Dictionary with keys:
        - 'Y': outcome variable (array)
        - 'D': endogenous treatment variable (array)
        - 'Z': instrumental variable (array)
        - 'X': exogenous covariate (array)
        - 'true_ate': true effect (float)
        - 'instrument_strength': 'strong' or 'weak'
    """
    # Local generator when seeded, so calling this function never reseeds the
    # caller's global random state as a hidden side effect.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # First-stage coefficient on the instrument
    gamma = 0.05 if weak_instrument else 0.3

    # Draw order (Z, X, v, epsilon) is kept fixed so a given seed always
    # reproduces the same sample.
    Z = rng.binomial(1, 0.5, n)
    X = rng.normal(0, 1, n)

    # Endogenous treatment (first stage)
    v = rng.normal(0, 1, n)
    D = 0.2 + gamma * Z + 0.5 * X + v

    # Outcome whose error shares the component v with the treatment equation
    epsilon = rng.normal(0, 1, n)
    u = 0.8 * v + epsilon
    Y = 5 + 2 * D + X + u

    return {
        'Y': Y,
        'D': D,
        'Z': Z,
        'X': X,
        'true_ate': 2.0,
        'instrument_strength': 'weak' if weak_instrument else 'strong'
    }


def estimate_2sls(y, d, z, x):
    """
    Estimate the effect of d on y by Two-Stage Least Squares (2SLS).

    First stage:  D = pi0 + pi1*Z + pi2*X + v
    Second stage: Y = beta0 + beta1*D_hat + beta2*X + u

    The reported estimate is beta1.

    Parameters
    ----------
    y : array-like
        Outcome variable.
    d : array-like
        Endogenous (treatment) variable.
    z : array-like
        Instrument(s); 1-D for a single instrument or 2-D with one column per
        instrument.
    x : array-like
        Exogenous covariate(s); 1-D or 2-D with one column per covariate.

    Returns
    -------
    dict
        Dictionary with:
        - 'ate_ols': coefficient on d from naive OLS of y on [1, d, x] (biased)
        - 'ate_2sls': 2SLS coefficient on d
        - 'se_2sls': 2SLS standard error of that coefficient
        - 'ci_lower': lower bound of the 95% confidence interval
        - 'ci_upper': upper bound of the 95% confidence interval
        - 'first_stage_f': F statistic for the joint significance of the
          instruments in the first stage (float)

    Raises
    ------
    numpy.linalg.LinAlgError
        If the instrument matrix or the projected design is singular.
    """
    y = np.asarray(y)
    d = np.asarray(d)
    z = np.asarray(z)
    x = np.asarray(x)
    n = len(y)

    # Work with column matrices so single and multiple regressors share a path
    if z.ndim == 1:
        z = z.reshape(-1, 1)
    if x.ndim == 1:
        x = x.reshape(-1, 1)

    intercept = np.ones(n)
    # Regressors [1, D, X] and instruments [1, Z, X]
    X_design = np.column_stack([intercept, d, x])
    W = np.column_stack([intercept, z, x])

    # Naive OLS, reported only for comparison
    ate_ols = OLS(y, X_design).fit().params[1]

    # First stage D ~ 1 + Z + X and the F-test that all instrument
    # coefficients are zero. Instruments occupy columns 1..k of W, right
    # after the constant.
    first_stage = OLS(d, W).fit()
    k_instruments = z.shape[1]
    r_matrix = np.zeros((k_instruments, W.shape[1]))
    r_matrix[np.arange(k_instruments), np.arange(k_instruments) + 1] = 1
    # Depending on the statsmodels version, fvalue is a scalar or a 1x1
    # array; normalise to a plain float.
    first_stage_f = float(np.squeeze(first_stage.f_test(r_matrix).fvalue))

    # Projection of the regressors on the instruments,
    # X_hat = W (W'W)^-1 W'X, computed through the (small) normal equations
    # instead of materialising the n x n projection matrix.
    X_hat = W @ np.linalg.solve(W.T @ W, W.T @ X_design)

    # 2SLS estimator: beta = (X_hat'X)^-1 X_hat'y
    XhX = X_hat.T @ X_design
    beta_2sls = np.linalg.solve(XhX, X_hat.T @ y)

    # Residuals must use the ORIGINAL regressors (not X_hat); running OLS on
    # the fitted values would give the same beta but wrong standard errors.
    residuals = y - X_design @ beta_2sls

    # Error variance with a small-sample (n - k) correction. Software that
    # divides by n instead reports slightly smaller standard errors.
    sigma2 = (residuals ** 2).sum() / (n - X_design.shape[1])

    # Homoskedastic covariance: sigma^2 (X_hat'X)^-1. X_hat'X equals
    # X_hat'X_hat because the projection is symmetric and idempotent.
    vcov = sigma2 * np.linalg.inv(XhX)
    se_2sls = np.sqrt(np.diag(vcov))

    ate_2sls = beta_2sls[1]
    se_ate = se_2sls[1]

    # Normal-approximation 95% confidence interval
    ci_lower = ate_2sls - 1.96 * se_ate
    ci_upper = ate_2sls + 1.96 * se_ate

    return {
        'ate_ols': ate_ols,
        'ate_2sls': ate_2sls,
        'se_2sls': se_ate,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'first_stage_f': first_stage_f
    }

In [21]:
import numpy as np
import pandas as pd
from statsmodels.regression.linear_model import OLS
from linearmodels.iv import IV2SLS

# Simulate one sample from the strong-instrument design
data = generate_dgp_exercise3(n=1000, seed=123, weak_instrument=False)
y, d, z, x = (data[name] for name in ('Y', 'D', 'Z', 'X'))

# --- Hand-rolled 2SLS ---
res = estimate_2sls(y, d, z, x)
print("=== Mi 2SLS casero ===")
print("beta_D      =", res['ate_2sls'])
print("se(beta_D)  =", res['se_2sls'])
print("F 1ra etapa =", res['first_stage_f'])

# --- Reference implementation: linearmodels IV2SLS ---
df = pd.DataFrame(dict(y=y, D=d, Z=z, X=x))

# X enters as an exogenous control; D is instrumented by Z
iv_spec = IV2SLS.from_formula("y ~ 1 + X + [D ~ Z]", data=df)
iv_model = iv_spec.fit(cov_type="unadjusted")

first_stage_diag = iv_model.first_stage.diagnostics
print("\n=== IV 2SLS (IV2SLS) ===")
print("beta_D      =", iv_model.params['D'])
print("se(beta_D)  =", iv_model.std_errors['D'])
print("F 1ra etapa =", first_stage_diag['f.stat'].iloc[0])

=== Mi 2SLS casero ===
beta_D      = 1.8138076803606629
se(beta_D)  = 0.28912732911094885
F 1ra etapa = 23.16967953996404

=== IV 2SLS (IV2SLS) ===
beta_D      = 1.8138076803607066
se(beta_D)  = 0.28869331236021034
F 1ra etapa = 23.239397733163443
