In [1]:
import pandas as pd
import pyreadstat as st
path = r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 2T\[MT08-MT13] Microeconometría II\Clases\Stata\jobtraining.dta"

df, meta = st.read_dta(path)
df.head(1)

Unnamed: 0,train,age,educ,black,hisp,married,earn96,unem96,earn98,unem98
0,0,37,11,1,0,1,0.0,1,1.617924,0


In [2]:
df.shape

(1130, 10)

In [4]:
import numpy as np
from dataclasses import dataclass
from typing import List, Optional, Literal, Dict, Any
from sklearn.neighbors import NearestNeighbors
import statsmodels.api as sm
from scipy.stats import norm

@dataclass
class NNMatchResult:
    estimand: str
    coef: float
    se: float
    z: float
    p: float
    ci_low: float
    ci_high: float
    n: int
    matches_summary: Dict[str, Any]

def _whiten_mahalanobis(X: np.ndarray) -> np.ndarray:
    """Devuelve X transformada tal que la distancia euclídea en Z es Mahalanobis en X."""
    V = np.cov(X, rowvar=False)
    VI = np.linalg.inv(V)
    # Cholesky de VI: VI = U @ U.T  ->  ||U.T (x - y)||^2 = (x - y)' VI (x - y)
    U = np.linalg.cholesky(VI)
    Z = X @ U  # Euclídea en Z equivale a Mahalanobis en X
    return Z

def nnmatch(
    df: pd.DataFrame,
    y: str,
    t: str,
    x: List[str],
    *,
    estimand: Literal["ATE", "ATT"] = "ATE",
    n_neighbors: int = 1,
    metric: Literal["mahalanobis"] = "mahalanobis",
    bias_adj: bool = True,
    se_type: Literal["HC1", "bootstrap"] = "HC1",
    bootstrap: Optional[int] = None,
    random_state: Optional[int] = 2025,
    return_matches: bool = False,
) -> NNMatchResult | tuple[NNMatchResult, pd.DataFrame]:
    """
    Nearest-Neighbor Matching al estilo Stata teffects nnmatch, por defecto:
      - 1 vecino, Mahalanobis, con reemplazo
      - estimand = 'ATE' (o 'ATT')
      - ajuste de sesgo post-matching con OLS en la muestra emparejada
      - SE robustos (HC1). Opcional: bootstrap=B
    """
    # --- 0) limpieza básica
    d = df[[y, t] + x].dropna().copy()
    d[t] = d[t].astype(int)
    n = len(d)

    # --- 1) preparar matrices
    X = d[x].to_numpy(dtype=float)
    T = d[t].to_numpy(dtype=int)
    Y = d[y].to_numpy(dtype=float)

    # --- 2) transformar a espacio euclídeo equivalente a Mahalanobis
    if metric != "mahalanobis":
        raise ValueError("Por ahora solo se implementa 'mahalanobis'.")
    Z = _whiten_mahalanobis(X)

    # --- 3) separar tratados/controles y ajustar NNs
    treat_idx = np.where(T == 1)[0]
    ctrl_idx  = np.where(T == 0)[0]

    Z_t = Z[treat_idx]
    Z_c = Z[ctrl_idx]
    Y_t = Y[treat_idx]
    Y_c = Y[ctrl_idx]

    # Para cada tratado: vecino(s) control más cercano
    nn_c = NearestNeighbors(n_neighbors=n_neighbors, algorithm="auto").fit(Z_c)
    dist_tc, nbrs_tc = nn_c.kneighbors(Z_t, return_distance=True)

    # Para ATE, también matcheamos controles -> tratados
    if estimand == "ATE":
        nn_t = NearestNeighbors(n_neighbors=n_neighbors, algorithm="auto").fit(Z_t)
        dist_ct, nbrs_ct = nn_t.kneighbors(Z_c, return_distance=True)

    # --- 4) construir muestra emparejada
    # Usaremos un "panel" emparejado donde cada fila es una observación real con su "match outcome" del grupo opuesto.
    # Para ATT: solo promediamos sobre tratados; para ATE: sobre todos.

    # índices de matches (usamos 1-NN por default; si >1, promediamos)
    def _avg_match_values(base_idx, other_idx_map, Y_other):
        # base_idx: índices de la submuestra (tratados o controles) en el conjunto original
        # other_idx_map: matriz idx de vecinos (en subíndice de la submuestra opuesta)
        # Y_other: outcomes del grupo opuesto
        # Retorna vector del tamaño de base_idx con el promedio de Y de los vecinos
        if other_idx_map.ndim == 1:
            return Y_other[other_idx_map]
        else:
            return Y_other[other_idx_map].mean(axis=1)

    # Tratados: matched control outcome
    match_c_idx_sub = nbrs_tc[:, 0] if n_neighbors == 1 else nbrs_tc
    Yc_matched_for_t = _avg_match_values(treat_idx, match_c_idx_sub, Y_c)

    if estimand == "ATE":
        # Controles: matched treated outcome
        match_t_idx_sub = nbrs_ct[:, 0] if n_neighbors == 1 else nbrs_ct
        Yt_matched_for_c = _avg_match_values(ctrl_idx, match_t_idx_sub, Y_t)

    # --- 5) preparar DataFrame para regresión post-matching (bias adjustment)
    # Para ATT: usamos solo tratados + sus controles matcheados (dos "bandas")
    # Para ATE: usamos tratados con sus controles emparejados y controles con sus tratados emparejados
    rows = []

    # banda de tratados (observación real)
    for pos, i in enumerate(treat_idx):
        rows.append({
            "Y": Y[i],
            "T": 1,
            **{k: d.iloc[i][k] for k in x},
            "_unit": i,
            "_group": "treated_real"
        })
        # banda de control "match" sintético para el mismo tratado (mismo X del real? No: registramos el real del control emparejado)
        j_ctrl = ctrl_idx[match_c_idx_sub[pos]] if n_neighbors == 1 else ctrl_idx[match_c_idx_sub[pos][0]]
        rows.append({
            "Y": Yc_matched_for_t[pos],
            "T": 0,
            **{k: d.iloc[j_ctrl][k] for k in x},
            "_unit": j_ctrl,
            "_group": "control_match_for_treated"
        })

    if estimand == "ATE":
        # banda de controles y su match tratado
        for pos, j in enumerate(ctrl_idx):
            rows.append({
                "Y": Y[j],
                "T": 0,
                **{k: d.iloc[j][k] for k in x},
                "_unit": j,
                "_group": "control_real"
            })
            i_treat = treat_idx[match_t_idx_sub[pos]] if n_neighbors == 1 else treat_idx[match_t_idx_sub[pos][0]]
            rows.append({
                "Y": Yt_matched_for_c[pos],
                "T": 1,
                **{k: d.iloc[i_treat][k] for k in x},
                "_unit": i_treat,
                "_group": "treated_match_for_control"
            })

    matched_df = pd.DataFrame(rows)

    # --- 6) estimación: post-matching OLS (bias_adj=True)
    # Coeficiente de T ~ ATE/ATT ajustado por X en la muestra emparejada
    if bias_adj:
        Xreg = sm.add_constant(matched_df[["T"] + x])
        res = sm.OLS(matched_df["Y"].to_numpy(), Xreg).fit(cov_type="HC1")
        coef = res.params["T"]
        se = res.bse["T"]
    else:
        # Sin ajuste: diferencia emparejada simple
        if estimand == "ATT":
            att_simple = (Y_t - Yc_matched_for_t).mean()
            coef = float(att_simple)
        else:
            # ATE simple: promedio simétrico
            diff_t = (Y_t - Yc_matched_for_t).mean()
            diff_c = (Yt_matched_for_c - Y_c).mean()
            coef = float(0.5*diff_t + 0.5*diff_c)
        # SE por delta simple no es trivial; recomiendo bootstrap si sin ajuste
        se = np.nan

    # --- 7) Bootstrap opcional para SE
    if se_type == "bootstrap" or (bootstrap is not None and bootstrap > 0):
        B = int(bootstrap or 500)
        rng = np.random.default_rng(random_state)
        boot = []
        for _ in range(B):
            # re-muestreo por unidad original (no por filas del matched_df para evitar doblar probas)
            samp_idx = np.arange(n)
            samp = rng.choice(samp_idx, size=n, replace=True)
            d_b = d.iloc[samp].reset_index(drop=True)
            try:
                res_b = nnmatch(
                    d_b, y, t, x,
                    estimand=estimand, n_neighbors=n_neighbors,
                    metric=metric, bias_adj=bias_adj,
                    se_type="HC1", bootstrap=None, random_state=None, return_matches=False
                )
                boot.append(res_b.coef)
            except Exception:
                # puede fallar si la covarianza no es invertible; saltamos esa réplica
                continue
        if len(boot) >= 10:
            se = float(np.std(boot, ddof=1))
        else:
            # si no pudimos bootstrapiar suficiente, dejamos el SE HC1
            pass

    # --- 8) estadísticos
    z = coef / se if (se is not None and np.isfinite(se) and se > 0) else np.nan
    p = 2*(1 - norm.cdf(abs(z))) if np.isfinite(z) else np.nan
    ci_low = coef - 1.96*se if np.isfinite(se) else np.nan
    ci_high = coef + 1.96*se if np.isfinite(se) else np.nan

    # --- 9) resumen de matching
    match_summary = {
        "n_obs": n,
        "n_treated": int((T == 1).sum()),
        "n_control": int((T == 0).sum()),
        "neighbors": n_neighbors,
        "metric": metric,
        "mean_dist_t_to_c": float(dist_tc.mean()) if 'dist_tc' in locals() else None,
    }
    if estimand == "ATE" and 'dist_ct' in locals():
        match_summary["mean_dist_c_to_t"] = float(dist_ct.mean())

    result = NNMatchResult(
        estimand=estimand,
        coef=float(coef),
        se=float(se),
        z=float(z) if np.isfinite(z) else np.nan,
        p=float(p) if np.isfinite(p) else np.nan,
        ci_low=float(ci_low) if np.isfinite(ci_low) else np.nan,
        ci_high=float(ci_high) if np.isfinite(ci_high) else np.nan,
        n=n,
        matches_summary=match_summary
    )

    if return_matches:
        return result, matched_df
    return result

In [5]:
out = nnmatch(df, y="earn98", t="train", x=["earn96", "age", "educ"],
              estimand="ATE", n_neighbors=1, metric="mahalanobis",
              bias_adj=True, se_type="HC1", bootstrap=None)
out

NNMatchResult(estimand='ATE', coef=1.5369651104230515, se=0.2518237220648781, z=6.1033372782373485, p=1.0387630755559485e-09, ci_low=1.0433906151758905, ci_high=2.0305396056702127, n=1130, matches_summary={'n_obs': 1130, 'n_treated': 376, 'n_control': 754, 'neighbors': 1, 'metric': 'mahalanobis', 'mean_dist_t_to_c': 0.16856989690402338, 'mean_dist_c_to_t': 0.3753011393873344})

In [20]:
import pandas as pd
from causalinference import CausalModel

df = pd.read_stata(r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 2T\[MT08-MT13] Microeconometría II\Clases\Stata\jobtraining.dta")

Y = df["earn98"].to_numpy()
D = df["train"].astype(int).to_numpy()
X = df[["earn96", "age", "educ"]].to_numpy()

# --- Sin ajuste de sesgo (equivale a omitir biasadj() en Stata)
cm_noadj = CausalModel(Y=Y, D=D, X=X)
cm_noadj.est_via_matching(weights="maha", matches=10, bias_adj=False)
print("\nSIN ajuste de sesgo (bias_adj=False):")
print(cm_noadj.estimates["matching"])

# --- Con ajuste de sesgo (equivale a biasadj(earn96 age educ))
cm_adj = CausalModel(Y=Y, D=D, X=X)
cm_adj.est_via_matching(weights="maha", matches=10, bias_adj=True)
print("\nCON ajuste de sesgo (bias_adj=True):")
print(cm_adj.estimates["matching"])


SIN ajuste de sesgo (bias_adj=False):

Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      1.494      0.571      2.614      0.009      0.374      2.614
           ATC      1.289      0.716      1.801      0.072     -0.114      2.693
           ATT      1.903      0.497      3.832      0.000      0.930      2.877


CON ajuste de sesgo (bias_adj=True):

Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      2.394      0.566      4.226      0.000      1.284      3.504
           ATC      2.609      0.709      3.681      0.000      1.220      3.998
           ATT      1.962      0.495      3.963      0.000      0.992      2.932



In [14]:
d = df[["earn98","train","earn96","age","educ"]].dropna()
len(d)  # debería ser 1130

1130

In [15]:
import numpy as np
from scipy.spatial import distance

X = d[["earn96","age","educ"]].to_numpy()
V = np.cov(X, rowvar=False)
VI = np.linalg.inv(V)

tmask = d["train"].astype(int).to_numpy()==1
Xt, Xc = X[tmask], X[~tmask]

# Para cada tratado, contar cuántos controles están a la distancia mínima (tolerancia numérica eps)
eps = 1e-12
tie_counts = []
for x in Xt:
    dists = np.array([distance.mahalanobis(x, xc, VI) for xc in Xc])
    mind = dists.min()
    ties = np.sum(np.isclose(dists, mind, atol=eps, rtol=0))
    tie_counts.append(ties)

print("Empates (tratados): min", np.min(tie_counts), "max", np.max(tie_counts))


Empates (tratados): min 1 max 5


In [21]:
# -*- coding: utf-8 -*-
# Emparejamiento 1-NN con manejo de EMPATES al estilo Stata (promedia todos los empatados)
# Distancia: Mahalanobis
# Estimand: ATE o ATT
# Bias adj: OLS post-matching en la muestra emparejada (EE robustos HC1)

import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List, Literal, Tuple
import statsmodels.api as sm
from scipy.stats import norm

@dataclass
class MatchResult:
    estimand: str
    coef: float
    se: float
    z: float
    p: float
    ci_low: float
    ci_high: float
    n: int
    details: dict

def _invert_psd(M: np.ndarray) -> np.ndarray:
    # inversa estable (por si V es semidefinida)
    try:
        return np.linalg.inv(M)
    except np.linalg.LinAlgError:
        return np.linalg.pinv(M)

def _mahal_min_ties(x: np.ndarray, Xref: np.ndarray, VI: np.ndarray, atol: float = 1e-12) -> Tuple[np.ndarray, float]:
    """
    Devuelve los índices de TODOS los empates a la distancia mínima entre x y Xref
    usando distancia de Mahalanobis con matriz inversa VI.
    """
    dif = Xref - x
    # dist^2 = (x - xi)' VI (x - xi) = rowwise( dif @ VI @ dif.T )
    # calculamos de forma vectorizada
    d2 = np.einsum('ij,jk,ik->i', dif, VI, dif)
    dmin = d2.min()
    tie_idx = np.where(np.isclose(d2, dmin, atol=atol, rtol=0))[0]
    return tie_idx, float(np.sqrt(dmin))

def _directional_tie_matching(
    X_src: np.ndarray, Y_src: np.ndarray,     # grupo "base" (p.ej. tratados si ATT)
    X_ref: np.ndarray, Y_ref: np.ndarray,     # grupo de referencia (p.ej. controles)
    VI: np.ndarray, atol: float = 1e-12
):
    """
    Para cada fila en X_src encuentra TODOS los empates a la mínima distancia en X_ref,
    y devuelve:
      - Y_ref_prom: promedio de Y_ref de los empatados
      - X_ref_prom: promedio de X_ref de los empatados (para bias adj)
      - dist_min: distancia mínima (Mahalanobis)
      - tie_count: cantidad de empatados
    """
    n = X_src.shape[0]
    Y_ref_prom = np.empty(n)
    X_ref_prom = np.empty_like(X_src)
    dist_min = np.empty(n)
    tie_count = np.empty(n, dtype=int)

    for i in range(n):
        ties, dmin = _mahal_min_ties(X_src[i], X_ref, VI, atol=atol)
        Y_ref_prom[i] = Y_ref[ties].mean()
        X_ref_prom[i] = X_ref[ties].mean(axis=0)
        dist_min[i] = dmin
        tie_count[i] = len(ties)

    return Y_ref_prom, X_ref_prom, dist_min, tie_count

def nnmatch_with_ties(
    df: pd.DataFrame,
    y: str,
    t: str,
    x: List[str],
    *,
    estimand: Literal["ATE", "ATT"] = "ATE",
    bias_adj: bool = True,
    atol_tie: float = 1e-12
) -> MatchResult:
    """
    Matching nearest-neighbor con EMPATES (promedia TODOS los empatados a la mínima distancia).
    Distancia: Mahalanobis en X.
    - ATT: trata cada tratado y le asigna el promedio de sus controles empatados.
    - ATE: simétrico: tratados->controles y controles->tratados, y promedia.
    Ajuste de sesgo: OLS(Y ~ T + X) en muestra emparejada (dos "bandas") con EE HC1.
    """
    d = df[[y, t] + x].dropna().copy()
    d[t] = d[t].astype(int)
    Y = d[y].to_numpy(dtype=float)
    T = d[t].to_numpy(dtype=int)
    X = d[x].to_numpy(dtype=float)
    n = len(d)

    # Matriz de covarianza e inversa (Mahalanobis)
    V = np.cov(X, rowvar=False)
    VI = _invert_psd(V)

    # Separar
    mask_t = T == 1
    Xt, Xc = X[mask_t], X[~mask_t]
    Yt, Yc = Y[mask_t], Y[~mask_t]

    # Tratados -> Controles (promedio de ties)
    Yc_for_t, Xc_for_t, dist_tc, ties_tc = _directional_tie_matching(Xt, Yt, Xc, Yc, VI, atol=atol_tie)
    diff_t = (Yt - Yc_for_t).mean()

    if estimand.upper() == "ATT":
        # construir muestra emparejada para bias adj (bandas: tratados reales + controles "promedio")
        rows = []
        # Banda tratados (reales)
        for i in range(len(Yt)):
            rows.append({"Y": Yt[i], "T": 1, **{k: Xt[i, j] for j, k in enumerate(x)}})
        # Banda controles (promedio de empates de cada tratado)
        for i in range(len(Yt)):
            rows.append({"Y": Yc_for_t[i], "T": 0, **{k: Xc_for_t[i, j] for j, k in enumerate(x)}})

        matched = pd.DataFrame(rows)

        if bias_adj:
            Xreg = sm.add_constant(matched[["T"] + x])
            res = sm.OLS(matched["Y"].to_numpy(), Xreg).fit(cov_type="HC1")
            coef = float(res.params["T"])
            se = float(res.bse["T"])
        else:
            coef = float(diff_t)
            # SE cerrado sin ajuste no trivial; opción: bootstrap si lo quisieras
            # aquí devolvemos NA para el se (puedes reemplazar por bootstrap)
            se = np.nan

    else:  # ATE
        # Controles -> Tratados (simétrico, promedio de ties)
        Yt_for_c, Xt_for_c, dist_ct, ties_ct = _directional_tie_matching(Xc, Yc, Xt, Yt, VI, atol=atol_tie)
        diff_c = (Yt_for_c - Yc).mean()
        ate_simple = 0.5 * (diff_t + diff_c)

        # construir muestra emparejada simétrica para bias adj
        rows = []
        # Banda tratados (reales)
        for i in range(len(Yt)):
            rows.append({"Y": Yt[i], "T": 1, **{k: Xt[i, j] for j, k in enumerate(x)}})
        # Banda controles (promedio de empates tratados->controles)
        for i in range(len(Yt)):
            rows.append({"Y": Yc_for_t[i], "T": 0, **{k: Xc_for_t[i, j] for j, k in enumerate(x)}})
        # Banda controles (reales)
        for j in range(len(Yc)):
            rows.append({"Y": Yc[j], "T": 0, **{k: Xc[j, k0] for k0, k in enumerate(x)}})
        # Banda tratados (promedio de empates controles->tratados)
        for j in range(len(Yc)):
            rows.append({"Y": Yt_for_c[j], "T": 1, **{k: Xt_for_c[j, k0] for k0, k in enumerate(x)}})

        matched = pd.DataFrame(rows)

        if bias_adj:
            Xreg = sm.add_constant(matched[["T"] + x])
            res = sm.OLS(matched["Y"].to_numpy(), Xreg).fit(cov_type="HC1")
            coef = float(res.params["T"])
            se = float(res.bse["T"])
        else:
            coef = float(ate_simple)
            se = np.nan

    # Estadísticos
    z = coef / se if (se is not None and np.isfinite(se) and se > 0) else np.nan
    p = 2 * (1 - norm.cdf(abs(z))) if np.isfinite(z) else np.nan
    ci_low = coef - 1.96 * se if np.isfinite(se) else np.nan
    ci_high = coef + 1.96 * se if np.isfinite(se) else np.nan

    details = {
        "n": n,
        "n_treated": int(mask_t.sum()),
        "n_control": int((~mask_t).sum()),
        "mean_ties_treated_to_control": float(ties_tc.mean()),
        "max_ties_treated_to_control": int(ties_tc.max()),
    }
    if estimand.upper() == "ATE":
        details.update({
            "mean_ties_control_to_treated": float(ties_ct.mean()),
            "max_ties_control_to_treated": int(ties_ct.max()),
        })

    return MatchResult(
        estimand=estimand.upper(),
        coef=float(coef),
        se=float(se) if np.isfinite(se) else np.nan,
        z=float(z) if np.isfinite(z) else np.nan,
        p=float(p) if np.isfinite(p) else np.nan,
        ci_low=float(ci_low) if np.isfinite(ci_low) else np.nan,
        ci_high=float(ci_high) if np.isfinite(ci_high) else np.nan,
        n=n,
        details=details
    )

In [22]:
# =============================
# EJEMPLO DE USO (tu dataset)
# =============================
import pandas as pd
df = pd.read_stata(r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 2T\[MT08-MT13] Microeconometría II\Clases\Stata\jobtraining.dta")

# ATE con ajuste (similar a: teffects nnmatch ... , nneighbor(1)  con biasadj)
out_ate = nnmatch_with_ties(df, y="earn98", t="train", x=["earn96","age","educ"],
                            estimand="ATE", bias_adj=True)
print(out_ate)

# ATE sin ajuste (similar a: omitir biasadj())
out_ate_noadj = nnmatch_with_ties(df, y="earn98", t="train", x=["earn96","age","educ"],
                                  estimand="ATE", bias_adj=False)
print(out_ate_noadj)

# ATT con ajuste (si querés comparar ATT)
out_att = nnmatch_with_ties(df, y="earn98", t="train", x=["earn96","age","educ"],
                            estimand="ATT", bias_adj=True)
print(out_att)

MatchResult(estimand='ATE', coef=1.6558467881706713, se=0.24965481878294243, z=6.6325448723275615, p=3.299471806883503e-11, ci_low=1.1665233433561042, ci_high=2.1451702329852385, n=1130, details={'n': 1130, 'n_treated': 376, 'n_control': 754, 'mean_ties_treated_to_control': 1.2872340425531914, 'max_ties_treated_to_control': 5, 'mean_ties_control_to_treated': 1.1618037135278514, 'max_ties_control_to_treated': 10})
MatchResult(estimand='ATE', coef=1.3794591120672046, se=nan, z=nan, p=nan, ci_low=nan, ci_high=nan, n=1130, details={'n': 1130, 'n_treated': 376, 'n_control': 754, 'mean_ties_treated_to_control': 1.2872340425531914, 'max_ties_treated_to_control': 5, 'mean_ties_control_to_treated': 1.1618037135278514, 'max_ties_control_to_treated': 10})
MatchResult(estimand='ATT', coef=1.9287294491754579, se=0.4053652913975394, z=4.758003435681335, p=1.9551717189703055e-06, ci_low=1.1342134780362807, ci_high=2.723245420314635, n=1130, details={'n': 1130, 'n_treated': 376, 'n_control': 754, 'mea

In [24]:
import numpy as np
import pandas as pd

def bootstrap_nnmatch_with_ties(
    df, y, t, x,
    *,
    estimand="ATE",
    bias_adj=True,
    atol_tie=1e-12,
    B=1000,
    random_state=2025
):
    rng = np.random.default_rng(random_state)
    d = df[[y, t] + x].dropna().copy()
    n = len(d)

    # corrida base
    base = nnmatch_with_ties(d, y, t, x, estimand=estimand, bias_adj=bias_adj, atol_tie=atol_tie)
    coefs = []

    for _ in range(B):
        # remuestreo por filas (unidades) con reemplazo
        samp_idx = rng.integers(0, n, size=n)
        d_b = d.iloc[samp_idx].reset_index(drop=True)
        try:
            r_b = nnmatch_with_ties(d_b, y, t, x, estimand=estimand, bias_adj=bias_adj, atol_tie=atol_tie)
            coefs.append(r_b.coef)
        except Exception:
            # por si alguna réplica tiene V no invertible; la salteamos
            continue

    se_boot = float(np.std(coefs, ddof=1)) if len(coefs) >= 30 else np.nan
    z = base.coef / se_boot if np.isfinite(se_boot) and se_boot > 0 else np.nan
    p = 2 * (1 - norm.cdf(abs(z))) if np.isfinite(z) else np.nan
    ci_low = base.coef - 1.96 * se_boot if np.isfinite(se_boot) else np.nan
    ci_high = base.coef + 1.96 * se_boot if np.isfinite(se_boot) else np.nan

    return {
        "estimand": estimand,
        "coef": float(base.coef),
        "se_boot": se_boot,
        "z_boot": z,
        "p_boot": p,
        "ci_low_boot": ci_low,
        "ci_high_boot": ci_high,
        "n_reps": len(coefs),
        "details": base.details
    }

# ======= Ejemplos rápidos =======
# ATE con ajuste (comparable a tu primer MatchResult, pero con EE bootstrap)
b_ate = bootstrap_nnmatch_with_ties(df, "earn98", "train", ["earn96","age","educ"],
                                    estimand="ATE", bias_adj=True, B=1000)
print(b_ate)

# ATE sin ajuste (rellena el EE faltante con bootstrap)
b_ate_noadj = bootstrap_nnmatch_with_ties(df, "earn98", "train", ["earn96","age","educ"],
                                          estimand="ATE", bias_adj=False, B=1000)
print(b_ate_noadj)

# ATT con ajuste (por si querés comparar contra causalinference)
b_att = bootstrap_nnmatch_with_ties(df, "earn98", "train", ["earn96","age","educ"],
                                    estimand="ATT", bias_adj=True, B=1000)
print(b_att)


{'estimand': 'ATE', 'coef': 1.6558467881706713, 'se_boot': 0.5899970787180918, 'z_boot': 2.8065338760123866, 'p_boot': np.float64(0.005007764668350179), 'ci_low_boot': 0.4994525138832113, 'ci_high_boot': 2.8122410624581313, 'n_reps': 1000, 'details': {'n': 1130, 'n_treated': 376, 'n_control': 754, 'mean_ties_treated_to_control': 1.2872340425531914, 'max_ties_treated_to_control': 5, 'mean_ties_control_to_treated': 1.1618037135278514, 'max_ties_control_to_treated': 10}}
{'estimand': 'ATE', 'coef': 1.3794591120672046, 'se_boot': 0.5098066711630074, 'z_boot': 2.705847510626497, 'p_boot': np.float64(0.006813032277899422), 'ci_low_boot': 0.38023803658771016, 'ci_high_boot': 2.378680187546699, 'n_reps': 1000, 'details': {'n': 1130, 'n_treated': 376, 'n_control': 754, 'mean_ties_treated_to_control': 1.2872340425531914, 'max_ties_treated_to_control': 5, 'mean_ties_control_to_treated': 1.1618037135278514, 'max_ties_control_to_treated': 10}}
{'estimand': 'ATT', 'coef': 1.9287294491754579, 'se_boo

In [26]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List, Literal, Tuple, Union, Optional, Dict, Any
import statsmodels.api as sm
from scipy.stats import norm

# =============================
# Utilidades y estructuras
# =============================

@dataclass
class MatchResult:
    estimand: str
    coef: float
    se: float
    z: float
    p: float
    ci_low: float
    ci_high: float
    n: int
    details: Dict[str, Any]

def _invert_psd(M: np.ndarray) -> np.ndarray:
    try:
        return np.linalg.inv(M)
    except np.linalg.LinAlgError:
        return np.linalg.pinv(M)

def _mahal_min_ties(x: np.ndarray, Xref: np.ndarray, VI: np.ndarray, atol: float = 1e-12) -> Tuple[np.ndarray, float]:
    """Índices de TODOS los empates a distancia mínima (Mahalanobis) y la distancia mínima."""
    dif = Xref - x
    d2 = np.einsum('ij,jk,ik->i', dif, VI, dif)
    dmin = d2.min()
    tie_idx = np.where(np.isclose(d2, dmin, atol=atol, rtol=0))[0]
    return tie_idx, float(np.sqrt(dmin))

def _directional_tie_matching(
    X_src: np.ndarray, Y_src: np.ndarray,
    X_ref: np.ndarray, Y_ref: np.ndarray,
    VI: np.ndarray, atol: float = 1e-12
):
    """
    Para cada fila en X_src encuentra TODOS los empates a la mínima distancia en X_ref,
    y devuelve promedios de Y y X de los vecinos empatados, distancia mínima y conteo de empates.
    """
    n = X_src.shape[0]
    Y_ref_prom = np.empty(n)
    X_ref_prom = np.empty_like(X_src)
    dist_min = np.empty(n)
    tie_count = np.empty(n, dtype=int)

    for i in range(n):
        ties, dmin = _mahal_min_ties(X_src[i], X_ref, VI, atol=atol)
        Y_ref_prom[i] = Y_ref[ties].mean()
        X_ref_prom[i] = X_ref[ties].mean(axis=0)
        dist_min[i] = dmin
        tie_count[i] = len(ties)

    return Y_ref_prom, X_ref_prom, dist_min, tie_count

# =============================
# Estimador principal
# =============================

def nnmatch_with_ties(
    df: pd.DataFrame,
    y: str,
    t: str,
    x: List[str],
    *,
    estimand: Literal["ATE", "ATT"] = "ATE",
    bias_adj: Union[Literal["ols", "unitwise"], bool] = "ols",
    atol_tie: float = 1e-12
) -> MatchResult:
    """
    Nearest-neighbor matching con EMPATES (promedia TODOS los empatados a distancia mínima).
    - Distancia: Mahalanobis en X.
    - estimand: "ATE" (promedio 1/n) o "ATT".
    - bias_adj:
        * False: sin ajuste
        * "ols": OLS post-matching (EE HC1)
        * "unitwise": ajuste por unidad con g(X) y promedio 1/n (ATE) o 1/nt (ATT)
    """
    d = df[[y, t] + x].dropna().copy()
    d[t] = d[t].astype(int)
    Y = d[y].to_numpy(dtype=float)
    T = d[t].to_numpy(dtype=int)
    X = d[x].to_numpy(dtype=float)
    n = len(d)

    # Mahalanobis
    V = np.cov(X, rowvar=False)
    VI = _invert_psd(V)

    # Split
    mask_t = (T == 1)
    Xt, Xc = X[mask_t], X[~mask_t]
    Yt, Yc = Y[mask_t], Y[~mask_t]
    nt, nc = len(Yt), len(Yc)

    # Tratados -> Controles
    Yc_for_t, Xc_for_t, dist_tc, ties_tc = _directional_tie_matching(Xt, Yt, Xc, Yc, VI, atol=atol_tie)
    diff_t = (Yt - Yc_for_t)  # vector diferencias lado tratados

    # Si ATE, también Controles -> Tratados
    if estimand.upper() == "ATE":
        Yt_for_c, Xt_for_c, dist_ct, ties_ct = _directional_tie_matching(Xc, Yc, Xt, Yt, VI, atol=atol_tie)
        diff_c = (Yt_for_c - Yc)  # vector diferencias lado controles

    # =============================
    # Estimación del coeficiente
    # =============================

    if bias_adj is False:
        if estimand.upper() == "ATT":
            coef = float(diff_t.mean())  # promedio sobre tratados
            se = np.nan  # sugerido: bootstrap
        else:  # ATE con promedio 1/n
            coef = float((diff_t.sum() + diff_c.sum()) / n)
            se = np.nan  # sugerido: bootstrap

    elif bias_adj == "unitwise":
        # g(X): ajuste por unidad (no incluye T).
        Xall = sm.add_constant(d[x])
        g = sm.OLS(d[y], Xall).fit()
        # Predicciones por grupo:
        g_Xt       = g.predict(sm.add_constant(pd.DataFrame(Xt, columns=x)))
        g_Xc_for_t = g.predict(sm.add_constant(pd.DataFrame(Xc_for_t, columns=x)))

        adj_t = (Yt - Yc_for_t) - (g_Xt - g_Xc_for_t)

        if estimand.upper() == "ATT":
            coef = float(adj_t.mean())  # promedio sobre tratados
            se = np.nan  # sugerido: bootstrap
        else:
            g_Xt_for_c = g.predict(sm.add_constant(pd.DataFrame(Xt_for_c, columns=x)))
            g_Xc       = g.predict(sm.add_constant(pd.DataFrame(Xc, columns=x)))
            adj_c = (Yt_for_c - Yc) - (g_Xt_for_c - g_Xc)
            coef = float((adj_t.sum() + adj_c.sum()) / n)  # promedio 1/n
            se = np.nan  # sugerido: bootstrap

    elif bias_adj == "ols":
        # Construimos muestra emparejada (bandas) y hacemos OLS(Y ~ T + X) HC1
        rows = []
        # Banda tratados (reales)
        for i in range(nt):
            rows.append({"Y": Yt[i], "T": 1, **{k: Xt[i, j] for j, k in enumerate(x)}})
        # Banda controles promediados de tratados->controles
        for i in range(nt):
            rows.append({"Y": Yc_for_t[i], "T": 0, **{k: Xc_for_t[i, j] for j, k in enumerate(x)}})

        if estimand.upper() == "ATE":
            # Banda controles (reales)
            for j in range(nc):
                rows.append({"Y": Yc[j], "T": 0, **{k: Xc[j, j0] for j0, k in enumerate(x)}})
            # Banda tratados promediados de controles->tratados
            for j in range(nc):
                rows.append({"Y": Yt_for_c[j], "T": 1, **{k: Xt_for_c[j, j0] for j0, k in enumerate(x)}})

        matched = pd.DataFrame(rows)
        Xreg = sm.add_constant(matched[["T"] + x])
        res = sm.OLS(matched["Y"].to_numpy(), Xreg).fit(cov_type="HC1")
        coef = float(res.params["T"])
        se = float(res.bse["T"])

    else:
        raise ValueError("bias_adj debe ser False, 'ols' o 'unitwise'.")

    # =============================
    # Estadísticos
    # =============================
    z = coef / se if (se is not None and np.isfinite(se) and se > 0) else np.nan
    p = 2 * (1 - norm.cdf(abs(z))) if np.isfinite(z) else np.nan
    ci_low = coef - 1.96 * se if np.isfinite(se) else np.nan
    ci_high = coef + 1.96 * se if np.isfinite(se) else np.nan

    details = {
        "n": n,
        "n_treated": int(mask_t.sum()),
        "n_control": int((~mask_t).sum()),
        "mean_ties_treated_to_control": float(ties_tc.mean()),
        "max_ties_treated_to_control": int(ties_tc.max()),
    }
    if estimand.upper() == "ATE":
        details.update({
            "mean_ties_control_to_treated": float(ties_ct.mean()),
            "max_ties_control_to_treated": int(ties_ct.max()),
        })

    return MatchResult(
        estimand=estimand.upper(),
        coef=float(coef),
        se=float(se) if np.isfinite(se) else np.nan,
        z=float(z) if np.isfinite(z) else np.nan,
        p=float(p) if np.isfinite(p) else np.nan,
        ci_low=float(ci_low) if np.isfinite(ci_low) else np.nan,
        ci_high=float(ci_high) if np.isfinite(ci_high) else np.nan,
        n=n,
        details=details
    )

# =============================
# Bootstrap (para EE)
# =============================

def bootstrap_nnmatch_with_ties(
    df: pd.DataFrame,
    y: str, t: str, x: List[str],
    *,
    estimand: Literal["ATE", "ATT"] = "ATE",
    bias_adj: Union[Literal["ols", "unitwise"], bool] = "ols",
    atol_tie: float = 1e-12,
    B: int = 1000,
    random_state: Optional[int] = 2025
) -> Dict[str, Any]:
    rng = np.random.default_rng(random_state)
    d = df[[y, t] + x].dropna().copy()
    n = len(d)

    base = nnmatch_with_ties(d, y, t, x, estimand=estimand, bias_adj=bias_adj, atol_tie=atol_tie)
    coefs = []
    for _ in range(B):
        samp = d.sample(n, replace=True, random_state=int(rng.integers(1e9)))
        try:
            r_b = nnmatch_with_ties(samp, y, t, x, estimand=estimand, bias_adj=bias_adj, atol_tie=atol_tie)
            coefs.append(r_b.coef)
        except Exception:
            continue

    se_boot = float(np.std(coefs, ddof=1)) if len(coefs) >= 30 else np.nan
    z = base.coef / se_boot if np.isfinite(se_boot) and se_boot > 0 else np.nan
    p = 2 * (1 - norm.cdf(abs(z))) if np.isfinite(z) else np.nan
    ci_low = base.coef - 1.96 * se_boot if np.isfinite(se_boot) else np.nan
    ci_high = base.coef + 1.96 * se_boot if np.isfinite(se_boot) else np.nan

    return {
        "estimand": base.estimand,
        "coef": float(base.coef),
        "se_boot": se_boot,
        "z_boot": z,
        "p_boot": p,
        "ci_low_boot": ci_low,
        "ci_high_boot": ci_high,
        "n_reps": len(coefs),
        "details": base.details
    }

# =============================
# Printer estilo Stata
# =============================

def print_stata_like(result: Union[MatchResult, Dict[str, Any]], *, title="Treatment-effects estimation",
                     with_bootstrap=False):
    if isinstance(result, MatchResult):
        coef = result.coef; se = result.se; z = result.z; p = result.p
        cil = result.ci_low; cih = result.ci_high
        n = result.n; estimand = result.estimand
    else:
        coef = result["coef"]; se = result.get("se_boot", np.nan); z = result.get("z_boot", np.nan)
        p = result.get("p_boot", np.nan); cil = result.get("ci_low_boot", np.nan); cih = result.get("ci_high_boot", np.nan)
        n = result["details"]["n"]; estimand = result["estimand"]

    print(f"{title:>s}                   Number of obs = {n:>7d}")
    print("Estimator      : nearest-neighbor matching (ties averaged)")
    print("Outcome model  : matching")
    print("Distance metric: Mahalanobis")
    print("Neighbors      : 1 (all ties at min distance included)")
    print("-" * 78)
    hdr = "             |      Coef.    Std. Err.      z      P>|z|     [95% Conf. Interval]"
    print(hdr)
    print("-" * 78)
    lab = f"{estimand:<12s}|"
    def _fmt(v): 
        return "     " + ("    " if np.isnan(v) else f"{v:8.3f}")
    print(f"{lab}{_fmt(coef)}{_fmt(se)}{_fmt(z)}{_fmt(p)}{_fmt(cil)}{_fmt(cih)}")
    print("-" * 78)

# =============================
# USO (ejemplo con jobtraining.dta)
# =============================
df = pd.read_stata(r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 2T\[MT08-MT13] Microeconometría II\Clases\Stata\jobtraining.dta")
# ATE con promedio 1/n y OLS post-matching (EE HC1)
res_ate_ols = nnmatch_with_ties(df, "earn98", "train", ["earn96","age","educ"],
                                estimand="ATE", bias_adj="ols")
print_stata_like(res_ate_ols)

# ATE con ajuste "unitwise" (promedio 1/n) y EE por bootstrap
res_ate_unit = bootstrap_nnmatch_with_ties(df, "earn98", "train", ["earn96","age","educ"],
                                           estimand="ATE", bias_adj="unitwise", B=1000)
print_stata_like(res_ate_unit, with_bootstrap=True)

# ATE sin ajuste y EE por bootstrap
res_ate_noadj = bootstrap_nnmatch_with_ties(df, "earn98", "train", ["earn96","age","educ"],
                                            estimand="ATE", bias_adj=False, B=1000)
print_stata_like(res_ate_noadj, with_bootstrap=True)

# ATT con ajuste OLS (HC1)
res_att_ols = nnmatch_with_ties(df, "earn98", "train", ["earn96","age","educ"],
                                estimand="ATT", bias_adj="ols")
print_stata_like(res_att_ols)

# ATT con bootstrap
res_att_boot = bootstrap_nnmatch_with_ties(df, "earn98", "train", ["earn96","age","educ"],
                                           estimand="ATT", bias_adj="ols", B=1000)
print_stata_like(res_att_boot, with_bootstrap=True)


Treatment-effects estimation                   Number of obs =    1130
Estimator      : nearest-neighbor matching (ties averaged)
Outcome model  : matching
Distance metric: Mahalanobis
Neighbors      : 1 (all ties at min distance included)
------------------------------------------------------------------------------
             |      Coef.    Std. Err.      z      P>|z|     [95% Conf. Interval]
------------------------------------------------------------------------------
ATE         |        1.656        0.250        6.633        0.000        1.167        2.145
------------------------------------------------------------------------------
Treatment-effects estimation                   Number of obs =    1130
Estimator      : nearest-neighbor matching (ties averaged)
Outcome model  : matching
Distance metric: Mahalanobis
Neighbors      : 1 (all ties at min distance included)
------------------------------------------------------------------------------
             |      Coef.    S