In [1]:
import pandas as pd
import pyreadstat as st
# NOTE(review): hard-coded absolute path into a personal OneDrive folder, so this cell
# only runs on the author's machine. TODO: make the data location configurable.
path = r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 2T\[MT08-MT13] Microeconometría II\Clases\Stata\jobtraining.dta"

# read_dta's result is unpacked into the data frame and a second object bound to `meta`.
df, meta = st.read_dta(path)
# Preview the first row to check which columns were read.
df.head(1)

Unnamed: 0,train,age,educ,black,hisp,married,earn96,unem96,earn98,unem98
0,0,37,11,1,0,1,0.0,1,1.617924,0


In [2]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1130, 10)

In [4]:
import numpy as np
from dataclasses import dataclass
from typing import List, Optional, Literal, Dict, Any
from sklearn.neighbors import NearestNeighbors
import statsmodels.api as sm
from scipy.stats import norm

@dataclass
class NNMatchResult:
    """Result record returned by `nnmatch`: point estimate, inference and diagnostics."""
    # Label of the estimated effect, exactly as passed by the caller (e.g. "ATE", "ATT").
    estimand: str
    # Point estimate of the treatment effect.
    coef: float
    # Standard error of `coef`; NaN when none could be computed.
    se: float
    # coef / se, or NaN when `se` is missing, non-finite or not positive.
    z: float
    # Two-sided p-value of `z` under a standard normal reference distribution.
    p: float
    # Lower bound of the interval coef - 1.96 * se.
    ci_low: float
    # Upper bound of the interval coef + 1.96 * se.
    ci_high: float
    # Number of rows used, i.e. rows left after dropping missing values.
    n: int
    # Matching diagnostics (group sizes, neighbour count, metric, mean match distances).
    matches_summary: Dict[str, Any]

def _whiten_mahalanobis(X: np.ndarray) -> np.ndarray:
    """Transform X so that Euclidean distance in the result equals Mahalanobis distance in X.

    Args:
        X: array of shape (n_obs, n_features); one row per observation. A single
            feature column (shape (n_obs, 1)) is supported.

    Returns:
        Array Z with the same shape as X such that, for any two rows i and j,
        ||Z[i] - Z[j]||^2 == (X[i] - X[j])' VI (X[i] - X[j]), where VI is the inverse
        of the sample covariance matrix of X.

    Raises:
        numpy.linalg.LinAlgError: if the sample covariance matrix is singular or its
            inverse is not positive definite.
    """
    # np.cov squeezes its result, so a single feature column yields a 0-d array that
    # np.linalg.inv rejects; force it back to a (1, 1) matrix.
    V = np.atleast_2d(np.cov(X, rowvar=False))
    VI = np.linalg.inv(V)
    # Cholesky of VI: VI = U @ U.T  ->  ||U.T (x - y)||^2 = (x - y)' VI (x - y)
    U = np.linalg.cholesky(VI)
    # Rows of X @ U are U.T applied to each observation.
    return X @ U

def nnmatch(
    df: pd.DataFrame,
    y: str,
    t: str,
    x: List[str],
    *,
    estimand: Literal["ATE", "ATT"] = "ATE",
    n_neighbors: int = 1,
    metric: Literal["mahalanobis"] = "mahalanobis",
    bias_adj: bool = True,
    se_type: Literal["HC1", "bootstrap"] = "HC1",
    bootstrap: Optional[int] = None,
    random_state: Optional[int] = 2025,
    return_matches: bool = False,
) -> NNMatchResult | tuple[NNMatchResult, pd.DataFrame]:
    """
    Nearest-neighbour matching (with replacement) on Mahalanobis distance.

    Every treated unit is matched to its `n_neighbors` closest controls; for the ATE
    every control is also matched to its closest treated units. The matched outcome
    of a unit is the mean outcome of its neighbours.

    Args:
        df: input data; rows with a missing value in `y`, `t` or `x` are dropped.
        y: outcome column name.
        t: treatment column name; cast to int, 1 = treated and 0 = control.
        x: list of covariate column names used for matching (and for the adjustment).
        estimand: "ATE" or "ATT".
        n_neighbors: number of neighbours per unit.
        metric: only "mahalanobis" is implemented.
        bias_adj: if True, the estimate is the coefficient on T in an OLS of Y on T
            and the covariates, fitted on the stacked matched sample. If False, the
            estimate is the mean of the unit-level matched differences (over treated
            units for ATT, over all treated and control units for ATE) and no
            analytic standard error is produced (NaN).
        se_type: "HC1" uses the robust OLS standard error; "bootstrap" replaces it
            with a bootstrap standard error (500 replicates unless `bootstrap` is set).
        bootstrap: number of bootstrap replicates; any positive value triggers the
            bootstrap regardless of `se_type`.
        random_state: seed for the bootstrap resampling.
        return_matches: if True, also return the stacked matched sample.

    Returns:
        An NNMatchResult, or a (NNMatchResult, matched DataFrame) tuple when
        `return_matches` is True.

    Raises:
        ValueError: on an unknown `estimand` or `metric`, or when the cleaned data
            has no treated or no control observations.

    Notes:
        - With `n_neighbors > 1` the matched row carries the averaged neighbour
          outcome but the covariates of the single nearest neighbour only.
        - NOTE(review): the adjustment is a pooled OLS on the stacked matched rows,
          which is not the same procedure as an Abadie-Imbens bias correction, so
          numerical agreement with Stata's `teffects nnmatch` should not be assumed
          -- confirm before comparing outputs.
    """
    # --- 0) argument checks
    if estimand not in ("ATE", "ATT"):
        raise ValueError(f"estimand debe ser 'ATE' o 'ATT'; se recibió {estimand!r}.")
    if metric != "mahalanobis":
        raise ValueError("Por ahora solo se implementa 'mahalanobis'.")
    is_ate = estimand == "ATE"

    # --- 1) basic cleaning and array views
    d = df[[y, t] + x].dropna().copy()
    d[t] = d[t].astype(int)
    n = len(d)

    X = d[x].to_numpy(dtype=float)
    T = d[t].to_numpy(dtype=int)
    Y = d[y].to_numpy(dtype=float)

    treat_idx = np.flatnonzero(T == 1)
    ctrl_idx = np.flatnonzero(T == 0)
    if treat_idx.size == 0 or ctrl_idx.size == 0:
        raise ValueError("Se necesita al menos una observación tratada y una de control.")

    # --- 2) map to a space where Euclidean distance equals Mahalanobis distance
    Z = _whiten_mahalanobis(X)
    Z_t, Z_c = Z[treat_idx], Z[ctrl_idx]
    Y_t, Y_c = Y[treat_idx], Y[ctrl_idx]

    # --- 3) matching (neighbour indices are positions inside the opposite group)
    nn_c = NearestNeighbors(n_neighbors=n_neighbors, algorithm="auto").fit(Z_c)
    dist_tc, nbrs_tc = nn_c.kneighbors(Z_t, return_distance=True)
    # Mean over the neighbour axis; with one neighbour this is that neighbour's outcome.
    Yc_matched_for_t = Y_c[nbrs_tc].mean(axis=1)

    if is_ate:
        # For the ATE, controls are matched to treated units as well.
        nn_t = NearestNeighbors(n_neighbors=n_neighbors, algorithm="auto").fit(Z_t)
        dist_ct, nbrs_ct = nn_t.kneighbors(Z_c, return_distance=True)
        Yt_matched_for_c = Y_t[nbrs_ct].mean(axis=1)

    # --- 4) stacked matched sample: each real unit followed by its matched counterpart
    def _row(outcome, arm, unit, group):
        # Covariates come from the pre-built float matrix instead of per-row
        # DataFrame lookups, which were the dominant cost of this step.
        return {
            "Y": outcome,
            "T": arm,
            **dict(zip(x, X[unit])),
            "_unit": unit,
            "_group": group,
        }

    rows = []
    nearest_ctrl = ctrl_idx[nbrs_tc[:, 0]]
    for pos, i in enumerate(treat_idx):
        rows.append(_row(Y[i], 1, i, "treated_real"))
        rows.append(_row(Yc_matched_for_t[pos], 0, nearest_ctrl[pos], "control_match_for_treated"))

    if is_ate:
        nearest_treat = treat_idx[nbrs_ct[:, 0]]
        for pos, j in enumerate(ctrl_idx):
            rows.append(_row(Y[j], 0, j, "control_real"))
            rows.append(_row(Yt_matched_for_c[pos], 1, nearest_treat[pos], "treated_match_for_control"))

    matched_df = pd.DataFrame(rows)

    # --- 5) point estimate
    if bias_adj:
        # Coefficient on T, adjusting for the covariates, on the stacked matched sample.
        Xreg = sm.add_constant(matched_df[["T"] + x])
        res = sm.OLS(matched_df["Y"].to_numpy(), Xreg).fit(cov_type="HC1")
        coef = res.params["T"]
        se = res.bse["T"]
    else:
        if is_ate:
            # The ATE averages the unit-level effects over ALL units, so each arm is
            # weighted by its size (a fixed 0.5/0.5 split is only right when the
            # groups are equally large).
            unit_effects = np.concatenate([Y_t - Yc_matched_for_t, Yt_matched_for_c - Y_c])
            coef = float(unit_effects.mean())
        else:
            coef = float((Y_t - Yc_matched_for_t).mean())
        # No analytic standard error for the simple matched difference; use bootstrap.
        se = np.nan

    # --- 6) optional bootstrap standard error
    run_bootstrap = se_type == "bootstrap" or (bootstrap is not None and bootstrap > 0)
    boot: List[float] = []
    if run_bootstrap:
        B = int(bootstrap or 500)
        rng = np.random.default_rng(random_state)
        samp_idx = np.arange(n)
        for _ in range(B):
            # Resample original units (not matched rows) and redo the whole procedure.
            samp = rng.choice(samp_idx, size=n, replace=True)
            d_b = d.iloc[samp].reset_index(drop=True)
            try:
                res_b = nnmatch(
                    d_b, y, t, x,
                    estimand=estimand, n_neighbors=n_neighbors,
                    metric=metric, bias_adj=bias_adj,
                    se_type="HC1", bootstrap=None, random_state=None, return_matches=False
                )
            except (np.linalg.LinAlgError, ValueError):
                # Expected failures of a replicate: singular covariance, or a resample
                # with too few units in one arm. Anything else is a real error.
                continue
            if np.isfinite(res_b.coef):
                boot.append(res_b.coef)
        if len(boot) >= 10:
            se = float(np.std(boot, ddof=1))
        # Otherwise too few replicates succeeded and the previous SE is kept; the
        # number of usable replicates is reported in the summary below.

    # --- 7) test statistics
    se_usable = se is not None and np.isfinite(se) and se > 0
    z = coef / se if se_usable else np.nan
    # Survival function avoids the underflow of 1 - cdf for large |z|.
    p = 2 * norm.sf(abs(z)) if np.isfinite(z) else np.nan
    ci_low = coef - 1.96 * se if np.isfinite(se) else np.nan
    ci_high = coef + 1.96 * se if np.isfinite(se) else np.nan

    # --- 8) matching summary
    match_summary = {
        "n_obs": n,
        "n_treated": int(treat_idx.size),
        "n_control": int(ctrl_idx.size),
        "neighbors": n_neighbors,
        "metric": metric,
        "mean_dist_t_to_c": float(dist_tc.mean()),
    }
    if is_ate:
        match_summary["mean_dist_c_to_t"] = float(dist_ct.mean())
    if run_bootstrap:
        match_summary["bootstrap_reps_used"] = len(boot)

    result = NNMatchResult(
        estimand=estimand,
        coef=float(coef),
        se=float(se),
        z=float(z) if np.isfinite(z) else np.nan,
        p=float(p) if np.isfinite(p) else np.nan,
        ci_low=float(ci_low) if np.isfinite(ci_low) else np.nan,
        ci_high=float(ci_high) if np.isfinite(ci_high) else np.nan,
        n=n,
        matches_summary=match_summary
    )

    if return_matches:
        return result, matched_df
    return result

In [5]:
# ATE of `train` on `earn98`, matching each unit to its single nearest neighbour on
# (earn96, age, educ) with the regression adjustment and HC1 standard errors.
out = nnmatch(
    df,
    y="earn98",
    t="train",
    x=["earn96", "age", "educ"],
    estimand="ATE",
    n_neighbors=1,
    metric="mahalanobis",
    bias_adj=True,
    se_type="HC1",
    bootstrap=None,
)
out

NNMatchResult(estimand='ATE', coef=1.5369651104230515, se=0.2518237220648781, z=6.1033372782373485, p=1.0387630755559485e-09, ci_low=1.0433906151758905, ci_high=2.0305396056702127, n=1130, matches_summary={'n_obs': 1130, 'n_treated': 376, 'n_control': 754, 'neighbors': 1, 'metric': 'mahalanobis', 'mean_dist_t_to_c': 0.16856989690402338, 'mean_dist_c_to_t': 0.3753011393873344})

In [20]:
import pandas as pd
from causalinference import CausalModel

# Reload the data with pandas' Stata reader (this rebinds the notebook-level `df`).
# NOTE(review): hard-coded absolute local path, duplicated from the first cell.
df = pd.read_stata(r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 2T\[MT08-MT13] Microeconometría II\Clases\Stata\jobtraining.dta")

Y = df["earn98"].to_numpy()
D = df["train"].astype(int).to_numpy()
X = df[["earn96", "age", "educ"]].to_numpy()


def _fit_matching(Y, D, X, *, bias_adj, title, matches=10):
    """Fit a Mahalanobis-weighted matching estimator, print its table, return the model.

    Args:
        Y, D, X: outcome, treatment indicator and covariate arrays for CausalModel.
        bias_adj: forwarded to `est_via_matching`.
        title: header line printed above the estimates table.
        matches: number of matches per unit, forwarded to `est_via_matching`.
    """
    model = CausalModel(Y=Y, D=D, X=X)
    model.est_via_matching(weights="maha", matches=matches, bias_adj=bias_adj)
    print(title)
    print(model.estimates["matching"])
    return model


# NOTE(review): these fits use matches=10 while the nnmatch cell uses n_neighbors=1,
# so the two sets of estimates are not a like-for-like comparison -- confirm intent.

# --- Without bias adjustment (meant to correspond to omitting biasadj() in Stata)
cm_noadj = _fit_matching(Y, D, X, bias_adj=False, title="\nSIN ajuste de sesgo (bias_adj=False):")

# --- With bias adjustment (meant to correspond to biasadj(earn96 age educ))
cm_adj = _fit_matching(Y, D, X, bias_adj=True, title="\nCON ajuste de sesgo (bias_adj=True):")


SIN ajuste de sesgo (bias_adj=False):

Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      1.494      0.571      2.614      0.009      0.374      2.614
           ATC      1.289      0.716      1.801      0.072     -0.114      2.693
           ATT      1.903      0.497      3.832      0.000      0.930      2.877


CON ajuste de sesgo (bias_adj=True):

Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      2.394      0.566      4.226      0.000      1.284      3.504
           ATC      2.609      0.709      3.681      0.000      1.220      3.998
           ATT      1.962      0.495      3.963      0.000      0.992      2.932



In [14]:
# Outcome, treatment and matching covariates only, keeping complete rows.
d = df.loc[:, ["earn98","train","earn96","age","educ"]].dropna()
len(d)  # expected: 1130

1130

In [15]:
import numpy as np
from scipy.spatial import distance

# Covariate matrix and inverse sample covariance for the Mahalanobis metric.
# (This rebinds the notebook-level `X`.)
X = d[["earn96","age","educ"]].to_numpy()
V = np.cov(X, rowvar=False)
VI = np.linalg.inv(V)

tmask = d["train"].astype(int).to_numpy()==1
Xt, Xc = X[tmask], X[~tmask]

# For every treated unit, count how many controls sit at the minimum distance
# (absolute numerical tolerance eps). All treated-control distances are computed in
# one vectorised call instead of one Python-level call per pair.
eps = 1e-12
dist_tc = distance.cdist(Xt, Xc, metric="mahalanobis", VI=VI)
min_dist = dist_tc.min(axis=1, keepdims=True)
tie_counts = np.isclose(dist_tc, min_dist, atol=eps, rtol=0).sum(axis=1).tolist()

print("Empates (tratados): min", np.min(tie_counts), "max", np.max(tie_counts))


Empates (tratados): min 1 max 5
