In [13]:
import pandas as pd
import pyreadstat as st
path = r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 2T\[MT08-MT13] Microeconometría II\Clases\Stata\DiD_ejemplo.dta"

df, meta = st.read_dta(path)
df.head(1)

Unnamed: 0,i,t,y,st,post,dit
0,1,1,5,1,0,0


In [12]:
# Solo statsmodels (+ patsy para fórmula, scipy para t-crit)
import pandas as pd
import numpy as np
import statsmodels.api as sm
import patsy
from scipy.stats import t as tdist

# ============================================================
# Asumo que ya existe un DataFrame `df` con columnas:
# i, t, y, post, dit  (tu 'st' no se usa si ya tienes 'dit')
# ============================================================

# ---------- 1) Chequeos y preparación ----------
req = {"i","t","y","post","dit"}
missing = req - set(df.columns)
if missing:
    raise ValueError(f"Faltan columnas: {missing}")

d = df.copy()
d[["i","t"]] = d[["i","t"]].astype(int)
for c in ["y","post","dit"]:
    d[c] = pd.to_numeric(d[c], errors="coerce")

# ---------- 2) LSDV: y ~ dit + post + C(i) (constante incluida) ----------
# Algebraicamente equivalente a FE con intercepto
y_lsdv, X_lsdv = patsy.dmatrices("y ~ dit + post + C(i)", d, return_type="dataframe")
ols_lsdv = sm.OLS(y_lsdv, X_lsdv).fit()   # var clásica (como xtreg, fe sin vce(robust))

# ---------- 3) Constante "estilo Stata": ybar - xbar' * beta ----------
xbars = d[["dit", "post"]].mean()
beta = ols_lsdv.params[["dit", "post"]]
cons_stata = float(d["y"].mean() - np.dot(xbars.values, beta.values))

# Desvío estándar de _cons: sigma_e / sqrt(G)
G = d["i"].nunique()
sigma_e = float(np.sqrt(np.sum(ols_lsdv.resid**2) / ols_lsdv.df_resid))
se_cons = sigma_e / np.sqrt(G)

# Estadísticos de _cons con df de residuo del LSDV
df_resid = int(ols_lsdv.df_resid)
t_cons = cons_stata / se_cons
p_cons = 2 * (1 - tdist.cdf(abs(t_cons), df_resid))
crit = tdist.ppf(0.975, df_resid)  # 95%
ci_cons_low  = cons_stata - crit * se_cons
ci_cons_high = cons_stata + crit * se_cons

# ---------- 4) Coefs de 'dit' y 'post' directo de LSDV ----------
row_dp = ols_lsdv.summary2().tables[1].loc[["dit","post"]].copy()
row_dp.rename(columns={
    "Coef.":"Coefficient", "Std.Err.":"Std. Err.", "t":"T-stat",
    "P>|t|":"P-value", "[0.025":"Lower CI", "0.975]":"Upper CI"
}, inplace=True)

# Armo tabla final con _cons (centrado) + dit + post
tab_final = pd.DataFrame({
    "Coefficient": [cons_stata, row_dp.loc["dit","Coefficient"], row_dp.loc["post","Coefficient"]],
    "Std. Err.":   [se_cons,    row_dp.loc["dit","Std. Err."],   row_dp.loc["post","Std. Err."]],
    "T-stat":      [t_cons,     row_dp.loc["dit","T-stat"],      row_dp.loc["post","T-stat"]],
    "P-value":     [p_cons,     row_dp.loc["dit","P-value"],     row_dp.loc["post","P-value"]],
    "Lower CI":    [ci_cons_low,row_dp.loc["dit","Lower CI"],    row_dp.loc["post","Lower CI"]],
    "Upper CI":    [ci_cons_high,row_dp.loc["dit","Upper CI"],   row_dp.loc["post","Upper CI"]],
}, index=["_cons","dit","post"])

print("=== Fixed Effects (within) — LSDV (statsmodels, constante centrada como Stata) ===")
print(tab_final)

# ---------- 5) R² (within) al estilo Stata ----------
# R2_within = 1 - SSE_within / SST_within, con variables centradas por entidad
g = d.groupby("i", as_index=False)
y_within = d["y"] - g["y"].transform("mean")
X_within = d[["dit","post"]] - g[["dit","post"]].transform("mean")

ols_within = sm.OLS(y_within, X_within).fit()
SSE_within = float(np.sum(ols_within.resid**2))
SST_within = float(np.sum((y_within - y_within.mean())**2))
R2_within = 1.0 - SSE_within / SST_within if SST_within > 0 else np.nan
print(f"\nR² (within): {R2_within:.10f}")

# ---------- 6) F test that all u_i = 0 (poolability) ----------
# Test conjunto: todos los coeficientes de C(i)[T.*] = 0
fe_cols = [c for c in X_lsdv.columns if c.startswith("C(i)[T.")]
if fe_cols:
    R = np.zeros((len(fe_cols), X_lsdv.shape[1]))
    col_index = {c: j for j, c in enumerate(X_lsdv.columns)}
    for r, col in enumerate(fe_cols):
        R[r, col_index[col]] = 1.0
    ftest = ols_lsdv.f_test(R)
    stat = float(getattr(ftest, "fvalue", getattr(ftest, "statistic", np.nan)))
    pval = float(ftest.pvalue)
    df_num = len(fe_cols)
    df_denom = int(ols_lsdv.df_resid)
    print("\nF-test that all u_i = 0 (poolability):")
    print(f"F({df_num}, {df_denom}) = {stat:.4f}   P-value = {pval:.4f}")
else:
    print("\n(No se detectaron columnas C(i) para el F-test).")

# ---------- 7) Resumen rápido estilo Stata ----------
n = len(d)
Tbar = n / G
print("\nResumen rápido:")
print(f"Obs = {n} | Grupos (i) = {G} | Obs/grupo (avg) = {Tbar:.1f}")
print(f"F(modelo within; 2, {int(ols_within.df_resid)}) = {float(ols_within.fvalue):.4f}   Prob > F = {float(ols_within.f_pvalue):.4f}")


=== Fixed Effects (within) — LSDV (statsmodels, constante centrada como Stata) ===
       Coefficient  Std. Err.     T-stat   P-value  Lower CI  Upper CI
_cons          5.0   0.395285  12.649111  0.006192  3.299227  6.700773
dit           -3.5   1.118034  -3.130495  0.088678 -8.310512  1.310512
post           0.5   0.790569   0.632456  0.591752 -2.901546  3.901546

R² (within): 0.8809523810

F-test that all u_i = 0 (poolability):
F(3, 2) = 8.1333   P-value = 0.1115

Resumen rápido:
Obs = 8 | Grupos (i) = 4 | Obs/grupo (avg) = 2.0
F(modelo within; 2, 6) = 22.2000   Prob > F = 0.0017


  return hypotest_fun_in(*args, **kwds)
