In [40]:
# 只做一次
# pip install geopandas libpysal linearmodels scipy pandas numpy
from pathlib import Path   # ← 关键：先导入 Path
import numpy as np
import pandas as pd
import geopandas as gpd

from libpysal import weights
from libpysal.weights import Queen, KNN
from scipy.sparse import identity
from scipy.sparse.linalg import inv as spinv

from linearmodels.iv import IV2SLS   # 用 2SLS/IV 估计 δ、ρ、β、θ（带双向固定效应）

In [41]:
PATH_PANEL = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\panel_long_merged.csv")
PATH_GEO   = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\cleaned geo data\NUTS2_2021.geojson")
OUT_DIR    = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\moran\SDM_Edit")
OUT_DIR.mkdir(parents=True, exist_ok=True)


In [42]:
# ── 读数据 ─────────────────────────────────────────────────────────────
panel = pd.read_csv(PATH_PANEL)                       # 长表：NUTS_ID, year, ...
nuts  = gpd.read_file(PATH_GEO)[["NUTS_ID","geometry"]]

In [43]:
panel = panel.rename(columns={"region": "NUTS_ID"})   # 关键一行
panel["NUTS_ID"] = panel["NUTS_ID"].astype(str)

In [44]:
# 用 vet_per_million 生成 ln_x（自动把 ≤0 的设为 NaN，后面会 dropna）
panel["vet_per_million"] = pd.to_numeric(panel["vet_per_million"], errors="coerce")
bad = (panel["vet_per_million"] <= 0).sum()
print(f"≤0 的条数：{bad}")  # 仅提示你是否存在 0/负数
panel["ln_x"] = np.log(panel["vet_per_million"].where(panel["vet_per_million"] > 0, np.nan))


≤0 的条数：0


In [45]:
# 用已对数的 y（log_gdp_pc）作为 ln_y
panel["ln_y"] = pd.to_numeric(panel["log_gdp_pc"], errors="coerce")

In [46]:
s = panel["vet_per_million"].astype(str).str.strip()

# 1) 处理空白与常见“缺失占位”
s = (s.replace({"": None, "NA": None, "N/A": None, ".": None, "-": None, "—": None})
       .str.replace("\u00A0", "", regex=False))  # 去掉不换行空格

# 2) 先去掉千分位逗号（123,456 → 123456），再把小数逗号换成点（1,23 → 1.23）
s = s.str.replace(r",(?!\d{3}\b)", ".", regex=True)         # 小数逗号 → 点
s = s.str.replace(r"(?<=\d),(?=\d{3}\b)", "", regex=True)   # 千分位逗号 → 空

# 3) 去掉除数字/点/负号外的杂字符（比如 <、~、%）
s = s.str.replace(r"[^0-9\.\-]", "", regex=True)

# 4) 转成数值
panel["vet_per_million"] = pd.to_numeric(s, errors="coerce")

# 5) 统计问题行
n_all  = len(s)
n_na   = panel["vet_per_million"].isna().sum()
n_le0  = (panel["vet_per_million"] <= 0).sum()
print(f"总行数: {n_all} | 解析失败(→NaN): {n_na} | ≤0 行: {n_le0}")

# 6) 生成 ln_x（把 ≤0 当缺失丢掉；若不想丢见下方“平移法”）
panel["ln_x"] = np.log(panel["vet_per_million"].where(panel["vet_per_million"] > 0, np.nan))

总行数: 2244 | 解析失败(→NaN): 89 | ≤0 行: 0


In [47]:
key_id = "NUTS_ID" if "NUTS_ID" in panel.columns else "region"
panel[key_id] = panel[key_id].astype(str)
panel["year"] = pd.to_numeric(panel["year"], errors="coerce").astype("Int64")

# === 2) 排序并生成滞后 ===============================================
panel = panel.sort_values([key_id, "year"]).reset_index(drop=True)

def add_lags(df, id_col, cols, lags=(1,2,3)):
    for c in cols:
        for L in lags:
            df[f"{c}_lag{L}"] = df.groupby(id_col, observed=True)[c].shift(L)
    return df

panel = add_lags(panel, key_id, cols=["ln_y", "ln_x"], lags=(1,2,3))

# === 3) 快速查看（前几行） ===========================================
cols_show = [
    key_id, "year",
    "ln_y","ln_y_lag1","ln_y_lag2","ln_y_lag3",
    "ln_x","ln_x_lag1","ln_x_lag2","ln_x_lag3"
]
print(panel[cols_show].head(12))

   NUTS_ID  year       ln_y  ln_y_lag1  ln_y_lag2  ln_y_lag3       ln_x  \
0     AT11  2013  10.158130        NaN        NaN        NaN  10.216217   
1     AT11  2014  10.188666  10.158130        NaN        NaN  10.199947   
2     AT11  2015  10.236382  10.188666  10.158130        NaN  10.182091   
3     AT11  2016  10.257659  10.236382  10.188666  10.158130  10.142548   
4     AT11  2017  10.292146  10.257659  10.236382  10.188666  10.122987   
5     AT11  2018  10.312280  10.292146  10.257659  10.236382  10.112194   
6     AT11  2019  10.335270  10.312280  10.292146  10.257659  10.135747   
7     AT11  2020  10.298902  10.335270  10.312280  10.292146  10.147350   
8     AT11  2021  10.367222  10.298902  10.335270  10.312280  10.153266   
9     AT11  2022  10.445812  10.367222  10.298902  10.335270  10.109881   
10    AT11  2023  10.505068  10.445812  10.367222  10.298902  10.106750   
11    AT12  2013  10.344963        NaN        NaN        NaN  10.231037   

    ln_x_lag1  ln_x_lag2

In [48]:
# Queen 邻接
wq = Queen.from_dataframe(nuts, silence_warnings=True)

# KNN6（补孤岛/断裂）
wk = KNN.from_dataframe(nuts, k=6)

# 合并 Queen 与 KNN 的邻接
neighbors = {}
for i in range(nuts.shape[0]):
    qn = wq.neighbors.get(i, [])
    kn = wk.neighbors.get(i, [])
    neighbors[i] = sorted(set(qn + kn))

W = weights.W(neighbors)     # 初始1/0权重
W.transform = "R"            # 行标准化（常用）
n = W.n

# 建立从地区ID到W行号的映射
nuts = nuts.reset_index(drop=True)
id2row = dict(zip(nuts["NUTS_ID"], nuts.index))
panel["rid"] = panel["NUTS_ID"].map(id2row)




In [49]:
# 时间滞后 y_{i,t-1}、y_{i,t-2}、y_{i,t-3}
panel = panel.sort_values(["NUTS_ID","year"])
panel["ln_y_lag1"] = panel.groupby("NUTS_ID")["ln_y"].shift(1)
panel["ln_y_lag2"] = panel.groupby("NUTS_ID")["ln_y"].shift(2)
panel["ln_y_lag3"] = panel.groupby("NUTS_ID")["ln_y"].shift(3)

# 分年份做空间乘：Wy、Wx
def add_spatial_lag(df, colname, newname):
    out = []
    for t, g in df.groupby("year"):
        v = g.set_index("rid")[colname].reindex(range(n)).values
        # 缺失填充为 0（更稳妥做法是先 dropna 再对齐；这里简单演示）
        v = np.nan_to_num(v, nan=0.0)
        lagv = W.sparse @ v
        tmp = pd.DataFrame({"rid": range(n), "year": t, newname: lagv})
        out.append(tmp)
    out = pd.concat(out, ignore_index=True)
    return df.merge(out, on=["rid","year"], how="left")

panel = add_spatial_lag(panel, "ln_y", "wy")     # W ln Y_it  —— ρ 的右手项
panel = add_spatial_lag(panel, "ln_x", "wx")     # W ln X_it  —— θ 的右手项

# 作为工具的高阶 W * X、W^2 * X，以及 W * y_{t-2}
# W^2 * x
def add_higher_order_Wx(df, base_name="ln_x"):
    # W^2：直接连乘稀疏矩阵
    W2 = W.sparse @ W.sparse
    out = []
    for t, g in df.groupby("year"):
        x = g.set_index("rid")[base_name].reindex(range(n)).values
        x = np.nan_to_num(x, nan=0.0)
        w2x = W2 @ x
        tmp = pd.DataFrame({"rid": range(n), "year": t, "w2_ln_x": w2x})
        out.append(tmp)
    return df.merge(pd.concat(out, ignore_index=True), on=["rid","year"], how="left")

panel = add_higher_order_Wx(panel, "ln_x")

# W * y_{t-2}
panel["ln_y_lag2_tmp"] = panel.groupby("NUTS_ID")["ln_y"].shift(2)
panel = add_spatial_lag(panel, "ln_y_lag2_tmp", "w_ln_y_lag2")
panel.drop(columns=["ln_y_lag2_tmp"], inplace=True)

# 回归可用的数据（去掉首两期缺滞后值的行）
reg = panel.dropna(subset=["ln_y","ln_x","ln_y_lag1","wy","wx","ln_y_lag2","ln_y_lag3","w2_ln_x","w_ln_y_lag2"]).copy()


In [50]:
# 把 year / NUTS_ID 设为分类（固定效应用虚拟变量吸收）


reg["year"]    = reg["year"].astype("category")
reg["NUTS_ID"] = reg["NUTS_ID"].astype("category")

# 公式： y ~ exog + [endog ~ instruments]
# 这里 exog 有 ln_x, wx, 以及双向 FE：C(year) + C(NUTS_ID)
# endog 是 ln_y_lag1（δ）和 wy（ρ）
# instruments（排除工具）包括：w2_ln_x, ln_y_lag2, ln_y_lag3, w_ln_y_lag2
formula = """
ln_y ~ 1 + C(year) + C(NUTS_ID) + ln_x + wx
      + [ ln_y_lag1 + wy ~ w2_ln_x + ln_y_lag2 + ln_y_lag3 + w_ln_y_lag2 ]
"""

iv = IV2SLS.from_formula(formula, data=reg)
res = iv.fit(cov_type="robust")   # 或 "clustered", clusters=reg["NUTS_ID"]
print(res.summary)

# 抽出四个系数
delta  = res.params["ln_y_lag1"]
rho    = res.params["wy"]
beta   = res.params["ln_x"]
theta  = res.params["wx"]
delta, rho, beta, theta

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   ln_y   R-squared:                      0.9973
Estimator:                    IV-2SLS   Adj. R-squared:                 0.9969
No. Observations:                1504   F-statistic:                 9.717e+05
Date:                Wed, Aug 27 2025   P-value (F-stat)                0.0000
Time:                        00:14:20   Distribution:                chi2(206)
Cov. Estimator:                robust                                         
                                                                              
                                 Parameter Estimates                                  
                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
--------------------------------------------------------------------------------------
Intercept              1.4293     0.1957     7.3047     0.0000      1.0458      1.8129
C(year)[T.2017]     

(np.float64(0.8091106392457732),
 np.float64(0.015792257356451955),
 np.float64(0.03361316078917298),
 np.float64(0.0026462150786663585))

In [51]:
from scipy.sparse import csr_matrix

I = identity(n, format="csr")
Ws = W.sparse  # 行标准化后的稀疏矩阵

# 稀疏逆（n≈~200 能承受；更大可以用求解-迹近似）
M = spinv(I - rho * Ws)              # (I - ρW)^{-1}
S0 = M @ (beta * I + theta * Ws)     # 当期乘数

# 直接/总/间接（平均意义下）
direct_short  = S0.diagonal().mean()
total_short   = np.asarray(S0.sum(axis=1)).ravel().mean()
indirect_short = total_short - direct_short

# 长期（稳态）
mult = 1.0 / (1.0 - delta)
direct_long   = mult * direct_short
indirect_long = mult * indirect_short
total_long    = mult * total_short

print("Short-run effects  (elasticities):")
print(f"  Direct  = {direct_short:.4f}")
print(f"  Indirect= {indirect_short:.4f}")
print(f"  Total   = {total_short:.4f}")

print("\nLong-run effects (× 1/(1-δ)):")
print(f"  Direct  = {direct_long:.4f}")
print(f"  Indirect= {indirect_long:.4f}")
print(f"  Total   = {total_long:.4f}")

Short-run effects  (elasticities):
  Direct  = 0.0336
  Indirect= 0.0032
  Total   = 0.0368

Long-run effects (× 1/(1-δ)):
  Direct  = 0.1761
  Indirect= 0.0169
  Total   = 0.1930


  return splu(A).solve
  Ainv = spsolve(A, I)


In [52]:
from scipy.stats import norm

def stars(p):
    return '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.10 else ''

def sdm_impacts_with_se(res, W, x_name='ln_x', wy_name='wy', wx_name='wx', lagy_name='ln_y_lag1',
                        B=3000, seed=2025):
    """返回 impacts（SR/LR）的点估计、SE、p值、95%CI 以及占比AIE/ATE（含区间）"""
    n = W.n
    I = identity(n, format='csr')
    S = W.sparse

    p = res.params
    has_delta = lagy_name in p.index
    # 读参数（容错：x_name 换成 'x' 也能取到）
    rho   = float(p[wy_name])
    beta  = float(p[x_name] if x_name in p.index else p['x'])
    theta = float(p[wx_name])
    delta = float(p[lagy_name]) if has_delta else 0.0

    # —— 点估计（SR/LR）
    M  = spinv(I - rho * S)
    S0 = M @ (beta * I + theta * S)
    direct_SR   = S0.diagonal().mean()
    total_SR    = np.asarray(S0.sum(axis=1)).ravel().mean()
    indirect_SR = total_SR - direct_SR
    mult        = 1.0 / (1.0 - delta) if has_delta else 1.0
    direct_LR, indirect_LR, total_LR = mult*direct_SR, mult*indirect_SR, mult*total_SR

    # —— 参数模拟（Krinsky–Robb）
    names = [wy_name, x_name if x_name in p.index else 'x', wx_name] + ([lagy_name] if has_delta else [])
    cov   = res.cov.loc[names, names].values
    mean  = p[names].values

    rng   = np.random.default_rng(seed)
    draws = rng.multivariate_normal(mean, cov, size=B)

    SR_dir=[]; SR_ind=[]; SR_tot=[]
    LR_dir=[]; LR_ind=[]; LR_tot=[]
    for d in draws:
        if has_delta:
            rho_b, beta_b, theta_b, delta_b = d
            mult_b = 1.0 / (1.0 - delta_b) if abs(delta_b) < 0.999 else np.nan
        else:
            rho_b, beta_b, theta_b = d
            mult_b = 1.0
        try:
            Mb = spinv(I - rho_b * S)
        except Exception:
            continue
        S0b = Mb @ (beta_b * I + theta_b * S)
        d_sr = S0b.diagonal().mean()
        t_sr = np.asarray(S0b.sum(axis=1)).ravel().mean()
        i_sr = t_sr - d_sr
        SR_dir.append(d_sr); SR_ind.append(i_sr); SR_tot.append(t_sr)
        LR_dir.append(mult_b*d_sr); LR_ind.append(mult_b*i_sr); LR_tot.append(mult_b*t_sr)

    def _summ(samples, est):
        arr = np.asarray(samples)
        se  = np.nanstd(arr, ddof=1)
        z   = est / se
        p   = 2 * (1 - norm.cdf(abs(z)))
        lo, hi = np.nanpercentile(arr, [2.5, 97.5])
        return dict(est=est, se=se, p=p, lo=lo, hi=hi)

    out = {
        ('Direct','SR'): _summ(SR_dir, direct_SR),
        ('Indirect','SR'): _summ(SR_ind, indirect_SR),
        ('Total','SR'): _summ(SR_tot, total_SR),
        ('Direct','LR'): _summ(LR_dir, direct_LR),
        ('Indirect','LR'): _summ(LR_ind, indirect_LR),
        ('Total','LR'): _summ(LR_tot, total_LR),
        ('Share','SR'):  _summ(np.asarray(SR_ind)/np.asarray(SR_tot), indirect_SR/total_SR),
        ('Share','LR'):  _summ(np.asarray(LR_ind)/np.asarray(LR_tot), indirect_LR/total_LR)
    }
    return out, has_delta
# --- helper for stars (if not already defined) ---
def stars(p):
    return '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.10 else ''

# === 1) Impacts ===
imp, has_delta = sdm_impacts_with_se(
    res, W, x_name='ln_x', wy_name='wy', wx_name='wx', lagy_name='ln_y_lag1'
)

# === 2) Panel A: coefficients ===
coef_rows = []
labels = {
    'wy'        : 'ρ · W ln Y',
    'ln_x'      : 'β · ln X',
    'x'         : 'β · ln X',
    'wx'        : 'θ · W ln X',
    'ln_y_lag1' : 'δ · ln Y_{t−1}'
}
for name in ['wy', 'ln_x' if 'ln_x' in res.params.index else 'x', 'wx'] + (['ln_y_lag1'] if has_delta else []):
    coef_rows.append({
        'Variable'   : labels[name],
        'Coef.'      : res.params[name],
        'Std. Error' : res.std_errors[name],
        'p-value'    : res.pvalues[name]
    })
panelA = pd.DataFrame(coef_rows)

# sample/setup info
key_id = 'NUTS_ID' if 'NUTS_ID' in panel.columns else 'region'
try:
    N = int(getattr(res, 'nobs', None) or reg.shape[0])
except NameError:
    N = int(getattr(res, 'nobs', np.nan))
G = int(reg[key_id].nunique())
T = int(reg['year'].nunique())

info_rows = pd.DataFrame([
    {'Variable':'Region FE / Year FE', 'Coef.':'Yes / Yes', 'Std. Error':'', 'p-value':''},
    {'Variable':'Obs. N; Regions G; Years T', 'Coef.':f'{N}; {G}; {T}', 'Std. Error':'', 'p-value':''}
])
panelA_full = pd.concat([panelA, info_rows], ignore_index=True)

# === 3) Panel B: impacts (short/long with SE & p) ===
def fmt(cell):
    return f"{cell['est']:.4f} ({cell['se']:.4f}){stars(cell['p'])}"

panelB = pd.DataFrame({
    'Effect'          : ['Direct (ADE)','Indirect (AIE)','Total (ATE)','Spillover share (AIE/ATE)'],
    'Short-run (SR)'  : [fmt(imp[('Direct','SR')]),
                         fmt(imp[('Indirect','SR')]),
                         fmt(imp[('Total','SR')]),
                         f"{imp[('Share','SR')]['est']:.3f} [{imp[('Share','SR')]['lo']:.3f},{imp[('Share','SR')]['hi']:.3f}]"],
    'Long-run (LR)'   : [fmt(imp[('Direct','LR')]),
                         fmt(imp[('Indirect','LR')]),
                         fmt(imp[('Total','LR')]),
                         f"{imp[('Share','LR')]['est']:.3f} [{imp[('Share','LR')]['lo']:.3f},{imp[('Share','LR')]['hi']:.3f}]"]
})

# === 4) Combine and export ===
empty = pd.DataFrame([{'Variable':'', 'Coef.':'', 'Std. Error':'', 'p-value':''}])

combined = pd.concat([
    pd.DataFrame([{'Variable':'Panel A: Coefficients (dependent variable ln Y)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelA_full,
    empty,
    pd.DataFrame([{'Variable':'Panel B: LeSage–Pace impacts (ln X → ln Y, elasticities)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelB.rename(columns={'Effect':'Variable', 'Short-run (SR)':'Coef.', 'Long-run (LR)':'Std. Error'}).assign(**{'p-value':''})
], ignore_index=True)

# CSV
combined.to_csv(OUT_DIR / "SDM_results_gdp.csv", index=False, encoding="utf-8-sig")

# Excel (engine fallback: xlsxwriter -> openpyxl; if neither, only CSV)
excel_path = OUT_DIR / "SDM_results_gdp.xlsx"
engine = None
try:
    import xlsxwriter  # noqa
    engine = "xlsxwriter"
except ModuleNotFoundError:
    try:
        import openpyxl  # noqa
        engine = "openpyxl"
    except ModuleNotFoundError:
        engine = None

if engine:
    with pd.ExcelWriter(excel_path, engine=engine) as w:
        combined.to_excel(w, index=False, sheet_name="SDM")
    print(f"Exported Excel (engine={engine}): {excel_path}")
else:
    print("xlsxwriter/openpyxl not installed; exported CSV only. Install one if you need .xlsx.")

  return splu(A).solve
  Ainv = spsolve(A, I)


Exported Excel (engine=xlsxwriter): D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\moran\SDM_Edit\SDM_results_gdp.xlsx


In [19]:
# 只做一次
# pip install geopandas libpysal linearmodels scipy pandas numpy
from pathlib import Path   # ← 关键：先导入 Path
import numpy as np
import pandas as pd
import geopandas as gpd

from libpysal import weights
from libpysal.weights import Queen, KNN
from scipy.sparse import identity
from scipy.sparse.linalg import inv as spinv

from linearmodels.iv import IV2SLS   # 用 2SLS/IV 估计 δ、ρ、β、θ（带双向固定效应）

PATH_PANEL = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\panel_long_merged.csv")
PATH_GEO   = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\cleaned geo data\NUTS2_2021.geojson")
OUT_DIR    = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\moran\SDM_Edit")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ── 读数据 ─────────────────────────────────────────────────────────────
panel = pd.read_csv(PATH_PANEL)                       # 长表：NUTS_ID, year, ...
nuts  = gpd.read_file(PATH_GEO)[["NUTS_ID","geometry"]]


panel = panel.rename(columns={"region": "NUTS_ID"})   # 关键一行
panel["NUTS_ID"] = panel["NUTS_ID"].astype(str)

# 用 vet_per_million 生成 ln_x（自动把 ≤0 的设为 NaN，后面会 dropna）
panel["vet_per_million"] = pd.to_numeric(panel["vet_per_million"], errors="coerce")
bad = (panel["vet_per_million"] <= 0).sum()
print(f"≤0 的条数：{bad}")  # 仅提示你是否存在 0/负数
panel["ln_x"] = np.log(panel["vet_per_million"].where(panel["vet_per_million"] > 0, np.nan))




≤0 的条数：0


In [20]:

panel["employment_rate"] = pd.to_numeric(panel["employment_rate"], errors="coerce")

er = panel["employment_rate"]
# 自动识别：若最大值>1，说明是百分数，先 /100
er = er/100.0 if er.dropna().max() > 1.0000001 else er

print("employment_rate 中 ≤0 的条数：", (er <= 0).sum())
panel["ln_y"] = np.log(er.where(er > 0, np.nan))   # ln(employment_rate)


s = panel["vet_per_million"].astype(str).str.strip()

# 1) 处理空白与常见“缺失占位”
s = (s.replace({"": None, "NA": None, "N/A": None, ".": None, "-": None, "—": None})
       .str.replace("\u00A0", "", regex=False))  # 去掉不换行空格

# 2) 先去掉千分位逗号（123,456 → 123456），再把小数逗号换成点（1,23 → 1.23）
s = s.str.replace(r",(?!\d{3}\b)", ".", regex=True)         # 小数逗号 → 点
s = s.str.replace(r"(?<=\d),(?=\d{3}\b)", "", regex=True)   # 千分位逗号 → 空

# 3) 去掉除数字/点/负号外的杂字符（比如 <、~、%）
s = s.str.replace(r"[^0-9\.\-]", "", regex=True)

# 4) 转成数值
panel["vet_per_million"] = pd.to_numeric(s, errors="coerce")

# 5) 统计问题行
n_all  = len(s)
n_na   = panel["vet_per_million"].isna().sum()
n_le0  = (panel["vet_per_million"] <= 0).sum()
print(f"总行数: {n_all} | 解析失败(→NaN): {n_na} | ≤0 行: {n_le0}")

# 6) 生成 ln_x（把 ≤0 当缺失丢掉；若不想丢见下方“平移法”）
panel["ln_x"] = np.log(panel["vet_per_million"].where(panel["vet_per_million"] > 0, np.nan))






employment_rate 中 ≤0 的条数： 0
总行数: 2244 | 解析失败(→NaN): 89 | ≤0 行: 0


In [21]:
key_id = "NUTS_ID" if "NUTS_ID" in panel.columns else "region"
panel[key_id] = panel[key_id].astype(str)
panel["year"] = pd.to_numeric(panel["year"], errors="coerce").astype("Int64")

# === 2) 排序并生成滞后 ===============================================
panel = panel.sort_values([key_id, "year"]).reset_index(drop=True)

def add_lags(df, id_col, cols, lags=(1,2,3)):
    for c in cols:
        for L in lags:
            df[f"{c}_lag{L}"] = df.groupby(id_col, observed=True)[c].shift(L)
    return df

panel = add_lags(panel, key_id, cols=["ln_y", "ln_x"], lags=(1,2,3))

# === 3) 快速查看（前几行） ===========================================
cols_show = [
    key_id, "year",
    "ln_y","ln_y_lag1","ln_y_lag2","ln_y_lag3",
    "ln_x","ln_x_lag1","ln_x_lag2","ln_x_lag3"
]
print(panel[cols_show].head(12))

   NUTS_ID  year      ln_y  ln_y_lag1  ln_y_lag2  ln_y_lag3       ln_x  \
0     AT11  2013 -0.358105        NaN        NaN        NaN  10.216217   
1     AT11  2014 -0.359536  -0.358105        NaN        NaN  10.199947   
2     AT11  2015 -0.362406  -0.359536  -0.358105        NaN  10.182091   
3     AT11  2016 -0.359536  -0.362406  -0.359536  -0.358105  10.142548   
4     AT11  2017 -0.341083  -0.359536  -0.362406  -0.359536  10.122987   
5     AT11  2018 -0.331286  -0.341083  -0.359536  -0.362406  10.112194   
6     AT11  2019 -0.329894  -0.331286  -0.341083  -0.359536  10.135747   
7     AT11  2020 -0.328504  -0.329894  -0.331286  -0.341083  10.147350   
8     AT11  2021 -0.331286  -0.328504  -0.329894  -0.331286  10.153266   
9     AT11  2022 -0.310610  -0.331286  -0.328504  -0.329894  10.109881   
10    AT11  2023 -0.311975  -0.310610  -0.331286  -0.328504  10.106750   
11    AT12  2013 -0.322964        NaN        NaN        NaN  10.231037   

    ln_x_lag1  ln_x_lag2  ln_x_lag3  

In [22]:
# Queen 邻接
wq = Queen.from_dataframe(nuts, silence_warnings=True)

# KNN6（补孤岛/断裂）
wk = KNN.from_dataframe(nuts, k=6)

# 合并 Queen 与 KNN 的邻接
neighbors = {}
for i in range(nuts.shape[0]):
    qn = wq.neighbors.get(i, [])
    kn = wk.neighbors.get(i, [])
    neighbors[i] = sorted(set(qn + kn))

W = weights.W(neighbors)     # 初始1/0权重
W.transform = "R"            # 行标准化（常用）
n = W.n

# 建立从地区ID到W行号的映射
nuts = nuts.reset_index(drop=True)
id2row = dict(zip(nuts["NUTS_ID"], nuts.index))
panel["rid"] = panel["NUTS_ID"].map(id2row)




In [23]:
# 时间滞后 y_{i,t-1}、y_{i,t-2}、y_{i,t-3}
panel = panel.sort_values(["NUTS_ID","year"])
panel["ln_y_lag1"] = panel.groupby("NUTS_ID")["ln_y"].shift(1)
panel["ln_y_lag2"] = panel.groupby("NUTS_ID")["ln_y"].shift(2)
panel["ln_y_lag3"] = panel.groupby("NUTS_ID")["ln_y"].shift(3)

# 分年份做空间乘：Wy、Wx
def add_spatial_lag(df, colname, newname):
    out = []
    for t, g in df.groupby("year"):
        v = g.set_index("rid")[colname].reindex(range(n)).values
        # 缺失填充为 0（更稳妥做法是先 dropna 再对齐；这里简单演示）
        v = np.nan_to_num(v, nan=0.0)
        lagv = W.sparse @ v
        tmp = pd.DataFrame({"rid": range(n), "year": t, newname: lagv})
        out.append(tmp)
    out = pd.concat(out, ignore_index=True)
    return df.merge(out, on=["rid","year"], how="left")

panel = add_spatial_lag(panel, "ln_y", "wy")     # W ln Y_it  —— ρ 的右手项
panel = add_spatial_lag(panel, "ln_x", "wx")     # W ln X_it  —— θ 的右手项

# 作为工具的高阶 W * X、W^2 * X，以及 W * y_{t-2}
# W^2 * x
def add_higher_order_Wx(df, base_name="ln_x"):
    # W^2：直接连乘稀疏矩阵
    W2 = W.sparse @ W.sparse
    out = []
    for t, g in df.groupby("year"):
        x = g.set_index("rid")[base_name].reindex(range(n)).values
        x = np.nan_to_num(x, nan=0.0)
        w2x = W2 @ x
        tmp = pd.DataFrame({"rid": range(n), "year": t, "w2_ln_x": w2x})
        out.append(tmp)
    return df.merge(pd.concat(out, ignore_index=True), on=["rid","year"], how="left")

panel = add_higher_order_Wx(panel, "ln_x")

# W * y_{t-2}
panel["ln_y_lag2_tmp"] = panel.groupby("NUTS_ID")["ln_y"].shift(2)
panel = add_spatial_lag(panel, "ln_y_lag2_tmp", "w_ln_y_lag2")
panel.drop(columns=["ln_y_lag2_tmp"], inplace=True)

# 回归可用的数据（去掉首两期缺滞后值的行）
reg = panel.dropna(subset=["ln_y","ln_x","ln_y_lag1","wy","wx","ln_y_lag2","ln_y_lag3","w2_ln_x","w_ln_y_lag2"]).copy()

In [24]:
# 把 year / NUTS_ID 设为分类（固定效应用虚拟变量吸收）


reg["year"]    = reg["year"].astype("category")
reg["NUTS_ID"] = reg["NUTS_ID"].astype("category")

# 公式： y ~ exog + [endog ~ instruments]
# 这里 exog 有 ln_x, wx, 以及双向 FE：C(year) + C(NUTS_ID)
# endog 是 ln_y_lag1（δ）和 wy（ρ）
# instruments（排除工具）包括：w2_ln_x, ln_y_lag2, ln_y_lag3, w_ln_y_lag2
formula = """
ln_y ~ 1 + C(year) + C(NUTS_ID) + ln_x + wx
      + [ ln_y_lag1 + wy ~ w2_ln_x + ln_y_lag2 + ln_y_lag3 + w_ln_y_lag2 ]
"""

iv = IV2SLS.from_formula(formula, data=reg)
res = iv.fit(cov_type="robust")   # 或 "clustered", clusters=reg["NUTS_ID"]
print(res.summary)

# 抽出四个系数
rho    = res.params["wy"]
beta   = res.params["ln_x"]
theta  = res.params["wx"]
delta, rho, beta, theta

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   ln_y   R-squared:                      0.9848
Estimator:                    IV-2SLS   Adj. R-squared:                 0.9824
No. Observations:                1584   F-statistic:                 2.185e+05
Date:                Sat, Aug 30 2025   P-value (F-stat)                0.0000
Time:                        17:26:31   Distribution:                chi2(214)
Cov. Estimator:                robust                                         
                                                                              
                                 Parameter Estimates                                  
                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
--------------------------------------------------------------------------------------
Intercept             -0.1663     0.0841    -1.9784     0.0479     -0.3311     -0.0015
C(year)[T.2017]     

(np.float64(0.6779504451405955),
 np.float64(0.4759871567002847),
 np.float64(0.009629414789344537),
 np.float64(0.0002977384774922598))

In [32]:
from scipy.sparse import csr_matrix

I = identity(n, format="csr")
Ws = W.sparse  # 行标准化后的稀疏矩阵

# 稀疏逆（n≈~200 能承受；更大可以用求解-迹近似）
M = spinv(I - rho * Ws)              # (I - ρW)^{-1}
S0 = M @ (beta * I + theta * Ws)     # 当期乘数

# 直接/总/间接（平均意义下）
direct_short  = S0.diagonal().mean()
total_short   = np.asarray(S0.sum(axis=1)).ravel().mean()
indirect_short = total_short - direct_short

# 长期（稳态）
mult = 1.0 / (1.0 - delta)
direct_long   = mult * direct_short
indirect_long = mult * indirect_short
total_long    = mult * total_short

print("Short-run effects  (elasticities):")
print(f"  Direct  = {direct_short:.4f}")
print(f"  Indirect= {indirect_short:.4f}")
print(f"  Total   = {total_short:.4f}")

print("\nLong-run effects (× 1/(1-δ)):")
print(f"  Direct  = {direct_long:.4f}")
print(f"  Indirect= {indirect_long:.4f}")
print(f"  Total   = {total_long:.4f}")

Short-run effects  (elasticities):
  Direct  = 0.0100
  Indirect= 0.0089
  Total   = 0.0189

Long-run effects (× 1/(1-δ)):
  Direct  = 0.0312
  Indirect= 0.0277
  Total   = 0.0588


In [33]:
# --- helper for stars (if not already defined) ---
def stars(p):
    return '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.10 else ''

# === 1) Impacts ===
imp, has_delta = sdm_impacts_with_se(
    res, W, x_name='ln_x', wy_name='wy', wx_name='wx', lagy_name='ln_y_lag1'
)

# === 2) Panel A: coefficients ===
coef_rows = []
labels = {
    'wy'        : 'ρ · W ln Y',
    'ln_x'      : 'β · ln X',
    'x'         : 'β · ln X',
    'wx'        : 'θ · W ln X',
    'ln_y_lag1' : 'δ · ln Y_{t−1}'
}
for name in ['wy', 'ln_x' if 'ln_x' in res.params.index else 'x', 'wx'] + (['ln_y_lag1'] if has_delta else []):
    coef_rows.append({
        'Variable'   : labels[name],
        'Coef.'      : res.params[name],
        'Std. Error' : res.std_errors[name],
        'p-value'    : res.pvalues[name]
    })
panelA = pd.DataFrame(coef_rows)

# sample/setup info
key_id = 'NUTS_ID' if 'NUTS_ID' in panel.columns else 'region'
try:
    N = int(getattr(res, 'nobs', None) or reg.shape[0])
except NameError:
    N = int(getattr(res, 'nobs', np.nan))
G = int(reg[key_id].nunique())
T = int(reg['year'].nunique())

info_rows = pd.DataFrame([
    {'Variable':'Region FE / Year FE', 'Coef.':'Yes / Yes', 'Std. Error':'', 'p-value':''},
    {'Variable':'Obs. N; Regions G; Years T', 'Coef.':f'{N}; {G}; {T}', 'Std. Error':'', 'p-value':''}
])
panelA_full = pd.concat([panelA, info_rows], ignore_index=True)

# === 3) Panel B: impacts (short/long with SE & p) ===
def fmt(cell):
    return f"{cell['est']:.4f} ({cell['se']:.4f}){stars(cell['p'])}"

panelB = pd.DataFrame({
    'Effect'          : ['Direct (ADE)','Indirect (AIE)','Total (ATE)','Spillover share (AIE/ATE)'],
    'Short-run (SR)'  : [fmt(imp[('Direct','SR')]),
                         fmt(imp[('Indirect','SR')]),
                         fmt(imp[('Total','SR')]),
                         f"{imp[('Share','SR')]['est']:.3f} [{imp[('Share','SR')]['lo']:.3f},{imp[('Share','SR')]['hi']:.3f}]"],
    'Long-run (LR)'   : [fmt(imp[('Direct','LR')]),
                         fmt(imp[('Indirect','LR')]),
                         fmt(imp[('Total','LR')]),
                         f"{imp[('Share','LR')]['est']:.3f} [{imp[('Share','LR')]['lo']:.3f},{imp[('Share','LR')]['hi']:.3f}]"]
})

# === 4) Combine and export ===
empty = pd.DataFrame([{'Variable':'', 'Coef.':'', 'Std. Error':'', 'p-value':''}])

combined = pd.concat([
    pd.DataFrame([{'Variable':'Panel A: Coefficients (dependent variable ln Y)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelA_full,
    empty,
    pd.DataFrame([{'Variable':'Panel B: LeSage–Pace impacts (ln X → ln Y, elasticities)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelB.rename(columns={'Effect':'Variable', 'Short-run (SR)':'Coef.', 'Long-run (LR)':'Std. Error'}).assign(**{'p-value':''})
], ignore_index=True)

# CSV
combined.to_csv(OUT_DIR / "SDM_results_em01.csv", index=False, encoding="utf-8-sig")

# Excel (engine fallback: xlsxwriter -> openpyxl; if neither, only CSV)
excel_path = OUT_DIR / "SDM_results_em01.xlsx"
engine = None
try:
    import xlsxwriter  # noqa
    engine = "xlsxwriter"
except ModuleNotFoundError:
    try:
        import openpyxl  # noqa
        engine = "openpyxl"
    except ModuleNotFoundError:
        engine = None

if engine:
    with pd.ExcelWriter(excel_path, engine=engine) as w:
        combined.to_excel(w, index=False, sheet_name="SDM")
    print(f"Exported Excel (engine={engine}): {excel_path}")
else:
    print("xlsxwriter/openpyxl not installed; exported CSV only. Install one if you need .xlsx.")


NameError: name 'sdm_impacts_with_se' is not defined

In [17]:
# 只做一次
# pip install geopandas libpysal linearmodels scipy pandas numpy
from pathlib import Path
import numpy as np
import pandas as pd
import geopandas as gpd

from libpysal import weights
from libpysal.weights import Queen, KNN
from scipy.sparse import identity
from scipy.sparse.linalg import inv as spinv

from linearmodels.iv import IV2SLS   # 2SLS/IV 估计（带双向固定效应）

# ========== 路径 ==========
PATH_PANEL = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\panel_long_merged.csv")
PATH_GEO   = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\cleaned geo data\NUTS2_2021.geojson")
OUT_DIR    = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\moran\SDM_Edit")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ========== 读数据 ==========
panel = pd.read_csv(PATH_PANEL)                       # 长表：应包含 NUTS_ID/region, year, vet_per_million, employment_rate, log_gdp_c
nuts  = gpd.read_file(PATH_GEO)[["NUTS_ID","geometry"]]

# 统一键名
panel = panel.rename(columns={"region": "NUTS_ID"}) if "region" in panel.columns else panel
panel["NUTS_ID"] = panel["NUTS_ID"].astype(str)

# ---------- 清洗 & 构造变量 ----------
# 1) vet_per_million：先清洗字符串，再转数值，再取对数 → ln_y
v = panel["vet_per_million"].astype(str).str.strip()
v = (v.replace({"": None, "NA": None, "N/A": None, ".": None, "-": None, "—": None})
       .str.replace("\u00A0", "", regex=False))
# 小数逗号 -> 点；千分位逗号去掉
v = v.str.replace(r",(?!\d{3}\b)", ".", regex=True)
v = v.str.replace(r"(?<=\d),(?=\d{3}\b)", "", regex=True)
v = v.str.replace(r"[^0-9\.\-]", "", regex=True)
panel["vet_per_million"] = pd.to_numeric(v, errors="coerce")
panel["ln_y"] = np.log(panel["vet_per_million"].where(panel["vet_per_million"] > 0, np.nan))  # y = ln(vet_per_million)

# 2) x1 = log_gdp_c（已对数，直接用；但仍做数值化）
panel["x1"] = pd.to_numeric(panel["log_gdp_pc"], errors="coerce")

# 3) x2 = ln(employment_rate)（需取对数）
panel["employment_rate"] = pd.to_numeric(panel["employment_rate"], errors="coerce")
panel["x2"] = np.log(panel["employment_rate"].where(panel["employment_rate"] > 0, np.nan))

# 时间与ID
panel["year"] = pd.to_numeric(panel["year"], errors="coerce").astype("Int64")
panel = panel.sort_values(["NUTS_ID", "year"]).reset_index(drop=True)

# ---------- 生成滞后 ----------
def add_lags(df, id_col, cols, lags=(1,2,3)):
    for c in cols:
        for L in lags:
            df[f"{c}_lag{L}"] = df.groupby(id_col, observed=True)[c].shift(L)
    return df

panel = add_lags(panel, "NUTS_ID", cols=["ln_y"], lags=(1,2,3))  # y 的1~3阶滞后

# ---------- 构造空间权重 W（Queen + KNN6补孤岛，行标准化） ----------
wq = Queen.from_dataframe(nuts, silence_warnings=True)
wk = KNN.from_dataframe(nuts, k=6)

neighbors = {}
for i in range(nuts.shape[0]):
    qn = wq.neighbors.get(i, [])
    kn = wk.neighbors.get(i, [])
    neighbors[i] = sorted(set(qn + kn))

W = weights.W(neighbors)
W.transform = "R"   # 行标准化
n = W.n
Ws = W.sparse

# 行号映射
nuts = nuts.reset_index(drop=True)
id2row = dict(zip(nuts["NUTS_ID"], nuts.index))
panel["rid"] = panel["NUTS_ID"].map(id2row)

# ---------- 分年做空间乘：Wy, Wx1, Wx2 ----------
def add_spatial_lag(df, colname, newname, n=n, W=W):
    out = []
    for t, g in df.groupby("year"):
        v = g.set_index("rid")[colname].reindex(range(n)).values
        v = np.nan_to_num(v, nan=0.0)        # 缺失先置0；（更严格做法：先对齐非缺失再回填）
        lagv = W.sparse @ v
        tmp = pd.DataFrame({"rid": range(n), "year": t, newname: lagv})
        out.append(tmp)
    return df.merge(pd.concat(out, ignore_index=True), on=["rid","year"], how="left")

# Wy
panel = add_spatial_lag(panel, "ln_y", "wy")
# Wx1, Wx2
panel = add_spatial_lag(panel, "x1", "wx1")
panel = add_spatial_lag(panel, "x2", "wx2")

# ---------- 高阶工具：W²x1, W²x2 以及 W*y_{t-2} ----------
W2 = Ws @ Ws

def add_W2x(df, base_col, newname):
    out = []
    for t, g in df.groupby("year"):
        x = g.set_index("rid")[base_col].reindex(range(n)).values
        x = np.nan_to_num(x, nan=0.0)
        w2x = W2 @ x
        tmp = pd.DataFrame({"rid": range(n), "year": t, newname: w2x})
        out.append(tmp)
    return df.merge(pd.concat(out, ignore_index=True), on=["rid","year"], how="left")

panel = add_W2x(panel, "x1", "w2_x1")
panel = add_W2x(panel, "x2", "w2_x2")

# W*y_{t-2}
panel = add_spatial_lag(panel, "ln_y_lag2", "w_ln_y_lag2")

# ---------- 回归样本 ----------
reg = panel.dropna(subset=[
    "ln_y", "x1", "x2",
    "ln_y_lag1", "ln_y_lag2", "ln_y_lag3",
    "wy", "wx1", "wx2",
    "w2_x1", "w2_x2", "w_ln_y_lag2",
    "year", "NUTS_ID"
]).copy()

reg["year"]    = reg["year"].astype("category")
reg["NUTS_ID"] = reg["NUTS_ID"].astype("category")

# ---------- 2SLS 公式 ----------
# 因变量：ln_y
# exog：x1 + x2 + wx1 + wx2 + 双向FE
# endog：ln_y_lag1（δ）、wy（ρ）
# instruments：w2_x1, w2_x2, ln_y_lag2, ln_y_lag3, w_ln_y_lag2
formula = """
ln_y ~ 1 + C(year) + C(NUTS_ID) + x1 + x2 + wx1 + wx2
      + [ ln_y_lag1 + wy ~ w2_x1 + w2_x2 + ln_y_lag2 + ln_y_lag3 + w_ln_y_lag2 ]
"""

iv = IV2SLS.from_formula(formula, data=reg)
res = iv.fit(cov_type="robust")  # 可换 "clustered", clusters=reg["NUTS_ID"]
print(res.summary)

# ---------- 提取系数 ----------
params = res.params
se     = res.std_errors
pvals  = res.pvalues

# 命名方便
delta = params.get("ln_y_lag1", np.nan)  # δ
rho   = params.get("wy", np.nan)         # ρ
beta1 = params.get("x1", np.nan)         # β1 (log_gdp_c)
theta1= params.get("wx1", np.nan)        # θ1
beta2 = params.get("x2", np.nan)         # β2 (ln employment_rate)
theta2= params.get("wx2", np.nan)        # θ2

# ---------- 影响效应（LeSage-Pace，对每个x分别算） ----------
I = identity(n, format="csr")
M = spinv(I - rho * Ws)  # (I - ρW)^{-1}

def impacts_for(beta, theta, delta, Ws, M):
    S0 = M @ (beta * I + theta * Ws)            # 当期乘数
    direct_short   = S0.diagonal().mean()
    total_short    = np.asarray(S0.sum(axis=1)).ravel().mean()
    indirect_short = total_short - direct_short
    mult = 1.0 / (1.0 - delta) if np.isfinite(delta) else np.nan   # 长期乘数
    direct_long    = mult * direct_short
    indirect_long  = mult * indirect_short
    total_long     = mult * total_short
    share_short    = (indirect_short / total_short) if total_short not in (0, np.nan) else np.nan
    share_long     = (indirect_long / total_long) if total_long not in (0, np.nan) else np.nan
    return {
        "SR": {"Direct": direct_short, "Indirect": indirect_short, "Total": total_short, "Share": share_short},
        "LR": {"Direct": direct_long , "Indirect": indirect_long , "Total": total_long , "Share": share_long }
    }

imp_x1 = impacts_for(beta1, theta1, delta, Ws, M)  # 对 log_gdp_c 的效应
imp_x2 = impacts_for(beta2, theta2, delta, Ws, M)  # 对 ln(employment_rate) 的效应

# ---------- 整理导出 ----------
def stars(p):
    return '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.10 else ''

# Panel A：系数表
panelA = pd.DataFrame([
    {"Variable":"ρ · W y",            "Coef.": params.get("wy", np.nan),        "Std. Error": se.get("wy", np.nan),        "p-value": pvals.get("wy", np.nan)},
    {"Variable":"β1 · log_gdp_c",     "Coef.": params.get("x1", np.nan),        "Std. Error": se.get("x1", np.nan),        "p-value": pvals.get("x1", np.nan)},
    {"Variable":"θ1 · W log_gdp_c",   "Coef.": params.get("wx1", np.nan),       "Std. Error": se.get("wx1", np.nan),       "p-value": pvals.get("wx1", np.nan)},
    {"Variable":"β2 · ln(employ)",    "Coef.": params.get("x2", np.nan),        "Std. Error": se.get("x2", np.nan),        "p-value": pvals.get("x2", np.nan)},
    {"Variable":"θ2 · W ln(employ)",  "Coef.": params.get("wx2", np.nan),       "Std. Error": se.get("wx2", np.nan),       "p-value": pvals.get("wx2", np.nan)},
    {"Variable":"δ · ln y_{t−1}",     "Coef.": params.get("ln_y_lag1", np.nan), "Std. Error": se.get("ln_y_lag1", np.nan), "p-value": pvals.get("ln_y_lag1", np.nan)},
])

# 样本/设置信息
N = int(getattr(res, 'nobs', reg.shape[0]))
G = int(reg["NUTS_ID"].nunique())
T = int(reg["year"].nunique())
info_rows = pd.DataFrame([
    {'Variable':'Region FE / Year FE', 'Coef.':'Yes / Yes', 'Std. Error':'', 'p-value':''},
    {'Variable':'Obs. N; Regions G; Years T', 'Coef.':f'{N}; {G}; {T}', 'Std. Error':'', 'p-value':''}
])
panelA_full = pd.concat([panelA, info_rows], ignore_index=True)

# Panel B：影响效应（仅点估计；如需SE需做 Delta/模拟）
def panelB_from_imp(imp, title):
    return pd.DataFrame([
        {"Variable": f"{title} — Direct (ADE)",   "Coef.": f"{imp['SR']['Direct']:.4f}", "Std. Error": f"{imp['LR']['Direct']:.4f}", "p-value":""},
        {"Variable": f"{title} — Indirect (AIE)", "Coef.": f"{imp['SR']['Indirect']:.4f}","Std. Error": f"{imp['LR']['Indirect']:.4f}","p-value":""},
        {"Variable": f"{title} — Total (ATE)",    "Coef.": f"{imp['SR']['Total']:.4f}",   "Std. Error": f"{imp['LR']['Total']:.4f}",   "p-value":""},
        {"Variable": f"{title} — Spillover Share","Coef.": f"{imp['SR']['Share']:.3f}",   "Std. Error": f"{imp['LR']['Share']:.3f}",   "p-value":""},
    ])

panelB_x1 = panelB_from_imp(imp_x1, "log_gdp_c")
panelB_x2 = panelB_from_imp(imp_x2, "ln(employment_rate)")

empty = pd.DataFrame([{'Variable':'', 'Coef.':'', 'Std. Error':'', 'p-value':''}])

combined = pd.concat([
    pd.DataFrame([{'Variable':'Panel A: Coefficients (dependent variable ln vet_per_million)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelA_full,
    empty,
    pd.DataFrame([{'Variable':'Panel B1: LeSage–Pace impacts for log_gdp_c (elasticities: SR|LR)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelB_x1,
    empty,
    pd.DataFrame([{'Variable':'Panel B2: LeSage–Pace impacts for ln(employment_rate) (elasticities: SR|LR)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelB_x2,
], ignore_index=True)

# 导出
csv_path   = OUT_DIR / "SDM_results_vet.csv"
excel_path = OUT_DIR / "SDM_results_vet.xlsx"

combined.to_csv(csv_path, index=False, encoding="utf-8-sig")

engine = None
try:
    import xlsxwriter  # noqa
    engine = "xlsxwriter"
except ModuleNotFoundError:
    try:
        import openpyxl  # noqa
        engine = "openpyxl"
    except ModuleNotFoundError:
        engine = None

if engine:
    with pd.ExcelWriter(excel_path, engine=engine) as w:
        combined.to_excel(w, index=False, sheet_name="SDM")
    print(f"Exported CSV & Excel (engine={engine}):\n  {csv_path}\n  {excel_path}")
else:
    print(f"Exported CSV only:\n  {csv_path}")




                          IV-2SLS Estimation Summary                          
Dep. Variable:                   ln_y   R-squared:                      0.9821
Estimator:                    IV-2SLS   Adj. R-squared:                 0.9792
No. Observations:                1479   F-statistic:                 2.768e+05
Date:                Sat, Aug 30 2025   P-value (F-stat)                0.0000
Time:                        17:26:08   Distribution:                chi2(205)
Cov. Estimator:                robust                                         
                                                                              
                                 Parameter Estimates                                  
                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
--------------------------------------------------------------------------------------
Intercept             -0.8747     0.5974    -1.4643     0.1431     -2.0455      0.2961
C(year)[T.2017]     

  return splu(A).solve
  Ainv = spsolve(A, I)


PermissionError: [Errno 13] Permission denied: 'D:\\Dissertation\\dissertation\\data 2\\data\\Without UK and Germany\\Final\\moran\\SDM_Edit\\SDM_results_vet.csv'

In [9]:


# 只做一次
# pip install geopandas libpysal linearmodels scipy pandas numpy
from pathlib import Path   # ← 关键：先导入 Path
import numpy as np
import pandas as pd
import geopandas as gpd

from libpysal import weights
from libpysal.weights import Queen, KNN
from scipy.sparse import identity
from scipy.sparse.linalg import inv as spinv

from linearmodels.iv import IV2SLS   # 用 2SLS/IV 估计 δ、ρ、β、θ（带双向固定效应）

PATH_PANEL = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\panel_long_merged.csv")
PATH_GEO   = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\cleaned geo data\NUTS2_2021.geojson")
OUT_DIR    = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\moran\SDM_Edit")
OUT_DIR.mkdir(parents=True, exist_ok=True)


# ── 读数据 ─────────────────────────────────────────────────────────────
panel = pd.read_csv(PATH_PANEL)                       # 长表：NUTS_ID, year, ...
nuts  = gpd.read_file(PATH_GEO)[["NUTS_ID","geometry"]]

panel = panel.rename(columns={"region": "NUTS_ID"})   # 关键一行
panel["NUTS_ID"] = panel["NUTS_ID"].astype(str)

# 用 vlog_gdp_pc 生成 ln_x（自动把 ≤0 的设为 NaN，后面会 dropna）
panel["log_gdp_pc"] = pd.to_numeric(panel["log_gdp_pc"], errors="coerce")
bad = (panel["log_gdp_pc"] <= 0).sum()
print(f"≤0 的条数：{bad}")  # 仅提示你是否存在 0/负数
panel["ln_x"] = pd.to_numeric(panel["log_gdp_pc"], errors="coerce")

≤0 的条数：0


In [10]:
panel["ln_y"] = np.log(panel["vet_per_million"].where(panel["vet_per_million"] > 0, np.nan))

In [11]:
s = panel["log_gdp_pc"].astype(str).str.strip()

# 1) 处理空白与常见“缺失占位”
s = (s.replace({"": None, "NA": None, "N/A": None, ".": None, "-": None, "—": None})
       .str.replace("\u00A0", "", regex=False))  # 去掉不换行空格

# 2) 先去掉千分位逗号（123,456 → 123456），再把小数逗号换成点（1,23 → 1.23）
s = s.str.replace(r",(?!\d{3}\b)", ".", regex=True)         # 小数逗号 → 点
s = s.str.replace(r"(?<=\d),(?=\d{3}\b)", "", regex=True)   # 千分位逗号 → 空

# 3) 去掉除数字/点/负号外的杂字符（比如 <、~、%）
s = s.str.replace(r"[^0-9\.\-]", "", regex=True)

# 4) 转成数值
panel["log_gdp_pc"] = pd.to_numeric(s, errors="coerce")

# 5) 统计问题行
n_all  = len(s)
n_na   = panel["log_gdp_pc"].isna().sum()
n_le0  = (panel["log_gdp_pc"] <= 0).sum()
print(f"总行数: {n_all} | 解析失败(→NaN): {n_na} | ≤0 行: {n_le0}")

# 6) 生成 ln_x（把 ≤0 当缺失丢掉；若不想丢见下方“平移法”）
panel["ln_x"] = np.log(panel["log_gdp_pc"].where(panel["log_gdp_pc"] > 0, np.nan))

总行数: 2244 | 解析失败(→NaN): 100 | ≤0 行: 0


In [12]:
key_id = "NUTS_ID" if "NUTS_ID" in panel.columns else "region"
panel[key_id] = panel[key_id].astype(str)
panel["year"] = pd.to_numeric(panel["year"], errors="coerce").astype("Int64")

# === 2) 排序并生成滞后 ===============================================
panel = panel.sort_values([key_id, "year"]).reset_index(drop=True)

def add_lags(df, id_col, cols, lags=(1,2,3)):
    for c in cols:
        for L in lags:
            df[f"{c}_lag{L}"] = df.groupby(id_col, observed=True)[c].shift(L)
    return df

panel = add_lags(panel, key_id, cols=["ln_y", "ln_x"], lags=(1,2,3))

# === 3) 快速查看（前几行） ===========================================
cols_show = [
    key_id, "year",
    "ln_y","ln_y_lag1","ln_y_lag2","ln_y_lag3",
    "ln_x","ln_x_lag1","ln_x_lag2","ln_x_lag3"
]
print(panel[cols_show].head(12))

   NUTS_ID  year       ln_y  ln_y_lag1  ln_y_lag2  ln_y_lag3      ln_x  \
0     AT11  2013  10.216217        NaN        NaN        NaN  2.318274   
1     AT11  2014  10.199947  10.216217        NaN        NaN  2.321276   
2     AT11  2015  10.182091  10.199947  10.216217        NaN  2.325948   
3     AT11  2016  10.142548  10.182091  10.199947  10.216217  2.328025   
4     AT11  2017  10.122987  10.142548  10.182091  10.199947  2.331381   
5     AT11  2018  10.112194  10.122987  10.142548  10.182091  2.333335   
6     AT11  2019  10.135747  10.112194  10.122987  10.142548  2.335562   
7     AT11  2020  10.147350  10.135747  10.112194  10.122987  2.332037   
8     AT11  2021  10.153266  10.147350  10.135747  10.112194  2.338649   
9     AT11  2022  10.109881  10.153266  10.147350  10.135747  2.346201   
10    AT11  2023  10.106750  10.109881  10.153266  10.147350  2.351858   
11    AT12  2013  10.231037        NaN        NaN        NaN  2.336500   

    ln_x_lag1  ln_x_lag2  ln_x_lag3  

In [14]:
# Queen 邻接
wq = Queen.from_dataframe(nuts, silence_warnings=True)

# KNN6（补孤岛/断裂）
wk = KNN.from_dataframe(nuts, k=6)

# 合并 Queen 与 KNN 的邻接
neighbors = {}
for i in range(nuts.shape[0]):
    qn = wq.neighbors.get(i, [])
    kn = wk.neighbors.get(i, [])
    neighbors[i] = sorted(set(qn + kn))

W = weights.W(neighbors)     # 初始1/0权重
W.transform = "R"            # 行标准化（常用）
n = W.n

# 建立从地区ID到W行号的映射
nuts = nuts.reset_index(drop=True)
id2row = dict(zip(nuts["NUTS_ID"], nuts.index))
panel["rid"] = panel["NUTS_ID"].map(id2row)



In [15]:
# 时间滞后 y_{i,t-1}、y_{i,t-2}、y_{i,t-3}
panel = panel.sort_values(["NUTS_ID","year"])
panel["ln_y_lag1"] = panel.groupby("NUTS_ID")["ln_y"].shift(1)
panel["ln_y_lag2"] = panel.groupby("NUTS_ID")["ln_y"].shift(2)
panel["ln_y_lag3"] = panel.groupby("NUTS_ID")["ln_y"].shift(3)

# 分年份做空间乘：Wy、Wx
def add_spatial_lag(df, colname, newname):
    out = []
    for t, g in df.groupby("year"):
        v = g.set_index("rid")[colname].reindex(range(n)).values
        # 缺失填充为 0（更稳妥做法是先 dropna 再对齐；这里简单演示）
        v = np.nan_to_num(v, nan=0.0)
        lagv = W.sparse @ v
        tmp = pd.DataFrame({"rid": range(n), "year": t, newname: lagv})
        out.append(tmp)
    out = pd.concat(out, ignore_index=True)
    return df.merge(out, on=["rid","year"], how="left")

panel = add_spatial_lag(panel, "ln_y", "wy")     # W ln Y_it  —— ρ 的右手项
panel = add_spatial_lag(panel, "ln_x", "wx")     # W ln X_it  —— θ 的右手项

# 作为工具的高阶 W * X、W^2 * X，以及 W * y_{t-2}
# W^2 * x
def add_higher_order_Wx(df, base_name="ln_x"):
    # W^2：直接连乘稀疏矩阵
    W2 = W.sparse @ W.sparse
    out = []
    for t, g in df.groupby("year"):
        x = g.set_index("rid")[base_name].reindex(range(n)).values
        x = np.nan_to_num(x, nan=0.0)
        w2x = W2 @ x
        tmp = pd.DataFrame({"rid": range(n), "year": t, "w2_ln_x": w2x})
        out.append(tmp)
    return df.merge(pd.concat(out, ignore_index=True), on=["rid","year"], how="left")

panel = add_higher_order_Wx(panel, "ln_x")

# W * y_{t-2}
panel["ln_y_lag2_tmp"] = panel.groupby("NUTS_ID")["ln_y"].shift(2)
panel = add_spatial_lag(panel, "ln_y_lag2_tmp", "w_ln_y_lag2")
panel.drop(columns=["ln_y_lag2_tmp"], inplace=True)

# 回归可用的数据（去掉首两期缺滞后值的行）
reg = panel.dropna(subset=["ln_y","ln_x","ln_y_lag1","wy","wx","ln_y_lag2","ln_y_lag3","w2_ln_x","w_ln_y_lag2"]).copy()

In [16]:
# 把 year / NUTS_ID 设为分类（固定效应用虚拟变量吸收）


reg["year"]    = reg["year"].astype("category")
reg["NUTS_ID"] = reg["NUTS_ID"].astype("category")

# 公式： y ~ exog + [endog ~ instruments]
# 这里 exog 有 ln_x, wx, 以及双向 FE：C(year) + C(NUTS_ID)
# endog 是 ln_y_lag1（δ）和 wy（ρ）
# instruments（排除工具）包括：w2_ln_x, ln_y_lag2, ln_y_lag3, w_ln_y_lag2
formula = """
ln_y ~ 1 + C(year) + C(NUTS_ID) + ln_x + wx
      + [ ln_y_lag1 + wy ~ w2_ln_x + ln_y_lag2 + ln_y_lag3 + w_ln_y_lag2 ]
"""

iv = IV2SLS.from_formula(formula, data=reg)
res = iv.fit(cov_type="robust")   # 或 "clustered", clusters=reg["NUTS_ID"]
print(res.summary)

# 抽出四个系数
rho    = res.params["wy"]
beta   = res.params["ln_x"]
theta  = res.params["wx"]
delta, rho, beta, theta

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   ln_y   R-squared:                      0.9819
Estimator:                    IV-2SLS   Adj. R-squared:                 0.9790
No. Observations:                1479   F-statistic:                 2.485e+05
Date:                Sat, Aug 30 2025   P-value (F-stat)                0.0000
Time:                        17:25:44   Distribution:                chi2(203)
Cov. Estimator:                robust                                         
                                                                              
                                 Parameter Estimates                                  
                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
--------------------------------------------------------------------------------------
Intercept             -0.5722     0.6590    -0.8683     0.3852     -1.8639      0.7194
C(year)[T.2017]     

(np.float64(0.6779504451405955),
 np.float64(0.012856746981015021),
 np.float64(1.4753775412827963),
 np.float64(0.016807214434265916))

In [74]:
from scipy.sparse import csr_matrix

I = identity(n, format="csr")
Ws = W.sparse  # 行标准化后的稀疏矩阵

# 稀疏逆（n≈~200 能承受；更大可以用求解-迹近似）
M = spinv(I - rho * Ws)              # (I - ρW)^{-1}
S0 = M @ (beta * I + theta * Ws)     # 当期乘数

# 直接/总/间接（平均意义下）
direct_short  = S0.diagonal().mean()
total_short   = np.asarray(S0.sum(axis=1)).ravel().mean()
indirect_short = total_short - direct_short

# 长期（稳态）
mult = 1.0 / (1.0 - delta)
direct_long   = mult * direct_short
indirect_long = mult * indirect_short
total_long    = mult * total_short

print("Short-run effects  (elasticities):")
print(f"  Direct  = {direct_short:.4f}")
print(f"  Indirect= {indirect_short:.4f}")
print(f"  Total   = {total_short:.4f}")

print("\nLong-run effects (× 1/(1-δ)):")
print(f"  Direct  = {direct_long:.4f}")
print(f"  Indirect= {indirect_long:.4f}")
print(f"  Total   = {total_long:.4f}")

Short-run effects  (elasticities):
  Direct  = 1.4754
  Indirect= 0.0362
  Total   = 1.5116

Long-run effects (× 1/(1-δ)):
  Direct  = 4.5814
  Indirect= 0.1124
  Total   = 4.6937


  return splu(A).solve
  Ainv = spsolve(A, I)


In [75]:
# --- helper for stars (if not already defined) ---
def stars(p):
    return '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.10 else ''

# === 1) Impacts ===
imp, has_delta = sdm_impacts_with_se(
    res, W, x_name='ln_x', wy_name='wy', wx_name='wx', lagy_name='ln_y_lag1'
)

# === 2) Panel A: coefficients ===
coef_rows = []
labels = {
    'wy'        : 'ρ · W ln Y',
    'ln_x'      : 'β · ln X',
    'x'         : 'β · ln X',
    'wx'        : 'θ · W ln X',
    'ln_y_lag1' : 'δ · ln Y_{t−1}'
}
for name in ['wy', 'ln_x' if 'ln_x' in res.params.index else 'x', 'wx'] + (['ln_y_lag1'] if has_delta else []):
    coef_rows.append({
        'Variable'   : labels[name],
        'Coef.'      : res.params[name],
        'Std. Error' : res.std_errors[name],
        'p-value'    : res.pvalues[name]
    })
panelA = pd.DataFrame(coef_rows)

# sample/setup info
key_id = 'NUTS_ID' if 'NUTS_ID' in panel.columns else 'region'
try:
    N = int(getattr(res, 'nobs', None) or reg.shape[0])
except NameError:
    N = int(getattr(res, 'nobs', np.nan))
G = int(reg[key_id].nunique())
T = int(reg['year'].nunique())

info_rows = pd.DataFrame([
    {'Variable':'Region FE / Year FE', 'Coef.':'Yes / Yes', 'Std. Error':'', 'p-value':''},
    {'Variable':'Obs. N; Regions G; Years T', 'Coef.':f'{N}; {G}; {T}', 'Std. Error':'', 'p-value':''}
])
panelA_full = pd.concat([panelA, info_rows], ignore_index=True)

# === 3) Panel B: impacts (short/long with SE & p) ===
def fmt(cell):
    return f"{cell['est']:.4f} ({cell['se']:.4f}){stars(cell['p'])}"

panelB = pd.DataFrame({
    'Effect'          : ['Direct (ADE)','Indirect (AIE)','Total (ATE)','Spillover share (AIE/ATE)'],
    'Short-run (SR)'  : [fmt(imp[('Direct','SR')]),
                         fmt(imp[('Indirect','SR')]),
                         fmt(imp[('Total','SR')]),
                         f"{imp[('Share','SR')]['est']:.3f} [{imp[('Share','SR')]['lo']:.3f},{imp[('Share','SR')]['hi']:.3f}]"],
    'Long-run (LR)'   : [fmt(imp[('Direct','LR')]),
                         fmt(imp[('Indirect','LR')]),
                         fmt(imp[('Total','LR')]),
                         f"{imp[('Share','LR')]['est']:.3f} [{imp[('Share','LR')]['lo']:.3f},{imp[('Share','LR')]['hi']:.3f}]"]
})

# === 4) Combine and export ===
empty = pd.DataFrame([{'Variable':'', 'Coef.':'', 'Std. Error':'', 'p-value':''}])

combined = pd.concat([
    pd.DataFrame([{'Variable':'Panel A: Coefficients (dependent variable ln Y)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelA_full,
    empty,
    pd.DataFrame([{'Variable':'Panel B: LeSage–Pace impacts (ln X → ln Y, elasticities)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelB.rename(columns={'Effect':'Variable', 'Short-run (SR)':'Coef.', 'Long-run (LR)':'Std. Error'}).assign(**{'p-value':''})
], ignore_index=True)

# CSV
combined.to_csv(OUT_DIR / "SDM_results__vet_gdp.csv", index=False, encoding="utf-8-sig")

# Excel (engine fallback: xlsxwriter -> openpyxl; if neither, only CSV)
excel_path = OUT_DIR / "SDM_results_vet_gdp.xlsx"
engine = None
try:
    import xlsxwriter  # noqa
    engine = "xlsxwriter"
except ModuleNotFoundError:
    try:
        import openpyxl  # noqa
        engine = "openpyxl"
    except ModuleNotFoundError:
        engine = None

if engine:
    with pd.ExcelWriter(excel_path, engine=engine) as w:
        combined.to_excel(w, index=False, sheet_name="SDM")
    print(f"Exported Excel (engine={engine}): {excel_path}")
else:
    print("xlsxwriter/openpyxl not installed; exported CSV only. Install one if you need .xlsx.")

  return splu(A).solve
  Ainv = spsolve(A, I)


Exported Excel (engine=xlsxwriter): D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\moran\SDM_Edit\SDM_results_vet_gdp.xlsx


In [78]:
PATH_PANEL = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\panel_long_merged.csv")
PATH_GEO   = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\cleaned geo data\NUTS2_2021.geojson")
OUT_DIR    = Path(r"D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\moran\SDM_Edit")
OUT_DIR.mkdir(parents=True, exist_ok=True)


# ── 读数据 ─────────────────────────────────────────────────────────────
panel = pd.read_csv(PATH_PANEL)                       # 长表：NUTS_ID, year, ...
nuts  = gpd.read_file(PATH_GEO)[["NUTS_ID","geometry"]]

panel = panel.rename(columns={"region": "NUTS_ID"})   # 关键一行
panel["NUTS_ID"] = panel["NUTS_ID"].astype(str)

# 用 employment_rate 生成 ln_x（自动把 ≤0 的设为 NaN，后面会 dropna）
panel["employment_rate"] = pd.to_numeric(panel["employment_rate"], errors="coerce")
bad = (panel["employment_rate"] <= 0).sum()
print(f"≤0 的条数：{bad}")  # 仅提示你是否存在 0/负数
panel["ln_x"] = np.log(panel["employment_rate"].where(panel["employment_rate"] > 0, np.nan))

≤0 的条数：0


In [79]:
# 用已对数的 y（log_gdp_pc）作为 ln_y
panel["ln_y"] = np.log(panel["vet_per_million"].where(panel["vet_per_million"] > 0, np.nan))

s = panel["employment_rate"].astype(str).str.strip()

# 1) 处理空白与常见“缺失占位”
s = (s.replace({"": None, "NA": None, "N/A": None, ".": None, "-": None, "—": None})
       .str.replace("\u00A0", "", regex=False))  # 去掉不换行空格

# 2) 先去掉千分位逗号（123,456 → 123456），再把小数逗号换成点（1,23 → 1.23）
s = s.str.replace(r",(?!\d{3}\b)", ".", regex=True)         # 小数逗号 → 点
s = s.str.replace(r"(?<=\d),(?=\d{3}\b)", "", regex=True)   # 千分位逗号 → 空

# 3) 去掉除数字/点/负号外的杂字符（比如 <、~、%）
s = s.str.replace(r"[^0-9\.\-]", "", regex=True)

# 4) 转成数值
panel["employment_rate"] = pd.to_numeric(s, errors="coerce")

# 5) 统计问题行
n_all  = len(s)
n_na   = panel["employment_rate"].isna().sum()
n_le0  = (panel["employment_rate"] <= 0).sum()
print(f"总行数: {n_all} | 解析失败(→NaN): {n_na} | ≤0 行: {n_le0}")

# 6) 生成 ln_x（把 ≤0 当缺失丢掉；若不想丢见下方“平移法”）
panel["ln_x"] = np.log(panel["employment_rate"].where(panel["employment_rate"] > 0, np.nan))


总行数: 2244 | 解析失败(→NaN): 18 | ≤0 行: 0


In [80]:
key_id = "NUTS_ID" if "NUTS_ID" in panel.columns else "region"
panel[key_id] = panel[key_id].astype(str)
panel["year"] = pd.to_numeric(panel["year"], errors="coerce").astype("Int64")

# === 2) 排序并生成滞后 ===============================================
panel = panel.sort_values([key_id, "year"]).reset_index(drop=True)

def add_lags(df, id_col, cols, lags=(1,2,3)):
    for c in cols:
        for L in lags:
            df[f"{c}_lag{L}"] = df.groupby(id_col, observed=True)[c].shift(L)
    return df

panel = add_lags(panel, key_id, cols=["ln_y", "ln_x"], lags=(1,2,3))

# === 3) 快速查看（前几行） ===========================================
cols_show = [
    key_id, "year",
    "ln_y","ln_y_lag1","ln_y_lag2","ln_y_lag3",
    "ln_x","ln_x_lag1","ln_x_lag2","ln_x_lag3"
]
print(panel[cols_show].head(12))

   NUTS_ID  year       ln_y  ln_y_lag1  ln_y_lag2  ln_y_lag3      ln_x  \
0     AT11  2013  10.216217        NaN        NaN        NaN  4.247066   
1     AT11  2014  10.199947  10.216217        NaN        NaN  4.245634   
2     AT11  2015  10.182091  10.199947  10.216217        NaN  4.242765   
3     AT11  2016  10.142548  10.182091  10.199947  10.216217  4.245634   
4     AT11  2017  10.122987  10.142548  10.182091  10.199947  4.264087   
5     AT11  2018  10.112194  10.122987  10.142548  10.182091  4.273884   
6     AT11  2019  10.135747  10.112194  10.122987  10.142548  4.275276   
7     AT11  2020  10.147350  10.135747  10.112194  10.122987  4.276666   
8     AT11  2021  10.153266  10.147350  10.135747  10.112194  4.273884   
9     AT11  2022  10.109881  10.153266  10.147350  10.135747  4.294561   
10    AT11  2023  10.106750  10.109881  10.153266  10.147350  4.293195   
11    AT12  2013  10.231037        NaN        NaN        NaN  4.282206   

    ln_x_lag1  ln_x_lag2  ln_x_lag3  

In [81]:
# Queen 邻接
wq = Queen.from_dataframe(nuts, silence_warnings=True)

# KNN6（补孤岛/断裂）
wk = KNN.from_dataframe(nuts, k=6)

# 合并 Queen 与 KNN 的邻接
neighbors = {}
for i in range(nuts.shape[0]):
    qn = wq.neighbors.get(i, [])
    kn = wk.neighbors.get(i, [])
    neighbors[i] = sorted(set(qn + kn))

W = weights.W(neighbors)     # 初始1/0权重
W.transform = "R"            # 行标准化（常用）
n = W.n

# 建立从地区ID到W行号的映射
nuts = nuts.reset_index(drop=True)
id2row = dict(zip(nuts["NUTS_ID"], nuts.index))
panel["rid"] = panel["NUTS_ID"].map(id2row)



In [82]:
# 时间滞后 y_{i,t-1}、y_{i,t-2}、y_{i,t-3}
panel = panel.sort_values(["NUTS_ID","year"])
panel["ln_y_lag1"] = panel.groupby("NUTS_ID")["ln_y"].shift(1)
panel["ln_y_lag2"] = panel.groupby("NUTS_ID")["ln_y"].shift(2)
panel["ln_y_lag3"] = panel.groupby("NUTS_ID")["ln_y"].shift(3)

# 分年份做空间乘：Wy、Wx
def add_spatial_lag(df, colname, newname):
    out = []
    for t, g in df.groupby("year"):
        v = g.set_index("rid")[colname].reindex(range(n)).values
        # 缺失填充为 0（更稳妥做法是先 dropna 再对齐；这里简单演示）
        v = np.nan_to_num(v, nan=0.0)
        lagv = W.sparse @ v
        tmp = pd.DataFrame({"rid": range(n), "year": t, newname: lagv})
        out.append(tmp)
    out = pd.concat(out, ignore_index=True)
    return df.merge(out, on=["rid","year"], how="left")

panel = add_spatial_lag(panel, "ln_y", "wy")     # W ln Y_it  —— ρ 的右手项
panel = add_spatial_lag(panel, "ln_x", "wx")     # W ln X_it  —— θ 的右手项

# 作为工具的高阶 W * X、W^2 * X，以及 W * y_{t-2}
# W^2 * x
def add_higher_order_Wx(df, base_name="ln_x"):
    # W^2：直接连乘稀疏矩阵
    W2 = W.sparse @ W.sparse
    out = []
    for t, g in df.groupby("year"):
        x = g.set_index("rid")[base_name].reindex(range(n)).values
        x = np.nan_to_num(x, nan=0.0)
        w2x = W2 @ x
        tmp = pd.DataFrame({"rid": range(n), "year": t, "w2_ln_x": w2x})
        out.append(tmp)
    return df.merge(pd.concat(out, ignore_index=True), on=["rid","year"], how="left")

panel = add_higher_order_Wx(panel, "ln_x")

# W * y_{t-2}
panel["ln_y_lag2_tmp"] = panel.groupby("NUTS_ID")["ln_y"].shift(2)
panel = add_spatial_lag(panel, "ln_y_lag2_tmp", "w_ln_y_lag2")
panel.drop(columns=["ln_y_lag2_tmp"], inplace=True)

# 回归可用的数据（去掉首两期缺滞后值的行）
reg = panel.dropna(subset=["ln_y","ln_x","ln_y_lag1","wy","wx","ln_y_lag2","ln_y_lag3","w2_ln_x","w_ln_y_lag2"]).copy()


In [83]:
# 把 year / NUTS_ID 设为分类（固定效应用虚拟变量吸收）


reg["year"]    = reg["year"].astype("category")
reg["NUTS_ID"] = reg["NUTS_ID"].astype("category")

# 公式： y ~ exog + [endog ~ instruments]
# 这里 exog 有 ln_x, wx, 以及双向 FE：C(year) + C(NUTS_ID)
# endog 是 ln_y_lag1（δ）和 wy（ρ）
# instruments（排除工具）包括：w2_ln_x, ln_y_lag2, ln_y_lag3, w_ln_y_lag2
formula = """
ln_y ~ 1 + C(year) + C(NUTS_ID) + ln_x + wx
      + [ ln_y_lag1 + wy ~ w2_ln_x + ln_y_lag2 + ln_y_lag3 + w_ln_y_lag2 ]
"""

iv = IV2SLS.from_formula(formula, data=reg)
res = iv.fit(cov_type="robust")   # 或 "clustered", clusters=reg["NUTS_ID"]
print(res.summary)

# 抽出四个系数
rho    = res.params["wy"]
beta   = res.params["ln_x"]
theta  = res.params["wx"]
delta, rho, beta, theta

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   ln_y   R-squared:                      0.9820
Estimator:                    IV-2SLS   Adj. R-squared:                 0.9792
No. Observations:                1546   F-statistic:                 2.568e+05
Date:                Wed, Aug 27 2025   P-value (F-stat)                0.0000
Time:                        01:10:30   Distribution:                chi2(208)
Cov. Estimator:                robust                                         
                                                                              
                                 Parameter Estimates                                  
                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
--------------------------------------------------------------------------------------
Intercept             -0.1901     0.4979    -0.3817     0.7027     -1.1660      0.7859
C(year)[T.2017]     

(np.float64(0.6779504451405955),
 np.float64(0.01842752044321777),
 np.float64(0.12938958642415987),
 np.float64(0.5960437650925954))

In [84]:
from scipy.sparse import csr_matrix

I = identity(n, format="csr")
Ws = W.sparse  # 行标准化后的稀疏矩阵

# 稀疏逆（n≈~200 能承受；更大可以用求解-迹近似）
M = spinv(I - rho * Ws)              # (I - ρW)^{-1}
S0 = M @ (beta * I + theta * Ws)     # 当期乘数

# 直接/总/间接（平均意义下）
direct_short  = S0.diagonal().mean()
total_short   = np.asarray(S0.sum(axis=1)).ravel().mean()
indirect_short = total_short - direct_short

# 长期（稳态）
mult = 1.0 / (1.0 - delta)
direct_long   = mult * direct_short
indirect_long = mult * indirect_short
total_long    = mult * total_short

print("Short-run effects  (elasticities):")
print(f"  Direct  = {direct_short:.4f}")
print(f"  Indirect= {indirect_short:.4f}")
print(f"  Total   = {total_short:.4f}")

print("\nLong-run effects (× 1/(1-δ)):")
print(f"  Direct  = {direct_long:.4f}")
print(f"  Indirect= {indirect_long:.4f}")
print(f"  Total   = {total_long:.4f}")


Short-run effects  (elasticities):
  Direct  = 0.1308
  Indirect= 0.6083
  Total   = 0.7391

Long-run effects (× 1/(1-δ)):
  Direct  = 0.4061
  Indirect= 1.8888
  Total   = 2.2948


  return splu(A).solve
  Ainv = spsolve(A, I)


In [85]:
# --- helper for stars (if not already defined) ---
def stars(p):
    return '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.10 else ''

# === 1) Impacts ===
imp, has_delta = sdm_impacts_with_se(
    res, W, x_name='ln_x', wy_name='wy', wx_name='wx', lagy_name='ln_y_lag1'
)

# === 2) Panel A: coefficients ===
coef_rows = []
labels = {
    'wy'        : 'ρ · W ln Y',
    'ln_x'      : 'β · ln X',
    'x'         : 'β · ln X',
    'wx'        : 'θ · W ln X',
    'ln_y_lag1' : 'δ · ln Y_{t−1}'
}
for name in ['wy', 'ln_x' if 'ln_x' in res.params.index else 'x', 'wx'] + (['ln_y_lag1'] if has_delta else []):
    coef_rows.append({
        'Variable'   : labels[name],
        'Coef.'      : res.params[name],
        'Std. Error' : res.std_errors[name],
        'p-value'    : res.pvalues[name]
    })
panelA = pd.DataFrame(coef_rows)

# sample/setup info
key_id = 'NUTS_ID' if 'NUTS_ID' in panel.columns else 'region'
try:
    N = int(getattr(res, 'nobs', None) or reg.shape[0])
except NameError:
    N = int(getattr(res, 'nobs', np.nan))
G = int(reg[key_id].nunique())
T = int(reg['year'].nunique())

info_rows = pd.DataFrame([
    {'Variable':'Region FE / Year FE', 'Coef.':'Yes / Yes', 'Std. Error':'', 'p-value':''},
    {'Variable':'Obs. N; Regions G; Years T', 'Coef.':f'{N}; {G}; {T}', 'Std. Error':'', 'p-value':''}
])
panelA_full = pd.concat([panelA, info_rows], ignore_index=True)

# === 3) Panel B: impacts (short/long with SE & p) ===
def fmt(cell):
    return f"{cell['est']:.4f} ({cell['se']:.4f}){stars(cell['p'])}"

panelB = pd.DataFrame({
    'Effect'          : ['Direct (ADE)','Indirect (AIE)','Total (ATE)','Spillover share (AIE/ATE)'],
    'Short-run (SR)'  : [fmt(imp[('Direct','SR')]),
                         fmt(imp[('Indirect','SR')]),
                         fmt(imp[('Total','SR')]),
                         f"{imp[('Share','SR')]['est']:.3f} [{imp[('Share','SR')]['lo']:.3f},{imp[('Share','SR')]['hi']:.3f}]"],
    'Long-run (LR)'   : [fmt(imp[('Direct','LR')]),
                         fmt(imp[('Indirect','LR')]),
                         fmt(imp[('Total','LR')]),
                         f"{imp[('Share','LR')]['est']:.3f} [{imp[('Share','LR')]['lo']:.3f},{imp[('Share','LR')]['hi']:.3f}]"]
})

# === 4) Combine and export ===
empty = pd.DataFrame([{'Variable':'', 'Coef.':'', 'Std. Error':'', 'p-value':''}])

combined = pd.concat([
    pd.DataFrame([{'Variable':'Panel A: Coefficients (dependent variable ln Y)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelA_full,
    empty,
    pd.DataFrame([{'Variable':'Panel B: LeSage–Pace impacts (ln X → ln Y, elasticities)', 'Coef.':'', 'Std. Error':'', 'p-value':''}]),
    panelB.rename(columns={'Effect':'Variable', 'Short-run (SR)':'Coef.', 'Long-run (LR)':'Std. Error'}).assign(**{'p-value':''})
], ignore_index=True)

# CSV
combined.to_csv(OUT_DIR / "SDM_results_vet_em.csv", index=False, encoding="utf-8-sig")

# Excel (engine fallback: xlsxwriter -> openpyxl; if neither, only CSV)
excel_path = OUT_DIR / "SDM_results_vet_em.xlsx"
engine = None
try:
    import xlsxwriter  # noqa
    engine = "xlsxwriter"
except ModuleNotFoundError:
    try:
        import openpyxl  # noqa
        engine = "openpyxl"
    except ModuleNotFoundError:
        engine = None

if engine:
    with pd.ExcelWriter(excel_path, engine=engine) as w:
        combined.to_excel(w, index=False, sheet_name="SDM")
    print(f"Exported Excel (engine={engine}): {excel_path}")
else:
    print("xlsxwriter/openpyxl not installed; exported CSV only. Install one if you need .xlsx.")


  return splu(A).solve
  Ainv = spsolve(A, I)


Exported Excel (engine=xlsxwriter): D:\Dissertation\dissertation\data 2\data\Without UK and Germany\Final\moran\SDM_Edit\SDM_results_vet_em.xlsx
