# 2.3 倾向评分匹配（PSM）实战：模拟“二甲双胍治疗”场景

本 Notebook 演示：
1) 构造一个含混杂的模拟数据集；
2) 估计倾向评分并做 1:1 **最近邻匹配**（有放回、未设卡钳，同一对照可被多次匹配）；
3) 可视化匹配前后 PS 分布；
4) 计算匹配后的处理效应估计（为每个处理个体匹配对照，严格来说估计的是 **ATT**；本模拟中处理效应恒定，数值上与 **ATE** 一致）与**平衡性（SMD）**。

### 环境准备（如已安装可跳过）
```bash
pip install -U numpy pandas scikit-learn matplotlib
```

In [None]:
# ① Build the simulated, confounded dataset
import numpy as np
import pandas as pd

# Fixed seed on NumPy's global RNG; the draws below must stay in this exact
# order (age, bmi, treatment, outcome noise) for the data to be reproducible.
np.random.seed(42)
n = 500

# Covariates: integer age in [40, 70) and normally distributed BMI (mean 28, sd 5).
age = np.random.randint(40, 70, size=n)
bmi = np.random.normal(loc=28, scale=5, size=n)

# Treatment assignment depends on age and bmi through a logistic model,
# which is what makes them confounders of the treatment/outcome relation.
p_treat = 1 / (1 + np.exp(-(0.1 * age - 0.2 * bmi)))
treatment = np.random.binomial(1, p=p_treat, size=n)

# Outcome y (lower is better): treatment shifts it by -0.5, age and bmi raise
# it linearly, plus standard-normal noise.
noise = np.random.normal(0, 1, n)
y = 10 - 0.5 * treatment + 0.05 * age + 0.1 * bmi + noise

df = pd.DataFrame(dict(age=age, bmi=bmi, treatment=treatment, y=y))
df.head()

In [None]:
# ② Propensity-score estimation + nearest-neighbour matching
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# Model P(treatment = 1 | age, bmi) with logistic regression and store the
# fitted probability of the treated class as the propensity score.
logit = LogisticRegression(max_iter=1000)
logit.fit(df[['age','bmi']], df['treatment'])
ps = logit.predict_proba(df[['age','bmi']])[:,1]  # column 1 -> class label 1
df['ps'] = ps

# Split into treated / control groups (copies, so df itself is not modified
# by later operations on the groups).
treated = df[df.treatment==1].copy()
control = df[df.treatment==0].copy()

# 1:1 nearest-neighbour matching on the propensity score.
# Each treated unit independently receives its closest control, so this is
# matching WITH replacement (one control may serve several treated units) and
# there is no caliper: a match is always returned, however far away it is.
# `dist` holds the PS distance of every pair and can be inspected for quality.
nn = NearestNeighbors(n_neighbors=1)
nn.fit(control[['ps']])
dist, idx = nn.kneighbors(treated[['ps']])
# `idx` contains positions within `control`, hence iloc; both frames are
# re-indexed 0..k-1 so that row i of each frame forms matched pair i.
matched_control = control.iloc[idx.flatten()].copy().reset_index(drop=True)
matched_treated = treated.reset_index(drop=True)

# Mean within-pair outcome difference. Because every *treated* unit is kept
# and a control is chosen for it, the average is taken over the treated
# population: this estimates the ATT (average treatment effect on the
# treated), not the ATE. The two coincide only when the effect is constant.
pair_diff = matched_treated['y'].values - matched_control['y'].values
att_matched = pair_diff.mean()
ate_matched = att_matched  # legacy name kept so later cells keep working
print('Matched ATT (Treated - Control):', round(att_matched, 4))


In [None]:
# ③ Propensity-score distributions before vs. after matching
import matplotlib.pyplot as plt

# One figure per stage: (figure title, treated scores, control scores).
ps_panels = [
    ('PS Distributions (Before Matching)', treated['ps'], control['ps']),
    ('PS Distributions (After Matching)', matched_treated['ps'], matched_control['ps']),
]

for panel_title, ps_treated, ps_control in ps_panels:
    plt.figure()
    # Overlay both groups as density-normalised, semi-transparent histograms
    # so their overlap is visible regardless of group size.
    for scores, group_label in ((ps_treated, 'Treated'), (ps_control, 'Control')):
        plt.hist(scores, bins=30, alpha=0.6, label=group_label, density=True)
    plt.xlabel('Propensity Score')
    plt.ylabel('Density')
    plt.title(panel_title)
    plt.legend()
    plt.show()


In [None]:
# ④ Balance diagnostics: SMD (standardized mean difference)
def smd(x_t, x_c):
    """Return the standardized mean difference between two samples.

    Computed as ``(mean(x_t) - mean(x_c)) / s_p`` where ``s_p`` is the square
    root of the average of the two sample variances (``ddof=1``).

    Args:
        x_t: Covariate values of the treated group.
        x_c: Covariate values of the control group.

    Returns:
        The SMD as a float. Degenerate cases are reported explicitly instead
        of being folded into ``0.0`` (which would read as perfect balance):

        * ``nan`` when the pooled SD is undefined (e.g. a group with fewer
          than two observations);
        * ``0.0`` when both groups are constant and share the same mean;
        * ``+inf`` / ``-inf`` when both groups are constant but their means
          differ (sign follows ``mean(x_t) - mean(x_c)``).
    """
    m_t, m_c = np.mean(x_t), np.mean(x_c)
    s_t, s_c = np.std(x_t, ddof=1), np.std(x_c, ddof=1)
    diff = m_t - m_c
    s_p = np.sqrt((s_t**2 + s_c**2) / 2)

    if np.isnan(s_p):
        # Variance not estimable: the SMD is undefined, not zero.
        return float('nan')
    if s_p > 0:
        return diff / s_p
    # Zero pooled SD: balanced only if the means are identical as well.
    return 0.0 if diff == 0 else float(np.copysign(np.inf, diff))

covs = ['age', 'bmi']

# Report covariate balance for each stage: (header line, treated frame,
# control frame). The header strings carry their own spacing/newlines.
balance_stages = (
    ('SMD (Before Matching):', treated, control),
    ('\nSMD (After Matching):', matched_treated, matched_control),
)

for header, grp_t, grp_c in balance_stages:
    print(header)
    for c in covs:
        print(f'  {c}:', round(smd(grp_t[c], grp_c[c]), 3))
