# 3.1 MIMIC-IV ICU：血管活性药物 → 28 天死亡率（PSM & IPTW 模板）

本 Notebook 提供在 **真实世界 ICU 数据（如 MIMIC-IV）** 上做因果推断的模板：
1) 数据清洗与缺失处理；2) 倾向评分（PS）估计；3) **PSM** 与 **IPTW**；
4) 平衡性诊断（SMD）；5) 估计风险差/风险比；6) 结果解释。

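> ⚠️ 说明：若未设置 `CSV_PATH`（或该文件不存在），Notebook 会自动生成一份**模拟数据**，使整个流程可以直接运行。替换为你的 MIMIC-IV 提取表时，表中需包含相同的列：`age`、`sex`、`sofa`、`comorbidity_index`、`vasopressor`、`mortality_28d`。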

## 0. 环境依赖（如已安装可跳过）

```bash
pip install -U pandas numpy scikit-learn matplotlib
```

In [None]:
# 1) 载入数据：优先尝试读取本地 CSV；若未提供，则生成模拟数据
import os
import numpy as np
import pandas as pd

CSV_PATH = ''  # <- set to your MIMIC extract, e.g. 'mimic_icustay_vaso_28d.csv'; when empty or not found, synthetic data is generated instead

def make_synthetic_icu(n=3000, seed=7, missing_frac=0.05):
    """Generate a synthetic ICU cohort for demonstrating the PSM / IPTW workflow.

    Treatment assignment and the outcome both depend on age, SOFA and the
    comorbidity index, so the treatment effect is confounded; the simulated
    treatment lowers the log-odds of death by 0.25.

    Args:
        n: Number of simulated rows.
        seed: Seed passed to ``np.random.default_rng``; same seed, same data.
        missing_frac: Fraction of rows (between 0 and 1) whose ``sofa`` value
            is replaced by NaN. The default of 0.05 reproduces the previous
            hard-coded behaviour.

    Returns:
        DataFrame with columns ``age``, ``sex``, ``sofa``,
        ``comorbidity_index``, ``vasopressor`` (0/1 treatment) and
        ``mortality_28d`` (0/1 outcome, 1 = death within 28 days).

    Raises:
        ValueError: If ``missing_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= missing_frac <= 1:
        raise ValueError(f'missing_frac must be within [0, 1], got {missing_frac!r}')

    # NOTE: the order of the draws below is part of the contract -- changing it
    # would change the data produced for a given seed.
    rng = np.random.default_rng(seed)
    age = rng.integers(18, 90, n)
    sex = rng.integers(0, 2, n)  # 0/1
    sofa = np.clip(rng.normal(6, 3, n), 0, None)
    comorb = np.clip(rng.normal(1.5, 0.8, n), 0, None)

    # Treatment assignment depends on severity (this is the confounding).
    p_t = 1/(1+np.exp(-( -4 + 0.06*age + 0.25*sofa + 0.4*comorb )))
    t = rng.binomial(1, p_t)

    # Baseline risk plus a beneficial treatment effect on the logit scale.
    base = -3 + 0.03*age + 0.35*sofa + 0.3*comorb
    logit_death = base - 0.25*t
    p_death = 1/(1+np.exp(-logit_death))
    y = rng.binomial(1, p_death)  # 1 = death within 28d

    df = pd.DataFrame({
        'age': age,
        'sex': sex,
        'sofa': sofa,
        'comorbidity_index': comorb,
        'vasopressor': t,
        'mortality_28d': y,
    })

    # Blank out SOFA for a random subset of rows so the cleaning step has
    # something to impute.
    n_missing = int(missing_frac*n)
    if n_missing > 0:
        miss_idx = rng.choice(n, size=n_missing, replace=False)
        df.loc[miss_idx, 'sofa'] = np.nan
    return df

# Read the user's extract only when CSV_PATH is non-empty and points at an
# existing file; in every other case fall back to the synthetic cohort.
use_csv = bool(CSV_PATH) and os.path.exists(CSV_PATH)
df = pd.read_csv(CSV_PATH) if use_csv else make_synthetic_icu()

# Last expression of the cell: shows the first rows in the notebook.
df.head()


## 2) 清洗：缺失值与极端值处理（Winsorize）

In [None]:
def median_impute(df, cols):
    """Fill missing values in each listed column with that column's median.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame to clean.
        cols: Iterable of column names to impute.

    Returns:
        ``df`` itself, with the listed columns filled.
    """
    for name in cols:
        column = df[name]
        fill_value = column.median()
        df[name] = column.fillna(fill_value)
    return df

def mode_impute(df, cols):
    """Fill missing values in each listed column with that column's mode.

    When several values tie, the first one returned by ``Series.mode`` is
    used. A column with no non-missing values has no mode and is left as is.
    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame to clean.
        cols: Iterable of column names to impute.

    Returns:
        ``df`` itself, with the listed columns filled where a mode exists.
    """
    for name in cols:
        candidates = df[name].mode(dropna=True)
        if candidates.empty:
            continue
        df[name] = df[name].fillna(candidates.iloc[0])
    return df

def winsorize(df, cols, lower=0.01, upper=0.99):
    """Clip each listed column to its own lower/upper quantiles.

    Values below the ``lower`` quantile are raised to it and values above the
    ``upper`` quantile are lowered to it. The frame is modified in place and
    the same object is returned.

    Args:
        df: DataFrame to clean.
        cols: Iterable of column names to winsorize.
        lower: Lower quantile, as a fraction.
        upper: Upper quantile, as a fraction.

    Returns:
        ``df`` itself, with the listed columns clipped.
    """
    for name in cols:
        limits = df[name].quantile([lower, upper]).to_numpy()
        floor, ceiling = limits[0], limits[1]
        df[name] = df[name].clip(lower=floor, upper=ceiling)
    return df

cont = ['age','sofa','comorbidity_index']
cat  = ['sex']

# Impute before winsorizing, so the clipping quantiles are computed on the
# completed columns.
for clean_step, target_cols in ((median_impute, cont), (mode_impute, cat), (winsorize, cont)):
    df = clean_step(df, target_cols)

# Last expression of the cell: summary of the cleaned covariates.
df[cont+cat].describe(include='all')


## 3) 倾向评分（PS）估计 + 诊断

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Covariates for the propensity model, plus treatment / outcome as arrays.
ps_covariates = ['age','sex','sofa','comorbidity_index']
X = df[ps_covariates]
T = df['vasopressor'].values
Y = df['mortality_28d'].values

# Logistic regression of treatment on the covariates; the second column of
# predict_proba is taken as P(treated | X) (assumes treatment is coded 0/1).
logit = LogisticRegression(max_iter=2000).fit(X, T)
ps = logit.predict_proba(X)[:,1]
df['ps'] = ps

print('PS model AUC:', round(roc_auc_score(T, ps), 3))

# Overlaid score histograms per arm, to eyeball common support.
plt.figure()
for arm_label, arm_value in (('Treated', 1), ('Control', 0)):
    plt.hist(ps[T==arm_value], bins=40, alpha=0.6, label=arm_label, density=True)
plt.xlabel('Propensity Score')
plt.ylabel('Density')
plt.title('PS Distributions (Before Matching)')
plt.legend()
plt.show()


## 4) PSM：1:1 最近邻匹配（有放回；卡尺默认 0.05，可调整）+ 组间效果估计

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Split the cohort by treatment arm; the copies keep df itself untouched.
treated = df[df.vasopressor==1].copy()
control = df[df.vasopressor==0].copy()

# For every treated row, look up the control row with the closest propensity
# score. Each lookup is independent, so one control can serve several treated
# rows.
nn = NearestNeighbors(n_neighbors=1).fit(control[['ps']])
dist, idx = nn.kneighbors(treated[['ps']])

# Row i of matched_treated is paired with row i of matched_control.
matched_treated = treated.reset_index(drop=True)
matched_control = control.iloc[idx.flatten()].copy().reset_index(drop=True)

# Drop pairs whose scores differ by more than the caliper.
caliper = 0.05
mask = (matched_treated['ps'] - matched_control['ps']).abs() <= caliper
matched_treated, matched_control = (
    side[mask].reset_index(drop=True) for side in (matched_treated, matched_control)
)

print('Matched pairs:', len(matched_treated))

# Score histograms per arm within the matched sample.
plt.figure()
for arm_label, arm_frame in (('Treated', matched_treated), ('Control', matched_control)):
    plt.hist(arm_frame['ps'], bins=30, alpha=0.6, label=arm_label, density=True)
plt.xlabel('Propensity Score')
plt.ylabel('Density')
plt.title('PS Distributions (After Matching)')
plt.legend()
plt.show()

# Crude 28-day mortality in each matched arm, then difference and ratio.
risk_t, risk_c = (
    arm_frame['mortality_28d'].mean() for arm_frame in (matched_treated, matched_control)
)
rd = risk_t - risk_c
if risk_c>0:
    rr = risk_t / risk_c
else:
    # Ratio is undefined without any control deaths (or without any pairs).
    rr = np.nan
print('PSM Risk (T,C):', round(risk_t,3), round(risk_c,3))
print('PSM Risk Difference:', round(rd,3))
print('PSM Risk Ratio:', round(rr,3))


## 5) IPTW：稳定化权重 + 加权风险估计

In [None]:
# Stabilized inverse-probability-of-treatment weights:
#   treated rows -> P(T=1) / ps,   control rows -> P(T=0) / (1 - ps)
p_treat = T.mean()

# A fitted score of exactly 0 or 1 would make a weight infinite and turn the
# weighted risks below into NaN, so keep the denominators strictly inside
# (0, 1). Scores away from the boundary are unaffected.
ps_eps = 1e-6
ps_safe = np.clip(ps, ps_eps, 1 - ps_eps)
sw = np.where(T==1, p_treat/ps_safe, (1-p_treat)/(1-ps_safe))
df['weight_sw'] = sw

# Weighted 28-day mortality within each arm.
wt_mean_T = np.average(df.loc[T==1, 'mortality_28d'], weights=df.loc[T==1,'weight_sw'])
wt_mean_C = np.average(df.loc[T==0, 'mortality_28d'], weights=df.loc[T==0,'weight_sw'])
# Ratio is undefined when the weighted control risk is zero.
iptw_rr = wt_mean_T / wt_mean_C if wt_mean_C>0 else np.nan
print('IPTW weighted risk (T,C):', round(wt_mean_T,3), round(wt_mean_C,3))
print('IPTW Risk Difference:', round(wt_mean_T - wt_mean_C, 3))
# Reported alongside the difference, mirroring the PSM cell's output.
print('IPTW Risk Ratio:', round(iptw_rr, 3))


## 6) 平衡性诊断（SMD）：匹配前/后、IPTW 后

In [None]:
def smd(x_t, x_c):
    """Standardized mean difference between two samples.

    The difference in means is divided by the pooled standard deviation,
    ``sqrt((s_t**2 + s_c**2) / 2)``, where each ``s`` is computed with
    ``ddof=1``.

    Args:
        x_t: Values of the covariate in the treated group.
        x_c: Values of the covariate in the control group.

    Returns:
        The standardized mean difference, or ``0.0`` when the pooled standard
        deviation is not greater than zero.
    """
    mean_gap = np.mean(x_t) - np.mean(x_c)
    sd_treated = np.std(x_t, ddof=1)
    sd_control = np.std(x_c, ddof=1)
    pooled_sd = np.sqrt((sd_treated**2 + sd_control**2)/2)
    if pooled_sd > 0:
        return mean_gap/pooled_sd
    return 0.0

def smd_weighted(x, t, w):
    """Weighted standardized mean difference between treated and control rows.

    Rows with ``t == 1`` form the treated group and rows with ``t == 0`` the
    control group. Each group's mean and standard deviation are weighted by
    ``w``, and the mean difference is divided by
    ``sqrt((s_t**2 + s_c**2) / 2)``.

    Args:
        x: Covariate values; indexed with boolean masks, so an array is
            expected.
        t: Treatment indicator, same length as ``x``.
        w: Row weights, same length as ``x``.

    Returns:
        The weighted standardized mean difference, or ``0.0`` when the pooled
        standard deviation is not greater than zero.
    """
    def _weighted_moments(values, weights):
        # Weighted mean, then the weighted root-mean-square deviation from it.
        centre = np.average(values, weights=weights)
        spread = np.sqrt(np.average((values - centre)**2, weights=weights))
        return centre, spread

    in_treated = (t == 1)
    in_control = (t == 0)
    mean_treated, sd_treated = _weighted_moments(x[in_treated], w[in_treated])
    mean_control, sd_control = _weighted_moments(x[in_control], w[in_control])

    pooled_sd = np.sqrt((sd_treated**2 + sd_control**2)/2)
    if pooled_sd > 0:
        return (mean_treated - mean_control)/pooled_sd
    return 0.0

covs = ['age','sex','sofa','comorbidity_index']

def _print_smd_table(title, smd_of):
    # Print one heading followed by one rounded SMD line per covariate.
    print(title)
    for c in covs:
        print(f'  {c}:', round(smd_of(c), 3))

# Raw cohort: treated vs. control rows of the full frame.
_print_smd_table(
    'SMD (Before Matching):',
    lambda c: smd(df.loc[T==1, c], df.loc[T==0, c]),
)

# Matched sample from the PSM cell.
_print_smd_table(
    '\nSMD (After PSM Matching):',
    lambda c: smd(matched_treated[c], matched_control[c]),
)

# Full cohort re-weighted with the stabilized IPTW weights.
_print_smd_table(
    '\nSMD (After IPTW):',
    lambda c: smd_weighted(df[c].values, T, df['weight_sw'].values),
)
