In [1]:
import duckdb
con = duckdb.connect('mimiciv.duckdb')


In [2]:
lab_itemids = {
    'BUN': 51006,
    'Scr': 50912,
    'Na': 50983,
    'K': 50971,
    'Glucose': 50931,
    'WBC': 51301,
    'PLT': 51265,
    'Hb': 51222,
    'Ca': 50893,
    'Cl': 50902,
    'AG': 50868,
    'Phos': 50970,
    'BG': 50809
}

# 构造 SQL 查询（入ICU后24小时内，取第一次记录）
lab_queries = []
for var, itemid in lab_itemids.items():
    lab_queries.append(f"""
    SELECT stay_id, '{var}' AS variable, MIN(valuenum) AS value
    FROM labevents l
    INNER JOIN dka_with_aki d ON l.hadm_id = d.hadm_id
    WHERE l.itemid = {itemid}
      AND l.charttime BETWEEN d.intime AND d.intime + INTERVAL 24 HOUR
    GROUP BY stay_id
    """)

# 合并所有查询（UNION ALL）
full_lab_query = " UNION ALL ".join(lab_queries)

lab_features_df = con.execute(full_lab_query).df()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [3]:
lab_pivot = lab_features_df.pivot(index='stay_id', columns='variable', values='value').reset_index()


In [5]:
vital_itemids = {
    'HR': 211,
    'RR': 618,
    'SBP': 51,
    'DBP': 8368,
    'Weight': 226512
}

vital_queries = []
for var, itemid in vital_itemids.items():
    vital_queries.append(f"""
    SELECT c.stay_id, '{var}' AS variable, MIN(c.valuenum) AS value
    FROM chartevents c
    INNER JOIN dka_with_aki d ON c.stay_id = d.stay_id
    WHERE c.itemid = {itemid}
      AND c.charttime BETWEEN d.intime AND d.intime + INTERVAL 24 HOUR
    GROUP BY c.stay_id
    """)

# 合并所有变量查询
full_vital_query = " UNION ALL ".join(vital_queries)

# 执行查询并转为透视表
vital_features_df = con.execute(full_vital_query).df()
vital_pivot = vital_features_df.pivot(index='stay_id', columns='variable', values='value').reset_index()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [6]:
# 获取 cohort 基础信息（含 AKI 标签）
cohort_df = con.execute("SELECT stay_id, subject_id, hadm_id, age, gender, aki_label FROM dka_with_aki").df()

# 合并所有变量
full_features = cohort_df.merge(lab_pivot, on='stay_id', how='left')
full_features = full_features.merge(vital_pivot, on='stay_id', how='left')


In [7]:
# 预览样本和特征
print(full_features.shape)
full_features.head()

# 检查缺失比例
missing_pct = full_features.isnull().mean().sort_values(ascending=False)
print(missing_pct)


(1400, 20)
Weight        0.987143
BG            0.665714
Hb            0.058571
WBC           0.057143
PLT           0.053571
Ca            0.022143
Phos          0.020714
Glucose       0.014286
AG            0.014286
BUN           0.014286
Scr           0.013571
Cl            0.013571
K             0.012857
Na            0.012857
aki_label     0.009286
subject_id    0.000000
gender        0.000000
age           0.000000
hadm_id       0.000000
stay_id       0.000000
dtype: float64


In [None]:
lab_pivot.to_csv("lab_pivot.csv", index=False)
vital_pivot.to_csv("vital_pivot.csv", index=False)
cohort_df.to_csv("cohort_df.csv", index=False)
