
# 0.3.2 UCI Heart + 因果 EDA 小工具

本 Notebook：
- 尝试加载 UCI Heart Disease 数据（在线 URL 或本地备选路径）
- 进行面向因果的 EDA（缺失/类型、分组均值、一个简单分布图）
- 提供可复用的辅助函数：SMD（标准化均值差）计算与 PS（倾向得分）重叠可视化（可用于你的任何项目）


## 可复用函数（建议复制到你的项目 utils 中）

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def smd_cont(x_t, x_c):
    """Return the standardized mean difference of a continuous covariate.

    NaN entries are ignored in both the means and the variances.

    Args:
        x_t: Covariate values for the treated group.
        x_c: Covariate values for the control group.

    Returns:
        (mean(x_t) - mean(x_c)) divided by the pooled standard deviation,
        where the pooled SD is the square root of the average of the two
        sample variances (ddof=1).
    """
    mean_gap = np.nanmean(x_t) - np.nanmean(x_c)
    var_treated = np.nanvar(x_t, ddof=1)
    var_control = np.nanvar(x_c, ddof=1)
    pooled_sd = np.sqrt((var_treated + var_control) / 2)
    # The tiny offset keeps the division finite when both groups are constant.
    return mean_gap / (pooled_sd + 1e-12)

def smd_binary(x_t, x_c):
    """Return the standardized mean difference of a 0/1 covariate.

    NaN entries are ignored. The denominator is built from the prevalence of
    the two groups stacked together, i.e. sqrt(p * (1 - p)) with p computed
    over all observations.

    Args:
        x_t: Covariate values for the treated group.
        x_c: Covariate values for the control group.

    Returns:
        The difference in group proportions divided by that pooled scale.
    """
    rate_treated = np.nanmean(x_t)
    rate_control = np.nanmean(x_c)
    pooled_rate = np.nanmean(np.r_[x_t, x_c])
    # The tiny offset keeps the scale non-zero when the covariate is constant.
    scale = np.sqrt(pooled_rate * (1 - pooled_rate) + 1e-12)
    return (rate_treated - rate_control) / scale

def balance_table(df, treat_col, covs, cat_covs=None):
    """Build a covariate balance table of standardized mean differences.

    Rows with ``df[treat_col] == 1`` form the treated group and rows with
    ``df[treat_col] == 0`` form the control group.

    Args:
        df: DataFrame holding the treatment indicator and the covariates.
        treat_col: Name of the treatment indicator column.
        covs: Iterable of covariate column names to summarize.
        cat_covs: Optional iterable of covariate names that must be handled
            with ``smd_binary``. Any covariate whose non-missing values are
            all 0 or 1 is handled that way automatically.

    Returns:
        A DataFrame with columns ``"variable"`` and ``"SMD"``, sorted by
        variable name.
    """
    cat_covs = set(cat_covs or [])
    tmask = df[treat_col] == 1
    cmask = df[treat_col] == 0
    rows = []
    for v in covs:
        is_binary = v in cat_covs or df[v].dropna().isin([0, 1]).all()
        # Always select rows with the boolean masks. Comparing the column
        # *name* to 1 gives a scalar False, which is not a valid row selector.
        x_t = df.loc[tmask, v]
        x_c = df.loc[cmask, v]
        smd = smd_binary(x_t, x_c) if is_binary else smd_cont(x_t, x_c)
        rows.append((v, smd))
    out = pd.DataFrame(rows, columns=["variable", "SMD"]).sort_values("variable")
    return out

def plot_ps_overlap(ps_t, ps_c, bins=30):
    """Draw overlaid histograms of propensity scores for the two groups.

    Args:
        ps_t: Propensity scores of the treated units.
        ps_c: Propensity scores of the control units.
        bins: Passed through to ``plt.hist`` for both histograms.

    Returns:
        None. A new figure is created and shown.
    """
    plt.figure()
    # Treated group is drawn first, control second, both semi-transparent.
    for scores, group_label in ((ps_t, "Treatment"), (ps_c, "Control")):
        plt.hist(scores, bins=bins, alpha=0.6, label=group_label)
    plt.xlabel("Propensity Score")
    plt.ylabel("Count")
    plt.legend()
    plt.title("PS Overlap")
    plt.show()



## 加载 UCI Heart Disease 数据
- 首选：在线 CSV（若你本地联网）  
- 备选：本地路径（下载后设置 heart_path）


In [None]:

import pandas as pd
import os

# Online source (this will fail in an offline environment).
url = "https://raw.githubusercontent.com/selva86/datasets/master/Heart.csv"

dfh = None
try:
    dfh = pd.read_csv(url)
    print("已从在线来源加载：", url)
except Exception as e:
    # Deliberately broad: any failure of the online read (network, HTTP,
    # parsing) falls back to the local copy instead of stopping the notebook.
    print("在线加载失败：", e)
    heart_path = r"/path/to/your/local/Heart.csv"  # change to your local path
    if os.path.exists(heart_path):
        dfh = pd.read_csv(heart_path)
        print("已从本地加载：", heart_path)
    else:
        print("未找到本地 Heart.csv，请修改路径。")

if dfh is not None:
    print("形状:", dfh.shape)
    display(dfh.head())
    # DataFrame.info() writes its summary itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    dfh.info()
    # Lower-case the column names so later cells can match them by name.
    dfh.columns = [c.lower() for c in dfh.columns]


## 因果视角的基础 EDA（仅演示，不代表因果结论）

In [None]:

import matplotlib.pyplot as plt

if isinstance(dfh, pd.DataFrame):
    # Identify the outcome column among the known candidate names.
    target_candidates = [c for c in dfh.columns if c in ("ahd", "target", "disease")]
    print("可能的结局列:", target_candidates)
    target = next(iter(target_candidates), None)

    # If the outcome holds string labels, convert it to 0/1.
    if target and dfh[target].dtype == "O":
        _raw_outcome = dfh[target]
        _encoded = _raw_outcome.map({"Yes": 1, "No": 0})
        # Labels outside the mapping keep their original value before the cast.
        dfh[target] = _encoded.fillna(_raw_outcome).astype(int)

    # Missing-value counts and descriptive statistics.
    print("缺失值统计：")
    _missing_counts = dfh.isnull().sum()
    print(_missing_counts.sort_values(ascending=False).head(10))
    display(dfh.describe(include='all'))

    # Simple group means (restricted to the columns this data version has).
    _focus_names = ["age", "restbp", "chol", "maxhr", "oldpeak"]
    cols_focus = [c for c in _focus_names if c in dfh.columns]
    if target and cols_focus:
        print("\n按是否患病分组的均值：")
        _group_means = dfh.groupby(target)[cols_focus].mean()
        print(_group_means)

    # Visualization: age distribution, cases vs. controls.
    if target and "age" in dfh.columns:
        plt.figure()
        for val, name in ((0, "No Disease"), (1, "Disease")):
            _ages = dfh.loc[dfh[target] == val, "age"].dropna()
            plt.hist(_ages, bins=20, alpha=0.6, label=name)
        plt.xlabel("Age")
        plt.ylabel("Count")
        plt.legend()
        plt.title("Age by Disease Status")
        plt.show()
