
# 🧪 环境检查与示例数据加载（配套 0.1）
本 Notebook 用于：
1) 检查 Python 包是否安装、版本是否可用；  
2) 快速加载 DoWhy 内置示例数据用于因果推断练习；  
3) （可选）从**本地**加载 UCI Heart Disease 数据做基础 EDA。

> 说明：本执行环境不联网，因此在线下载数据的示例只提供代码模板，实际运行请在本地执行或将数据放到本地路径。



## 1. 环境检查（必须通过）
如果缺包，请回到你的终端/Anaconda Prompt 按照教程中的 conda/pip 命令安装。


In [None]:

import sys, platform, importlib

# (display name, conventional alias) pairs. Only the display name is used by
# the check below; the alias is informational.
required = [
    ("pandas", "pd"),
    ("numpy", "np"),
    ("matplotlib", "plt"),
    ("scikit_learn", "sklearn"),
    ("statsmodels", "sm"),
    ("dowhy", "dowhy"),
    ("econml", "econml"),
    ("causalml", "causalml")
]

# Display names whose importable module name is different.
import_names = {"scikit_learn": "sklearn"}

print("Python:", sys.version.split()[0], "| Platform:", platform.platform())
print("== 包检查与版本 ==")

# Collects the display name of every package that failed to import.
missing = []
for pkg, _alias in required:
    try:
        module = importlib.import_module(import_names.get(pkg, pkg))
        print(f"[OK] {pkg:<14} {getattr(module, '__version__', 'unknown')}")
    except Exception as e:
        # Any failure while importing counts as "missing"; the error text is
        # shown so the user can tell an absent package from a broken install.
        print(f"[MISSING] {pkg:<14} - {e}")
        missing.append(pkg)

if not missing:
    print("\n环境检查通过！")
else:
    print("\n缺少的包：", missing)



## 2. 加载 DoWhy 示例数据（可直接运行）
- 这是一份合成数据，模拟“处理（treatment）→ 结局（y）”，并包含混杂变量（W0~W4；处理变量列为 v0）。  
- 我们做最小化 EDA：样本量、前几行、标准化差异（SMD）与一个示例直方图。


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Entries of the linear_dataset result that the rest of the notebook uses,
# listed in the order they are unpacked into df, T, Y, covs.
_fields = ("df", "treatment_name", "outcome_name", "common_causes_names")

try:
    import dowhy.datasets as dwd

    data = dwd.linear_dataset(
        beta=8,
        num_common_causes=5,
        num_instruments=1,
        num_samples=800,
        treatment_is_binary=True,
        stddev_treatment_noise=1.0,
        stddev_outcome_noise=1.0,
        seed=2025,
    )
    loaded = [data[key] for key in _fields]
    print("样本量:", loaded[0].shape)
    display(loaded[0].head())
except Exception as e:
    # Any failure (import, generation, lookup or display) leaves all four
    # names as None so later cells can test `df is not None`.
    print("加载 DoWhy 数据失败：", e)
    loaded = [None] * len(_fields)

df, T, Y, covs = loaded


In [None]:

def smd(x_t, x_c):
    """Return the standardized mean difference between two samples.

    The difference of the NaN-ignoring means of ``x_t`` and ``x_c`` is divided
    by the square root of the average of their NaN-ignoring sample variances
    (``ddof=1``).

    Args:
        x_t: Values of the first (treated) group.
        x_c: Values of the second (control) group.

    Returns:
        ``(mean(x_t) - mean(x_c)) / (pooled_sd + 1e-12)``.
    """
    mean_gap = np.nanmean(x_t) - np.nanmean(x_c)
    variance_sum = np.nanvar(x_t, ddof=1) + np.nanvar(x_c, ddof=1)
    pooled_sd = np.sqrt(variance_sum / 2)
    # The tiny constant keeps the division finite when both variances are zero.
    return mean_gap / (pooled_sd + 1e-12)

# Covariate balance between treated and control rows: group sizes, a per-covariate
# SMD table, and one example histogram. Skipped when the dataset failed to load.
if df is not None:
    # DoWhy's linear_dataset reports the treatment name(s) as a list. Indexing
    # df with a list yields a DataFrame, and a boolean DataFrame cannot be used
    # as a row selector in df.loc, so reduce it to a single column label first.
    treat_col = T[0] if isinstance(T, (list, tuple)) else T

    tmask, cmask = df[treat_col] == 1, df[treat_col] == 0
    print("治疗组 n =", int(tmask.sum()), "| 对照组 n =", int(cmask.sum()))

    # One (variable, SMD) row per covariate, comparing treated vs. control values.
    rows = [(v, smd(df.loc[tmask, v], df.loc[cmask, v])) for v in covs]
    smd_df = pd.DataFrame(rows, columns=["variable","SMD"]).sort_values("variable")
    display(smd_df)

    # Example histogram of the first covariate (no explicit colors, per the
    # plotting guideline). Skipped when there are no covariates to plot.
    if len(covs) > 0:
        var = covs[0]
        plt.figure()
        plt.hist(df.loc[tmask, var].dropna(), bins=30, alpha=0.6, label="Treatment")
        plt.hist(df.loc[cmask, var].dropna(), bins=30, alpha=0.6, label="Control")
        plt.xlabel(var); plt.ylabel("Count"); plt.legend(); plt.title(f"Distribution of {var}")
        plt.show()



## 3. （可选）从本地加载 UCI Heart Disease 数据
> 由于当前环境不联网，这里提供**本地加载模板**。将 `heart_path` 改为你机器上的 CSV 路径再运行。  
> 如果你在本地（可联网的）Jupyter 中执行，也可以先把数据文件下载到本地，再将 `heart_path` 指向该文件。


In [None]:

import pandas as pd
import os

# Point this at your local Heart.csv, e.g. r"C:\Users\you\Downloads\Heart.csv"
heart_path = r"/path/to/your/local/Heart.csv"

# Stays None unless the file exists and pd.read_csv succeeds.
df_heart = None
if not os.path.exists(heart_path):
    print("未找到本地 Heart.csv。请修改 heart_path 为你的实际路径后重试。")
else:
    try:
        df_heart = pd.read_csv(heart_path)
        print("已加载本地数据:", heart_path)
        display(df_heart.head())
        print("形状:", df_heart.shape)
    except Exception as e:
        # Report the failure instead of raising so the notebook keeps running.
        print("读取本地 Heart.csv 失败：", e)


In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Minimal EDA on the locally loaded heart data: pick an outcome column, count
# cases/controls, and plot one numeric feature split by outcome.
if isinstance(df_heart, pd.DataFrame):
    # Try to identify the outcome column by its lower-cased name. str() keeps
    # this from failing on non-string column labels.
    outcome_names = ("ahd", "target", "disease", "diagnosis")
    candidates = [c for c in df_heart.columns if str(c).lower() in outcome_names]
    target = candidates[0] if candidates else None
    print("候选结局列：", candidates, "| 使用：", target)

    # Simple counts and one example distribution plot.
    if target is not None:
        # Encode any non-numeric outcome (object, string or categorical dtype)
        # as integer category codes so the == 1 / == 0 comparisons below apply.
        # The column is replaced in place in df_heart.
        if not pd.api.types.is_numeric_dtype(df_heart[target]):
            df_heart[target] = df_heart[target].astype('category').cat.codes

        is_case = df_heart[target] == 1
        is_ctrl = df_heart[target] == 0
        cases = int(is_case.sum())
        ctrls = int(is_ctrl.sum())
        print(f"病例 n={cases} | 对照 n={ctrls}")

        # Plot the first numeric feature other than the outcome itself.
        num_cols = [c for c in df_heart.columns if c != target and pd.api.types.is_numeric_dtype(df_heart[c])]
        if num_cols:
            feat = num_cols[0]
            plt.figure()
            plt.hist(df_heart.loc[is_case, feat].dropna(), bins=30, alpha=0.6, label="Cases")
            plt.hist(df_heart.loc[is_ctrl, feat].dropna(), bins=30, alpha=0.6, label="Controls")
            plt.xlabel(feat); plt.ylabel("Count"); plt.legend(); plt.title(f"Distribution of {feat}")
            plt.show()
    else:
        print("未能识别结局列，请手动指定 `target` 并重跑。")
