
# 0.3.1 DoWhy 合成数据（Lalonde）因果 EDA 入门

本 Notebook 旨在：
- 加载 DoWhy 合成的 Lalonde 风格数据（由 `dowhy.datasets.linear_dataset` 生成，并非真实的 Lalonde 数据集；含处理、结局、混杂、工具变量）
- 做因果视角的基础 EDA：缺失/类型、基线平衡（SMD）、倾向评分重叠（PS overlap）


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import dowhy.datasets as dwd

# Generate the synthetic dataset.
# Reproducibility is obtained by seeding NumPy's global RNG before the call:
# the documented signature of `linear_dataset` has no `seed` keyword, so
# passing one is expected to raise a TypeError.
np.random.seed(2025)
data = dwd.linear_dataset(
    beta=10,
    num_common_causes=5,
    num_instruments=1,
    num_samples=1000,
    treatment_is_binary=True,
    stddev_treatment_noise=1.0,
    stddev_outcome_noise=1.0,
)

df = data["df"].copy()

# DoWhy reports treatment names as a list (e.g. ["v0"]). The rest of this
# notebook uses T as a single column label (`df[T] == 1`), so unwrap it.
T = data["treatment_name"]
if isinstance(T, (list, tuple)):
    T = T[0]
Y = data["outcome_name"]
C = data["common_causes_names"]
IV = data.get("instrument_names", [])

print("数据形状:", df.shape)
display(df.head())
print("treatment列:", T, " | outcome列:", Y)
print("混杂变量:", C, " | 工具变量:", IV)


## 基础检查：缺失、类型与分布

In [None]:

# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()
display(df.describe())
print("缺失值计数：")
# Show the ten columns with the most missing values.
print(df.isnull().sum().sort_values(ascending=False).head(10))



## 基线平衡（SMD）
> 标准化均数差（SMD）用于衡量治疗组与对照组在各混杂变量上的差异：SMD =（治疗组均值 − 对照组均值）/ 合并标准差。通常以 |SMD| ≤ 0.1 作为平衡较好的标准，较宽松时也可接受 |SMD| ≤ 0.2。


In [None]:

def smd_cont(x_t, x_c):
    """Return the standardized mean difference between two samples.

    NaN entries are ignored. The denominator is the square root of the
    average of the two sample variances (ddof=1), plus a tiny constant so
    that two constant samples do not cause a division by zero.

    Args:
        x_t: Values of the variable in the treated group.
        x_c: Values of the variable in the control group.

    Returns:
        (mean of x_t - mean of x_c) / pooled standard deviation.
    """
    mean_gap = np.nanmean(x_t) - np.nanmean(x_c)
    var_treated = np.nanvar(x_t, ddof=1)
    var_control = np.nanvar(x_c, ddof=1)
    pooled_sd = np.sqrt((var_treated + var_control) / 2)
    return mean_gap / (pooled_sd + 1e-12)

# Row masks for the treated / control groups.
# np.ravel flattens the treatment column to 1-D whether T is a single column
# label or a one-element list of labels; a 2-D boolean DataFrame mask cannot
# be used as a row indexer in `.loc`.
tmask = pd.Series(np.ravel(df[T].to_numpy()) == 1, index=df.index)
cmask = ~tmask

# One (variable, SMD) row per confounder.
rows = [(v, smd_cont(df.loc[tmask, v], df.loc[cmask, v])) for v in C]
smd_table = pd.DataFrame(rows, columns=["variable","SMD"]).sort_values("variable")
display(smd_table)


### 分布可视化（示例：一个混杂变量）

In [None]:

# Overlay the treated and control histograms of the first confounder.
var = C[0]
plt.figure()
for group_mask, group_label in ((tmask, "Treatment"), (cmask, "Control")):
    plt.hist(df.loc[group_mask, var], bins=30, alpha=0.6, label=group_label)
plt.xlabel(var)
plt.ylabel("Count")
plt.legend()
plt.title(f"Distribution of {var}")
plt.show()



## 倾向评分重叠（PS overlap）
> 仅用于粗查可比性，后续将用于加权/匹配。若两组 PS 分布几乎不重叠，需考虑修剪（trimming）或变更研究设计。


In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Confounder matrix and a 1-D treatment vector.
# np.ravel flattens the (n, 1) column produced when T is a one-element list
# of column names, so the classifier receives a proper 1-D target.
X = df[C].values
y = np.ravel(df[T].values)

# Standardize the confounders before fitting the logistic model.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Propensity score: predicted probability of the second class (column 1).
ps_model = LogisticRegression(max_iter=200).fit(X_std, y)
ps = ps_model.predict_proba(X_std)[:, 1]
df["propensity_score"] = ps

# Derive the group masks from y here so this cell does not depend on masks
# created in another cell.
treated_rows = y == 1
control_rows = ~treated_rows

plt.figure()
plt.hist(df.loc[treated_rows, "propensity_score"], bins=30, alpha=0.6, label="Treatment")
plt.hist(df.loc[control_rows, "propensity_score"], bins=30, alpha=0.6, label="Control")
plt.xlabel("Propensity Score")
plt.ylabel("Count")
plt.legend()
plt.title("PS Overlap Check")
plt.show()
