# Set up

In [1]:
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Tuple, Dict, List

# Test build

In [2]:
# Shared seeded generator; the four-argument sample_trunc_bvn draws from it.
RNG = np.random.default_rng(seed=1030)

# Truncation bounds applied to both coordinates of every sampled pair.
LOW = 0.1
HIGH = 5.0

# Value assigned (before jitter) to regions that are not active.
BASELINE = 1

# Candidate centre values (not referenced by the cells visible here).
CENTRES = [1.0, 1.1, 1.5, 2.0, 3.0]

# Number of rows generated for each guaranteed-active region.
REPLICATES_PER_REGION = 20

In [6]:
def sample_trunc_bvn(mean: np.ndarray, cov: np.ndarray, low: float, high: float) -> np.ndarray:
    """Draw one point from N(mean, cov) by rejection until it lies in [low, high]^2.

    Draws come from the module-level ``RNG``. Only the first two coordinates
    of each draw are checked against the bounds. There is no cap on the
    number of attempts, so a box with negligible probability mass will make
    this loop run for a very long time.
    """
    accepted = False
    while not accepted:
        draw = RNG.multivariate_normal(mean, cov)
        first_ok = low <= draw[0] <= high
        accepted = first_ok and low <= draw[1] <= high
    return draw

# def build_datasets_for_centre(
#     m: float,
#     sd2: float = 0.1,
#     rho: float = 0.3,
#     baseline_jitter: float = 1e-2,
#     jitter_mode: str = "random"
# ) -> pd.DataFrame:
#     sd = np.sqrt(sd2)
#     cov = np.array([[sd**2, rho*sd**2],
#                     [rho*sd**2, sd**2]])
#     rows = []
#     for region in range(1, 6):
#         for _ in range(REPLICATES_PER_REGION):
#             mean = np.array([m, m])
#             a_r, b_r = sample_trunc_bvn(mean, cov, LOW, HIGH)

#             params = []
#             for r in range(1, 6):
#                 if r == region:
#                     # 当前被“激活”的区域：用真正采样到的那一对
#                     params.extend([a_r, b_r])
#                 else:
#                     # 其他区域：本来是 BASELINE，这里加一个很小的扰动
#                     if jitter_mode == "random":
#                         # 随机小抖动，零均值，幅度由 baseline_jitter 控制
#                         eps_a = np.random.uniform(-baseline_jitter, baseline_jitter)
#                         eps_b = np.random.uniform(-baseline_jitter, baseline_jitter)
#                         params.extend([BASELINE + eps_a, BASELINE + eps_b])
#                     else:
#                         # 固定的小偏移，保证不等于 BASELINE
#                         params.extend([BASELINE + baseline_jitter,
#                                        BASELINE + baseline_jitter])

#             rows.append(params)

#     df = pd.DataFrame(
#         rows,
#         columns=[f"{ab}{r}" for r in range(1, 6) for ab in ("a", "b")]
#     )
#     return df


def build_datasets_for_centre(
    m: float,
    sd2: float = 0.1,
    rho: float = 0.3,
    baseline_jitter: float = 1e-2,
    jitter_mode: str = "random",
    n_active_regions: int = 1,
) -> pd.DataFrame:
    """Build a table of region parameters with every active region centred on ``m``.

    For each of the five regions in turn, ``REPLICATES_PER_REGION`` rows are
    generated in which that region is guaranteed to be active. When
    ``n_active_regions > 1`` the remaining active regions are drawn without
    replacement from the other four. Every active region receives its own
    ``(a, b)`` pair from ``sample_trunc_bvn`` with mean ``[m, m]`` and bounds
    ``[LOW, HIGH]``; every other region receives ``BASELINE`` plus jitter.

    All random draws made directly in this function use the module-level
    seeded ``RNG``. (Previously the region choice and the jitter used the
    unseeded ``np.random`` global state, so results were not repeatable.)

    Parameters
    ----------
    m : float
        Common mean of both coordinates for active regions.
    sd2 : float
        Variance of each coordinate.
    rho : float
        Correlation between the two coordinates.
    baseline_jitter : float
        With ``jitter_mode == "random"``: half-width of the symmetric uniform
        perturbation added to ``BASELINE``. Otherwise: fixed offset added to
        ``BASELINE``.
    jitter_mode : str
        ``"random"`` for uniform jitter; any other value selects the fixed
        offset.
    n_active_regions : int
        Number of active regions per row, between 1 and 5.

    Returns
    -------
    pd.DataFrame
        ``5 * REPLICATES_PER_REGION`` rows with columns ``a1, b1, ..., a5, b5``.

    Raises
    ------
    ValueError
        If ``n_active_regions`` is outside ``[1, 5]``.
    """
    if not (1 <= n_active_regions <= 5):
        raise ValueError("n_active_regions must be between 1 and 5.")

    # sd2 is already a variance, so it goes straight onto the diagonal.
    cov = np.array([[sd2,       rho * sd2],
                    [rho * sd2, sd2      ]])
    mean = np.array([m, m])

    # A symmetric interval is the same for +/- jitter; Generator.uniform
    # needs low <= high, so use the magnitude.
    half_width = abs(baseline_jitter)

    all_regions = list(range(1, 6))
    rows = []

    for region in all_regions:
        others = [r for r in all_regions if r != region]

        for _ in range(REPLICATES_PER_REGION):
            # The current region is always active; top up with random others.
            active_regions = [region]
            if n_active_regions > 1:
                extra = RNG.choice(others, size=n_active_regions - 1, replace=False)
                active_regions.extend(int(r) for r in extra)

            # One independent truncated draw per active region.
            samples = {
                r_act: sample_trunc_bvn(mean, cov, LOW, HIGH)
                for r_act in active_regions
            }

            # Assemble a1, b1, ..., a5, b5 for this row.
            params = []
            for r in all_regions:
                if r in samples:
                    a_r, b_r = samples[r]
                    params.extend([a_r, b_r])
                elif jitter_mode == "random":
                    eps_a = RNG.uniform(-half_width, half_width)
                    eps_b = RNG.uniform(-half_width, half_width)
                    params.extend([BASELINE + eps_a, BASELINE + eps_b])
                else:
                    params.extend([BASELINE + baseline_jitter,
                                   BASELINE + baseline_jitter])

            rows.append(params)

    return pd.DataFrame(
        rows,
        columns=[f"{ab}{r}" for r in range(1, 6) for ab in ("a", "b")],
    )


In [None]:
def build_datasets_for_diag(
    sd2: float = 0.1,
    rho: float = 0.3,
    baseline_jitter: float = 1e-2,
    jitter_mode: str = "random",
    n_active_regions: int = 1,
) -> pd.DataFrame:
    """Build a table of region parameters where region ``r`` is centred on ``r``.

    For each of the five regions in turn, ``REPLICATES_PER_REGION`` rows are
    generated in which that region is guaranteed to be active. When
    ``n_active_regions > 1`` the remaining active regions are drawn without
    replacement from the other four. Active region ``r`` receives an
    ``(a, b)`` pair from ``sample_trunc_bvn`` with mean ``[r, r]`` (region 1
    uses 1.0, region 2 uses 2.0, ...) and bounds ``[LOW, HIGH]``; every other
    region receives ``BASELINE`` plus jitter.

    All random draws made directly in this function use the module-level
    seeded ``RNG``. (Previously the region choice and the jitter used the
    unseeded ``np.random`` global state, so results were not repeatable.)

    Parameters
    ----------
    sd2 : float
        Variance of each coordinate.
    rho : float
        Correlation between the two coordinates.
    baseline_jitter : float
        With ``jitter_mode == "random"``: half-width of the symmetric uniform
        perturbation added to ``BASELINE``. Otherwise: fixed offset added to
        ``BASELINE``.
    jitter_mode : str
        ``"random"`` for uniform jitter; any other value selects the fixed
        offset.
    n_active_regions : int
        Number of active regions per row, between 1 and 5.

    Returns
    -------
    pd.DataFrame
        ``5 * REPLICATES_PER_REGION`` rows with columns ``a1, b1, ..., a5, b5``.

    Raises
    ------
    ValueError
        If ``n_active_regions`` is outside ``[1, 5]``.
    """
    if not (1 <= n_active_regions <= 5):
        raise ValueError("n_active_regions must be between 1 and 5.")

    # sd2 is already a variance, so it goes straight onto the diagonal.
    cov = np.array([[sd2,       rho * sd2],
                    [rho * sd2, sd2      ]])

    # A symmetric interval is the same for +/- jitter; Generator.uniform
    # needs low <= high, so use the magnitude.
    half_width = abs(baseline_jitter)

    all_regions = list(range(1, 6))
    rows = []

    for region in all_regions:
        others = [r for r in all_regions if r != region]

        for _ in range(REPLICATES_PER_REGION):
            # The current region is always active; top up with random others.
            active_regions = [region]
            if n_active_regions > 1:
                extra = RNG.choice(others, size=n_active_regions - 1, replace=False)
                active_regions.extend(int(r) for r in extra)

            # One independent truncated draw per active region, with the
            # mean determined by the region number.
            samples = {}
            for r_act in active_regions:
                mu_r = float(r_act)
                mean = np.array([mu_r, mu_r])
                samples[r_act] = sample_trunc_bvn(mean, cov, LOW, HIGH)

            # Assemble a1, b1, ..., a5, b5 for this row.
            params = []
            for r in all_regions:
                if r in samples:
                    a_r, b_r = samples[r]
                    params.extend([a_r, b_r])
                elif jitter_mode == "random":
                    eps_a = RNG.uniform(-half_width, half_width)
                    eps_b = RNG.uniform(-half_width, half_width)
                    params.extend([BASELINE + eps_a, BASELINE + eps_b])
                else:
                    params.extend([BASELINE + baseline_jitter,
                                   BASELINE + baseline_jitter])

            rows.append(params)

    return pd.DataFrame(
        rows,
        columns=[f"{ab}{r}" for r in range(1, 6) for ab in ("a", "b")],
    )

In [None]:
# With all five regions active, no region takes the baseline/jitter branch.
df = build_datasets_for_diag(
    baseline_jitter=0,
    n_active_regions=5,
)

In [None]:
# Write the values of df to X_slant.csv, comma-separated, four decimals each.
np.savetxt("X_slant.csv", df, delimiter=",", fmt="%.4f")

In [None]:
# Every region active and centred on 2; with five active regions no region
# takes the baseline/jitter branch.
df = build_datasets_for_centre(
    2,
    sd2=0.05,
    rho=0.3,
    baseline_jitter=0,
    n_active_regions=5,
)

In [None]:
# Write the values of df to X_2_g_5.csv, comma-separated, four decimals each.
np.savetxt("X_2_g_5.csv", df, delimiter=",", fmt="%.4f")

In [None]:
# Show the current frame as this cell's output.
df

In [None]:
# One-row frame with ten columns (col_0 ... col_9), every entry equal to 5.
df = pd.DataFrame([[5] * 10], columns=[f'col_{i}' for i in range(10)])

# Build Disease Dataset

In [4]:
def sample_trunc_bvn(mean: np.ndarray,
                     cov: np.ndarray,
                     low: float,
                     high: float,
                     rng: "np.random.Generator | None" = None) -> np.ndarray:
    """Draw one point from N(mean, cov) by rejection until it lies in [low, high]^2.

    Parameters
    ----------
    mean, cov
        Mean vector and covariance matrix passed to
        ``Generator.multivariate_normal``.
    low, high
        Inclusive bounds applied to the first two coordinates of each draw.
    rng
        Generator to draw from. Optional so that this definition, which
        replaces the earlier four-argument ``sample_trunc_bvn`` when the
        notebook runs top to bottom, still accepts calls that pass no
        generator. When omitted, the notebook-level ``RNG`` is used if it
        exists, otherwise a fresh unseeded generator.

    Returns
    -------
    np.ndarray
        The first accepted draw.

    Notes
    -----
    There is no cap on the number of attempts; a box with negligible
    probability mass will make this loop run for a very long time.
    """
    if rng is None:
        try:
            rng = RNG  # shared seeded generator defined in an earlier cell
        except NameError:
            rng = np.random.default_rng()

    while True:
        x = rng.multivariate_normal(mean, cov)
        if low <= x[0] <= high and low <= x[1] <= high:
            return x


# ---------------------------------------------------------------------
@dataclass
class DiseaseConfig:
    """Settings that describe how one disease scenario is simulated.

    Active regions are sampled from a truncated bivariate normal; non-active
    regions are placed near ``baseline``.
    """

    name: str
    # Centre value of the active (lesioned) regions: Ca and Cb are sampled
    # around m_active.
    m_active: float
    # Spread of the active regions; the variance used is sd_active^2.
    sd_active: float
    # Population-level correlation between Ca and Cb in active regions
    # (not a posterior correlation).
    rho_active: float

    # Reference value for non-active (remote / approximately healthy) regions.
    baseline: float = 1.0
    baseline_jitter: float = 0.1  # radius of the random perturbation around baseline for non-active regions

    # Truncation interval.
    low: float = 0.3
    high: float = 3.0

    # Pattern: focal = focal lesion; diffuse = diffuse lesion (all regions active).
    pattern: str = "focal"
    # Range for the number of active regions per "patient" (only meaningful
    # for the focal pattern).
    n_active_regions_range: Tuple[int, int] = (1, 1)


# Default settings for a few typical disease scenarios. The numbers are a
# "reasonable toy model", not a gold standard.
DISEASE_CONFIGS: Dict[str, DiseaseConfig] = {
    # Healthy: Ca, Cb ≈ 1 in every region, with slight individual variation.
    "healthy": DiseaseConfig(
        name="healthy",
        m_active=1.0,
        sd_active=0.05,
        rho_active=0.0,
        baseline=1.0,
        baseline_jitter=0.05,
        low=0.5,
        high=1.5,
        pattern="diffuse",
        n_active_regions_range=(5, 5),
    ),

    # MI: focal stiffening; 1–2 regions with Ca/Cb clearly > 1, the other
    # regions close to 1.
    "MI": DiseaseConfig(
        name="MI",
        m_active=2.0,      # lesion centre at 2x stiffness / nonlinearity
        sd_active=0.2,
        rho_active=0.2,    # Ca and Cb move in the same direction at population level
        baseline=1.0,
        baseline_jitter=0.1,
        low=0.5,
        high=3.0,
        pattern="focal",
        n_active_regions_range=(1, 2),
    ),

    # HFpEF: global passive stiffening, moderately elevated.
    "HFpEF": DiseaseConfig(
        name="HFpEF",
        m_active=1.5,
        sd_active=0.15,
        rho_active=0.3,
        baseline=1.0,
        baseline_jitter=0.05,
        low=0.7,
        high=2.5,
        pattern="diffuse",
        n_active_regions_range=(5, 5),
    ),

    # HCM: stiffer overall, with some variation between regions.
    "HCM": DiseaseConfig(
        name="HCM",
        m_active=1.7,
        sd_active=0.2,
        rho_active=0.3,
        baseline=1.0,
        baseline_jitter=0.1,
        low=0.7,
        high=3.0,
        pattern="diffuse",
        n_active_regions_range=(5, 5),
    ),

    # DCM: softer overall / near normal (only a "softening" prototype is
    # given here).
    "DCM": DiseaseConfig(
        name="DCM",
        m_active=0.8,
        sd_active=0.1,
        rho_active=0.2,
        baseline=1.0,
        baseline_jitter=0.05,
        low=0.3,
        high=1.5,
        pattern="diffuse",
        n_active_regions_range=(5, 5),
    ),

    # Amyloidosis / restrictive: very stiff overall.
    "Amyloidosis": DiseaseConfig(
        name="Amyloidosis",
        m_active=2.5,
        sd_active=0.25,
        rho_active=0.3,
        baseline=1.0,
        baseline_jitter=0.05,
        low=0.7,
        high=4.0,
        pattern="diffuse",
        n_active_regions_range=(5, 5),
    ),
}


# ---------------------------------------------------------------------
# 3. Generating Ca/Cb for a single "patient"
# ---------------------------------------------------------------------
def simulate_one_patient(
    cfg: DiseaseConfig,
    rng: np.random.Generator,
    n_regions: int = 5,
) -> List[float]:
    """Simulate one patient's parameters ``(a1, b1, ..., a_n, b_n)``.

    With ``cfg.pattern == "diffuse"`` every region is active. Any other
    pattern is treated as focal: the number of active regions is drawn
    uniformly from ``cfg.n_active_regions_range`` (inclusive). Active regions
    get a truncated bivariate-normal draw; the rest sit at ``cfg.baseline``,
    optionally perturbed by a point drawn uniformly from a disc of radius
    ``cfg.baseline_jitter`` and clipped to ``[cfg.low, cfg.high]``.

    Returns a list of length ``2 * n_regions``.

    Raises ``ValueError`` when, for a non-diffuse pattern,
    ``n_active_regions_range`` does not lie within ``[1, n_regions]``.
    """
    # How many regions are active.
    if cfg.pattern != "diffuse":
        fewest, most = cfg.n_active_regions_range
        if not (1 <= fewest <= most <= n_regions):
            raise ValueError("n_active_regions_range must be within [1, n_regions].")
        n_active = rng.integers(fewest, most + 1)
    else:
        n_active = n_regions

    # Which regions are active.
    region_ids = list(range(1, n_regions + 1))
    lesioned = set(rng.choice(region_ids, size=n_active, replace=False))

    # Distribution shared by all active regions.
    variance = cfg.sd_active ** 2
    covariance = cfg.rho_active * variance
    cov = np.array([
        [variance, covariance],
        [covariance, variance],
    ])
    mean = np.array([cfg.m_active, cfg.m_active], dtype=float)

    params: List[float] = []
    for region in region_ids:
        if region in lesioned:
            # Lesioned region: truncated bivariate normal.
            ca, cb = sample_trunc_bvn(
                mean=mean,
                cov=cov,
                low=cfg.low,
                high=cfg.high,
                rng=rng,
            )
        elif cfg.baseline_jitter > 0.0:
            # Angle first, then radius; sqrt makes the point uniform on the disc.
            angle = rng.uniform(0.0, 2.0 * np.pi)
            radius = cfg.baseline_jitter * np.sqrt(rng.uniform(0.0, 1.0))
            # Clip so the perturbed value stays inside [low, high].
            ca = float(np.clip(cfg.baseline + radius * np.cos(angle), cfg.low, cfg.high))
            cb = float(np.clip(cfg.baseline + radius * np.sin(angle), cfg.low, cfg.high))
        else:
            ca = cfg.baseline
            cb = cfg.baseline

        params += [ca, cb]

    return params


# ---------------------------------------------------------------------
# 4. Main entry point: build the dataset for one disease
# ---------------------------------------------------------------------
def build_disease_dataset(
    disease_type: str,
    n_patients: int,
    seed: int = 0,
    n_regions: int = 5,
    disease_configs: Dict[str, DiseaseConfig] = DISEASE_CONFIGS,
) -> pd.DataFrame:
    """
    Build an (n_patients x (2*n_regions)) DataFrame.

    Each row is one "patient" and holds
        a1, b1, a2, b2, ..., a_n, b_n
    where a_r = C_a^(r) and b_r = C_b^(r).

    Parameters
    ----------
    disease_type : str
        Key into ``disease_configs``.
    n_patients : int
        Number of rows to simulate.
    seed : int
        Seed for the generator created for this call.
    n_regions : int
        Number of regions per patient.
    disease_configs : dict
        Mapping from disease type to DiseaseConfig.

    Returns
    -------
    df : pd.DataFrame
        Columns are named "a1", "b1", ..., one a/b pair per region.

    Raises
    ------
    ValueError
        If ``disease_type`` is not a key of ``disease_configs``.
    """
    is_known = disease_type in disease_configs
    if not is_known:
        raise ValueError(f"Unknown disease_type: {disease_type}. "
                         f"Available: {list(disease_configs.keys())}")

    cfg = disease_configs[disease_type]
    rng = np.random.default_rng(seed)

    # Patients are simulated one after another from the same generator.
    rows: List[List[float]] = [
        simulate_one_patient(cfg=cfg, rng=rng, n_regions=n_regions)
        for _ in range(n_patients)
    ]

    columns = []
    for r in range(1, n_regions + 1):
        for ab in ("a", "b"):
            columns.append(f"{ab}{r}")

    return pd.DataFrame(rows, columns=columns)


In [None]:
# 100 healthy samples
df_healthy = build_disease_dataset("healthy", n_patients=100, seed=42)

# 100 MI patients (focal stiffening)
df_mi = build_disease_dataset("MI", n_patients=100, seed=1)

# 50 HFpEF patients (diffuse stiffening)
df_hfpef = build_disease_dataset("HFpEF", n_patients=50, seed=2)

# 50 HCM patients
df_hcm = build_disease_dataset("HCM", n_patients=50, seed=3)

# 50 DCM patients (softer overall)
df_dcm = build_disease_dataset("DCM", n_patients=50, seed=4)

# 30 cardiac amyloidosis patients (extremely stiff)
df_amylo = build_disease_dataset("Amyloidosis", n_patients=30, seed=5)


# MI

    "MI": DiseaseConfig(
        name="MI",
        m_active=2.0,
        sd_active=0.2,
        rho_active=0.2,
        baseline=1.0,
        baseline_jitter=0.1,
        low=0.5,
        high=3.0,
        pattern="focal",
        n_active_regions_range=(1, 2),
    ),

In [16]:
# One active region per row, centred on 3; the other regions sit near
# BASELINE with random jitter of half-width 0.1.
df_mi = build_datasets_for_centre(
    m=3,
    sd2=0.2,
    rho=0.2,
    baseline_jitter=0.1,
    n_active_regions=1,
)

In [14]:
# Show df_mi as this cell's output.
df_mi

Unnamed: 0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5
0,1.515314,2.023599,1.056159,0.997635,1.094127,0.935871,0.980490,0.908968,0.977059,1.089605
1,1.781670,2.312705,1.015158,1.014542,0.949623,1.091184,0.905957,1.088069,1.034865,1.056809
2,2.174636,2.803485,0.953022,0.907976,0.931506,0.961221,0.921718,0.932838,0.902219,0.975924
3,2.579799,1.350832,0.952154,0.999491,0.912295,0.979367,1.098271,1.013769,1.081812,0.995708
4,2.067463,2.395785,1.083647,1.041460,0.914236,1.099916,0.900145,1.056847,1.057496,0.945862
...,...,...,...,...,...,...,...,...,...,...
95,0.977063,1.018924,0.992031,1.014494,0.900340,1.071970,1.052054,1.076149,3.153267,2.763054
96,0.947985,1.021663,0.958647,0.925832,1.001588,0.970658,1.063974,1.049306,1.297256,2.529010
97,1.058345,0.952816,0.918285,1.065553,0.972868,0.945449,0.969984,1.006673,2.697300,2.155262
98,0.904130,0.913652,0.976051,0.956889,0.997545,1.056344,0.938536,1.092387,2.090425,2.526521


In [17]:
# Write the values of df_mi to X_MI_3.csv, comma-separated, four decimals each.
np.savetxt("X_MI_3.csv", df_mi, delimiter=",", fmt="%.4f")

In [None]:
# Work on a plain ndarray. np.asarray accepts a DataFrame as well as an
# ndarray, so this cell can be re-run after df has already been converted
# (df.to_numpy() would fail on the second run).
df = np.asarray(df)
m, n = df.shape

# Expanded layout: 34 columns, filled below from the 10 source columns.
new_data = np.zeros((m, 34))

# Source column index -> every column of the expanded layout that receives
# a copy of it.
mapping = {
    0: [4, 16],
    1: [5, 17],
    2: [10, 22],
    3: [11, 23],
    4: [8, 6, 20, 18],
    5: [9, 7, 21, 19],
    6: [2, 0, 14, 12],
    7: [3, 1, 15, 13],
    8: [24, 26, 28, 30, 32],
    9: [25, 27, 29, 31, 33]
}

for orig_col, new_cols in mapping.items():
    # Broadcast the (m, 1) source column across all of its target columns.
    new_data[:, new_cols] = df[:, [orig_col]]

In [None]:
# Save the expanded matrix as text with four decimals per value; no
# delimiter argument is passed. The commented line is an alternative
# output path kept for reference.
# np.savetxt("LocalDisease/X_3.txt", new_data, fmt='%0.4f')
np.savetxt("X_5_max.txt", new_data, fmt='%0.4f')

In [None]:
# Load the space-delimited expanded matrix (no header row). The mapping
# below indexes columns up to 33, so at least 34 columns are expected.
X_train = pd.read_csv('X_1_1.txt', header=None, delimiter=' ').values

m = len(X_train)

# Compact column index -> the expanded columns that belong to it.
mapping = {
    0: [4, 16],
    1: [5, 17],
    2: [10, 22],
    3: [11, 23],
    4: [8, 6, 20, 18],
    5: [9, 7, 21, 19],
    6: [2, 0, 14, 12],
    7: [3, 1, 15, 13],
    8: [24, 26, 28, 30, 32],
    9: [25, 27, 29, 31, 33]
}

# Collapse every group of expanded columns into one compact column by
# taking the row-wise mean.
X_all_recovered = np.zeros((m, len(mapping)))
for orig_col, new_cols in mapping.items():
    X_all_recovered[:, orig_col] = np.mean(X_train[:, new_cols], axis=1)

# Round to four decimals and keep the result under the name X_train.
X_train = np.round(X_all_recovered, 4)

In [None]:
# Write X_train to X_1_1.csv, comma-separated, four decimals each.
np.savetxt("X_1_1.csv", X_train, delimiter=",", fmt="%.4f")

# Check

In [None]:
def _read_headerless_csv(path):
    """Load a comma-separated file that has no header row as a NumPy array."""
    return pd.read_csv(path, header=None, delimiter=',').values


Y_train_pca = _read_headerless_csv('Y_train_std_pca.csv')
Y_test_pca = _read_headerless_csv('Y_test_std_pca.csv')

Y_data_1_1_pca = _read_headerless_csv('Y_data_1_1_pca.csv')
Y_data_1_5_pca = _read_headerless_csv('Y_data_1_5_pca.csv')
Y_data_2_pca = _read_headerless_csv('Y_data_2_pca.csv')

Y_edge_std_pca = _read_headerless_csv('Y_edge_std_pca.csv')

Y_data_1_2_jitter_B_2_5_pca = _read_headerless_csv('Y_data_1_2_jitter_B_2_5_pca.csv')

In [None]:
# Per-column minimum and maximum of Y_train_pca, ignoring NaNs.
col_min = np.nanmin(Y_train_pca, axis=0)
col_max = np.nanmax(Y_train_pca, axis=0)

# For every row of Y_data_1_1_pca, count how many columns fall inside the
# [col_min, col_max] range; shown as the cell's output.
np.logical_and(Y_data_1_1_pca >= col_min, Y_data_1_1_pca <= col_max).sum(axis=1)

In [None]:
def _load_matrix(path):
    """Load a comma-separated file that has no header row as a NumPy array."""
    return pd.read_csv(path, header=None, delimiter=',').values


X_train = _load_matrix('../Data/X_train.csv')
X_test = _load_matrix('../Data/X_test.csv')
X_edge = _load_matrix('../Data/X_edge.csv')

In [None]:
# Stack the training rows on top of the edge rows.
X_all = np.vstack((X_train, X_edge))