# Set up

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# RealCase

In [5]:

# Input CSV file for each slice, keyed by slice name (paths are relative to the working directory).
SLICE_FILES = {
    "slice1": Path("slice1.csv"),
    "slice2": Path("slice2.csv"),
    "slice3": Path("slice3.csv"),
    "slice4": Path("slice4.csv"),
    "slice5": Path("slice5.csv"),
}

# Output files
OUT_TABLE = Path("AHA17_peak_strains_subset_2_to_-2_table.csv")  # per-segment table (one row per segment)
OUT_ONE_ROW_FULL = Path("AHA17_one_row_full_labels_cir_then_rad_custom_order.csv")  # single row (full-name labels)
OUT_ONE_ROW_ABBR = Path("AHA17_one_row_abbr_labels_cir_then_rad_custom_order.csv")  # single row (abbreviated labels)

# Column names use full names (True) or abbreviations (False)
USE_FULL_LABELS = True

# User-specified order of the 16 segments (segments 1–16 only)
CUSTOM_ORDER_ABBR = [
    "Basal-InfSept", "Basal-AntSept", "Basal-Ant", "Basal-AntLat", "Basal-InfLat", "Basal-Inf",
    "Mid-InfSept", "Mid-AntSept", "Mid-Ant", "Mid-AntLat", "Mid-InfLat", "Mid-Inf",
    "Apical-Septal", "Apical-Anterior", "Apical-Lateral", "Apical-Inferior"
]

# =========================
# Helper functions
# =========================
def subset_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without its first and last rows, re-indexed from 0.

    When ``df`` has fewer than 3 rows there is no interior row to keep, so
    an empty frame with the same columns is returned instead.
    """
    row_count = len(df)
    if row_count >= 3:
        interior = df.iloc[1:row_count - 1]
        return interior.reset_index(drop=True)
    return df.iloc[:0].copy()

def pos_name(col: str) -> str:
    """Turn a column name such as 'cirAntSeptTotal' / 'radInfLatTotal' into its position name ('AntSept' / 'InfLat').

    The first three characters (the 'cir' or 'rad' prefix) are always dropped;
    a trailing 'Total' is dropped when present.
    """
    suffix = "Total"
    stem = col[3:]
    return stem[:-len(suffix)] if stem.endswith(suffix) else stem

def per_slice_extrema_on_subset(df: pd.DataFrame):
    """
    Compute per-column extrema for one slice on the trimmed subset
    (first and last rows removed by ``subset_df``):
    - columns whose name starts with 'cir': minimum
    - columns whose name starts with 'rad': maximum
    Returns two dicts, (cir_min_map, rad_max_map), keyed by the position name
    produced by ``pos_name`` (e.g. 'Ant', 'InfSept'). Every value is NaN when
    the trimmed subset is empty.
    """
    trimmed = subset_df(df)
    has_rows = not trimmed.empty
    nan = float('nan')

    cir_min = {}
    rad_max = {}
    for column in trimmed.columns:
        if column.startswith("cir"):
            cir_min[pos_name(column)] = trimmed[column].min(skipna=True) if has_rows else nan
        elif column.startswith("rad"):
            rad_max[pos_name(column)] = trimmed[column].max(skipna=True) if has_rows else nan
    return cir_min, rad_max

def avg_maps(m1: dict, m2: dict) -> dict:
    """Average two mappings key by key, ignoring NaN.

    A key present in only one mapping is treated as NaN in the other, so the
    result for that key is the single available value (or NaN if none).
    """
    nan = float('nan')
    all_keys = set(m1) | set(m2)
    return {
        key: pd.Series([m1.get(key, nan), m2.get(key, nan)]).mean(skipna=True)
        for key in all_keys
    }

def mean2(a, b):
    """Return the mean of ``a`` and ``b``, ignoring NaN (NaN if both are NaN)."""
    pair = pd.Series([a, b])
    return pair.mean(skipna=True)

# Abbreviation → full-name mapping for positions within a ring
ABBR_TO_FULL = {
    "Ant": "Anterior",
    "AntSept": "Anteroseptal",
    "InfSept": "Inferoseptal",
    "Inf": "Inferior",
    "InfLat": "Inferolateral",
    "AntLat": "Anterolateral",
    # Apical quadrants (already full names)
    "Septal": "Septal",
    "Anterior": "Anterior",
    "Lateral": "Lateral",
    "Inferior": "Inferior",
}

def to_full_label(label_abbr: str) -> str:
    """
    Expand the position part of a '<ring>-<position>' label to its full name.

    'Basal-InfSept'  → 'Basal-Inferoseptal'
    'Apical-Septal'  → unchanged
    'Apex (missing)' → 'Apex' (any label starting with 'Apex')
    Labels without a '-' are returned stripped; unknown positions are kept as-is.
    """
    text = str(label_abbr).strip()
    if text.startswith("Apex"):
        return "Apex"
    # Split on the first '-' only, so a position containing '-' stays intact.
    ring, dash, position = text.partition("-")
    if not dash:
        return text
    return f"{ring}-{ABBR_TO_FULL.get(position, position)}"

# =========================
# Computation pipeline
# =========================
# 1) Load the data and compute per-slice extrema on the trimmed subset
s1 = pd.read_csv(SLICE_FILES["slice1"])
s2 = pd.read_csv(SLICE_FILES["slice2"])
s3 = pd.read_csv(SLICE_FILES["slice3"])
s4 = pd.read_csv(SLICE_FILES["slice4"])
s5 = pd.read_csv(SLICE_FILES["slice5"])

cir1, rad1 = per_slice_extrema_on_subset(s1)
cir2, rad2 = per_slice_extrema_on_subset(s2)
cir3, rad3 = per_slice_extrema_on_subset(s3)
cir4, rad4 = per_slice_extrema_on_subset(s4)
cir5_map, rad5_map = per_slice_extrema_on_subset(s5)

# 2) Average matching positions of (slice1, slice2) and of (slice3, slice4)
cir12 = avg_maps(cir1, cir2)
rad12 = avg_maps(rad1, rad2)
cir34 = avg_maps(cir3, cir4)
rad34 = avg_maps(rad3, rad4)

# AHA 6-sector order (standard order of positions within a ring)
AHA6 = ["Ant", "AntSept", "InfSept", "Inf", "InfLat", "AntLat"]

# 3) Assemble segments 1–12 (Basal from slices 1–2, Mid from slices 3–4)
rows = []
for i, pos in enumerate(AHA6, start=1):
    rows.append({
        "segment": i,
        "label_abbr": f"Basal-{pos}",
        "cir_min": cir12.get(pos, float('nan')),
        "rad_max": rad12.get(pos, float('nan'))
    })
for i, pos in enumerate(AHA6, start=7):
    rows.append({
        "segment": i,
        "label_abbr": f"Mid-{pos}",
        "cir_min": cir34.get(pos, float('nan')),
        "rad_max": rad34.get(pos, float('nan'))
    })

# 4) Assemble segments 13–16 (Apical, from slice5; positions merged per the rules below)
APICAL_RULES = [
    (13, "Apical-Anterior",  ("Ant",)),                      # 13=Ant
    (14, "Apical-Septal",    ("AntSept", "InfSept")),        # 14=mean(AntSept, InfSept)
    (15, "Apical-Inferior",  ("Inf",)),                      # 15=Inf
    (16, "Apical-Lateral",   ("AntLat", "InfLat")),          # 16=mean(AntLat, InfLat)
]
for seg, label_full, pos_tuple in APICAL_RULES:
    if len(pos_tuple) == 1:
        p = pos_tuple[0]
        cir_v = cir5_map.get(p, float('nan'))
        rad_v = rad5_map.get(p, float('nan'))
    else:
        cir_v = mean2(cir5_map.get(pos_tuple[0], float('nan')),
                      cir5_map.get(pos_tuple[1], float('nan')))
        rad_v = mean2(rad5_map.get(pos_tuple[0], float('nan')),
                      rad5_map.get(pos_tuple[1], float('nan')))
    # label_abbr holds 'Apical-Anterior' etc. here (the position word is already the full name)
    rows.append({
        "segment": seg,
        "label_abbr": label_full,  # used as the abbreviated label (Apical-* is already a full name)
        "cir_min": cir_v,
        "rad_max": rad_v
    })

# 5) Placeholder row for segment 17 (no data; both values are NaN)
rows.append({
    "segment": 17,
    "label_abbr": "Apex (missing)",
    "cir_min": float('nan'),
    "rad_max": float('nan')
})

table = pd.DataFrame(rows).sort_values("segment").reset_index(drop=True)

# Redundant extra column: full-name label
table["label_full"] = table["label_abbr"].map(to_full_label)

# Save the per-segment table (for inspection)
table[["segment", "label_abbr", "label_full", "cir_min", "rad_max"]].to_csv(OUT_TABLE, index=False)

# 6) Build the single-row output following the user-specified 16-segment order (segments 1–16 only)
#    - CIR block first (16 columns in order), then the RAD block (16 columns in the same order)
#    - Column names are either full names or abbreviations
order_abbr_16 = CUSTOM_ORDER_ABBR[:]  # copy
# Lookup table: abbreviated label → values (taken from table); the segment-17 placeholder is excluded
# Note: apical labels in table already use full words (Apical-Anterior etc.), matching the Apical-* entries of CUSTOM_ORDER_ABBR
lookup_abbr_to_vals = {
    lab: table.loc[table["label_abbr"] == lab, ["cir_min", "rad_max"]].iloc[0].to_dict()
    for lab in table["label_abbr"].tolist()
    if lab != "Apex (missing)"
}

# Build the column-name sequence (full or abbreviated labels, chosen by USE_FULL_LABELS)
def col_name_base(label_abbr: str) -> str:
    """Return the label to use as a column-name stem: the full-name form when USE_FULL_LABELS is true, else the label unchanged."""
    if USE_FULL_LABELS:
        return to_full_label(label_abbr)
    return label_abbr

# Column names: '<label>_cir' for the CIR block, '<label>_rad' for the RAD block
cir_cols = [f"{col_name_base(lab)}_cir" for lab in order_abbr_16]
rad_cols = [f"{col_name_base(lab)}_rad" for lab in order_abbr_16]

# Matching values, strictly in the same order; a label missing from the lookup yields NaN
cir_vals = [lookup_abbr_to_vals.get(lab, {"cir_min": float('nan')})["cir_min"]
            for lab in order_abbr_16]
rad_vals = [lookup_abbr_to_vals.get(lab, {"rad_max": float('nan')})["rad_max"]
            for lab in order_abbr_16]

one_row = pd.DataFrame([cir_vals + rad_vals], columns=cir_cols + rad_cols)

# Write the output file (chosen by label style)
if USE_FULL_LABELS:
    one_row.to_csv(OUT_ONE_ROW_FULL, index=False)
    print(f"Saved single-row (full labels) to: {OUT_ONE_ROW_FULL}")
else:
    one_row.to_csv(OUT_ONE_ROW_ABBR, index=False)
    print(f"Saved single-row (abbr labels) to: {OUT_ONE_ROW_ABBR}")


Saved single-row (full labels) to: AHA17_one_row_full_labels_cir_then_rad_custom_order.csv


In [None]:
# -*- coding: utf-8 -*-
from pathlib import Path
import pandas as pd

# =========================
# Configuration
# =========================
# Input CSV file for each slice, keyed by slice name.
SLICE_FILES = {
    "slice1": Path("/mnt/data/slice1.csv"),
    "slice2": Path("/mnt/data/slice2.csv"),
    "slice3": Path("/mnt/data/slice3.csv"),
    "slice4": Path("/mnt/data/slice4.csv"),
    "slice5": Path("/mnt/data/slice5.csv"),
}

# Output files
# NOTE(review): the table file name says "peak_strains", but this cell writes mean values
# (cir_mean / rad_mean) to it — confirm the name is intended.
OUT_TABLE = Path("/mnt/data/AHA17_peak_strains_subset_2_to_-2_table.csv")  # per-segment table (one row per segment)
OUT_ONE_ROW_FULL = Path("/mnt/data/AHA17_one_row_full_labels_cir_then_rad_custom_order.csv")  # single row (full-name labels)
OUT_ONE_ROW_ABBR = Path("/mnt/data/AHA17_one_row_abbr_labels_cir_then_rad_custom_order.csv")  # single row (abbreviated labels)

# Column names use full names (True) or abbreviations (False)
USE_FULL_LABELS = True

# User-specified order of the 16 segments (segments 1–16 only)
CUSTOM_ORDER_ABBR = [
    "Basal-InfSept", "Basal-AntSept", "Basal-Ant", "Basal-AntLat", "Basal-InfLat", "Basal-Inf",
    "Mid-InfSept", "Mid-AntSept", "Mid-Ant", "Mid-AntLat", "Mid-InfLat", "Mid-Inf",
    "Apical-Septal", "Apical-Anterior", "Apical-Lateral", "Apical-Inferior"
]

# =========================
# Helper functions
# =========================
def subset_df(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the second through second-to-last rows (index reset to 0..n-1).

    A frame with fewer than 3 rows has no such rows; an empty frame with the
    same columns is returned for it.
    """
    if len(df) >= 3:
        trimmed = df.iloc[1:-1]
        return trimmed.reset_index(drop=True)
    return df.iloc[:0].copy()

def pos_name(col: str) -> str:
    """Normalize a column name like 'cirAntSeptTotal' / 'radInfLatTotal' to the position name 'AntSept' / 'InfLat'.

    Drops the first three characters (the 'cir' / 'rad' prefix) and, when
    present, a trailing 'Total'.
    """
    stem = col[3:]
    if not stem.endswith("Total"):
        return stem
    return stem[:-len("Total")]

def per_slice_means_on_subset(df: pd.DataFrame):
    """
    Compute per-column **means** for one slice on the trimmed subset
    (first and last rows removed by ``subset_df``), using the same rows for every column:
    - columns whose name starts with 'cir': mean
    - columns whose name starts with 'rad': mean
    Returns two dicts, (cir_mean_map, rad_mean_map), keyed by the position name
    produced by ``pos_name`` (e.g. 'Ant', 'InfSept'). Every value is NaN when
    the trimmed subset is empty.
    """
    trimmed = subset_df(df)
    no_rows = trimmed.empty

    def _means_for(prefix):
        # One entry per column carrying the prefix; NaN-skipping mean over the trimmed rows.
        selected = [name for name in trimmed.columns if name.startswith(prefix)]
        if no_rows:
            return {pos_name(name): float('nan') for name in selected}
        return {pos_name(name): trimmed[name].mean(skipna=True) for name in selected}

    cir_mean = _means_for("cir")
    rad_mean = _means_for("rad")
    return cir_mean, rad_mean

def avg_maps(m1: dict, m2: dict) -> dict:
    """Return the key-wise mean of two mappings, skipping NaN.

    Keys missing from one mapping count as NaN there, so the other mapping's
    value is used alone; the result is NaN only when no value is available.
    """
    missing = float('nan')
    averaged = {}
    for name in set(m1) | set(m2):
        values = pd.Series([m1.get(name, missing), m2.get(name, missing)])
        averaged[name] = values.mean(skipna=True)
    return averaged

def mean2(a, b):
    """Mean of the two values with NaN skipped; NaN when both are NaN."""
    values = pd.Series([a, b])
    return values.mean(skipna=True)

# Abbreviation → full-name mapping for positions within a ring
ABBR_TO_FULL = {
    "Ant": "Anterior",
    "AntSept": "Anteroseptal",
    "InfSept": "Inferoseptal",
    "Inf": "Inferior",
    "InfLat": "Inferolateral",
    "AntLat": "Anterolateral",
    # Apical quadrants (already full names)
    "Septal": "Septal",
    "Anterior": "Anterior",
    "Lateral": "Lateral",
    "Inferior": "Inferior",
}

def to_full_label(label_abbr: str) -> str:
    """
    Convert an abbreviated '<ring>-<position>' label to its full-name form.

    'Basal-InfSept'  → 'Basal-Inferoseptal'
    'Apical-Septal'  → unchanged
    'Apex (missing)' → 'Apex' (any label starting with 'Apex')
    Labels without a '-' are returned stripped; unknown positions are kept as-is.
    """
    cleaned = str(label_abbr).strip()
    if cleaned.startswith("Apex"):
        return "Apex"
    # Only the first '-' separates ring from position.
    ring, separator, pos = cleaned.partition("-")
    if separator:
        return f"{ring}-{ABBR_TO_FULL.get(pos, pos)}"
    return cleaned

# =========================
# Computation pipeline
# =========================
# 1) Load the data and compute per-slice means on the trimmed subset (no extrema)
s1 = pd.read_csv(SLICE_FILES["slice1"])
s2 = pd.read_csv(SLICE_FILES["slice2"])
s3 = pd.read_csv(SLICE_FILES["slice3"])
s4 = pd.read_csv(SLICE_FILES["slice4"])
s5 = pd.read_csv(SLICE_FILES["slice5"])

cir1, rad1 = per_slice_means_on_subset(s1)
cir2, rad2 = per_slice_means_on_subset(s2)
cir3, rad3 = per_slice_means_on_subset(s3)
cir4, rad4 = per_slice_means_on_subset(s4)
cir5_map, rad5_map = per_slice_means_on_subset(s5)

# 2) Average matching positions of (slice1, slice2) and of (slice3, slice4)
cir12 = avg_maps(cir1, cir2)
rad12 = avg_maps(rad1, rad2)
cir34 = avg_maps(cir3, cir4)
rad34 = avg_maps(rad3, rad4)

# AHA 6-sector order (standard order of positions within a ring)
AHA6 = ["Ant", "AntSept", "InfSept", "Inf", "InfLat", "AntLat"]

# 3) Assemble segments 1–12 (Basal from slices 1–2, Mid from slices 3–4)
rows = []
for i, pos in enumerate(AHA6, start=1):
    rows.append({
        "segment": i,
        "label_abbr": f"Basal-{pos}",
        "cir_mean": cir12.get(pos, float('nan')),
        "rad_mean": rad12.get(pos, float('nan'))
    })
for i, pos in enumerate(AHA6, start=7):
    rows.append({
        "segment": i,
        "label_abbr": f"Mid-{pos}",
        "cir_mean": cir34.get(pos, float('nan')),
        "rad_mean": rad34.get(pos, float('nan'))
    })

# 4) Assemble segments 13–16 (Apical, from slice5; positions merged per the rules below)
APICAL_RULES = [
    (13, "Apical-Anterior",  ("Ant",)),                      # 13=Ant
    (14, "Apical-Septal",    ("AntSept", "InfSept")),        # 14=mean(AntSept, InfSept)
    (15, "Apical-Inferior",  ("Inf",)),                      # 15=Inf
    (16, "Apical-Lateral",   ("AntLat", "InfLat")),          # 16=mean(AntLat, InfLat)
]
for seg, label_full, pos_tuple in APICAL_RULES:
    if len(pos_tuple) == 1:
        p = pos_tuple[0]
        cir_v = cir5_map.get(p, float('nan'))
        rad_v = rad5_map.get(p, float('nan'))
    else:
        cir_v = mean2(cir5_map.get(pos_tuple[0], float('nan')),
                      cir5_map.get(pos_tuple[1], float('nan')))
        rad_v = mean2(rad5_map.get(pos_tuple[0], float('nan')),
                      rad5_map.get(pos_tuple[1], float('nan')))
    rows.append({
        "segment": seg,
        "label_abbr": label_full,  # used as the abbreviated label (Apical-* is already a full name)
        "cir_mean": cir_v,
        "rad_mean": rad_v
    })

# 5) Placeholder row for segment 17 (no data; both values are NaN)
rows.append({
    "segment": 17,
    "label_abbr": "Apex (missing)",
    "cir_mean": float('nan'),
    "rad_mean": float('nan')
})

table = pd.DataFrame(rows).sort_values("segment").reset_index(drop=True)

# Redundant extra column: full-name label
table["label_full"] = table["label_abbr"].map(to_full_label)

# Save the per-segment table (for inspection)
table[["segment", "label_abbr", "label_full", "cir_mean", "rad_mean"]].to_csv(OUT_TABLE, index=False)

# 6) Build the single-row output following the user-specified 16-segment order (segments 1–16 only)
#    - CIR block first (16 columns in order), then the RAD block (16 columns in the same order)
#    - Column names are either full names or abbreviations
order_abbr_16 = CUSTOM_ORDER_ABBR[:]  # copy

# Lookup table: abbreviated label → values (taken from table); the segment-17 placeholder is excluded
lookup_abbr_to_vals = {
    lab: table.loc[table["label_abbr"] == lab, ["cir_mean", "rad_mean"]].iloc[0].to_dict()
    for lab in table["label_abbr"].tolist()
    if lab != "Apex (missing)"
}

# Build the column-name sequence (full or abbreviated labels, chosen by USE_FULL_LABELS)
def col_name_base(label_abbr: str) -> str:
    """Pick the column-name stem for a label: its full-name form if USE_FULL_LABELS is true, otherwise the label itself."""
    if not USE_FULL_LABELS:
        return label_abbr
    return to_full_label(label_abbr)

# Column names: '<label>_cir' for the CIR block, '<label>_rad' for the RAD block
cir_cols = [f"{col_name_base(lab)}_cir" for lab in order_abbr_16]
rad_cols = [f"{col_name_base(lab)}_rad" for lab in order_abbr_16]

# Matching values, strictly in the same order; a label missing from the lookup yields NaN
cir_vals = [lookup_abbr_to_vals.get(lab, {"cir_mean": float('nan')})["cir_mean"]
            for lab in order_abbr_16]
rad_vals = [lookup_abbr_to_vals.get(lab, {"rad_mean": float('nan')})["rad_mean"]
            for lab in order_abbr_16]

one_row = pd.DataFrame([cir_vals + rad_vals], columns=cir_cols + rad_cols)

# Write the output file (chosen by label style)
if USE_FULL_LABELS:
    one_row.to_csv(OUT_ONE_ROW_FULL, index=False)
    print(f"Saved single-row (full labels) to: {OUT_ONE_ROW_FULL}")
else:
    one_row.to_csv(OUT_ONE_ROW_ABBR, index=False)
    print(f"Saved single-row (abbr labels) to: {OUT_ONE_ROW_ABBR}")


In [None]:
# NOTE(review): `reordered` is not assigned in any cell visible here — confirm it is defined before this cell runs.
reordered

In [None]:
# NOTE(review): `result` is not assigned in any cell visible here — confirm it is defined before this cell runs.
result

# PCA

In [5]:
# Load the train/test arrays from headerless, comma-separated CSV files as NumPy arrays.
# NOTE(review): the "_std" suffix suggests the Y arrays were standardized upstream — confirm.
X_train = pd.read_csv('RealCase_X_train.csv', header=None, delimiter=',').values

Y_train_std = pd.read_csv('RealCase_Y_train_std.csv', header=None, delimiter=',').values

X_test = pd.read_csv('RealCase_X_test.csv', header=None, delimiter=',').values

Y_test_std = pd.read_csv('RealCase_Y_test_std.csv', header=None, delimiter=',').values

In [6]:
# Load RealCase_Y.csv (headerless, comma-separated) as a NumPy array.
RealCase = pd.read_csv('RealCase_Y.csv', header=None, delimiter=',').values

In [10]:
# Per-column minimum and maximum of Y_train_std, ignoring NaN.
col_min = np.nanmin(Y_train_std, axis=0)
col_max = np.nanmax(Y_train_std, axis=0)


# Element-wise mask: True where a RealCase value lies inside the training range of its column.
# NOTE(review): assumes RealCase is on the same scale as Y_train_std — confirm.
(RealCase >= col_min) & (RealCase <= col_max)

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False, False,  True,  True,  True, False,  True,  True,
        False,  True,  True,  True, False,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False, False,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True, False,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False, False,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True, False,  True]])

In [None]:
# Fit a PCA with all components on Y_train_std.
pca = PCA()

pca.fit(Y_train_std)

# Cumulative fraction of variance explained by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Plot cumulative explained variance against the number of components (1-based).
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--')

plt.grid()
plt.show()

In [None]:
# Smallest number of components whose cumulative explained variance reaches 0.999.
# (np.argmax returns 0 when the condition is never met, in which case this evaluates to 1.)
np.argmax(cumulative_variance >= 0.999) + 1

In [11]:
def split_and_apply_pca(train_data, test_data, RealCase, variance_threshold=0.999):
    """Reduce all but the first column of three arrays with a PCA fitted on the training data.

    The first column of each array is passed through unchanged. The remaining
    columns are projected onto the smallest number of principal components
    (fitted on ``train_data`` only) whose cumulative explained-variance ratio
    reaches ``variance_threshold``.

    Args:
        train_data: 2-D array used to fit the PCA.
        test_data: 2-D array transformed with the fitted PCA.
        RealCase: 2-D array transformed with the fitted PCA.
        variance_threshold: Cumulative explained-variance ratio to reach.
            If it is never reached (e.g. a threshold of 1.0 combined with
            floating-point round-off), all components are kept.

    Returns:
        Tuple ``(train_final, test_final, RealCase_final, n_components)``;
        each array is its first column stacked with the reduced columns, and
        ``n_components`` is the number of principal components kept.
    """
    # Separate the pass-through first column from the columns to be reduced.
    train_first_col = train_data[:, 0].reshape(-1, 1)
    test_first_col = test_data[:, 0].reshape(-1, 1)
    RealCase_first_col = RealCase[:, 0].reshape(-1, 1)

    train_remaining = train_data[:, 1:]
    test_remaining = test_data[:, 1:]
    RealCase_remaining = RealCase[:, 1:]

    # Fit a full PCA first, only to read off the explained-variance profile.
    pca = PCA()
    pca.fit(train_remaining)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Pick the number of components. np.argmax over an all-False mask returns 0,
    # which would silently select a single component, so that case is handled
    # explicitly by keeping every component.
    reached = cumulative_variance >= variance_threshold
    if reached.any():
        n_components = int(np.argmax(reached)) + 1
    else:
        n_components = len(cumulative_variance)

    # Refit with the chosen number of components and project every array.
    pca = PCA(n_components=n_components)
    train_reduced = pca.fit_transform(train_remaining)
    test_reduced = pca.transform(test_remaining)
    RealCase_reduced = pca.transform(RealCase_remaining)

    # Re-attach the first column in front of the reduced columns.
    train_final = np.hstack((train_first_col, train_reduced))
    test_final = np.hstack((test_first_col, test_reduced))
    RealCase_final = np.hstack((RealCase_first_col, RealCase_reduced))

    return train_final, test_final, RealCase_final, n_components

In [12]:
# Keep the first column of each Y array and PCA-reduce the remaining columns (PCA fitted on Y_train_std).
Y_train_pca, Y_test_pca, RealCase_pca, n_components = split_and_apply_pca(Y_train_std, Y_test_std, RealCase)

In [None]:
# Display the number of principal components kept by split_and_apply_pca.
n_components

In [13]:
# Per-column minimum and maximum of Y_train_pca, ignoring NaN.
col_min = np.nanmin(Y_train_pca, axis=0)
col_max = np.nanmax(Y_train_pca, axis=0)


# Element-wise mask: True where a RealCase_pca value lies inside the training range of its column.
(RealCase_pca >= col_min) & (RealCase_pca <= col_max)

array([[ True, False,  True,  True,  True,  True,  True,  True, False,
        False, False,  True,  True, False, False, False, False, False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        False, False,  True,  True, False, False, False, False,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        False, False, False,  True, False, False, False, False, False]])

In [None]:
# Saving of the train/test PCA outputs is currently disabled (left commented out);
# only the RealCase projection is written, as comma-separated values with 8 decimals.
# np.savetxt("RealCase_Y_train_pca.csv", Y_train_pca, delimiter=",", fmt="%.8f")

# np.savetxt("RealCase_Y_test_pca.csv", Y_test_pca, delimiter=",", fmt="%.8f")

np.savetxt("RealCase_Y_pca.csv", RealCase_pca, delimiter=",", fmt="%.8f")