# Set up

In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# RealCase

In [None]:
# Read the five slice tables, in order, into s1 … s5
s1, s2, s3, s4, s5 = map(
    pd.read_csv,
    ("slice1.csv", "slice2.csv", "slice3.csv", "slice4.csv", "slice5.csv"),
)

def per_slice_extrema(df):
    """Collect the column-wise extrema of one slice table.

    Columns whose name starts with "cir" contribute their minimum and
    columns whose name starts with "rad" contribute their maximum; NaNs are
    skipped in both cases. Every other column is ignored.

    Returns:
        (cir_min, rad_max): two dicts keyed by position name, i.e. the
        column name without its three-character prefix and without a
        trailing "Total" (if present).
    """
    def _position(column):
        # Drop the 3-char "cir"/"rad" prefix, then an optional "Total" suffix.
        stem = column[3:]
        return stem[:-5] if stem.endswith("Total") else stem

    cir_min = {}
    rad_max = {}
    for column in df.columns:
        if column.startswith("cir"):
            cir_min[_position(column)] = df[column].min(skipna=True)
        elif column.startswith("rad"):
            rad_max[_position(column)] = df[column].max(skipna=True)
    return cir_min, rad_max

# Per-slice extrema: one (cir minima, rad maxima) pair of dicts per slice
(cir1, rad1), (cir2, rad2), (cir3, rad3), (cir4, rad4), (cir5, rad5) = map(
    per_slice_extrema, (s1, s2, s3, s4, s5)
)

# Average pairs (slice1,slice2) and (slice3,slice4)
def avg_maps(m1, m2):
    """Average two position -> value dicts key by key.

    The result covers the union of both key sets. A key absent from one map
    counts as NaN there, and NaNs are skipped by the mean, so the value from
    the other map is used on its own (NaN if neither map has a value).
    """
    nan = float('nan')
    return {
        key: pd.Series([m1.get(key, nan), m2.get(key, nan)]).mean(skipna=True)
        for key in set(m1.keys()) | set(m2.keys())
    }

# Ring-level maps: slices 1 & 2 feed segments 1–6, slices 3 & 4 feed segments 7–12
cir12, rad12 = avg_maps(cir1, cir2), avg_maps(rad1, rad2)
cir34, rad34 = avg_maps(cir3, cir4), avg_maps(rad3, rad4)

# Canonical 6-sector order
aha6_order = ["Ant", "AntSept", "InfSept", "Inf", "InfLat", "AntLat"]

# Segments 1–6: one row per sector, NaN where a sector has no value
rows = [
    {
        "segment": i,
        "label": f"Basal-{pos}",
        "cir_min": cir12.get(pos, float('nan')),
        "rad_max": rad12.get(pos, float('nan')),
    }
    for i, pos in enumerate(aha6_order, start=1)
]
# Segments 7–12: same sector order, numbered on from 7
rows += [
    {
        "segment": i,
        "label": f"Mid-{pos}",
        "cir_min": cir34.get(pos, float('nan')),
        "rad_max": rad34.get(pos, float('nan')),
    }
    for i, pos in enumerate(aha6_order, start=7)
]

# Apical ring (13–16): slice5 only; the septal and lateral segments are the
# mean of their two sectors, the anterior and inferior segments map one-to-one
def mean2(a, b):
    """Return the NaN-skipping mean of two scalars."""
    pair = pd.Series([a, b])
    return pair.mean(skipna=True)

apical_map = [
    (13, "Apical-Anterior", ("Ant",)),
    (14, "Apical-Septal", ("AntSept", "InfSept")),     # septal pair
    (15, "Apical-Inferior", ("Inf",)),
    (16, "Apical-Lateral", ("AntLat", "InfLat")),      # lateral pair
]

for seg, label, positions in apical_map:
    if len(positions) != 1:
        # Two sectors feed this segment: average them, skipping NaNs
        cir_v = mean2(
            cir5.get(positions[0], float('nan')),
            cir5.get(positions[1], float('nan')),
        )
        rad_v = mean2(
            rad5.get(positions[0], float('nan')),
            rad5.get(positions[1], float('nan')),
        )
    else:
        # Single sector: take its value as-is
        pos = positions[0]
        cir_v = cir5.get(pos, float('nan'))
        rad_v = rad5.get(pos, float('nan'))
    rows.append({"segment": seg, "label": label, "cir_min": cir_v, "rad_max": rad_v})

# Segment 17 has no source data and stays NaN
rows.append({
    "segment": 17,
    "label": "Apex (missing)",
    "cir_min": float('nan'),
    "rad_max": float('nan'),
})

# Assemble the table, ordered by segment number with a fresh 0-based index
result = pd.DataFrame(rows)
result = result.sort_values("segment").reset_index(drop=True)

In [None]:
def extract_row_maps(df, row_index=-2):
    """Read the "cir*" and "rad*" values from a single row of a slice table.

    Args:
        df: DataFrame holding one slice. Columns whose name starts with
            "cir" or "rad" are read; every other column is ignored.
        row_index: Positional index of the row to read (negative values
            count from the end). Defaults to -2, the second-to-last row.

    Returns:
        (cir_map, rad_map): dicts mapping position name -> value in the
        selected row. The position name is the column name without its
        three-character prefix and without a trailing "Total".

    Raises:
        IndexError: if ``row_index`` is out of bounds for ``df`` (e.g. the
            default -2 on a frame with fewer than two rows).
    """
    row = df.iloc[row_index]

    def pos_name(col):
        # Drop the 3-char "cir"/"rad" prefix, then an optional "Total" suffix.
        name = col[3:]
        return name[:-5] if name.endswith("Total") else name

    cir_map = {pos_name(c): row[c] for c in df.columns if c.startswith("cir")}
    rad_map = {pos_name(c): row[c] for c in df.columns if c.startswith("rad")}
    return cir_map, rad_map

# One (cir, rad) pair of row maps per slice, in slice order
(cir1, rad1), (cir2, rad2), (cir3, rad3), (cir4, rad4), (cir5, rad5) = map(
    extract_row_maps, (s1, s2, s3, s4, s5)
)

# Average pairs (slice1,slice2) and (slice3,slice4)
def avg_maps(m1, m2):
    """Return the key-wise, NaN-skipping mean of two dicts.

    Keys come from the union of both maps; a key missing from one map is
    treated as NaN there, so the other map's value is used alone.
    """
    missing = float('nan')
    merged = {}
    for key in set(m1) | set(m2):
        pair = pd.Series([m1.get(key, missing), m2.get(key, missing)])
        merged[key] = pair.mean(skipna=True)
    return merged

# Ring-level maps: slices 1 & 2 feed segments 1–6, slices 3 & 4 feed segments 7–12
cir12, rad12 = avg_maps(cir1, cir2), avg_maps(rad1, rad2)
cir34, rad34 = avg_maps(cir3, cir4), avg_maps(rad3, rad4)

aha6_order = ["Ant", "AntSept", "InfSept", "Inf", "InfLat", "AntLat"]

# 1–6: average of slice1&2 at the second-to-last row
rows = [
    {
        "segment": i,
        "label": f"Basal-{pos}",
        "cir_value": cir12.get(pos, float('nan')),
        "rad_value": rad12.get(pos, float('nan')),
        "rule": "2nd-last row avg(s1,s2)",
    }
    for i, pos in enumerate(aha6_order, start=1)
]

# 7–12: average of slice3&4 at the second-to-last row
rows += [
    {
        "segment": i,
        "label": f"Mid-{pos}",
        "cir_value": cir34.get(pos, float('nan')),
        "rad_value": rad34.get(pos, float('nan')),
        "rule": "2nd-last row avg(s3,s4)",
    }
    for i, pos in enumerate(aha6_order, start=7)
]

# 13–16: taken from slice5's row maps; the septal and lateral segments are
# the mean of their two sectors, the other two map one-to-one
def mean2(a, b):
    """Return the NaN-skipping mean of two scalars."""
    pair = pd.Series([a, b])
    return pair.mean(skipna=True)

apical_map = [
    (13, "Apical-Anterior", ("Ant",)),
    (14, "Apical-Septal", ("AntSept", "InfSept")),
    (15, "Apical-Inferior", ("Inf",)),
    (16, "Apical-Lateral", ("AntLat", "InfLat")),
]

for seg, label, positions in apical_map:
    if len(positions) != 1:
        # Two sectors feed this segment: average them, skipping NaNs
        cir_v = mean2(
            cir5.get(positions[0], float('nan')),
            cir5.get(positions[1], float('nan')),
        )
        rad_v = mean2(
            rad5.get(positions[0], float('nan')),
            rad5.get(positions[1], float('nan')),
        )
    else:
        # Single sector: take its value as-is
        pos = positions[0]
        cir_v = cir5.get(pos, float('nan'))
        rad_v = rad5.get(pos, float('nan'))
    rows.append({
        "segment": seg,
        "label": label,
        "cir_value": cir_v,
        "rad_value": rad_v,
        "rule": "2nd-last row slice5",
    })

# 17: no source data, kept as NaN
rows.append({
    "segment": 17,
    "label": "Apex (missing)",
    "cir_value": float('nan'),
    "rad_value": float('nan'),
    "rule": "N/A",
})

# Assemble the table, ordered by segment number with a fresh 0-based index
result = pd.DataFrame(rows)
result = result.sort_values("segment").reset_index(drop=True)

In [None]:
# Display the current `result` table (one row per segment, sorted by segment)
result

# PCA

In [5]:
# Load the four header-less CSV files, in order, as plain NumPy arrays
X_train, Y_train_std, X_test, Y_test_std = (
    pd.read_csv(csv_name, header=None, delimiter=',').values
    for csv_name in (
        'RealCase_X_train.csv',
        'RealCase_Y_train_std.csv',
        'RealCase_X_test.csv',
        'RealCase_Y_test_std.csv',
    )
)

In [6]:
# Load RealCase_Y.csv as a plain NumPy array; header=None treats the first
# line of the file as data rather than as column names
RealCase = pd.read_csv('RealCase_Y.csv', header=None, delimiter=',').values

In [10]:
# Per-column range of the training targets, ignoring NaNs
col_min, col_max = np.nanmin(Y_train_std, axis=0), np.nanmax(Y_train_std, axis=0)

# Element-wise check: True where a RealCase value lies inside the training range
# NOTE(review): presumes RealCase is on the same scale as Y_train_std — confirm
np.logical_and(col_min <= RealCase, RealCase <= col_max)

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False, False,  True,  True,  True, False,  True,  True,
        False,  True,  True,  True, False,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False, False,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True, False,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False, False,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True, False,  True]])

In [None]:
# Fit a PCA with all components on Y_train_std
pca = PCA().fit(Y_train_std)

# Running total of the explained-variance ratios, one entry per component
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Plot cumulative explained variance against the (1-based) component count
plt.figure(figsize=(8, 5))
plt.plot(
    np.arange(1, cumulative_variance.size + 1),
    cumulative_variance,
    linestyle='--',
    marker='o',
)

plt.grid()
plt.show()

In [None]:
# 1-based count of components at which the cumulative explained variance
# first reaches 0.999.
# NOTE: np.argmax returns 0 when no entry satisfies the condition, so this
# expression evaluates to 1 in that case rather than signalling a failure.
np.argmax(cumulative_variance >= 0.999) + 1

In [11]:
def split_and_apply_pca(train_data, test_data, RealCase, variance_threshold=0.999):
    """Reduce every column except the first with a PCA fitted on the training data.

    The first column of each array is passed through unchanged. A PCA is
    fitted on the remaining columns of ``train_data``; the smallest number of
    leading components whose cumulative explained-variance ratio reaches
    ``variance_threshold`` is kept, and the same projection is applied to
    ``test_data`` and ``RealCase``.

    Args:
        train_data: 2-D array; the PCA is fitted on its columns 1..end.
        test_data: 2-D array with the same column layout as ``train_data``.
        RealCase: 2-D array with the same column layout as ``train_data``.
        variance_threshold: Cumulative explained-variance ratio to reach.
            If it is never reached (e.g. a threshold of 1.0 combined with
            floating-point round-off), all components are kept.

    Returns:
        (train_final, test_final, RealCase_final, n_components): the three
        arrays with their first column followed by the PCA scores, and the
        number of components kept as a plain ``int``.
    """
    # Split off the first column, which bypasses the PCA
    train_first_col = train_data[:, 0].reshape(-1, 1)
    test_first_col = test_data[:, 0].reshape(-1, 1)
    RealCase_first_col = RealCase[:, 0].reshape(-1, 1)

    train_remaining = train_data[:, 1:]
    test_remaining = test_data[:, 1:]
    RealCase_remaining = RealCase[:, 1:]

    # Fit a full PCA on the remaining training columns to get the spectrum
    pca = PCA()
    pca.fit(train_remaining)

    # Cumulative explained-variance ratio, one entry per component
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Pick the number of components. np.argmax on an all-False mask returns
    # 0, which would silently select a single component, so an unreachable
    # threshold falls back to keeping every component instead.
    reached = cumulative_variance >= variance_threshold
    if reached.any():
        n_components = int(np.argmax(reached)) + 1
    else:
        n_components = int(cumulative_variance.size)

    # Refit with the chosen number of components; the projection learned on
    # the training data is reused for the test and real-case arrays
    pca = PCA(n_components=n_components)
    train_reduced = pca.fit_transform(train_remaining)
    test_reduced = pca.transform(test_remaining)
    RealCase_reduced = pca.transform(RealCase_remaining)

    # Re-attach the untouched first column in front of the PCA scores
    train_final = np.hstack((train_first_col, train_reduced))
    test_final = np.hstack((test_first_col, test_reduced))
    RealCase_final = np.hstack((RealCase_first_col, RealCase_reduced))

    return train_final, test_final, RealCase_final, n_components

In [12]:
# Reduce the training, test and real-case arrays with the PCA fitted on the
# training array (first column kept as-is, default variance threshold 0.999)
Y_train_pca, Y_test_pca, RealCase_pca, n_components = split_and_apply_pca(Y_train_std, Y_test_std, RealCase)

In [None]:
# Show how many principal components split_and_apply_pca kept
n_components

In [13]:
# Per-column range of the PCA-reduced training array, ignoring NaNs
col_min, col_max = np.nanmin(Y_train_pca, axis=0), np.nanmax(Y_train_pca, axis=0)

# Element-wise check: True where a RealCase_pca value lies inside that range
np.logical_and(col_min <= RealCase_pca, RealCase_pca <= col_max)

array([[ True, False,  True,  True,  True,  True,  True,  True, False,
        False, False,  True,  True, False, False, False, False, False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        False, False,  True,  True, False, False, False, False,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
        False, False, False,  True, False, False, False, False, False]])

In [None]:
# Export the PCA-reduced arrays as comma-separated text with 8 decimal places.
# The train/test exports below are currently disabled; only the real-case
# array is written.
# np.savetxt("RealCase_Y_train_pca.csv", Y_train_pca, delimiter=",", fmt="%.8f")

# np.savetxt("RealCase_Y_test_pca.csv", Y_test_pca, delimiter=",", fmt="%.8f")

np.savetxt("RealCase_Y_pca.csv", RealCase_pca, delimiter=",", fmt="%.8f")