In [None]:
import os
from pathlib import Path
import shutil

import numpy as np
import pandas as pd
import nibabel as nib
import SimpleITK as sitk
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

PROJECT_ROOT = Path.cwd().parent       # parent of the current working directory (not the cwd itself)
DATA_ROOT    = PROJECT_ROOT / "data"

DATA_RAW      = DATA_ROOT / "raw"        # raw TotalSegmentator data (one folder per patient)
DATA_SYMM     = DATA_ROOT / "symmetric"  # patients whose left/right volumes are close
DATA_COMPLETE = DATA_ROOT / "complete"   # patients KMeans labelled as complete
DATA_PARTIAL  = DATA_ROOT / "partial"    # patients labelled as partial
DATA_RAS      = DATA_ROOT / "ras_1mm"    # RAS + 1 mm output

CSV_DIR       = DATA_ROOT / "csv"
FIG_DIR       = DATA_ROOT / "fig"

# Create every output directory up front; DATA_RAW is not created here.
for p in [DATA_SYMM, DATA_COMPLETE, DATA_PARTIAL, DATA_RAS, CSV_DIR, FIG_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# File names of the structures used from the original TotalSegmentator output
REQUIRED_FILES = {
    "left_scapula":  "scapula_left.nii.gz",
    "right_scapula": "scapula_right.nii.gz",
    "left_humerus":  "humerus_left.nii.gz",
    "right_humerus": "humerus_right.nii.gz",
}

In [None]:
def load_nifti(path: Path) -> nib.Nifti1Image:
    """Load the image stored at ``path`` with nibabel and return it."""
    # The path is handed to nibabel as a plain string.
    return nib.load(str(path))


def compute_volume_ml(img: "nib.Nifti1Image") -> float:
    """Return the volume of the foreground (voxels > 0) of ``img`` in millilitres.

    Parameters
    ----------
    img
        Image exposing ``dataobj`` and ``header.get_zooms()`` (nibabel API).

    Returns
    -------
    float
        Number of voxels with value > 0 times the voxel volume, in ml
        (1 ml = 1000 mm^3).
    """
    # Read the stored array directly: get_fdata() would first build a float64
    # copy of the whole mask, which is wasteful for a simple voxel count.
    data = np.asanyarray(img.dataobj)

    # Only the first three zooms are spatial (mm). A 4D image reports a fourth,
    # non-spatial zoom that must not enter the voxel volume. Multiplying in
    # float64 avoids float32 rounding of the product.
    spacing = np.asarray(img.header.get_zooms()[:3], dtype=np.float64)
    voxel_vol_mm3 = float(np.prod(spacing))

    n_foreground = int(np.count_nonzero(data > 0))
    return n_foreground * voxel_vol_mm3 / 1000.0


def ensure_seg_paths(patient_dir: Path) -> dict:
    """Locate the required masks inside ``patient_dir / "segmentations"``.

    Returns a dict with one entry per key of REQUIRED_FILES: the path of the
    mask when it exists on disk, otherwise None.
    """
    seg_dir = patient_dir / "segmentations"
    candidates = {
        structure: seg_dir / filename
        for structure, filename in REQUIRED_FILES.items()
    }
    return {
        structure: (candidate if candidate.exists() else None)
        for structure, candidate in candidates.items()
    }

In [None]:
def compute_volumes_for_all_patients():
    """Measure the required structures of every patient folder under DATA_RAW.

    Every directory directly under ``DATA_RAW`` whose name starts with ``s``
    is treated as one patient. For each key of REQUIRED_FILES the mask volume
    is computed with ``compute_volume_ml``; a missing mask file yields NaN.

    Side effects: writes the table to ``CSV_DIR / "volumes_raw.csv"`` and
    prints the destination.

    Returns
    -------
    pd.DataFrame
        One row per patient with the columns ``patient`` followed by the keys
        of REQUIRED_FILES (volumes in ml).
    """
    # glob("s*") also matches stray files (e.g. a spreadsheet next to the
    # patient folders); only directories are patients.
    patient_dirs = sorted(d for d in DATA_RAW.glob("s*") if d.is_dir())

    records = []
    for pdir in patient_dirs:
        row = {"patient": pdir.name}
        for key, p in ensure_seg_paths(pdir).items():
            row[key] = np.nan if p is None else compute_volume_ml(load_nifti(p))
        records.append(row)

    # Explicit columns keep the schema stable even when no patient was found,
    # so downstream column look-ups do not fail on an empty frame.
    df = pd.DataFrame(records, columns=["patient", *REQUIRED_FILES])

    out_csv = CSV_DIR / "volumes_raw.csv"
    df.to_csv(out_csv, index=False)
    print(f"[volume] saved to {out_csv}")
    return df


def within_ratio(a, b, tol=0.10) -> bool:
    """Tell whether ``a`` and ``b`` differ by less than ``tol`` relative to the larger one.

    A NaN or zero on either side always gives False.
    """
    pair = (a, b)
    has_nan = any(np.isnan(value) for value in pair)
    if has_nan or any(value == 0 for value in pair):
        return False

    relative_gap = abs(a - b) / max(a, b)
    return relative_gap < tol


def filter_symmetric_pairs(df: pd.DataFrame, tol=0.10):
    """Keep the patients whose scapulae AND humeri pass ``within_ratio``.

    Works on a copy of ``df`` and adds the boolean columns ``scapula_sym`` and
    ``humerus_sym``. The surviving rows are written to
    ``CSV_DIR / "volumes_symmetric.csv"``, and each surviving patient folder is
    copied from DATA_RAW to DATA_SYMM unless the destination already exists.

    Returns the filtered frame with a fresh 0..n-1 index.
    """
    flagged = df.copy()

    # (flag column, left volume column, right volume column)
    pair_checks = (
        ("scapula_sym", "left_scapula", "right_scapula"),
        ("humerus_sym", "left_humerus", "right_humerus"),
    )
    for flag_col, left_col, right_col in pair_checks:
        flagged[flag_col] = flagged.apply(
            lambda row, lc=left_col, rc=right_col: within_ratio(row[lc], row[rc], tol),
            axis=1,
        )

    both_symmetric = flagged["scapula_sym"] & flagged["humerus_sym"]
    symmetric = flagged[both_symmetric].reset_index(drop=True)

    csv_path = CSV_DIR / "volumes_symmetric.csv"
    symmetric.to_csv(csv_path, index=False)
    print(f"[symmetric] saved to {csv_path}")

    # Also copy the selected patients into DATA_SYMM
    for patient_id in symmetric["patient"]:
        target = DATA_SYMM / patient_id
        if target.exists():
            continue
        shutil.copytree(DATA_RAW / patient_id, target)
    print(f"[symmetric] copied patients to {DATA_SYMM}")

    return symmetric

In [None]:
def plot_humerus_violin_complete_vs_partial(df_clustered):
    """
    Save a violin plot of humerus volume for the "complete" and "partial" groups.

    Left and right humerus volumes of a group are pooled into one violin.
    NaN volumes are dropped, and a group without any volume is left out of the
    figure, because ``violinplot`` cannot draw an empty dataset.

    Parameters
    ----------
    df_clustered : pd.DataFrame
        Needs the columns ``group``, ``left_humerus`` and ``right_humerus``.

    Side effects: writes
    ``FIG_DIR / "violin_humerus_complete_vs_partial.png"`` (300 dpi), unless
    there is nothing to plot, in which case only a notice is printed.
    """
    groups = ["complete", "partial"]
    humerus_cols = ["left_humerus", "right_humerus"]

    labels, data = [], []
    for g in groups:
        in_group = df_clustered["group"] == g
        volumes = df_clustered.loc[in_group, humerus_cols].to_numpy(dtype=float).ravel()
        volumes = volumes[~np.isnan(volumes)]
        if volumes.size:
            labels.append(g)
            data.append(volumes)

    if not data:
        print("[violin] no humerus volumes to plot, figure skipped")
        return

    # Plot
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.violinplot(data, showmeans=True)
    # violinplot places the datasets at positions 1..len(data) by default
    ax.set_xticks(list(range(1, len(labels) + 1)))
    ax.set_xticklabels(labels)
    ax.set_ylabel("Humerus Volume (ml)")
    ax.set_title("Humerus Volume: complete vs partial")

    fig.tight_layout()
    out = FIG_DIR / "violin_humerus_complete_vs_partial.png"
    fig.savefig(out, dpi=300)
    plt.close(fig)
    print(f"[violin] saved to {out}")


def _copy_patient_required_only(src: Path, dst: Path) -> None:
    """Copy a patient folder, keeping only the REQUIRED_FILES entries of its segmentations folder."""
    seg_src = src / "segmentations"
    keep = set(REQUIRED_FILES.values())

    def _skip_unneeded(dirpath, names):
        # copytree calls this once per visited directory; filter only the
        # segmentations folder and copy everything else unchanged.
        if Path(dirpath) == seg_src:
            return [name for name in names if name not in keep]
        return []

    shutil.copytree(src, dst, ignore=_skip_unneeded)


def cluster_completeness(df_vol: pd.DataFrame, n_clusters: int = 2):
    """Label each patient "complete" or "partial" by KMeans on humerus volume.

    The feature is the larger of the left/right humerus volume. The cluster
    with the highest mean volume is labelled "complete"; every other cluster
    is labelled "partial".

    Side effects: writes ``CSV_DIR / "volumes_clustered.csv"`` and copies each
    patient from DATA_SYMM into DATA_COMPLETE or DATA_PARTIAL (skipped when the
    destination already exists), keeping only the REQUIRED_FILES entries inside
    ``segmentations``.

    Parameters
    ----------
    df_vol : pd.DataFrame
        Needs the columns ``patient``, ``left_humerus`` and ``right_humerus``.
    n_clusters : int
        Number of KMeans clusters.

    Returns
    -------
    pd.DataFrame
        Copy of ``df_vol`` with the added columns ``humerus_max_vol``,
        ``cluster`` and ``group``.

    Raises
    ------
    ValueError
        If a patient has no humerus volume at all (KMeans cannot handle NaN).
    """
    df = df_vol.copy()
    df["humerus_max_vol"] = df[["left_humerus", "right_humerus"]].max(axis=1)

    missing = df["humerus_max_vol"].isna()
    if missing.any():
        raise ValueError(
            "cluster_completeness: no humerus volume for patient(s) "
            f"{df.loc[missing, 'patient'].tolist()}"
        )

    # KMeans uses the humerus volume only
    X = df["humerus_max_vol"].to_numpy().reshape(-1, 1)
    km = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto")
    df["cluster"] = km.fit_predict(X)

    # The cluster with the largest mean volume is treated as complete
    complete_cluster = df.groupby("cluster")["humerus_max_vol"].mean().idxmax()
    df["group"] = np.where(df["cluster"] == complete_cluster, "complete", "partial")

    out_csv = CSV_DIR / "volumes_clustered.csv"
    df.to_csv(out_csv, index=False)
    print(f"[cluster] saved to {out_csv}")

    # Copy into the complete / partial directories, keeping only the 4 needed
    # structures. Filtering during the copy (instead of deleting afterwards)
    # copes with sub-directories or a missing segmentations folder, and never
    # leaves unpruned masks behind if the run is interrupted.
    for patient, group in zip(df["patient"], df["group"]):
        dst_root = DATA_COMPLETE if group == "complete" else DATA_PARTIAL
        dst = dst_root / patient
        if dst.exists():
            continue
        _copy_patient_required_only(DATA_SYMM / patient, dst)
    print(f"[cluster] copied to {DATA_COMPLETE} & {DATA_PARTIAL}")
    return df

In [None]:
def main_before_manual_check():
    """Print the project paths, then run volume measurement, symmetry filtering,
    completeness clustering and the violin plot, in that order."""
    print("PROJECT_ROOT:", PROJECT_ROOT)
    print("DATA_ROOT   :", DATA_ROOT)

    # 1) volume statistics
    volumes = compute_volumes_for_all_patients()

    # 2) symmetry filter
    symmetric = filter_symmetric_pairs(volumes, tol=0.10)

    # 3) completeness clustering, then the violin plot of its result
    plot_humerus_violin_complete_vs_partial(cluster_completeness(symmetric))

In [None]:
# Run the steps up to the manual check: volumes -> symmetry filter -> clustering -> violin plot.
main_before_manual_check()