In [None]:
# ===== REQUIRED: adjust these to your local paths and slide name =====
# Both locations below sit under the same pipeline folder, so it is spelled once.
_pipeline_root = "/Users/yhu10/Desktop/VLM/pipline_data_cell_level"

# Slide identifier without the file extension, e.g. CMU-2.svs -> "CMU-2".
case_id = "CMU-2"
# Measurements table exported from QuPath.
tsv_path = _pipeline_root + "/measurements.tsv"
# The Groovy script reads from data_dir/<case_id>/.
data_dir = _pipeline_root + "/cell_level_analysis"
# ======================================================================

import numpy as np, pandas as pd
from pathlib import Path

# Name of the label column the measurements TSV is expected to carry.
LABEL_COL = "Classification"

# 1) Load the measurements TSV.
df = pd.read_csv(tsv_path, sep="\t")

# Fail early with a readable message (same style as the centroid-column check
# further down) instead of a bare KeyError when the label column is absent.
if LABEL_COL not in df.columns:
    raise ValueError(f"缺少标签列：{LABEL_COL}（请在导出的 TSV 中包含该列，或修改 LABEL_COL）")

# 2) Clean the labels (normalise case/whitespace; synonyms are merged right after).
def _norm(x):
    """Return the label stripped and lower-cased, or NaN when it is missing.

    A label counts as missing when it is a pandas NA value, an empty or
    whitespace-only string, or the literal text "nan" in any letter case.
    Non-string values are converted with ``str`` before cleaning.
    """
    if not pd.isna(x):
        cleaned = str(x).strip().lower()
        if cleaned not in ("", "nan"):
            return cleaned
    return np.nan
# Normalise every raw label first (missing values become NaN).
df[LABEL_COL] = df[LABEL_COL].map(_norm)

# Synonym -> canonical class name, built from one entry per canonical class.
# Extend the tuples below to register further synonyms.
alias = {
    synonym: canonical
    for canonical, synonyms in (
        ("Stroma", ("stroma", "stromal")),
        ("Immune", ("immune", "immune cell", "immune cells")),
    )
    for synonym in synonyms
}


def _canonical(label):
    """Return the alias for a known synonym, else the title-cased label.

    Non-string values (the NaN placeholders for missing labels) pass through
    unchanged.
    """
    if label in alias:
        return alias[label]
    return label.title() if isinstance(label, str) else label


df[LABEL_COL] = df[LABEL_COL].map(_canonical)

# 3) Pick the feature columns (obvious non-feature columns are excluded).
needed = ["Centroid X µm", "Centroid Y µm"]
missing_col = next((col for col in needed if col not in df.columns), None)
if missing_col is not None:
    raise ValueError(f"缺少必需列：{missing_col}（导出测量时要包含质心坐标，单位µm）")

# The label column, the centroid columns and a fixed list of other column names
# are never used as features; only names actually present in the table are kept.
drop = {LABEL_COL, "ID", "Name", "ROI", "Image", "Path", "Parent", "Tile", "X", "Y"}
drop.update(needed)
drop &= set(df.columns)

# Every remaining numeric column is a feature.
feat_cols = [
    col
    for col in df.columns
    if col not in drop and pd.api.types.is_numeric_dtype(df[col])
]
assert feat_cols, "没有可用数值特征列，请检查 TSV。"

# Missing measurements are replaced by 0.0 before handing the matrix to the model.
X = df[feat_cols].fillna(value=0.0).to_numpy()

# 4) Train on the labelled rows only, then predict every cell.
is_lab = df[LABEL_COL].notna()
if not is_lab.any():
    raise ValueError(f"没有任何已标注细胞，无法训练。请先在 TSV 的 {LABEL_COL} 列留少量标签。")

y_lab = df.loc[is_lab, LABEL_COL].astype(str)

# Encode class names as integer codes 0..k-1 (label_names is sorted, y_codes
# indexes into it). XGBoost's scikit-learn wrapper expects such codes; newer
# releases raise on string targets, which used to be swallowed by the blanket
# except below and silently forced the SVC fallback.
label_names, y_codes = np.unique(y_lab.to_numpy(dtype=str), return_inverse=True)
if label_names.size < 2:
    raise ValueError(
        f"已标注细胞只包含一个类别（{label_names[0]}），至少需要两个类别才能训练分类器。"
    )

# Plain boolean array so the NumPy feature matrix is indexed positionally.
lab_mask = is_lab.to_numpy()
X_lab = X[lab_mask]

try:
    # Prefer XGBoost; an import or training failure falls back to the SVC below.
    from xgboost import XGBClassifier
    clf = XGBClassifier(
        n_estimators=600, max_depth=6, learning_rate=0.05,
        subsample=0.9, colsample_bytree=0.9,
        n_jobs=4, eval_metric="mlogloss", random_state=42
    )
    clf.fit(X_lab, y_codes)
    proba = clf.predict_proba(X)
except Exception as exc:
    # Keep the best-effort fallback, but report why it was taken instead of
    # hiding the original error.
    print(f"XGBoost 不可用或训练失败（{exc!r}），改用 SVC")
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    clf = make_pipeline(StandardScaler(with_mean=True),
                        SVC(kernel="rbf", probability=True, class_weight="balanced"))
    clf.fit(X_lab, y_codes)
    proba = clf.predict_proba(X)

# The columns of proba follow clf.classes_ (the integer codes); map them back
# to the class names so downstream code keeps receiving string labels.
classes = label_names[np.asarray(clf.classes_, dtype=int)]

idx      = proba.argmax(1)   # most probable class per cell
y_pred   = classes[idx]      # predicted class name per cell
y_score  = proba.max(1)      # probability of the predicted class

# 5) Write detections_prediction.txt (no header row; the column order is fixed).
cx, cy = (df[axis_col].to_numpy() for axis_col in ("Centroid X µm", "Centroid Y µm"))

out_dir = Path(data_dir).joinpath(case_id)
out_dir.mkdir(exist_ok=True, parents=True)

# Column positions are part of the file format: the Groovy consumer is noted
# to read the label at index 2 and the centroid X/Y at indices 4 and 5.
det_columns = [
    ("id", np.arange(len(df))),
    ("score", np.round(y_score, 4)),   # confidence, usable for later filtering
    ("label", y_pred),
    ("reserved", 0),                   # placeholder column
    ("Centroid X µm", cx),
    ("Centroid Y µm", cy),
]
det = pd.DataFrame(dict(det_columns))

det_path = out_dir / "detections_prediction.txt"
det.to_csv(det_path, header=False, index=False, sep="\t")
print("Wrote:", det_path)


Wrote: /Users/yhu10/Desktop/VLM/pipline_data_cell_level/cell_level_analysis/CMU-2/detections_prediction.txt


In [9]:
# ===== EDIT HERE: your paths and slide name =====
# All locations below sit under the same pipeline folder, so it is spelled once.
_pipeline_root = "/Users/yhu10/Desktop/VLM/pipline_data_cell_level"

# Slide name without the file extension.
case_id = "CMU-2"
# measurements.tsv exported from QuPath.
tsv_path = _pipeline_root + "/measurements.tsv"
# Output directory for the UMAP TSV (= marimo's csv_path_root).
out_dir = _pipeline_root + "/cell_level_analysis"
# Prediction file generated earlier for this slide.
pred_path = out_dir + "/" + case_id + "/detections_prediction.txt"
# ================================================

import numpy as np, pandas as pd
from pathlib import Path

# 1) Load the measurements table.
df = pd.read_csv(tsv_path, sep="\t")

# 2) Load the predicted labels (3rd column = label; 5th/6th = Centroid X/Y µm).
pred = pd.read_csv(pred_path, sep="\t", header=None,
                   names=["id","score","label","reserved","Centroid X µm","Centroid Y µm"])

# Labels are attached to measurement rows purely by position, so both tables
# must describe the same cells in the same order. Check that explicitly rather
# than exporting labels that belong to different cells.
if len(pred) != len(df):
    raise ValueError(
        f"预测文件行数（{len(pred)}）与测量表行数（{len(df)}）不一致，无法按行对齐标签：{pred_path}"
    )

xy_cols = ["Centroid X µm", "Centroid Y µm"]

# When measurements.tsv also carries Centroid X/Y µm, prefer its coordinates
# (noted as the more complete source); otherwise use the prediction file's.
if all(col in df.columns for col in xy_cols):
    cx = df["Centroid X µm"].to_numpy()
    cy = df["Centroid Y µm"].to_numpy()
    # Both files hold centroids, so use them to verify the row order matches.
    # The check is skipped when either side is not numeric.
    numeric_xy = all(
        pd.api.types.is_numeric_dtype(table[col])
        for table in (df, pred)
        for col in xy_cols
    )
    if numeric_xy and not np.allclose(
        df[xy_cols].to_numpy(dtype=float),
        pred[xy_cols].to_numpy(dtype=float),
        equal_nan=True,
    ):
        raise ValueError("预测文件与测量表的质心坐标不一致，行顺序可能不同，无法按行对齐标签。")
else:
    cx = pred["Centroid X µm"].to_numpy()
    cy = pred["Centroid Y µm"].to_numpy()

labels = pred["label"].astype(str).to_numpy()

# 3) Choose the numeric features for the embedding (obvious non-feature columns excluded).
drop = {
    # centroid / coordinate columns
    "Centroid X µm", "Centroid Y µm", "X", "Y",
    # identifier and bookkeeping columns
    "ID", "Name", "ROI", "Image", "Path", "Parent", "Tile", "Object ID", "Object type",
    # class / label columns, including the one written by this notebook
    "Classification", "PathClass", "classification", "class", "label", "Combined_Cluster",
}

feat_cols = []
for col in df.columns:
    if col in drop:
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        feat_cols.append(col)
assert feat_cols, "没有可用的数值特征列，请检查 measurements.tsv"

# Missing measurements are replaced by 0.0 before the dimensionality reduction.
X = df[feat_cols].fillna(value=0.0).to_numpy()

# 4) Compute a 2-D UMAP embedding; fall back to PCA when UMAP cannot be used.
# NOTE(review): X is passed unscaled (only NaNs were filled); consider
# standardising the features before the embedding — confirm with the analysis owner.
fallback_reason = None
try:
    import umap
    emb = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42).fit_transform(X)
    print("UMAP 完成")
except ImportError:
    # The package is genuinely missing.
    fallback_reason = "未安装 umap-learn"
except Exception as exc:
    # UMAP is importable but failed; report the real cause instead of
    # claiming the package is not installed.
    fallback_reason = f"UMAP 运行失败（{exc!r}）"

if fallback_reason is not None:
    from sklearn.decomposition import PCA
    emb = PCA(n_components=2, random_state=42).fit_transform(X)
    print(f"{fallback_reason}，已用 PCA 代替")

# 5) Export the TSV in the layout marimo expects: <case_id>_cell_feature_umap.tsv
column_names = ("umap_x", "umap_y", "Centroid X µm", "Centroid Y µm", "Combined_Cluster")
# Combined_Cluster carries the predicted labels; the marimo code treats it as the class column.
column_values = (emb[:, 0], emb[:, 1], cx, cy, labels)
umap_df = pd.DataFrame(dict(zip(column_names, column_values)))

out_path = Path(out_dir).joinpath(f"{case_id}_cell_feature_umap.tsv")
out_path.parent.mkdir(exist_ok=True, parents=True)
umap_df.to_csv(out_path, index=False, sep="\t")
print("Wrote:", out_path)


未安装 umap-learn，已用 PCA 代替
Wrote: /Users/yhu10/Desktop/VLM/pipline_data_cell_level/cell_level_analysis/CMU-2_cell_feature_umap.tsv


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T


In [10]:
%pip install pyarrow

Defaulting to user installation because normal site-packages is not writeable
Collecting pyarrow
  Downloading pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl.metadata (3.3 kB)
Downloading pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl (31.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.2/31.2 MB[0m [31m5.7 MB/s[0m  [33m0:00:05[0mm0:00:01[0m00:01[0mm
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-21.0.0
Note: you may need to restart the kernel to use updated packages.
