# Feature engineering

# Train on ranks computed within each chromosome (groupby chr)

In [13]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr

# ============================================================
#                  GLOBAL REPRODUCIBILITY
# ============================================================
# Seed Python's and NumPy's global RNGs. The same SEED is also handed to
# LightGBM through params["seed"] further down.
SEED = 42
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it
# here presumably only affects child processes -- confirm whether it is needed.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  CONFIGURATION
# ============================================================
# The model is fitted on the X1 table and evaluated on the X2 table.
TRAIN_PATH = "../preprocessed_data/CAGE-merged/X1_features.tsv"
VAL_PATH   = "../preprocessed_data/CAGE-merged/X2_features.tsv"

# Columns that are excluded from the feature matrix (TARGET_COL is one of them).
META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
TARGET_COL = "gex_rank"

# Chromosomes (autosomes 2–22)
chromosomes = [f"chr{i}" for i in range(2, 23)]
# Strided slicing: every 5th chromosome goes to the same fold. With 21
# chromosomes the first fold holds 5 of them and the other four hold 4 each.
folds = [chromosomes[i::5] for i in range(5)]

print("🧩 Chromosome folds:")
for i, fset in enumerate(folds):
    print(f"Fold {i+1}: {fset}")

# ============================================================
#                  DATA LOADING
# ============================================================
df_train_full = pd.read_csv(TRAIN_PATH, sep="\t")
df_val_full   = pd.read_csv(VAL_PATH, sep="\t")
# Features are all columns of the training table that are not in META_COLS.
# Assumes the validation table carries the same columns -- TODO confirm.
feature_cols = [c for c in df_train_full.columns if c not in META_COLS]
print(f"Feature count: {len(feature_cols)}")

# ============================================================
#                  LIGHTGBM PARAMETERS
# ============================================================
# Shared by every fold: RMSE regression on TARGET_COL with row bagging
# (80% of rows, re-drawn every 3 iterations) and 90% feature sub-sampling.
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "seed": SEED,
    "deterministic": True,   # reproducibility flag (>= LGBM 4.0)
    "force_row_wise": True,  # prevent threading non-determinism
}

# ============================================================
#                  CROSS-CHROMOSOME TRAINING
# ============================================================
# One model per fold: fit on the training table restricted to the non-held-out
# chromosomes, score on the validation table restricted to the held-out ones.
# `results` collects one dict per fold (fold number, held-out chromosomes,
# overall Spearman, list of per-chromosome Spearman values).
results = []

for fold_idx, val_chrs in enumerate(folds):
    print(f"\n🚀 Fold {fold_idx+1} | Val chromosomes: {val_chrs}")
    
    # -----------------------------
    # Split by chromosome
    # -----------------------------
    train_chrs = [c for c in chromosomes if c not in val_chrs]
    # .copy() so the "pred" column added below never touches the full tables.
    df_train = df_train_full[df_train_full["chr"].isin(train_chrs)].copy()
    df_val   = df_val_full[df_val_full["chr"].isin(val_chrs)].copy()

    # -----------------------------
    # Prepare X, y
    # -----------------------------
    X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
    X_val, y_val     = df_val[feature_cols], df_val[TARGET_COL]

    # -----------------------------
    # LightGBM Dataset
    # -----------------------------
    dtrain = lgb.Dataset(X_train, label=y_train)
    # The validation Dataset is tied to the training Dataset via `reference`.
    dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # -----------------------------
    # Train model
    # -----------------------------
    # Up to 2000 boosting rounds; stops once the monitored metric has not
    # improved for 100 rounds. Metrics are logged every 200 rounds.
    # NOTE(review): early stopping monitors the same validation rows that are
    # scored below, so the reported Spearman may be optimistic -- confirm
    # whether a separate early-stopping split is wanted.
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ],
    )

    # -----------------------------
    # Predictions
    # -----------------------------
    # Predict with the iteration chosen by early stopping.
    df_val["pred"] = model.predict(X_val, num_iteration=model.best_iteration)

    # -----------------------------
    # Evaluate Spearman correlation
    # -----------------------------
    # Overall score pools every held-out chromosome of this fold.
    overall_spearman = spearmanr(df_val[TARGET_COL], df_val["pred"])[0]
    print(f"\n📈 Fold {fold_idx+1} Overall Spearman ρ = {overall_spearman:.4f}")
    print("📊 Per-Chromosome Spearman Correlation:")

    chr_corrs = []
    for chrom, subdf in df_val.groupby("chr"):
        # A rank correlation needs at least two rows.
        if len(subdf) < 2:
            continue
        rho = spearmanr(subdf[TARGET_COL], subdf["pred"])[0]
        chr_corrs.append({"chr": chrom, "spearman": rho})
        print(f"   {chrom:<6s}: ρ = {rho:.4f}")

    results.append({
        "fold": fold_idx + 1,
        "val_chr": val_chrs,
        "overall_spearman": overall_spearman,
        "per_chr": chr_corrs
    })

# ============================================================
#                  SUMMARY
# ============================================================
# One row per fold: fold number, comma-joined held-out chromosomes, overall rho.
summary_fold = pd.DataFrame([
    {"fold": r["fold"], "val_chr": ",".join(r["val_chr"]), "overall_spearman": r["overall_spearman"]}
    for r in results
])

# --- Collect every per-chromosome Spearman value (one row per fold/chromosome) ---
chr_rows = []
for r in results:
    for c in r["per_chr"]:
        chr_rows.append({"fold": r["fold"], "chr": c["chr"], "spearman": c["spearman"]})
summary_chr = pd.DataFrame(chr_rows)

# --- Compute the mean for each chromosome ---
# Each chromosome belongs to exactly one fold, so every "mean" here is taken
# over a single value.
chr_mean = summary_chr.groupby("chr", as_index=False)["spearman"].mean().rename(columns={"spearman": "mean_spearman"})

# ============================================================
#                  PRINT RESULTS
# ============================================================
print("\n===== Cross-Chromosome Fold Summary =====")
print(summary_fold)
print(f"\nMean Spearman (across folds): {summary_fold['overall_spearman'].mean():.4f}")

print("\n===== Per-Chromosome Mean Spearman =====")
print(chr_mean.sort_values("mean_spearman", ascending=False))
print(f"\nOverall mean of per-chromosome means: {chr_mean['mean_spearman'].mean():.4f}")


🔒 Global seed set to 42
🧩 Chromosome folds:
Fold 1: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Fold 2: ['chr3', 'chr8', 'chr13', 'chr18']
Fold 3: ['chr4', 'chr9', 'chr14', 'chr19']
Fold 4: ['chr5', 'chr10', 'chr15', 'chr20']
Fold 5: ['chr6', 'chr11', 'chr16', 'chr21']
Feature count: 456

🚀 Fold 1 | Val chromosomes: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.0666904	val's rmse: 0.190953
[400]	train's rmse: 0.0355242	val's rmse: 0.190126
Early stopping, best iteration is:
[381]	train's rmse: 0.0376451	val's rmse: 0.189838

📈 Fold 1 Overall Spearman ρ = 0.6865
📊 Per-Chromosome Spearman Correlation:
   chr12 : ρ = 0.7536
   chr17 : ρ = 0.7440
   chr2  : ρ = 0.7182
   chr22 : ρ = 0.7110
   chr7  : ρ = 0.7591

🚀 Fold 2 | Val chromosomes: ['chr3', 'chr8', 'chr13', 'chr18']
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[85]	train's rmse: 0.105647	val's rmse: 

# Train on ranks computed within each cell line (rank transform grouped by cell line)

In [14]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr

# ============================================================
#                  GLOBAL REPRODUCIBILITY
# ============================================================
# Seed Python's and NumPy's global RNGs. The same SEED is also handed to
# LightGBM through params["seed"] further down.
SEED = 42
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it
# here presumably only affects child processes -- confirm whether it is needed.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  CONFIGURATION
# ============================================================
# Same experiment as the X1_features/X2_features run, but reading the
# "*_all_rank_features.tsv" tables: fit on X1, evaluate on X2.
TRAIN_PATH = "../preprocessed_data/CAGE-merged/X1_all_rank_features.tsv"
VAL_PATH   = "../preprocessed_data/CAGE-merged/X2_all_rank_features.tsv"

# Columns that are excluded from the feature matrix (TARGET_COL is one of them).
META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
TARGET_COL = "gex_rank"

# Chromosomes (autosomes 2–22)
chromosomes = [f"chr{i}" for i in range(2, 23)]
# Strided slicing: every 5th chromosome goes to the same fold. With 21
# chromosomes the first fold holds 5 of them and the other four hold 4 each.
folds = [chromosomes[i::5] for i in range(5)]

print("🧩 Chromosome folds:")
for i, fset in enumerate(folds):
    print(f"Fold {i+1}: {fset}")

# ============================================================
#                  DATA LOADING
# ============================================================
df_train_full = pd.read_csv(TRAIN_PATH, sep="\t")
df_val_full   = pd.read_csv(VAL_PATH, sep="\t")
# Features are all columns of the training table that are not in META_COLS.
# Assumes the validation table carries the same columns -- TODO confirm.
feature_cols = [c for c in df_train_full.columns if c not in META_COLS]
print(f"Feature count: {len(feature_cols)}")

# ============================================================
#                  LIGHTGBM PARAMETERS
# ============================================================
# Shared by every fold: RMSE regression on TARGET_COL with row bagging
# (80% of rows, re-drawn every 3 iterations) and 90% feature sub-sampling.
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "seed": SEED,
    "deterministic": True,   # reproducibility flag (>= LGBM 4.0)
    "force_row_wise": True,  # prevent threading non-determinism
}

# ============================================================
#                  CROSS-CHROMOSOME TRAINING
# ============================================================
# One model per fold: fit on the training table restricted to the non-held-out
# chromosomes, score on the validation table restricted to the held-out ones.
# `results` collects one dict per fold (fold number, held-out chromosomes,
# overall Spearman, list of per-chromosome Spearman values).
results = []

for fold_idx, val_chrs in enumerate(folds):
    print(f"\n🚀 Fold {fold_idx+1} | Val chromosomes: {val_chrs}")
    
    # -----------------------------
    # Split by chromosome
    # -----------------------------
    train_chrs = [c for c in chromosomes if c not in val_chrs]
    # .copy() so the "pred" column added below never touches the full tables.
    df_train = df_train_full[df_train_full["chr"].isin(train_chrs)].copy()
    df_val   = df_val_full[df_val_full["chr"].isin(val_chrs)].copy()

    # -----------------------------
    # Prepare X, y
    # -----------------------------
    X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
    X_val, y_val     = df_val[feature_cols], df_val[TARGET_COL]

    # -----------------------------
    # LightGBM Dataset
    # -----------------------------
    dtrain = lgb.Dataset(X_train, label=y_train)
    # The validation Dataset is tied to the training Dataset via `reference`.
    dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # -----------------------------
    # Train model
    # -----------------------------
    # Up to 2000 boosting rounds; stops once the monitored metric has not
    # improved for 100 rounds. Metrics are logged every 200 rounds.
    # NOTE(review): early stopping monitors the same validation rows that are
    # scored below, so the reported Spearman may be optimistic -- confirm
    # whether a separate early-stopping split is wanted.
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ],
    )

    # -----------------------------
    # Predictions
    # -----------------------------
    # Predict with the iteration chosen by early stopping.
    df_val["pred"] = model.predict(X_val, num_iteration=model.best_iteration)

    # -----------------------------
    # Evaluate Spearman correlation
    # -----------------------------
    # Overall score pools every held-out chromosome of this fold.
    overall_spearman = spearmanr(df_val[TARGET_COL], df_val["pred"])[0]
    print(f"\n📈 Fold {fold_idx+1} Overall Spearman ρ = {overall_spearman:.4f}")
    print("📊 Per-Chromosome Spearman Correlation:")

    chr_corrs = []
    for chrom, subdf in df_val.groupby("chr"):
        # A rank correlation needs at least two rows.
        if len(subdf) < 2:
            continue
        rho = spearmanr(subdf[TARGET_COL], subdf["pred"])[0]
        chr_corrs.append({"chr": chrom, "spearman": rho})
        print(f"   {chrom:<6s}: ρ = {rho:.4f}")

    results.append({
        "fold": fold_idx + 1,
        "val_chr": val_chrs,
        "overall_spearman": overall_spearman,
        "per_chr": chr_corrs
    })

# ============================================================
#                  SUMMARY
# ============================================================
# One row per fold: fold number, comma-joined held-out chromosomes, overall rho.
summary_fold = pd.DataFrame([
    {"fold": r["fold"], "val_chr": ",".join(r["val_chr"]), "overall_spearman": r["overall_spearman"]}
    for r in results
])

# --- Collect every per-chromosome Spearman value (one row per fold/chromosome) ---
chr_rows = []
for r in results:
    for c in r["per_chr"]:
        chr_rows.append({"fold": r["fold"], "chr": c["chr"], "spearman": c["spearman"]})
summary_chr = pd.DataFrame(chr_rows)

# --- Compute the mean for each chromosome ---
# Each chromosome belongs to exactly one fold, so every "mean" here is taken
# over a single value.
chr_mean = summary_chr.groupby("chr", as_index=False)["spearman"].mean().rename(columns={"spearman": "mean_spearman"})

# ============================================================
#                  PRINT RESULTS
# ============================================================
print("\n===== Cross-Chromosome Fold Summary =====")
print(summary_fold)
print(f"\nMean Spearman (across folds): {summary_fold['overall_spearman'].mean():.4f}")

print("\n===== Per-Chromosome Mean Spearman =====")
print(chr_mean.sort_values("mean_spearman", ascending=False))
print(f"\nOverall mean of per-chromosome means: {chr_mean['mean_spearman'].mean():.4f}")


🔒 Global seed set to 42
🧩 Chromosome folds:
Fold 1: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Fold 2: ['chr3', 'chr8', 'chr13', 'chr18']
Fold 3: ['chr4', 'chr9', 'chr14', 'chr19']
Fold 4: ['chr5', 'chr10', 'chr15', 'chr20']
Fold 5: ['chr6', 'chr11', 'chr16', 'chr21']
Feature count: 456

🚀 Fold 1 | Val chromosomes: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[99]	train's rmse: 0.0956605	val's rmse: 0.190943

📈 Fold 1 Overall Spearman ρ = 0.7379
📊 Per-Chromosome Spearman Correlation:
   chr12 : ρ = 0.7499
   chr17 : ρ = 0.7369
   chr2  : ρ = 0.7297
   chr22 : ρ = 0.7281
   chr7  : ρ = 0.7449

🚀 Fold 2 | Val chromosomes: ['chr3', 'chr8', 'chr13', 'chr18']
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[87]	train's rmse: 0.105695	val's rmse: 0.197202

📈 Fold 2 Overall Spearman ρ = 0.7439
📊 Per-Chromosome Spearman Correlation:
   chr13 : ρ = 0.

# Leave-one-chromosome-out training in both directions (X1 → X2 and X2 → X1)

In [15]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr

# ============================================================
#                  GLOBAL REPRODUCIBILITY
# ============================================================
# Seed Python's and NumPy's global RNGs. The same SEED is also handed to
# LightGBM through params["seed"] further down.
SEED = 42
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it
# here presumably only affects child processes -- confirm whether it is needed.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  CONFIGURATION
# ============================================================
# Directory holding the "<name>_all_rank_features.tsv" tables read by run_loco.
DATA_DIR = "../preprocessed_data/CAGE-merged"
# Columns that are excluded from the feature matrix (TARGET_COL is one of them).
META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
TARGET_COL = "gex_rank"
# chr2 .. chr22; each one is held out once by the LOCO loop.
chromosomes = [f"chr{i}" for i in range(2, 23)]

# ============================================================
#                  LIGHTGBM PARAMETERS
# ============================================================
# Shared by every model: RMSE regression on TARGET_COL with row bagging
# (80% of rows, re-drawn every 3 iterations) and 90% feature sub-sampling.
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "seed": SEED,
    "deterministic": True,
    "force_row_wise": True,
}

# ============================================================
#                  FUNCTION: LOCO EXPERIMENT
# ============================================================
def _fit_and_score(df_train, df_val, feature_cols, pred_col):
    """Fit LightGBM on `df_train`, write predictions to `df_val[pred_col]`, return Spearman rho.

    Early stopping monitors `df_val`; prediction uses the best iteration found.
    """
    X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
    X_val, y_val = df_val[feature_cols], df_val[TARGET_COL]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200),
        ],
    )

    df_val[pred_col] = model.predict(X_val, num_iteration=model.best_iteration)
    return spearmanr(df_val[TARGET_COL], df_val[pred_col])[0]


def run_loco(train_name, val_name):
    """Run leave-one-chromosome-out (LOCO) validation from one table to another.

    For every chromosome in ``chromosomes`` a LightGBM regressor is fitted on
    the ``train_name`` table with that chromosome removed, and scored with the
    Spearman correlation on that same chromosome taken from the ``val_name``
    table. Afterwards a baseline model is fitted on the whole ``train_name``
    table and scored on the whole ``val_name`` table for comparison.

    Chromosomes that leave no training rows, or fewer than two validation rows,
    are reported and skipped instead of aborting the whole run.

    Args:
        train_name: File prefix of the training table; the file read is
            ``{DATA_DIR}/{train_name}_all_rank_features.tsv``.
        val_name: File prefix of the validation table, resolved the same way.

    Returns:
        pandas.DataFrame with one row per evaluated chromosome and the columns
        ``chr``, ``spearman``, ``direction``, ``full_model_rho`` and
        ``delta_vs_full`` (``spearman - full_model_rho``).
    """
    print("\n==============================")
    print(f" 🔁 LOCO: {train_name} → {val_name}")
    print("==============================")

    df_train_full = pd.read_csv(f"{DATA_DIR}/{train_name}_all_rank_features.tsv", sep="\t")
    df_val_full   = pd.read_csv(f"{DATA_DIR}/{val_name}_all_rank_features.tsv", sep="\t")
    feature_cols = [c for c in df_train_full.columns if c not in META_COLS]

    results = []

    # ---------------- LOCO LOOP ----------------
    for chrom_val in chromosomes:
        print(f"\n🚀 Leave-One-Chromosome-Out: {chrom_val}")

        # Train on every listed chromosome except the held-out one; validate on
        # the held-out chromosome of the *other* table. Copies keep the added
        # "pred" column away from the full tables.
        train_chrs = [c for c in chromosomes if c != chrom_val]
        df_train = df_train_full[df_train_full["chr"].isin(train_chrs)].copy()
        df_val   = df_val_full[df_val_full["chr"] == chrom_val].copy()

        # A rank correlation needs at least two rows, and a model needs data.
        if df_train.empty or len(df_val) < 2:
            print(f"Skipping {chrom_val}: {len(df_train)} training rows, "
                  f"{len(df_val)} validation rows")
            continue

        rho = _fit_and_score(df_train, df_val, feature_cols, "pred")
        print(f"📈 {chrom_val} Spearman ρ = {rho:.4f}")

        results.append({"chr": chrom_val, "spearman": rho})

    # ---------------- LOCO SUMMARY ----------------
    # Fixed columns keep the frame usable even if every chromosome was skipped.
    summary_loco = pd.DataFrame(results, columns=["chr", "spearman"])
    summary_loco["spearman"] = summary_loco["spearman"].astype(float)
    mean_loco = summary_loco["spearman"].mean()
    print(f"\nMean Spearman (LOCO {train_name}→{val_name}): {mean_loco:.4f}")

    # ---------------- Full Model Baseline ----------------
    # Uses every row of both tables, i.e. also chromosomes outside `chromosomes`
    # if the files contain any.
    print("\n🏋️ Training Full Model (all chr)...")
    rho_full = _fit_and_score(df_train_full, df_val_full, feature_cols, "pred_full")
    print(f"\n🌍 Full model ({train_name}→{val_name}) Spearman ρ = {rho_full:.4f}")

    # ---------------- Comparison ----------------
    # delta_vs_full compares each per-chromosome rho with the single pooled rho
    # of the full model.
    summary_loco["direction"] = f"{train_name}→{val_name}"
    summary_loco["full_model_rho"] = rho_full
    summary_loco["delta_vs_full"] = summary_loco["spearman"] - rho_full

    print("\n===== LOCO vs Full Model Comparison =====")
    print(summary_loco.sort_values("delta_vs_full", ascending=False))
    print(f"\nAverage Δ(LOCO − Full): {summary_loco['delta_vs_full'].mean():.4f}")

    return summary_loco


# ============================================================
#                  RUN BOTH DIRECTIONS
# ============================================================
# Each call reads both tables from DATA_DIR and trains one model per chromosome
# plus one full-data baseline.
summary_X1_to_X2 = run_loco("X1", "X2")
summary_X2_to_X1 = run_loco("X2", "X1")

# ============================================================
#                  COMBINED SUMMARY
# ============================================================
# Stack both directions; the "direction" column tells the rows apart.
summary_all = pd.concat([summary_X1_to_X2, summary_X2_to_X1], ignore_index=True)
print("\n===== Combined LOCO Summary =====")
print(summary_all)

# Save results: written unconditionally as a TSV in the current working directory.
summary_all.to_csv("summary_LOCO_X1X2_comparison.tsv", sep="\t", index=False)

# ============================================================
#                  GLOBAL SUMMARY STATS
# ============================================================
print("\n📊 Mean Spearman by direction:")
print(summary_all.groupby("direction")["spearman"].mean())

print("\n📊 Mean Δ(LOCO − Full):")
print(summary_all.groupby("direction")["delta_vs_full"].mean())


🔒 Global seed set to 42

 🔁 LOCO: X1 → X2

🚀 Leave-One-Chromosome-Out: chr2
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.0796673	val's rmse: 0.206367
Early stopping, best iteration is:
[114]	train's rmse: 0.099182	val's rmse: 0.205459
📈 chr2 Spearman ρ = 0.7228

🚀 Leave-One-Chromosome-Out: chr3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[92]	train's rmse: 0.106023	val's rmse: 0.205065
📈 chr3 Spearman ρ = 0.7405

🚀 Leave-One-Chromosome-Out: chr4
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.0818229	val's rmse: 0.1918
Early stopping, best iteration is:
[105]	train's rmse: 0.103672	val's rmse: 0.188282
📈 chr4 Spearman ρ = 0.7589

🚀 Leave-One-Chromosome-Out: chr5
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.0807501	val's rmse: 0.217726
Early stopping, best iteration is:
[159]	train's rmse: 0.0890291	val's rmse: 0.214984
📈 

# Train on X1 and X2 combined; predict held-out chromosomes

In [16]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr

# ============================================================
#                  GLOBAL REPRODUCIBILITY
# ============================================================
# Seed Python's and NumPy's global RNGs. The same SEED is also handed to
# LightGBM through params["seed"] further down.
SEED = 42
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it
# here presumably only affects child processes -- confirm whether it is needed.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  CONFIGURATION
# ============================================================
X1_PATH = "../preprocessed_data/CAGE-merged/X1_features.tsv"
X2_PATH = "../preprocessed_data/CAGE-merged/X2_features.tsv"

# Columns that are excluded from the feature matrix (TARGET_COL is one of them).
META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
TARGET_COL = "gex_rank"

# Chromosomes (autosomes 2–22)
chromosomes = [f"chr{i}" for i in range(2, 23)]
folds = [chromosomes[i::5] for i in range(5)]  # five folds, roughly 4 chromosomes per fold
print("🧩 Chromosome folds:")
for i, fset in enumerate(folds):
    print(f"Fold {i+1}: {fset}")

# ============================================================
#                  DATA LOADING & MERGING
# ============================================================
df_X1 = pd.read_csv(X1_PATH, sep="\t")
df_X2 = pd.read_csv(X2_PATH, sep="\t")

# Add a cell-line label to make later analysis easier
df_X1["cell_line"] = "X1"
df_X2["cell_line"] = "X2"

# Merge both tables into one complete dataset
df_full = pd.concat([df_X1, df_X2], ignore_index=True)
# "cell_line" is bookkeeping only and is kept out of the feature matrix.
feature_cols = [c for c in df_full.columns if c not in META_COLS + ["cell_line"]]

print(f"Combined data shape: {df_full.shape}")
print(f"Feature count: {len(feature_cols)}")

# ============================================================
#                  LIGHTGBM PARAMETERS
# ============================================================
# Shared by every fold: RMSE regression on TARGET_COL with row bagging
# (80% of rows, re-drawn every 3 iterations) and 90% feature sub-sampling.
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "seed": SEED,
    "deterministic": True,
    "force_row_wise": True,
}

# ============================================================
#                  CHROMOSOME-BASED K-FOLD CV
# ============================================================
# One model per fold. Training and validation rows both come from the combined
# table (both cell lines); only the chromosome decides which side a row is on.
# `results` collects one dict per fold (fold number, held-out chromosomes,
# overall Spearman, list of per-chromosome Spearman values).
results = []

for fold_idx, val_chrs in enumerate(folds):
    print(f"\n🚀 Fold {fold_idx+1} | Val chromosomes: {val_chrs}")

    # -----------------------------
    # Split by chromosome
    # -----------------------------
    train_chrs = [c for c in chromosomes if c not in val_chrs]
    # .copy() so the "pred" column added below never touches df_full.
    df_train = df_full[df_full["chr"].isin(train_chrs)].copy()
    df_val   = df_full[df_full["chr"].isin(val_chrs)].copy()

    # -----------------------------
    # Prepare X, y
    # -----------------------------
    X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
    X_val, y_val     = df_val[feature_cols], df_val[TARGET_COL]

    # -----------------------------
    # LightGBM Dataset
    # -----------------------------
    dtrain = lgb.Dataset(X_train, label=y_train)
    # The validation Dataset is tied to the training Dataset via `reference`.
    dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # -----------------------------
    # Train model
    # -----------------------------
    # Up to 2000 boosting rounds; stops once the monitored metric has not
    # improved for 100 rounds. Metrics are logged every 200 rounds.
    # NOTE(review): early stopping monitors the same validation rows that are
    # scored below, so the reported Spearman may be optimistic -- confirm
    # whether a separate early-stopping split is wanted.
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ],
    )

    # -----------------------------
    # Predictions
    # -----------------------------
    # Predict with the iteration chosen by early stopping.
    df_val["pred"] = model.predict(X_val, num_iteration=model.best_iteration)

    # -----------------------------
    # Evaluate Spearman correlation
    # -----------------------------
    # Pools the held-out rows of both cell lines into one correlation.
    overall_spearman = spearmanr(df_val[TARGET_COL], df_val["pred"])[0]
    print(f"\n📈 Fold {fold_idx+1} Overall Spearman ρ = {overall_spearman:.4f}")

    # ---- per-chr correlation ----
    chr_corrs = []
    for chrom, subdf in df_val.groupby("chr"):
        # A rank correlation needs at least two rows.
        if len(subdf) < 2:
            continue
        rho = spearmanr(subdf[TARGET_COL], subdf["pred"])[0]
        chr_corrs.append({"chr": chrom, "spearman": rho})
        print(f"   {chrom:<6s}: ρ = {rho:.4f}")

    results.append({
        "fold": fold_idx + 1,
        "val_chr": val_chrs,
        "overall_spearman": overall_spearman,
        "per_chr": chr_corrs
    })

# ============================================================
#                  SUMMARY
# ============================================================
# One row per fold: fold number, comma-joined held-out chromosomes, overall rho.
summary_fold = pd.DataFrame([
    {"fold": r["fold"], "val_chr": ",".join(r["val_chr"]), "overall_spearman": r["overall_spearman"]}
    for r in results
])

# --- Collect every per-chromosome Spearman value (one row per fold/chromosome) ---
chr_rows = []
for r in results:
    for c in r["per_chr"]:
        chr_rows.append({"fold": r["fold"], "chr": c["chr"], "spearman": c["spearman"]})
summary_chr = pd.DataFrame(chr_rows)

# --- Compute the mean for each chromosome ---
# Each chromosome belongs to exactly one fold, so every "mean" here is taken
# over a single value.
chr_mean = summary_chr.groupby("chr", as_index=False)["spearman"].mean().rename(columns={"spearman": "mean_spearman"})

# ============================================================
#                  PRINT RESULTS
# ============================================================
print("\n===== Cross-Chromosome Fold Summary =====")
print(summary_fold)
print(f"\nMean Spearman (across folds): {summary_fold['overall_spearman'].mean():.4f}")

print("\n===== Per-Chromosome Mean Spearman =====")
print(chr_mean.sort_values("mean_spearman", ascending=False))
print(f"\nOverall mean of per-chromosome means: {chr_mean['mean_spearman'].mean():.4f}")


🔒 Global seed set to 42
🧩 Chromosome folds:
Fold 1: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Fold 2: ['chr3', 'chr8', 'chr13', 'chr18']
Fold 3: ['chr4', 'chr9', 'chr14', 'chr19']
Fold 4: ['chr5', 'chr10', 'chr15', 'chr20']
Fold 5: ['chr6', 'chr11', 'chr16', 'chr21']
Combined data shape: (32568, 466)
Feature count: 456

🚀 Fold 1 | Val chromosomes: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.0968828	val's rmse: 0.148716
Early stopping, best iteration is:
[253]	train's rmse: 0.0883764	val's rmse: 0.148628

📈 Fold 1 Overall Spearman ρ = 0.7623
   chr12 : ρ = 0.7963
   chr17 : ρ = 0.7598
   chr2  : ρ = 0.7719
   chr22 : ρ = 0.7671
   chr7  : ρ = 0.7796

🚀 Fold 2 | Val chromosomes: ['chr3', 'chr8', 'chr13', 'chr18']
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.10283	val's rmse: 0.148603
Early stopping, best iteration is:
[138]	train's rmse: 0.113515	val's rmse: 0.1484