# Features engineer

In [None]:
import os
import random
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import KFold

# --- Models ---
import lightgbm as lgb
from sklearn.linear_model import Lasso
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier


# ============================================================
# MAIN FUNCTION
# ============================================================
def run_masked_training(
    train_path,
    val_path,
    meta_cols,
    target_rank="gex_rank",
    target_binary="gex_binary",
    features_path=None,
    test_path=None,
    model_type="lightgbm",       # lightgbm, xgboost, catboost, lasso
    mask_threshold=0.4,
    n_inner_folds=5,
    seed=42,
):
    """
    🧬 Nested CV + Masked Regression Training Framework

    Supports: LightGBM / XGBoost / CatBoost / Lasso
    Supports: Multiple masking methods ("mean", "all", "union", "soft", "none")
    Optional: Test prediction (average across outer folds)
    """

    # ============================================================
    # GLOBAL SEED
    # ============================================================
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    print(f"🔒 Global seed set to {seed}")

    # ============================================================
    # LOAD DATA
    # ============================================================
    df_train_full = pd.read_csv(train_path, sep="\t")
    df_val_full   = pd.read_csv(val_path, sep="\t")

    # Setup binary label
    df_train_full[target_binary] = (df_train_full["gex"] > 0.0).astype(int)
    df_val_full[target_binary]   = (df_val_full["gex"] > 0.0).astype(int)

    # Feature selection
    if features_path and os.path.exists(features_path):
        feature_cols = pd.read_csv(features_path, sep="\t")["feature"].tolist()
        print(f"✅ Loaded {len(feature_cols)} selected features from {features_path}")
    else:
        feature_cols = [c for c in df_train_full.columns if c not in meta_cols + [target_binary]]
        print(f"⚙️ Using all {len(feature_cols)} features")

    # Optional test
    df_test_full = None
    if test_path and os.path.exists(test_path):
        df_test_full = pd.read_csv(test_path, sep="\t")
        print(f"🧪 Test file loaded: {test_path} (shape={df_test_full.shape})")

    chromosomes = [f"chr{i}" for i in range(2, 23)]

    # ============================================================
    # MODEL DEFINITIONS
    # ============================================================
    def make_models():
        """Return classifier and regressor based on model_type."""
        if model_type == "lightgbm":
            params_bin = {
                "objective": "binary",
                "metric": ["auc"],
                "learning_rate": 0.016676974956976915,
                "num_leaves": 48,
                "max_depth": 8,
                "feature_fraction": 0.64561553423692,
                "bagging_fraction": 0.8113835038425429,
                "bagging_freq": 6,
                "lambda_l1": 0.3316673054635859,
                "lambda_l2": 0.8969317795206216,
                "min_gain_to_split": 0.04923442843722911,
                "min_data_in_leaf": 38,
                "verbose": -1,
                "seed": SEED,
            }
            params_reg = params_bin.copy()
            params_reg.update({"objective": "regression", "metric": "rmse"})
            return params_bin, params_reg

        elif model_type == "xgboost":
            params_bin = XGBClassifier(
                n_estimators=1000,
                learning_rate=0.035962471748043,
                max_depth=10,
                subsample=0.8971619038015882,
                colsample_bytree=0.6636371114832225,
                reg_alpha=0.12734537736853208,
                reg_lambda=1.628472451981166,
                gamma=0.0024413287883050493,
                min_child_weight=53,
                eval_metric="auc",
                random_state=seed,
                tree_method="hist"
            )

            params_reg = {
                "booster": "gbtree",
                "eval_metric": "mae",
                "learning_rate": 0.035962471748043,
                "max_depth": 10,
                "max_leaves": 104,
                "subsample": 0.8971619038015882,
                "colsample_bytree": 0.6636371114832225,
                "alpha": 0.12734537736853208,
                "lambda": 1.628472451981166,
                "gamma": 0.0024413287883050493,
                "min_child_weight": 53,
                "random_state": seed
            }
            return params_bin, params_reg

        elif model_type == "catboost":
            params_bin = CatBoostClassifier(
                iterations=1000,
                depth=10,
                learning_rate=0.035,
                l2_leaf_reg=3.0,
                subsample=0.9,
                random_seed=seed,
                verbose=False
            )
            params_reg = CatBoostRegressor(
                iterations=1000,
                depth=10,
                learning_rate=0.035,
                l2_leaf_reg=3.0,
                subsample=0.9,
                random_seed=seed,
                verbose=False
            )
            return params_bin, params_reg

        elif model_type == "lasso":
            return Lasso(alpha=0.001, random_state=seed), Lasso(alpha=0.001, random_state=seed)

        else:
            raise ValueError(f"❌ Unknown model_type: {model_type}")

    params_bin, params_reg = make_models()

    # ============================================================
    # RESULTS STORAGE
    # ============================================================
    results_summary = []
    results_inner = []
    test_preds_binary = []
    test_preds_reg = []

    # ============================================================
    # OUTER LOOP (Leave-One-Chromosome)
    # ============================================================
    for val_chr in chromosomes:
    print(f"\n🚀 Outer Fold: Leave-one-chromosome = {val_chr}")

    # 合併 train + val 作為全集
    df_full = pd.concat([df_train_full, df_val_full], ignore_index=True)
    df_train = df_full[df_full["chr"] != val_chr].copy()
    df_val   = df_full[df_full["chr"] == val_chr].copy()

    inner_chrs = [c for c in chromosomes if c != val_chr]
    folds = [inner_chrs[i::n_inner_folds] for i in range(n_inner_folds)]

    preds_prob_test_folds = []
    preds_reg_test_folds  = []
    inner_results = []

    for fold_idx, val_chrs in enumerate(folds):
        df_inner_train = df_train[~df_train["chr"].isin(val_chrs)]
        df_inner_val   = df_train[df_train["chr"].isin(val_chrs)]

        X_train, y_train_bin, y_train_reg = (
            df_inner_train[feature_cols],
            df_inner_train[target_binary],
            df_inner_train[target_rank],
        )
        X_val, y_val_bin, y_val_reg = (
            df_inner_val[feature_cols],
            df_inner_val[target_binary],
            df_inner_val[target_rank],
        )

        # Train models
        if model_type == "lightgbm":
            dtrain_bin = lgb.Dataset(X_train, label=y_train_bin)
            dval_bin   = lgb.Dataset(X_val, label=y_val_bin)
            model_bin = lgb.train(params_bin, dtrain_bin, valid_sets=[dval_bin], verbose_eval=False)

            dtrain_reg = lgb.Dataset(X_train, label=y_train_reg)
            dval_reg   = lgb.Dataset(X_val, label=y_val_reg)
            model_reg = lgb.train(params_reg, dtrain_reg, valid_sets=[dval_reg], verbose_eval=False)
        elif model_type == "xgboost":
            model_bin = params_bin.fit(X_train, y_train_bin)
            model_reg = XGBRegressor(**params_reg).fit(X_train, y_train_reg)
        elif model_type == "catboost":
            model_bin = params_bin.fit(X_train, y_train_bin)
            model_reg = params_reg.fit(X_train, y_train_reg)
        else:
            model_bin = params_bin.fit(X_train, y_train_bin)
            model_reg = params_reg.fit(X_train, y_train_reg)

        # Validation predictions
        pred_prob_val = model_bin.predict(X_val)
        pred_reg_val  = model_reg.predict(X_val)
        rho_fold = spearmanr(y_val_reg, pred_reg_val)[0]

        pred_masked_val = pred_reg_val * (pred_prob_val >= mask_threshold).astype(int)
        rho_mask_fold = spearmanr(y_val_reg, pred_masked_val)[0]

        inner_results.append({"inner_fold": fold_idx+1, "rho_reg": rho_fold, "rho_masked": rho_mask_fold})
        print(f"   Inner Fold {fold_idx+1} → ρ_reg={rho_fold:.4f}, ρ_masked={rho_mask_fold:.4f}")

        # Outer test chromosome prediction
        X_outer = df_val[feature_cols]
        preds_prob_test_folds.append(model_bin.predict(X_outer))
        preds_reg_test_folds.append(model_reg.predict(X_outer))

        # Test set prediction (optional)
        if df_test_full is not None:
            X_test = df_test_full[feature_cols]
            test_preds_binary.append(model_bin.predict(X_test))
            test_preds_reg.append(model_reg.predict(X_test))

    # Average over inner folds
    mean_prob = np.mean(preds_prob_test_folds, axis=0)
    mean_reg  = np.mean(preds_reg_test_folds, axis=0)

    rho_outer_nomask = spearmanr(df_val[target_rank], mean_reg)[0]
    rho_outer_masked = spearmanr(df_val[target_rank], mean_reg * (mean_prob >= mask_threshold))[0]

    results_summary.append({
        "chr": val_chr,
        "rho_nomask": rho_outer_nomask,
        "rho_masked": rho_outer_masked
    })
    results_inner.extend(inner_results)
    print(f"📊 Outer Fold {val_chr}: ρ_nomask={rho_outer_nomask:.4f}, ρ_masked={rho_outer_masked:.4f}")


# Train on rank on groupby chr

In [None]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr

# ============================================================
#                  GLOBAL REPRODUCIBILITY
# ============================================================
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  CONFIGURATION
# ============================================================
TRAIN_PATH = "../preprocessed_data/reference/1. merged data/with_y_250/X1_all_logzscore_logzscore_with_y.tsv"
VAL_PATH   = "../preprocessed_data/reference/1. merged data/with_y_250/X2_all_logzscore_logzscore_with_y.tsv"

META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]

TARGET_COL = "gex_rank"

# Chromosomes (autosomes 2–22)
chromosomes = [f"chr{i}" for i in range(2, 23)]
folds = [chromosomes[i::5] for i in range(5)]

print("🧩 Chromosome folds:")
for i, fset in enumerate(folds):
    print(f"Fold {i+1}: {fset}")

# ============================================================
#                  DATA LOADING
# ============================================================
df_train_full = pd.read_csv(TRAIN_PATH, sep="\t")
df_val_full   = pd.read_csv(VAL_PATH, sep="\t")
feature_cols = [c for c in df_train_full.columns if c not in META_COLS]
print(f"Feature count: {len(feature_cols)}")

# ============================================================
#                  LIGHTGBM PARAMETERS
# ============================================================
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "seed": SEED,
    "deterministic": True,   # reproducibility flag (>= LGBM 4.0)
    "force_row_wise": True,  # prevent threading non-determinism
}

# ============================================================
#                  CROSS-CHROMOSOME TRAINING
# ============================================================
results = []

for fold_idx, val_chrs in enumerate(folds):
    print(f"\n🚀 Fold {fold_idx+1} | Val chromosomes: {val_chrs}")
    
    # -----------------------------
    # Split by chromosome
    # -----------------------------
    train_chrs = [c for c in chromosomes if c not in val_chrs]
    df_train = df_train_full[df_train_full["chr"].isin(train_chrs)].copy()
    df_val   = df_val_full[df_val_full["chr"].isin(val_chrs)].copy()

    # -----------------------------
    # Prepare X, y
    # -----------------------------
    X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
    X_val, y_val     = df_val[feature_cols], df_val[TARGET_COL]

    # -----------------------------
    # LightGBM Dataset
    # -----------------------------
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # -----------------------------
    # Train model
    # -----------------------------
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ],
    )

    # -----------------------------
    # Predictions
    # -----------------------------
    df_val["pred"] = model.predict(X_val, num_iteration=model.best_iteration)

    # -----------------------------
    # Evaluate Spearman correlation
    # -----------------------------
    overall_spearman = spearmanr(df_val[TARGET_COL], df_val["pred"])[0]
    print(f"\n📈 Fold {fold_idx+1} Overall Spearman ρ = {overall_spearman:.4f}")
    print("📊 Per-Chromosome Spearman Correlation:")

    chr_corrs = []
    for chrom, subdf in df_val.groupby("chr"):
        if len(subdf) < 2:
            continue
        rho = spearmanr(subdf[TARGET_COL], subdf["pred"])[0]
        chr_corrs.append({"chr": chrom, "spearman": rho})
        print(f"   {chrom:<6s}: ρ = {rho:.4f}")

    results.append({
        "fold": fold_idx + 1,
        "val_chr": val_chrs,
        "overall_spearman": overall_spearman,
        "per_chr": chr_corrs
    })

# ============================================================
#                  SUMMARY
# ============================================================
summary_fold = pd.DataFrame([
    {"fold": r["fold"], "val_chr": ",".join(r["val_chr"]), "overall_spearman": r["overall_spearman"]}
    for r in results
])

# --- 收集所有 per-chr spearman ---
chr_rows = []
for r in results:
    for c in r["per_chr"]:
        chr_rows.append({"fold": r["fold"], "chr": c["chr"], "spearman": c["spearman"]})
summary_chr = pd.DataFrame(chr_rows)

# --- 計算每個 chr 的平均 ---
chr_mean = summary_chr.groupby("chr", as_index=False)["spearman"].mean().rename(columns={"spearman": "mean_spearman"})

# ============================================================
#                  PRINT RESULTS
# ============================================================
print("\n===== Cross-Chromosome Fold Summary =====")
print(summary_fold)
print(f"\nMean Spearman (across folds): {summary_fold['overall_spearman'].mean():.4f}")

print("\n===== Per-Chromosome Mean Spearman =====")
print(chr_mean.sort_values("mean_spearman", ascending=False))
print(f"\nOverall mean of per-chromosome means: {chr_mean['mean_spearman'].mean():.4f}")


🔒 Global seed set to 42
🧩 Chromosome folds:
Fold 1: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Fold 2: ['chr3', 'chr8', 'chr13', 'chr18']
Fold 3: ['chr4', 'chr9', 'chr14', 'chr19']
Fold 4: ['chr5', 'chr10', 'chr15', 'chr20']
Fold 5: ['chr6', 'chr11', 'chr16', 'chr21']
Feature count: 456

🚀 Fold 1 | Val chromosomes: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.0666904	val's rmse: 0.190953
[400]	train's rmse: 0.0355242	val's rmse: 0.190126
Early stopping, best iteration is:
[381]	train's rmse: 0.0376451	val's rmse: 0.189838

📈 Fold 1 Overall Spearman ρ = 0.6865
📊 Per-Chromosome Spearman Correlation:
   chr12 : ρ = 0.7536
   chr17 : ρ = 0.7440
   chr2  : ρ = 0.7182
   chr22 : ρ = 0.7110
   chr7  : ρ = 0.7591

🚀 Fold 2 | Val chromosomes: ['chr3', 'chr8', 'chr13', 'chr18']
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[85]	train's rmse: 0.105647	val's rmse: 

# Train on rank transform groupby cell line

In [14]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr

# ============================================================
#                  GLOBAL REPRODUCIBILITY
# ============================================================
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  CONFIGURATION
# ============================================================
TRAIN_PATH = "../preprocessed_data/CAGE-merged/X1_all_rank_features.tsv"
VAL_PATH   = "../preprocessed_data/CAGE-merged/X2_all_rank_features.tsv"

META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
TARGET_COL = "gex_rank"

# Chromosomes (autosomes 2–22)
chromosomes = [f"chr{i}" for i in range(2, 23)]
folds = [chromosomes[i::5] for i in range(5)]

print("🧩 Chromosome folds:")
for i, fset in enumerate(folds):
    print(f"Fold {i+1}: {fset}")

# ============================================================
#                  DATA LOADING
# ============================================================
df_train_full = pd.read_csv(TRAIN_PATH, sep="\t")
df_val_full   = pd.read_csv(VAL_PATH, sep="\t")
feature_cols = [c for c in df_train_full.columns if c not in META_COLS]
print(f"Feature count: {len(feature_cols)}")

# ============================================================
#                  LIGHTGBM PARAMETERS
# ============================================================
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "seed": SEED,
    "deterministic": True,   # reproducibility flag (>= LGBM 4.0)
    "force_row_wise": True,  # prevent threading non-determinism
}

# ============================================================
#                  CROSS-CHROMOSOME TRAINING
# ============================================================
results = []

for fold_idx, val_chrs in enumerate(folds):
    print(f"\n🚀 Fold {fold_idx+1} | Val chromosomes: {val_chrs}")
    
    # -----------------------------
    # Split by chromosome
    # -----------------------------
    train_chrs = [c for c in chromosomes if c not in val_chrs]
    df_train = df_train_full[df_train_full["chr"].isin(train_chrs)].copy()
    df_val   = df_val_full[df_val_full["chr"].isin(val_chrs)].copy()

    # -----------------------------
    # Prepare X, y
    # -----------------------------
    X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
    X_val, y_val     = df_val[feature_cols], df_val[TARGET_COL]

    # -----------------------------
    # LightGBM Dataset
    # -----------------------------
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # -----------------------------
    # Train model
    # -----------------------------
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ],
    )

    # -----------------------------
    # Predictions
    # -----------------------------
    df_val["pred"] = model.predict(X_val, num_iteration=model.best_iteration)

    # -----------------------------
    # Evaluate Spearman correlation
    # -----------------------------
    overall_spearman = spearmanr(df_val[TARGET_COL], df_val["pred"])[0]
    print(f"\n📈 Fold {fold_idx+1} Overall Spearman ρ = {overall_spearman:.4f}")
    print("📊 Per-Chromosome Spearman Correlation:")

    chr_corrs = []
    for chrom, subdf in df_val.groupby("chr"):
        if len(subdf) < 2:
            continue
        rho = spearmanr(subdf[TARGET_COL], subdf["pred"])[0]
        chr_corrs.append({"chr": chrom, "spearman": rho})
        print(f"   {chrom:<6s}: ρ = {rho:.4f}")

    results.append({
        "fold": fold_idx + 1,
        "val_chr": val_chrs,
        "overall_spearman": overall_spearman,
        "per_chr": chr_corrs
    })

# ============================================================
#                  SUMMARY
# ============================================================
summary_fold = pd.DataFrame([
    {"fold": r["fold"], "val_chr": ",".join(r["val_chr"]), "overall_spearman": r["overall_spearman"]}
    for r in results
])

# --- 收集所有 per-chr spearman ---
chr_rows = []
for r in results:
    for c in r["per_chr"]:
        chr_rows.append({"fold": r["fold"], "chr": c["chr"], "spearman": c["spearman"]})
summary_chr = pd.DataFrame(chr_rows)

# --- 計算每個 chr 的平均 ---
chr_mean = summary_chr.groupby("chr", as_index=False)["spearman"].mean().rename(columns={"spearman": "mean_spearman"})

# ============================================================
#                  PRINT RESULTS
# ============================================================
print("\n===== Cross-Chromosome Fold Summary =====")
print(summary_fold)
print(f"\nMean Spearman (across folds): {summary_fold['overall_spearman'].mean():.4f}")

print("\n===== Per-Chromosome Mean Spearman =====")
print(chr_mean.sort_values("mean_spearman", ascending=False))
print(f"\nOverall mean of per-chromosome means: {chr_mean['mean_spearman'].mean():.4f}")


🔒 Global seed set to 42
🧩 Chromosome folds:
Fold 1: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Fold 2: ['chr3', 'chr8', 'chr13', 'chr18']
Fold 3: ['chr4', 'chr9', 'chr14', 'chr19']
Fold 4: ['chr5', 'chr10', 'chr15', 'chr20']
Fold 5: ['chr6', 'chr11', 'chr16', 'chr21']
Feature count: 456

🚀 Fold 1 | Val chromosomes: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[99]	train's rmse: 0.0956605	val's rmse: 0.190943

📈 Fold 1 Overall Spearman ρ = 0.7379
📊 Per-Chromosome Spearman Correlation:
   chr12 : ρ = 0.7499
   chr17 : ρ = 0.7369
   chr2  : ρ = 0.7297
   chr22 : ρ = 0.7281
   chr7  : ρ = 0.7449

🚀 Fold 2 | Val chromosomes: ['chr3', 'chr8', 'chr13', 'chr18']
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[87]	train's rmse: 0.105695	val's rmse: 0.197202

📈 Fold 2 Overall Spearman ρ = 0.7439
📊 Per-Chromosome Spearman Correlation:
   chr13 : ρ = 0.

# Train on leave one chr out method on X1 -> X2, X2 -> X1

In [15]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr

# ============================================================
#                  GLOBAL REPRODUCIBILITY
# ============================================================
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  CONFIGURATION
# ============================================================
DATA_DIR = "../preprocessed_data/CAGE-merged"
META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
TARGET_COL = "gex_rank"
chromosomes = [f"chr{i}" for i in range(2, 23)]

# ============================================================
#                  LIGHTGBM PARAMETERS
# ============================================================
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "seed": SEED,
    "deterministic": True,
    "force_row_wise": True,
}

# ============================================================
#                  FUNCTION: LOCO EXPERIMENT
# ============================================================
def run_loco(train_name, val_name):
    """Perform Leave-One-Chromosome-Out CV: train on `train_name`, validate on `val_name`"""
    print(f"\n==============================")
    print(f" 🔁 LOCO: {train_name} → {val_name}")
    print(f"==============================")

    df_train_full = pd.read_csv(f"{DATA_DIR}/{train_name}_all_rank_features.tsv", sep="\t")
    df_val_full   = pd.read_csv(f"{DATA_DIR}/{val_name}_all_rank_features.tsv", sep="\t")
    feature_cols = [c for c in df_train_full.columns if c not in META_COLS]

    results = []

    # ---------------- LOCO LOOP ----------------
    for chrom_val in chromosomes:
        print(f"\n🚀 Leave-One-Chromosome-Out: {chrom_val}")

        train_chrs = [c for c in chromosomes if c != chrom_val]
        df_train = df_train_full[df_train_full["chr"].isin(train_chrs)].copy()
        df_val   = df_val_full[df_val_full["chr"] == chrom_val].copy()

        X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
        X_val, y_val     = df_val[feature_cols], df_val[TARGET_COL]

        dtrain = lgb.Dataset(X_train, label=y_train)
        dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dtrain, dval],
            valid_names=["train", "val"],
            num_boost_round=2000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=200)
            ],
        )

        df_val["pred"] = model.predict(X_val, num_iteration=model.best_iteration)
        rho = spearmanr(df_val[TARGET_COL], df_val["pred"])[0]
        print(f"📈 {chrom_val} Spearman ρ = {rho:.4f}")

        results.append({"chr": chrom_val, "spearman": rho})

    # ---------------- LOCO SUMMARY ----------------
    summary_loco = pd.DataFrame(results)
    mean_loco = summary_loco["spearman"].mean()
    print(f"\nMean Spearman (LOCO {train_name}→{val_name}): {mean_loco:.4f}")

    # ---------------- Full Model Baseline ----------------
    print("\n🏋️ Training Full Model (all chr)...")
    X_train_full, y_train_full = df_train_full[feature_cols], df_train_full[TARGET_COL]
    X_val_full, y_val_full     = df_val_full[feature_cols], df_val_full[TARGET_COL]

    dtrain_full = lgb.Dataset(X_train_full, label=y_train_full)
    dval_full   = lgb.Dataset(X_val_full, label=y_val_full, reference=dtrain_full)

    full_model = lgb.train(
        params,
        dtrain_full,
        valid_sets=[dtrain_full, dval_full],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ],
    )

    df_val_full["pred_full"] = full_model.predict(X_val_full, num_iteration=full_model.best_iteration)
    rho_full = spearmanr(df_val_full[TARGET_COL], df_val_full["pred_full"])[0]
    print(f"\n🌍 Full model ({train_name}→{val_name}) Spearman ρ = {rho_full:.4f}")

    # ---------------- Comparison ----------------
    summary_loco["direction"] = f"{train_name}→{val_name}"
    summary_loco["full_model_rho"] = rho_full
    summary_loco["delta_vs_full"] = summary_loco["spearman"] - rho_full

    print("\n===== LOCO vs Full Model Comparison =====")
    print(summary_loco.sort_values("delta_vs_full", ascending=False))
    print(f"\nAverage Δ(LOCO − Full): {summary_loco['delta_vs_full'].mean():.4f}")

    return summary_loco


# ============================================================
#                  RUN BOTH DIRECTIONS
# ============================================================
summary_X1_to_X2 = run_loco("X1", "X2")
summary_X2_to_X1 = run_loco("X2", "X1")

# ============================================================
#                  COMBINED SUMMARY
# ============================================================
summary_all = pd.concat([summary_X1_to_X2, summary_X2_to_X1], ignore_index=True)
print("\n===== Combined LOCO Summary =====")
print(summary_all)

# Save results (optional)
summary_all.to_csv("summary_LOCO_X1X2_comparison.tsv", sep="\t", index=False)

# ============================================================
#                  GLOBAL SUMMARY STATS
# ============================================================
print("\n📊 Mean Spearman by direction:")
print(summary_all.groupby("direction")["spearman"].mean())

print("\n📊 Mean Δ(LOCO − Full):")
print(summary_all.groupby("direction")["delta_vs_full"].mean())


🔒 Global seed set to 42

 🔁 LOCO: X1 → X2

🚀 Leave-One-Chromosome-Out: chr2
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.0796673	val's rmse: 0.206367
Early stopping, best iteration is:
[114]	train's rmse: 0.099182	val's rmse: 0.205459
📈 chr2 Spearman ρ = 0.7228

🚀 Leave-One-Chromosome-Out: chr3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[92]	train's rmse: 0.106023	val's rmse: 0.205065
📈 chr3 Spearman ρ = 0.7405

🚀 Leave-One-Chromosome-Out: chr4
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.0818229	val's rmse: 0.1918
Early stopping, best iteration is:
[105]	train's rmse: 0.103672	val's rmse: 0.188282
📈 chr4 Spearman ρ = 0.7589

🚀 Leave-One-Chromosome-Out: chr5
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.0807501	val's rmse: 0.217726
Early stopping, best iteration is:
[159]	train's rmse: 0.0890291	val's rmse: 0.214984
📈 

# mix X1, X2 predict leave chr

In [None]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr

# ============================================================
#                  GLOBAL REPRODUCIBILITY
# ============================================================
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  CONFIGURATION
# ============================================================
X1_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X1_all_rank_features_with_y.tsv"
X2_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X2_features.tsv"
TEST_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X3_test.tsv"
FEATURES_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/shap_results/top70_features_pruned/selected_union_features.tsv" 

META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
TARGET_COL = "gex_rank"

# Chromosomes (autosomes 2–22)
chromosomes = [f"chr{i}" for i in range(2, 23)]
folds = [chromosomes[i::5] for i in range(5)]  # 五折，每折約4條chr
print("🧩 Chromosome folds:")
for i, fset in enumerate(folds):
    print(f"Fold {i+1}: {fset}")

# ============================================================
#                  DATA LOADING & MERGING
# ============================================================
df_X1 = pd.read_csv(X1_PATH, sep="\t")
df_X2 = pd.read_csv(X2_PATH, sep="\t")

# 加入 cell line label 方便日後分析
df_X1["cell_line"] = "X1"
df_X2["cell_line"] = "X2"

# 合併成完整 dataset
df_full = pd.concat([df_X1, df_X2], ignore_index=True)
feature_cols = [c for c in df_full.columns if c not in META_COLS + ["cell_line"]]

print(f"Combined data shape: {df_full.shape}")
print(f"Feature count: {len(feature_cols)}")

# ============================================================
#                  LIGHTGBM PARAMETERS
# ============================================================
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "seed": SEED,
    "deterministic": True,
    "force_row_wise": True,
}

# ============================================================
#                  CHROMOSOME-BASED K-FOLD CV
# ============================================================
results = []

for fold_idx, val_chrs in enumerate(folds):
    print(f"\n🚀 Fold {fold_idx+1} | Val chromosomes: {val_chrs}")

    # -----------------------------
    # Split by chromosome
    # -----------------------------
    train_chrs = [c for c in chromosomes if c not in val_chrs]
    df_train = df_full[df_full["chr"].isin(train_chrs)].copy()
    df_val   = df_full[df_full["chr"].isin(val_chrs)].copy()

    # -----------------------------
    # Prepare X, y
    # -----------------------------
    X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
    X_val, y_val     = df_val[feature_cols], df_val[TARGET_COL]

    # -----------------------------
    # LightGBM Dataset
    # -----------------------------
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # -----------------------------
    # Train model
    # -----------------------------
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ],
    )

    # -----------------------------
    # Predictions
    # -----------------------------
    df_val["pred"] = model.predict(X_val, num_iteration=model.best_iteration)

    # -----------------------------
    # Evaluate Spearman correlation
    # -----------------------------
    overall_spearman = spearmanr(df_val[TARGET_COL], df_val["pred"])[0]
    print(f"\n📈 Fold {fold_idx+1} Overall Spearman ρ = {overall_spearman:.4f}")

    # ---- per-chr correlation ----
    chr_corrs = []
    for chrom, subdf in df_val.groupby("chr"):
        if len(subdf) < 2:
            continue
        rho = spearmanr(subdf[TARGET_COL], subdf["pred"])[0]
        chr_corrs.append({"chr": chrom, "spearman": rho})
        print(f"   {chrom:<6s}: ρ = {rho:.4f}")

    results.append({
        "fold": fold_idx + 1,
        "val_chr": val_chrs,
        "overall_spearman": overall_spearman,
        "per_chr": chr_corrs
    })

# ============================================================
#                  SUMMARY
# ============================================================
summary_fold = pd.DataFrame([
    {"fold": r["fold"], "val_chr": ",".join(r["val_chr"]), "overall_spearman": r["overall_spearman"]}
    for r in results
])

# --- 收集所有 per-chr spearman ---
chr_rows = []
for r in results:
    for c in r["per_chr"]:
        chr_rows.append({"fold": r["fold"], "chr": c["chr"], "spearman": c["spearman"]})
summary_chr = pd.DataFrame(chr_rows)

# --- 計算每個 chr 的平均 ---
chr_mean = summary_chr.groupby("chr", as_index=False)["spearman"].mean().rename(columns={"spearman": "mean_spearman"})

# ============================================================
#                  PRINT RESULTS
# ============================================================
print("\n===== Cross-Chromosome Fold Summary =====")
print(summary_fold)
print(f"\nMean Spearman (across folds): {summary_fold['overall_spearman'].mean():.4f}")

print("\n===== Per-Chromosome Mean Spearman =====")
print(chr_mean.sort_values("mean_spearman", ascending=False))
print(f"\nOverall mean of per-chromosome means: {chr_mean['mean_spearman'].mean():.4f}")


🔒 Global seed set to 42
🧩 Chromosome folds:
Fold 1: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Fold 2: ['chr3', 'chr8', 'chr13', 'chr18']
Fold 3: ['chr4', 'chr9', 'chr14', 'chr19']
Fold 4: ['chr5', 'chr10', 'chr15', 'chr20']
Fold 5: ['chr6', 'chr11', 'chr16', 'chr21']
Combined data shape: (32568, 466)
Feature count: 456

🚀 Fold 1 | Val chromosomes: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.0968828	val's rmse: 0.148716
Early stopping, best iteration is:
[253]	train's rmse: 0.0883764	val's rmse: 0.148628

📈 Fold 1 Overall Spearman ρ = 0.7623
   chr12 : ρ = 0.7963
   chr17 : ρ = 0.7598
   chr2  : ρ = 0.7719
   chr22 : ρ = 0.7671
   chr7  : ρ = 0.7796

🚀 Fold 2 | Val chromosomes: ['chr3', 'chr8', 'chr13', 'chr18']
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.10283	val's rmse: 0.148603
Early stopping, best iteration is:
[138]	train's rmse: 0.113515	val's rmse: 0.1484

In [None]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr

# ============================================================
#                  GLOBAL REPRODUCIBILITY
# ============================================================
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  CONFIGURATION
# ============================================================
X1_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X1_all_rank_features_with_y.tsv"
X2_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X2_features.tsv"
TEST_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X3_test.tsv"
FEATURES_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/shap_results/top70_features_pruned/selected_union_features.tsv" 
feature_cols = pd.read_csv(FEATURES_PATH, sep="\t").

META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
TARGET_COL = "gex_rank"

# Chromosomes (autosomes 2–22)
chromosomes = [f"chr{i}" for i in range(2, 23)]
folds = [chromosomes[i::5] for i in range(5)]  # 五折，每折約4條chr
print("🧩 Chromosome folds:")
for i, fset in enumerate(folds):
    print(f"Fold {i+1}: {fset}")

# ============================================================
#                  DATA LOADING & MERGING
# ============================================================
df_X1 = pd.read_csv(X1_PATH, sep="\t")
df_X2 = pd.read_csv(X2_PATH, sep="\t")
df_test = pd.read_csv(TEST_PATH, sep="\t")

# 合併成完整 dataset
df_full = pd.concat([df_X1, df_X2], ignore_index=True)
feature_df = pd.read_csv(FEATURES_PATH, sep="\t")
feature_cols = feature_df['feature'].tolist()

print(f"Combined data shape: {df_full.shape}")
print(f"Feature count: {len(feature_cols)}")

# ============================================================
#                  LIGHTGBM PARAMETERS
# ============================================================
params_bin = {
    "objective": "binary",
    "metric": ["auc"],
    "learning_rate": 0.016676974956976915,
    "num_leaves": 48,
    "max_depth": 8,
    "feature_fraction": 0.64561553423692,
    "bagging_fraction": 0.8113835038425429,
    "bagging_freq": 6,
    "lambda_l1": 0.3316673054635859,
    "lambda_l2": 0.8969317795206216,
    "min_gain_to_split": 0.04923442843722911,
    "min_data_in_leaf": 38,
    "verbose": -1,
    "seed": SEED,
}

params_reg = params_bin.copy()
params_reg.update({
    "objective": "regression",
    "metric": "rmse"
})

# ============================================================
#                  CHROMOSOME-BASED K-FOLD CV
# ============================================================
results = []
test_rg_result = []
test_bn_result = []

for fold_idx, val_chrs in enumerate(folds):
    print(f"\n🚀 Fold {fold_idx+1} | Val chromosomes: {val_chrs}")

    # -----------------------------
    # Split by chromosome
    # -----------------------------
    train_chrs = [c for c in chromosomes if c not in val_chrs]
    df_train = df_full[df_full["chr"].isin(train_chrs)].copy()
    df_val   = df_full[df_full["chr"].isin(val_chrs)].copy()

    # -----------------------------
    # Prepare X, y
    # -----------------------------
    X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
    X_val, y_val     = df_val[feature_cols], df_val[TARGET_COL]

    # -----------------------------
    # LightGBM Dataset
    # -----------------------------
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # -----------------------------
    # Train model
    # -----------------------------
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ],
    )

    # -----------------------------
    # Predictions
    # -----------------------------
    df_val["pred"] = model.predict(X_val, num_iteration=model.best_iteration)
    test_result.append(model.predict(df_test[feature_cols], num_iteration=model.best_iteration))
    # -----------------------------
    # Evaluate Spearman correlation
    # -----------------------------
    overall_spearman = spearmanr(df_val[TARGET_COL], df_val["pred"])[0]
    print(f"\n📈 Fold {fold_idx+1} Overall Spearman ρ = {overall_spearman:.4f}")



# ============================================================
#                  SUMMARY
# ============================================================
summary_fold = pd.DataFrame([
    {"fold": r["fold"], "val_chr": ",".join(r["val_chr"]), "overall_spearman": r["overall_spearman"]}
    for r in results
])

# --- 收集所有 per-chr spearman ---
chr_rows = []
for r in results:
    for c in r["per_chr"]:
        chr_rows.append({"fold": r["fold"], "chr": c["chr"], "spearman": c["spearman"]})
summary_chr = pd.DataFrame(chr_rows)

# --- 計算每個 chr 的平均 ---
chr_mean = summary_chr.groupby("chr", as_index=False)["spearman"].mean().rename(columns={"spearman": "mean_spearman"})

# ============================================================
#                  PRINT RESULTS
# ============================================================
print("\n===== Cross-Chromosome Fold Summary =====")
print(summary_fold)
print(f"\nMean Spearman (across folds): {summary_fold['overall_spearman'].mean():.4f}")

print("\n===== Per-Chromosome Mean Spearman =====")
print(chr_mean.sort_values("mean_spearman", ascending=False))
print(f"\nOverall mean of per-chromosome means: {chr_mean['mean_spearman'].mean():.4f}")


In [None]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import KFold
import pickle   # ✅ 新增
import json
from datetime import datetime

# ============================================================
# GLOBAL CONFIG
# ============================================================
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
print(f"🔒 Global seed set to {SEED}")

TRAIN_1_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X1_all_rank_features_with_y.tsv"
TRAIN_2_PATH   = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X2_all_rank_features_with_y.tsv"
FEATURES_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/shap_results/top70_features_pruned/selected_union_features.tsv"
TEST_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X3_test.tsv"
OUTPUT_DIR = "./results/lgbm/final_model/"

META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
TARGET_RANK = "gex_rank"
TARGET_BINARY = "gex_binary"

chromosomes = [f"chr{i}" for i in range(2, 23)]
N_INNER_FOLDS = 5
MASK_THRESHOLD = 0.4
USE_MASK = False

os.makedirs(OUTPUT_DIR, exist_ok=True)
LOG_PATH = os.path.join(OUTPUT_DIR, "log.txt")

# ✅ 如果已有舊的 log.txt，直接覆蓋（清空）
with open(LOG_PATH, "w") as f:
    f.write("")  # 或寫入 header，如 "==== New Experiment ====\n"

def log(msg):
    """同時印出並寫入 log.txt"""
    print(msg)
    with open(LOG_PATH, "a") as f:
        f.write(f"{msg}\n")


log(f"🚀 Experiment started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
log(f"Experiment directory: {OUTPUT_DIR}")


# ============================================================
# LOAD DATA
# ============================================================
df_train_1_full = pd.read_csv(TRAIN_1_PATH, sep="\t")
df_train_2_full = pd.read_csv(TRAIN_2_PATH, sep="\t")
df_test = pd.read_csv(TEST_PATH, sep="\t")

if FEATURES_PATH is not None:
    selected_features = pd.read_csv(FEATURES_PATH, sep="\t")["feature"].tolist()
    feature_cols = [c for c in selected_features if c not in META_COLS]
    log(f"🔑 Loaded {len(feature_cols)} selected features from {FEATURES_PATH}")
else:
    feature_cols = [c for c in df_train_1_full.columns if c not in META_COLS]

if USE_MASK:
    df_train_1_full[TARGET_BINARY] = (df_train_1_full["gex"] > 0.0).astype(int)
    df_train_2_full[TARGET_BINARY]   = (df_train_2_full["gex"] > 0.0).astype(int)
    
df_full = pd.concat([df_train_1_full, df_train_2_full], ignore_index=True)

log(f"Feature count: {len(feature_cols)}")

# ============================================================
# MODEL PARAMS
# ============================================================
params_bin = {
    "objective": "binary",
    "metric": ["auc"],
    "learning_rate": 0.016676974956976915,
    "num_leaves": 48,
    "max_depth": 8,
    "feature_fraction": 0.64561553423692,
    "bagging_fraction": 0.8113835038425429,
    "bagging_freq": 6,
    "lambda_l1": 0.3316673054635859,
    "lambda_l2": 0.8969317795206216,
    "min_gain_to_split": 0.04923442843722911,
    "min_data_in_leaf": 38,
    "verbose": -1,
    "seed": SEED,
}

params_reg = params_bin.copy()
params_reg.update({"objective": "regression", "metric": "rmse"})
# ============================================================
#                  SAVE CONFIG
# ============================================================
config = {
    "train_1_path": TRAIN_1_PATH,
    "train_2_path": TRAIN_2_PATH,
    "test_path": TEST_PATH,
    "seed": SEED,
    "params_reg": params_reg,
    "folds": N_INNER_FOLDS,
    "target_col": TARGET_RANK,
    "use_mask": USE_MASK,
    "feature": feature_cols,
}
if USE_MASK:
    config["params_bin"] = params_bin
    config["mask_threshold"] = MASK_THRESHOLD
    
with open(os.path.join(OUTPUT_DIR, "config.json"), "w") as f:
    json.dump(config, f, indent=4)

# ============================================================
# MAIN LOOP: LEAVE-ONE-CHROMOSOME
# ============================================================
results_summary = []   # 平均 per chromosome
results_inner = []     # 詳細每個 inner fold
test_rg_result_chr = []
test_bn_result_chr = []
for val_chr in chromosomes:
    log(f"\n🚀 Leave-one-chromosome: {val_chr}")

    # Split outer folds
    df_train = df_full[df_full["chr"] != val_chr].copy()
    df_train_test  = df_full[df_full["chr"] == val_chr].copy()

    inner_chrs = [c for c in chromosomes if c != val_chr]
    folds = [inner_chrs[i::5] for i in range(5)]

    pred_train_test_reg_folds = []
    pred_train_test_prob_folds = []

    pred_test_reg_folds = []
    pred_test_prob_folds = []
    log("🧩 Chromosome folds:")

    for fold_idx, fset in enumerate(folds):
        log(f"Fold {fold_idx+1}: {fset}")

        df_inner_train = df_train[df_train["chr"].isin(fset) == False]
        df_inner_val   = df_train[df_train["chr"].isin(fset)]

        # Prepare data
        X_train, y_train_reg = (
            df_inner_train[feature_cols],
            df_inner_train[TARGET_RANK],
        )
        X_val, y_val_reg = (
            df_inner_val[feature_cols],
            df_inner_val[TARGET_RANK],
        )

        if USE_MASK:
            # Apply masking
            y_train_bin = df_inner_train[TARGET_BINARY]
            y_val_bin = df_inner_val[TARGET_BINARY]
            # Binary model
            dtrain_bin = lgb.Dataset(X_train, label=y_train_bin)
            dval_bin   = lgb.Dataset(X_val, label=y_val_bin, reference=dtrain_bin)
            model_bin = lgb.train(
                params_bin,
                dtrain_bin,
                valid_sets=[dtrain_bin, dval_bin],
                num_boost_round=2000,
                callbacks=[lgb.early_stopping(stopping_rounds=200)],
            )

        # Regression model
        dtrain_reg = lgb.Dataset(X_train, label=y_train_reg)
        dval_reg   = lgb.Dataset(X_val, label=y_val_reg, reference=dtrain_reg)
        model_reg = lgb.train(
            params_reg,
            dtrain_reg,
            valid_sets=[dtrain_reg, dval_reg],
            num_boost_round=2000,
            callbacks=[lgb.early_stopping(stopping_rounds=200)],
        )

        # Predict on train test chromosome
        X_train_test = df_train_test[feature_cols]
        pred_train_test_reg  = model_reg.predict(X_train_test)
        pred_train_test_reg_folds.append(pred_train_test_reg)

        # Predict on test chromosome
        X_test = df_test[feature_cols]
        pred_test_reg  = model_reg.predict(X_test)
        pred_test_reg_folds.append(pred_test_reg)
        
        if USE_MASK:
            pred_train_test_prob = model_bin.predict(X_train_test)
            pred_train_test_prob_folds.append(pred_train_test_prob)
            pred_test_prob = model_bin.predict(X_test)
            pred_test_prob_folds.append(pred_test_prob)


        # Evaluate per fold
        rho_fold = spearmanr(df_train_test[TARGET_RANK], pred_train_test_reg)[0]
        if USE_MASK:
            auc_train = roc_auc_score(df_inner_train[TARGET_BINARY], model_bin.predict(X_train))
            auc_val = roc_auc_score(df_inner_val[TARGET_BINARY], model_bin.predict(X_val))
            results_inner.append({
                "outer_chr": val_chr,
                "inner_fold": fold_idx + 1,
                "rho_reg": rho_fold,
                "auc_train_bin": auc_train,
                "auc_val_bin": auc_val
            })  
        else:
            results_inner.append({
                "outer_chr": val_chr,
                "inner_fold": fold_idx + 1,
                "rho_reg": rho_fold,
            })  
        log(f"   Inner Fold {fold_idx+1} → ρ_reg={rho_fold:.4f}")
    # Stack inner predictions
    
    pred_train_test_reg_folds = np.array(pred_train_test_reg_folds)
    mean_train_test_reg = pred_train_test_reg_folds.mean(axis=0)

    # Stack inner predictions
    pred_test_reg_folds = np.array(pred_test_reg_folds)
    mean_test_reg = pred_test_reg_folds.mean(axis=0)

    test_rg_result_chr.append(mean_test_reg)

    if USE_MASK:
        pred_train_test_prob_folds = np.array(pred_train_test_prob_folds)
        mean_train_test_prob = pred_train_test_prob_folds.mean(axis=0)
        pred_test_prob_folds = np.array(pred_test_prob_folds)
        mean_test_prob = pred_test_prob_folds.mean(axis=0)
        test_bn_result_chr.append(mean_test_prob)
        # ============================================================
        # Combine masking strategies
        # ============================================================
        train_test_mask_mean = (mean_train_test_prob >= MASK_THRESHOLD).astype(int)
        train_test_pred_mean_masked = mean_train_test_reg * train_test_mask_mean
        # ============================================================
        # Evaluate aggregated predictions
        # ============================================================
        train_test_rho_mean  = spearmanr(df_train_test[TARGET_RANK], train_test_pred_mean_masked)[0]
        train_test_rho_none  = spearmanr(df_train_test[TARGET_RANK], mean_train_test_reg)[0]
        results_summary.append({
            "chr": val_chr,
            "rho_mean":  train_test_rho_mean,
            "rho_none":  train_test_rho_none,
        })
        log(
            f"📊 {val_chr} → "
            f"No-mask ρ={train_test_rho_none:.4f}"
        )
    else:
        train_test_rho_none  = spearmanr(df_train_test[TARGET_RANK], mean_train_test_reg)[0]
        results_summary.append({
            "chr": val_chr,
            "rho_none":  train_test_rho_none,
        })
        log(
            f"📊 {val_chr} → "
            f"Mean-mask ρ={train_test_rho_mean:.4f} | "
            f"No-mask ρ={train_test_rho_none:.4f}"
        )
    # Stack inner predictions
test_rg_result_chr = np.array(test_rg_result_chr)
mean_test_reg = test_rg_result_chr.mean(axis=0)
if USE_MASK:
    test_bn_result_chr = np.array(test_bn_result_chr)
    mean_test_prob = test_bn_result_chr.mean(axis=0)
    test_predictions_df = pd.DataFrame({
        "gene_name": df_test["gene_name"],
        "mean_bn_prob": mean_test_prob,
        "mean_gex_reg": mean_test_reg
    })
else:
    test_predictions_df = pd.DataFrame({
        "gene_name": df_test["gene_name"],
        "mean_gex_reg": mean_test_reg
    })


# ============================================================
# SUMMARY
# ============================================================
results_df = pd.DataFrame(results_summary)

results_inner_df = pd.DataFrame(results_inner)

log("\n===== Per-Chromosome Summary =====")
log(results_df)
if  USE_MASK:
    log(f"\nAverage ρ (Mean-mask):  {results_df['rho_mean'].mean():.4f}")
log(f"Average ρ (No-mask):    {results_df['rho_none'].mean():.4f}")

log("\n===== Inner Fold Detail (first few rows) =====")
print(results_inner_df.head())

# # Optional: save both
results_inner_df.to_csv(f"{OUTPUT_DIR}/results_inner_folds.tsv", sep="\t", index=False)
test_predictions_df.to_csv(f"{OUTPUT_DIR}/test_predictions_df.tsv", sep="\t", index=False)
log("\n💾 Saved detailed results → results_inner_folds.tsv")
log("\n💾 Saved test predictions → test_predictions_df.tsv")


🔒 Global seed set to 42
🚀 Experiment started at 2025-10-17 22:32:50
Experiment directory: ./results/lgbm/final_model/
🔑 Loaded 63 selected features from ../preprocessed_data/reference/1. merged data/with_y_100_one_side/shap_results/top70_features_pruned/selected_union_features.tsv
Feature count: 63

🚀 Leave-one-chromosome: chr2
🧩 Chromosome folds:
Fold 1: ['chr3', 'chr8', 'chr13', 'chr18']
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[713]	training's rmse: 0.115177	valid_1's rmse: 0.13337
   Inner Fold 1 → ρ_reg=0.7933
Fold 2: ['chr4', 'chr9', 'chr14', 'chr19']
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[652]	training's rmse: 0.114411	valid_1's rmse: 0.136633
   Inner Fold 2 → ρ_reg=0.7915
Fold 3: ['chr5', 'chr10', 'chr15', 'chr20']
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[992]	training's rmse: 0.113872	valid_1's rmse: 0.1353

KeyError: 'rho_mean'

In [None]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import KFold
import pickle   # ✅ 新增
import json
from datetime import datetime

# ============================================================
# GLOBAL CONFIG
# ============================================================
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
print(f"🔒 Global seed set to {SEED}")

TRAIN_1_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X1_all_rank_features_with_y.tsv"
TRAIN_2_PATH   = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X2_all_rank_features_with_y.tsv"
FEATURES_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/shap_results/top70_features_pruned/selected_union_features.tsv"
TEST_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X3_test.tsv"
OUTPUT_DIR = "./results/lgbm/final_model/"

META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
TARGET_RANK = "gex_rank"
TARGET_BINARY = "gex_binary"

chromosomes = [f"chr{i}" for i in range(2, 23)]
N_INNER_FOLDS = 5
MASK_THRESHOLD = 0.4
USE_MASK = False
MIX_TRAIN_1_2 = True
os.makedirs(OUTPUT_DIR, exist_ok=True)
LOG_PATH = os.path.join(OUTPUT_DIR, "log.txt")

# ✅ 如果已有舊的 log.txt，直接覆蓋（清空）
with open(LOG_PATH, "w") as f:
    f.write("")  # 或寫入 header，如 "==== New Experiment ====\n"

def log(msg):
    """同時印出並寫入 log.txt"""
    print(msg)
    with open(LOG_PATH, "a") as f:
        f.write(f"{msg}\n")


log(f"🚀 Experiment started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
log(f"Experiment directory: {OUTPUT_DIR}")


# ============================================================
# LOAD DATA
# ============================================================
df_train_1_full = pd.read_csv(TRAIN_1_PATH, sep="\t")
df_train_2_full = pd.read_csv(TRAIN_2_PATH, sep="\t")
df_test = pd.read_csv(TEST_PATH, sep="\t")

if FEATURES_PATH is not None:
    selected_features = pd.read_csv(FEATURES_PATH, sep="\t")["feature"].tolist()
    feature_cols = [c for c in selected_features if c not in META_COLS]
    log(f"🔑 Loaded {len(feature_cols)} selected features from {FEATURES_PATH}")
else:
    feature_cols = [c for c in df_train_1_full.columns if c not in META_COLS]

if USE_MASK:
    df_train_1_full[TARGET_BINARY] = (df_train_1_full["gex"] > 0.0).astype(int)
    df_train_2_full[TARGET_BINARY]   = (df_train_2_full["gex"] > 0.0).astype(int)

if MIX_TRAIN_1_2:
    df_full = pd.concat([df_train_1_full, df_train_2_full], ignore_index=True)
    log("🔀 Merged TRAIN 1 and TRAIN 2 for training.")

log(f"Feature count: {len(feature_cols)}")

# ============================================================
# MODEL PARAMS
# ============================================================
params_bin = {
    "objective": "binary",
    "metric": ["auc"],
    "learning_rate": 0.016676974956976915,
    "num_leaves": 48,
    "max_depth": 8,
    "feature_fraction": 0.64561553423692,
    "bagging_fraction": 0.8113835038425429,
    "bagging_freq": 6,
    "lambda_l1": 0.3316673054635859,
    "lambda_l2": 0.8969317795206216,
    "min_gain_to_split": 0.04923442843722911,
    "min_data_in_leaf": 38,
    "verbose": -1,
    "seed": SEED,
}

params_reg = params_bin.copy()
params_reg.update({"objective": "regression", "metric": "rmse"})
# ============================================================
#                  SAVE CONFIG
# ============================================================
config = {
    "train_1_path": TRAIN_1_PATH,
    "train_2_path": TRAIN_2_PATH,
    "test_path": TEST_PATH,
    "seed": SEED,
    "params_reg": params_reg,
    "folds": N_INNER_FOLDS,
    "target_col": TARGET_RANK,
    "use_mask": USE_MASK,
    "feature": feature_cols,
}
if USE_MASK:
    config["params_bin"] = params_bin
    config["mask_threshold"] = MASK_THRESHOLD
    
with open(os.path.join(OUTPUT_DIR, "config.json"), "w") as f:
    json.dump(config, f, indent=4)

# ============================================================
# MAIN LOOP: LEAVE-ONE-CHROMOSOME
# ============================================================
results_summary = []   # 平均 per chromosome
results_inner = []     # 詳細每個 inner fold
test_rg_result_chr = []
test_bn_result_chr = []
for val_chr in chromosomes:
    log(f"\n🚀 Leave-one-chromosome: {val_chr}")

    # Split outer folds
    df_train = df_full[df_full["chr"] != val_chr].copy()
    df_train_test  = df_full[df_full["chr"] == val_chr].copy()

    inner_chrs = [c for c in chromosomes if c != val_chr]
    folds = [inner_chrs[i::5] for i in range(5)]

    pred_train_test_reg_folds = []
    pred_train_test_prob_folds = []

    pred_test_reg_folds = []
    pred_test_prob_folds = []
    log("🧩 Chromosome folds:")

    for fold_idx, fset in enumerate(folds):
        log(f"Fold {fold_idx+1}: {fset}")

        df_inner_train = df_train[df_train["chr"].isin(fset) == False]
        df_inner_val   = df_train[df_train["chr"].isin(fset)]

        # Prepare data
        X_train, y_train_reg = (
            df_inner_train[feature_cols],
            df_inner_train[TARGET_RANK],
        )
        X_val, y_val_reg = (
            df_inner_val[feature_cols],
            df_inner_val[TARGET_RANK],
        )

        if USE_MASK:
            # Apply masking
            y_train_bin = df_inner_train[TARGET_BINARY]
            y_val_bin = df_inner_val[TARGET_BINARY]
            # Binary model
            dtrain_bin = lgb.Dataset(X_train, label=y_train_bin)
            dval_bin   = lgb.Dataset(X_val, label=y_val_bin, reference=dtrain_bin)
            model_bin = lgb.train(
                params_bin,
                dtrain_bin,
                valid_sets=[dtrain_bin, dval_bin],
                num_boost_round=2000,
                callbacks=[lgb.early_stopping(stopping_rounds=200)],
            )

        # Regression model
        dtrain_reg = lgb.Dataset(X_train, label=y_train_reg)
        dval_reg   = lgb.Dataset(X_val, label=y_val_reg, reference=dtrain_reg)
        model_reg = lgb.train(
            params_reg,
            dtrain_reg,
            valid_sets=[dtrain_reg, dval_reg],
            num_boost_round=2000,
            callbacks=[lgb.early_stopping(stopping_rounds=200)],
        )

        # Predict on train test chromosome
        X_train_test = df_train_test[feature_cols]
        pred_train_test_reg  = model_reg.predict(X_train_test)
        pred_train_test_reg_folds.append(pred_train_test_reg)

        # Predict on test chromosome
        X_test = df_test[feature_cols]
        pred_test_reg  = model_reg.predict(X_test)
        pred_test_reg_folds.append(pred_test_reg)
        
        if USE_MASK:
            pred_train_test_prob = model_bin.predict(X_train_test)
            pred_train_test_prob_folds.append(pred_train_test_prob)
            pred_test_prob = model_bin.predict(X_test)
            pred_test_prob_folds.append(pred_test_prob)


        # Evaluate per fold
        rho_fold = spearmanr(df_train_test[TARGET_RANK], pred_train_test_reg)[0]
        if USE_MASK:
            auc_train = roc_auc_score(df_inner_train[TARGET_BINARY], model_bin.predict(X_train))
            auc_val = roc_auc_score(df_inner_val[TARGET_BINARY], model_bin.predict(X_val))
            results_inner.append({
                "outer_chr": val_chr,
                "inner_fold": fold_idx + 1,
                "rho_reg": rho_fold,
                "auc_train_bin": auc_train,
                "auc_val_bin": auc_val
            })  
        else:
            results_inner.append({
                "outer_chr": val_chr,
                "inner_fold": fold_idx + 1,
                "rho_reg": rho_fold,
            })  
        log(f"   Inner Fold {fold_idx+1} → ρ_reg={rho_fold:.4f}")
    # Stack inner predictions
    
    pred_train_test_reg_folds = np.array(pred_train_test_reg_folds)
    mean_train_test_reg = pred_train_test_reg_folds.mean(axis=0)

    # Stack inner predictions
    pred_test_reg_folds = np.array(pred_test_reg_folds)
    mean_test_reg = pred_test_reg_folds.mean(axis=0)

    test_rg_result_chr.append(mean_test_reg)

    if USE_MASK:
        pred_train_test_prob_folds = np.array(pred_train_test_prob_folds)
        mean_train_test_prob = pred_train_test_prob_folds.mean(axis=0)
        pred_test_prob_folds = np.array(pred_test_prob_folds)
        mean_test_prob = pred_test_prob_folds.mean(axis=0)
        test_bn_result_chr.append(mean_test_prob)
        # ============================================================
        # Combine masking strategies
        # ============================================================
        train_test_mask_mean = (mean_train_test_prob >= MASK_THRESHOLD).astype(int)
        train_test_pred_mean_masked = mean_train_test_reg * train_test_mask_mean
        # ============================================================
        # Evaluate aggregated predictions
        # ============================================================
        train_test_rho_mean  = spearmanr(df_train_test[TARGET_RANK], train_test_pred_mean_masked)[0]
        train_test_rho_none  = spearmanr(df_train_test[TARGET_RANK], mean_train_test_reg)[0]
        results_summary.append({
            "chr": val_chr,
            "rho_mean":  train_test_rho_mean,
            "rho_none":  train_test_rho_none,
        })
        log(
            f"📊 {val_chr} → "
            f"No-mask ρ={train_test_rho_none:.4f}"
        )
    else:
        train_test_rho_none  = spearmanr(df_train_test[TARGET_RANK], mean_train_test_reg)[0]
        results_summary.append({
            "chr": val_chr,
            "rho_none":  train_test_rho_none,
        })
        log(
            f"📊 {val_chr} → "
            f"Mean-mask ρ={train_test_rho_mean:.4f} | "
            f"No-mask ρ={train_test_rho_none:.4f}"
        )
    # Stack inner predictions
test_rg_result_chr = np.array(test_rg_result_chr)
mean_test_reg = test_rg_result_chr.mean(axis=0)
if USE_MASK:
    test_bn_result_chr = np.array(test_bn_result_chr)
    mean_test_prob = test_bn_result_chr.mean(axis=0)
    test_predictions_df = pd.DataFrame({
        "gene_name": df_test["gene_name"],
        "mean_bn_prob": mean_test_prob,
        "gex_predicted": mean_test_reg
    })
else:
    test_predictions_df = pd.DataFrame({
        "gene_name": df_test["gene_name"],
        "gex_predicted": mean_test_reg
    })


# ============================================================
# SUMMARY
# ============================================================
results_df = pd.DataFrame(results_summary)

results_inner_df = pd.DataFrame(results_inner)

log("\n===== Per-Chromosome Summary =====")
log(results_df)
if  USE_MASK:
    log(f"\nAverage ρ (Mean-mask):  {results_df['rho_mean'].mean():.4f}")
log(f"Average ρ (No-mask):    {results_df['rho_none'].mean():.4f}")

log("\n===== Inner Fold Detail (first few rows) =====")
print(results_inner_df.head())
assert isinstance(mean_test_prob, np.ndarray), 'Prediction array must be a numpy array'
assert np.issubdtype(mean_test_prob.dtype, np.number), 'Prediction array must be numeric'
assert mean_test_prob.shape[0] == len(df_test["gene_name"]), 'Each gene should have a unique predicted expression'
file_name = 'gex_predicted.csv'         # PLEASE DO NOT CHANGE THIS
zip_name = "Wang_Ding_Yang_Project1.zip" # TODO
prediction_save_path = f'{OUTPUT_DIR}/{zip_name}'
compression_options = dict(method="zip", archive_name=file_name)

# # Optional: save both
results_inner_df.to_csv(f"{OUTPUT_DIR}/results_inner_folds.tsv", sep="\t", index=False)
test_predictions_df.to_csv(prediction_save_path, compression=compression_options)
log("\n💾 Saved detailed results → results_inner_folds.tsv")
log("\n💾 Saved test predictions → gex_predicted.tsv")
