In [1]:
# -*- coding: utf-8 -*-
# Jupyter cell — LOCO + chr-disjoint HPO with expanded spaces for CatBoost/XGBoost

import os, json, random, warnings
warnings.filterwarnings("ignore")

import optuna
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from functools import partial
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor

# ========== config ==========
# One fixed seed for Python's and NumPy's global RNGs (also reused by the
# samplers and model parameters further down).
SEED = 42
random.seed(SEED); np.random.seed(SEED)

# Tab-separated feature tables; rows are tagged "X1" / "X2" when loaded.
TRAIN_PATH = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X1_all_rank_features_with_y.tsv"
VAL_PATH   = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/X2_all_rank_features_with_y.tsv"

# Root directory for per-study outputs; created up front if it does not exist.
SAVE_ROOT = "../results/hpo_loco_chr_disjoint"
os.makedirs(SAVE_ROOT, exist_ok=True)

# Non-feature columns: excluded from the model inputs when features are selected.
META_COLS = ["gene_name","chr","gene_start","gene_end","TSS_start","TSS_end","strand","gex","gex_rank"]
# Rank target column name (not referenced by the code visible in this cell).
TARGET_R  = "gex_rank"
# Binary target column name; the column itself is derived from "gex" at load time.
TARGET_B  = "gex_binary"

# trials per study (adjust to the available compute budget)
# Only N_TRIALS_XGB_BIN is consumed by the code visible in this cell.
N_TRIALS_LGBM_BIN = 300
N_TRIALS_XGB_BIN  = 300
N_TRIALS_CAT_BIN  = 300
N_TRIALS_CAT_REG  = 300

K_VAL_CHR = 4  # number of chromosomes held out for validation in each trial

# ========== load ==========
# Read both tables and tag every row with the cell line it came from.
df_x1 = pd.read_csv(TRAIN_PATH, sep="\t"); df_x1["cell_id"] = "X1"
df_x2 = pd.read_csv(VAL_PATH,   sep="\t"); df_x2["cell_id"] = "X2"

# Stack both cell lines into one frame; splits are made later by filtering
# on "cell_id" and "chr".
df_all = pd.concat([df_x1, df_x2], ignore_index=True)
# Binary label: 1 where gex is strictly positive, 0 otherwise.
df_all[TARGET_B] = (df_all["gex"] > 0.0).astype(int)

# features
# Every column that is not metadata, the binary target, or the cell tag.
drop_cols = set(META_COLS + [TARGET_B, "cell_id"])
feature_cols = [c for c in df_all.columns if c not in drop_cols]
# NOTE: plain lexicographic sort, so "chr10" orders before "chr2".
chromosomes = sorted(df_all["chr"].unique())
print(f"🧬 chromosomes: {chromosomes}")
print(f"Feature count: {len(feature_cols)} | Total rows: {len(df_all):,}")

# ========== helpers ==========
def split_loco_by_chromosomes(trial, train_cell, val_cell, k_val=K_VAL_CHR):
    """Build a cross-cell split whose train and validation chromosomes are disjoint.

    ``k_val`` chromosomes are drawn at random as the validation set. Training
    rows are the ``train_cell`` rows on all remaining chromosomes; validation
    rows are the ``val_cell`` rows on the drawn chromosomes.

    The draw is seeded with ``SEED + trial.number`` so that every trial gets
    its own, reproducible, set of validation chromosomes. (Seeding with the
    bare ``SEED`` would hand every trial the identical set.) Note that scores
    of different trials are therefore measured on different chromosome subsets.

    Args:
        trial: Optuna trial (or any object with an integer ``number``
            attribute). Objects without ``number`` use offset 0.
        train_cell: Value of ``cell_id`` to train on.
        val_cell: Value of ``cell_id`` to validate on.
        k_val: Number of chromosomes to hold out for validation.

    Returns:
        Tuple ``(tr, va, val_chrs)``: training frame, validation frame (both
        copies of slices of ``df_all``) and the list of held-out chromosomes.

    Raises:
        ValueError: If ``k_val`` exceeds the number of available chromosomes.
    """
    if k_val > len(chromosomes):
        raise ValueError("k_val exceeds available chromosomes.")

    # Per-trial seed: reproducible across runs, different between trials.
    trial_number = getattr(trial, "number", 0)
    rng = random.Random(SEED + trial_number)
    val_chrs = rng.sample(chromosomes, k_val)
    print(val_chrs)

    in_val = df_all["chr"].isin(val_chrs)
    tr = df_all[(df_all["cell_id"] == train_cell) & ~in_val].copy()
    va = df_all[(df_all["cell_id"] == val_cell) & in_val].copy()
    return tr, va, val_chrs

def mean_per_chr_auc(df_v, y_true, y_pred):
    """Average the ROC AUC computed separately within each chromosome.

    Args:
        df_v: Frame with a "chr" column, row-aligned with the two vectors.
        y_true: Binary labels, one per row of ``df_v``.
        y_pred: Predicted scores, one per row of ``df_v``.

    Returns:
        NaN-ignoring mean of the per-chromosome AUCs, or NaN when no
        chromosome qualifies.
    """
    scored = df_v.assign(_y=np.asarray(y_true), _p=np.asarray(y_pred))

    aucs = []
    for _chrom, grp in scored.groupby("chr"):
        labels = grp["_y"].values
        # Skip groups with fewer than 3 rows or without exactly two classes.
        if len(labels) < 3 or len(np.unique(labels)) != 2:
            continue
        aucs.append(roc_auc_score(labels, grp["_p"].values))

    if not aucs:
        return np.nan
    return np.nanmean(aucs)

def mean_per_chr_spearman(df_v, y_true, y_pred):
    """Average the Spearman correlation computed within each chromosome.

    Args:
        df_v: Frame with a "chr" column, row-aligned with the two vectors.
        y_true: Observed values, one per row of ``df_v``.
        y_pred: Predicted values, one per row of ``df_v``.

    Returns:
        NaN-ignoring mean of the per-chromosome correlations, or NaN when no
        chromosome has at least 3 rows.
    """
    scored = df_v.assign(_y=np.asarray(y_true), _p=np.asarray(y_pred))

    # One coefficient per chromosome that has at least 3 rows.
    rhos = [
        spearmanr(grp["_y"], grp["_p"])[0]
        for _chrom, grp in scored.groupby("chr")
        if len(grp) >= 3
    ]

    if not rhos:
        return np.nan
    return np.nanmean(rhos)

# ==== Replace these two sections: split_loco_by_chromosomes and objective_xgb_bin ====
import xgboost as xgb
from xgboost.core import XGBoostError

print("xgboost version:", xgb.__version__)  # handy for confirming the installed version


def _xgb_params_full(trial):
    """Sample the expanded XGBoost binary-classification search space.

    The ``trial.suggest_*`` calls are issued in a fixed order; keep that order
    when editing, since a seeded sampler's draws depend on it.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        Dict of parameters for ``xgb.train``. Contains ``rate_drop`` and
        ``skip_drop`` only when the sampled booster is "dart".
    """
    booster_kind = trial.suggest_categorical("booster", ["gbtree", "dart"])

    params = {}
    params["objective"] = "binary:logistic"
    params["eval_metric"] = "auc"
    params["booster"] = booster_kind
    params["eta"] = trial.suggest_float("eta", 0.01, 0.2, log=True)
    params["max_depth"] = trial.suggest_int("max_depth", 3, 40)
    params["min_child_weight"] = trial.suggest_float("min_child_weight", 1.0, 30.0)
    params["gamma"] = trial.suggest_float("gamma", 0.0, 5.0)

    # Row / column subsampling fractions share the same range.
    # The bylevel / bynode variants may be unsupported on some older versions;
    # they are tried first and the caller falls back if training rejects them.
    for frac_name in ("subsample", "colsample_bytree",
                      "colsample_bylevel", "colsample_bynode"):
        params[frac_name] = trial.suggest_float(frac_name, 0.5, 1.0)

    params["lambda"] = trial.suggest_float("lambda", 0.0, 5.0)  # L2
    params["alpha"] = trial.suggest_float("alpha", 0.0, 2.0)    # L1
    params["max_delta_step"] = trial.suggest_float("max_delta_step", 0.0, 5.0)
    params["tree_method"] = "hist"
    params["max_bin"] = trial.suggest_int("max_bin", 128, 512, step=64)
    params["grow_policy"] = trial.suggest_categorical(
        "grow_policy", ["depthwise", "lossguide"]
    )
    # "sampling_method" (uniform / gradient_based) is deliberately not tuned:
    # it is prone to version-specific failures.
    params["seed"] = SEED

    if booster_kind == "dart":
        # "sample_type" and "normalize_type" are left out because they do not
        # exist in some older versions.
        for drop_name in ("rate_drop", "skip_drop"):
            params[drop_name] = trial.suggest_float(drop_name, 0.0, 0.3)

    return params

def _xgb_params_safe(trial):
    """Maximally compatible "safe" parameter set (runs on nearly every version).

    Hyperparameters are suggested under ``*_safe`` names so they do not clash
    with the names already used by the full search space in the same trial.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        Dict of parameters for ``xgb.train`` with the booster fixed to "gbtree".
    """
    safe = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": "gbtree",  # fixed to gbtree to stay on the safe side
    }
    safe["eta"] = trial.suggest_float("eta_safe", 0.01, 0.2, log=True)
    safe["max_depth"] = trial.suggest_int("max_depth_safe", 3, 12)
    safe["min_child_weight"] = trial.suggest_float("min_child_weight_safe", 1.0, 10.0)
    safe["gamma"] = trial.suggest_float("gamma_safe", 0.0, 5.0)
    safe["subsample"] = trial.suggest_float("subsample_safe", 0.5, 1.0)
    safe["colsample_bytree"] = trial.suggest_float("colsample_bytree_safe", 0.5, 1.0)
    safe["lambda"] = trial.suggest_float("lambda_safe", 0.0, 5.0)
    safe["alpha"] = trial.suggest_float("alpha_safe", 0.0, 2.0)
    safe["tree_method"] = "hist"
    safe["seed"] = SEED
    return safe

def _xgb_predict_best(bst, dmat):
    """Predict with only the trees up to the early-stopped best iteration.

    The native ``Booster.predict`` uses the full model unless told otherwise,
    and ``best_ntree_limit`` / ``ntree_limit`` no longer exist in xgboost 2.x+.
    Prefer ``best_iteration`` + ``iteration_range``; keep the legacy keyword
    only as a fallback for very old releases.
    """
    best_iteration = getattr(bst, "best_iteration", None)
    if best_iteration is not None:
        try:
            return bst.predict(dmat, iteration_range=(0, int(best_iteration) + 1))
        except TypeError:
            # Old xgboost without the iteration_range keyword: fall through.
            pass
    legacy_limit = getattr(bst, "best_ntree_limit", None)
    if legacy_limit:
        return bst.predict(dmat, ntree_limit=legacy_limit)
    return bst.predict(dmat)


def objective_xgb_bin(trial, train_cell, val_cell):
    """Optuna objective: validation AUC of an XGBoost binary classifier.

    Trains on ``train_cell`` and validates on ``val_cell`` using the
    chromosome-disjoint split, with early stopping on the validation AUC.
    If training with the full search space raises ``XGBoostError`` (e.g. a
    parameter unsupported by the installed version), it is retried once with
    the safe parameter set.

    Args:
        trial: Optuna trial supplying the hyperparameters.
        train_cell: Value of ``cell_id`` to train on.
        val_cell: Value of ``cell_id`` to validate on.

    Returns:
        Mean per-chromosome AUC on the validation rows, predicted with the
        best early-stopped iteration; falls back to the AUC over all
        validation rows when the per-chromosome mean is NaN.
    """
    tr, va, _ = split_loco_by_chromosomes(trial, train_cell, val_cell)
    X_tr, y_tr = tr[feature_cols], tr[TARGET_B]
    X_va, y_va = va[feature_cols], va[TARGET_B]

    names = list(feature_cols)
    dtr = xgb.DMatrix(X_tr, label=y_tr, feature_names=names)
    dva = xgb.DMatrix(X_va, label=y_va, feature_names=names)

    def _fit(params):
        # Generous round cap; early stopping decides the effective model size.
        return xgb.train(params=params,
                         dtrain=dtr,
                         num_boost_round=8000,
                         evals=[(dva, "val")],
                         early_stopping_rounds=200,
                         verbose_eval=False)

    try:
        bst = _fit(_xgb_params_full(trial))
    except XGBoostError as e:
        # Parameter not supported by this version -> downgrade to the safe set.
        print(f"[XGB] Fallback to safe params due to: {str(e).splitlines()[0]}")
        bst = _fit(_xgb_params_safe(trial))

    p = _xgb_predict_best(bst, dva)

    auc_chr = mean_per_chr_auc(va, y_va, p)
    if np.isnan(auc_chr):
        # No chromosome qualified for a per-chromosome AUC; score all rows.
        return roc_auc_score(y_va, p)
    return auc_chr



# ========== run studies for both LOCO directions ==========
def run_and_save(name, objective_fn, n_trials, train_cell, val_cell):
    """Run one maximizing Optuna study and persist its results.

    Args:
        name: Study label; combined with the two cell ids to form the tag.
        objective_fn: Callable taking ``(trial, train_cell=..., val_cell=...)``.
        n_trials: Number of trials to run.
        train_cell: Cell id forwarded to the objective as the training side.
        val_cell: Cell id forwarded to the objective as the validation side.

    Returns:
        The finished ``optuna`` study. As side effects, writes
        ``trials_log.csv`` and ``best_params.json`` into
        ``SAVE_ROOT/<tag>`` and prints progress to stdout.
    """
    tag = f"{name}_{train_cell}_to_{val_cell}"
    print(f"\n===== 🔎 Start study: {tag} | trials={n_trials} =====")

    # Seeded TPE sampler, in-memory study.
    tpe = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction="maximize", sampler=tpe)
    bound_objective = partial(objective_fn, train_cell=train_cell, val_cell=val_cell)
    study.optimize(bound_objective, n_trials=n_trials, show_progress_bar=True)

    print(f"✅ {tag} Best value: {study.best_value}")
    print("🏆 Best params:")
    for key, value in study.best_params.items():
        print(f"  {key}: {value}")

    out_dir = os.path.join(SAVE_ROOT, tag)
    os.makedirs(out_dir, exist_ok=True)

    trials_csv = os.path.join(out_dir, "trials_log.csv")
    study.trials_dataframe().to_csv(trials_csv, index=False)

    params_json = os.path.join(out_dir, "best_params.json")
    with open(params_json, "w") as f:
        json.dump(study.best_params, f, indent=4)

    print(f"💾 Saved to {out_dir}")
    return study

# (train_cell, val_cell) pairs: run the study in both directions.
PAIRS = [("X1","X2"), ("X2","X1")]

# One XGBoost binary-classification study per direction.
for a,b in PAIRS:
    run_and_save("xgb_binary",   objective_xgb_bin,  N_TRIALS_XGB_BIN,  a, b)

[I 2025-10-23 07:06:34,055] A new study created in memory with name: no-name-4b004bea-c68c-4a26-8154-1e8955fce85e


🧬 chromosomes: ['chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr2', 'chr20', 'chr21', 'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9']
Feature count: 2892 | Total rows: 32,568
xgboost version: 3.0.5

===== 🔎 Start study: xgb_binary_X1_to_X2 | trials=300 =====


  0%|          | 0/300 [00:00<?, ?it/s]

['chr9', 'chr13', 'chr10', 'chr18']


  0%|          | 0/300 [00:09<?, ?it/s]


[W 2025-10-23 07:06:43,776] Trial 0 failed with parameters: {'booster': 'dart', 'eta': 0.08960785365368121, 'max_depth': 25, 'min_child_weight': 5.524540572830659, 'gamma': 0.7799726016810132, 'subsample': 0.5290418060840998, 'colsample_bytree': 0.9330880728874675, 'colsample_bylevel': 0.8005575058716043, 'colsample_bynode': 0.8540362888980227, 'lambda': 0.10292247147901223, 'alpha': 1.9398197043239886, 'max_delta_step': 4.162213204002109, 'max_bin': 192, 'grow_policy': 'lossguide', 'rate_drop': 0.09127267288786131, 'skip_drop': 0.15742692948967135} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/anaconda3/envs/ml4g_project1/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/7g/j5s0yzcj34l3v043s7znkplc0000gn/T/ipykernel_26769/2087270118.py", line 156, in objective_xgb_bin
    bst = xgb.train(params=params,
          

KeyboardInterrupt: 