# Feature engineering

In [5]:
import pandas as pd
import numpy as np

def quantile_normalize_across(df_list, meta_cols):
    """
    Quantile-normalize the feature columns of several DataFrames jointly.

    For every feature column, each DataFrame's values are sorted, the sorted
    vectors are averaged position-by-position across the DataFrames, and each
    original value is then replaced by the averaged value found at its own
    rank. After the call a given feature holds the same set of values in
    every returned DataFrame.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Tables to normalize. All must have the same number of rows and must
        contain every feature column of ``df_list[0]`` and every column named
        in ``meta_cols``.
    meta_cols : list of str
        Columns copied through unchanged. Every other column of
        ``df_list[0]`` is treated as a numeric feature.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input, in input order, with a fresh 0..n-1 index, the
        ``meta_cols`` first and the normalized (float) features after them.

    Raises
    ------
    ValueError
        If ``df_list`` is empty or the DataFrames differ in row count.

    Notes
    -----
    Ranks come from a double ``argsort``, so tied input values receive
    distinct consecutive ranks (and therefore possibly different output
    values) instead of one shared value.
    """

    if len(df_list) == 0:
        raise ValueError("❌ df_list must contain at least one DataFrame.")

    # Feature columns are defined by the first DataFrame.
    feature_names = [c for c in df_list[0].columns if c not in meta_cols]
    n_features = len(feature_names)

    # All tables must have the same length so that rank k means the same
    # quantile in each of them.
    n_rows = [len(df) for df in df_list]
    if len(set(n_rows)) != 1:
        raise ValueError(f"❌ All DataFrames must have the same number of rows. Got: {n_rows}")

    n = n_rows[0]
    print(f"🧩 Performing quantile normalization on {len(df_list)} datasets, each with {n} rows and {n_features} features")

    # Cast to float up front. With integer feature columns the output buffer
    # would otherwise be integer-typed and truncate the averaged values.
    features = [df[feature_names].to_numpy(dtype=float) for df in df_list]

    # mean_sorted_all[k, j] = mean over datasets of the k-th smallest value
    # of feature j.
    mean_sorted_all = np.zeros((n, n_features))
    for j in range(n_features):
        # One row per dataset, holding that dataset's values of feature j.
        vals = np.vstack([f[:, j] for f in features])
        # Sort within each dataset, then average across datasets per rank.
        sorted_each = np.sort(vals, axis=1)
        mean_sorted_all[:, j] = np.mean(sorted_each, axis=0)

    def apply_quantile_norm(X, mean_sorted_all):
        """Replace every value of X by the rank-averaged value at its rank."""
        X_norm = np.zeros(X.shape, dtype=float)
        for j in range(X.shape[1]):
            # argsort of argsort yields each element's 0-based rank.
            ranks = np.argsort(np.argsort(X[:, j]))
            X_norm[:, j] = mean_sorted_all[ranks, j]
        return X_norm

    normalized = [apply_quantile_norm(X, mean_sorted_all) for X in features]

    # Reassemble: metadata columns (index reset so rows line up positionally)
    # followed by the normalized features.
    dfs_qn = []
    for df, norm_values in zip(df_list, normalized):
        df_qn = pd.concat([
            df[meta_cols].reset_index(drop=True),
            pd.DataFrame(norm_values, columns=feature_names)
        ], axis=1)
        dfs_qn.append(df_qn)

    return dfs_qn


In [6]:
# Columns carried through untouched; every other column is normalized as a feature.
meta_cols = [
    "gene_name", "chr", "gene_start", "gene_end",
    "TSS_start", "TSS_end", "strand", "gex", "gex_rank",
]

# Load the two pruned feature tables (tab-separated), X1 first, then X2.
X1, X2 = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "../preprocessed_data/CAGE-merged/4. features selection/X1_all_rank_features_pruned.tsv",
        "../preprocessed_data/CAGE-merged/4. features selection/X2_all_rank_features_pruned.tsv",
    )
)

# Normalize both tables jointly.
X1_qn, X2_qn = quantile_normalize_across([X1, X2], meta_cols)

# Save the normalized tables next to their inputs.
X1_qn.to_csv(
    "../preprocessed_data/CAGE-merged/4. features selection/X1_all_rank_features_pruned_qn.tsv",
    sep="\t",
    index=False,
)
X2_qn.to_csv(
    "../preprocessed_data/CAGE-merged/4. features selection/X2_all_rank_features_pruned_qn.tsv",
    sep="\t",
    index=False,
)


🧩 Performing quantile normalization on 2 datasets, each with 16284 rows and 277 features


In [9]:
# Show the normalized DNase_gene_z_std column of X2 in ascending order.
# Presumably a sanity check that X1 and X2 share the same value distribution after normalization.
X2_qn["DNase_gene_z_std"].sort_values()

8043     0.000000
3951     0.000000
5433     0.000000
2617     0.000000
4531     0.000000
           ...   
15661    5.572742
13669    5.928350
9566     6.239169
3391     6.800296
8552     9.069464
Name: DNase_gene_z_std, Length: 16284, dtype: float64

In [10]:
# Show the normalized DNase_gene_z_std column of X1 in ascending order.
# Presumably a sanity check that its sorted values match those of X2_qn.
X1_qn["DNase_gene_z_std"].sort_values()


7208     0.000000
4569     0.000000
1220     0.000000
14020    0.000000
14015    0.000000
           ...   
6926     5.572742
14936    5.928350
5802     6.239169
15046    6.800296
998      9.069464
Name: DNase_gene_z_std, Length: 16284, dtype: float64

In [None]:
import os
import json
import random
import pickle   # ✅ 新增
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr
from datetime import datetime

# ============================================================
#                  GLOBAL REPRODUCIBILITY
# ============================================================
# Seed Python's and NumPy's global RNGs so repeated runs draw the same numbers.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here probably does not affect the already-running kernel — confirm.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  EXPERIMENT CONFIGURATION
# ============================================================
# Output directory for this run and the two input feature tables (TSV).
EXPERIMENT_DIR = "../results/lgbm/model_setting/baseline"
TRAIN_PATH = "../preprocessed_data/CAGE-merged/4. features selection/X1_all_rank_features_pruned.tsv"
VAL_PATH   = "../preprocessed_data/CAGE-merged/4. features selection/X2_all_rank_features_pruned.tsv"

# Non-feature columns; every other column of the training table is a model input.
META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
# Column the model is trained to predict.
TARGET_COL = "gex_rank"

os.makedirs(EXPERIMENT_DIR, exist_ok=True)
LOG_PATH = os.path.join(EXPERIMENT_DIR, "log.txt")

# Start from an empty log: opening in "w" mode truncates any log.txt left by a previous run.
with open(LOG_PATH, "w") as f:
    f.write("")  # or write a header here, e.g. "==== New Experiment ====\n"

def log(msg):
    """Print ``msg`` and append it, followed by a newline, to the file at ``LOG_PATH``.

    The file is opened with an explicit UTF-8 encoding: the messages logged in
    this notebook contain emoji and Greek letters, which raise
    ``UnicodeEncodeError`` on platforms whose default text encoding cannot
    represent them.
    """
    print(msg)
    with open(LOG_PATH, "a", encoding="utf-8") as f:
        f.write(f"{msg}\n")


# Record when the run started and where its artifacts are written.
log(f"🚀 Experiment started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
log(f"Experiment directory: {EXPERIMENT_DIR}")

# ============================================================
#                  DATA LOADING
# ============================================================
# Both tables are tab-separated; the training table defines the feature set.
df_train_full = pd.read_csv(TRAIN_PATH, sep="\t")
df_val_full   = pd.read_csv(VAL_PATH, sep="\t")

# Every column that is not metadata is used as a feature.
feature_cols = [c for c in df_train_full.columns if c not in META_COLS]
log(f"Feature count: {len(feature_cols)}")

# ============================================================
#                  K-FOLD DEFINITION
# ============================================================
# chr2 .. chr22, dealt round-robin into 5 folds (fold i takes every 5th
# chromosome starting at position i), so whole chromosomes are held out together.
chromosomes = [f"chr{i}" for i in range(2, 23)]
folds = [chromosomes[i::5] for i in range(5)]

log("🧩 Chromosome folds:")
for i, fset in enumerate(folds):
    log(f"Fold {i+1}: {fset}")

# ============================================================
#                  LIGHTGBM PARAMETERS
# ============================================================
# Passed unchanged to lgb.train for every fold.
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 16,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "seed": SEED,
    "deterministic": True,
    "force_row_wise": True,
}

# ============================================================
#                  SAVE CONFIG
# ============================================================
# Snapshot of the inputs, seed, hyper-parameters and fold layout, written
# next to the results as config.json.
config = {
    "train_path": TRAIN_PATH,
    "val_path": VAL_PATH,
    "seed": SEED,
    "params": params,
    "folds": folds,
    "target_col": TARGET_COL,
}
with open(os.path.join(EXPERIMENT_DIR, "config.json"), "w") as f:
    json.dump(config, f, indent=4)

# ============================================================
#                  TRAINING
# ============================================================
# results   : one summary dict per fold (overall and per-chromosome Spearman).
# preds_all : one prediction frame per fold, concatenated after the loop.
results = []
preds_all = []

for fold_idx, val_chrs in enumerate(folds):
    log(f"\n🚀 Fold {fold_idx+1}/{len(folds)} | Val chromosomes: {val_chrs}")
    fold_dir = os.path.join(EXPERIMENT_DIR, f"fold_{fold_idx+1}")
    os.makedirs(fold_dir, exist_ok=True)

    # Split: train on the training table restricted to the other chromosomes,
    # validate on the validation table restricted to this fold's chromosomes.
    train_chrs = [c for c in chromosomes if c not in val_chrs]
    df_train = df_train_full[df_train_full["chr"].isin(train_chrs)].copy()
    df_val   = df_val_full[df_val_full["chr"].isin(val_chrs)].copy()

    # Keep each row's index in df_val_full as "orig_idx" so the per-fold
    # predictions can be put back into the original row order later.
    df_val = df_val.reset_index().rename(columns={"index": "orig_idx"})

    X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
    X_val, y_val     = df_val[feature_cols], df_val[TARGET_COL]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # Up to 2000 boosting rounds, with early stopping after 100 rounds and an
    # evaluation log line every 200 rounds.
    # Note: the rows used for early stopping are the same rows scored below.
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ],
    )

    # Save model (.pkl)
    model_path = os.path.join(fold_dir, "model.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    log(f"💾 Model saved: {model_path}")

    # Predict with the best iteration found during training.
    df_val["predicted_gex_rank"] = model.predict(X_val, num_iteration=model.best_iteration)
    df_val["fold_id"] = fold_idx + 1  # optional: which fold produced the prediction

    preds_all.append(df_val[["orig_idx", "gene_name", "predicted_gex_rank", "fold_id"]])

    # Evaluate: Spearman over the whole fold, then per validation chromosome.
    overall_spearman = spearmanr(df_val[TARGET_COL], df_val["predicted_gex_rank"])[0]
    log(f"📈 Fold {fold_idx+1} Overall Spearman ρ = {overall_spearman:.4f}")
    log("📊 Per-Chromosome Spearman:")
    chr_corrs = []
    for chrom, subdf in df_val.groupby("chr"):
        # Chromosomes with fewer than two rows are skipped.
        if len(subdf) < 2:
            continue
        rho = spearmanr(subdf[TARGET_COL], subdf["predicted_gex_rank"])[0]
        chr_corrs.append({"chr": chrom, "spearman": rho})
        log(f"   {chrom:<6s}: ρ = {rho:.4f}")

    results.append({
        "fold": fold_idx + 1,
        "val_chr": val_chrs,
        "overall_spearman": overall_spearman,
        "per_chr": chr_corrs
    })

# ============================================================
#                  SAVE PREDICTIONS & SUMMARY
# ============================================================

# ============================================================
# 🔁 Merge the per-fold predictions and restore the original row order
# ============================================================
df_preds_all = pd.concat(preds_all, ignore_index=True)
df_preds_all = df_preds_all.sort_values("orig_idx").reset_index(drop=True)

# Alignment check: every row of df_val_full must have exactly one prediction,
# in the same order. This fails if df_val_full holds rows on chromosomes that
# are not part of any fold.
assert len(df_preds_all) == len(df_val_full)
assert (df_preds_all["gene_name"].values == df_val_full["gene_name"].values).all(), \
    "❌ Prediction order mismatch with df_val_full!"

# The saved file contains only gene_name + predicted_gex_rank.
df_preds_all[["gene_name", "predicted_gex_rank"]].to_csv(
    os.path.join(EXPERIMENT_DIR, "predict_val.tsv"), sep="\t", index=False
)
log("💾 Final predictions (aligned) saved (predict_val.tsv)")

# Global Spearman: ground truth from df_val_full vs. the aligned predictions.
# TARGET_COL is used here so this stays consistent with the per-fold scores.
overall_val_spearman = spearmanr(
    df_val_full[TARGET_COL],
    df_preds_all["predicted_gex_rank"]
)[0]
log(f"🌍 Overall validation Spearman (aligned) = {overall_val_spearman:.4f}")

# One row per fold.
summary_fold = pd.DataFrame([
    {"fold": r["fold"], "val_chr": ",".join(r["val_chr"]), "overall_spearman": r["overall_spearman"]}
    for r in results
])

# One row per (fold, chromosome).
chr_rows = [
    {"fold": r["fold"], "chr": c["chr"], "spearman": c["spearman"]}
    for r in results
    for c in r["per_chr"]
]
summary_chr = pd.DataFrame(chr_rows)

# Mean Spearman per chromosome, built as a new frame instead of an in-place rename.
chr_mean = (
    summary_chr.groupby("chr", as_index=False)["spearman"].mean()
    .rename(columns={"spearman": "mean_spearman"})
)

# summary_output = summary_fold.merge(chr_mean, how="cross")
# summary_output.to_csv(os.path.join(EXPERIMENT_DIR, "summary.tsv"), sep="\t", index=False)

# log("💾 Summary saved (summary.tsv)")
log(f"Mean Spearman (across folds): {summary_fold['overall_spearman'].mean():.4f}")
log(f"Overall mean of per-chromosome means: {chr_mean['mean_spearman'].mean():.4f}")

log(f"\n✅ Experiment finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


🔒 Global seed set to 42
🚀 Experiment started at 2025-10-14 00:18:20
Experiment directory: ../results/lgbm/model_setting/v1
Feature count: 277
🧩 Chromosome folds:
Fold 1: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Fold 2: ['chr3', 'chr8', 'chr13', 'chr18']
Fold 3: ['chr4', 'chr9', 'chr14', 'chr19']
Fold 4: ['chr5', 'chr10', 'chr15', 'chr20']
Fold 5: ['chr6', 'chr11', 'chr16', 'chr21']

🚀 Fold 1/5 | Val chromosomes: ['chr2', 'chr7', 'chr12', 'chr17', 'chr22']
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.136981	val's rmse: 0.173474
[400]	train's rmse: 0.130169	val's rmse: 0.172215
[600]	train's rmse: 0.12545	val's rmse: 0.171884
Early stopping, best iteration is:
[692]	train's rmse: 0.123533	val's rmse: 0.171791
💾 Model saved: ../results/lgbm/model_setting/v1/fold_1/model.pkl
📈 Fold 1 Overall Spearman ρ = 0.7532
📊 Per-Chromosome Spearman:
   chr12 : ρ = 0.7743
   chr17 : ρ = 0.7411
   chr2  : ρ = 0.7480
   chr22 : ρ = 0.7287
   chr7  : ρ = 0.7594

🚀

In [None]:
import os
import json
import random
import pickle
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import spearmanr
from datetime import datetime

# ============================================================
#                  GLOBAL CONFIG
# ============================================================
# Seed Python's and NumPy's global RNGs so repeated runs draw the same numbers.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here probably does not affect the already-running kernel — confirm.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
print(f"🔒 Global seed set to {SEED}")

# ============================================================
#                  PATHS
# ============================================================
# Output directory for this experiment and the two input feature tables (TSV).
EXPERIMENT_DIR = "../results/lgbm/leave_one_chr"
TRAIN_PATH = "../preprocessed_data/CAGE-merged/4. features selection/X1_all_rank_features_pruned.tsv"
VAL_PATH   = "../preprocessed_data/CAGE-merged/4. features selection/X2_all_rank_features_pruned.tsv"

# Non-feature columns; every other column of the training table is a model input.
META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
# Column the model is trained to predict.
TARGET_COL = "gex_rank"

os.makedirs(EXPERIMENT_DIR, exist_ok=True)
LOG_PATH = os.path.join(EXPERIMENT_DIR, "log.txt")
# Opening in "w" mode discards any previous log.txt; the file then starts with a header line.
with open(LOG_PATH, "w") as f:
    f.write("==== Leave-One-Chromosome-Out Experiment ====\n")

def log(msg):
    """Print ``msg`` and append it, followed by a newline, to the file at ``LOG_PATH``.

    The file is opened with an explicit UTF-8 encoding: the messages logged in
    this notebook contain emoji and Greek letters, which raise
    ``UnicodeEncodeError`` on platforms whose default text encoding cannot
    represent them.
    """
    print(msg)
    with open(LOG_PATH, "a", encoding="utf-8") as f:
        f.write(f"{msg}\n")

# ============================================================
#                  DATA LOADING
# ============================================================
# Both tables are tab-separated; the training table defines the feature set.
df_train_full = pd.read_csv(TRAIN_PATH, sep="\t")
df_val_full   = pd.read_csv(VAL_PATH, sep="\t")

# Every column that is not metadata is used as a feature.
feature_cols = [c for c in df_train_full.columns if c not in META_COLS]
# chr2 .. chr22; each one is held out as the test chromosome once.
chromosomes = [f"chr{i}" for i in range(2, 23)]
log(f"📚 Loaded data with {len(chromosomes)} chromosomes.")
log(f"Feature count: {len(feature_cols)}")

# ============================================================
#                  LIGHTGBM PARAMS
# ============================================================
# Passed unchanged to lgb.train for every inner fold.
params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 16,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "seed": SEED,
    "deterministic": True,
    "force_row_wise": True,
}

# ============================================================
#                  EXPERIMENT LOOP
# ============================================================
# overall_results : one {"test_chr", "spearman"} dict per held-out chromosome.
# preds_all       : one prediction frame per held-out chromosome.
overall_results = []
preds_all = []

for test_chr in chromosomes:
    log(f"\n🧪 Outer Loop → Test chromosome: {test_chr}")
    outer_dir = os.path.join(EXPERIMENT_DIR, f"test_{test_chr}")
    os.makedirs(outer_dir, exist_ok=True)

    # The remaining 20 chromosomes are dealt round-robin into 5 inner CV folds.
    inner_chrs = [c for c in chromosomes if c != test_chr]
    folds = [inner_chrs[i::5] for i in range(5)]
    log(f"Inner folds (5-fold over 20 chr): {folds}")

    # Test rows come from the validation table (df_val_full); inner
    # train/validation rows below both come from the training table.
    test_df = df_val_full[df_val_full["chr"] == test_chr].copy()
    test_X = test_df[feature_cols]
    fold_preds = []

    for fold_idx, val_chrs in enumerate(folds):
        log(f"\n🚀 Fold {fold_idx+1}/5 | Val chromosomes: {val_chrs}")
        fold_dir = os.path.join(outer_dir, f"fold_{fold_idx+1}")
        os.makedirs(fold_dir, exist_ok=True)

        # Inner split by chromosome; the test chromosome is in neither part.
        train_chrs = [c for c in inner_chrs if c not in val_chrs]
        df_train = df_train_full[df_train_full["chr"].isin(train_chrs)]
        df_val = df_train_full[df_train_full["chr"].isin(val_chrs)]

        X_train, y_train = df_train[feature_cols], df_train[TARGET_COL]
        X_val, y_val = df_val[feature_cols], df_val[TARGET_COL]

        dtrain = lgb.Dataset(X_train, label=y_train)
        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        # Up to 2000 boosting rounds, with early stopping after 100 rounds
        # and an evaluation log line every 200 rounds.
        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dtrain, dval],
            valid_names=["train", "val"],
            num_boost_round=2000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=200)
            ],
        )

        # One pickled model per (test chromosome, inner fold).
        model_path = os.path.join(fold_dir, "model.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(model, f)
        log(f"💾 Model saved: {model_path}")

        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
        val_spearman = spearmanr(y_val, val_pred)[0]
        log(f"📈 Fold {fold_idx+1} inner val Spearman = {val_spearman:.4f}")

        # Predict the held-out test chromosome with this fold's model.
        test_pred = model.predict(test_X, num_iteration=model.best_iteration)
        fold_preds.append(test_pred)

    # =======================================================
    # Average the five inner-fold models' predictions for test_chr
    # =======================================================
    mean_pred = np.mean(np.vstack(fold_preds), axis=0)
    test_df["predicted_gex_rank"] = mean_pred
    # NOTE(review): this column is not part of the selection appended on the next line.
    test_df["test_chr"] = test_chr
    preds_all.append(test_df[["gene_name", "chr", "predicted_gex_rank"]])

    # Spearman between the true gex_rank and the averaged prediction on test_chr.
    test_spearman = spearmanr(test_df["gex_rank"], test_df["predicted_gex_rank"])[0]
    overall_results.append({"test_chr": test_chr, "spearman": test_spearman})
    log(f"🌍 Test chromosome {test_chr}: Spearman ρ = {test_spearman:.4f}")

# ============================================================
#                  SUMMARY
# ============================================================
# All test-chromosome predictions, in the order the chromosomes were held out.
df_preds_all = pd.concat(preds_all, ignore_index=True)
df_preds_all.to_csv(os.path.join(EXPERIMENT_DIR, "predict_all.tsv"), sep="\t", index=False)

# One row per held-out chromosome with its test Spearman.
df_summary = pd.DataFrame(overall_results)
df_summary.to_csv(os.path.join(EXPERIMENT_DIR, "summary.tsv"), sep="\t", index=False)

mean_rho = df_summary["spearman"].mean()
# Report the number of chromosomes actually evaluated instead of a hard-coded 21,
# so the message stays correct if the chromosome list changes.
log(f"\n✅ Overall mean Spearman across {len(df_summary)} test chromosomes: {mean_rho:.4f}")
log(f"Experiment completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


🔒 Global seed set to 42
📚 Loaded data with 21 chromosomes.
Feature count: 277

🧪 Outer Loop → Test chromosome: chr2
Inner folds (5-fold over 20 chr): [['chr3', 'chr8', 'chr13', 'chr18'], ['chr4', 'chr9', 'chr14', 'chr19'], ['chr5', 'chr10', 'chr15', 'chr20'], ['chr6', 'chr11', 'chr16', 'chr21'], ['chr7', 'chr12', 'chr17', 'chr22']]

🚀 Fold 1/5 | Val chromosomes: ['chr3', 'chr8', 'chr13', 'chr18']
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.123313	val's rmse: 0.145375
Early stopping, best iteration is:
[156]	train's rmse: 0.127639	val's rmse: 0.145251
💾 Model saved: ../results/lgbm/leave_one_chr/test_chr2/fold_1/model.pkl
📈 Fold 1 inner val Spearman = 0.7779

🚀 Fold 2/5 | Val chromosomes: ['chr4', 'chr9', 'chr14', 'chr19']
Training until validation scores don't improve for 100 rounds
[200]	train's rmse: 0.121314	val's rmse: 0.14863
[400]	train's rmse: 0.103246	val's rmse: 0.148088
Early stopping, best iteration is:
[428]	train's rmse: 0.101137	val