# Features engineer

In [4]:
import os
import numpy as np
import pandas as pd
import itertools

# ============================================================
#                FEATURE ENGINEERING HELPERS
# ============================================================

def safe_div(a, b):
    """安全除法，防止除以 0"""
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(np.abs(b) > 1e-8, a / b, 0)


def check_new_columns(df, prev_cols, context=""):
    """
    ✅ 只檢查「新增加的欄位」中是否有 NaN 或 inf。
    並列出對應的 gene_name（前幾個例子）。
    """
    new_cols = [c for c in df.columns if c not in prev_cols]
    if not new_cols:
        return df  # 沒新增欄位就略過
    for col in new_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    numeric_cols = [c for c in new_cols if np.issubdtype(df[c].dtype, np.number)]

    # 轉 inf → NaN
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

    nan_ratio = df[numeric_cols].isna().mean()
    bad = nan_ratio[nan_ratio > 0].sort_values(ascending=False)

    if not bad.empty:
        print(f"⚠️ [{context}] {len(bad)} new features contain NaN/inf:")
        # 列出每個欄位 NaN 的 gene_name（只顯示前幾個避免太多）
        for col in bad.index[:5]:  # 只顯示前5個問題最多的欄位
            nan_genes = df.loc[df[col].isna(), "gene_name"].head(5).tolist() \
                        if "gene_name" in df.columns else []
            if nan_genes:
                print(f"   ↳ {col}: {len(df[col].isna())} NaN — e.g. {nan_genes}")
            else:
                print(f"   ↳ {col}: {len(df[col].isna())} NaN (no gene_name column found)")
    else:
        print(f"✅ [{context}] All {len(numeric_cols)} new features valid.")

    return df


# ============================================================
#                 FEATURE RANK TRANSFORMATION
# ============================================================

from scipy.stats import rankdata
import numpy as np
from scipy.stats import rankdata

from scipy.stats import rankdata
import numpy as np

def rank_transform_features(df):
    """
    Rank-transform feature values, optionally per chromosome.
    If by_chr=True → rank within each chromosome.
    If by_chr=False → rank across the entire dataset.

    根據 biological direction (activating/repressive) 決定正向或反向排名。
    """
    print(f"🔢 Performing rank transformation...")

    exclude_cols = ["gene_name", "chr", "gene_start", "gene_end", "TSS_start", "TSS_end", "strand","gex", "gex_rank"]
    numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in exclude_cols]
    
    activating_marks = ["DNase", "H3K27ac", "H3K4me3", "H3K36me3", "H3K4me1"]
    repressive_marks = ["H3K9me3", "H3K27me3"]

    df_ranked = df.copy()

    def rank_with_direction(vals, col):
        """Apply biological-direction-aware ranking to a numeric array."""
        if np.all(np.isnan(vals)):
            return np.full_like(vals, np.nan, dtype=float)
        if any(mark in col for mark in repressive_marks) or "repress" in col.lower():
            ranks = rankdata(-vals, method="average") / len(vals)
        else:
            ranks = rankdata(vals, method="average") / len(vals)
        return ranks

    # --- chromosome-based ranking ---
    for chrom, subdf in df.groupby("chr", sort=False):
        idx = subdf.index
        for col in numeric_cols:
            vals = subdf[col].values
            df_ranked.loc[idx, f"{col}_chr_rank"] = rank_with_direction(vals, col)
    # --- global ranking ---
    for col in numeric_cols:
        vals = df[col].values
        df_ranked[f"{col}_global_rank"] = rank_with_direction(vals, col)

    return df_ranked


def add_gene_structure(df):
    """0️⃣ gene structure features"""
    prev_cols = df.columns.copy()

    if all(col in df.columns for col in ["gene_start", "gene_end"]):
        df["gene_length"] = df["gene_end"] - df["gene_start"]
    else:
        df["gene_length"] = np.nan

    if all(col in df.columns for col in ["TSS_start", "TSS_end"]):
        df["tss_length"] = df["TSS_end"] - df["TSS_start"]
    else:
        df["tss_length"] = np.nan

    return check_new_columns(df, prev_cols, "gene_structure")


def add_promoter_gene_ratio(df, marks):
    """1️⃣ promoter vs gene body ratio"""
    prev_cols = df.columns.copy()

    for mark in marks:
        gene_mean, tss_mean = f"{mark}_gene_signal_mean", f"{mark}_tss_signal_mean"
        gene_std, tss_std   = f"{mark}_gene_signal_std", f"{mark}_tss_signal_std"
        if gene_mean in df and tss_mean in df:
            df[f"{mark}_ratio_mean"] = safe_div(df[tss_mean], df[gene_mean])
        if gene_std in df and tss_std in df:
            df[f"{mark}_ratio_std"] = safe_div(df[tss_std], df[gene_std])

    return check_new_columns(df, prev_cols, "promoter_gene_ratio")

def add_activation_balance(df):
    """2️⃣ activation–repression balance"""
    prev_cols = df.columns.copy()

    if all(c in df for c in ["H3K27ac_tss_signal_mean", "H3K27me3_tss_signal_mean"]):
        df["balance_H3K27"] = df["H3K27ac_tss_signal_mean"] - df["H3K27me3_tss_signal_mean"]
    if all(c in df for c in ["H3K4me3_tss_signal_mean", "H3K9me3_tss_signal_mean"]):
        df["balance_H3K4"] = df["H3K4me3_tss_signal_mean"] - df["H3K9me3_tss_signal_mean"]

    return check_new_columns(df, prev_cols, "activation_balance")


def add_promoter_entropy(df, activating_marks, repressive_marks):
    """3️⃣ promoter entropy & variability"""
    prev_cols = df.columns.copy()

    tss_cols = [f"{m}_tss_signal_mean" for m in activating_marks + repressive_marks if f"{m}_tss_signal_mean" in df]
    if not tss_cols:
        return df

    df["promoter_variability"] = df[tss_cols].std(axis=1)
    norm_vals = df[tss_cols].div(df[tss_cols].sum(axis=1), axis=0)
    df["promoter_entropy"] = -np.nansum(
        np.nan_to_num(norm_vals) * np.log(np.nan_to_num(norm_vals) + 1e-8), axis=1
    )

    return check_new_columns(df, prev_cols, "promoter_entropy")


def add_chromatin_indices(df, activating_marks, repressive_marks):
    """4️⃣ openness & repression index"""
    prev_cols = df.columns.copy()

    act_cols = [f"{m}_tss_signal_mean" for m in activating_marks if f"{m}_tss_signal_mean" in df]
    rep_cols = [f"{m}_tss_signal_mean" for m in repressive_marks if f"{m}_tss_signal_mean" in df]
    if act_cols:
        df["openness_index"] = df[act_cols].mean(axis=1)
    if rep_cols:
        df["repression_index"] = df[rep_cols].mean(axis=1)

    return check_new_columns(df, prev_cols, "chromatin_indices")



def add_strand_features(df):
    """5️⃣ strand-aware features"""
    prev_cols = df.columns.copy()

    if "strand" in df.columns:
        df["strand_is_plus"] = (df["strand"] == "+").astype(int)
        df["strand_is_minus"] = (df["strand"] == "-").astype(int)

    return check_new_columns(df, prev_cols, "strand_features")

import itertools
import numpy as np

def add_cross_mark_interactions(df, marks):
    """
    6️⃣ pairwise cross-mark interactions (enhanced)
    - Multiplicative (×)
    - Ratio (/)
    - Difference (-)
    - Absolute difference (absdiff)
    - Promoter–gene cross interactions

    Example output:
        H3K27ac_H3K4me3_mul
        H3K27ac_H3K4me3_ratio
        H3K27ac_H3K4me3_diff
        H3K27ac_H3K4me3_absdiff
        H3K27ac_tss_H3K36me3_gene_cross
    """
    prev_cols = df.columns.copy()

    def safe_div(a, b):
        """Safe division."""
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.where(np.abs(b) > 1e-8, a / b, 0)

    for m1, m2 in itertools.combinations(marks, 2):
        # --- 取得 TSS 欄位 ---
        c1, c2 = f"{m1}_tss_signal_mean", f"{m2}_tss_signal_mean"
        if c1 in df and c2 in df:
            a, b = df[c1].astype(float), df[c2].astype(float)

            df[f"{m1}_{m2}_mul"]      = a * b
            df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
            df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
            df[f"{m1}_{m2}_diff"]     = a - b
            df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)

    # --- Promoter–Gene Cross Interactions ---
    for m1, m2 in itertools.combinations(marks, 2):
        tss_col_1  = f"{m1}_tss_signal_mean"
        gene_col_2 = f"{m2}_gene_signal_mean"
        tss_col_2  = f"{m2}_tss_signal_mean"
        gene_col_1 = f"{m1}_gene_signal_mean"

        if tss_col_1 in df and gene_col_2 in df:
            df[f"{m1}_tss_{m2}_gene_cross"] = df[tss_col_1].astype(float) * df[gene_col_2].astype(float)
        if tss_col_2 in df and gene_col_1 in df:
            df[f"{m2}_tss_{m1}_gene_cross"] = df[tss_col_2].astype(float) * df[gene_col_1].astype(float)
    
    for m1, m2 in itertools.combinations(marks, 2):
        # --- 取得 TSS 欄位 ---
        c1, c2 = f"{m1}_tss_signal_std", f"{m2}_tss_signal_std"
        if c1 in df and c2 in df:
            a, b = df[c1].astype(float), df[c2].astype(float)

            df[f"{m1}_{m2}_mul"]      = a * b
            df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
            df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
            df[f"{m1}_{m2}_diff"]     = a - b
            df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)

    # --- Promoter–Gene Cross Interactions ---
    for m1, m2 in itertools.combinations(marks, 2):
        tss_col_1  = f"{m1}_tss_signal_std"
        gene_col_2 = f"{m2}_gene_signal_std"
        tss_col_2  = f"{m2}_tss_signal_std"
        gene_col_1 = f"{m1}_gene_signal_std"

        if tss_col_1 in df and gene_col_2 in df:
            df[f"{m1}_tss_{m2}_gene_std_cross"] = df[tss_col_1].astype(float) * df[gene_col_2].astype(float)
        if tss_col_2 in df and gene_col_1 in df:
            df[f"{m2}_tss_{m1}_gene_std_cross"] = df[tss_col_2].astype(float) * df[gene_col_1].astype(float)

    return check_new_columns(df, prev_cols, "cross_mark_interactions_v2")



def add_activation_repression_indices(df):
    """7️⃣ summarized activation/repression indices"""
    prev_cols = df.columns.copy()

    if all(c in df for c in ["H3K27ac_tss_signal_mean", "H3K27me3_tss_signal_mean"]):
        df["activation_balance"] = df["H3K27ac_tss_signal_mean"] - df["H3K27me3_tss_signal_mean"]
    if all(c in df for c in ["H3K4me3_tss_signal_mean", "H3K9me3_tss_signal_mean"]):
        df["promoter_activity"] = df["H3K4me3_tss_signal_mean"] - df["H3K9me3_tss_signal_mean"]
    if all(c in df for c in ["H3K9me3_tss_signal_mean", "H3K27me3_tss_signal_mean"]):
        df["repression_index"] = (df["H3K9me3_tss_signal_mean"] + df["H3K27me3_tss_signal_mean"]) / 2
    if all(c in df for c in ["H3K27ac_tss_signal_mean", "H3K4me3_tss_signal_mean", "DNase_tss_signal_mean"]):
        df["activation_index"] = (
            df["H3K27ac_tss_signal_mean"] + df["H3K4me3_tss_signal_mean"] + df["DNase_tss_signal_mean"]
        ) / 3

    return check_new_columns(df, prev_cols, "activation_repression_indices")


def add_axis_and_delta(df, marks):
    """8️⃣ axis sum & promoter-body delta"""
    prev_cols = df.columns.copy()

    for mark in marks:
        g_mean, t_mean = f"{mark}_gene_signal_mean", f"{mark}_tss_signal_mean"
        if g_mean in df and t_mean in df:
            df[f"{mark}_axis_sum"] = df[t_mean] + df[g_mean]
            df[f"{mark}_promoter_body_delta"] = df[t_mean] - df[g_mean]

    return check_new_columns(df, prev_cols, "axis_and_delta")

def add_tss_distance_feature(df):
    """9️⃣ Distance between TSS midpoint and nearest gene boundary (start or end)."""
    prev_cols = df.columns.copy()
    required_cols = ["gene_start", "gene_end", "TSS_end", "TSS_start"]
    if not all(c in df.columns for c in required_cols):
        print("⚠️ Missing required columns for TSS distance computation.")
        df["tss_to_gene_boundary_min"] = np.nan
        return df

    # --- 計算 TSS midpoint ---
    Mid_tss = (df["TSS_start"].astype(float) + df["TSS_end"].astype(float)) / 2

    # --- 與 gene boundary 距離 ---
    dist_to_start = np.abs(Mid_tss - df["gene_start"].astype(float))
    dist_to_end = np.abs(Mid_tss - df["gene_end"].astype(float))
    gene_region_length = np.abs(df["gene_end"].astype(float) - df["gene_start"].astype(float))

    # --- 取最小距離 + 個別距離 ---
    df["tss_to_gene_boundary_min"] = np.minimum(dist_to_start, dist_to_end)
    df["tss_to_gene_boundary_max"] = np.maximum(dist_to_start, dist_to_end)
    df["tss_to_gene_start_dist"] = dist_to_start
    df["tss_to_gene_end_dist"] = dist_to_end
    df["tss_to_gene_boundary_min_gene_region_ratio"] = np.minimum(dist_to_start, dist_to_end) / gene_region_length
    df["tss_to_gene_boundary_max_gene_region_ratio"] = np.maximum(dist_to_start, dist_to_end) / gene_region_length
    df["tss_to_gene_start_dist_gene_region_ratio"] = dist_to_start / gene_region_length
    df["tss_to_gene_end_dist_gene_region_ratio"] = dist_to_end / gene_region_length

    return check_new_columns(df, prev_cols, "tss_to_gene_boundary_min")

# ============================================================
#                MAIN PIPELINE FUNCTION
# ============================================================


def run_feature_engineering(merged_dir, cells, marks):
    activating_marks = ["DNase", "H3K27ac", "H3K4me3", "H3K36me3"]
    repressive_marks = ["H3K9me3", "H3K27me3"]

    for cell in cells:
        in_path = os.path.join(merged_dir, f"{cell}_all_logzscore_logzscore.tsv")
        if not os.path.exists(in_path):
            print(f"⚠️ Missing input file: {in_path}")
            continue

        print(f"\n📂 Processing {cell} ...")
        df = pd.read_csv(in_path, sep="\t")
        # if cell in ["X1", "X2"]:
        #     df = df[df["chr"] != "chr1"].reset_index(drop=True)
        # else:
        #     df = df[df["chr"] == "chr1"].reset_index(drop=True)
        df = add_gene_structure(df)
        df = add_tss_distance_feature(df)   # 🆕 新增這一行
        df = add_promoter_gene_ratio(df, activating_marks + repressive_marks)
        df = add_activation_balance(df)
        df = add_promoter_entropy(df, activating_marks, repressive_marks)
        df = add_chromatin_indices(df, activating_marks, repressive_marks)
        df = add_strand_features(df)
        df = add_cross_mark_interactions(df, marks)
        df = add_activation_repression_indices(df)
        df = add_axis_and_delta(df, marks)
        df = rank_transform_features(df)
        df = add_bed_topology_features(df, marks)
        df = add_advanced_chromatin_features(df, marks)   # 🧠 加在這裡
        df = add_cross_layer_features(df, marks=marks, prefix="cross", bw_norm="logz")

        out_path = os.path.join(merged_dir, f"{cell}_all_rank_features.tsv")
        df.to_csv(out_path, sep="\t", index=False)
        print(f"✅ Saved engineered features → {out_path}")

    print("\n🎯 Feature engineering complete for all cell lines.")

# ============================================================
#                EXECUTION EXAMPLE
# ============================================================

if __name__ == "__main__":
    merged_dir = "../preprocessed_data/reference/1. merged data/without_y_100_one_side/"
    cells = ["X1", "X2", "X3"]
    META_COLS = ["gene_name", "chr", "gene_start", "gene_end",
             "TSS_start", "TSS_end", "strand", "gex", "gex_rank"]
    marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]

# === 讀取資料 ===
    run_feature_engineering(merged_dir, cells, marks)



📂 Processing X1 ...
✅ [gene_structure] All 2 new features valid.
✅ [tss_to_gene_boundary_min] All 8 new features valid.
✅ [promoter_gene_ratio] All 12 new features valid.
✅ [activation_balance] All 2 new features valid.
✅ [promoter_entropy] All 2 new features valid.
✅ [chromatin_indices] All 2 new features valid.
✅ [strand_features] All 2 new features valid.
✅ [cross_mark_interactions_v2] All 189 new features valid.
✅ [activation_repression_indices] All 3 new features valid.
✅ [axis_and_delta] All 14 new features valid.
🔢 Performing rank transformation...


  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}

✅ [bed_topology_features_v2] All 70 new features valid.


  df["promoter_gene_coherence"] = df.apply(promoter_gene_coherence, axis=1)
  df["chromatin_entropy_mean"] = df[entropy_cols].mean(axis=1)
  df["chromatin_entropy_std"] = df[entropy_cols].std(axis=1)
  df[f"{prefix}_{mark}_tss_bw_over_peak_density"] = safe_div(df[tss_bw_mean], df[tss_peak_dens])
  df[f"{prefix}_{mark}_tss_bw_std_over_peak_entropy"] = safe_div(df[tss_bw_std], df[tss_peak_entr])
  df[f"{prefix}_{mark}_gene_bw_over_peak_density"] = safe_div(df[gene_bw_mean], df[gene_peak_dens])
  df[f"{prefix}_{mark}_gene_bw_std_over_peak_entropy"] = safe_div(df[gene_bw_std], df[gene_peak_entr])
  df[f"{prefix}_{mark}_tss_entropy_diff"] = df[tss_bw_entropy] - df[tss_peak_entr]
  df[f"{prefix}_{mark}_tss_bw_vs_bed_mean_diff"] = df[tss_bw_mean] - df[tss_bed_mean]
  df[f"{prefix}_{mark}_gene_entropy_diff"] = df[gene_bw_entropy] - df[gene_peak_entr]
  df[f"{prefix}_{mark}_tss_peak_density_times_bw_mean"] = df[tss_peak_dens] * df[tss_bw_mean]
  df[f"{prefix}_{mark}_tss_peak_density_times_bw_en

✅ [advanced_chromatin_features] All 15 new features valid.
✅ [cross] Added 98 cross-layer features.
✅ Saved engineered features → ../preprocessed_data/reference/1. merged data/without_y_100_one_side/X1_all_rank_features.tsv

📂 Processing X2 ...
✅ [gene_structure] All 2 new features valid.
✅ [tss_to_gene_boundary_min] All 8 new features valid.
✅ [promoter_gene_ratio] All 12 new features valid.
✅ [activation_balance] All 2 new features valid.
✅ [promoter_entropy] All 2 new features valid.
✅ [chromatin_indices] All 2 new features valid.
✅ [strand_features] All 2 new features valid.
✅ [cross_mark_interactions_v2] All 189 new features valid.
✅ [activation_repression_indices] All 3 new features valid.
✅ [axis_and_delta] All 14 new features valid.
🔢 Performing rank transformation...


  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}

✅ [bed_topology_features_v2] All 70 new features valid.


  df["promoter_gene_coherence"] = df.apply(promoter_gene_coherence, axis=1)
  df["chromatin_entropy_mean"] = df[entropy_cols].mean(axis=1)
  df["chromatin_entropy_std"] = df[entropy_cols].std(axis=1)
  df[f"{prefix}_{mark}_tss_bw_over_peak_density"] = safe_div(df[tss_bw_mean], df[tss_peak_dens])
  df[f"{prefix}_{mark}_tss_bw_std_over_peak_entropy"] = safe_div(df[tss_bw_std], df[tss_peak_entr])
  df[f"{prefix}_{mark}_gene_bw_over_peak_density"] = safe_div(df[gene_bw_mean], df[gene_peak_dens])
  df[f"{prefix}_{mark}_gene_bw_std_over_peak_entropy"] = safe_div(df[gene_bw_std], df[gene_peak_entr])
  df[f"{prefix}_{mark}_tss_entropy_diff"] = df[tss_bw_entropy] - df[tss_peak_entr]
  df[f"{prefix}_{mark}_tss_bw_vs_bed_mean_diff"] = df[tss_bw_mean] - df[tss_bed_mean]
  df[f"{prefix}_{mark}_gene_entropy_diff"] = df[gene_bw_entropy] - df[gene_peak_entr]
  df[f"{prefix}_{mark}_tss_peak_density_times_bw_mean"] = df[tss_peak_dens] * df[tss_bw_mean]
  df[f"{prefix}_{mark}_tss_peak_density_times_bw_en

✅ [advanced_chromatin_features] All 15 new features valid.
✅ [cross] Added 98 cross-layer features.
✅ Saved engineered features → ../preprocessed_data/reference/1. merged data/without_y_100_one_side/X2_all_rank_features.tsv

📂 Processing X3 ...
✅ [gene_structure] All 2 new features valid.
✅ [tss_to_gene_boundary_min] All 8 new features valid.
✅ [promoter_gene_ratio] All 12 new features valid.
✅ [activation_balance] All 2 new features valid.
✅ [promoter_entropy] All 2 new features valid.
✅ [chromatin_indices] All 2 new features valid.
✅ [strand_features] All 2 new features valid.
✅ [cross_mark_interactions_v2] All 189 new features valid.
✅ [activation_repression_indices] All 3 new features valid.
✅ [axis_and_delta] All 14 new features valid.
🔢 Performing rank transformation...


  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}_{m2}_mul"]      = a * b
  df[f"{m1}_{m2}_ratio"]    = safe_div(a, b)
  df[f"{m2}_{m1}_ratio"]    = safe_div(b, a)  # 雙向比值
  df[f"{m1}_{m2}_diff"]     = a - b
  df[f"{m1}_{m2}_absdiff"]  = np.abs(a - b)
  df[f"{m1}

✅ [bed_topology_features_v2] All 70 new features valid.


  df["promoter_gene_coherence"] = df.apply(promoter_gene_coherence, axis=1)
  df["chromatin_entropy_mean"] = df[entropy_cols].mean(axis=1)
  df["chromatin_entropy_std"] = df[entropy_cols].std(axis=1)
  df[f"{prefix}_{mark}_tss_bw_over_peak_density"] = safe_div(df[tss_bw_mean], df[tss_peak_dens])
  df[f"{prefix}_{mark}_tss_bw_std_over_peak_entropy"] = safe_div(df[tss_bw_std], df[tss_peak_entr])
  df[f"{prefix}_{mark}_gene_bw_over_peak_density"] = safe_div(df[gene_bw_mean], df[gene_peak_dens])
  df[f"{prefix}_{mark}_gene_bw_std_over_peak_entropy"] = safe_div(df[gene_bw_std], df[gene_peak_entr])
  df[f"{prefix}_{mark}_tss_entropy_diff"] = df[tss_bw_entropy] - df[tss_peak_entr]
  df[f"{prefix}_{mark}_tss_bw_vs_bed_mean_diff"] = df[tss_bw_mean] - df[tss_bed_mean]
  df[f"{prefix}_{mark}_gene_entropy_diff"] = df[gene_bw_entropy] - df[gene_peak_entr]
  df[f"{prefix}_{mark}_tss_peak_density_times_bw_mean"] = df[tss_peak_dens] * df[tss_bw_mean]
  df[f"{prefix}_{mark}_tss_peak_density_times_bw_en

✅ [advanced_chromatin_features] All 15 new features valid.
✅ [cross] Added 98 cross-layer features.
✅ Saved engineered features → ../preprocessed_data/reference/1. merged data/without_y_100_one_side/X3_all_rank_features.tsv

🎯 Feature engineering complete for all cell lines.


In [2]:
def add_advanced_chromatin_features(df, marks):
    """
    🧬 Advanced chromatin-level feature engineering
    ---------------------------------------------------
    包含：
      1️⃣ Accessibility gradient (promoter–gene body差異梯度)
      2️⃣ Promoter asymmetry (方向性)
      3️⃣ Enhancer–Promoter coupling
      4️⃣ Co-accessibility / synergy (H3K27ac × DNase)
      5️⃣ Bivalent index / balance (activation–repression antagonism)
      6️⃣ Promoter–Gene body coherence (跨標誌協調性)
      7️⃣ Chromatin entropy diversity (全域染色質複雜度)
    """
    prev_cols = df.columns.copy()

    # ============================================================
    # 1️⃣ Accessibility gradient per mark
    # ============================================================
    if "gene_length" in df:
        for mark in marks:
            tss_col = f"{mark}_tss_logz_mean"
            gene_col = f"{mark}_gene_logz_mean"
            if tss_col in df and gene_col in df:
                df[f"{mark}_accessibility_gradient"] = safe_div(
                    (df[tss_col] - df[gene_col]), df["gene_length"] + 1e-8
                )

    # ============================================================
    # 2️⃣ Promoter asymmetry (需要上游/下游分別 feature 時再補)
    # ============================================================
    # 若已有 upstream / downstream mean，可用下列形式：
    if all(c in df.columns for c in ["DNase_tss_logz_mean", "DNase_gene_logz_mean"]):
        df["DNase_promoter_asymmetry"] = safe_div(
            df["DNase_tss_logz_mean"] - df["DNase_gene_logz_mean"],
            df["DNase_tss_logz_mean"] + df["DNase_gene_logz_mean"] + 1e-8
        )

    # ============================================================
    # 3️⃣ Enhancer–Promoter coupling (H3K27ac × H3K4me3)
    # ============================================================
    if all(c in df.columns for c in ["H3K27ac_gene_logz_mean", "H3K4me3_tss_logz_mean"]):
        df["enhancer_promoter_synergy"] = (
            df["H3K27ac_gene_logz_mean"] * df["H3K4me3_tss_logz_mean"]
        )

    # ============================================================
    # 4️⃣ H3K27ac × DNase synergy / ratio
    # ============================================================
    if all(c in df.columns for c in ["H3K27ac_tss_logz_mean", "DNase_tss_logz_mean"]):
        df["H3K27ac_DNase_synergy"] = (
            df["H3K27ac_tss_logz_mean"] * df["DNase_tss_logz_mean"]
        )
        df["H3K27ac_DNase_ratio"] = safe_div(
            df["H3K27ac_tss_logz_mean"], df["DNase_tss_logz_mean"]
        )

    # ============================================================
    # 5️⃣ Bivalent index / balance (H3K27ac vs H3K27me3)
    # ============================================================
    if all(c in df.columns for c in ["H3K27ac_tss_logz_mean", "H3K27me3_tss_logz_mean"]):
        df["bivalent_index"] = (
            df["H3K27ac_tss_logz_mean"] * df["H3K27me3_tss_logz_mean"]
        )
        df["bivalent_balance"] = (
            df["H3K27ac_tss_logz_mean"] - df["H3K27me3_tss_logz_mean"]
        )

    # ============================================================
    # 6️⃣ Promoter–Gene coherence across marks
    # ============================================================
    from scipy.stats import pearsonr

    def promoter_gene_coherence(row):
        tss_vals, gene_vals = [], []
        for m in marks:
            tss_col, gene_col = f"{m}_tss_logz_mean", f"{m}_gene_logz_mean"
            if tss_col in df.columns and gene_col in df.columns:
                tss_vals.append(row.get(tss_col, np.nan))
                gene_vals.append(row.get(gene_col, np.nan))
        tss_vals, gene_vals = np.array(tss_vals), np.array(gene_vals)
        if np.isnan(tss_vals).any() or np.isnan(gene_vals).any() or len(tss_vals) < 2:
            return 0.0
        try:
            return pearsonr(tss_vals, gene_vals)[0]
        except Exception:
            return 0.0

    df["promoter_gene_coherence"] = df.apply(promoter_gene_coherence, axis=1)

    # ============================================================
    # 7️⃣ Chromatin entropy diversity
    # ============================================================
    entropy_cols = [c for c in df.columns if c.endswith("_entropy")]
    if entropy_cols:
        df["chromatin_entropy_mean"] = df[entropy_cols].mean(axis=1)
        df["chromatin_entropy_std"] = df[entropy_cols].std(axis=1)

    return check_new_columns(df, prev_cols, "advanced_chromatin_features")


In [3]:
import numpy as np

def add_cross_layer_features(df, marks, prefix="cross", bw_norm="logz"):
    """
    🧬 Cross-layer features between BED and bigWig for each histone mark.

    假設 DataFrame 內同時有：
      - BED 層:  {mark}_tss_signal_mean, {mark}_gene_signal_mean, {mark}_tss_peak_density, {mark}_tss_peak_entropy ...
      - bigWig 層: {mark}_tss_{bw_norm}_mean, {mark}_tss_entropy, {mark}_gene_{bw_norm}_mean, ...

    Parameters
    ----------
    df : pd.DataFrame
        合併後的 DataFrame，包含 bed + bigwig 特徵
    marks : list of str
        e.g. ["H3K27ac", "H3K4me3", "H3K27me3", "DNase"]
    prefix : str
        cross 層特徵前綴名稱
    bw_norm : str
        bigwig normalization prefix (logz, zscore, log, raw)

    Returns
    -------
    pd.DataFrame : 含新增 cross-layer 特徵的 DataFrame
    """
    prev_cols = df.columns.copy()

    def safe_div(a, b):
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.where(np.abs(b) > 1e-8, a / b, 0)

    for mark in marks:
        # --- BED 層欄位 ---
        tss_bed_mean  = f"{mark}_tss_signal_mean"
        gene_bed_mean = f"{mark}_gene_signal_mean"

        tss_peak_dens = f"{mark}_tss_peak_density"  if f"{mark}_tss_peak_density"  in df else None
        gene_peak_dens = f"{mark}_gene_peak_density" if f"{mark}_gene_peak_density" in df else None
        tss_peak_entr = f"{mark}_tss_peak_entropy"  if f"{mark}_tss_peak_entropy"  in df else None
        gene_peak_entr = f"{mark}_gene_peak_entropy" if f"{mark}_gene_peak_entropy" in df else None

        # --- bigWig 層欄位 ---
        tss_bw_mean   = f"{mark}_tss_{bw_norm}_mean"
        gene_bw_mean  = f"{mark}_gene_{bw_norm}_mean"
        tss_bw_std    = f"{mark}_tss_{bw_norm}_std"
        gene_bw_std   = f"{mark}_gene_{bw_norm}_std"
        tss_bw_entropy = f"{mark}_tss_entropy"
        gene_bw_entropy = f"{mark}_gene_entropy"

        # --- 跳過沒有 bigwig 層的 mark ---
        if not any(c in df.columns for c in [tss_bw_mean, gene_bw_mean]):
            continue

        # ============================================================
        # 1️⃣ 比率型 cross features
        # ============================================================
        if tss_bw_mean in df and tss_peak_dens:
            df[f"{prefix}_{mark}_tss_bw_over_peak_density"] = safe_div(df[tss_bw_mean], df[tss_peak_dens])
        if tss_bw_std in df and tss_peak_entr:
            df[f"{prefix}_{mark}_tss_bw_std_over_peak_entropy"] = safe_div(df[tss_bw_std], df[tss_peak_entr])

        if gene_bw_mean in df and gene_peak_dens:
            df[f"{prefix}_{mark}_gene_bw_over_peak_density"] = safe_div(df[gene_bw_mean], df[gene_peak_dens])
        if gene_bw_std in df and gene_peak_entr:
            df[f"{prefix}_{mark}_gene_bw_std_over_peak_entropy"] = safe_div(df[gene_bw_std], df[gene_peak_entr])
        # ============================================================
        # 2️⃣ 差異型 cross features
        # ============================================================
        if tss_bw_entropy in df and tss_peak_entr:
            df[f"{prefix}_{mark}_tss_entropy_diff"] = df[tss_bw_entropy] - df[tss_peak_entr]
        if tss_bw_mean in df and tss_bed_mean in df:
            df[f"{prefix}_{mark}_tss_bw_vs_bed_mean_diff"] = df[tss_bw_mean] - df[tss_bed_mean]
        if gene_bw_entropy in df and gene_peak_entr:
            df[f"{prefix}_{mark}_gene_entropy_diff"] = df[gene_bw_entropy] - df[gene_peak_entr]

        # ============================================================
        # 3️⃣ 加權型 cross features
        # ============================================================
        if tss_bw_mean in df and tss_peak_dens:
            df[f"{prefix}_{mark}_tss_peak_density_times_bw_mean"] = df[tss_peak_dens] * df[tss_bw_mean]
        if tss_bw_entropy in df and tss_peak_dens:
            df[f"{prefix}_{mark}_tss_peak_density_times_bw_entropy"] = df[tss_peak_dens] * df[tss_bw_entropy]
        if tss_bw_mean in df and tss_peak_entr:
            df[f"{prefix}_{mark}_tss_peak_entropy_times_bw_mean"] = df[tss_peak_entr] * df[tss_bw_mean]

        # ============================================================
        # 4️⃣ promoter vs gene 平衡
        # ============================================================
        if tss_bw_mean in df and gene_bw_mean in df:
            df[f"{prefix}_{mark}_bw_promoter_gene_delta"] = df[tss_bw_mean] - df[gene_bw_mean]
            df[f"{prefix}_{mark}_bw_promoter_gene_ratio"] = safe_div(df[tss_bw_mean], df[gene_bw_mean])

        # ============================================================
        # 5️⃣ BED–bigWig 層交互
        # ============================================================
        if tss_bed_mean in df and tss_bw_mean in df:
            df[f"{prefix}_{mark}_tss_bw_bed_interaction"] = df[tss_bed_mean] * df[tss_bw_mean]
        if gene_bed_mean in df and gene_bw_mean in df:
            df[f"{prefix}_{mark}_gene_bw_bed_interaction"] = df[gene_bed_mean] * df[gene_bw_mean]

    # ============================================================
    # 🧹 清理 NaN / inf
    # ============================================================
    new_cols = [c for c in df.columns if c not in prev_cols and df[c].dtype.kind in "fc"]
    df[new_cols] = df[new_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    print(f"✅ [{prefix}] Added {len(new_cols)} cross-layer features.")
    return df
def add_bed_topology_features(df, marks):
    """
    🧩 Additional BED-based topology and spatial features (mark-aware version).
    
    每個 histone mark 都會根據以下 BED 層變數計算：
      - compactness_index = coverage_ratio / num_peaks
      - signal_density_product = signal_mean * peak_density
      - entropy_density_ratio = peak_entropy / peak_density
      - signal_coefficient_of_variation = signal_std / |signal_mean|
      - avg_peak_dist_per_peak = closest_peak_to_TSS / num_peaks

    Parameters
    ----------
    df : pd.DataFrame
        含所有 mark 的 BED 特徵
    marks : list of str
        e.g. ["H3K27ac", "H3K4me3", "H3K9me3", "DNase"]

    Returns
    -------
    df : pd.DataFrame
        含各 mark 的 BED topology 特徵
    """
    prev_cols = df.columns.copy()

    for mark in marks:
        for region in ["gene", "tss"]:
            # --- dynamic feature names ---
            prefix = f"{mark}_{region}_"
            num_peaks = f"{prefix}num_peaks"
            cov_ratio = f"{prefix}coverage_ratio"
            peak_dens = f"{prefix}peak_density"
            peak_entropy = f"{prefix}peak_entropy"
            signal_mean = f"{prefix}signal_mean"
            signal_std = f"{prefix}signal_std"
            closest_peak = f"{prefix}closest_peak_to_TSS"

            # 1️⃣ Compactness & density
            if all(c in df.columns for c in [num_peaks, cov_ratio]):
                df[f"{prefix}compactness_index"] = safe_div(df[cov_ratio], df[num_peaks])
            if all(c in df.columns for c in [signal_mean, peak_dens]):
                df[f"{prefix}signal_density_product"] = df[signal_mean] * df[peak_dens]

            # 2️⃣ Entropy–density ratio
            if all(c in df.columns for c in [peak_entropy, peak_dens]):
                df[f"{prefix}entropy_density_ratio"] = safe_div(df[peak_entropy], df[peak_dens])

            # 3️⃣ Coefficient of variation
            if all(c in df.columns for c in [signal_mean, signal_std]):
                df[f"{prefix}signal_coefficient_of_variation"] = safe_div(
                    df[signal_std], np.abs(df[signal_mean]) + 1e-8
                )

            # 4️⃣ Avg peak distance per peak
            if all(c in df.columns for c in [closest_peak, num_peaks]):
                df[f"{prefix}avg_peak_dist_per_peak"] = safe_div(df[closest_peak], df[num_peaks])

    return check_new_columns(df, prev_cols, "bed_topology_features_v2")


In [9]:
TRAIN_PATH = "../preprocessed_data/reference/1. merged data/with_y_500_one_side/X1_all_logzscore_logzscore_with_y.tsv"

# === 讀取資料 ===
df_train = pd.read_csv(TRAIN_PATH, sep="\t")
marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]

df = add_advanced_chromatin_features(df_train, marks=marks)
df

✅ [advanced_chromatin_features] All 9 new features valid.


Unnamed: 0,gene_name,gex,gex_rank,DNase_gene_has_peak,DNase_gene_num_peaks,DNase_gene_peak_density,DNase_gene_signal_sum,DNase_gene_signal_mean,DNase_gene_signal_std,DNase_gene_signal_min,...,H3K9me3_tss_laplacian,DNase_promoter_asymmetry,enhancer_promoter_synergy,H3K27ac_DNase_synergy,H3K27ac_DNase_ratio,bivalent_index,bivalent_balance,promoter_gene_coherence,chromatin_entropy_mean,chromatin_entropy_std
0,SLC20A1,0.000000,0.290131,0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.019608,-0.040363,2.209690,2.810323,2.489691,-1.802298,3.326512,0.802313,2.985658,4.931584
1,C11orf58,2239.103328,0.997421,1,1.0,0.001501,-0.533665,-0.533665,0.000000,-0.533665,...,0.044036,1.005767,-0.071360,119.603303,0.780632,-4.978579,10.177855,0.081195,3.187104,5.720149
2,ZSCAN9,19.798064,0.800018,1,1.0,0.028084,-0.542128,-0.542128,0.000000,-0.542128,...,0.044518,0.708751,5.425161,31.891028,1.413212,-2.941203,7.151442,0.659880,2.695835,4.640487
3,CD19,411.530623,0.973532,1,3.0,0.098665,-1.646059,-0.548686,0.002255,-0.551435,...,0.030176,0.414917,10.727123,27.513807,0.990872,-4.068784,6.000622,0.876591,2.684454,4.626418
4,TMEM123,34.214129,0.833702,1,6.0,0.019519,-3.291924,-0.548654,0.007273,-0.560546,...,0.050919,0.863612,4.017016,50.884955,0.470313,-1.069331,5.110604,0.465227,3.366209,5.345235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16279,TMPRSS9,0.000000,0.290131,1,3.0,0.010874,-1.652054,-0.550685,0.001697,-0.553017,...,0.014230,-2.493325,-0.015593,0.001671,0.210492,-0.010344,-0.570318,0.251809,2.982703,5.366407
16280,PSMA3,1.057178,0.647445,1,2.0,0.017718,-1.074457,-0.537229,0.005296,-0.542524,...,0.070753,0.208488,2.643330,2.155439,1.706326,-0.907492,2.390979,0.360374,2.916623,5.061438
16281,PLEKHA4,0.000000,0.290131,1,1.0,0.007538,-0.535512,-0.535512,0.000000,-0.535512,...,0.006795,-6.095466,-0.075021,-0.078272,-1.129903,0.055994,0.109103,0.567889,2.892668,5.019665
16282,DENND1C,155.981831,0.930914,1,3.0,0.047327,-1.652381,-0.550794,0.005757,-0.555776,...,0.028092,0.804045,5.968892,84.811604,1.211342,-7.898447,10.915129,0.661514,2.851654,4.829278


In [26]:
df

Unnamed: 0,gene_name,gex,gex_rank,DNase_gene_has_peak,DNase_gene_num_peaks,DNase_gene_peak_density,DNase_gene_signal_sum,DNase_gene_signal_mean,DNase_gene_signal_std,DNase_gene_signal_min,...,H3K27me3_gene_compactness_index,H3K27me3_gene_signal_density_product,H3K27me3_gene_entropy_density_ratio,H3K27me3_gene_signal_coefficient_of_variation,H3K27me3_gene_avg_peak_dist_per_peak,H3K27me3_tss_compactness_index,H3K27me3_tss_signal_density_product,H3K27me3_tss_entropy_density_ratio,H3K27me3_tss_signal_coefficient_of_variation,H3K27me3_tss_avg_peak_dist_per_peak
0,SLC20A1,0.000000,0.290131,0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C11orf58,2239.103328,0.997421,1,1.0,0.001501,-0.533665,-0.533665,0.000000,-0.533665,...,0.001426,-0.008965,0.0,0.0,124404.0,0.0,0.0,0.0,0.0,0.0
2,ZSCAN9,19.798064,0.800018,1,1.0,0.028084,-0.542128,-0.542128,0.000000,-0.542128,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CD19,411.530623,0.973532,1,3.0,0.098665,-1.646059,-0.548686,0.002255,-0.551435,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TMEM123,34.214129,0.833702,1,6.0,0.019519,-3.291924,-0.548654,0.007273,-0.560546,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16279,TMPRSS9,0.000000,0.290131,1,3.0,0.010874,-1.652054,-0.550685,0.001697,-0.553017,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16280,PSMA3,1.057178,0.647445,1,2.0,0.017718,-1.074457,-0.537229,0.005296,-0.542524,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16281,PLEKHA4,0.000000,0.290131,1,1.0,0.007538,-0.535512,-0.535512,0.000000,-0.535512,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16282,DENND1C,155.981831,0.930914,1,3.0,0.047327,-1.652381,-0.550794,0.005757,-0.555776,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
df

Unnamed: 0,gene_name,gex,gex_rank,DNase_gene_has_peak,DNase_gene_num_peaks,DNase_gene_peak_density,DNase_gene_signal_sum,DNase_gene_signal_mean,DNase_gene_signal_std,DNase_gene_signal_min,...,H3K9me3_tss_logz_min,H3K9me3_tss_logz_max,H3K9me3_tss_logz_diff,H3K9me3_tss_gradient_mean,H3K9me3_tss_slope,H3K9me3_tss_kurtosis,H3K9me3_tss_skewness,H3K9me3_tss_entropy,H3K9me3_tss_autocorr,H3K9me3_tss_laplacian
0,SLC20A1,0.000000,0.290131,0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,-1.700009,1.633207,3.333216,0.009786,0.001885,2.567852,1.131109,8.827529,0.991645,0.019608
1,C11orf58,2239.103328,0.997421,1,1.0,0.001501,-0.533665,-0.533665,0.000000,-0.533665,...,-1.491168,2.546634,4.037803,0.021978,0.000842,0.059213,0.996185,9.007329,0.985974,0.044036
2,ZSCAN9,19.798064,0.800018,1,1.0,0.028084,-0.542128,-0.542128,0.000000,-0.542128,...,-1.491168,1.272558,2.763726,0.022218,0.002574,-0.314109,0.898287,9.012572,0.975843,0.044518
3,CD19,411.530623,0.973532,1,3.0,0.098665,-1.646059,-0.548686,0.002255,-0.551435,...,-1.491168,1.837447,3.328615,0.015298,-0.002879,-0.020243,0.781871,8.793605,0.990435,0.030176
4,TMEM123,34.214129,0.833702,1,6.0,0.019519,-3.291924,-0.548654,0.007273,-0.560546,...,-1.491168,2.955102,4.446270,0.025485,0.002894,0.133049,1.302931,8.913313,0.988881,0.050919
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16279,TMPRSS9,0.000000,0.290131,1,3.0,0.010874,-1.652054,-0.550685,0.001697,-0.553017,...,-0.851194,2.116955,2.968149,0.007102,0.001847,5.946096,1.224679,8.043104,0.980776,0.014230
16280,PSMA3,1.057178,0.647445,1,2.0,0.017718,-1.074457,-0.537229,0.005296,-0.542524,...,-1.491168,3.334321,4.825489,0.035312,0.001480,-0.693444,0.107575,8.854055,0.984881,0.070753
16281,PLEKHA4,0.000000,0.290131,1,1.0,0.007538,-0.535512,-0.535512,0.000000,-0.535512,...,-0.851194,0.079744,0.930939,0.003391,-0.000152,-0.704966,-1.137995,8.212727,0.990321,0.006795
16282,DENND1C,155.981831,0.930914,1,3.0,0.047327,-1.652381,-0.550794,0.005757,-0.555776,...,-1.774088,1.272558,3.046646,0.015545,0.001896,0.603147,1.209556,9.075045,0.988630,0.028092


In [34]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, rankdata

def save_feature_summary(df, out_path):
    """
    生成每個特徵的統計摘要：
    - mean, std, min, max
    - missing rate
    - Spearman correlation with gex
    """
    print("🧮 Generating feature summary (Spearman correlation)...")

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if "gex" not in df.columns:
        raise ValueError("❌ 'gex' column not found in DataFrame!")
    if "gex" in numeric_cols:
        numeric_cols.remove("gex")

    summaries = []
    gex_vals = df["gex"].values

    for col in numeric_cols:
        vals = df[col].values
        mask = ~np.isnan(vals)

        # 基本統計
        mean_val = np.nanmean(vals)
        std_val = np.nanstd(vals)
        min_val = np.nanmin(vals)
        max_val = np.nanmax(vals)
        missing_rate = 1 - mask.mean()

        # Spearman correlation
        if np.sum(mask) > 3:
            try:
                corr, pval = spearmanr(vals[mask], gex_vals[mask])
            except Exception:
                corr, pval = np.nan, np.nan
        else:
            corr, pval = np.nan, np.nan

        summaries.append({
            "feature": col,
            "mean": mean_val,
            "std": std_val,
            "min": min_val,
            "max": max_val,
            "missing_rate": missing_rate,
            "spearman_corr_with_gex": corr,
            "corr_pvalue": pval,
        })

    summary_df = pd.DataFrame(summaries).sort_values(by="spearman_corr_with_gex", ascending=False)
    out_summary_path = out_path.replace(".tsv", "_feature_summary.tsv")
    summary_df.to_csv(out_summary_path, sep="\t", index=False, float_format="%.5f")

    print(f"✅ Feature summary saved → {out_summary_path}")
    print(f"📊 Total features analyzed: {len(summary_df)}")

    return summary_df

In [35]:
df = pd.read_csv("../preprocessed_data/CAGE-merged/X1_features.tsv", sep="\t")
summary_df = save_feature_summary(df, "../preprocessed_data/CAGE-merged/X1_features.tsv")

🧮 Generating feature summary (Spearman correlation)...
✅ Feature summary saved → ../preprocessed_data/CAGE-merged/X1_features_feature_summary.tsv
📊 Total features analyzed: 460


In [36]:
df = pd.read_csv("../preprocessed_data/CAGE-merged/X2_features.tsv", sep="\t")
summary_df = save_feature_summary(df, "../preprocessed_data/CAGE-merged/X2_features.tsv")

🧮 Generating feature summary (Spearman correlation)...
✅ Feature summary saved → ../preprocessed_data/CAGE-merged/X2_features_feature_summary.tsv
📊 Total features analyzed: 460
