# Features engineer

In [None]:
import os
import numpy as np
import pandas as pd
import itertools

# ============================================================
#                FEATURE ENGINEERING HELPERS
# ============================================================

def safe_div(a, b):
    """安全除法，防止除以 0"""
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(np.abs(b) > 1e-8, a / b, 0)


def check_new_columns(df, prev_cols, context=""):
    """
    ✅ 只檢查「新增加的欄位」中是否有 NaN 或 inf。
    並列出對應的 gene_name（前幾個例子）。
    """
    new_cols = [c for c in df.columns if c not in prev_cols]
    if not new_cols:
        return df  # 沒新增欄位就略過
    for col in new_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    numeric_cols = [c for c in new_cols if np.issubdtype(df[c].dtype, np.number)]

    # 轉 inf → NaN
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

    nan_ratio = df[numeric_cols].isna().mean()
    bad = nan_ratio[nan_ratio > 0].sort_values(ascending=False)

    if not bad.empty:
        print(f"⚠️ [{context}] {len(bad)} new features contain NaN/inf:")
        # 列出每個欄位 NaN 的 gene_name（只顯示前幾個避免太多）
        for col in bad.index[:5]:  # 只顯示前5個問題最多的欄位
            nan_genes = df.loc[df[col].isna(), "gene_name"].head(5).tolist() \
                        if "gene_name" in df.columns else []
            if nan_genes:
                print(f"   ↳ {col}: {len(df[col].isna())} NaN — e.g. {nan_genes}")
            else:
                print(f"   ↳ {col}: {len(df[col].isna())} NaN (no gene_name column found)")
    else:
        print(f"✅ [{context}] All {len(numeric_cols)} new features valid.")

    return df


# ============================================================
#                 FEATURE RANK TRANSFORMATION
# ============================================================

from scipy.stats import rankdata
import numpy as np
from scipy.stats import rankdata

from scipy.stats import rankdata
import numpy as np

def rank_transform_features(df, by_chr=True):
    """
    Rank-transform feature values, optionally per chromosome.
    If by_chr=True → rank within each chromosome.
    If by_chr=False → rank across the entire dataset.

    根據 biological direction (activating/repressive) 決定正向或反向排名。
    """
    mode = "per-chromosome" if by_chr else "global"
    print(f"🔢 Performing {mode} rank transformation...")

    exclude_cols = ["gene_name", "chr", "gene_start", "gene_end", "TSS_start", "TSS_end", "strand"]
    numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in exclude_cols]
    
    activating_marks = ["DNase", "H3K27ac", "H3K4me3", "H3K36me3", "H3K4me1"]
    repressive_marks = ["H3K9me3", "H3K27me3"]

    df_ranked = df.copy()

    def rank_with_direction(vals, col):
        """Apply biological-direction-aware ranking to a numeric array."""
        if np.all(np.isnan(vals)):
            return np.full_like(vals, np.nan, dtype=float)
        if any(mark in col for mark in repressive_marks) or "repress" in col.lower():
            ranks = rankdata(-vals, method="average") / len(vals)
        else:
            ranks = rankdata(vals, method="average") / len(vals)
        return ranks

    if by_chr:
        # --- chromosome-based ranking ---
        for chrom, subdf in df.groupby("chr", sort=False):
            idx = subdf.index
            for col in numeric_cols:
                vals = subdf[col].values
                df_ranked.loc[idx, f"{col}_rank"] = rank_with_direction(vals, col)
    else:
        # --- global ranking ---
        for col in numeric_cols:
            vals = df[col].values
            df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)

    return df_ranked


def add_gene_structure(df):
    """0️⃣ gene structure features"""
    prev_cols = df.columns.copy()

    if all(col in df.columns for col in ["gene_start", "gene_end"]):
        df["gene_length"] = df["gene_end"] - df["gene_start"]
    else:
        df["gene_length"] = np.nan

    if all(col in df.columns for col in ["TSS_start", "TSS_end"]):
        df["tss_length"] = df["TSS_end"] - df["TSS_start"]
    else:
        df["tss_length"] = np.nan

    return check_new_columns(df, prev_cols, "gene_structure")


def add_promoter_gene_ratio(df, marks):
    """1️⃣ promoter vs gene body ratio"""
    prev_cols = df.columns.copy()

    for mark in marks:
        gene_mean, tss_mean = f"{mark}_gene_z_mean", f"{mark}_tss_z_mean"
        gene_std, tss_std = f"{mark}_gene_z_std", f"{mark}_tss_z_std"
        if gene_mean in df and tss_mean in df:
            df[f"{mark}_ratio_mean"] = safe_div(df[tss_mean], df[gene_mean])
        if gene_std in df and tss_std in df:
            df[f"{mark}_ratio_std"] = safe_div(df[tss_std], df[gene_std])

    return check_new_columns(df, prev_cols, "promoter_gene_ratio")


def add_activation_balance(df):
    """2️⃣ activation–repression balance"""
    prev_cols = df.columns.copy()

    if all(c in df for c in ["H3K27ac_tss_z_mean", "H3K27me3_tss_z_mean"]):
        df["balance_H3K27"] = df["H3K27ac_tss_z_mean"] - df["H3K27me3_tss_z_mean"]
    if all(c in df for c in ["H3K4me3_tss_z_mean", "H3K9me3_tss_z_mean"]):
        df["balance_H3K4"] = df["H3K4me3_tss_z_mean"] - df["H3K9me3_tss_z_mean"]

    return check_new_columns(df, prev_cols, "activation_balance")


def add_promoter_entropy(df, activating_marks, repressive_marks):
    """3️⃣ promoter entropy & variability"""
    prev_cols = df.columns.copy()

    tss_cols = [f"{m}_tss_z_mean" for m in activating_marks + repressive_marks if f"{m}_tss_z_mean" in df]
    if not tss_cols:
        return df

    df["promoter_variability"] = df[tss_cols].std(axis=1)
    norm_vals = df[tss_cols].div(df[tss_cols].sum(axis=1), axis=0)
    df["promoter_entropy"] = -np.nansum(
        np.nan_to_num(norm_vals) * np.log(np.nan_to_num(norm_vals) + 1e-8), axis=1
    )

    return check_new_columns(df, prev_cols, "promoter_entropy")


def add_chromatin_indices(df, activating_marks, repressive_marks):
    """4️⃣ openness & repression index"""
    prev_cols = df.columns.copy()

    act_cols = [f"{m}_tss_z_mean" for m in activating_marks if f"{m}_tss_z_mean" in df]
    rep_cols = [f"{m}_tss_z_mean" for m in repressive_marks if f"{m}_tss_z_mean" in df]
    if act_cols:
        df["openness_index"] = df[act_cols].mean(axis=1)
    if rep_cols:
        df["repression_index"] = df[rep_cols].mean(axis=1)

    return check_new_columns(df, prev_cols, "chromatin_indices")


def add_strand_features(df):
    """5️⃣ strand-aware features"""
    prev_cols = df.columns.copy()

    if "strand" in df.columns:
        df["strand_is_plus"] = (df["strand"] == "+").astype(int)
        df["strand_is_minus"] = (df["strand"] == "-").astype(int)

    return check_new_columns(df, prev_cols, "strand_features")


def add_cross_mark_interactions(df, marks):
    """6️⃣ pairwise cross-mark interactions"""
    prev_cols = df.columns.copy()

    for m1, m2 in itertools.combinations(marks, 2):
        c1, c2 = f"{m1}_tss_z_mean", f"{m2}_tss_z_mean"
        if c1 in df and c2 in df:
            df[f"{m1}_{m2}_interaction"] = df[c1] * df[c2]

    return check_new_columns(df, prev_cols, "cross_mark_interactions")


def add_activation_repression_indices(df):
    """7️⃣ summarized activation/repression indices"""
    prev_cols = df.columns.copy()

    if all(c in df for c in ["H3K27ac_tss_z_mean", "H3K27me3_tss_z_mean"]):
        df["activation_balance"] = df["H3K27ac_tss_z_mean"] - df["H3K27me3_tss_z_mean"]
    if all(c in df for c in ["H3K4me3_tss_z_mean", "H3K9me3_tss_z_mean"]):
        df["promoter_activity"] = df["H3K4me3_tss_z_mean"] - df["H3K9me3_tss_z_mean"]
    if all(c in df for c in ["H3K9me3_tss_z_mean", "H3K27me3_tss_z_mean"]):
        df["repression_index"] = (df["H3K9me3_tss_z_mean"] + df["H3K27me3_tss_z_mean"]) / 2
    if all(c in df for c in ["H3K27ac_tss_z_mean", "H3K4me3_tss_z_mean", "DNase_tss_z_mean"]):
        df["activation_index"] = (
            df["H3K27ac_tss_z_mean"] + df["H3K4me3_tss_z_mean"] + df["DNase_tss_z_mean"]
        ) / 3

    return check_new_columns(df, prev_cols, "activation_repression_indices")


def add_axis_and_delta(df, marks):
    """8️⃣ axis sum & promoter-body delta"""
    prev_cols = df.columns.copy()

    for mark in marks:
        g_mean, t_mean = f"{mark}_gene_z_mean", f"{mark}_tss_z_mean"
        if g_mean in df and t_mean in df:
            df[f"{mark}_axis_sum"] = df[t_mean] + df[g_mean]
            df[f"{mark}_promoter_body_delta"] = df[t_mean] - df[g_mean]

    return check_new_columns(df, prev_cols, "axis_and_delta")


# ============================================================
#                MAIN PIPELINE FUNCTION
# ============================================================

def run_feature_engineering(merged_dir, cells, marks, by_chr=True):
    activating_marks = ["DNase", "H3K27ac", "H3K4me3", "H3K36me3"]
    repressive_marks = ["H3K9me3", "H3K27me3"]

    for cell in cells:
        in_path = os.path.join(merged_dir, f"{cell}_zscore.tsv")
        if not os.path.exists(in_path):
            print(f"⚠️ Missing input file: {in_path}")
            continue

        print(f"\n📂 Processing {cell} ...")
        df = pd.read_csv(in_path, sep="\t")

        df = add_gene_structure(df)
        df = add_promoter_gene_ratio(df, activating_marks + repressive_marks)
        df = add_activation_balance(df)
        df = add_promoter_entropy(df, activating_marks, repressive_marks)
        df = add_chromatin_indices(df, activating_marks, repressive_marks)
        df = add_strand_features(df)
        df = add_cross_mark_interactions(df, marks)
        df = add_activation_repression_indices(df)
        df = add_axis_and_delta(df, marks)
        df = rank_transform_features(df, by_chr=by_chr)

        out_path = os.path.join(merged_dir, f"{cell}_all_rank_features.tsv")
        df.to_csv(out_path, sep="\t", index=False)
        print(f"✅ Saved engineered features → {out_path}")

    print("\n🎯 Feature engineering complete for all cell lines.")

# ============================================================
#                EXECUTION EXAMPLE
# ============================================================

if __name__ == "__main__":
    merged_dir = "../preprocessed_data/CAGE-merged"
    cells = ["X1", "X2", "X3"]
    marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]

    run_feature_engineering(merged_dir, cells, marks, by_chr=False)



📂 Processing X1 ...
✅ [gene_structure] All 2 new features valid.
✅ [promoter_gene_ratio] All 12 new features valid.
✅ [activation_balance] All 2 new features valid.
✅ [promoter_entropy] All 2 new features valid.
✅ [chromatin_indices] All 2 new features valid.
✅ [strand_features] All 2 new features valid.
✅ [cross_mark_interactions] All 21 new features valid.
✅ [activation_repression_indices] All 3 new features valid.
✅ [axis_and_delta] All 14 new features valid.
🔢 Performing global rank transformation...


  np.nan_to_num(norm_vals) * np.log(np.nan_to_num(norm_vals) + 1e-8), axis=1
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_ran

✅ Saved engineered features → ../preprocessed_data/CAGE-merged/X1_all_rank_features.tsv

📂 Processing X2 ...
✅ [gene_structure] All 2 new features valid.
✅ [promoter_gene_ratio] All 12 new features valid.
✅ [activation_balance] All 2 new features valid.
✅ [promoter_entropy] All 2 new features valid.
✅ [chromatin_indices] All 2 new features valid.
✅ [strand_features] All 2 new features valid.
✅ [cross_mark_interactions] All 21 new features valid.
✅ [activation_repression_indices] All 3 new features valid.
✅ [axis_and_delta] All 14 new features valid.
🔢 Performing global rank transformation...


  np.nan_to_num(norm_vals) * np.log(np.nan_to_num(norm_vals) + 1e-8), axis=1
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_rank"] = rank_with_direction(vals, col)
  df_ranked[f"{col}_ran

In [34]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, rankdata

def save_feature_summary(df, out_path):
    """
    生成每個特徵的統計摘要：
    - mean, std, min, max
    - missing rate
    - Spearman correlation with gex
    """
    print("🧮 Generating feature summary (Spearman correlation)...")

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if "gex" not in df.columns:
        raise ValueError("❌ 'gex' column not found in DataFrame!")
    if "gex" in numeric_cols:
        numeric_cols.remove("gex")

    summaries = []
    gex_vals = df["gex"].values

    for col in numeric_cols:
        vals = df[col].values
        mask = ~np.isnan(vals)

        # 基本統計
        mean_val = np.nanmean(vals)
        std_val = np.nanstd(vals)
        min_val = np.nanmin(vals)
        max_val = np.nanmax(vals)
        missing_rate = 1 - mask.mean()

        # Spearman correlation
        if np.sum(mask) > 3:
            try:
                corr, pval = spearmanr(vals[mask], gex_vals[mask])
            except Exception:
                corr, pval = np.nan, np.nan
        else:
            corr, pval = np.nan, np.nan

        summaries.append({
            "feature": col,
            "mean": mean_val,
            "std": std_val,
            "min": min_val,
            "max": max_val,
            "missing_rate": missing_rate,
            "spearman_corr_with_gex": corr,
            "corr_pvalue": pval,
        })

    summary_df = pd.DataFrame(summaries).sort_values(by="spearman_corr_with_gex", ascending=False)
    out_summary_path = out_path.replace(".tsv", "_feature_summary.tsv")
    summary_df.to_csv(out_summary_path, sep="\t", index=False, float_format="%.5f")

    print(f"✅ Feature summary saved → {out_summary_path}")
    print(f"📊 Total features analyzed: {len(summary_df)}")

    return summary_df

In [35]:
df = pd.read_csv("../preprocessed_data/CAGE-merged/X1_features.tsv", sep="\t")
summary_df = save_feature_summary(df, "../preprocessed_data/CAGE-merged/X1_features.tsv")

🧮 Generating feature summary (Spearman correlation)...
✅ Feature summary saved → ../preprocessed_data/CAGE-merged/X1_features_feature_summary.tsv
📊 Total features analyzed: 460


In [None]:
我現在要來寫給lgbm訓練
我要讀取這個file:
來當training data: preprocessed_data/CAGE-merged/X1_features.tsv
然後preprocessed_data/CAGE-merged/X1_features.tsv來當valdata

我要用的column是除了gene_name	chr	gene_start	gene_end	TSS_start	TSS_end	strand	gex以外的所有columns，並且預測的目標是gex，並且幫我把gex改成rank based，並且這個r我想要

In [36]:
df = pd.read_csv("../preprocessed_data/CAGE-merged/X2_features.tsv", sep="\t")
summary_df = save_feature_summary(df, "../preprocessed_data/CAGE-merged/X2_features.tsv")

🧮 Generating feature summary (Spearman correlation)...
✅ Feature summary saved → ../preprocessed_data/CAGE-merged/X2_features_feature_summary.tsv
📊 Total features analyzed: 460
