# Feature engineering

In [None]:
import os
import pandas as pd

# ============================================================
#                  Basic configuration
# ============================================================
base_dir = "../preprocessed_data/reference/0. data"
# NOTE(review): "bed5250" does not follow the "...250" naming used by bw_dir
# and output_dir below — confirm this directory name is intended.
bed_dir = os.path.join(base_dir, "bed5250")
bed_norm_suffix = 'logzscore'  # or zscore, raw


bw_dir = os.path.join(base_dir, "bigwig250")
bw_norm_suffix = 'logzscore'  # normalization suffix of the bigwig files

output_dir = "../preprocessed_data/reference/1. merged data/without_y_250/"  # merged tables are written here
os.makedirs(output_dir, exist_ok=True)

cells = ["X1", "X2", "X3"]

# Columns removed from the bed table before merging
drop_cols = ["chr", "gene_start", "gene_end", "TSS_start", "TSS_end", "strand"]


def merge_bed_bigwig(df_bed, df_bw, drop_cols):
    """Outer-join a bed feature table with a bigwig feature table on gene_name.

    Args:
        df_bed: DataFrame with a "gene_name" column; columns listed in
            ``drop_cols`` are removed from it before the join.
        df_bw: DataFrame with a "gene_name" column; joined as-is.
        drop_cols: Column names to remove from ``df_bed``. Names that are not
            present are ignored.

    Returns:
        A new DataFrame containing every gene_name found in either input.
        Neither input frame is modified.

    Raises:
        pandas.errors.MergeError: If gene_name is duplicated in either table.
            Without this guard a duplicated key would silently multiply rows.
    """
    bed_features = df_bed.drop(columns=drop_cols, errors="ignore")
    return pd.merge(
        bed_features,
        df_bw,
        on="gene_name",
        how="outer",
        validate="one_to_one",
    )


# ============================================================
#                  Merge per cell line
# ============================================================
for cell in cells:
    bed_path = os.path.join(bed_dir, f"{cell}_{bed_norm_suffix}.tsv")
    bw_path = os.path.join(bw_dir, f"{cell}_{bw_norm_suffix}.tsv")
    out_path = os.path.join(output_dir, f"{cell}_all_{bed_norm_suffix}_{bw_norm_suffix}.tsv")

    # Report the normalization actually configured above rather than a fixed label.
    print(f"\n=== 🧬 {cell}: merging bed ({bed_norm_suffix}) + bigwig ({bw_norm_suffix}) ===")

    # A cell line with a missing input is skipped, not treated as an error.
    if not os.path.exists(bed_path):
        print(f"⚠️ Missing bed file: {bed_path}")
        continue
    if not os.path.exists(bw_path):
        print(f"⚠️ Missing bigwig file: {bw_path}")
        continue

    # --- Load ---
    df_bed = pd.read_csv(bed_path, sep="\t")
    df_bw = pd.read_csv(bw_path, sep="\t")

    if "gene_name" not in df_bed.columns or "gene_name" not in df_bw.columns:
        raise ValueError(f"❌ Missing 'gene_name' column in {cell}")

    print(f"📘 Loaded bed={df_bed.shape}, bigwig={df_bw.shape}")

    # --- Drop unwanted bed columns and merge ---
    df_merged = merge_bed_bigwig(df_bed, df_bw, drop_cols)
    print(f"✅ Merged shape: {df_merged.shape}")

    # --- Save ---
    df_merged.to_csv(out_path, sep="\t", index=False)
    print(f"💾 Saved → {out_path}")


=== 🧬 X1: merging quantile-normalized bed + bigwig ===
📘 Loaded bed=(18268, 511), bigwig=(18268, 175)
✅ Merged shape: (18268, 679)
💾 Saved → ../preprocessed_data/reference/1. merged data/without_y_5000_one_side/X1_all_logzscore_logzscore.tsv

=== 🧬 X2: merging quantile-normalized bed + bigwig ===
📘 Loaded bed=(18268, 511), bigwig=(18268, 175)
✅ Merged shape: (18268, 679)
💾 Saved → ../preprocessed_data/reference/1. merged data/without_y_5000_one_side/X2_all_logzscore_logzscore.tsv

=== 🧬 X3: merging quantile-normalized bed + bigwig ===
⚠️ Missing bigwig file: ../preprocessed_data/reference/0. data/bigwig5000_one_side/X3_logzscore.tsv


In [None]:
import os
import pandas as pd

# ============================================================
#                  Basic configuration
# ============================================================
y_dir = "../preprocessed_data/reference/0. data/"
features_dir = "../preprocessed_data/reference/1. merged data/without_y_250"
output_dir = "../preprocessed_data/reference/1. merged data/with_y_250"
os.makedirs(output_dir, exist_ok=True)

cells = ["X1", "X2"]

# Set these two suffixes to match the names of the merged feature files
bed_norm_suffix = "logzscore"
bw_norm_suffix = "logzscore"


def attach_features_to_y(df_y, df_features):
    """Left-join feature columns onto the label table by gene_name.

    Args:
        df_y: Label table with a "gene_name" column; its rows and their order
            define the result.
        df_features: Feature table with a "gene_name" column.

    Returns:
        A tuple ``(merged, n_missing)``. ``merged`` has one row per row of
        ``df_y``; ``n_missing`` is the number of ``df_y`` rows whose gene_name
        has no row in ``df_features`` (their feature columns are NaN).

    Raises:
        pandas.errors.MergeError: If gene_name is duplicated in
            ``df_features``. Without this guard a duplicated key would
            silently duplicate label rows.
    """
    has_features = df_y["gene_name"].isin(df_features["gene_name"])
    n_missing = int((~has_features).sum())
    merged = pd.merge(
        df_y,
        df_features,
        on="gene_name",
        how="left",
        validate="many_to_one",
    )
    return merged, n_missing


# ============================================================
#                  Main flow
# ============================================================
for cell in cells:
    y_path = os.path.join(y_dir, f"{cell}_y.tsv")
    # The "qn" names are kept for compatibility; the file holds the merged
    # features named by the two suffixes above.
    qn_path = os.path.join(features_dir, f"{cell}_all_{bed_norm_suffix}_{bw_norm_suffix}.tsv")
    out_path = os.path.join(output_dir, f"{cell}_all_{bed_norm_suffix}_{bw_norm_suffix}_with_y.tsv")

    # This cell only merges; it performs no ranking or normalization.
    print(f"\n=== 🧬 {cell}: merging y with {bed_norm_suffix}/{bw_norm_suffix} features ===")

    # A cell line with a missing input is skipped, not treated as an error.
    if not os.path.exists(y_path):
        print(f"⚠️ Missing Y file: {y_path}")
        continue
    if not os.path.exists(qn_path):
        print(f"⚠️ Missing features file: {qn_path}")
        continue

    # --- Load ---
    df_y = pd.read_csv(y_path, sep="\t")
    df_qn = pd.read_csv(qn_path, sep="\t")

    if "gene_name" not in df_y.columns or "gene_name" not in df_qn.columns:
        raise ValueError(f"❌ Missing 'gene_name' column in {cell}")
    if "gex" not in df_y.columns:
        raise ValueError(f"❌ Missing 'gex' column in {cell}_y.tsv")

    print(f"📘 Loaded y={df_y.shape}, qn={df_qn.shape}")

    # --- Merge, keeping the rows of df_y ---
    df_merged, n_missing = attach_features_to_y(df_y, df_qn)
    if n_missing:
        print(f"⚠️ {n_missing} of {len(df_y)} genes in y have no feature row; their feature columns will be NaN")

    print(f"✅ Merged shape: {df_merged.shape}")

    # --- Save ---
    df_merged.to_csv(out_path, sep="\t", index=False)
    print(f"💾 Saved → {out_path}")



=== 🧬 X1: dense normalized rank (0–1) + merge with QN ===
📘 Loaded y=(16284, 3), qn=(18268, 679)
✅ Merged shape: (16284, 681)
💾 Saved → ../preprocessed_data/reference/1. merged data/with_y_5000_one_side/X1_all_logzscore_logzscore_with_y.tsv

=== 🧬 X2: dense normalized rank (0–1) + merge with QN ===
📘 Loaded y=(16284, 3), qn=(18268, 679)
✅ Merged shape: (16284, 681)
💾 Saved → ../preprocessed_data/reference/1. merged data/with_y_5000_one_side/X2_all_logzscore_logzscore_with_y.tsv


In [2]:
import os
import pandas as pd

# ============================================================
#                  Basic configuration
# ============================================================
y_dir = "../preprocessed_data/reference/0. data/"
features_dir = "../preprocessed_data/reference/0. data/bigwig500_one_side/"
output_dir = "../preprocessed_data/reference/1. merged data/with_y_500_only_bi_one_side"
os.makedirs(output_dir, exist_ok=True)

cells = ["X1", "X2"]

# Set this suffix to match the names of the feature files
norm_suffix = "logzscore"


def attach_features_to_y(df_y, df_features):
    """Left-join feature columns onto the label table by gene_name.

    Args:
        df_y: Label table with a "gene_name" column; its rows and their order
            define the result.
        df_features: Feature table with a "gene_name" column.

    Returns:
        A tuple ``(merged, n_missing)``. ``merged`` has one row per row of
        ``df_y``; ``n_missing`` is the number of ``df_y`` rows whose gene_name
        has no row in ``df_features`` (their feature columns are NaN).

    Raises:
        pandas.errors.MergeError: If gene_name is duplicated in
            ``df_features``. Without this guard a duplicated key would
            silently duplicate label rows.
    """
    has_features = df_y["gene_name"].isin(df_features["gene_name"])
    n_missing = int((~has_features).sum())
    merged = pd.merge(
        df_y,
        df_features,
        on="gene_name",
        how="left",
        validate="many_to_one",
    )
    return merged, n_missing


# ============================================================
#                  Main flow
# ============================================================
for cell in cells:
    y_path = os.path.join(y_dir, f"{cell}_y.tsv")
    # The "qn" names are kept for compatibility; the file holds the features
    # named by norm_suffix above.
    qn_path = os.path.join(features_dir, f"{cell}_{norm_suffix}.tsv")
    out_path = os.path.join(output_dir, f"{cell}_{norm_suffix}_with_y.tsv")

    # This cell only merges; it performs no ranking or normalization.
    print(f"\n=== 🧬 {cell}: merging y with {norm_suffix} features ===")

    # A cell line with a missing input is skipped, not treated as an error.
    if not os.path.exists(y_path):
        print(f"⚠️ Missing Y file: {y_path}")
        continue
    if not os.path.exists(qn_path):
        print(f"⚠️ Missing features file: {qn_path}")
        continue

    # --- Load ---
    df_y = pd.read_csv(y_path, sep="\t")
    df_qn = pd.read_csv(qn_path, sep="\t")

    if "gene_name" not in df_y.columns or "gene_name" not in df_qn.columns:
        raise ValueError(f"❌ Missing 'gene_name' column in {cell}")
    if "gex" not in df_y.columns:
        raise ValueError(f"❌ Missing 'gex' column in {cell}_y.tsv")

    print(f"📘 Loaded y={df_y.shape}, qn={df_qn.shape}")

    # --- Merge, keeping the rows of df_y ---
    df_merged, n_missing = attach_features_to_y(df_y, df_qn)
    if n_missing:
        print(f"⚠️ {n_missing} of {len(df_y)} genes in y have no feature row; their feature columns will be NaN")

    print(f"✅ Merged shape: {df_merged.shape}")

    # --- Save ---
    df_merged.to_csv(out_path, sep="\t", index=False)
    print(f"💾 Saved → {out_path}")



=== 🧬 X1: dense normalized rank (0–1) + merge with QN ===
📘 Loaded y=(16284, 3), qn=(18268, 175)
✅ Merged shape: (16284, 177)
💾 Saved → ../preprocessed_data/reference/1. merged data/with_y_500_only_bi_one_side/X1_logzscore_with_y.tsv

=== 🧬 X2: dense normalized rank (0–1) + merge with QN ===
📘 Loaded y=(16284, 3), qn=(18268, 175)
✅ Merged shape: (16284, 177)
💾 Saved → ../preprocessed_data/reference/1. merged data/with_y_500_only_bi_one_side/X2_logzscore_with_y.tsv
