# Merged train+val and merged info and y

# Add Marks z-score stat

In [18]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm

# === 基本設定 ===
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
merged_base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. raw_bigwig"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
tss_window = 5000  # +/- 1kb 區域作為 promoter 區域
cells = [
    "X1",
    "X2",
    "X3",
]
# ============================================================
# 統一 gene reference
# ============================================================
ref_path = '../preprocessed_data/reference/reference_gene_table.tsv'
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"📖 Loaded reference table: {ref_path} ({len(ref_genes)} genes)")

# === 載入 global normalization 統計 ===
with open(stats_path, "r") as f:
    stats = json.load(f)

# === 輔助函式 ===
import numpy as np
from scipy.stats import kurtosis, skew
from numpy.fft import fft


import numpy as np
from scipy.stats import kurtosis, skew


def region_zsignal(bw, chrom, start, end, global_mean, global_std, mark_name=None, cell_name=None):
    """
    取出 bigWig 區域的多層級 z-score 特徵：
    1️⃣ Mean/Std/Min/Max/Diff
    2️⃣ Signal Gradient / Derivative Features
    3️⃣ Shape-based Descriptors (Kurtosis, Skewness)
    4️⃣ Entropy / Complexity
    5️⃣ Spatial Autocorrelation / Smoothness
    ⚙️ 所有 NaN / inf / 無效區域均會設為 0.0，保證輸出穩定。
    """

    feature_keys = [
        "z_mean", "z_std", "z_min", "z_max", "z_diff",
        "gradient_mean", "slope", "kurtosis", "skewness",
        "entropy", "autocorr", "laplacian"
    ]

    chroms = bw.chroms()

    # === 1️⃣ 染色體不存在 ===
    if chrom not in chroms:
        print(f"⚠️ [region_zsignal] Missing chromosome {chrom} in {mark_name or '?'} ({cell_name or '?'}) → fill 0.0")
        return {k: 0.0 for k in feature_keys}

    chrom_length = chroms[chrom]
    start = max(0, int(start))
    end = min(int(end), chrom_length)

    # === 2️⃣ 無效區域 ===
    if end <= start:
        print(f"⚠️ [region_zsignal] Invalid region {chrom}:{start}-{end} (end <= start) in {mark_name or '?'} → fill 0.0")
        return {k: 0.0 for k in feature_keys}

    # === 3️⃣ 取值 ===
    vals = np.array(bw.values(chrom, start, end, numpy=True))
    if vals is None or len(vals) == 0:
        print(f"⚠️ [region_zsignal] Empty values for {chrom}:{start}-{end} in {mark_name or '?'} → fill 0.0")
        return {k: 0.0 for k in feature_keys}

    vals = vals[~np.isnan(vals)]
    if len(vals) == 0:
        print(f"⚠️ [region_zsignal] All NaN values for {chrom}:{start}-{end} in {mark_name or '?'} → fill 0.0")
        return {k: 0.0 for k in feature_keys}

    # === 4️⃣ 計算統計量 ===
    local_mean = np.mean(vals)
    local_std  = np.std(vals)
    local_min  = np.min(vals)
    local_max  = np.max(vals)
    local_diff = local_max - local_min
    z_mean = local_mean
    z_std  = local_std 
    z_min  = local_min
    z_max  = local_max
    z_diff = local_diff

    # === Z-score normalization ===
    # z_mean = (local_mean - global_mean) / (global_std + 1e-8)
    # z_std  = local_std / (global_std + 1e-8)
    # z_min  = (local_min - global_mean) / (global_std + 1e-8)
    # z_max  = (local_max - global_mean) / (global_std + 1e-8)
    # z_diff = z_max - z_min

    # === 5️⃣ 處理 inf/nan 結果 ===
    for name in ["z_mean", "z_std", "z_min", "z_max", "z_diff"]:
        val = locals()[name]
        if not np.isfinite(val):
            print(f"⚠️ [region_zsignal] {name} not finite ({val}) for {chrom}:{start}-{end} in {mark_name or '?'} → set to 0")
            locals()[name] = 0.0

    # === 6️⃣ Signal Gradient / Slope ===
    if len(vals) > 1:
        diffs = np.diff(vals)
        gradient_mean = np.mean(np.abs(diffs)) if len(diffs) > 0 else 0.0
        x = np.arange(len(vals))
        try:
            slope = np.polyfit(x, vals, 1)[0]
        except Exception:
            slope = 0.0
    else:
        gradient_mean, slope = 0.0, 0.0

    # === 7️⃣ Shape-based Descriptors ===
    sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
    asymmetry = skew(vals) if len(vals) > 3 else 0.0

    # === 8️⃣ Entropy / Complexity ===
    p = np.abs(vals)
    if p.sum() == 0:
        local_entropy = 0.0
    else:
        p = p / (p.sum() + 1e-8)
        local_entropy = -np.sum(p * np.log2(p + 1e-8))

    # === 9️⃣ Spatial Autocorrelation / Smoothness ===
    if len(vals) > 2:
        try:
            autocorr = np.corrcoef(vals[:-1], vals[1:])[0, 1]
        except Exception:
            autocorr = 0.0
    else:
        autocorr = 0.0

    if len(vals) > 3:
        laplacian = np.mean(np.abs(vals[:-2] - 2 * vals[1:-1] + vals[2:]))
    else:
        laplacian = 0.0

    # === 10️⃣ 確保所有結果有限 ===
    result = {
        "z_mean": z_mean,
        "z_std": z_std,
        "z_min": z_min,
        "z_max": z_max,
        "z_diff": z_diff,
        "gradient_mean": gradient_mean,
        "slope": slope,
        "kurtosis": sharpness,
        "skewness": asymmetry,
        "entropy": local_entropy,
        "autocorr": autocorr,
        "laplacian": laplacian,
    }

    for k, v in result.items():
        if not np.isfinite(v):
            result[k] = 0.0

    return result


def get_tss_region(row, window=1000):
    """根據 strand (+/-) 給出 promoter (TSS ± window) 區域"""
    if row["strand"] == "+":
        start = max(0, row["TSS_start"] - window)
        end = row["TSS_end"] + window
    else:  # strand == "-"
        start = max(0, row["TSS_start"] - window)
        end = row["TSS_end"] + window
    return start, end


# === 主迴圈：對每個 cell line 做 ===
# === 主迴圈：對每個 cell line 做 ===
for cell in cells:


    print(f"\n🧬 Processing cell line: {cell}")
    genes = pd.read_csv(ref_path, sep="\t")


    # === 對每個 mark 做特徵提取 ===
    for mark in marks:
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"⚠️ Missing {mark} ({cell})")
            continue

        print(f"\n📂 Reading {mark} ({cell}) ...")
        bw = pyBigWig.open(bw_path)
        chroms = bw.chroms()

        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"⚠️ No global stat found for {key}")
            continue

        global_mean = stats[key]["mean"]
        global_std = stats[key]["std"]

        # === 建立空字典儲存所有 feature ===
        gene_features = {f"{mark}_gene_{k}": [] for k in [
            "z_mean", "z_std", "z_min", "z_max", "z_diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]}
        tss_features = {f"{mark}_tss_{k}": [] for k in [
            "z_mean", "z_std", "z_min", "z_max", "z_diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]}

        # === 主迴圈：對每個基因計算 ===
        for _, row in tqdm(genes.iterrows(), total=len(genes)):
            chrom = row["chr"]
            if chrom not in chroms:
                for d in (gene_features, tss_features):
                    for k in d.keys():
                        d[k].append(np.nan)
                continue

            # === Gene body ===
            g_stats = region_zsignal(
                bw, chrom, row["gene_start"], row["gene_end"],
                global_mean, global_std,
                mark_name=mark, cell_name=cell
            )
            for k, v in g_stats.items():
                gene_features[f"{mark}_gene_{k}"].append(v)

            # === TSS / promoter ===
            tss_start, tss_end = get_tss_region(row, window=tss_window)
            t_stats = region_zsignal(
                bw, chrom, tss_start, tss_end,
                global_mean, global_std,
                mark_name=mark, cell_name=cell
            )
            for k, v in t_stats.items():
                tss_features[f"{mark}_tss_{k}"].append(v)

        bw.close()

        # === 合併進 DataFrame ===
        for feat_dict in (gene_features, tss_features):
            for col, vals in feat_dict.items():
                genes[col] = vals

    # === 輸出結果 ===
    out_path = os.path.join(merged_base, f"{cell}_zscore_dynamics.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)
    print(f"\n✅ Saved dynamic z-score features for {cell} → {out_path}")


📖 Loaded reference table: ../preprocessed_data/reference/reference_gene_table.tsv (18268 genes)

🧬 Processing cell line: X1

📂 Reading DNase (X1) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:23<00:00, 126.89it/s]



📂 Reading H3K27ac (X1) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 27%|██▋       | 4970/18268 [00:45<02:54, 76.37it/s] 

⚠️ [region_zsignal] All NaN values for chr10:133623895-133626795 in H3K27ac → fill 0.0


 52%|█████▏    | 9570/18268 [01:24<00:31, 278.03it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K27ac → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K27ac → fill 0.0


100%|██████████| 18268/18268 [02:39<00:00, 114.56it/s]



📂 Reading H3K4me3 (X1) ...


  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:37<00:00, 116.11it/s]



📂 Reading H3K27me3 (X1) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3961/18268 [00:35<01:50, 129.15it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K27me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr4:190168773-190178823 in H3K27me3 → fill 0.0


 52%|█████▏    | 9569/18268 [01:22<00:30, 282.78it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K27me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K27me3 → fill 0.0


100%|██████████| 18268/18268 [02:32<00:00, 119.78it/s]



📂 Reading H3K36me3 (X1) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3946/18268 [00:35<01:50, 130.07it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K36me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr4:190168773-190178823 in H3K36me3 → fill 0.0


 52%|█████▏    | 9544/18268 [01:22<00:32, 268.94it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K36me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K36me3 → fill 0.0


100%|██████████| 18268/18268 [02:35<00:00, 117.66it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X1) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3962/18268 [00:36<01:50, 129.88it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K4me1 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr4:190168773-190178823 in H3K4me1 → fill 0.0


100%|██████████| 18268/18268 [02:34<00:00, 117.94it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X1) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3949/18268 [00:36<01:50, 129.81it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K9me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr4:190168773-190178823 in H3K9me3 → fill 0.0


 52%|█████▏    | 9544/18268 [01:23<00:32, 272.44it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K9me3 → fill 0.0


100%|██████████| 18268/18268 [02:39<00:00, 114.81it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved dynamic z-score features for X1 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. raw_bigwig/X1_zscore_dynamics.tsv

🧬 Processing cell line: X2

📂 Reading DNase (X2) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:52<00:00, 106.14it/s]



📂 Reading H3K27ac (X2) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 52%|█████▏    | 9544/18268 [01:35<00:32, 265.73it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K27ac → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K27ac → fill 0.0


100%|██████████| 18268/18268 [02:47<00:00, 108.97it/s]



📂 Reading H3K4me3 (X2) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 52%|█████▏    | 9560/18268 [01:31<00:30, 289.05it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K4me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K4me3 → fill 0.0


100%|██████████| 18268/18268 [03:08<00:00, 96.90it/s] 



📂 Reading H3K27me3 (X2) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3946/18268 [00:46<01:50, 129.11it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K27me3 → fill 0.0


 52%|█████▏    | 9548/18268 [01:41<00:37, 230.45it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K27me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K27me3 → fill 0.0


100%|██████████| 18268/18268 [03:08<00:00, 96.72it/s] 



📂 Reading H3K36me3 (X2) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3945/18268 [00:46<02:21, 100.94it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K36me3 → fill 0.0


 52%|█████▏    | 9539/18268 [01:53<00:38, 226.37it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K36me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K36me3 → fill 0.0


  c /= stddev[None, :]
100%|██████████| 18268/18268 [03:38<00:00, 83.71it/s] 
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X2) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 52%|█████▏    | 9550/18268 [01:56<00:36, 236.29it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K4me1 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K4me1 → fill 0.0


100%|██████████| 18268/18268 [03:34<00:00, 85.35it/s] 
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X2) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 52%|█████▏    | 9538/18268 [01:55<00:44, 197.81it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K9me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K9me3 → fill 0.0


100%|██████████| 18268/18268 [03:26<00:00, 88.48it/s] 
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved dynamic z-score features for X2 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. raw_bigwig/X2_zscore_dynamics.tsv

🧬 Processing cell line: X3

📂 Reading DNase (X3) ...


  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:47<00:00, 109.00it/s]



📂 Reading H3K27ac (X3) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3951/18268 [00:38<02:20, 102.12it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K27ac → fill 0.0
⚠️ [region_zsignal] All NaN values for chr4:190168773-190178823 in H3K27ac → fill 0.0


 27%|██▋       | 4968/18268 [00:49<02:02, 108.18it/s]

⚠️ [region_zsignal] All NaN values for chr10:133623895-133626795 in H3K27ac → fill 0.0
⚠️ [region_zsignal] All NaN values for chr10:133621745-133631795 in H3K27ac → fill 0.0


 52%|█████▏    | 9557/18268 [01:31<00:34, 252.41it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K27ac → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K27ac → fill 0.0


100%|██████████| 18268/18268 [02:50<00:00, 107.08it/s]



📂 Reading H3K4me3 (X3) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3949/18268 [00:35<01:47, 132.73it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K4me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr4:190168773-190178823 in H3K4me3 → fill 0.0


 52%|█████▏    | 9563/18268 [01:20<00:32, 266.69it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K4me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K4me3 → fill 0.0


100%|██████████| 18268/18268 [02:33<00:00, 118.73it/s]



📂 Reading H3K27me3 (X3) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3949/18268 [00:38<01:54, 125.43it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K27me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr4:190168773-190178823 in H3K27me3 → fill 0.0


 52%|█████▏    | 9569/18268 [01:31<00:32, 269.47it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K27me3 → fill 0.0


 62%|██████▏   | 11287/18268 [01:48<00:56, 122.60it/s]

⚠️ [region_zsignal] All NaN values for chr22:50767501-50783667 in H3K27me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr22:50777264-50787314 in H3K27me3 → fill 0.0


100%|██████████| 18268/18268 [02:59<00:00, 101.80it/s]



📂 Reading H3K36me3 (X3) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3944/18268 [00:55<01:57, 122.21it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K36me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr4:190168773-190178823 in H3K36me3 → fill 0.0


 52%|█████▏    | 9523/18268 [01:57<00:35, 248.69it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K36me3 → fill 0.0


  c /= stddev[None, :]
100%|██████████| 18268/18268 [03:20<00:00, 91.06it/s] 
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X3) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3960/18268 [00:42<01:59, 119.55it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K4me1 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr4:190168773-190178823 in H3K4me1 → fill 0.0


 52%|█████▏    | 9543/18268 [01:35<00:41, 210.40it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K4me1 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K4me1 → fill 0.0


100%|██████████| 18268/18268 [03:04<00:00, 98.87it/s] 
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X3) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 22%|██▏       | 3957/18268 [00:43<02:02, 116.97it/s]

⚠️ [region_zsignal] All NaN values for chr4:190173774-190185942 in H3K9me3 → fill 0.0
⚠️ [region_zsignal] All NaN values for chr4:190168773-190178823 in H3K9me3 → fill 0.0


 52%|█████▏    | 9561/18268 [01:33<00:37, 230.39it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K9me3 → fill 0.0


100%|██████████| 18268/18268 [02:54<00:00, 104.68it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved dynamic z-score features for X3 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. raw_bigwig/X3_zscore_dynamics.tsv


In [23]:
import os
import pandas as pd
import glob
import re

# ============================================================
#                  基本設定
# ============================================================
base_dir = "../preprocessed_data/reference/0. raw_bigwig"
files = sorted(glob.glob(os.path.join(base_dir, "X*_zscore_dynamics.tsv")))

print(f"📂 Found {len(files)} files:")
for f in files:
    print(f"  - {os.path.basename(f)}")

# ============================================================
#                  欄位重新命名函式
# ============================================================
def rename_columns(df):
    rename_map = {}
    for col in df.columns:
        new_col = col
        new_col = re.sub(r"_z_mean$", "_mean", new_col)
        new_col = re.sub(r"_z_std$", "_std", new_col)
        new_col = re.sub(r"_z_min$", "_min", new_col)
        new_col = re.sub(r"_z_max$", "_max", new_col)
        new_col = re.sub(r"_z_diff$", "_diff", new_col)
        rename_map[col] = new_col
    return df.rename(columns=rename_map)

# ============================================================
#                  主流程
# ============================================================
mark_min_results = {}
summary_rows = []

for f in files:
    cell = os.path.basename(f).split("_")[0]
    print(f"\n🔧 Processing {cell} ...")

    df = pd.read_csv(f, sep="\t")
    df = rename_columns(df)

    # 儲存回原地
    df.to_csv(f, sep="\t", index=False)
    print(f"💾 Saved updated file: {f}")
    print(f"🧩 Columns: {len(df.columns)} total")
    
    # 檢查是否有任何 *_tss_min / *_gene_min 欄位
    tss_cols = [c for c in df.columns if "_tss_min" in c]
    gene_cols = [c for c in df.columns if "_gene_min" in c]

    if not tss_cols or not gene_cols:
        print(f"⚠️ No '_tss_min' or '_gene_min' columns found in {cell}. Skipping.")
        continue

    # === 找出每個 mark ===
    mark_cols = sorted({col.split("_")[0] for col in df.columns if "_tss_min" in col or "_gene_min" in col})
    print(f"✅ Marks found: {mark_cols}")

    for mark in mark_cols:
        tss_col = f"{mark}_tss_min"
        gene_col = f"{mark}_gene_min"

        if tss_col not in df.columns or gene_col not in df.columns:
            print(f"⚠️ Missing {tss_col} or {gene_col}")
            continue

        min_tss = df[tss_col].min(skipna=True)
        min_gene = df[gene_col].min(skipna=True)

        summary_rows.append({
            "mark": mark,
            "cell": cell,
            "tss_min": min_tss,
            "gene_min": min_gene
        })

# ============================================================
#                  結果輸出
# ============================================================
if summary_rows:
    summary_df = pd.DataFrame(summary_rows).sort_values(["mark", "cell"]).reset_index(drop=True)
    print("\n🧭 Summary of minimum values per mark:")
    print(summary_df)

    summary_path = os.path.join(base_dir, "summary_min_values.tsv")
    summary_df.to_csv(summary_path, sep="\t", index=False)
    print(f"\n✅ Summary saved to: {summary_path}")
else:
    print("\n❌ No marks found with '_tss_min' and '_gene_min' columns. Please check column names in your TSV files.")


📂 Found 3 files:
  - X1_zscore_dynamics.tsv
  - X2_zscore_dynamics.tsv
  - X3_zscore_dynamics.tsv

🔧 Processing X1 ...
💾 Saved updated file: ../preprocessed_data/reference/0. raw_bigwig/X1_zscore_dynamics.tsv
🧩 Columns: 175 total
✅ Marks found: ['DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3']

🔧 Processing X2 ...
💾 Saved updated file: ../preprocessed_data/reference/0. raw_bigwig/X2_zscore_dynamics.tsv
🧩 Columns: 175 total
✅ Marks found: ['DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3']

🔧 Processing X3 ...
💾 Saved updated file: ../preprocessed_data/reference/0. raw_bigwig/X3_zscore_dynamics.tsv
🧩 Columns: 175 total
✅ Marks found: ['DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3']

🧭 Summary of minimum values per mark:
        mark cell   tss_min  gene_min
0      DNase   X1  0.024102  0.024102
1      DNase   X2  0.003446  0.003446
2      DNase   X3  0.000000  0.000000
3    H3K27ac   X1  0.000000  0.00000