# Merged train+val and merged info and y

# Add Marks z-score stat

In [1]:

# === 輔助函式 ===
import numpy as np
from scipy.stats import kurtosis, skew
from numpy.fft import fft


# ============================================================
# ⚙️ 通用 Normalization Function
# ============================================================
def normalize_signal(vals, global_mean=None, global_std=None, mode="none"):
    """
    多模式 normalization 支援：
      - "none": 不做任何處理
      - "zscore": (x - μ) / σ
      - "log_zscore": ((log1p(x)) - μ_log) / σ_log
      - "log_only": 只取 log1p，不標準化
    """
    if mode == "none":
        return vals

    elif mode == "zscore":
        if global_mean is None or global_std is None:
            raise ValueError("Missing global_mean/global_std for zscore normalization.")
        return (vals - global_mean) / (global_std + 1e-8)

    elif mode == "log_zscore":
        if global_mean is None or global_std is None:
            raise ValueError("Missing global_mean/global_std for log_zscore normalization.")
        vals = np.log1p(np.clip(vals, a_min=0, a_max=None))
        return (vals - global_mean) / (global_std + 1e-8)

    elif mode == "log_only":
        return np.log1p(np.clip(vals, a_min=0, a_max=None))

    else:
        raise ValueError(f"Unknown normalization mode: {mode}")


# ============================================================
# 🧬 region_zsignal (with normalization selection)
# ============================================================
def region_zsignal(bw, chrom, start, end, global_mean=None, global_std=None,
                   mark_name=None, cell_name=None, norm_mode="none"):
    """
    Extracts region-level features with multiple normalization modes.
    """
    feature_keys = [
        "mean", "std", "min", "max", "diff",
        "gradient_mean", "slope", "kurtosis", "skewness",
        "entropy", "autocorr", "laplacian"
    ]

    chroms = bw.chroms()
    if chrom not in chroms:
        return {k: 0.0 for k in feature_keys}

    chrom_length = chroms[chrom]
    start, end = max(0, int(start)), min(int(end), chrom_length)
    if end <= start:
        return {k: 0.0 for k in feature_keys}

    vals = np.array(bw.values(chrom, start, end, numpy=True))
    vals = vals[~np.isnan(vals)]
    if len(vals) == 0:
        return {k: 0.0 for k in feature_keys}

    # === 🔧 Apply normalization (可動態切換) ===
    vals_norm = normalize_signal(vals, global_mean, global_std, mode=norm_mode)

    # === Feature Extraction ===
    local_mean = np.mean(vals_norm)
    local_std  = np.std(vals_norm)
    local_min  = np.min(vals_norm)
    local_max  = np.max(vals_norm)
    local_diff = local_max - local_min

    # Gradient / slope
    if len(vals_norm) > 1:
        diffs = np.diff(vals_norm)
        gradient_mean = np.mean(np.abs(diffs))
        try:
            slope = np.polyfit(np.arange(len(vals_norm)), vals_norm, 1)[0]
        except Exception:
            slope = 0.0
    else:
        gradient_mean, slope = 0.0, 0.0

    # Shape-based descriptors
    sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
    asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0

    # Entropy
    p = np.abs(vals_norm)
    p_sum = np.sum(p)
    local_entropy = -np.sum((p / (p_sum + 1e-8)) * np.log2(p / (p_sum + 1e-8))) if p_sum > 0 else 0.0

    # Autocorrelation & Laplacian
    autocorr = np.corrcoef(vals_norm[:-1], vals_norm[1:])[0, 1] if len(vals_norm) > 2 else 0.0
    laplacian = np.mean(np.abs(vals_norm[:-2] - 2 * vals_norm[1:-1] + vals_norm[2:])) if len(vals_norm) > 3 else 0.0

    result = {
        "mean": local_mean,
        "std": local_std,
        "min": local_min,
        "max": local_max,
        "diff": local_diff,
        "gradient_mean": gradient_mean,
        "slope": slope,
        "kurtosis": sharpness,
        "skewness": asymmetry,
        "entropy": local_entropy,
        "autocorr": autocorr,
        "laplacian": laplacian,
    }

    # 保險處理 nan/inf
    for k, v in result.items():
        if not np.isfinite(v):
            result[k] = 0.0

    return result


def get_tss_region(row, window=1000):
    """根據 strand (+/-) 給出 promoter (TSS ± window) 區域"""
    if row["strand"] == "+":
        start = max(0, row["TSS_start"] - window)
        end = row["TSS_end"] + window
    else:  # strand == "-"
        start = max(0, row["TSS_start"] - window)
        end = row["TSS_end"] + window
    return start, end

In [None]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm

# === 基本設定 ===
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
output_dir = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig250/"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
cells = ["X1","X2","X3"]

tss_window = 250  # +/- 5kb
ref_path = "../preprocessed_data/reference/reference_gene_table.tsv"

# === normalization 模式可選 ===
# 可選 "none", "zscore", "log_zscore", "log_only"
norm_mode = "log_zscore"

# ============================================================
# 載入 reference & global stats
# ============================================================
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"📖 Loaded reference: {len(ref_genes)} genes")

with open(stats_path, "r") as f:
    stats = json.load(f)




# ============================================================
# 主迴圈
# ============================================================
for cell in cells:
    print(f"\n🧬 Processing cell line: {cell}")
    genes = ref_genes.copy()

    for mark in marks:
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"⚠️ Missing {mark} ({cell})")
            continue

        print(f"\n📂 Reading {mark} ({cell}) ...")
        bw = pyBigWig.open(bw_path)
        chroms = bw.chroms()

        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"⚠️ No global stat found for {key}")
            continue

        # --- 根據 norm_mode 選取 global mean/std ---
        if norm_mode == "log_zscore":
            global_mean = stats[key]["log1p"]["mean"]
            global_std = stats[key]["log1p"]["std"]
        elif norm_mode == "zscore":
            global_mean = stats[key]["linear"]["mean"]
            global_std = stats[key]["linear"]["std"]
        else:
            global_mean = global_std = None

        # === 動態命名 prefix ===
        prefix = {
            "none": "raw",
            "zscore": "z",
            "log_zscore": "logz",
            "log_only": "log"
        }[norm_mode]

        # === 建立空字典儲存所有 feature ===
        base_keys = [
            "mean", "std", "min", "max", "diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]


        # --- 對 mean/std/min/max/diff 加上 prefix ---
        prefixed_keys = [f"{prefix}_{k}" for k in base_keys[:5]]
        other_keys = base_keys[5:]
        feat_keys = prefixed_keys + other_keys

        # === 建立空字典儲存 features ===
        gene_features = {f"{mark}_gene_{k}": [] for k in feat_keys}
        tss_features  = {f"{mark}_tss_{k}": [] for k in feat_keys}


        # === 主迴圈 ===
        for _, row in tqdm(genes.iterrows(), total=len(genes)):
            chrom = row["chr"]
            if chrom not in chroms:
                for d in (gene_features, tss_features):
                    for k in d.keys():
                        d[k].append(np.nan)
                continue

            g_stats = region_zsignal(bw, chrom, row["gene_start"], row["gene_end"],
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in g_stats.items():
                target_key = f"{mark}_gene_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_gene_{k}"
                gene_features[target_key].append(v)

            # TSS
            tss_start, tss_end = get_tss_region(row, window=tss_window)
            t_stats = region_zsignal(bw, chrom, tss_start, tss_end,
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in t_stats.items():
                target_key = f"{mark}_tss_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_tss_{k}"
                tss_features[target_key].append(v)


        bw.close()

        # === 合併 ===
        for feat_dict in (gene_features, tss_features):
            for col, vals in feat_dict.items():
                genes[col] = vals

    # === 儲存結果 ===
    out_suffix = {
        "none": "raw",
        "zscore": "zscore",
        "log_zscore": "logzscore",
        "log_only": "log"
    }[norm_mode]

    out_path = os.path.join(output_dir, f"{cell}_{out_suffix}.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)

    print(f"\n✅ Saved {out_suffix} features for {cell} → {out_path}")


📖 Loaded reference: 18268 genes

🧬 Processing cell line: X1

📂 Reading DNase (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
 44%|████▍     | 8129/18268 [01:06<01:24, 119.63it/s]

In [31]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm

# === 基本設定 ===
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
output_dir = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig500_one_side/"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
cells = ["X1","X2","X3"]

tss_window = 500  # +/- 5kb
ref_path = "../preprocessed_data/reference/reference_gene_table.tsv"

# === normalization 模式可選 ===
# 可選 "none", "zscore", "log_zscore", "log_only"
norm_mode = "log_zscore"

# ============================================================
# 載入 reference & global stats
# ============================================================
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"📖 Loaded reference: {len(ref_genes)} genes")

with open(stats_path, "r") as f:
    stats = json.load(f)




# ============================================================
# 主迴圈
# ============================================================
for cell in cells:
    print(f"\n🧬 Processing cell line: {cell}")
    genes = ref_genes.copy()

    for mark in marks:
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"⚠️ Missing {mark} ({cell})")
            continue

        print(f"\n📂 Reading {mark} ({cell}) ...")
        bw = pyBigWig.open(bw_path)
        chroms = bw.chroms()

        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"⚠️ No global stat found for {key}")
            continue

        # --- 根據 norm_mode 選取 global mean/std ---
        if norm_mode == "log_zscore":
            global_mean = stats[key]["log1p"]["mean"]
            global_std = stats[key]["log1p"]["std"]
        elif norm_mode == "zscore":
            global_mean = stats[key]["linear"]["mean"]
            global_std = stats[key]["linear"]["std"]
        else:
            global_mean = global_std = None

        # === 動態命名 prefix ===
        prefix = {
            "none": "raw",
            "zscore": "z",
            "log_zscore": "logz",
            "log_only": "log"
        }[norm_mode]

        # === 建立空字典儲存所有 feature ===
        base_keys = [
            "mean", "std", "min", "max", "diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]


        # --- 對 mean/std/min/max/diff 加上 prefix ---
        prefixed_keys = [f"{prefix}_{k}" for k in base_keys[:5]]
        other_keys = base_keys[5:]
        feat_keys = prefixed_keys + other_keys

        # === 建立空字典儲存 features ===
        gene_features = {f"{mark}_gene_{k}": [] for k in feat_keys}
        tss_features  = {f"{mark}_tss_{k}": [] for k in feat_keys}


        # === 主迴圈 ===
        for _, row in tqdm(genes.iterrows(), total=len(genes)):
            chrom = row["chr"]
            if chrom not in chroms:
                for d in (gene_features, tss_features):
                    for k in d.keys():
                        d[k].append(np.nan)
                continue

            g_stats = region_zsignal(bw, chrom, row["gene_start"], row["gene_end"],
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in g_stats.items():
                target_key = f"{mark}_gene_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_gene_{k}"
                gene_features[target_key].append(v)

            # TSS
            tss_start, tss_end = get_tss_region(row, window=tss_window)
            t_stats = region_zsignal(bw, chrom, tss_start, tss_end,
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in t_stats.items():
                target_key = f"{mark}_tss_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_tss_{k}"
                tss_features[target_key].append(v)


        bw.close()

        # === 合併 ===
        for feat_dict in (gene_features, tss_features):
            for col, vals in feat_dict.items():
                genes[col] = vals

    # === 儲存結果 ===
    out_suffix = {
        "none": "raw",
        "zscore": "zscore",
        "log_zscore": "logzscore",
        "log_only": "log"
    }[norm_mode]

    out_path = os.path.join(output_dir, f"{cell}_{out_suffix}.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)

    print(f"\n✅ Saved {out_suffix} features for {cell} → {out_path}")


📖 Loaded reference: 18268 genes

🧬 Processing cell line: X1

📂 Reading DNase (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:42<00:00, 178.29it/s]



📂 Reading H3K27ac (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:55<00:00, 157.70it/s]



📂 Reading H3K4me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:00<00:00, 151.49it/s]



📂 Reading H3K27me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:50<00:00, 165.23it/s]



📂 Reading H3K36me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:51<00:00, 163.20it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:52<00:00, 162.24it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:52<00:00, 161.76it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X1 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig500_one_side/X1_logzscore.tsv

🧬 Processing cell line: X2

📂 Reading DNase (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:45<00:00, 173.94it/s]



📂 Reading H3K27ac (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:58<00:00, 154.28it/s]



📂 Reading H3K4me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:02<00:00, 149.03it/s]



📂 Reading H3K27me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:59<00:00, 152.73it/s]



📂 Reading H3K36me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:04<00:00, 147.03it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:08<00:00, 141.88it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:06<00:00, 144.27it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X2 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig500_one_side/X2_logzscore.tsv

🧬 Processing cell line: X3

📂 Reading DNase (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:52<00:00, 161.89it/s]



📂 Reading H3K27ac (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:54<00:00, 159.60it/s]



📂 Reading H3K4me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:54<00:00, 159.57it/s]



📂 Reading H3K27me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:54<00:00, 160.00it/s]



📂 Reading H3K36me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:55<00:00, 157.88it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:58<00:00, 153.84it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:57<00:00, 156.05it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X3 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig500_one_side/X3_logzscore.tsv


In [34]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm

# === 基本設定 ===
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
output_dir = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig5000_one_side/"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
cells = ["X1","X2","X3"]

tss_window = 5000  # +/- 5kb
ref_path = "../preprocessed_data/reference/reference_gene_table.tsv"

# === normalization 模式可選 ===
# 可選 "none", "zscore", "log_zscore", "log_only"
norm_mode = "log_zscore"

# ============================================================
# 載入 reference & global stats
# ============================================================
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"📖 Loaded reference: {len(ref_genes)} genes")

with open(stats_path, "r") as f:
    stats = json.load(f)




# ============================================================
# 主迴圈
# ============================================================
for cell in cells:
    print(f"\n🧬 Processing cell line: {cell}")
    genes = ref_genes.copy()

    for mark in marks:
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"⚠️ Missing {mark} ({cell})")
            continue

        print(f"\n📂 Reading {mark} ({cell}) ...")
        bw = pyBigWig.open(bw_path)
        chroms = bw.chroms()

        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"⚠️ No global stat found for {key}")
            continue

        # --- 根據 norm_mode 選取 global mean/std ---
        if norm_mode == "log_zscore":
            global_mean = stats[key]["log1p"]["mean"]
            global_std = stats[key]["log1p"]["std"]
        elif norm_mode == "zscore":
            global_mean = stats[key]["linear"]["mean"]
            global_std = stats[key]["linear"]["std"]
        else:
            global_mean = global_std = None

        # === 動態命名 prefix ===
        prefix = {
            "none": "raw",
            "zscore": "z",
            "log_zscore": "logz",
            "log_only": "log"
        }[norm_mode]

        # === 建立空字典儲存所有 feature ===
        base_keys = [
            "mean", "std", "min", "max", "diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]


        # --- 對 mean/std/min/max/diff 加上 prefix ---
        prefixed_keys = [f"{prefix}_{k}" for k in base_keys[:5]]
        other_keys = base_keys[5:]
        feat_keys = prefixed_keys + other_keys

        # === 建立空字典儲存 features ===
        gene_features = {f"{mark}_gene_{k}": [] for k in feat_keys}
        tss_features  = {f"{mark}_tss_{k}": [] for k in feat_keys}


        # === 主迴圈 ===
        for _, row in tqdm(genes.iterrows(), total=len(genes)):
            chrom = row["chr"]
            if chrom not in chroms:
                for d in (gene_features, tss_features):
                    for k in d.keys():
                        d[k].append(np.nan)
                continue

            g_stats = region_zsignal(bw, chrom, row["gene_start"], row["gene_end"],
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in g_stats.items():
                target_key = f"{mark}_gene_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_gene_{k}"
                gene_features[target_key].append(v)

            # TSS
            tss_start, tss_end = get_tss_region(row, window=tss_window)
            t_stats = region_zsignal(bw, chrom, tss_start, tss_end,
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in t_stats.items():
                target_key = f"{mark}_tss_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_tss_{k}"
                tss_features[target_key].append(v)


        bw.close()

        # === 合併 ===
        for feat_dict in (gene_features, tss_features):
            for col, vals in feat_dict.items():
                genes[col] = vals

    # === 儲存結果 ===
    out_suffix = {
        "none": "raw",
        "zscore": "zscore",
        "log_zscore": "logzscore",
        "log_only": "log"
    }[norm_mode]

    out_path = os.path.join(output_dir, f"{cell}_{out_suffix}.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)

    print(f"\n✅ Saved {out_suffix} features for {cell} → {out_path}")


📖 Loaded reference: 18268 genes

🧬 Processing cell line: X1

📂 Reading DNase (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:01<00:00, 150.63it/s]



📂 Reading H3K27ac (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:01<00:00, 150.17it/s]



📂 Reading H3K4me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:08<00:00, 142.06it/s]



📂 Reading H3K27me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:02<00:00, 149.31it/s]



📂 Reading H3K36me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:06<00:00, 144.57it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:04<00:00, 146.91it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [2:10:14<00:00,  2.34it/s]   
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X1 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig5000_one_side/X1_logzscore.tsv

🧬 Processing cell line: X2

📂 Reading DNase (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [03:08<00:00, 96.71it/s] 



📂 Reading H3K27ac (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [03:25<00:00, 88.94it/s] 



📂 Reading H3K4me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [03:34<00:00, 85.26it/s] 



📂 Reading H3K27me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [03:07<00:00, 97.52it/s] 



📂 Reading H3K36me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [03:01<00:00, 100.60it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [03:30<00:00, 86.91it/s] 
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:55<00:00, 104.02it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X2 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig5000_one_side/X2_logzscore.tsv

🧬 Processing cell line: X3

📂 Reading DNase (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:42<00:00, 112.74it/s]



📂 Reading H3K27ac (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:49<00:00, 107.91it/s]



📂 Reading H3K4me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:52<00:00, 105.92it/s]



📂 Reading H3K27me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [03:19<00:00, 91.77it/s] 



📂 Reading H3K36me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [03:34<00:00, 85.18it/s] 
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [03:27<00:00, 87.96it/s] 
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [03:07<00:00, 97.34it/s] 
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X3 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig5000_one_side/X3_logzscore.tsv


In [33]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm

# === 基本設定 ===
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
output_dir = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig3000_one_side/"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
cells = ["X1","X2","X3"]

tss_window = 3000  # +/- 5kb
ref_path = "../preprocessed_data/reference/reference_gene_table.tsv"

# === normalization 模式可選 ===
# 可選 "none", "zscore", "log_zscore", "log_only"
norm_mode = "log_zscore"

# ============================================================
# 載入 reference & global stats
# ============================================================
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"📖 Loaded reference: {len(ref_genes)} genes")

with open(stats_path, "r") as f:
    stats = json.load(f)




# ============================================================
# 主迴圈
# ============================================================
for cell in cells:
    print(f"\n🧬 Processing cell line: {cell}")
    genes = ref_genes.copy()

    for mark in marks:
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"⚠️ Missing {mark} ({cell})")
            continue

        print(f"\n📂 Reading {mark} ({cell}) ...")
        bw = pyBigWig.open(bw_path)
        chroms = bw.chroms()

        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"⚠️ No global stat found for {key}")
            continue

        # --- 根據 norm_mode 選取 global mean/std ---
        if norm_mode == "log_zscore":
            global_mean = stats[key]["log1p"]["mean"]
            global_std = stats[key]["log1p"]["std"]
        elif norm_mode == "zscore":
            global_mean = stats[key]["linear"]["mean"]
            global_std = stats[key]["linear"]["std"]
        else:
            global_mean = global_std = None

        # === 動態命名 prefix ===
        prefix = {
            "none": "raw",
            "zscore": "z",
            "log_zscore": "logz",
            "log_only": "log"
        }[norm_mode]

        # === 建立空字典儲存所有 feature ===
        base_keys = [
            "mean", "std", "min", "max", "diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]


        # --- 對 mean/std/min/max/diff 加上 prefix ---
        prefixed_keys = [f"{prefix}_{k}" for k in base_keys[:5]]
        other_keys = base_keys[5:]
        feat_keys = prefixed_keys + other_keys

        # === 建立空字典儲存 features ===
        gene_features = {f"{mark}_gene_{k}": [] for k in feat_keys}
        tss_features  = {f"{mark}_tss_{k}": [] for k in feat_keys}


        # === 主迴圈 ===
        for _, row in tqdm(genes.iterrows(), total=len(genes)):
            chrom = row["chr"]
            if chrom not in chroms:
                for d in (gene_features, tss_features):
                    for k in d.keys():
                        d[k].append(np.nan)
                continue

            g_stats = region_zsignal(bw, chrom, row["gene_start"], row["gene_end"],
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in g_stats.items():
                target_key = f"{mark}_gene_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_gene_{k}"
                gene_features[target_key].append(v)

            # TSS
            tss_start, tss_end = get_tss_region(row, window=tss_window)
            t_stats = region_zsignal(bw, chrom, tss_start, tss_end,
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in t_stats.items():
                target_key = f"{mark}_tss_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_tss_{k}"
                tss_features[target_key].append(v)


        bw.close()

        # === 合併 ===
        for feat_dict in (gene_features, tss_features):
            for col, vals in feat_dict.items():
                genes[col] = vals

    # === 儲存結果 ===
    out_suffix = {
        "none": "raw",
        "zscore": "zscore",
        "log_zscore": "logzscore",
        "log_only": "log"
    }[norm_mode]

    out_path = os.path.join(output_dir, f"{cell}_{out_suffix}.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)

    print(f"\n✅ Saved {out_suffix} features for {cell} → {out_path}")


📖 Loaded reference: 18268 genes

🧬 Processing cell line: X1

📂 Reading DNase (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:55<00:00, 157.84it/s]



📂 Reading H3K27ac (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:02<00:00, 149.23it/s]



📂 Reading H3K4me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:27<00:00, 124.19it/s]



📂 Reading H3K27me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:26<00:00, 124.59it/s]



📂 Reading H3K36me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:36<00:00, 117.07it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:24<00:00, 126.32it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:19<00:00, 131.29it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X1 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig3000_one_side/X1_logzscore.tsv

🧬 Processing cell line: X2

📂 Reading DNase (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:50<00:00, 165.40it/s]



📂 Reading H3K27ac (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:58<00:00, 154.19it/s]



📂 Reading H3K4me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:06<00:00, 144.22it/s]



📂 Reading H3K27me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:06<00:00, 144.42it/s]



📂 Reading H3K36me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:13<00:00, 137.30it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:14<00:00, 136.16it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:10<00:00, 139.76it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X2 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig3000_one_side/X2_logzscore.tsv

🧬 Processing cell line: X3

📂 Reading DNase (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:50<00:00, 165.63it/s]



📂 Reading H3K27ac (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:50<00:00, 164.99it/s]



📂 Reading H3K4me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:53<00:00, 160.56it/s]



📂 Reading H3K27me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:51<00:00, 164.33it/s]



📂 Reading H3K36me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:54<00:00, 159.86it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:54<00:00, 160.04it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:54<00:00, 160.09it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X3 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig3000_one_side/X3_logzscore.tsv


In [32]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm

# === 基本設定 ===
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
output_dir = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig2000_one_side/"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
cells = ["X1","X2","X3"]

tss_window = 2000  # +/- 5kb
ref_path = "../preprocessed_data/reference/reference_gene_table.tsv"

# === normalization 模式可選 ===
# 可選 "none", "zscore", "log_zscore", "log_only"
norm_mode = "log_zscore"

# ============================================================
# 載入 reference & global stats
# ============================================================
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"📖 Loaded reference: {len(ref_genes)} genes")

with open(stats_path, "r") as f:
    stats = json.load(f)




# ============================================================
# 主迴圈
# ============================================================
for cell in cells:
    print(f"\n🧬 Processing cell line: {cell}")
    genes = ref_genes.copy()

    for mark in marks:
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"⚠️ Missing {mark} ({cell})")
            continue

        print(f"\n📂 Reading {mark} ({cell}) ...")
        bw = pyBigWig.open(bw_path)
        chroms = bw.chroms()

        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"⚠️ No global stat found for {key}")
            continue

        # --- 根據 norm_mode 選取 global mean/std ---
        if norm_mode == "log_zscore":
            global_mean = stats[key]["log1p"]["mean"]
            global_std = stats[key]["log1p"]["std"]
        elif norm_mode == "zscore":
            global_mean = stats[key]["linear"]["mean"]
            global_std = stats[key]["linear"]["std"]
        else:
            global_mean = global_std = None

        # === 動態命名 prefix ===
        prefix = {
            "none": "raw",
            "zscore": "z",
            "log_zscore": "logz",
            "log_only": "log"
        }[norm_mode]

        # === 建立空字典儲存所有 feature ===
        base_keys = [
            "mean", "std", "min", "max", "diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]


        # --- 對 mean/std/min/max/diff 加上 prefix ---
        prefixed_keys = [f"{prefix}_{k}" for k in base_keys[:5]]
        other_keys = base_keys[5:]
        feat_keys = prefixed_keys + other_keys

        # === 建立空字典儲存 features ===
        gene_features = {f"{mark}_gene_{k}": [] for k in feat_keys}
        tss_features  = {f"{mark}_tss_{k}": [] for k in feat_keys}


        # === 主迴圈 ===
        for _, row in tqdm(genes.iterrows(), total=len(genes)):
            chrom = row["chr"]
            if chrom not in chroms:
                for d in (gene_features, tss_features):
                    for k in d.keys():
                        d[k].append(np.nan)
                continue

            g_stats = region_zsignal(bw, chrom, row["gene_start"], row["gene_end"],
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in g_stats.items():
                target_key = f"{mark}_gene_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_gene_{k}"
                gene_features[target_key].append(v)

            # TSS
            tss_start, tss_end = get_tss_region(row, window=tss_window)
            t_stats = region_zsignal(bw, chrom, tss_start, tss_end,
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in t_stats.items():
                target_key = f"{mark}_tss_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_tss_{k}"
                tss_features[target_key].append(v)


        bw.close()

        # === 合併 ===
        for feat_dict in (gene_features, tss_features):
            for col, vals in feat_dict.items():
                genes[col] = vals

    # === 儲存結果 ===
    out_suffix = {
        "none": "raw",
        "zscore": "zscore",
        "log_zscore": "logzscore",
        "log_only": "log"
    }[norm_mode]

    out_path = os.path.join(output_dir, f"{cell}_{out_suffix}.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)

    print(f"\n✅ Saved {out_suffix} features for {cell} → {out_path}")


📖 Loaded reference: 18268 genes

🧬 Processing cell line: X1

📂 Reading DNase (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:50<00:00, 165.80it/s]



📂 Reading H3K27ac (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:58<00:00, 153.72it/s]



📂 Reading H3K4me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:59<00:00, 152.61it/s]



📂 Reading H3K27me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:51<00:00, 164.37it/s]



📂 Reading H3K36me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:59<00:00, 153.34it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:58<00:00, 154.45it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:01<00:00, 149.76it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X1 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig2000_one_side/X1_logzscore.tsv

🧬 Processing cell line: X2

📂 Reading DNase (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:55<00:00, 158.42it/s]



📂 Reading H3K27ac (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:53<00:00, 161.19it/s]



📂 Reading H3K4me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:07<00:00, 143.30it/s]



📂 Reading H3K27me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:02<00:00, 149.45it/s]



📂 Reading H3K36me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:07<00:00, 142.94it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:10<00:00, 139.75it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:05<00:00, 146.13it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X2 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig2000_one_side/X2_logzscore.tsv

🧬 Processing cell line: X3

📂 Reading DNase (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:53<00:00, 161.33it/s]



📂 Reading H3K27ac (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:53<00:00, 161.14it/s]



📂 Reading H3K4me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:51<00:00, 164.50it/s]



📂 Reading H3K27me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:51<00:00, 163.74it/s]



📂 Reading H3K36me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:53<00:00, 161.29it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:58<00:00, 153.96it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:58<00:00, 153.61it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X3 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig2000_one_side/X3_logzscore.tsv


In [26]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm

# === 基本設定 ===
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
output_dir = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig2000"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
cells = ["X1","X2","X3"]

tss_window = 2000  # +/- 5kb
ref_path = "../preprocessed_data/reference/reference_gene_table.tsv"

# === normalization 模式可選 ===
# 可選 "none", "zscore", "log_zscore", "log_only"
norm_mode = "log_zscore"

# ============================================================
# 載入 reference & global stats
# ============================================================
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"📖 Loaded reference: {len(ref_genes)} genes")

with open(stats_path, "r") as f:
    stats = json.load(f)




# ============================================================
# 主迴圈
# ============================================================
for cell in cells:
    print(f"\n🧬 Processing cell line: {cell}")
    genes = ref_genes.copy()

    for mark in marks:
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"⚠️ Missing {mark} ({cell})")
            continue

        print(f"\n📂 Reading {mark} ({cell}) ...")
        bw = pyBigWig.open(bw_path)
        chroms = bw.chroms()

        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"⚠️ No global stat found for {key}")
            continue

        # --- 根據 norm_mode 選取 global mean/std ---
        if norm_mode == "log_zscore":
            global_mean = stats[key]["log1p"]["mean"]
            global_std = stats[key]["log1p"]["std"]
        elif norm_mode == "zscore":
            global_mean = stats[key]["linear"]["mean"]
            global_std = stats[key]["linear"]["std"]
        else:
            global_mean = global_std = None

        # === 動態命名 prefix ===
        prefix = {
            "none": "raw",
            "zscore": "z",
            "log_zscore": "logz",
            "log_only": "log"
        }[norm_mode]

        # === 建立空字典儲存所有 feature ===
        base_keys = [
            "mean", "std", "min", "max", "diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]


        # --- 對 mean/std/min/max/diff 加上 prefix ---
        prefixed_keys = [f"{prefix}_{k}" for k in base_keys[:5]]
        other_keys = base_keys[5:]
        feat_keys = prefixed_keys + other_keys

        # === 建立空字典儲存 features ===
        gene_features = {f"{mark}_gene_{k}": [] for k in feat_keys}
        tss_features  = {f"{mark}_tss_{k}": [] for k in feat_keys}


        # === 主迴圈 ===
        for _, row in tqdm(genes.iterrows(), total=len(genes)):
            chrom = row["chr"]
            if chrom not in chroms:
                for d in (gene_features, tss_features):
                    for k in d.keys():
                        d[k].append(np.nan)
                continue

            g_stats = region_zsignal(bw, chrom, row["gene_start"], row["gene_end"],
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in g_stats.items():
                target_key = f"{mark}_gene_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_gene_{k}"
                gene_features[target_key].append(v)

            # TSS
            tss_start, tss_end = get_tss_region(row, window=tss_window)
            t_stats = region_zsignal(bw, chrom, tss_start, tss_end,
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in t_stats.items():
                target_key = f"{mark}_tss_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_tss_{k}"
                tss_features[target_key].append(v)


        bw.close()

        # === 合併 ===
        for feat_dict in (gene_features, tss_features):
            for col, vals in feat_dict.items():
                genes[col] = vals

    # === 儲存結果 ===
    out_suffix = {
        "none": "raw",
        "zscore": "zscore",
        "log_zscore": "logzscore",
        "log_only": "log"
    }[norm_mode]

    out_path = os.path.join(output_dir, f"{cell}_{out_suffix}.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)

    print(f"\n✅ Saved {out_suffix} features for {cell} → {out_path}")


📖 Loaded reference: 18268 genes

🧬 Processing cell line: X1

📂 Reading DNase (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:00<00:00, 151.62it/s]



📂 Reading H3K27ac (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:01<00:00, 149.91it/s]



📂 Reading H3K4me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:09<00:00, 141.13it/s]



📂 Reading H3K27me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:01<00:00, 150.77it/s]



📂 Reading H3K36me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:09<00:00, 141.57it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:06<00:00, 144.15it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:04<00:00, 146.45it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X1 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig2000/X1_logzscore.tsv

🧬 Processing cell line: X2

📂 Reading DNase (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:58<00:00, 154.52it/s]



📂 Reading H3K27ac (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:59<00:00, 153.19it/s]



📂 Reading H3K4me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:09<00:00, 140.59it/s]



📂 Reading H3K27me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:12<00:00, 138.05it/s]



📂 Reading H3K36me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:13<00:00, 136.38it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:19<00:00, 130.97it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:11<00:00, 138.48it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X2 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig2000/X2_logzscore.tsv

🧬 Processing cell line: X3

📂 Reading DNase (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [01:59<00:00, 153.37it/s]



📂 Reading H3K27ac (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:03<00:00, 148.42it/s]



📂 Reading H3K4me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:00<00:00, 151.98it/s]



📂 Reading H3K27me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:02<00:00, 148.94it/s]



📂 Reading H3K36me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:03<00:00, 147.73it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:07<00:00, 143.37it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:04<00:00, 146.63it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X3 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig2000/X3_logzscore.tsv


In [23]:
import os
import pandas as pd
import glob
import re

# ============================================================
#                  基本設定
# ============================================================
base_dir = "../preprocessed_data/reference/0. raw_bigwig"
files = sorted(glob.glob(os.path.join(base_dir, "X*_zscore_dynamics.tsv")))

print(f"📂 Found {len(files)} files:")
for f in files:
    print(f"  - {os.path.basename(f)}")

# ============================================================
#                  欄位重新命名函式
# ============================================================
def rename_columns(df):
    rename_map = {}
    for col in df.columns:
        new_col = col
        new_col = re.sub(r"_z_mean$", "_mean", new_col)
        new_col = re.sub(r"_z_std$", "_std", new_col)
        new_col = re.sub(r"_z_min$", "_min", new_col)
        new_col = re.sub(r"_z_max$", "_max", new_col)
        new_col = re.sub(r"_z_diff$", "_diff", new_col)
        rename_map[col] = new_col
    return df.rename(columns=rename_map)

# ============================================================
#                  主流程
# ============================================================
mark_min_results = {}
summary_rows = []

for f in files:
    cell = os.path.basename(f).split("_")[0]
    print(f"\n🔧 Processing {cell} ...")

    df = pd.read_csv(f, sep="\t")
    df = rename_columns(df)

    # 儲存回原地
    df.to_csv(f, sep="\t", index=False)
    print(f"💾 Saved updated file: {f}")
    print(f"🧩 Columns: {len(df.columns)} total")
    
    # 檢查是否有任何 *_tss_min / *_gene_min 欄位
    tss_cols = [c for c in df.columns if "_tss_min" in c]
    gene_cols = [c for c in df.columns if "_gene_min" in c]

    if not tss_cols or not gene_cols:
        print(f"⚠️ No '_tss_min' or '_gene_min' columns found in {cell}. Skipping.")
        continue

    # === 找出每個 mark ===
    mark_cols = sorted({col.split("_")[0] for col in df.columns if "_tss_min" in col or "_gene_min" in col})
    print(f"✅ Marks found: {mark_cols}")

    for mark in mark_cols:
        tss_col = f"{mark}_tss_min"
        gene_col = f"{mark}_gene_min"

        if tss_col not in df.columns or gene_col not in df.columns:
            print(f"⚠️ Missing {tss_col} or {gene_col}")
            continue

        min_tss = df[tss_col].min(skipna=True)
        min_gene = df[gene_col].min(skipna=True)

        summary_rows.append({
            "mark": mark,
            "cell": cell,
            "tss_min": min_tss,
            "gene_min": min_gene
        })

# ============================================================
#                  結果輸出
# ============================================================
if summary_rows:
    summary_df = pd.DataFrame(summary_rows).sort_values(["mark", "cell"]).reset_index(drop=True)
    print("\n🧭 Summary of minimum values per mark:")
    print(summary_df)

    summary_path = os.path.join(base_dir, "summary_min_values.tsv")
    summary_df.to_csv(summary_path, sep="\t", index=False)
    print(f"\n✅ Summary saved to: {summary_path}")
else:
    print("\n❌ No marks found with '_tss_min' and '_gene_min' columns. Please check column names in your TSV files.")


📂 Found 3 files:
  - X1_zscore_dynamics.tsv
  - X2_zscore_dynamics.tsv
  - X3_zscore_dynamics.tsv

🔧 Processing X1 ...
💾 Saved updated file: ../preprocessed_data/reference/0. raw_bigwig/X1_zscore_dynamics.tsv
🧩 Columns: 175 total
✅ Marks found: ['DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3']

🔧 Processing X2 ...
💾 Saved updated file: ../preprocessed_data/reference/0. raw_bigwig/X2_zscore_dynamics.tsv
🧩 Columns: 175 total
✅ Marks found: ['DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3']

🔧 Processing X3 ...
💾 Saved updated file: ../preprocessed_data/reference/0. raw_bigwig/X3_zscore_dynamics.tsv
🧩 Columns: 175 total
✅ Marks found: ['DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3']

🧭 Summary of minimum values per mark:
        mark cell   tss_min  gene_min
0      DNase   X1  0.024102  0.024102
1      DNase   X2  0.003446  0.003446
2      DNase   X3  0.000000  0.000000
3    H3K27ac   X1  0.000000  0.00000