# Extract information from bigWig files

In [None]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm
from scripts.extract_bigwig import region_zsignal, get_tss_region

# === Configuration ===
# Machine-specific absolute locations: raw bigWig data root, output directory
# for the per-cell feature tables, and the global normalization statistics.
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
output_dir = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig100_one_side/"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

# Signal tracks (each read from "<base>/<mark>-bigwig/") and cell lines to process.
marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
cells = ["X1","X2","X3"]

# Window size handed to get_tss_region(); presumably base pairs around the TSS —
# the exact one-sided/two-sided semantics live in scripts.extract_bigwig (confirm there).
tss_window = 100
# NOTE: unlike the paths above, this one is resolved relative to the current working directory.
ref_path = "../preprocessed_data/reference/reference_gene_table.tsv"

# === Normalization mode options: "none", "zscore", "log_zscore", "log_only" ===
norm_mode = "log_zscore"

# Fail fast on a mistyped mode instead of hitting a KeyError mid-run,
# after input files have already been opened.
valid_norm_modes = ("none", "zscore", "log_zscore", "log_only")
if norm_mode not in valid_norm_modes:
    raise ValueError(
        f"Unsupported norm_mode {norm_mode!r}; expected one of {valid_norm_modes}"
    )

# ============================================================
# Load reference & global normalization statistics
# ============================================================
# Tab-separated gene table; the main loop reads its "chr", "gene_start"
# and "gene_end" columns and passes whole rows to get_tss_region().
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"📖 Loaded reference: {len(ref_genes)} genes")

# Stats are keyed by "<mark>_<cell>", each holding "log1p" and "linear"
# sub-dicts with "mean"/"std". JSON is UTF-8 by specification, so do not
# depend on the platform's default text encoding.
with open(stats_path, "r", encoding="utf-8") as f:
    stats = json.load(f)

# ============================================================
# Main loop
# ============================================================
# For every cell line, compute per-gene signal features (gene body + TSS window)
# from each mark's bigWig track and write one TSV per cell line.

# Everything below depends only on norm_mode, so it is resolved once rather
# than once per (cell, mark). An unsupported norm_mode raises KeyError here,
# before any bigWig file is opened.

# Prefix used for basic moment features under each mode
prefix = {
    "none": "raw",
    "zscore": "z",
    "log_zscore": "logz",
    "log_only": "log"
}[norm_mode]

# Suffix of the per-cell output file name
out_suffix = {
    "none": "raw",
    "zscore": "zscore",
    "log_zscore": "logzscore",
    "log_only": "log"
}[norm_mode]

# Canonical feature keys returned by region_zsignal
base_keys = [
    "mean", "std", "min", "max", "diff",
    "gradient_mean", "slope", "kurtosis", "skewness",
    "entropy", "autocorr", "laplacian"
]

# Apply prefix only to basic moment features (the first five keys);
# keep advanced ones unchanged
prefixed_keys = [f"{prefix}_{k}" for k in base_keys[:5]]
other_keys = base_keys[5:]
feat_keys = prefixed_keys + other_keys

# region_zsignal key -> column-name suffix (e.g. "mean" -> "logz_mean").
# A key outside base_keys raises KeyError, as it did with the per-value lookup.
key_to_feat = dict(zip(base_keys, feat_keys))

for cell in cells:
    print(f"\n🧬 Processing cell line: {cell}")
    genes = ref_genes.copy()

    # All feature columns for this cell, in insertion order. They are attached
    # in a single concat below: inserting ~170 columns one at a time fragments
    # the DataFrame and triggers pandas' PerformanceWarning.
    new_columns = {}

    for mark in marks:
        # Resolve bigWig path with common extensions
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"⚠️ Missing {mark} ({cell})")
            continue

        print(f"\n📂 Reading {mark} ({cell}) ...")

        # Check for normalization stats before opening the file, so a skipped
        # track never needs an open/close round trip.
        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"⚠️ No global stat found for {key}")
            continue

        # Pick global mean/std according to normalization mode
        if norm_mode == "log_zscore":
            global_mean = stats[key]["log1p"]["mean"]
            global_std = stats[key]["log1p"]["std"]
        elif norm_mode == "zscore":
            global_mean = stats[key]["linear"]["mean"]
            global_std = stats[key]["linear"]["std"]
        else:
            global_mean = global_std = None  # no normalization or log-only handled in region_zsignal

        # Buffers to collect features per gene for this (mark, cell)
        gene_features = {f"{mark}_gene_{k}": [] for k in feat_keys}
        tss_features = {f"{mark}_tss_{k}": [] for k in feat_keys}

        bw = pyBigWig.open(bw_path)
        try:
            chroms = bw.chroms()

            # Iterate genes and compute features for gene body and TSS window
            for _, row in tqdm(genes.iterrows(), total=len(genes)):
                chrom = row["chr"]
                if chrom not in chroms:
                    # Append NaN for missing chromosomes to keep alignment
                    for buffers in (gene_features, tss_features):
                        for vals in buffers.values():
                            vals.append(np.nan)
                    continue

                # Gene-body features
                g_stats = region_zsignal(
                    bw, chrom, row["gene_start"], row["gene_end"],
                    global_mean, global_std, mark, cell, norm_mode
                )
                for k, v in g_stats.items():
                    gene_features[f"{mark}_gene_{key_to_feat[k]}"].append(v)

                # TSS-window features
                tss_start, tss_end = get_tss_region(row, window=tss_window)
                t_stats = region_zsignal(
                    bw, chrom, tss_start, tss_end,
                    global_mean, global_std, mark, cell, norm_mode
                )
                for k, v in t_stats.items():
                    tss_features[f"{mark}_tss_{key_to_feat[k]}"].append(v)
        finally:
            # Release the file handle even if a region query raises.
            bw.close()

        # Queue this track's columns: gene-body features first, then TSS
        new_columns.update(gene_features)
        new_columns.update(tss_features)

    # Attach accumulated features to the genes table in one operation.
    # Assumes feature names do not collide with reference-table columns.
    if new_columns:
        genes = pd.concat(
            [genes, pd.DataFrame(new_columns, index=genes.index)],
            axis=1,
        )

    # Persist per-cell features
    out_path = os.path.join(output_dir, f"{cell}_{out_suffix}.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)

    print(f"\n✅ Saved {out_suffix} features for {cell} → {out_path}")


📖 Loaded reference: 18268 genes

🧬 Processing cell line: X1

📂 Reading DNase (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:06<00:00, 144.55it/s]



📂 Reading H3K27ac (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:08<00:00, 142.04it/s]



📂 Reading H3K4me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:04<00:00, 146.45it/s]



📂 Reading H3K27me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:07<00:00, 142.92it/s]



📂 Reading H3K36me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:08<00:00, 141.64it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:09<00:00, 141.38it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:09<00:00, 141.25it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X1 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig50_one_side/X1_logzscore.tsv

🧬 Processing cell line: X2

📂 Reading DNase (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:26<00:00, 124.95it/s]



📂 Reading H3K27ac (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:13<00:00, 136.65it/s]



📂 Reading H3K4me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:03<00:00, 147.72it/s]



📂 Reading H3K27me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:09<00:00, 140.56it/s]



📂 Reading H3K36me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:21<00:00, 129.05it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:08<00:00, 142.03it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [02:04<00:00, 147.18it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X2 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig50_one_side/X2_logzscore.tsv

🧬 Processing cell line: X3

📂 Reading DNase (X3) ...


  c /= stddev[:, None]
  c /= stddev[None, :]
  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
100%|██████████| 18268/18268 [01:48<00:00, 169.07it/s]



📂 Reading H3K27ac (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:48<00:00, 167.87it/s]



📂 Reading H3K4me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:43<00:00, 176.52it/s]



📂 Reading H3K27me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:45<00:00, 172.44it/s]



📂 Reading H3K36me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:46<00:00, 171.25it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K4me1 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:45<00:00, 173.33it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



📂 Reading H3K9me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|██████████| 18268/18268 [01:47<00:00, 170.43it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



✅ Saved logzscore features for X3 → /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig50_one_side/X3_logzscore.tsv
