# Merged train+val and merged info and y

# Add Marks z-score stat

In [19]:

# === ËºîÂä©ÂáΩÂºè ===
import numpy as np
from scipy.stats import kurtosis, skew
from numpy.fft import fft


# ============================================================
# ‚öôÔ∏è ÈÄöÁî® Normalization Function
# ============================================================
def normalize_signal(vals, global_mean=None, global_std=None, mode="none"):
    """
    Â§öÊ®°Âºè normalization ÊîØÊè¥Ôºö
      - "none": ‰∏çÂÅö‰ªª‰ΩïËôïÁêÜ
      - "zscore": (x - Œº) / œÉ
      - "log_zscore": ((log1p(x)) - Œº_log) / œÉ_log
      - "log_only": Âè™Âèñ log1pÔºå‰∏çÊ®ôÊ∫ñÂåñ
    """
    if mode == "none":
        return vals

    elif mode == "zscore":
        if global_mean is None or global_std is None:
            raise ValueError("Missing global_mean/global_std for zscore normalization.")
        return (vals - global_mean) / (global_std + 1e-8)

    elif mode == "log_zscore":
        if global_mean is None or global_std is None:
            raise ValueError("Missing global_mean/global_std for log_zscore normalization.")
        vals = np.log1p(np.clip(vals, a_min=0, a_max=None))
        return (vals - global_mean) / (global_std + 1e-8)

    elif mode == "log_only":
        return np.log1p(np.clip(vals, a_min=0, a_max=None))

    else:
        raise ValueError(f"Unknown normalization mode: {mode}")


# ============================================================
# üß¨ region_zsignal (with normalization selection)
# ============================================================
def region_zsignal(bw, chrom, start, end, global_mean=None, global_std=None,
                   mark_name=None, cell_name=None, norm_mode="none"):
    """
    Extracts region-level features with multiple normalization modes.
    """
    feature_keys = [
        "mean", "std", "min", "max", "diff",
        "gradient_mean", "slope", "kurtosis", "skewness",
        "entropy", "autocorr", "laplacian"
    ]

    chroms = bw.chroms()
    if chrom not in chroms:
        return {k: 0.0 for k in feature_keys}

    chrom_length = chroms[chrom]
    start, end = max(0, int(start)), min(int(end), chrom_length)
    if end <= start:
        return {k: 0.0 for k in feature_keys}

    vals = np.array(bw.values(chrom, start, end, numpy=True))
    vals = vals[~np.isnan(vals)]
    if len(vals) == 0:
        return {k: 0.0 for k in feature_keys}

    # === üîß Apply normalization (ÂèØÂãïÊÖãÂàáÊèõ) ===
    vals_norm = normalize_signal(vals, global_mean, global_std, mode=norm_mode)

    # === Feature Extraction ===
    local_mean = np.mean(vals_norm)
    local_std  = np.std(vals_norm)
    local_min  = np.min(vals_norm)
    local_max  = np.max(vals_norm)
    local_diff = local_max - local_min

    # Gradient / slope
    if len(vals_norm) > 1:
        diffs = np.diff(vals_norm)
        gradient_mean = np.mean(np.abs(diffs))
        try:
            slope = np.polyfit(np.arange(len(vals_norm)), vals_norm, 1)[0]
        except Exception:
            slope = 0.0
    else:
        gradient_mean, slope = 0.0, 0.0

    # Shape-based descriptors
    sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
    asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0

    # Entropy
    p = np.abs(vals_norm)
    p_sum = np.sum(p)
    local_entropy = -np.sum((p / (p_sum + 1e-8)) * np.log2(p / (p_sum + 1e-8))) if p_sum > 0 else 0.0

    # Autocorrelation & Laplacian
    autocorr = np.corrcoef(vals_norm[:-1], vals_norm[1:])[0, 1] if len(vals_norm) > 2 else 0.0
    laplacian = np.mean(np.abs(vals_norm[:-2] - 2 * vals_norm[1:-1] + vals_norm[2:])) if len(vals_norm) > 3 else 0.0

    result = {
        "mean": local_mean,
        "std": local_std,
        "min": local_min,
        "max": local_max,
        "diff": local_diff,
        "gradient_mean": gradient_mean,
        "slope": slope,
        "kurtosis": sharpness,
        "skewness": asymmetry,
        "entropy": local_entropy,
        "autocorr": autocorr,
        "laplacian": laplacian,
    }

    # ‰øùÈö™ËôïÁêÜ nan/inf
    for k, v in result.items():
        if not np.isfinite(v):
            result[k] = 0.0

    return result


def get_tss_region(row, window=1000):
    """Ê†πÊìö strand (+/-) Áµ¶Âá∫ promoter (TSS ¬± window) ÂçÄÂüü"""
    if row["strand"] == "+":
        start = max(0, row["TSS_start"] - window)
        end = row["TSS_end"] + window
    else:  # strand == "-"
        start = max(0, row["TSS_start"] - window)
        end = row["TSS_end"] + window
    return start, end

In [22]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm

# === Âü∫Êú¨Ë®≠ÂÆö ===
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
output_dir = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
cells = ["X1","X2","X3"]

tss_window = 5000  # +/- 5kb
ref_path = "../preprocessed_data/reference/reference_gene_table.tsv"

# === normalization Ê®°ÂºèÂèØÈÅ∏ ===
# ÂèØÈÅ∏ "none", "zscore", "log_zscore", "log_only"
norm_mode = "log_zscore"

# ============================================================
# ËºâÂÖ• reference & global stats
# ============================================================
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"üìñ Loaded reference: {len(ref_genes)} genes")

with open(stats_path, "r") as f:
    stats = json.load(f)




# ============================================================
# ‰∏ªËø¥Âúà
# ============================================================
for cell in cells:
    print(f"\nüß¨ Processing cell line: {cell}")
    genes = ref_genes.copy()

    for mark in marks:
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"‚ö†Ô∏è Missing {mark} ({cell})")
            continue

        print(f"\nüìÇ Reading {mark} ({cell}) ...")
        bw = pyBigWig.open(bw_path)
        chroms = bw.chroms()

        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"‚ö†Ô∏è No global stat found for {key}")
            continue

        # --- Ê†πÊìö norm_mode ÈÅ∏Âèñ global mean/std ---
        if norm_mode == "log_zscore":
            global_mean = stats[key]["log1p"]["mean"]
            global_std = stats[key]["log1p"]["std"]
        elif norm_mode == "zscore":
            global_mean = stats[key]["linear"]["mean"]
            global_std = stats[key]["linear"]["std"]
        else:
            global_mean = global_std = None

        # === ÂãïÊÖãÂëΩÂêç prefix ===
        prefix = {
            "none": "raw",
            "zscore": "z",
            "log_zscore": "logz",
            "log_only": "log"
        }[norm_mode]

        # === Âª∫Á´ãÁ©∫Â≠óÂÖ∏ÂÑ≤Â≠òÊâÄÊúâ feature ===
        base_keys = [
            "mean", "std", "min", "max", "diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]


        # --- Â∞ç mean/std/min/max/diff Âä†‰∏ä prefix ---
        prefixed_keys = [f"{prefix}_{k}" for k in base_keys[:5]]
        other_keys = base_keys[5:]
        feat_keys = prefixed_keys + other_keys

        # === Âª∫Á´ãÁ©∫Â≠óÂÖ∏ÂÑ≤Â≠ò features ===
        gene_features = {f"{mark}_gene_{k}": [] for k in feat_keys}
        tss_features  = {f"{mark}_tss_{k}": [] for k in feat_keys}


        # === ‰∏ªËø¥Âúà ===
        for _, row in tqdm(genes.iterrows(), total=len(genes)):
            chrom = row["chr"]
            if chrom not in chroms:
                for d in (gene_features, tss_features):
                    for k in d.keys():
                        d[k].append(np.nan)
                continue

            g_stats = region_zsignal(bw, chrom, row["gene_start"], row["gene_end"],
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in g_stats.items():
                target_key = f"{mark}_gene_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_gene_{k}"
                gene_features[target_key].append(v)

            # TSS
            tss_start, tss_end = get_tss_region(row, window=tss_window)
            t_stats = region_zsignal(bw, chrom, tss_start, tss_end,
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in t_stats.items():
                target_key = f"{mark}_tss_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_tss_{k}"
                tss_features[target_key].append(v)


        bw.close()

        # === Âêà‰Ωµ ===
        for feat_dict in (gene_features, tss_features):
            for col, vals in feat_dict.items():
                genes[col] = vals

    # === ÂÑ≤Â≠òÁµêÊûú ===
    out_suffix = {
        "none": "raw",
        "zscore": "zscore",
        "log_zscore": "logzscore",
        "log_only": "log"
    }[norm_mode]

    out_path = os.path.join(output_dir, f"{cell}_{out_suffix}.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)

    print(f"\n‚úÖ Saved {out_suffix} features for {cell} ‚Üí {out_path}")


üìñ Loaded reference: 18268 genes

üß¨ Processing cell line: X1

üìÇ Reading DNase (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:37<00:00, 116.28it/s]



üìÇ Reading H3K27ac (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:28<00:00, 122.85it/s]



üìÇ Reading H3K4me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:32<00:00, 119.93it/s]



üìÇ Reading H3K27me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:28<00:00, 122.86it/s]



üìÇ Reading H3K36me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:34<00:00, 118.49it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



üìÇ Reading H3K4me1 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:38<00:00, 115.50it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



üìÇ Reading H3K9me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:48<00:00, 108.35it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



‚úÖ Saved logzscore features for X1 ‚Üí /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig/X1_logzscore_features.tsv

üß¨ Processing cell line: X2

üìÇ Reading DNase (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:31<00:00, 120.88it/s]



üìÇ Reading H3K27ac (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:41<00:00, 113.23it/s]



üìÇ Reading H3K4me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:58<00:00, 102.44it/s]



üìÇ Reading H3K27me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:42<00:00, 112.48it/s]



üìÇ Reading H3K36me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [03:02<00:00, 100.03it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



üìÇ Reading H3K4me1 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:45<00:00, 110.17it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



üìÇ Reading H3K9me3 (X2) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:57<00:00, 102.77it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



‚úÖ Saved logzscore features for X2 ‚Üí /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig/X2_logzscore_features.tsv

üß¨ Processing cell line: X3

üìÇ Reading DNase (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:42<00:00, 112.23it/s]



üìÇ Reading H3K27ac (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:55<00:00, 104.08it/s]



üìÇ Reading H3K4me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:43<00:00, 112.02it/s]



üìÇ Reading H3K27me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:44<00:00, 110.96it/s]



üìÇ Reading H3K36me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
  c /= stddev[None, :]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:50<00:00, 106.95it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



üìÇ Reading H3K4me1 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:48<00:00, 108.46it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



üìÇ Reading H3K9me3 (X3) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:44<00:00, 111.29it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



‚úÖ Saved logzscore features for X3 ‚Üí /Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig/X3_logzscore_features.tsv


In [None]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm

# === Âü∫Êú¨Ë®≠ÂÆö ===
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
output_dir = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. data/bigwig"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
cells = ["X1","X2","X3"]

tss_window = 5000  # +/- 5kb
ref_path = "../preprocessed_data/reference/reference_gene_table.tsv"

# === normalization Ê®°ÂºèÂèØÈÅ∏ ===
# ÂèØÈÅ∏ "none", "zscore", "log_zscore", "log_only"
norm_mode = "zscore"

# ============================================================
# ËºâÂÖ• reference & global stats
# ============================================================
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"üìñ Loaded reference: {len(ref_genes)} genes")

with open(stats_path, "r") as f:
    stats = json.load(f)




# ============================================================
# ‰∏ªËø¥Âúà
# ============================================================
for cell in cells:
    print(f"\nüß¨ Processing cell line: {cell}")
    genes = ref_genes.copy()

    for mark in marks:
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"‚ö†Ô∏è Missing {mark} ({cell})")
            continue

        print(f"\nüìÇ Reading {mark} ({cell}) ...")
        bw = pyBigWig.open(bw_path)
        chroms = bw.chroms()

        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"‚ö†Ô∏è No global stat found for {key}")
            continue

        # --- Ê†πÊìö norm_mode ÈÅ∏Âèñ global mean/std ---
        if norm_mode == "log_zscore":
            global_mean = stats[key]["log1p"]["mean"]
            global_std = stats[key]["log1p"]["std"]
        elif norm_mode == "zscore":
            global_mean = stats[key]["linear"]["mean"]
            global_std = stats[key]["linear"]["std"]
        else:
            global_mean = global_std = None

        # === ÂãïÊÖãÂëΩÂêç prefix ===
        prefix = {
            "none": "raw",
            "zscore": "z",
            "log_zscore": "logz",
            "log_only": "log"
        }[norm_mode]

        # === Âª∫Á´ãÁ©∫Â≠óÂÖ∏ÂÑ≤Â≠òÊâÄÊúâ feature ===
        base_keys = [
            "mean", "std", "min", "max", "diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]


        # --- Â∞ç mean/std/min/max/diff Âä†‰∏ä prefix ---
        prefixed_keys = [f"{prefix}_{k}" for k in base_keys[:5]]
        other_keys = base_keys[5:]
        feat_keys = prefixed_keys + other_keys

        # === Âª∫Á´ãÁ©∫Â≠óÂÖ∏ÂÑ≤Â≠ò features ===
        gene_features = {f"{mark}_gene_{k}": [] for k in feat_keys}
        tss_features  = {f"{mark}_tss_{k}": [] for k in feat_keys}


        # === ‰∏ªËø¥Âúà ===
        for _, row in tqdm(genes.iterrows(), total=len(genes)):
            chrom = row["chr"]
            if chrom not in chroms:
                for d in (gene_features, tss_features):
                    for k in d.keys():
                        d[k].append(np.nan)
                continue

            g_stats = region_zsignal(bw, chrom, row["gene_start"], row["gene_end"],
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in g_stats.items():
                target_key = f"{mark}_gene_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_gene_{k}"
                gene_features[target_key].append(v)

            # TSS
            tss_start, tss_end = get_tss_region(row, window=tss_window)
            t_stats = region_zsignal(bw, chrom, tss_start, tss_end,
                                    global_mean, global_std, mark, cell, norm_mode)

            for k, v in t_stats.items():
                target_key = f"{mark}_tss_{prefix}_{k}" if k in base_keys[:5] else f"{mark}_tss_{k}"
                tss_features[target_key].append(v)


        bw.close()

        # === Âêà‰Ωµ ===
        for feat_dict in (gene_features, tss_features):
            for col, vals in feat_dict.items():
                genes[col] = vals

    # === ÂÑ≤Â≠òÁµêÊûú ===
    out_suffix = {
        "none": "raw",
        "zscore": "zscore",
        "log_zscore": "logzscore",
        "log_only": "log"
    }[norm_mode]

    out_path = os.path.join(output_dir, f"{cell}_{out_suffix}.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)

    print(f"\n‚úÖ Saved {out_suffix} features for {cell} ‚Üí {out_path}")


üìñ Loaded reference: 18268 genes

üß¨ Processing cell line: X1

üìÇ Reading DNase (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:30<00:00, 121.00it/s]



üìÇ Reading H3K27ac (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:32<00:00, 120.11it/s]



üìÇ Reading H3K4me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:27<00:00, 124.17it/s]



üìÇ Reading H3K27me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:29<00:00, 121.89it/s]



üìÇ Reading H3K36me3 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18268/18268 [02:27<00:00, 123.83it/s]
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals
  genes[col] = vals



üìÇ Reading H3K4me1 (X1) ...


  sharpness = kurtosis(vals_norm) if len(vals_norm) > 3 else 0.0
  asymmetry = skew(vals_norm) if len(vals_norm) > 3 else 0.0
  c /= stddev[:, None]
 26%|‚ñà‚ñà‚ñã       | 4827/18268 [00:42<02:16, 98.31it/s] 

In [23]:
import os
import pandas as pd
import glob
import re

# ============================================================
#                  Âü∫Êú¨Ë®≠ÂÆö
# ============================================================
base_dir = "../preprocessed_data/reference/0. raw_bigwig"
files = sorted(glob.glob(os.path.join(base_dir, "X*_zscore_dynamics.tsv")))

print(f"üìÇ Found {len(files)} files:")
for f in files:
    print(f"  - {os.path.basename(f)}")

# ============================================================
#                  Ê¨Ñ‰ΩçÈáçÊñ∞ÂëΩÂêçÂáΩÂºè
# ============================================================
def rename_columns(df):
    rename_map = {}
    for col in df.columns:
        new_col = col
        new_col = re.sub(r"_z_mean$", "_mean", new_col)
        new_col = re.sub(r"_z_std$", "_std", new_col)
        new_col = re.sub(r"_z_min$", "_min", new_col)
        new_col = re.sub(r"_z_max$", "_max", new_col)
        new_col = re.sub(r"_z_diff$", "_diff", new_col)
        rename_map[col] = new_col
    return df.rename(columns=rename_map)

# ============================================================
#                  ‰∏ªÊµÅÁ®ã
# ============================================================
mark_min_results = {}
summary_rows = []

for f in files:
    cell = os.path.basename(f).split("_")[0]
    print(f"\nüîß Processing {cell} ...")

    df = pd.read_csv(f, sep="\t")
    df = rename_columns(df)

    # ÂÑ≤Â≠òÂõûÂéüÂú∞
    df.to_csv(f, sep="\t", index=False)
    print(f"üíæ Saved updated file: {f}")
    print(f"üß© Columns: {len(df.columns)} total")
    
    # Ê™¢Êü•ÊòØÂê¶Êúâ‰ªª‰Ωï *_tss_min / *_gene_min Ê¨Ñ‰Ωç
    tss_cols = [c for c in df.columns if "_tss_min" in c]
    gene_cols = [c for c in df.columns if "_gene_min" in c]

    if not tss_cols or not gene_cols:
        print(f"‚ö†Ô∏è No '_tss_min' or '_gene_min' columns found in {cell}. Skipping.")
        continue

    # === ÊâæÂá∫ÊØèÂÄã mark ===
    mark_cols = sorted({col.split("_")[0] for col in df.columns if "_tss_min" in col or "_gene_min" in col})
    print(f"‚úÖ Marks found: {mark_cols}")

    for mark in mark_cols:
        tss_col = f"{mark}_tss_min"
        gene_col = f"{mark}_gene_min"

        if tss_col not in df.columns or gene_col not in df.columns:
            print(f"‚ö†Ô∏è Missing {tss_col} or {gene_col}")
            continue

        min_tss = df[tss_col].min(skipna=True)
        min_gene = df[gene_col].min(skipna=True)

        summary_rows.append({
            "mark": mark,
            "cell": cell,
            "tss_min": min_tss,
            "gene_min": min_gene
        })

# ============================================================
#                  ÁµêÊûúËº∏Âá∫
# ============================================================
if summary_rows:
    summary_df = pd.DataFrame(summary_rows).sort_values(["mark", "cell"]).reset_index(drop=True)
    print("\nüß≠ Summary of minimum values per mark:")
    print(summary_df)

    summary_path = os.path.join(base_dir, "summary_min_values.tsv")
    summary_df.to_csv(summary_path, sep="\t", index=False)
    print(f"\n‚úÖ Summary saved to: {summary_path}")
else:
    print("\n‚ùå No marks found with '_tss_min' and '_gene_min' columns. Please check column names in your TSV files.")


üìÇ Found 3 files:
  - X1_zscore_dynamics.tsv
  - X2_zscore_dynamics.tsv
  - X3_zscore_dynamics.tsv

üîß Processing X1 ...
üíæ Saved updated file: ../preprocessed_data/reference/0. raw_bigwig/X1_zscore_dynamics.tsv
üß© Columns: 175 total
‚úÖ Marks found: ['DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3']

üîß Processing X2 ...
üíæ Saved updated file: ../preprocessed_data/reference/0. raw_bigwig/X2_zscore_dynamics.tsv
üß© Columns: 175 total
‚úÖ Marks found: ['DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3']

üîß Processing X3 ...
üíæ Saved updated file: ../preprocessed_data/reference/0. raw_bigwig/X3_zscore_dynamics.tsv
üß© Columns: 175 total
‚úÖ Marks found: ['DNase', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3', 'H3K9me3']

üß≠ Summary of minimum values per mark:
        mark cell   tss_min  gene_min
0      DNase   X1  0.024102  0.024102
1      DNase   X2  0.003446  0.003446
2      DNase   X3  0.000000  0.0000