# Merged train+val and merged info and y

In [None]:
import pandas as pd
import os
import re

# === 自然排序函式 ===
def chr_sort_key(chr_name):
    m = re.match(r"chr(\d+)", chr_name)
    if m:
        return int(m.group(1))
    elif chr_name == "chrX":
        return 23
    elif chr_name == "chrY":
        return 24
    else:
        return 100  # 其他 contigs (random, Un, etc.)

# === 路徑設定 ===
base = r"C:\Users\wani\Desktop\Courses\ML for genomics\ML4G_Project_1_Data\CAGE-train\CAGE-train"
out_folder = r"C:\Users\wani\Desktop\Courses\ML for genomics\preprocessed_data\CAGE-merged"
os.makedirs(out_folder, exist_ok=True)

# === 處理 X1, X2 ===
for cell in ["X1", "X2"]:
    print(f"\n🔹 Processing {cell} ...")

    # 讀取 info
    info_train = pd.read_csv(os.path.join(base, f"{cell}_train_info.tsv"), sep="\t")
    info_val = pd.read_csv(os.path.join(base, f"{cell}_val_info.tsv"), sep="\t")
    info_merged = pd.concat([info_train, info_val], ignore_index=True)

    # 讀取 y
    y_train = pd.read_csv(os.path.join(base, f"{cell}_train_y.tsv"), sep="\t")
    y_val = pd.read_csv(os.path.join(base, f"{cell}_val_y.tsv"), sep="\t")
    y_merged = pd.concat([y_train, y_val], ignore_index=True)

    if y_merged.shape[1] == 2:
        y_merged.columns = ["gene_name", "gex"]
    else:
        y_merged.columns = ["gex"]
        y_merged.insert(0, "gene_name", info_merged["gene_name"])

    merged = pd.merge(info_merged, y_merged, on="gene_name", how="inner")

    # 儲存結果
    merged_path = os.path.join(out_folder, f"{cell}_merged.tsv")
    merged.to_csv(merged_path, sep="\t", index=False)
    print(f"✅ Saved merged file: {merged_path} ({len(merged)} genes)")

    # 印出排序後的 chr 名稱
    if "chr" in merged.columns:
        unique_chrs = sorted(merged["chr"].unique(), key=chr_sort_key)
        print(f"🧬 {cell} unique chromosomes ({len(unique_chrs)}):")
        print(", ".join(unique_chrs))
    else:
        print(f"⚠️ Column 'chr' not found in merged file for {cell}!")

print("\n🎯 All cell lines merged successfully!")


# Add Marks z-score stat

In [None]:
import os
import json
import numpy as np
import pandas as pd
import pyBigWig
from tqdm import tqdm

# === 基本設定 ===
base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/ML4G_Project_1_Data"
merged_base = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/reference/0. raw_bigwig"
stats_path = "/Users/deweywang/Desktop/GitHub/Gene-expression-prediction/preprocessed_data/global_norm_stats.json"

marks = ["DNase", "H3K27ac", "H3K4me3", "H3K27me3", "H3K36me3", "H3K4me1", "H3K9me3"]
tss_window = 5000  # +/- 1kb 區域作為 promoter 區域
cells = [
    "X1",
    "X2",
    "X3",
]
# ============================================================
# 統一 gene reference
# ============================================================
ref_path = '../preprocessed_data/reference/reference_gene_table.tsv'
ref_genes = pd.read_csv(ref_path, sep="\t")
print(f"📖 Loaded reference table: {ref_path} ({len(ref_genes)} genes)")

# === 載入 global normalization 統計 ===
with open(stats_path, "r") as f:
    stats = json.load(f)

# === 輔助函式 ===
import numpy as np
from scipy.stats import kurtosis, skew
from numpy.fft import fft


import numpy as np
from scipy.stats import kurtosis, skew


def region_zsignal(bw, chrom, start, end, global_mean, global_std, mark_name=None, cell_name=None):
    """
    取出 bigWig 區域的多層級 z-score 特徵：
    1️⃣ Mean/Std/Min/Max/Diff
    2️⃣ Signal Gradient / Derivative Features
    3️⃣ Shape-based Descriptors (Kurtosis, Skewness)
    4️⃣ Entropy / Complexity
    5️⃣ Spatial Autocorrelation / Smoothness
    ⚙️ 所有 NaN / inf / 無效區域均會設為 0.0，保證輸出穩定。
    """

    feature_keys = [
        "z_mean", "z_std", "z_min", "z_max", "z_diff",
        "gradient_mean", "slope", "kurtosis", "skewness",
        "entropy", "autocorr", "laplacian"
    ]

    chroms = bw.chroms()

    # === 1️⃣ 染色體不存在 ===
    if chrom not in chroms:
        print(f"⚠️ [region_zsignal] Missing chromosome {chrom} in {mark_name or '?'} ({cell_name or '?'}) → fill 0.0")
        return {k: 0.0 for k in feature_keys}

    chrom_length = chroms[chrom]
    start = max(0, int(start))
    end = min(int(end), chrom_length)

    # === 2️⃣ 無效區域 ===
    if end <= start:
        print(f"⚠️ [region_zsignal] Invalid region {chrom}:{start}-{end} (end <= start) in {mark_name or '?'} → fill 0.0")
        return {k: 0.0 for k in feature_keys}

    # === 3️⃣ 取值 ===
    vals = np.array(bw.values(chrom, start, end, numpy=True))
    if vals is None or len(vals) == 0:
        print(f"⚠️ [region_zsignal] Empty values for {chrom}:{start}-{end} in {mark_name or '?'} → fill 0.0")
        return {k: 0.0 for k in feature_keys}

    vals = vals[~np.isnan(vals)]
    if len(vals) == 0:
        print(f"⚠️ [region_zsignal] All NaN values for {chrom}:{start}-{end} in {mark_name or '?'} → fill 0.0")
        return {k: 0.0 for k in feature_keys}

    # === 4️⃣ 計算統計量 ===
    local_mean = np.mean(vals)
    local_std  = np.std(vals)
    local_min  = np.min(vals)
    local_max  = np.max(vals)
    local_diff = local_max - local_min
    z_mean = local_mean
    z_std  = local_std 
    z_min  = local_min
    z_max  = local_max
    z_diff = local_diff

    # === Z-score normalization ===
    # z_mean = (local_mean - global_mean) / (global_std + 1e-8)
    # z_std  = local_std / (global_std + 1e-8)
    # z_min  = (local_min - global_mean) / (global_std + 1e-8)
    # z_max  = (local_max - global_mean) / (global_std + 1e-8)
    # z_diff = z_max - z_min

    # === 5️⃣ 處理 inf/nan 結果 ===
    for name in ["z_mean", "z_std", "z_min", "z_max", "z_diff"]:
        val = locals()[name]
        if not np.isfinite(val):
            print(f"⚠️ [region_zsignal] {name} not finite ({val}) for {chrom}:{start}-{end} in {mark_name or '?'} → set to 0")
            locals()[name] = 0.0

    # === 6️⃣ Signal Gradient / Slope ===
    if len(vals) > 1:
        diffs = np.diff(vals)
        gradient_mean = np.mean(np.abs(diffs)) if len(diffs) > 0 else 0.0
        x = np.arange(len(vals))
        try:
            slope = np.polyfit(x, vals, 1)[0]
        except Exception:
            slope = 0.0
    else:
        gradient_mean, slope = 0.0, 0.0

    # === 7️⃣ Shape-based Descriptors ===
    sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
    asymmetry = skew(vals) if len(vals) > 3 else 0.0

    # === 8️⃣ Entropy / Complexity ===
    p = np.abs(vals)
    if p.sum() == 0:
        local_entropy = 0.0
    else:
        p = p / (p.sum() + 1e-8)
        local_entropy = -np.sum(p * np.log2(p + 1e-8))

    # === 9️⃣ Spatial Autocorrelation / Smoothness ===
    if len(vals) > 2:
        try:
            autocorr = np.corrcoef(vals[:-1], vals[1:])[0, 1]
        except Exception:
            autocorr = 0.0
    else:
        autocorr = 0.0

    if len(vals) > 3:
        laplacian = np.mean(np.abs(vals[:-2] - 2 * vals[1:-1] + vals[2:]))
    else:
        laplacian = 0.0

    # === 10️⃣ 確保所有結果有限 ===
    result = {
        "z_mean": z_mean,
        "z_std": z_std,
        "z_min": z_min,
        "z_max": z_max,
        "z_diff": z_diff,
        "gradient_mean": gradient_mean,
        "slope": slope,
        "kurtosis": sharpness,
        "skewness": asymmetry,
        "entropy": local_entropy,
        "autocorr": autocorr,
        "laplacian": laplacian,
    }

    for k, v in result.items():
        if not np.isfinite(v):
            result[k] = 0.0

    return result


def get_tss_region(row, window=1000):
    """根據 strand (+/-) 給出 promoter (TSS ± window) 區域"""
    if row["strand"] == "+":
        start = max(0, row["TSS_start"] - window)
        end = row["TSS_end"] + window
    else:  # strand == "-"
        start = max(0, row["TSS_start"] - window)
        end = row["TSS_end"] + window
    return start, end


# === 主迴圈：對每個 cell line 做 ===
# === 主迴圈：對每個 cell line 做 ===
for cell in cells:


    print(f"\n🧬 Processing cell line: {cell}")
    genes = pd.read_csv(ref_path, sep="\t")


    # === 對每個 mark 做特徵提取 ===
    for mark in marks:
        bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bw")
        if not os.path.exists(bw_path):
            bw_path = os.path.join(base, f"{mark}-bigwig", f"{cell}.bigwig")
        if not os.path.exists(bw_path):
            print(f"⚠️ Missing {mark} ({cell})")
            continue

        print(f"\n📂 Reading {mark} ({cell}) ...")
        bw = pyBigWig.open(bw_path)
        chroms = bw.chroms()

        key = f"{mark}_{cell}"
        if key not in stats:
            print(f"⚠️ No global stat found for {key}")
            continue

        global_mean = stats[key]["mean"]
        global_std = stats[key]["std"]

        # === 建立空字典儲存所有 feature ===
        gene_features = {f"{mark}_gene_{k}": [] for k in [
            "z_mean", "z_std", "z_min", "z_max", "z_diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]}
        tss_features = {f"{mark}_tss_{k}": [] for k in [
            "z_mean", "z_std", "z_min", "z_max", "z_diff",
            "gradient_mean", "slope", "kurtosis", "skewness",
            "entropy", "autocorr", "laplacian"
        ]}

        # === 主迴圈：對每個基因計算 ===
        for _, row in tqdm(genes.iterrows(), total=len(genes)):
            chrom = row["chr"]
            if chrom not in chroms:
                for d in (gene_features, tss_features):
                    for k in d.keys():
                        d[k].append(np.nan)
                continue

            # === Gene body ===
            g_stats = region_zsignal(
                bw, chrom, row["gene_start"], row["gene_end"],
                global_mean, global_std,
                mark_name=mark, cell_name=cell
            )
            for k, v in g_stats.items():
                gene_features[f"{mark}_gene_{k}"].append(v)

            # === TSS / promoter ===
            tss_start, tss_end = get_tss_region(row, window=tss_window)
            t_stats = region_zsignal(
                bw, chrom, tss_start, tss_end,
                global_mean, global_std,
                mark_name=mark, cell_name=cell
            )
            for k, v in t_stats.items():
                tss_features[f"{mark}_tss_{k}"].append(v)

        bw.close()

        # === 合併進 DataFrame ===
        for feat_dict in (gene_features, tss_features):
            for col, vals in feat_dict.items():
                genes[col] = vals

    # === 輸出結果 ===
    out_path = os.path.join(merged_base, f"{cell}_zscore_dynamics.tsv")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    genes.to_csv(out_path, sep="\t", index=False)
    print(f"\n✅ Saved dynamic z-score features for {cell} → {out_path}")


📖 Loaded reference table: ../preprocessed_data/reference/reference_gene_table.tsv (18268 genes)

🧬 Processing cell line: X1

📂 Reading DNase (X1) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
100%|██████████| 18268/18268 [02:23<00:00, 126.89it/s]



📂 Reading H3K27ac (X1) ...


  sharpness = kurtosis(vals) if len(vals) > 3 else 0.0
  asymmetry = skew(vals) if len(vals) > 3 else 0.0
  c /= stddev[:, None]
 27%|██▋       | 4970/18268 [00:45<02:54, 76.37it/s] 

⚠️ [region_zsignal] All NaN values for chr10:133623895-133626795 in H3K27ac → fill 0.0


 52%|█████▏    | 9570/18268 [01:24<00:31, 278.03it/s]

⚠️ [region_zsignal] All NaN values for chr5:181367268-181368262 in H3K27ac → fill 0.0
⚠️ [region_zsignal] All NaN values for chr5:181362267-181372317 in H3K27ac → fill 0.0


 61%|██████    | 11098/18268 [01:38<00:53, 134.37it/s]

In [15]:
genes

Unnamed: 0,gene_name,chr,gene_start,gene_end,strand,TSS_start,TSS_end,DNase_gene_z_mean,DNase_gene_z_std,DNase_gene_z_min,...,H3K9me3_tss_z_min,H3K9me3_tss_z_max,H3K9me3_tss_z_diff,H3K9me3_tss_gradient_mean,H3K9me3_tss_slope,H3K9me3_tss_kurtosis,H3K9me3_tss_skewness,H3K9me3_tss_entropy,H3K9me3_tss_autocorr,H3K9me3_tss_laplacian
0,A1BG,chr19,58345178,58353492,-,58353079,58353129,0.038224,0.052898,-0.011556,...,-0.862413,0.349543,1.211956,0.001636,0.000245,0.918737,1.473681,9.792528,0.987182,0.003276
1,A1CF,chr10,50799409,50885675,-,50828411,50828461,-0.006141,0.009658,-0.011556,...,-0.862413,1.091401,1.953814,0.001417,-0.000194,2.807764,1.877420,9.583064,0.996891,0.002837
2,A2M,chr12,9067664,9116229,-,9106246,9106296,-0.004155,0.008992,-0.011556,...,-0.862413,1.091401,1.953814,0.002204,-0.000351,2.208680,1.418608,9.725735,0.991113,0.004411
3,A2ML1,chr12,8822621,8887001,+,8822471,8845003,-0.001246,0.014772,-0.011556,...,-0.911531,3.475869,4.387401,0.002441,-0.000003,13.754194,2.766256,14.273006,0.985209,0.004882
4,A4GALT,chr22,42692121,42721298,-,42695583,42721298,0.020935,0.062725,-0.011556,...,-0.898537,3.475869,4.374406,0.002737,0.000004,10.179984,2.511094,14.364174,0.986371,0.005405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18263,BRINP2,chr1,177170958,177282422,+,177256780,177256830,-0.000026,0.016728,-0.011556,...,-0.757819,1.311209,2.069027,0.007165,-0.000263,-0.797523,0.634981,9.748020,0.977535,0.014343
18264,PPIE,chr1,39692182,39763914,+,39738878,39738928,0.003701,0.026539,-0.011556,...,-0.911531,-0.503010,0.408521,0.000863,-0.000112,-0.498347,0.784071,9.643456,0.991571,0.001702
18265,CRB1,chr1,197268204,197478455,+,197268277,197268327,-0.004602,0.009601,-0.011556,...,-0.757819,2.744095,3.501914,0.004496,0.000492,4.504135,1.997466,9.799135,0.979204,0.009001
18266,TRIM33,chr1,114392790,114511203,-,114406940,114406967,0.000549,0.033520,-0.011556,...,-0.757819,1.091401,1.849220,0.004424,-0.000042,3.135016,1.222237,9.821344,0.957967,0.008646
