# Merge the files extracted from BED and bigWig

In [None]:
import os
import pandas as pd

# ============================================================
# Configuration
# ============================================================
base_dir = "../preprocessed_data/reference/0. data"

# BED inputs (normalized)
bed_dir = os.path.join(base_dir, "bed100_one_side")
bed_norm_suffix = "logzscore"  # alternatives: "zscore", "raw"

# bigWig inputs (normalized)
bw_dir = os.path.join(base_dir, "bigwig100_one_side")
bw_norm_suffix = "logzscore"   # normalization suffix used in bigWig TSV file names

# Output directory for merged (features only, no Y)
output_dir = "../preprocessed_data/reference/1. merged data/without_y_100_one_side/"
os.makedirs(output_dir, exist_ok=True)

# Cells to process
cells = ["X1", "X2", "X3"]

# Columns to drop from BED (metadata retained in bigWig or not needed downstream)
drop_cols = ["chr", "gene_start", "gene_end", "TSS_start", "TSS_end", "strand"]


# ============================================================
# Helpers
# ============================================================
def merge_bed_bigwig(
    df_bed: pd.DataFrame, df_bw: pd.DataFrame, key: str = "gene_name"
) -> pd.DataFrame:
    """Outer-merge the BED and bigWig feature tables on ``key``.

    Rows whose key appears in only one table are kept; the columns coming
    from the other table are filled with NaN for those rows.

    Args:
        df_bed: BED-derived table containing a ``key`` column.
        df_bw: bigWig-derived table containing a ``key`` column.
        key: Name of the column to join on.

    Returns:
        The merged table, with ``df_bed`` columns first, then ``df_bw``.

    Raises:
        pandas.errors.MergeError: If ``key`` is duplicated in either table.
            Without this check a duplicated key would silently multiply rows
            in the merged output.
    """
    return pd.merge(df_bed, df_bw, on=key, how="outer", validate="one_to_one")


# ============================================================
# Merge per cell line
# ============================================================
for cell in cells:
    # Input/output file names are built from the cell name and the suffixes.
    bed_path = os.path.join(bed_dir, f"{cell}_{bed_norm_suffix}.tsv")
    bw_path = os.path.join(bw_dir, f"{cell}_{bw_norm_suffix}.tsv")
    out_path = os.path.join(output_dir, f"{cell}_all_{bed_norm_suffix}_{bw_norm_suffix}.tsv")

    print(f"\n=== 🧬 {cell}: merging {bed_norm_suffix} BED + {bw_norm_suffix} bigWig ===")

    # A missing input only skips this cell; the remaining cells still run.
    if not os.path.exists(bed_path):
        print(f"[WARN] Missing BED file: {bed_path}")
        continue
    if not os.path.exists(bw_path):
        print(f"[WARN] Missing bigWig file: {bw_path}")
        continue

    # --- Load inputs ---
    df_bed = pd.read_csv(bed_path, sep="\t")
    df_bw = pd.read_csv(bw_path, sep="\t")

    # --- Basic schema check: the merge key must exist in both tables ---
    if "gene_name" not in df_bed.columns or "gene_name" not in df_bw.columns:
        raise ValueError(f"Missing 'gene_name' column in inputs for cell: {cell}")

    print(f"[INFO] Loaded shapes: bed={df_bed.shape}, bigwig={df_bw.shape}")

    # --- Drop unwanted BED metadata columns (keep features) ---
    # errors="ignore" tolerates listed columns that are absent from the file.
    df_bed = df_bed.drop(columns=drop_cols, errors="ignore")

    # --- Outer-merge on gene_name to retain all entries ---
    # Fails fast (MergeError) if gene_name is not unique in both inputs.
    df_merged = merge_bed_bigwig(df_bed, df_bw)
    print(f"[INFO] Merged shape: {df_merged.shape}")

    # --- Save merged TSV ---
    df_merged.to_csv(out_path, sep="\t", index=False)
    print(f"[OK] Saved → {out_path}")



=== 🧬 X1: merging logzscore bed + logzscore bigwig ===
📘 Loaded bed=(18268, 511), bigwig=(18268, 175)
✅ Merged shape: (18268, 679)
💾 Saved → ../preprocessed_data/reference/1. merged data/without_y_100_one_side/X1_all_logzscore_logzscore.tsv

=== 🧬 X2: merging logzscore bed + logzscore bigwig ===
📘 Loaded bed=(18268, 511), bigwig=(18268, 175)
✅ Merged shape: (18268, 679)
💾 Saved → ../preprocessed_data/reference/1. merged data/without_y_100_one_side/X2_all_logzscore_logzscore.tsv

=== 🧬 X3: merging logzscore bed + logzscore bigwig ===
📘 Loaded bed=(18268, 511), bigwig=(18268, 175)
✅ Merged shape: (18268, 679)
💾 Saved → ../preprocessed_data/reference/1. merged data/without_y_100_one_side/X3_all_logzscore_logzscore.tsv
