In [None]:
# Import libraries that are required to run your project
# You are allowed to add more libraries as you need

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

# Train Data Preprocessing

## Merge train + val splits, then join gene info with expression values (y)

In [13]:
import pandas as pd
import os
import re

# === Natural-sort key for chromosome names ===
def chr_sort_key(chr_name):
    """Return an integer key that sorts chromosome names in natural order.

    Parameters
    ----------
    chr_name : str
        Chromosome or contig name, e.g. ``"chr7"``, ``"chrX"`` or
        ``"chr17_GL000205v2_random"``.

    Returns
    -------
    int
        The chromosome number for names of the exact form ``chr<digits>``,
        23 for ``chrX``, 24 for ``chrY`` and 100 for every other name.
    """
    # fullmatch, not match: a prefix-only match would hand
    # "chr17_GL000205v2_random" the key 17 instead of sending it to the end.
    m = re.fullmatch(r"chr(\d+)", chr_name)
    if m:
        return int(m.group(1))
    elif chr_name == "chrX":
        return 23
    elif chr_name == "chrY":
        return 24
    else:
        return 100  # other contigs (random, Un, etc.)

# === Path configuration ===
# NOTE(review): absolute, machine-specific paths; edit them before running elsewhere.
base = r"C:\Users\wani\Desktop\Courses\ML for genomics\ML4G_Project_1_Data\CAGE-train\CAGE-train"
out_folder = r"C:\Users\wani\Desktop\Courses\ML for genomics\preprocessed_data\CAGE-merged"
os.makedirs(out_folder, exist_ok=True)

# === Process X1 and X2 ===
# For each cell line: stack train + val rows, attach the expression value to
# the gene info, write one merged TSV and list the chromosomes it contains.
for cell in ["X1", "X2"]:
    print(f"\n🔹 Processing {cell} ...")

    # Gene info: train rows followed by validation rows
    info_train = pd.read_csv(os.path.join(base, f"{cell}_train_info.tsv"), sep="\t")
    info_val = pd.read_csv(os.path.join(base, f"{cell}_val_info.tsv"), sep="\t")
    info_merged = pd.concat([info_train, info_val], ignore_index=True)

    # Expression values (y), stacked in the same train-then-val order
    y_train = pd.read_csv(os.path.join(base, f"{cell}_train_y.tsv"), sep="\t")
    y_val = pd.read_csv(os.path.join(base, f"{cell}_val_y.tsv"), sep="\t")
    y_merged = pd.concat([y_train, y_val], ignore_index=True)

    if y_merged.shape[1] == 2:
        # Two columns are taken to be (gene name, expression value)
        y_merged.columns = ["gene_name", "gex"]
    else:
        # No gene-name column: borrow it from the info table; both frames
        # carry the same fresh 0..n-1 index because of ignore_index=True
        y_merged.columns = ["gex"]
        y_merged.insert(0, "gene_name", info_merged["gene_name"])

    merged = pd.merge(info_merged, y_merged, on="gene_name", how="inner")

    # An inner join drops genes without a partner and multiplies rows for
    # repeated gene names without any error, so surface a changed row count.
    if len(merged) != len(info_merged):
        print(
            f"⚠️ {cell}: merged table has {len(merged)} rows but info has "
            f"{len(info_merged)}; some gene_name values are unmatched or duplicated"
        )

    # Save the result
    merged_path = os.path.join(out_folder, f"{cell}_merged.tsv")
    merged.to_csv(merged_path, sep="\t", index=False)
    print(f"✅ Saved merged file: {merged_path} ({len(merged)} genes)")

    # Print the chromosome names in natural order
    if "chr" in merged.columns:
        unique_chrs = sorted(merged["chr"].unique(), key=chr_sort_key)
        print(f"🧬 {cell} unique chromosomes ({len(unique_chrs)}):")
        print(", ".join(unique_chrs))
    else:
        print(f"⚠️ Column 'chr' not found in merged file for {cell}!")

print("\n🎯 All cell lines merged successfully!")



🔹 Processing X1 ...
✅ Saved merged file: C:\Users\wani\Desktop\Courses\ML for genomics\preprocessed_data\CAGE-merged\X1_merged.tsv (16284 genes)
🧬 X1 unique chromosomes (21):
chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22

🔹 Processing X2 ...
✅ Saved merged file: C:\Users\wani\Desktop\Courses\ML for genomics\preprocessed_data\CAGE-merged\X2_merged.tsv (16284 genes)
🧬 X2 unique chromosomes (21):
chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22

🎯 All cell lines merged successfully!


## Split the dataset

Split the H3K4me1, H3K4me3, H3K27ac, H3K27me3, H3K36me3, H3K9me3, chromatin accessibility (DNase), gene expression, and gene information files by cell line and chromosome.

In [None]:
import os
import pandas as pd

# === Basic configuration ===
# NOTE(review): absolute, machine-specific path; edit before running elsewhere.
base_data = r"C:\Users\wani\Desktop\Courses\ML for genomics\ML4G_Project_1_Data"

marks = [
    "DNase", "H3K27me3", "H3K4me1",
    "H3K4me3", "H3K27ac", "H3K36me3", "H3K9me3"
]

# UCSC narrowPeak column names
bed_cols = [
    "chrom", "chromStart", "chromEnd", "name", "score",
    "strand", "signalValue", "pValue", "qValue", "peak"
]

# Chromosome set observed for each mark
chrom_sets = {}

# === Scan the X1 BED file of every mark ===
for mark in marks:
    bed_path = os.path.join(base_data, f"{mark}-bed", "X1.bed")
    if not os.path.exists(bed_path):
        print(f"⚠️ Missing file: {bed_path}")
        continue

    # Only the chromosome column (field 0) is needed, so read just that
    # column in a single pass. This replaces a probe read plus a full parse,
    # and it stays correct for files with more fields than bed_cols has names
    # (a too-short `names` list makes pandas use the leading fields as index).
    bed = pd.read_csv(bed_path, sep="\t", header=None, usecols=[0])
    bed.columns = [bed_cols[0]]
    chroms = sorted(bed["chrom"].unique())

    chrom_sets[mark] = set(chroms)
    print(f"📂 {mark}: {len(chroms)} chromosomes found")

# === Union and intersection across marks ===
if len(chrom_sets) == 0:
    print("\n❌ No BED files found.")
else:
    all_chroms = sorted(set.union(*chrom_sets.values()))
    common_chroms = sorted(set.intersection(*chrom_sets.values()))

    print("\n🧬 Unique chromosomes across ALL marks (union):")
    for i, chrom in enumerate(all_chroms, 1):
        print(f"{i:>2}. {chrom}")
    print(f"\n✅ Total unique chromosomes found: {len(all_chroms)}")

    print("\n🔗 Chromosomes PRESENT in ALL marks (intersection):")
    for i, chrom in enumerate(common_chroms, 1):
        print(f"{i:>2}. {chrom}")
    print(f"\n✅ Total common chromosomes across all marks: {len(common_chroms)}")


📂 DNase: 24 chromosomes found
📂 H3K27me3: 24 chromosomes found
📂 H3K4me1: 26 chromosomes found
📂 H3K4me3: 30 chromosomes found
📂 H3K27ac: 30 chromosomes found
📂 H3K36me3: 27 chromosomes found
📂 H3K9me3: 30 chromosomes found

🧬 Unique chromosomes across ALL marks (union):
 1. chr1
 2. chr10
 3. chr11
 4. chr12
 5. chr13
 6. chr14
 7. chr15
 8. chr16
 9. chr17
10. chr17_GL000205v2_random
11. chr17_KI270729v1_random
12. chr18
13. chr19
14. chr1_KI270713v1_random
15. chr2
16. chr20
17. chr21
18. chr22
19. chr22_KI270733v1_random
20. chr22_KI270736v1_random
21. chr3
22. chr4
23. chr5
24. chr5_GL000208v1_random
25. chr6
26. chr7
27. chr8
28. chr9
29. chrEBV
30. chrUn_GL000195v1
31. chrUn_GL000216v2
32. chrUn_GL000219v1
33. chrUn_GL000220v1
34. chrUn_GL000224v1
35. chrUn_KI270336v1
36. chrUn_KI270435v1
37. chrUn_KI270591v1
38. chrX
39. chrY

✅ Total unique chromosomes found: 39

🔗 Chromosomes PRESENT in ALL marks (intersection):
 1. chr1
 2. chr10
 3. chr11
 4. chr12
 5. chr13
 6. chr14
 7. c

In [19]:
import os
import pandas as pd

# === Basic configuration ===
# NOTE(review): absolute, machine-specific paths; edit before running elsewhere.
base_data = r"C:\Users\wani\Desktop\Courses\ML for genomics\ML4G_Project_1_Data"
merged_base = r"C:\Users\wani\Desktop\Courses\ML for genomics\preprocessed_data\CAGE-merged"
output_root = r"C:\Users\wani\Desktop\Courses\ML for genomics\preprocessed_data\chromosomes\train"

marks = [
    "DNase", "H3K27me3", "H3K4me1",
    "H3K4me3", "H3K27ac", "H3K36me3", "H3K9me3"
]

# Target chromosomes: chr2 .. chr22
target_chroms = [f"chr{i}" for i in range(2, 23)]

# === Create the root output folder ===
os.makedirs(output_root, exist_ok=True)

# === Main loop: X1 and X2 ===
# Output layout: <output_root>/<chrom>/<cell>/ holds one gene TSV plus one
# BED file per mark.
for cell in ["X1", "X2"]:
    print(f"\n🔹 Processing {cell} ...")

    # === Step 1. Split the merged gene-expression table ===
    merged_tsv = os.path.join(merged_base, f"{cell}_merged.tsv")
    if not os.path.exists(merged_tsv):
        print(f"⚠️ Missing merged TSV for {cell}: {merged_tsv}")
        continue

    genes = pd.read_csv(merged_tsv, sep="\t")

    if "chr" not in genes.columns:
        raise ValueError(f"❌ Missing 'chr' column in {cell}_merged.tsv!")

    present_chrs = sorted(genes["chr"].unique())
    missing_chrs = sorted(set(target_chroms) - set(present_chrs))

    print(f"🧬 Found chromosomes in {cell}_merged.tsv: {', '.join(present_chrs)}")
    if missing_chrs:
        print(f"⚠️ Missing from {cell}_merged.tsv: {', '.join(missing_chrs)}")
    else:
        print("✅ All target chromosomes present.")

    # Keep chr2-chr22 only
    genes_filtered = genes[genes["chr"].isin(target_chroms)]

    # One gene file per chromosome
    for chrom, df_sub in genes_filtered.groupby("chr"):
        chr_cell_dir = os.path.join(output_root, chrom, cell)
        os.makedirs(chr_cell_dir, exist_ok=True)
        out_path = os.path.join(chr_cell_dir, f"{cell}_genes_{chrom}.tsv")
        df_sub.to_csv(out_path, sep="\t", index=False)
        print(f"✅ Saved {cell} gene file for {chrom}: {len(df_sub)} records")

    # === Step 2. Split the histone-mark / DNase BED files ===
    for mark in marks:
        bed_path = os.path.join(base_data, f"{mark}-bed", f"{cell}.bed")
        if not os.path.exists(bed_path):
            print(f"⚠️ Skipping {mark} ({cell}) — missing file: {bed_path}")
            continue

        print(f"\n📂 Processing {mark} ({cell}) ...")

        # Read the headerless file once; columns come back labelled 0..n-1.
        # Renaming afterwards (instead of passing a truncated `names` list)
        # avoids a second read and keeps every field as a data column even
        # when the file has more fields than there are names below.
        bed = pd.read_csv(bed_path, sep="\t", header=None)
        ncols = bed.shape[1]
        bed_cols = ["chrom", "start", "end", "name", "score",
                    "strand", "signal", "pval", "qval", "summit"][:ncols]
        bed = bed.rename(columns=dict(enumerate(bed_cols)))

        # Report which target chromosomes the file covers
        present_chrs = sorted(bed["chrom"].unique())
        missing_chrs = sorted(set(target_chroms) - set(present_chrs))
        print(f"   ➤ Found chromosomes: {', '.join(present_chrs)}")
        if missing_chrs:
            print(f"   ⚠️ Missing target chromosomes: {', '.join(missing_chrs)}")
        else:
            print("   ✅ All target chromosomes present.")

        # Keep chr2-chr22 only
        bed_filtered = bed[bed["chrom"].isin(target_chroms)]

        # One BED file per chromosome, written without a header row
        for chrom, df_sub in bed_filtered.groupby("chrom"):
            chr_cell_dir = os.path.join(output_root, chrom, cell)
            os.makedirs(chr_cell_dir, exist_ok=True)
            out_path = os.path.join(chr_cell_dir, f"{mark}_{cell}_{chrom}.bed")
            df_sub.to_csv(out_path, sep="\t", header=False, index=False)

        print(f"✅ Split {mark} ({cell}) into {len(bed_filtered['chrom'].unique())} chromosomes.")

print("\n🎯 All gene + BED files (X1 & X2, chr2–chr22) processed successfully!")



🔹 Processing X1 ...
🧬 Found chromosomes in X1_merged.tsv: chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9
✅ All target chromosomes present.
✅ Saved X1 gene file for chr10: 709 records
✅ Saved X1 gene file for chr11: 1259 records
✅ Saved X1 gene file for chr12: 994 records
✅ Saved X1 gene file for chr13: 312 records
✅ Saved X1 gene file for chr14: 591 records
✅ Saved X1 gene file for chr15: 569 records
✅ Saved X1 gene file for chr16: 805 records
✅ Saved X1 gene file for chr17: 1144 records
✅ Saved X1 gene file for chr18: 260 records
✅ Saved X1 gene file for chr19: 1383 records
✅ Saved X1 gene file for chr2: 1203 records
✅ Saved X1 gene file for chr20: 526 records
✅ Saved X1 gene file for chr21: 209 records
✅ Saved X1 gene file for chr22: 423 records
✅ Saved X1 gene file for chr3: 1036 records
✅ Saved X1 gene file for chr4: 738 records
✅ Saved X1 gene file for chr5: 848 records
✅ Saved X1 gene file

# Test Data

In [17]:
import pandas as pd
import re

# === Path configuration ===
# Location of the X3 test-set gene info table (absolute, machine-specific path).
path = r"C:\Users\wani\Desktop\Courses\ML for genomics\ML4G_Project_1_Data\CAGE-train\CAGE-train\X3_test_info.tsv"

# === Natural-sort key (chr1 -> chr22 -> chrX -> chrY -> everything else) ===
def chr_sort_key(chr_name):
    """Return an integer key that sorts chromosome names in natural order.

    Parameters
    ----------
    chr_name : str
        Chromosome or contig name, e.g. ``"chr1"``, ``"chrX"`` or
        ``"chrUn_GL000195v1"``.

    Returns
    -------
    int
        The chromosome number for names of the exact form ``chr<digits>``,
        23 for ``chrX``, 24 for ``chrY`` and 100 for every other name.
    """
    # fullmatch, not match: with a prefix-only match a contig such as
    # "chr1_KI270713v1_random" would be keyed as 1 rather than placed last.
    m = re.fullmatch(r"chr(\d+)", chr_name)
    if m:
        return int(m.group(1))
    elif chr_name == "chrX":
        return 23
    elif chr_name == "chrY":
        return 24
    else:
        return 100  # everything else goes last (e.g. chrUn, chrM, random)

# === Load the X3 test-set gene table ===
X3_test_info = pd.read_csv(path, sep="\t")

# Stop early when the chromosome column is absent
if not ("chr" in X3_test_info.columns):
    raise ValueError("❌ Missing 'chr' column in X3_test_info.tsv!")

# === Distinct chromosome names, ordered with chr_sort_key ===
unique_chrs = list(X3_test_info["chr"].unique())
unique_chrs.sort(key=chr_sort_key)

# === Report ===
print("🧬 Unique chromosomes in X3_test_info.tsv:")
print(", ".join(unique_chrs))
print(f"\n✅ Total unique chromosomes: {len(unique_chrs)}")


🧬 Unique chromosomes in X3_test_info.tsv:
chr1

✅ Total unique chromosomes: 1


In [20]:
import os
import pandas as pd

# === Basic configuration ===
# NOTE(review): absolute, machine-specific paths; edit before running elsewhere.
base_data = r"C:\Users\wani\Desktop\Courses\ML for genomics\ML4G_Project_1_Data"
x3_info_path = r"C:\Users\wani\Desktop\Courses\ML for genomics\ML4G_Project_1_Data\CAGE-train\CAGE-train\X3_test_info.tsv"
output_root = r"C:\Users\wani\Desktop\Courses\ML for genomics\preprocessed_data\chromosomes\test"

marks = [
    "DNase", "H3K27me3", "H3K4me1",
    "H3K4me3", "H3K27ac", "H3K36me3", "H3K9me3"
]

target_chrom = "chr1"  # keep chr1 only
cell = "X3"

# === Create the output folder: <output_root>/<chrom>/<cell>/ ===
chr_cell_dir = os.path.join(output_root, target_chrom, cell)
os.makedirs(chr_cell_dir, exist_ok=True)

# === Step 1. Process X3_test_info.tsv ===
print(f"\n🔹 Processing {cell} gene info ...")

info = pd.read_csv(x3_info_path, sep="\t")

if "chr" not in info.columns:
    raise ValueError("❌ Missing 'chr' column in X3_test_info.tsv!")

unique_chrs = sorted(info["chr"].unique())
print(f"🧬 Chromosomes found in {cell}_test_info.tsv: {', '.join(unique_chrs)}")

# Keep the target chromosome only
info_chr1 = info[info["chr"] == target_chrom]

if info_chr1.empty:
    print("⚠️ No records found for chr1 in test info file!")
else:
    out_path = os.path.join(chr_cell_dir, f"{cell}_genes_{target_chrom}.tsv")
    info_chr1.to_csv(out_path, sep="\t", index=False)
    print(f"✅ Saved {cell} gene file for {target_chrom}: {len(info_chr1)} records")

# === Step 2. Process the histone-mark / DNase BED files ===
for mark in marks:
    bed_path = os.path.join(base_data, f"{mark}-bed", f"{cell}.bed")
    if not os.path.exists(bed_path):
        print(f"⚠️ Skipping {mark} ({cell}) — missing file: {bed_path}")
        continue

    print(f"\n📂 Processing {mark} ({cell}) ...")

    # Read the headerless file once; columns come back labelled 0..n-1.
    # Renaming afterwards (instead of passing a truncated `names` list)
    # avoids a second read and keeps every field as a data column even
    # when the file has more fields than there are names below.
    bed = pd.read_csv(bed_path, sep="\t", header=None)
    ncols = bed.shape[1]
    bed_cols = ["chrom", "start", "end", "name", "score",
                "strand", "signal", "pval", "qval", "summit"][:ncols]
    bed = bed.rename(columns=dict(enumerate(bed_cols)))

    # Skip marks that have no peaks on the target chromosome
    if target_chrom not in bed["chrom"].unique():
        print(f"⚠️ {mark} ({cell}) missing {target_chrom}!")
        continue

    # Keep the target chromosome only and write it without a header row
    bed_chr1 = bed[bed["chrom"] == target_chrom]
    out_path = os.path.join(chr_cell_dir, f"{mark}_{cell}_{target_chrom}.bed")
    bed_chr1.to_csv(out_path, sep="\t", header=False, index=False)
    print(f"✅ Saved {mark}_{cell}_{target_chrom}.bed ({len(bed_chr1)} peaks)")

print(f"\n🎯 All {cell} BED and gene files (chr1 only) processed successfully!")



🔹 Processing X3 gene info ...
🧬 Chromosomes found in X3_test_info.tsv: chr1
✅ Saved X3 gene file for chr1: 1984 records

📂 Processing DNase (X3) ...
✅ Saved DNase_X3_chr1.bed (6937 peaks)

📂 Processing H3K27me3 (X3) ...
✅ Saved H3K27me3_X3_chr1.bed (7378 peaks)

📂 Processing H3K4me1 (X3) ...
✅ Saved H3K4me1_X3_chr1.bed (9852 peaks)

📂 Processing H3K4me3 (X3) ...
✅ Saved H3K4me3_X3_chr1.bed (3305 peaks)

📂 Processing H3K27ac (X3) ...
✅ Saved H3K27ac_X3_chr1.bed (5010 peaks)

📂 Processing H3K36me3 (X3) ...
✅ Saved H3K36me3_X3_chr1.bed (7018 peaks)

📂 Processing H3K9me3 (X3) ...
✅ Saved H3K9me3_X3_chr1.bed (2101 peaks)

🎯 All X3 BED and gene files (chr1 only) processed successfully!
