# Merge train+val splits and join info with y

In [None]:
import pandas as pd
import os
import re

# === Natural-sort key for chromosome names ===
def chr_sort_key(chr_name: str) -> int:
    """Return an integer key that orders chromosome names naturally.

    Args:
        chr_name: Chromosome / contig name, e.g. ``"chr7"`` or ``"chrX"``.

    Returns:
        The chromosome number for names of the exact form ``chr<digits>``,
        23 for ``"chrX"``, 24 for ``"chrY"``, and 100 for every other name
        (random, Un, etc.).
    """
    # fullmatch (not match) so that names which merely *start* with
    # "chr<digits>", such as "chr1_gl000191_random", are not mistaken for the
    # primary chromosome and fall through to the catch-all key instead.
    m = re.fullmatch(r"chr(\d+)", chr_name)
    if m:
        return int(m.group(1))
    if chr_name == "chrX":
        return 23
    if chr_name == "chrY":
        return 24
    return 100  # other contigs (random, Un, etc.)

# === Path configuration ===
base = r"C:\Users\wani\Desktop\Courses\ML for genomics\ML4G_Project_1_Data\CAGE-train\CAGE-train"
out_folder = r"C:\Users\wani\Desktop\Courses\ML for genomics\preprocessed_data\CAGE-merged"
os.makedirs(out_folder, exist_ok=True)


def _read_split_table(cell, split, kind):
    """Read ``<cell>_<split>_<kind>.tsv`` from ``base`` as a tab-separated table."""
    return pd.read_csv(os.path.join(base, f"{cell}_{split}_{kind}.tsv"), sep="\t")


# === Process X1, X2 ===
for cell in ("X1", "X2"):
    print(f"\n🔹 Processing {cell} ...")

    # Load the info tables and stack train on top of val with a fresh index.
    info_train = _read_split_table(cell, "train", "info")
    info_val = _read_split_table(cell, "val", "info")
    info_merged = pd.concat([info_train, info_val], ignore_index=True)

    # Load the y tables and stack them the same way.
    y_train = _read_split_table(cell, "train", "y")
    y_val = _read_split_table(cell, "val", "y")
    y_merged = pd.concat([y_train, y_val], ignore_index=True)

    # Normalise the y column names. When y does not have exactly two columns
    # it is treated as a bare value column, and gene names are copied over
    # from the info table (aligned on the row index).
    if y_merged.shape[1] != 2:
        y_merged.columns = ["gex"]
        y_merged.insert(0, "gene_name", info_merged["gene_name"])
    else:
        y_merged.columns = ["gene_name", "gex"]

    # Inner join: keep only genes present in both info and y.
    merged = info_merged.merge(y_merged, on="gene_name", how="inner")

    # Save the result.
    merged_path = os.path.join(out_folder, f"{cell}_merged.tsv")
    merged.to_csv(merged_path, sep="\t", index=False)
    print(f"✅ Saved merged file: {merged_path} ({len(merged)} genes)")

    # Print the chromosome names in natural order.
    if "chr" not in merged.columns:
        print(f"⚠️ Column 'chr' not found in merged file for {cell}!")
        continue

    unique_chrs = sorted(merged["chr"].unique(), key=chr_sort_key)
    print(f"🧬 {cell} unique chromosomes ({len(unique_chrs)}):")
    print(", ".join(unique_chrs))

print("\n🎯 All cell lines merged successfully!")
