# Merge X1 and X2 features with their y values

In [None]:
import os
import pandas as pd

# ============================================================
# Configuration
# ============================================================
# Directory holding the per-cell "<cell>_y.tsv" files.
y_dir = "../preprocessed_data/reference/0. data/"
# Directory holding the per-cell "<cell>_all_rank_features.tsv" files.
features_dir = "../preprocessed_data/reference/1. merged data/without_y_100_one_side"
# Directory the merged tables are written to (created if missing).
output_dir = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/"
os.makedirs(output_dir, exist_ok=True)

cells = ["X1", "X2"]


# ============================================================
# Helpers
# ============================================================
def merge_y_with_features(df_y, df_qn, cell):
    """Left-join the feature table onto the Y table by ``gene_name``.

    The Y table is the anchor: every Y row is kept, in its original order,
    and Y rows without a matching feature row get NaN feature columns.

    Args:
        df_y: DataFrame with a ``gene_name`` column (the anchor).
        df_qn: DataFrame with a ``gene_name`` column and feature columns.
        cell: Cell name, used only in the warning message.

    Returns:
        The merged DataFrame (columns of ``df_y`` followed by the
        non-key columns of ``df_qn``).

    Raises:
        pandas.errors.MergeError: (a ``ValueError`` subclass) if
            ``gene_name`` is duplicated in ``df_qn``; such duplicates would
            otherwise silently multiply the anchor rows.
    """
    # Surface genes that will end up with all-NaN features instead of
    # letting them pass through the left join unnoticed.
    n_unmatched = int((~df_y["gene_name"].isin(df_qn["gene_name"])).sum())
    if n_unmatched:
        print(
            f"[WARN] {cell}: {n_unmatched} gene(s) in Y have no feature row; "
            "their feature columns will be NaN"
        )

    # many_to_one: the feature table must have at most one row per gene.
    return pd.merge(
        df_y, df_qn, on="gene_name", how="left", validate="many_to_one"
    )


# ============================================================
# Main pipeline
# ============================================================
for cell in cells:
    y_path = os.path.join(y_dir, f"{cell}_y.tsv")
    qn_path = os.path.join(features_dir, f"{cell}_all_rank_features.tsv")
    out_path = os.path.join(output_dir, f"{cell}_all_rank_features_with_y.tsv")

    print(f"\n=== 🧬 {cell}: dense rank-normalized Y (0–1) + merge with features (QN) ===")

    # Skip (rather than fail) cells whose inputs have not been produced yet.
    if not os.path.exists(y_path):
        print(f"[WARN] Missing Y file: {y_path}")
        continue
    if not os.path.exists(qn_path):
        print(f"[WARN] Missing features (QN) file: {qn_path}")
        continue

    # --- Load inputs ---
    df_y = pd.read_csv(y_path, sep="\t")
    df_qn = pd.read_csv(qn_path, sep="\t")

    # --- Basic schema checks ---
    if "gene_name" not in df_y.columns or "gene_name" not in df_qn.columns:
        raise ValueError(f"Missing 'gene_name' column in inputs for cell: {cell}")
    if "gex" not in df_y.columns:
        raise ValueError(f"Missing 'gex' column in {cell}_y.tsv")

    print(f"[INFO] Loaded shapes: y={df_y.shape}, qn={df_qn.shape}")

    # --- Merge using Y as the anchor (validated left join) ---
    df_merged = merge_y_with_features(df_y, df_qn, cell)
    print(f"[INFO] Merged shape: {df_merged.shape}")

    # --- Save output ---
    df_merged.to_csv(out_path, sep="\t", index=False)
    print(f"[OK] Saved → {out_path}")



=== 🧬 X1: dense normalized rank (0–1) + merge with QN ===
📘 Loaded y=(16284, 3), qn=(16284, 2899)
✅ Merged shape: (16284, 2901)
💾 Saved → ../preprocessed_data/reference/1. merged data/with_y_100_one_side/X1_all_rank_features_with_y.tsv

=== 🧬 X2: dense normalized rank (0–1) + merge with QN ===
📘 Loaded y=(16284, 3), qn=(16284, 2899)
✅ Merged shape: (16284, 2901)
💾 Saved → ../preprocessed_data/reference/1. merged data/with_y_100_one_side/X2_all_rank_features_with_y.tsv


# Selecting features for the X3 test genes (the genes to predict)

In [None]:
import os
import pandas as pd

# ============================================================
# Configuration
# ============================================================
# Table listing the test genes; it is used as the anchor of the merge.
test_path = "../ML4G_Project_1_Data/CAGE-train/CAGE-train/X3_test_info.tsv"
# Directory holding the per-cell "<cell>_all_rank_features.tsv" files.
features_dir = "../preprocessed_data/reference/1. merged data/without_y_100_one_side"
# Directory the merged table is written to (created if missing).
output_dir = "../preprocessed_data/reference/1. merged data/with_y_100_one_side/"
os.makedirs(output_dir, exist_ok=True)

cells = ["X3"]


# ============================================================
# Helpers
# ============================================================
def merge_test_with_features(df_test, df_qn, cell):
    """Left-join the feature table onto the test-gene table by ``gene_name``.

    The test table is the anchor: every test row is kept, in its original
    order, and test rows without a matching feature row get NaN feature
    columns.

    Args:
        df_test: DataFrame with a ``gene_name`` column (the anchor).
        df_qn: DataFrame with a ``gene_name`` column and feature columns.
        cell: Cell name, used only in the warning message.

    Returns:
        The merged DataFrame (columns of ``df_test`` followed by the
        non-key columns of ``df_qn``).

    Raises:
        pandas.errors.MergeError: (a ``ValueError`` subclass) if
            ``gene_name`` is duplicated in ``df_qn``; such duplicates would
            otherwise silently multiply the test rows.
    """
    # Surface test genes that will end up with all-NaN features instead of
    # letting them pass through the left join unnoticed.
    n_unmatched = int((~df_test["gene_name"].isin(df_qn["gene_name"])).sum())
    if n_unmatched:
        print(
            f"[WARN] {cell}: {n_unmatched} test gene(s) have no feature row; "
            "their feature columns will be NaN"
        )

    # many_to_one: the feature table must have at most one row per gene.
    return pd.merge(
        df_test, df_qn, on="gene_name", how="left", validate="many_to_one"
    )


# ============================================================
# Main pipeline
# ============================================================
for cell in cells:
    qn_path = os.path.join(features_dir, f"{cell}_all_rank_features.tsv")
    out_path = os.path.join(output_dir, f"{cell}_test.tsv")

    print(f"\n=== 🧬 Selecting predicted genes for {cell} ===")

    # Skip (rather than fail) when an input has not been produced yet.
    if not os.path.exists(test_path):
        print(f"[WARN] Missing test file: {test_path}")
        continue
    if not os.path.exists(qn_path):
        print(f"[WARN] Missing features (QN) file: {qn_path}")
        continue

    # --- Load inputs ---
    df_test = pd.read_csv(test_path, sep="\t")
    df_qn = pd.read_csv(qn_path, sep="\t")

    # --- Basic schema check ---
    if "gene_name" not in df_test.columns or "gene_name" not in df_qn.columns:
        raise ValueError(f"Missing 'gene_name' column in inputs for cell: {cell}")

    print(f"[INFO] Loaded shapes: test={df_test.shape}, features={df_qn.shape}")

    # --- Validated left-merge using the test set as the anchor ---
    df_merged = merge_test_with_features(df_test, df_qn, cell)
    print(f"[INFO] Merged shape: {df_merged.shape}")

    # --- Save output TSV ---
    df_merged.to_csv(out_path, sep="\t", index=False)
    print(f"[OK] Saved → {out_path}")



=== 🧬 X3: dense normalized rank (0–1) + merge with QN ===
📘 Loaded y=(1984, 7), qn=(1984, 2914)
✅ Merged shape: (1984, 2920)
💾 Saved → ../preprocessed_data/reference/1. merged data/with_y_100_one_side/X3_test.tsv
