# Train Data Preprocessing

## Merge train+val, and merge info with y

In [14]:
import os
import pandas as pd
import numpy as np
from scipy.stats import rankdata

# ============================================================
# Basic configuration
# ============================================================
base_dir = "../ML4G_Project_1_Data/CAGE-train/CAGE-train"
output_dir = "../preprocessed_data/reference/0. data"

# The output folder must exist before any ranked file is written.
os.makedirs(output_dir, exist_ok=True)

cells = ["X1", "X2"]

# ============================================================
# Merge train + val targets, then rank-normalize "gex"
# ============================================================
for cell in cells:
    # --- Input and output paths for this cell ---
    train_path = os.path.join(base_dir, f"{cell}_train_y.tsv")
    val_path = os.path.join(base_dir, f"{cell}_val_y.tsv")
    ranked_out = os.path.join(output_dir, f"{cell}_y.tsv")

    # --- Load both splits (tab-separated) ---
    df_train = pd.read_csv(train_path, sep="\t")
    df_val = pd.read_csv(val_path, sep="\t")

    print(f"\n📘 {cell}: train = {len(df_train)}, val = {len(df_val)}")

    # --- Stack train on top of val with a fresh 0..n-1 index ---
    df_merged = pd.concat([df_train, df_val], ignore_index=True)
    print(f"✅ Total merged rows = {len(df_merged)}")

    # --- Rank normalization ---
    # Ties receive their average rank; dividing by the row count scales the
    # 1..n ranks into (0, 1], so a larger gex value gets a larger gex_rank.
    vals = df_merged["gex"].to_numpy()
    ranks = rankdata(vals, method="average")
    ranks = ranks / len(vals)
    df_merged = df_merged.assign(gex_rank=ranks)

    # --- Persist the result and show a preview ---
    df_merged.to_csv(ranked_out, sep="\t", index=False)
    print(f"✅ Saved ranked file → {ranked_out}")
    print(df_merged.head())



📘 X1: train = 14310, val = 1974
✅ Total merged rows = 16284
✅ Saved ranked file → ../preprocessed_data/reference/0. data/X1_y.tsv
  gene_name          gex  gex_rank
0   SLC20A1     0.000000  0.290131
1  C11orf58  2239.103328  0.997421
2    ZSCAN9    19.798064  0.800018
3      CD19   411.530623  0.973532
4   TMEM123    34.214129  0.833702

📘 X2: train = 14310, val = 1974
✅ Total merged rows = 16284
✅ Saved ranked file → ../preprocessed_data/reference/0. data/X2_y.tsv
  gene_name          gex  gex_rank
0   SLC20A1     0.000000  0.248066
1  C11orf58  2906.145340  0.998158
2    ZSCAN9     9.414950  0.756479
3      CD19     0.000000  0.248066
4   TMEM123    26.995929  0.818472
