# Train Data Preprocessing

## Merge train + val, then merge info and y

In [None]:
import os
import pandas as pd

# ============================================================
# Path settings
# ============================================================
base = "../ML4G_Project_1_Data/CAGE-train/CAGE-train"

# One "<dataset>_<split>" entry per info table, in the order
# X1_train, X1_val, X2_train, X2_val.
files = {
    f"{dataset}_{split}": os.path.join(base, f"{dataset}_{split}_info.tsv")
    for dataset in ("X1", "X2")
    for split in ("train", "val")
}

# ============================================================
# Load and merge X1 / X2
# ============================================================
def load_and_concat(train_path, val_path):
    """Read two tab-separated files and stack them into one DataFrame.

    Parameters
    ----------
    train_path, val_path :
        Anything ``pd.read_csv`` accepts (path or file-like object). The
        rows of ``train_path`` come first, followed by those of ``val_path``.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows with a fresh 0..n-1 index.
    """
    frames = [pd.read_csv(source, sep="\t") for source in (train_path, val_path)]
    return pd.concat(frames, ignore_index=True)

# Stack the train and validation info tables of each dataset.
df_X1 = load_and_concat(train_path=files["X1_train"], val_path=files["X1_val"])
df_X2 = load_and_concat(train_path=files["X2_train"], val_path=files["X2_val"])

# ============================================================
# Columns to compare
# ============================================================
cols_to_check = [
    "gene_name",
    "chr",
    "gene_start",
    "gene_end",
    "TSS_start",
    "TSS_end",
    "strand",
]

# Keep only the compared columns, as independent copies with a clean index.
df1, df2 = (
    frame[cols_to_check].copy().reset_index(drop=True)
    for frame in (df_X1, df_X2)
)

print(f"X1 rows: {len(df1)}, X2 rows: {len(df2)}")

# ============================================================
# Check whether the rows are completely identical
# ============================================================
# Method 1: use DataFrame.equals() directly
same_structure = df1.equals(df2)

if not same_structure:
    print("❌ Differences found between X1 and X2 gene annotations.")

    # Count the differing entries and show where they differ:
    # an outer join flags every row that is not present on both sides.
    merged = df1.merge(df2, on=cols_to_check, how="outer", indicator=True)
    not_shared = merged["_merge"] != "both"
    diff = merged.loc[not_shared]

    print(f"⚠️ Number of differing entries: {len(diff)}")
    print("Sample differences:")
    print(diff.head(10))

    # To see which genes are missing on which side, anti-join in each direction.
    missing_in_X2 = df1.merge(
        df2, on=cols_to_check, how="left", indicator=True
    ).loc[lambda joined: joined["_merge"] == "left_only"]
    print(f"\n🧩 Genes in X1 but not in X2: {len(missing_in_X2)}")
    print(missing_in_X2.head())

    missing_in_X1 = df2.merge(
        df1, on=cols_to_check, how="left", indicator=True
    ).loc[lambda joined: joined["_merge"] == "left_only"]
    print(f"\n🧩 Genes in X2 but not in X1: {len(missing_in_X1)}")
    print(missing_in_X1.head())
else:
    print("✅ X1 and X2 gene metadata are COMPLETELY identical.")


X1 rows: 16284, X2 rows: 16284
✅ X1 and X2 gene metadata are COMPLETELY identical.


In [3]:
import os
import pandas as pd

# ============================================================
# Path settings
# ============================================================
base = "../ML4G_Project_1_Data/CAGE-train/CAGE-train"

# Every info table lives directly under `base` as "<stem>_info.tsv".
X1_train, X1_val, X2_train, X2_val, X3_test = (
    os.path.join(base, f"{stem}_info.tsv")
    for stem in ("X1_train", "X1_val", "X2_train", "X2_val", "X3_test")
)

# ============================================================
# Load and merge
# ============================================================
def load_concat(train, val):
    """Read the `train` and `val` TSV files and return them stacked.

    Both arguments are passed straight to ``pd.read_csv`` with a tab
    separator. The result holds the `train` rows followed by the `val`
    rows, re-indexed from 0.
    """
    train_rows = pd.read_csv(train, sep="\t")
    val_rows = pd.read_csv(val, sep="\t")
    stacked = pd.concat([train_rows, val_rows], ignore_index=True)
    return stacked

# X1 and X2 each come as train + val files; X3 is a single test file.
df_X1 = load_concat(train=X1_train, val=X1_val)
df_X2 = load_concat(train=X2_train, val=X2_val)
df_X3 = pd.read_csv(X3_test, sep="\t")

print(f"✅ Loaded: X1({len(df_X1)}), X2({len(df_X2)}), X3({len(df_X3)})")

# ============================================================
# Prepare the merge keys
# ============================================================
# The TSS columns are left out of the key on purpose: they get the
# _X1 / _X2 suffixes below so the two versions can be reconciled afterwards.
merge_keys = ["gene_name", "chr", "gene_start", "gene_end", "strand"]

# Outer join keeps genes that appear in only one of the two tables;
# the `_merge` indicator column records which side each row came from.
merged = df_X1.merge(
    df_X2, how="outer", on=merge_keys, suffixes=("_X1", "_X2"), indicator=True
)

print(f"🔗 Merged X1 & X2: {len(merged)} entries")

# ============================================================
# If the TSS differs between X1 and X2 -> unify it with min/max
# ============================================================
def _resolve_tss_bound(value_x1, value_x2, choose, column, gene):
    """Reduce the X1/X2 values of one TSS coordinate to a single int.

    Non-numeric or missing inputs are coerced to NaN and ignored. `choose`
    (``min`` or ``max``) picks among the values that remain.

    Raises
    ------
    ValueError
        If neither side provides a usable value.
    """
    candidates = [
        value
        for value in (
            pd.to_numeric(value_x1, errors="coerce"),
            pd.to_numeric(value_x2, errors="coerce"),
        )
        if pd.notna(value)
    ]
    if not candidates:
        # int(NaN) would raise an opaque "cannot convert float NaN to integer";
        # fail with the same exception type but say which gene/column is at fault.
        raise ValueError(
            f"{column} is missing or non-numeric in both X1 and X2 for gene {gene!r}"
        )
    return int(choose(candidates))


def pick_reference(row):
    """Pick one TSS window for a merged X1/X2 row.

    Parameters
    ----------
    row : mapping-like (e.g. a row Series from ``DataFrame.apply(axis=1)``)
        Read through ``row.get`` for the keys ``TSS_start_X1``,
        ``TSS_start_X2``, ``TSS_end_X1`` and ``TSS_end_X2``; ``gene_name``
        is read only to build error messages.

    Returns
    -------
    pandas.Series
        ``TSS_start`` (the smaller of the two starts) and ``TSS_end`` (the
        larger of the two ends), both as ``int``. When a gene is present on
        one side only, that side's value is used as-is.

    Raises
    ------
    ValueError
        If a coordinate is missing or non-numeric on both sides.
    """
    gene = row.get("gene_name")
    return pd.Series({
        "TSS_start": _resolve_tss_bound(
            row.get("TSS_start_X1"), row.get("TSS_start_X2"), min, "TSS_start", gene
        ),
        "TSS_end": _resolve_tss_bound(
            row.get("TSS_end_X1"), row.get("TSS_end_X2"), max, "TSS_end", gene
        ),
    })

# Compute the reconciled TSS window row by row and attach it to the merge result.
tss_bounds = merged.apply(pick_reference, axis=1)
merged_ref = merged.join(tss_bounds)

# ============================================================
# Build the unified reference
# ============================================================
reference_columns = [*merge_keys, "TSS_start", "TSS_end"]
df_ref = merged_ref[reference_columns].copy().drop_duplicates(ignore_index=True)

print(f"📘 Unified reference length (X1+X2): {len(df_ref)}")

# ============================================================
# Add X3 (plain concat)
# ============================================================
# X3 rows are appended unchanged; concat aligns columns by name, so the
# different column order of this selection does not matter.
x3_columns = ["gene_name", "chr", "gene_start", "gene_end", "TSS_start", "TSS_end", "strand"]
df_X3_ref = df_X3[x3_columns]
df_final_ref = pd.concat([df_ref, df_X3_ref], ignore_index=True)

print(f"🧩 Final unified reference (with X3): {len(df_final_ref)}")

# ============================================================
# Save
# ============================================================
output_path = "../preprocessed_data/CAGE-merged/reference_gene_table.tsv"

# Create the target folder on first run; a no-op when it already exists.
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

df_final_ref.to_csv(output_path, index=False, sep="\t")

print(f"✅ Saved reference table → {output_path}")


✅ Loaded: X1(16284), X2(16284), X3(1984)
🔗 Merged X1 & X2: 16284 entries
📘 Unified reference length (X1+X2): 16284
🧩 Final unified reference (with X3): 18268
✅ Saved reference table → ../preprocessed_data/CAGE-merged/reference_gene_table.tsv


In [5]:
# Sanity check: True if any cell of the saved reference table is missing.
df_final_ref.isna().to_numpy().any()


np.False_