In [1]:
import os
import pandas as pd
from scripts.data_preprocessing import load_and_concat

# ============================================================
# Path configuration
# ============================================================
base = "../ML4G_Project_1_Data/CAGE-train/CAGE-train"

# Gene-info TSVs of both datasets, keyed as "<dataset>_<split>".
files = {
    "X1_train": os.path.join(base, "X1_train_info.tsv"),
    "X1_val":   os.path.join(base, "X1_val_info.tsv"),
    "X2_train": os.path.join(base, "X2_train_info.tsv"),
    "X2_val":   os.path.join(base, "X2_val_info.tsv"),
}

# ------------------------------------------------------------
# Stack train + val for each dataset
# ------------------------------------------------------------
df_X1 = load_and_concat(files["X1_train"], files["X1_val"])
df_X2 = load_and_concat(files["X2_train"], files["X2_val"])

# ------------------------------------------------------------
# Gene-annotation columns compared between X1 and X2
# ------------------------------------------------------------
cols_to_check = [
    "gene_name", "chr",
    "gene_start", "gene_end",
    "TSS_start", "TSS_end",
    "strand",
]

# Keep only the compared columns, on a fresh 0..n-1 index, so the
# comparison below does not depend on the original index labels.
df1 = df_X1.loc[:, cols_to_check].reset_index(drop=True)
df2 = df_X2.loc[:, cols_to_check].reset_index(drop=True)

print(f"X1 rows: {len(df1)}, X2 rows: {len(df2)}")


def _rows_only_in(left, right):
    """Return the rows of `left` that have no exact match in `right`.

    Rows are matched on every column in `cols_to_check`; the left-merge
    indicator column (`_merge`) is kept in the returned frame.
    """
    tagged = left.merge(right, on=cols_to_check, how="left", indicator=True)
    return tagged[tagged["_merge"] == "left_only"]


# ------------------------------------------------------------
# Are the two annotation tables exactly identical?
# ------------------------------------------------------------
# DataFrame.equals() compares the frames position by position, so the
# verdict is sensitive to row order as well as to the values themselves.
same_structure = df1.equals(df2)

if not same_structure:
    print("❌ Differences found between X1 and X2 gene annotations.")

    # An outer merge on all compared columns tags each distinct row with the
    # side(s) it occurs on; everything not tagged "both" is a discrepancy.
    merged = df1.merge(df2, on=cols_to_check, how="outer", indicator=True)
    diff = merged.loc[merged["_merge"] != "both"]

    print(f"⚠️ Number of differing entries: {len(diff)}")
    print("Sample differences:")
    print(diff.head(10))

    # Per-side breakdown: rows present in one table but absent from the other.
    missing_in_X2 = _rows_only_in(df1, df2)
    print(f"\n🧩 Genes in X1 but not in X2: {len(missing_in_X2)}")
    print(missing_in_X2.head())

    missing_in_X1 = _rows_only_in(df2, df1)
    print(f"\n🧩 Genes in X2 but not in X1: {len(missing_in_X1)}")
    print(missing_in_X1.head())
else:
    print("✅ X1 and X2 gene metadata are COMPLETELY identical.")


X1 rows: 16284, X2 rows: 16284
❌ Differences found between X1 and X2 gene annotations.
⚠️ Number of differing entries: 2890
Sample differences:
   gene_name    chr  gene_start   gene_end  TSS_start    TSS_end strand  \
3      A2ML1  chr12     8822621    8887001    8822471    8822521      +   
4      A2ML1  chr12     8822621    8887001    8844953    8845003      +   
5     A4GALT  chr22    42692121   42721298   42695583   42695633      -   
6     A4GALT  chr22    42692121   42721298   42721248   42721298      -   
15     AAMDC  chr11    77821109   77918432   77821108   77821158      +   
16     AAMDC  chr11    77821109   77918432   77821141   77821191      +   
26      AASS   chr7   122064583  122144308  122144205  122144255      -   
27      AASS   chr7   122064583  122144308  122144230  122144280      -   
33    ABCA12   chr2   214931542  215138626  215138354  215138404      -   
34    ABCA12   chr2   214931542  215138626  215138378  215138428      -   

        _merge  
3   right_onl

Only the `TSS_start` and `TSS_end` columns differ between X1 and X2; all other compared columns are identical.

In [2]:
import os
import pandas as pd
from scripts.data_preprocessing import pick_reference

# ============================================================
# Path configuration
# ============================================================
base = "../ML4G_Project_1_Data/CAGE-train/CAGE-train"

# One gene-info TSV per dataset/split, all located directly under `base`.
X1_train, X1_val, X2_train, X2_val, X3_test = (
    os.path.join(base, filename)
    for filename in (
        "X1_train_info.tsv",
        "X1_val_info.tsv",
        "X2_train_info.tsv",
        "X2_val_info.tsv",
        "X3_test_info.tsv",
    )
)

# ============================================================
# Load and concatenate splits
# ============================================================
def load_concat(train, val):
    """Read two tab-separated files and stack their rows into one DataFrame.

    Parameters
    ----------
    train, val : path-like
        Locations of the train and validation TSV files, in that order.

    Returns
    -------
    pandas.DataFrame
        Rows of `train` followed by rows of `val`, renumbered with a fresh
        0..n-1 index.
    """
    # NOTE(review): the first cell imports `load_and_concat` from
    # scripts.data_preprocessing, which looks like the same operation;
    # confirm and consider reusing it instead of this local copy.
    frames = [pd.read_csv(path, sep="\t") for path in (train, val)]
    return pd.concat(frames, ignore_index=True)

# Stack train + val for X1 and X2; X3 is read from its single test file.
df_X1 = load_concat(X1_train, X1_val)
df_X2 = load_concat(X2_train, X2_val)
df_X3 = pd.read_csv(X3_test, sep="\t")

print(f"✅ Loaded: X1({len(df_X1)}), X2({len(df_X2)}), X3({len(df_X3)})")

# ============================================================
# Align X1 and X2 on every annotation column except the TSS
# ============================================================
merge_keys = ["gene_name", "chr", "gene_start", "gene_end", "strand"]

# Non-key columns present in both tables receive the _X1 / _X2 suffixes, and
# the indicator adds a `_merge` column recording which side each row came from.
merged = pd.merge(
    df_X1,
    df_X2,
    how="outer",
    on=merge_keys,
    suffixes=("_X1", "_X2"),
    indicator=True,
)

print(f"🔗 Merged X1 & X2: {len(merged)} entries")

# ============================================================
# Reconcile the TSS coordinates of each merged row
# ============================================================
# pick_reference runs once per row and its result is joined back on the index.
# NOTE(review): presumably it returns the reconciled TSS_start / TSS_end for
# the row (they are selected right below) — confirm against
# scripts.data_preprocessing.
reconciled_tss = merged.apply(pick_reference, axis=1)
merged_ref = merged.join(reconciled_tss)

# ============================================================
# Unified X1+X2 reference: keys plus reconciled TSS, de-duplicated
# ============================================================
ref_columns = merge_keys + ["TSS_start", "TSS_end"]
df_ref = merged_ref.loc[:, ref_columns].drop_duplicates(ignore_index=True)

print(f"📘 Unified reference length (X1+X2): {len(df_ref)}")

# ============================================================
# Append the X3 rows below the unified reference
# ============================================================
# pd.concat aligns columns by name, so the different column order of the X3
# selection is harmless; the result follows df_ref's column order.
x3_columns = ["gene_name", "chr", "gene_start", "gene_end", "TSS_start", "TSS_end", "strand"]
df_X3_ref = df_X3[x3_columns]
df_final_ref = pd.concat([df_ref, df_X3_ref], ignore_index=True)

print(f"🧩 Final unified reference (with X3): {len(df_final_ref)}")

# ============================================================
# Write the final reference table to disk
# ============================================================
output_path = "../preprocessed_data/CAGE-merged/reference_gene_table.tsv"
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)  # create the target folder on first run
df_final_ref.to_csv(output_path, sep="\t", index=False)

print(f"✅ Saved reference table → {output_path}")


✅ Loaded: X1(16284), X2(16284), X3(1984)
🔗 Merged X1 & X2: 16284 entries
📘 Unified reference length (X1+X2): 16284
🧩 Final unified reference (with X3): 18268
✅ Saved reference table → ../preprocessed_data/CAGE-merged/reference_gene_table.tsv
