In [1]:
# %%
# --- 1. Imports ---
import os, time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, average_precision_score
)

sns.set_theme(style="whitegrid")
print("‚úÖ Libraries loaded.")

# %%
# --- 2. Paths and Configuration ---

# Automatically detect project root
def find_contrastive_root():
    cwd = os.path.abspath(os.getcwd())
    parts = cwd.split(os.sep)
    for i in range(len(parts), 0, -1):
        candidate = os.sep.join(parts[:i])
        if os.path.basename(candidate) == "Contrastive_Learning":
            return candidate
    raise RuntimeError("‚ùå Could not find Contrastive_Learning root from current directory.")

CONTRASTIVE_ROOT = find_contrastive_root()
print(f"üìÇ Project root: {CONTRASTIVE_ROOT}")

ROOT_RESULTS = f"{CONTRASTIVE_ROOT}/files/results"
DATA_BASE     = f"{CONTRASTIVE_ROOT}/data/ASCOT"
OUT_DIR       = f"{ROOT_RESULTS}/../classification_eval"
os.makedirs(OUT_DIR, exist_ok=True)

# Ground truth label files (HIGH vs LOW exons)
GT_HIGH = f"{DATA_BASE}/variable_cassette_exons_with_binary_labels_HIGH.csv"
GT_LOW  = f"{DATA_BASE}/variable_cassette_exons_with_binary_labels_LOW.csv"

print(f"‚úÖ Ground Truth:\nHIGH ‚Üí {GT_HIGH}\nLOW  ‚Üí {GT_LOW}")

‚úÖ Libraries loaded.
üìÇ Project root: /gpfs/commons/home/atalukder/Contrastive_Learning
‚úÖ Ground Truth:
HIGH ‚Üí /gpfs/commons/home/atalukder/Contrastive_Learning/data/ASCOT/variable_cassette_exons_with_binary_labels_HIGH.csv
LOW  ‚Üí /gpfs/commons/home/atalukder/Contrastive_Learning/data/ASCOT/variable_cassette_exons_with_binary_labels_LOW.csv


In [2]:
# %%
# --- 3. Utility Functions ---

def logit(p):
    p = np.clip(p, 1e-6, 1 - 1e-6)
    return np.log(p / (1 - p))

def inv_logit(x):
    return 1 / (1 + np.exp(-x))

def compute_classification_metrics_per_exon(gt_file, pred_long_df, model_name, expression_type, txt_writer=None):
    """
    Compute per-exon classification metrics (AUROC, AUPRC, Accuracy, etc.)
    using Œîlogit(Œ®) predictions.
    HIGH exons ‚Üí label 1 if Œîlogit < 0 (under-expressed)
    LOW  exons ‚Üí label 1 if Œîlogit > 0 (over-expressed)
    """
    gt = pd.read_csv(gt_file)
    if "exon_id" not in gt.columns:
        raise ValueError("Ground truth file must contain 'exon_id'.")

    # Pivot predictions to wide format
    required = {"exon_id", "tissue", "pred_delta_logit"}
    if not required.issubset(pred_long_df.columns):
        raise ValueError(f"Missing columns in predictions: {required}")

    pred_wide = pred_long_df.pivot(index="exon_id", columns="tissue", values="pred_delta_logit").reset_index()

    # Match overlapping tissues
    tissue_cols = [c for c in gt.columns if c != "exon_id" and c in pred_wide.columns]
    if not tissue_cols:
        print(f"‚ö†Ô∏è No overlapping tissues for {model_name} ({expression_type})")
        return pd.DataFrame(), ""

    merged = pd.merge(gt, pred_wide, on="exon_id", suffixes=("_gt", "_pred"))

    results = []
    for _, row in merged.iterrows():
        y_true = row[tissue_cols].astype(int).values
        y_prob = row[[t for t in tissue_cols]].astype(float).values

        if len(set(y_true)) < 2:
            continue

        if expression_type.upper() == "HIGH":
            y_pred = (y_prob < 0).astype(int)
        else:
            y_pred = (y_prob > 0).astype(int)

        exon_metrics = {
            "model": model_name,
            "expression_type": expression_type,
            "exon_id": row["exon_id"],
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred, zero_division=0),
            "recall": recall_score(y_true, y_pred, zero_division=0),
            "f1": f1_score(y_true, y_pred, zero_division=0),
            "auroc": roc_auc_score(y_true, y_prob),
            "auprc": average_precision_score(y_true, y_prob),
        }
        results.append(exon_metrics)

    df = pd.DataFrame(results)
    if df.empty:
        msg = f"‚ö†Ô∏è No valid exons for {model_name} ({expression_type}).\n"
        print(msg)
        return df, msg

    mean_vals = df[["accuracy","precision","recall","f1","auroc","auprc"]].mean().round(4)
    summary = (
        f"\nüìä {model_name} ({expression_type} exons)\n"
        f"   n_exons   : {len(df)}\n" +
        "".join([f"   {k:<10}: {v:.4f}\n" for k,v in mean_vals.items()])
    )

    print(summary.strip())
    if txt_writer:
        txt_writer.write(summary)
    return df, summary


In [3]:
# %%
# --- 4. Load Your Predicted Œîlogit(Œ®) Files ---

# Example placeholders: replace with your actual model outputs
# Each should be a long-format DataFrame:
# exon_id | tissue | pred_delta_logit

def load_pred_delta_logit_tsv(path):
    df = pd.read_csv(path, sep="\t")
    if not {"exon_id"}.issubset(df.columns):
        raise ValueError(f"{path} missing exon_id column.")
    long_df = df.melt(id_vars=["exon_id"], var_name="tissue", value_name="pred_delta_logit")
    print(f"‚úÖ Loaded {len(long_df)} predictions from {os.path.basename(path)}")
    return long_df

# --- SOTA ---
SOTA_FILE = f"{ROOT_RESULTS}/mtsplice_originalTFweight_results/intron_300bp_results/variable_all_tissues_predicted_logit_delta.tsv"
sota_pred_delta_logit_long = load_pred_delta_logit_tsv(SOTA_FILE)
sota_name = "MTSplice_original_SOTA"

# --- Experiment 1 ---
result_file_name1 =  "exprmnt_2025_10_28__20_28_29" # EMPRAIPsi_200bpIntrons_mtspliceHyperparams_2025_10_28__20_28_29
exp1_name = '200bpIntrons_mtspliceHyperparams'
EXP1_FILE = f"{ROOT_RESULTS}/{result_file_name1}/ensemble_evaluation_from_valdiation/test_set_evaluation/tsplice_final_predictions_all_tissues.tsv"
exp1_pred_delta_logit_long = load_pred_delta_logit_tsv(EXP1_FILE)


# --- Experiment 2 ---
result_file_name2 =  "exprmnt_2025_10_28__20_12_58" # EMPRAIPsi_300bpIntrons_mtspliceHyperparams_2025_10_28__20_12_58
exp2_name  = '300bpIntrons_mtspliceHyperparams'
EXP2_FILE = f"{ROOT_RESULTS}/{result_file_name2}/ensemble_evaluation_from_valdiation/test_set_evaluation/tsplice_final_predictions_all_tissues.tsv"
exp2_pred_delta_logit_long = load_pred_delta_logit_tsv(EXP2_FILE)


‚úÖ Loaded 90776 predictions from variable_all_tissues_predicted_logit_delta.tsv
‚úÖ Loaded 66136 predictions from tsplice_final_predictions_all_tissues.tsv
‚úÖ Loaded 90776 predictions from tsplice_final_predictions_all_tissues.tsv


In [4]:
# %%
# --- 5. Evaluate All Models ---

summary_txt = f"{OUT_DIR}/classification_summary_{time.strftime('%Y_%m_%d__%H_%M_%S')}.txt"
with open(summary_txt, "w") as f:
    f.write("===== Œîlogit(Œ®) Classification Summary =====\n")
    for model_name, pred_long_df in [
        (sota_name, sota_pred_delta_logit_long),
        (exp1_name, exp1_pred_delta_logit_long),
        (exp2_name, exp2_pred_delta_logit_long),
    ]:
        f.write(f"\n===== {model_name} =====\n")
        df_high, _ = compute_classification_metrics_per_exon(GT_HIGH, pred_long_df, model_name, "HIGH", txt_writer=f)
        df_low, _  = compute_classification_metrics_per_exon(GT_LOW,  pred_long_df, model_name, "LOW",  txt_writer=f)
        df_high.to_csv(f"{OUT_DIR}/{model_name}_HIGH_classification.csv", index=False)
        df_low.to_csv(f"{OUT_DIR}/{model_name}_LOW_classification.csv", index=False)
print(f"‚úÖ Summary written to {summary_txt}")

KeyError: "None of [Index(['Retina - Eye', 'RPE/Choroid/Sclera - Eye', 'Subcutaneous - Adipose',\n       'Visceral (Omentum) - Adipose', 'Adrenal Gland', 'Aorta - Artery',\n       'Coronary - Artery', 'Tibial - Artery', 'Bladder', 'Amygdala - Brain',\n       'Anterior cingulate - Brain', 'Caudate nucleus - Brain',\n       'Cerebellar Hemisphere - Brain', 'Cerebellum - Brain', 'Cortex - Brain',\n       'Frontal Cortex - Brain', 'Hippocampus - Brain', 'Hypothalamus - Brain',\n       'Nucleus accumbens - Brain', 'Putamen - Brain',\n       'Spinal cord (C1) - Brain', 'Substantia nigra - Brain',\n       'Mammary Tissue - Breast', 'EBV-xform lymphocytes - Cells',\n       'Leukemia (CML) - Cells', 'Xform. fibroblasts - Cells',\n       'Ectocervix - Cervix', 'Endocervix - Cervix', 'Sigmoid - Colon',\n       'Transverse - Colon', 'Gastroesoph. Junc. - Esophagus',\n       'Mucosa - Esophagus', 'Muscularis - Esophagus', 'Fallopian Tube',\n       'Atrial Appendage - Heart', 'Left Ventricle - Heart', 'Cortex - Kidney',\n       'Liver', 'Lung', 'Minor Salivary Gland', 'Skeletal - Muscle',\n       'Tibial - Nerve', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate',\n       'Not Sun Exposed - Skin', 'Sun Exposed (Lower leg) - Skin',\n       'Ileum - Small Intestine', 'Spleen', 'Stomach', 'Testis', 'Thyroid',\n       'Uterus', 'Vagina', 'Whole Blood'],\n      dtype='object')] are in the [index]"

In [None]:
# %%
# --- 6. Build Summary Table ---

summary_rows = []
for model_name, pred_long_df in [
    (sota_name, sota_pred_delta_logit_long),
    (exp1_name, exp1_pred_delta_logit_long),
    (exp2_name, exp2_pred_delta_logit_long),
]:
    df_high, _ = compute_classification_metrics_per_exon(GT_HIGH, pred_long_df, model_name, "HIGH")
    df_low, _  = compute_classification_metrics_per_exon(GT_LOW,  pred_long_df, model_name, "LOW")
    summary_rows.append({
        "Model": model_name,
        "AUROC_HIGH": df_high["auroc"].mean(),
        "AUROC_LOW":  df_low["auroc"].mean(),
        "AUPRC_HIGH": df_high["auprc"].mean(),
        "AUPRC_LOW":  df_low["auprc"].mean(),
        "Accuracy_HIGH": df_high["accuracy"].mean(),
        "Accuracy_LOW":  df_low["accuracy"].mean()
    })

summary_df = pd.DataFrame(summary_rows).round(4)
display(summary_df)
summary_csv = f"{OUT_DIR}/classification_summary_table_{time.strftime('%Y_%m_%d__%H_%M_%S')}.csv"
summary_df.to_csv(summary_csv, index=False)
print(f"üìä Summary table saved ‚Üí {summary_csv}")

In [None]:
# %%
# --- 7. Plot AUROC and AUPRC Barplots ---

plt.figure(figsize=(10, 6))
sns.barplot(
    data=summary_df.melt(id_vars="Model", value_vars=["AUROC_HIGH", "AUROC_LOW"]),
    x="Model", y="value", hue="variable", palette="muted"
)
plt.title("Mean AUROC (Œîlogit Classification)")
plt.ylabel("AUROC")
plt.xlabel("")
plt.legend(title="Exon Type")
plt.tight_layout()
plt.savefig(f"{OUT_DIR}/auroc_barplot_{time.strftime('%Y_%m_%d__%H_%M_%S')}.png", dpi=300)
plt.show()

plt.figure(figsize=(10, 6))
sns.barplot(
    data=summary_df.melt(id_vars="Model", value_vars=["AUPRC_HIGH", "AUPRC_LOW"]),
    x="Model", y="value", hue="variable", palette="Set2"
)
plt.title("Mean AUPRC (Œîlogit Classification)")
plt.ylabel("AUPRC")
plt.xlabel("")
plt.legend(title="Exon Type")
plt.tight_layout()
plt.savefig(f"{OUT_DIR}/auprc_barplot_{time.strftime('%Y_%m_%d__%H_%M_%S')}.png", dpi=300)
plt.show()

print("‚úÖ AUROC and AUPRC plots generated.")