In [1]:
# %%
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# exon-specific Δlogit helpers
from helper import (
    load_and_align_for_delta_logit, pull_vectors_from_row,
    delta_logit_scores, curve_score_from_dlogit, hard_preds_from_dlogit,
    binary_metrics, find_contrastive_root, get_prediction_file, pull_pm1_vectors_from_row,
    pull_pm1_vectors_from_row_predfilter, extract_valid_vectors,
)
import time

# --- Detect and set CONTRASTIVE_ROOT environment variable ---
# The project root is resolved by the helper and exported through the
# environment under the CONTRASTIVE_ROOT key.
root_path = str(find_contrastive_root())
os.environ["CONTRASTIVE_ROOT"] = root_path
print(f"‚úÖ CONTRASTIVE_ROOT set to: {root_path}")

# --- Root Paths ---
# ROOT_RESULTS: directory passed to get_prediction_file() to locate a run's predictions.
# DATA_BASE: directory holding the ground-truth CSVs referenced by later cells.
ROOT_RESULTS = f"{root_path}/files/results"
DATA_BASE = f"{root_path}/data/TS_data/tabula_sapiens/final_data"

# --- Ground Truth Binary Files ---
# (disabled: HIGH / LOW tissue-binarised label files; kept for reference)
# GT_HIGH = f"{DATA_BASE}/variable_cassette_exons_with_binary_labels_HIGH_TissueBinPsi.csv"
# GT_LOW  = f"{DATA_BASE}/variable_cassette_exons_with_binary_labels_LOW_TissueBinPsi.csv"

# --- Output directory ---
# Sibling of the results directory ("<ROOT_RESULTS>/../classification_eval").
# NOTE(review): the path contains "..". The os.makedirs documentation warns that
# makedirs can get confused when path elements to create include pardir, so this
# is only safe while ROOT_RESULTS already exists -- confirm, or normalise the path.
OUT_DIR = f"{ROOT_RESULTS}/../classification_eval"
os.makedirs(OUT_DIR, exist_ok=True)


‚úÖ CONTRASTIVE_ROOT set to: /gpfs/commons/home/atalukder/Contrastive_Learning


In [2]:

def evaluate_rmse_by_class(
    gt_file: str,
    gt_realpsi_file: str,
    pred_file: str,
    margin_psi: float = 0.10,
    psi_bar_col: str = "mean_psi",
    use_logit_thresholds: bool = False,
    eps: float = 1e-6
) -> "tuple[pd.DataFrame, str]":
    """
    Compute RMSE(ψ) and RMSE(Δlogit ψ) grouped by ground-truth class (-1, 0, +1).

    Every valid (exon, tissue) pair is pooled across all exons and tissues, then
    split by its ground-truth class label. Two errors are reported per class:

    * ``rmse_psi``         -- RMSE between predicted and true PSI.
    * ``rmse_logit_delta`` -- RMSE of ``logit(pred) - logit(true)``, with both
      values clipped to ``[eps, 1 - eps]`` before the logit.

    Args:
        gt_file: CSV with class (-1/0/1) tissue labels per exon; forwarded to
            ``load_and_align_for_delta_logit``.
        gt_realpsi_file: CSV with true PSI values per tissue. Must contain an
            ``exon_id`` column plus every tissue column returned by the loader.
        pred_file: File with predicted PSI per tissue; forwarded to
            ``load_and_align_for_delta_logit``.
        margin_psi: Currently unused; retained for interface compatibility.
        psi_bar_col: Currently unused; retained for interface compatibility.
        use_logit_thresholds: Currently unused; retained for interface compatibility.
        eps: Clipping bound applied to PSI values before taking the logit.

    Returns:
        ``(df_global, summary)`` where ``df_global`` has columns
        ``[class, count, percent, rmse_psi, rmse_logit_delta]`` (one row per class
        that is present) and ``summary`` is the printed, human-readable report.
        When no valid data point exists, ``df_global`` is an empty frame with the
        same columns and ``summary`` is the warning message, so callers can
        always unpack two values.

    Raises:
        KeyError: If ``gt_realpsi_file`` has no ``exon_id`` column.
    """
    result_columns = ["class", "count", "percent", "rmse_psi", "rmse_logit_delta"]

    def logit(p):
        return np.log(p) - np.log1p(-p)

    # --- Load aligned labels / predictions and the real-PSI ground truth ---
    merged, tissue_cols = load_and_align_for_delta_logit(
        gt_file=gt_file,
        pred_file=pred_file,
        require_cols=["logit_mean_psi"]
    )
    gt_realpsi = pd.read_csv(gt_realpsi_file)
    if "exon_id" not in gt_realpsi.columns:
        raise KeyError("gt_realpsi_file must contain 'exon_id'.")

    # exon_id -> {tissue: true PSI}
    psi_gt_map = gt_realpsi.set_index("exon_id")[tissue_cols].to_dict(orient="index")

    # --- Pool every valid (exon, tissue) pair ---
    all_ytrue, all_psitrue, all_psipred = [], [], []

    for _, row in merged.iterrows():
        exon_id = row["exon_id"]
        if exon_id not in psi_gt_map:
            continue
        # Presumably g = class labels, p = predicted PSI, valid = usable-entry mask
        # (semantics come from the helper -- confirm there).
        g, p, valid = extract_valid_vectors(row, tissue_cols)
        if not np.any(valid):
            continue
        exon_truth = psi_gt_map[exon_id]
        for j, t in enumerate(tissue_cols):
            if not valid[j]:
                continue
            psi_true_val = exon_truth.get(t, np.nan)
            if pd.isna(psi_true_val):
                continue
            all_ytrue.append(int(g[j]))
            all_psitrue.append(float(psi_true_val))
            all_psipred.append(float(p[j]))

    all_ytrue = np.array(all_ytrue, dtype=int)
    all_psitrue = np.array(all_psitrue, dtype=float)
    all_psipred = np.array(all_psipred, dtype=float)

    if all_ytrue.size == 0:
        # Return the same (frame, text) shape as the normal path so that
        # `df, txt = evaluate_rmse_by_class(...)` never fails on unpacking.
        empty_msg = "‚ö†Ô∏è No valid data found."
        print(empty_msg)
        return pd.DataFrame(columns=result_columns), empty_msg

    # The logit below clips to [eps, 1 - eps]; values outside [0, 1] (for example
    # PSI stored as a percentage) would silently saturate, so flag them loudly.
    n_out_of_range = int(np.count_nonzero(
        (all_psitrue < 0) | (all_psitrue > 1) | (all_psipred < 0) | (all_psipred > 1)
    ))
    if n_out_of_range:
        print(
            f"WARNING: {n_out_of_range} of {2 * all_ytrue.size} PSI values lie outside "
            "[0, 1]; they are clipped before the logit, so rmse_logit_delta may be "
            "saturated and rmse_psi may mix scales. Check that true and predicted "
            "PSI share a 0-1 scale."
        )

    # --- Δlogit(ψ): the target difference is zero, so the RMSE is taken of the gap itself ---
    dlogit = logit(np.clip(all_psipred, eps, 1 - eps)) - logit(np.clip(all_psitrue, eps, 1 - eps))

    # --- RMSE per ground-truth class ---
    global_rows = []
    n_total = len(all_ytrue)
    for cls in [-1, 0, 1]:
        idx = all_ytrue == cls
        n_cls = int(np.sum(idx))
        if n_cls == 0:
            continue
        global_rows.append({
            "class": cls,
            "count": n_cls,
            "percent": round(100 * n_cls / n_total, 2),
            "rmse_psi": np.sqrt(np.mean((all_psipred[idx] - all_psitrue[idx]) ** 2)),
            "rmse_logit_delta": np.sqrt(np.mean(dlogit[idx] ** 2)),
        })

    df_global = pd.DataFrame(global_rows, columns=result_columns)

    # --- Summary printout ---
    # Built from the plain records rather than DataFrame.iterrows(): iterrows
    # upcasts each row to a single dtype, which rendered counts as "352767.0".
    class_lines = [
        f"   class {rec['class']:>2}: "
        f"rmse_psi={rec['rmse_psi']:.4f}, "
        f"rmse_Œîlogit={rec['rmse_logit_delta']:.4f}, "
        f"count={rec['count']}, {rec['percent']}%"
        for rec in global_rows
    ]
    summary = (
        f"\nüåç {pred_file} (Tri-class RMSE Summary)\n"
        f"   n_tissues : {len(tissue_cols)}\n" +
        "\n".join(class_lines) +
        "\n"
    )
    print(summary)

    return df_global, summary


In [3]:
# Local-time timestamp embedded in the summary file name so that each run
# writes its own report file.
time_stamp = time.strftime("%Y_%m_%d__%H_%M_%S", time.localtime())
# Destination of the plain-text RMSE report (inside OUT_DIR).
SUMMARY_TXT = f"{OUT_DIR}/tissuSpecific_RMSE_summary_{time_stamp}.txt"
# Report lines accumulated across models; the header line comes first.
summary_lines = ["===== MODEL CLASSIFICATION SUMMARY ====="]

# Each model is identified by the name of its experiment directory (passed to
# get_prediction_file together with ROOT_RESULTS); the *_user_name_norun values
# are short labels. NOTE(review): the labels are not used in the visible cells.

# --- Model 1 ---
result_file_name1 =  "exprmnt_2025_11_05__01_50_41" # EMPRAIPsi_TS_noCL_300bp_rerun_codeChange_2025_11_05__01_50_41
model1_user_name_norun = 'TS_noCL_300bp_rerun_codeChange'
# --- Model 2 ---
result_file_name2 = "exprmnt_2025_11_05__01_52_25" # EMPRAIPsi_TS_CLSwpd_300bp_10Aug_rerun_codeChange_2025_11_05__01_52_25
model2_user_name_norun = 'TS_CLSwpd_300bp_10Aug'

In [None]:
# --- Ground-truth inputs ---
# GT_CLASS3_WIDE: wide table of per-tissue class labels in {-1, 0, +1}.
# GT_realPSI_FILE_PATH: true PSI values for the same split.
# NOTE(review): GT_CLASS3_WIDE hard-codes "test" while the real-PSI path uses
# `division`; keep the two in sync if `division` is ever changed.
GT_CLASS3_WIDE = f"{DATA_BASE}/test_cassette_exons_with_binary_labels_ExonBinPsi.csv"  # {-1,0,+1} wide
division = 'test'
GT_realPSI_FILE_PATH = f"{DATA_BASE}/{division}_cassette_exons_with_logit_mean_psi.csv"

# --- Model 1 ---
result_file_name = result_file_name1
pred_file = get_prediction_file(ROOT_RESULTS, result_file_name)
df_model1, txt_model1 = evaluate_rmse_by_class(
    gt_file=GT_CLASS3_WIDE,
    gt_realpsi_file=GT_realPSI_FILE_PATH,
    pred_file=pred_file
)

summary_lines += [txt_model1]

# --- Model 2 ---
result_file_name = result_file_name2
pred_file = get_prediction_file(ROOT_RESULTS, result_file_name)
df_model2, txt_model2 = evaluate_rmse_by_class(
    gt_file=GT_CLASS3_WIDE,
    gt_realpsi_file=GT_realPSI_FILE_PATH,
    pred_file=pred_file
)

# Previously missing: without this line the written report only contained Model 1.
summary_lines += [txt_model2]

# The summaries contain non-ASCII characters, so the encoding is fixed explicitly
# instead of relying on the locale-dependent default of open().
with open(SUMMARY_TXT, "w", encoding="utf-8") as f:
    f.write("\n".join(summary_lines))


üåç /gpfs/commons/home/atalukder/Contrastive_Learning/files/results/exprmnt_2025_11_05__01_50_41/ensemble_evaluation_from_valdiation/test_set_evaluation/tsplice_final_predictions_all_tissues.tsv (Tri-class RMSE Summary)
   n_tissues : 112
   class -1: rmse_psi=90.9467, rmse_Œîlogit=9.5819, count=352767.0, 78.65%
   class  0: rmse_psi=41.3063, rmse_Œîlogit=12.5931, count=46182.0, 10.3%
   class  1: rmse_psi=93.2024, rmse_Œîlogit=13.0061, count=49606.0, 11.06%

