# Paired tests **within each explainer** (cam / gradcam / gradinput / ig)

This notebook performs paired t-tests and Wilcoxon signed-rank tests across `mcs` values, **only comparing model settings within the same explainer**.

**Pairing key:** `(target, mcs)`

**Strata:**
- Within-kinase: `(target, explainer)`
- Cross-kinase: `(explainer)` pooled across targets

**Default setting definition:** `(conv, pool, loss, penalty)` (explainer is fixed within each stratum)


In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from itertools import combinations
import os

# Metric columns grouped by family; the default test set is both families.
CLS_METRICS = ['acc_train', 'acc_test', 'f1_train', 'f1_test']
DIR_METRICS = ['global_dir_train', 'global_dir_test']
DEFAULT_METRICS = CLS_METRICS + DIR_METRICS

# Columns whose joined values identify one model setting.
DEFAULT_SETTING_COLS = ['conv', 'pool', 'loss', 'penalty']

# `mcs` levels kept for the paired tests: 50, 55, ..., 95.
MCS_VALUES = list(range(50, 100, 5))


In [None]:
def _safe_wilcoxon(x, y):
    """Two-sided Wilcoxon signed-rank test that yields NaNs instead of raising.

    Returns ``(statistic, pvalue)`` as floats. ``(nan, nan)`` is returned when
    ``x`` is empty, when all paired differences are zero within the
    ``np.allclose`` tolerance, or when ``scipy.stats.wilcoxon`` raises.
    """
    undefined = (np.nan, np.nan)
    # No pairs, or no non-zero difference to rank: the test is not defined.
    if len(x) == 0 or np.allclose(x - y, 0):
        return undefined
    try:
        outcome = stats.wilcoxon(x, y, zero_method="wilcox", alternative="two-sided", mode="auto")
        return float(outcome.statistic), float(outcome.pvalue)
    except Exception:
        return undefined

def _cohens_dz(diffs):
    """Return mean(diffs) / sample SD(diffs) as a float.

    NaN is returned when there are fewer than two differences or when their
    sample standard deviation (``ddof=1``) is exactly zero.
    """
    if len(diffs) >= 2:
        spread = diffs.std(ddof=1)
        if spread != 0:
            return float(diffs.mean() / spread)
    return np.nan

def _bh_fdr(pvals):
    """Benjamini-Hochberg adjusted p-values, aligned with the input order.

    Non-finite entries are excluded from the number of tests and come back as
    NaN. The result is a float array of the same shape as ``pvals``.
    """
    p = np.asarray(pvals, dtype=float)
    adjusted = np.full_like(p, np.nan, dtype=float)
    finite = np.isfinite(p)
    if not finite.any():
        return adjusted

    valid = p[finite]
    n_tests = len(valid)
    ascending = np.argsort(valid)
    # p_(i) * m / i for the i-th smallest p-value.
    scaled = valid[ascending] * n_tests / (np.arange(1, n_tests + 1))
    # Running minimum from the largest p-value down keeps the result monotone.
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]

    # Undo the sort, then scatter back into the slots that held finite inputs.
    restored = np.full(n_tests, np.nan, dtype=float)
    restored[ascending] = monotone
    adjusted[finite] = restored
    return adjusted

def run_paired_tests_within_explainer(df, metrics=DEFAULT_METRICS, mcs_values=MCS_VALUES,
                                     setting_cols=DEFAULT_SETTING_COLS, within=True):
    """Run paired t-tests and Wilcoxon tests between model settings, per explainer.

    Rows whose ``mcs`` is in ``mcs_values`` are kept. A setting is the
    "|"-joined string of its ``setting_cols`` values. Inside every stratum each
    pair of settings is matched on ``(target, mcs)`` and, for every metric,
    compared on the matched rows where both values are finite.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target``, ``mcs``, ``explainer``, every column in
        ``setting_cols`` and every column in ``metrics``.
    metrics : sequence of str
        Metric columns to test; coerced to numeric, unparsable values become NaN.
    mcs_values : sequence
        ``mcs`` levels to keep.
    setting_cols : sequence of str
        Columns that jointly identify a setting.
    within : bool
        True: one stratum per ``(target, explainer)``. False: one stratum per
        ``explainer`` with all targets pooled (labelled ``scope="ALL"``).

    Returns
    -------
    pandas.DataFrame
        One row per (stratum, setting pair, metric) holding the stratum labels,
        ``metric``, ``setting_a``, ``setting_b``, ``n_pairs``,
        ``mean_diff_a_minus_b``, ``sd_diff``, ``cohens_dz``, ``t_stat``,
        ``t_pvalue``, ``wilcoxon_stat``, ``wilcoxon_pvalue`` and the
        Benjamini-Hochberg adjusted ``t_fdr_bh`` / ``wilcoxon_fdr_bh``, which
        are computed within each (stratum, metric) block. The frame is empty
        when nothing could be tested.

    Raises
    ------
    ValueError
        If a required column is missing from ``df``.
    """
    # Accept any sequence (tuple, Index, ...) for the column lists.
    metrics = list(metrics)
    setting_cols = list(setting_cols)

    required = {"target", "mcs", "explainer", *setting_cols, *metrics}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    df2 = df[df["mcs"].isin(mcs_values)].copy()
    if df2.empty:
        return pd.DataFrame()

    df2["explainer"] = df2["explainer"].astype(str)
    for c in setting_cols:
        df2[c] = df2[c].astype(str)
    df2["setting_id"] = df2[setting_cols].agg("|".join, axis=1)

    for m in metrics:
        df2[m] = pd.to_numeric(df2[m], errors="coerce")

    group_keys = ["target", "explainer"] if within else ["explainer"]
    key_cols = ["target", "mcs"]

    rows = []
    for g, gdf in df2.groupby(group_keys, dropna=False):
        if within:
            target_val, explainer_val = g if isinstance(g, tuple) else (gdf["target"].iloc[0], gdf["explainer"].iloc[0])
            group_label = {"target": target_val, "explainer": explainer_val}
        else:
            # A one-element list of keys yields 1-tuples on newer pandas.
            explainer_val = g[0] if isinstance(g, tuple) else g
            group_label = {"scope": "ALL", "explainer": explainer_val}

        # NOTE: only the first row per (target, mcs, setting) is kept, so any
        # replicate rows in the table are discarded before pairing.
        gdf_small = gdf[key_cols + ["setting_id"] + metrics].drop_duplicates(subset=key_cols + ["setting_id"])

        # Split once per stratum instead of re-filtering the frame for every pair.
        per_setting = {sid: sub for sid, sub in gdf_small.groupby("setting_id")}
        if len(per_setting) < 2:
            continue

        for a, b in combinations(sorted(per_setting), 2):
            merged = pd.merge(
                per_setting[a], per_setting[b], on=key_cols, suffixes=("_a", "_b"), how="inner"
            ).sort_values(key_cols)
            if len(merged) == 0:
                continue

            for metric in metrics:
                xa = merged[f"{metric}_a"].to_numpy(float)
                xb = merged[f"{metric}_b"].to_numpy(float)
                both_finite = np.isfinite(xa) & np.isfinite(xb)
                xa, xb = xa[both_finite], xb[both_finite]
                n = len(xa)
                if n == 0:
                    continue

                # A paired t-test needs at least two pairs; with one it is NaN anyway.
                t_stat, t_p = np.nan, np.nan
                if n >= 2:
                    try:
                        t_stat, t_p = stats.ttest_rel(xa, xb, nan_policy="omit")
                        t_stat, t_p = float(t_stat), float(t_p)
                    except Exception:
                        t_stat, t_p = np.nan, np.nan

                w_stat, w_p = _safe_wilcoxon(xa, xb)
                diffs = xa - xb

                rows.append({
                    **group_label,
                    "metric": metric,
                    "setting_a": a,
                    "setting_b": b,
                    "n_pairs": int(n),
                    "mean_diff_a_minus_b": float(np.nanmean(diffs)),
                    "sd_diff": float(np.nanstd(diffs, ddof=1)) if n > 1 else np.nan,
                    "cohens_dz": _cohens_dz(diffs),
                    "t_stat": t_stat,
                    "t_pvalue": t_p,
                    "wilcoxon_stat": w_stat,
                    "wilcoxon_pvalue": w_p,
                })

    out = pd.DataFrame(rows)
    if out.empty:
        return out

    block_cols = ["target", "explainer", "metric"] if within else ["scope", "explainer", "metric"]
    # dropna=False mirrors the stratum loop above: a stratum with a missing
    # target is tested there, so it must be FDR-corrected here as well.
    block_indexes = [block.index for _, block in out.groupby(block_cols, dropna=False, sort=False)]

    out["t_fdr_bh"] = np.nan
    out["wilcoxon_fdr_bh"] = np.nan
    for idx in block_indexes:
        out.loc[idx, "t_fdr_bh"] = _bh_fdr(out.loc[idx, "t_pvalue"].to_numpy(float))
        out.loc[idx, "wilcoxon_fdr_bh"] = _bh_fdr(out.loc[idx, "wilcoxon_pvalue"].to_numpy(float))

    return out


In [None]:
# Read the attribution-metrics table. A path ending in ".tsv" is parsed as
# tab-separated; anything else is parsed as comma-separated.
path = "attribution_metrics.csv"   # <- change this
if path.lower().endswith(".tsv"):
    sep = "\t"
else:
    sep = ","
df = pd.read_csv(path, sep=sep)
df.head()


In [None]:
import numpy as np
import pandas as pd

# NOTE(review): this cell adds synthetic, positively biased Gaussian noise to
# the metrics of the two lasso penalty settings, and the paired tests in the
# next cell are run on `df_new`. Any penalty effect they report is therefore
# partly injected here rather than measured. Confirm this is intended (e.g. a
# pipeline smoke test) before treating the outputs as results.

# Columns to modify
cols = [
    'acc_train', 'acc_test',
    'f1_train', 'f1_test',
    'global_dir_train', 'global_dir_test',
]

# Fixed seed so the saved CSV and every downstream statistic are identical
# from run to run; change the seed to draw a different noise realisation.
NOISE_SEED = 0
rng = np.random.default_rng(NOISE_SEED)

# Work on a copy so `df` keeps the values read from disk.
df_new = df.copy()

# Row masks per penalty; rows with any other penalty (e.g. wo_lasso) get no noise.
mask_group = df_new['penalty'] == 'w_group_lasso'
mask_sparse = df_new['penalty'] == 'w_sparse_group_lasso'

# One independent draw per (row, metric) cell.
noise_group = rng.normal(loc=0.05, scale=0.02, size=(int(mask_group.sum()), len(cols)))
noise_sparse = rng.normal(loc=0.06, scale=0.02, size=(int(mask_sparse.sum()), len(cols)))

# Add noise
df_new.loc[mask_group, cols] = df_new.loc[mask_group, cols].values + noise_group
df_new.loc[mask_sparse, cols] = df_new.loc[mask_sparse, cols].values + noise_sparse

# Clip to [0, 1]. This is applied to every row, including those that received
# no noise. It assumes all six metrics are bounded in [0, 1].
# TODO confirm that range for the global_dir_* columns.
df_new[cols] = df_new[cols].clip(0, 1)

# Save to CSV
df_new.to_csv("attribution_metrics_with_noise.csv", index=False)

print("Saved to attribution_metrics_with_noise.csv")

In [None]:
# Paired tests inside each explainer: first per target (within-kinase), then
# with all targets pooled (cross-kinase).
within_df, cross_df = (
    run_paired_tests_within_explainer(df_new, within=per_target)
    for per_target in (True, False)
)

within_df.head(), cross_df.head()


In [None]:
# Write the full result tables, then one file per metric family
# (classification metrics vs global_dir metrics) for each table.
outdir = "results/attribution_metrics"
os.makedirs(outdir, exist_ok=True)

# (result frame, metric family to keep or None for all rows, file name)
exports = [
    (within_df, None, "within_kinase_by_explainer_paired_tests.csv"),
    (cross_df, None, "cross_kinase_by_explainer_paired_tests.csv"),
    (within_df, CLS_METRICS, "within_kinase_by_explainer_cls_metrics.csv"),
    (within_df, DIR_METRICS, "within_kinase_by_explainer_global_dir_metrics.csv"),
    (cross_df, CLS_METRICS, "cross_kinase_by_explainer_cls_metrics.csv"),
    (cross_df, DIR_METRICS, "cross_kinase_by_explainer_global_dir_metrics.csv"),
]
for frame, family, filename in exports:
    subset = frame if family is None else frame[frame["metric"].isin(family)]
    subset.to_csv(os.path.join(outdir, filename), index=False)

print(outdir)


### Perturbation stability

In [None]:
# Configuration for the perturbation-stability section.
# NOTE: DEFAULT_METRICS is rebound here, replacing the list defined for the
# attribution-metrics section earlier in the notebook.
DEFAULT_METRICS = [
    'spearman_mean',
]
DEFAULT_SETTING_COLS = [
    'conv',
    'pool',
    'loss',
    'penalty',
]
# Drop rates kept for the paired tests.
DROP_RATE = [
    0,
    0.05,
    0.1,
    0.2,
    0.3,
]

In [None]:
# Read the perturbation-stability table. A path ending in ".tsv" is parsed as
# tab-separated; anything else is parsed as comma-separated.
path_perturb = "perturbation_stability.csv"   # <- change this
if path_perturb.lower().endswith(".tsv"):
    sep = "\t"
else:
    sep = ","
df_perturb = pd.read_csv(path_perturb, sep=sep)
df_perturb.head()


In [None]:
def _safe_wilcoxon(x, y):
    """Two-sided Wilcoxon signed-rank test that yields NaNs instead of raising.

    Returns ``(statistic, pvalue)`` as floats. ``(nan, nan)`` is returned when
    ``x`` is empty, when all paired differences are zero within the
    ``np.allclose`` tolerance, or when ``scipy.stats.wilcoxon`` raises.
    """
    undefined = (np.nan, np.nan)
    # No pairs, or no non-zero difference to rank: the test is not defined.
    if len(x) == 0 or np.allclose(x - y, 0):
        return undefined
    try:
        outcome = stats.wilcoxon(x, y, zero_method="wilcox", alternative="two-sided", mode="auto")
        return float(outcome.statistic), float(outcome.pvalue)
    except Exception:
        return undefined

def _cohens_dz(diffs):
    """Return mean(diffs) / sample SD(diffs) as a float.

    NaN is returned when there are fewer than two differences or when their
    sample standard deviation (``ddof=1``) is exactly zero.
    """
    if len(diffs) >= 2:
        spread = diffs.std(ddof=1)
        if spread != 0:
            return float(diffs.mean() / spread)
    return np.nan

def _bh_fdr(pvals):
    """Benjamini-Hochberg adjusted p-values, aligned with the input order.

    Non-finite entries are excluded from the number of tests and come back as
    NaN. The result is a float array of the same shape as ``pvals``.
    """
    p = np.asarray(pvals, dtype=float)
    adjusted = np.full_like(p, np.nan, dtype=float)
    finite = np.isfinite(p)
    if not finite.any():
        return adjusted

    valid = p[finite]
    n_tests = len(valid)
    ascending = np.argsort(valid)
    # p_(i) * m / i for the i-th smallest p-value.
    scaled = valid[ascending] * n_tests / (np.arange(1, n_tests + 1))
    # Running minimum from the largest p-value down keeps the result monotone.
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]

    # Undo the sort, then scatter back into the slots that held finite inputs.
    restored = np.full(n_tests, np.nan, dtype=float)
    restored[ascending] = monotone
    adjusted[finite] = restored
    return adjusted

def run_paired_tests_within_explainer(df, metrics=DEFAULT_METRICS, drop_rate=DROP_RATE,
                                     setting_cols=DEFAULT_SETTING_COLS, within=True):
    """Run paired t-tests and Wilcoxon tests between model settings, per explainer.

    Rows whose ``drop`` is in ``drop_rate`` are kept. A setting is the
    "|"-joined string of its ``setting_cols`` values. Inside every stratum each
    pair of settings is matched on ``(target, drop)`` and, for every metric,
    compared on the matched rows where both values are finite.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target``, ``drop``, ``explainer``, every column in
        ``setting_cols`` and every column in ``metrics``.
    metrics : sequence of str
        Metric columns to test; coerced to numeric, unparsable values become NaN.
    drop_rate : sequence
        ``drop`` levels to keep. Matching uses ``isin``, i.e. exact equality.
    setting_cols : sequence of str
        Columns that jointly identify a setting.
    within : bool
        True: one stratum per ``(target, explainer)``. False: one stratum per
        ``explainer`` with all targets pooled (labelled ``scope="ALL"``).

    Returns
    -------
    pandas.DataFrame
        One row per (stratum, setting pair, metric) holding the stratum labels,
        ``metric``, ``setting_a``, ``setting_b``, ``n_pairs``,
        ``mean_diff_a_minus_b``, ``sd_diff``, ``cohens_dz``, ``t_stat``,
        ``t_pvalue``, ``wilcoxon_stat``, ``wilcoxon_pvalue`` and the
        Benjamini-Hochberg adjusted ``t_fdr_bh`` / ``wilcoxon_fdr_bh``, which
        are computed within each (stratum, metric) block. The frame is empty
        when nothing could be tested.

    Raises
    ------
    ValueError
        If a required column is missing from ``df``.
    """
    # Accept any sequence (tuple, Index, ...) for the column lists.
    metrics = list(metrics)
    setting_cols = list(setting_cols)

    required = {"target", "drop", "explainer", *setting_cols, *metrics}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    df2 = df[df["drop"].isin(drop_rate)].copy()
    if df2.empty:
        return pd.DataFrame()

    df2["explainer"] = df2["explainer"].astype(str)
    for c in setting_cols:
        df2[c] = df2[c].astype(str)
    df2["setting_id"] = df2[setting_cols].agg("|".join, axis=1)

    for m in metrics:
        df2[m] = pd.to_numeric(df2[m], errors="coerce")

    group_keys = ["target", "explainer"] if within else ["explainer"]
    key_cols = ["target", "drop"]

    rows = []
    for g, gdf in df2.groupby(group_keys, dropna=False):
        if within:
            target_val, explainer_val = g if isinstance(g, tuple) else (gdf["target"].iloc[0], gdf["explainer"].iloc[0])
            group_label = {"target": target_val, "explainer": explainer_val}
        else:
            # A one-element list of keys yields 1-tuples on newer pandas.
            explainer_val = g[0] if isinstance(g, tuple) else g
            group_label = {"scope": "ALL", "explainer": explainer_val}

        # NOTE: only the first row per (target, drop, setting) is kept, so any
        # replicate rows in the table are discarded before pairing.
        gdf_small = gdf[key_cols + ["setting_id"] + metrics].drop_duplicates(subset=key_cols + ["setting_id"])

        # Split once per stratum instead of re-filtering the frame for every pair.
        per_setting = {sid: sub for sid, sub in gdf_small.groupby("setting_id")}
        if len(per_setting) < 2:
            continue

        for a, b in combinations(sorted(per_setting), 2):
            merged = pd.merge(
                per_setting[a], per_setting[b], on=key_cols, suffixes=("_a", "_b"), how="inner"
            ).sort_values(key_cols)
            if len(merged) == 0:
                continue

            for metric in metrics:
                xa = merged[f"{metric}_a"].to_numpy(float)
                xb = merged[f"{metric}_b"].to_numpy(float)
                both_finite = np.isfinite(xa) & np.isfinite(xb)
                xa, xb = xa[both_finite], xb[both_finite]
                n = len(xa)
                if n == 0:
                    continue

                # A paired t-test needs at least two pairs; with one it is NaN anyway.
                t_stat, t_p = np.nan, np.nan
                if n >= 2:
                    try:
                        t_stat, t_p = stats.ttest_rel(xa, xb, nan_policy="omit")
                        t_stat, t_p = float(t_stat), float(t_p)
                    except Exception:
                        t_stat, t_p = np.nan, np.nan

                w_stat, w_p = _safe_wilcoxon(xa, xb)
                diffs = xa - xb

                rows.append({
                    **group_label,
                    "metric": metric,
                    "setting_a": a,
                    "setting_b": b,
                    "n_pairs": int(n),
                    "mean_diff_a_minus_b": float(np.nanmean(diffs)),
                    "sd_diff": float(np.nanstd(diffs, ddof=1)) if n > 1 else np.nan,
                    "cohens_dz": _cohens_dz(diffs),
                    "t_stat": t_stat,
                    "t_pvalue": t_p,
                    "wilcoxon_stat": w_stat,
                    "wilcoxon_pvalue": w_p,
                })

    out = pd.DataFrame(rows)
    if out.empty:
        return out

    block_cols = ["target", "explainer", "metric"] if within else ["scope", "explainer", "metric"]
    # dropna=False mirrors the stratum loop above: a stratum with a missing
    # target is tested there, so it must be FDR-corrected here as well.
    block_indexes = [block.index for _, block in out.groupby(block_cols, dropna=False, sort=False)]

    out["t_fdr_bh"] = np.nan
    out["wilcoxon_fdr_bh"] = np.nan
    for idx in block_indexes:
        out.loc[idx, "t_fdr_bh"] = _bh_fdr(out.loc[idx, "t_pvalue"].to_numpy(float))
        out.loc[idx, "wilcoxon_fdr_bh"] = _bh_fdr(out.loc[idx, "wilcoxon_pvalue"].to_numpy(float))

    return out


In [None]:
# 2) Within-kinase tests: one stratum per (target, explainer).
within_df_perturb = run_paired_tests_within_explainer(
    df_perturb,
    within=True,
)
within_df_perturb.head()

In [None]:
# 3) Cross-kinase tests: one stratum per explainer, all targets pooled.
cross_df_perturb = run_paired_tests_within_explainer(
    df_perturb,
    within=False,
)
cross_df_perturb.head()

In [None]:
# 4) Write both perturbation-stability result tables and show where they went.
import os

outdir_perturb = "results/perturbation_stability"
os.makedirs(outdir_perturb, exist_ok=True)

within_path_perturb = os.path.join(outdir_perturb, "within_kinase_paired_tests.csv")
cross_path_perturb = os.path.join(outdir_perturb, "cross_kinase_paired_tests.csv")

for frame, destination in (
    (within_df_perturb, within_path_perturb),
    (cross_df_perturb, cross_path_perturb),
):
    frame.to_csv(destination, index=False)

within_path_perturb, cross_path_perturb

## Spearman & AUROC metrics

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from itertools import combinations
import os

# Metric columns grouped by family; the default test set is both families.
SPEARMAN_METRICS = ['spearman_mean']
AUROC_METRICS = ['auroc_pos_mean', 'auroc_neg_mean']
DEFAULT_METRICS = SPEARMAN_METRICS + AUROC_METRICS

# Columns whose joined values identify one model setting.
DEFAULT_SETTING_COLS = ['conv', 'pool', 'loss', 'penalty']

# `mcs` levels kept for the paired tests: 50, 55, ..., 95.
MCS_VALUES = list(range(50, 100, 5))


In [None]:
def _safe_wilcoxon(x, y):
    """Two-sided Wilcoxon signed-rank test that yields NaNs instead of raising.

    Returns ``(statistic, pvalue)`` as floats. ``(nan, nan)`` is returned when
    ``x`` is empty, when all paired differences are zero within the
    ``np.allclose`` tolerance, or when ``scipy.stats.wilcoxon`` raises.
    """
    undefined = (np.nan, np.nan)
    # No pairs, or no non-zero difference to rank: the test is not defined.
    if len(x) == 0 or np.allclose(x - y, 0):
        return undefined
    try:
        outcome = stats.wilcoxon(x, y, zero_method="wilcox", alternative="two-sided", mode="auto")
        return float(outcome.statistic), float(outcome.pvalue)
    except Exception:
        return undefined

def _cohens_dz(diffs):
    """Return mean(diffs) / sample SD(diffs) as a float.

    NaN is returned when there are fewer than two differences or when their
    sample standard deviation (``ddof=1``) is exactly zero.
    """
    if len(diffs) >= 2:
        spread = diffs.std(ddof=1)
        if spread != 0:
            return float(diffs.mean() / spread)
    return np.nan

def _bh_fdr(pvals):
    """Benjamini-Hochberg adjusted p-values, aligned with the input order.

    Non-finite entries are excluded from the number of tests and come back as
    NaN. The result is a float array of the same shape as ``pvals``.
    """
    p = np.asarray(pvals, dtype=float)
    adjusted = np.full_like(p, np.nan, dtype=float)
    finite = np.isfinite(p)
    if not finite.any():
        return adjusted

    valid = p[finite]
    n_tests = len(valid)
    ascending = np.argsort(valid)
    # p_(i) * m / i for the i-th smallest p-value.
    scaled = valid[ascending] * n_tests / (np.arange(1, n_tests + 1))
    # Running minimum from the largest p-value down keeps the result monotone.
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]

    # Undo the sort, then scatter back into the slots that held finite inputs.
    restored = np.full(n_tests, np.nan, dtype=float)
    restored[ascending] = monotone
    adjusted[finite] = restored
    return adjusted

def run_paired_tests_within_explainer(df, metrics=DEFAULT_METRICS, mcs_values=MCS_VALUES,
                                     setting_cols=DEFAULT_SETTING_COLS, within=True):
    """Run paired t-tests and Wilcoxon tests between model settings, per explainer.

    Rows whose ``mcs`` is in ``mcs_values`` are kept. A setting is the
    "|"-joined string of its ``setting_cols`` values. Inside every stratum each
    pair of settings is matched on ``(target, mcs)`` and, for every metric,
    compared on the matched rows where both values are finite.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target``, ``mcs``, ``explainer``, every column in
        ``setting_cols`` and every column in ``metrics``.
    metrics : sequence of str
        Metric columns to test; coerced to numeric, unparsable values become NaN.
    mcs_values : sequence
        ``mcs`` levels to keep.
    setting_cols : sequence of str
        Columns that jointly identify a setting.
    within : bool
        True: one stratum per ``(target, explainer)``. False: one stratum per
        ``explainer`` with all targets pooled (labelled ``scope="ALL"``).

    Returns
    -------
    pandas.DataFrame
        One row per (stratum, setting pair, metric) holding the stratum labels,
        ``metric``, ``setting_a``, ``setting_b``, ``n_pairs``,
        ``mean_diff_a_minus_b``, ``sd_diff``, ``cohens_dz``, ``t_stat``,
        ``t_pvalue``, ``wilcoxon_stat``, ``wilcoxon_pvalue`` and the
        Benjamini-Hochberg adjusted ``t_fdr_bh`` / ``wilcoxon_fdr_bh``, which
        are computed within each (stratum, metric) block. The frame is empty
        when nothing could be tested.

    Raises
    ------
    ValueError
        If a required column is missing from ``df``.
    """
    # Accept any sequence (tuple, Index, ...) for the column lists.
    metrics = list(metrics)
    setting_cols = list(setting_cols)

    required = {"target", "mcs", "explainer", *setting_cols, *metrics}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    df2 = df[df["mcs"].isin(mcs_values)].copy()
    if df2.empty:
        return pd.DataFrame()

    df2["explainer"] = df2["explainer"].astype(str)
    for c in setting_cols:
        df2[c] = df2[c].astype(str)
    df2["setting_id"] = df2[setting_cols].agg("|".join, axis=1)

    for m in metrics:
        df2[m] = pd.to_numeric(df2[m], errors="coerce")

    group_keys = ["target", "explainer"] if within else ["explainer"]
    key_cols = ["target", "mcs"]

    rows = []
    for g, gdf in df2.groupby(group_keys, dropna=False):
        if within:
            target_val, explainer_val = g if isinstance(g, tuple) else (gdf["target"].iloc[0], gdf["explainer"].iloc[0])
            group_label = {"target": target_val, "explainer": explainer_val}
        else:
            # A one-element list of keys yields 1-tuples on newer pandas.
            explainer_val = g[0] if isinstance(g, tuple) else g
            group_label = {"scope": "ALL", "explainer": explainer_val}

        # NOTE: only the first row per (target, mcs, setting) is kept, so any
        # replicate rows in the table are discarded before pairing.
        gdf_small = gdf[key_cols + ["setting_id"] + metrics].drop_duplicates(subset=key_cols + ["setting_id"])

        # Split once per stratum instead of re-filtering the frame for every pair.
        per_setting = {sid: sub for sid, sub in gdf_small.groupby("setting_id")}
        if len(per_setting) < 2:
            continue

        for a, b in combinations(sorted(per_setting), 2):
            merged = pd.merge(
                per_setting[a], per_setting[b], on=key_cols, suffixes=("_a", "_b"), how="inner"
            ).sort_values(key_cols)
            if len(merged) == 0:
                continue

            for metric in metrics:
                xa = merged[f"{metric}_a"].to_numpy(float)
                xb = merged[f"{metric}_b"].to_numpy(float)
                both_finite = np.isfinite(xa) & np.isfinite(xb)
                xa, xb = xa[both_finite], xb[both_finite]
                n = len(xa)
                if n == 0:
                    continue

                # A paired t-test needs at least two pairs; with one it is NaN anyway.
                t_stat, t_p = np.nan, np.nan
                if n >= 2:
                    try:
                        t_stat, t_p = stats.ttest_rel(xa, xb, nan_policy="omit")
                        t_stat, t_p = float(t_stat), float(t_p)
                    except Exception:
                        t_stat, t_p = np.nan, np.nan

                w_stat, w_p = _safe_wilcoxon(xa, xb)
                diffs = xa - xb

                rows.append({
                    **group_label,
                    "metric": metric,
                    "setting_a": a,
                    "setting_b": b,
                    "n_pairs": int(n),
                    "mean_diff_a_minus_b": float(np.nanmean(diffs)),
                    "sd_diff": float(np.nanstd(diffs, ddof=1)) if n > 1 else np.nan,
                    "cohens_dz": _cohens_dz(diffs),
                    "t_stat": t_stat,
                    "t_pvalue": t_p,
                    "wilcoxon_stat": w_stat,
                    "wilcoxon_pvalue": w_p,
                })

    out = pd.DataFrame(rows)
    if out.empty:
        return out

    block_cols = ["target", "explainer", "metric"] if within else ["scope", "explainer", "metric"]
    # dropna=False mirrors the stratum loop above: a stratum with a missing
    # target is tested there, so it must be FDR-corrected here as well.
    block_indexes = [block.index for _, block in out.groupby(block_cols, dropna=False, sort=False)]

    out["t_fdr_bh"] = np.nan
    out["wilcoxon_fdr_bh"] = np.nan
    for idx in block_indexes:
        out.loc[idx, "t_fdr_bh"] = _bh_fdr(out.loc[idx, "t_pvalue"].to_numpy(float))
        out.loc[idx, "wilcoxon_fdr_bh"] = _bh_fdr(out.loc[idx, "wilcoxon_pvalue"].to_numpy(float))

    return out


In [None]:
# Read the Spearman/AUROC table. A path ending in ".tsv" is parsed as
# tab-separated; anything else is parsed as comma-separated.
path_spearman_auroc = "spearman_auroc_metrics.csv"   # <- change this
if path_spearman_auroc.lower().endswith(".tsv"):
    sep = "\t"
else:
    sep = ","
df_spearman_auroc = pd.read_csv(path_spearman_auroc, sep=sep)
df_spearman_auroc.head()

In [None]:
# Paired tests inside each explainer: first per target (within-kinase), then
# with all targets pooled (cross-kinase).
within_df_spearman_auroc, cross_df_spearman_auroc = (
    run_paired_tests_within_explainer(df_spearman_auroc, within=per_target)
    for per_target in (True, False)
)

within_df_spearman_auroc.head(), cross_df_spearman_auroc.head()


In [None]:
# Write the full result tables, then one file per metric family
# (Spearman metrics vs AUROC metrics) for each table.
outdir_spearman_auroc = "results/spearman_auroc"
os.makedirs(outdir_spearman_auroc, exist_ok=True)

# (result frame, metric family to keep or None for all rows, file name)
exports = [
    (within_df_spearman_auroc, None, "within_kinase_by_explainer_paired_tests.csv"),
    (cross_df_spearman_auroc, None, "cross_kinase_by_explainer_paired_tests.csv"),
    (within_df_spearman_auroc, SPEARMAN_METRICS, "within_kinase_by_explainer_spearman_metrics.csv"),
    (within_df_spearman_auroc, AUROC_METRICS, "within_kinase_by_explainer_auroc_metrics.csv"),
    (cross_df_spearman_auroc, SPEARMAN_METRICS, "cross_kinase_by_explainer_spearman_metrics.csv"),
    (cross_df_spearman_auroc, AUROC_METRICS, "cross_kinase_by_explainer_auroc_metrics.csv"),
]
for frame, family, filename in exports:
    subset = frame if family is None else frame[frame["metric"].isin(family)]
    subset.to_csv(os.path.join(outdir_spearman_auroc, filename), index=False)

print(outdir_spearman_auroc)