### Setup

In [1]:
from pathlib import Path

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Seaborn base theme
sns.set_theme(
    context="paper",  # 'paper' = smaller, for journal figures
    style="whitegrid",  # clean background with subtle grid
    font="DejaVu Sans",  # use a consistent sans-serif (replace if needed)
    font_scale=1.4,  # scale up text a bit for readability
    palette="Set2",
    rc={
        # Figure sizing
        "figure.figsize": (10, 8),  # in inches; adjust for single-column
        "axes.titlesize": 14,
        "axes.labelsize": 12,
        "axes.linewidth": 1.0,
        "axes.labelpad": 8,
        "axes.grid": True,
        "grid.linewidth": 0.4,
        "grid.alpha": 0.6,
        "lines.linewidth": 1.5,
        "lines.markersize": 5,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
        "legend.fontsize": 10,
        "legend.frameon": False,
        "savefig.dpi": 300,
        "savefig.transparent": True,  # transparent background for vector exports
        "pdf.fonttype": 42,  # embed TrueType fonts (important for Illustrator)
        "ps.fonttype": 42,
    },
)

# Matplotlib tight layout by default
plt.rcParams.update({"figure.autolayout": True})

In [2]:
ANN_DATA_DIR = "../data/processed/anndata_combined"

human_adata = ad.read_h5ad("../data/processed/anndata_combined/human_combined.h5ad")
mouse_adata = ad.read_h5ad("../data/processed/anndata_combined/mouse_combined.h5ad")

N_SPLITS = 10

In [3]:
# ensure adatas have unique var names
human_adata.var_names_make_unique()
mouse_adata.var_names_make_unique()

# remove vars with NaN values
human_adata = human_adata[:, ~np.isnan(human_adata.X).any(axis=0)]
mouse_adata = mouse_adata[:, ~np.isnan(mouse_adata.X).any(axis=0)]

human_adata.obs["infl_acute"].fillna(False, inplace=True)
mouse_adata.obs["infl_acute"].fillna(False, inplace=True)
human_adata.obs["infl_acute"] = human_adata.obs["infl_acute"].astype(bool)
mouse_adata.obs["infl_acute"] = mouse_adata.obs["infl_acute"].astype(bool)

human_adata.obs["infl_subacute"].fillna(False, inplace=True)
mouse_adata.obs["infl_subacute"].fillna(False, inplace=True)
human_adata.obs["infl_subacute"] = human_adata.obs["infl_subacute"].astype(bool)
mouse_adata.obs["infl_subacute"] = mouse_adata.obs["infl_subacute"].astype(bool)

human_adata.obs["infl_chronic"].fillna(False, inplace=True)
mouse_adata.obs["infl_chronic"].fillna(False, inplace=True)
human_adata.obs["infl_chronic"] = human_adata.obs["infl_chronic"].astype(bool)
mouse_adata.obs["infl_chronic"] = mouse_adata.obs["infl_chronic"].astype(bool)

# constrain genes to those shared between human and mouse
common_genes = human_adata.var_names.intersection(mouse_adata.var_names)
human_adata = human_adata[:, common_genes]
mouse_adata = mouse_adata[:, common_genes]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  human_adata.obs['infl_acute'].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  mouse_adata.obs['infl_acute'].fillna(False, inplace=True)
  human_adata.obs['infl_acute'] = human_adata.obs['infl_acute'].astype(bool)
  mouse_adata.obs['infl_acute'] = mouse

In [4]:
def transform_adata_to_X_y_all(adata):
    X = adata.X.copy()
    X_emb = adata.obsm["X_bulkformer"].copy()
    # set y to 1 for inflamed, 0 for healthy
    y = adata.obs["group"].map({"inflammation": 1, "control": 0}).values.astype(int)
    return X, X_emb, y


def transform_adata_to_X_y_takao(adata):
    adata_sub = adata[~adata.obs["takao_status"].isna()].copy()
    X = adata_sub.X.copy()
    X_emb = adata_sub.obsm["X_bulkformer"].copy()
    # set y to 1 for inflamed, 0 for healthy
    y = (
        adata_sub.obs["takao_status"]
        .map({"takao_inflamed": 1, "takao_control": 0})
        .values.astype(int)
    )
    return X, X_emb, y


def transform_adata_to_X_y_acute(adata):
    adata_sub = adata[(adata.obs["group"] == "control") | (adata.obs["infl_acute"])].copy()
    X = adata_sub.X.copy()
    X_emb = adata_sub.obsm["X_bulkformer"].copy()
    # set y to 1 for inflamed, 0 for healthy
    y = adata_sub.obs["infl_acute"].map({True: 1, False: 0}).values.astype(int)
    return X, X_emb, y


def transform_adata_to_X_y_subacute(adata):
    adata_sub = adata[(adata.obs["group"] == "control") | (adata.obs["infl_subacute"])].copy()
    X = adata_sub.X.copy()
    X_emb = adata_sub.obsm["X_bulkformer"].copy()
    # set y to 1 for inflamed, 0 for healthy
    y = adata_sub.obs["infl_subacute"].map({True: 1, False: 0}).values.astype(int)
    return X, X_emb, y


def transform_adata_to_X_y_acute_and_subacute(adata):
    adata_sub = adata[
        (adata.obs["group"] == "control")
        | (adata.obs["infl_acute"])
        | (adata.obs["infl_subacute"])
    ].copy()
    X = adata_sub.X.copy()
    X_emb = adata_sub.obsm["X_bulkformer"].copy()
    # set y to 1 for inflamed, 0 for healthy
    y = (
        (adata_sub.obs["infl_acute"] | adata_sub.obs["infl_subacute"])
        .map({True: 1, False: 0})
        .values.astype(int)
    )
    return X, X_emb, y


def transform_adata_to_X_y_chronic(adata):
    adata_sub = adata[(adata.obs["group"] == "control") | (adata.obs["infl_chronic"])].copy()
    X = adata_sub.X.copy()
    X_emb = adata_sub.obsm["X_bulkformer"].copy()
    # set y to 1 for inflamed, 0 for healthy
    y = adata_sub.obs["infl_chronic"].map({True: 1, False: 0}).values.astype(int)
    return X, X_emb, y


def transform_adata_to_X_y_acute_to_chronic(adata):
    adata_sub = adata[(adata.obs["infl_acute"]) | (adata.obs["infl_chronic"])].copy()
    X = adata_sub.X.copy()
    X_emb = adata_sub.obsm["X_bulkformer"].copy()
    # set y to 1 for inflamed, 0 for healthy
    y = adata_sub.obs["infl_chronic"].map({True: 0, False: 1}).values.astype(int)
    return X, X_emb, y


def transform_adata_to_X_y_acute_subacute_to_chronic(adata):
    adata_sub = adata[
        (adata.obs["infl_acute"]) | (adata.obs["infl_subacute"]) | (adata.obs["infl_chronic"])
    ].copy()
    X = adata_sub.X.copy()
    X_emb = adata_sub.obsm["X_bulkformer"].copy()
    # set y to 1 for inflamed, 0 for healthy
    y = adata_sub.obs["infl_chronic"].map({True: 0, False: 1}).values.astype(int)
    return X, X_emb, y

### Define pipelines

In [5]:
def get_linear_pipeline():
    """Standardized linear pipeline (scaling + logistic regression)."""
    return {
        "Raw": Pipeline(
            [
                ("scaler", StandardScaler(with_mean=True, with_std=True)),
                ("clf", LogisticRegression(max_iter=5000, penalty="l2", solver="lbfgs")),
            ]
        )
    }


def evaluate_linear_models(human_adata, mouse_adata, setups, output_dir="model_coefficients"):
    """
    Train and evaluate linear models on Raw gene expression data only.
    Save standardized coefficients for downstream gene functional analysis.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    all_results, all_roc_data = {}, {}

    pipelines = get_linear_pipeline()
    pipe = pipelines["Raw"]

    for setup_name, transform_func in setups:
        # Prepare data
        human_X, _, human_y = transform_func(human_adata)
        mouse_X, _, mouse_y = transform_func(mouse_adata)

        # Ensure data are aligned and numeric
        gene_names = human_adata.var_names
        assert np.all(human_X.shape[1] == mouse_X.shape[1]), "Feature mismatch!"

        # --- Human → Mouse ---
        pipe.fit(human_X, human_y)
        y_pred = pipe.predict_proba(mouse_X)[:, 1]
        auroc = roc_auc_score(mouse_y, y_pred)
        all_results[f"{setup_name} (Human→Mouse)"] = auroc
        fpr, tpr, _ = roc_curve(mouse_y, y_pred)
        all_roc_data[f"{setup_name} (Human→Mouse)"] = (fpr, tpr, auroc)

        # Save coefficients (human-trained model)
        clf = pipe.named_steps["clf"]
        coefs = pd.DataFrame(
            {
                "gene": gene_names,
                "coef": clf.coef_.flatten(),
                "direction": np.sign(clf.coef_.flatten()),
            }
        )
        coefs.to_csv(
            f"{output_dir}/{setup_name.replace(' ', '_')}_HumanToMouse_coeffs.csv", index=False
        )

        # --- Mouse → Human ---
        pipe.fit(mouse_X, mouse_y)
        y_pred = pipe.predict_proba(human_X)[:, 1]
        auroc = roc_auc_score(human_y, y_pred)
        all_results[f"{setup_name} (Mouse→Human)"] = auroc
        fpr, tpr, _ = roc_curve(human_y, y_pred)
        all_roc_data[f"{setup_name} (Mouse→Human)"] = (fpr, tpr, auroc)

        # Save coefficients (mouse-trained model)
        clf = pipe.named_steps["clf"]
        coefs = pd.DataFrame(
            {
                "gene": gene_names,
                "coef": clf.coef_.flatten(),
                "direction": np.sign(clf.coef_.flatten()),
            }
        )
        coefs.to_csv(
            f"{output_dir}/{setup_name.replace(' ', '_')}_MouseToHuman_coeffs.csv", index=False
        )

    return all_results, all_roc_data

In [8]:
setups = [
    ("All Inflammation Samples vs. Control", transform_adata_to_X_y_all),
    ("Takao Subset for Inflammation vs. Control", transform_adata_to_X_y_takao),
    ("Acute Inflammation vs. Control", transform_adata_to_X_y_acute),
    ("Subacute Inflammation vs. Control", transform_adata_to_X_y_subacute),
    ("Chronic Inflammation vs. Control", transform_adata_to_X_y_chronic),
    ("Acute Inflammation vs. Chronic Inflammation", transform_adata_to_X_y_acute_to_chronic),
]

all_results, all_roc_data = evaluate_linear_models(human_adata, mouse_adata, setups)

In [6]:
# post_analysis_pipeline_v3.py
from collections import defaultdict
import glob
import os
import re

import gseapy as gp
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tqdm import tqdm


# --------------------
# 1. Loading + sanity checks
# --------------------
def load_all_coefficients(coeff_dir="model_coefficients", gene_col="gene", coef_col="coef"):
    """
    Load CSVs named like "<setup>_HumanToMouse.csv" and "<setup>_MouseToHuman.csv".
    Returns:
      - all_coefs: dict[setup][direction] -> DataFrame indexed by gene with 'coefficient'
    """
    files = glob.glob(os.path.join(coeff_dir, "*.csv"))
    if not files:
        raise FileNotFoundError(f"No CSVs found in {coeff_dir}")

    all_coefs = defaultdict(dict)
    genes_sets = []

    for f in files:
        fname = Path(f).stem
        # robust direction parsing
        if "_HumanToMouse" in fname:
            direction = "HumanToMouse"
            setup = re.sub(r"(_HumanToMouse)*$", "", fname)
            setup = setup.replace(".", "")
        elif "_MouseToHuman" in fname:
            direction = "MouseToHuman"
            setup = re.sub(r"(_MouseToHuman)*$", "", fname)
            setup = setup.replace(".", "")

        df = pd.read_csv(f)
        # flexible coefficient column detection
        if coef_col not in df.columns:
            for alt in ["coef", "coefficient", "weight"]:
                if alt in df.columns:
                    coef_col = alt
                    break
            else:
                raise ValueError(f"{f} missing coefficient column")

        df2 = df[[gene_col, coef_col]].copy()
        df2.columns = ["gene", "coefficient"]
        df2 = df2.dropna(subset=["gene"]).drop_duplicates(subset=["gene"])
        df2 = df2.set_index("gene").sort_index()
        all_coefs[setup][direction] = df2
        genes_sets.append(set(df2.index.tolist()))

    return dict(all_coefs)


# --------------------
# 2. Normalize coefficients
# --------------------
def normalize_coefficients(all_coefs):
    norm_coefs = {}
    for setup, dirs in all_coefs.items():
        norm_coefs[setup] = {}
        for d, df in dirs.items():
            coefs = df["coefficient"].astype(float).copy()

            mu = coefs.mean()
            sigma = coefs.std(ddof=0) if coefs.std(ddof=0) > 0 else 1.0
            norm = (coefs - mu) / sigma

            df2 = df.copy()
            df2["coef_norm"] = norm
            norm_coefs[setup][d] = df2
    return norm_coefs


# --------------------
# 3. Build coefficient matrix
# --------------------
def build_coef_matrix(norm_coefs, direction):
    setups_with_dir = [s for s in norm_coefs if direction in norm_coefs[s]]
    if not setups_with_dir:
        raise ValueError(f"No setups contain direction '{direction}'")
    mat = pd.DataFrame(index=norm_coefs[setups_with_dir[0]][direction].index)
    for s in setups_with_dir:
        mat[s] = norm_coefs[s][direction]["coef_norm"]
    return mat


# --------------------
# 4. Ranked GSEA with consistent background
# --------------------
def run_prerank_from_coef_df(
    df_coef, outdir, score_col="coef_norm", min_size=15, max_size=500, permutation_num=1000
):
    Path(outdir).mkdir(parents=True, exist_ok=True)
    rnk = df_coef[score_col].sort_values(ascending=False)
    rnk_path = Path(outdir) / "ranked_list.rnk"
    rnk.to_csv(rnk_path, sep="\t", header=False)

    gene_sets = ["MSigDB_Hallmark_2020", "GO_Biological_Process_2025", "Reactome_Pathways_2024"]

    results = {}
    for gs in gene_sets:
        try:
            prer = gp.prerank(
                rnk=str(rnk_path),
                gene_sets=gs,
                permutation_num=permutation_num,
                outdir=os.path.join(outdir, gs.replace("/", "_")),
                seed=42,
                min_size=min_size,
                max_size=max_size,
                no_plot=True,
            )
            results[gs] = prer
        except Exception as e:
            print(f"prerank failed for {gs}: {e}")
    return results


# --------------------
# 7. End-to-end wrapper with progress tracking
# --------------------
def post_analysis_from_coeffs(coeff_dir="model_coefficients", outdir="post_analysis"):
    Path(outdir).mkdir(parents=True, exist_ok=True)
    all_coefs = load_all_coefficients(coeff_dir)
    norm = normalize_coefficients(all_coefs)

    mat_h2m = build_coef_matrix(norm, direction="HumanToMouse")
    mat_m2h = build_coef_matrix(norm, direction="MouseToHuman")

    # GSEA
    gsea_summary = {}
    setups = list(norm.keys())
    for setup in tqdm(setups, desc="GSEA across setups"):
        gsea_summary[setup] = {}
        for d in norm[setup]:
            df = norm[setup][d][["coefficient", "coef_norm"]].copy()
            df.index.name = "gene"
            out_here = os.path.join(outdir, f"gsea_{setup}_{d}")
            prerank_results = run_prerank_from_coef_df(df, outdir=out_here)
            for gs, prer in prerank_results.items():
                try:
                    res_df = prer.res2d.reset_index().rename(
                        columns={
                            "Term": "Term",
                            "NES": "NES",
                            "FDR q-val": "FDR",
                            "Lead_genes": "Genes",
                        }
                    )
                except Exception:
                    res_df = prer.res2d

            gsea_summary[setup][d] = res_df

    return {
        "normalized_coeffs": norm,
        "matrices": {"HumanToMouse": mat_h2m, "MouseToHuman": mat_m2h},
        "gsea_summary": gsea_summary,
        "common_genes": common_genes,
        "outdir": outdir,
    }


# --------------------
# 8. Example run snippet
# --------------------
results = post_analysis_from_coeffs(
    coeff_dir="model_coefficients",  # folder with your CSVs
    outdir="gsea",
)
print("Analysis complete. Outputs saved in:", results["outdir"])

  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = gp.prerank(rnk=str(rnk_path), gene_sets=gs, processes=4,
  prer = g

Analysis complete. Outputs saved in: gsea


In [7]:
# save results
import pickle

with open(os.path.join(results["outdir"], "post_analysis_results.pkl"), "wb") as f:
    pickle.dump(results, f)