# Pathway Enrichment Analysis

This notebook demonstrates pathway enrichment analysis using features and signatures from the RAG platform.

In [None]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from notebook_utils import get_api_client, demo_mode_banner, display_error

# Connect to the API, falling back to demo mode when it is unavailable.
# API_URL takes precedence over AMPRENTA_API_URL; an unset or empty API_URL
# falls through to the second variable because of the `or`.
api_url = os.environ.get("API_URL") or os.environ.get("AMPRENTA_API_URL")

# get_api_client hands back a (client, demo_mode) pair; both values are reused
# by the cells below to decide whether live API calls may be attempted.
client, demo_mode = get_api_client(api_url=api_url)

if demo_mode or client is None:
    demo_mode_banner()
    # The em dash is written as an escape so the message survives any
    # encoding round-trip of the notebook file.
    print("API unavailable \u2014 demo mode enabled.")
else:
    # Prefer the URL reported by the client; fall back to the configured one.
    print(f"Connected to {getattr(client, 'api_url', api_url)}")

## Load Features and Signatures

In [None]:
# Fetch datasets and signatures from the API; in demo mode (or on any API
# failure) both collections are left empty so later cells use their fallbacks.
try:
    if not demo_mode and client is not None:
        datasets = client.datasets.list(limit=10)
        signatures = client.signatures.list(limit=5)
    else:
        datasets, signatures = [], []

    if datasets:
        # Summarise only the first dataset returned.
        dataset = datasets[0]
        print(f"Dataset: {getattr(dataset, 'name', '(unnamed)')}")

        # A missing or None `feature_ids` is treated as an empty mapping.
        feature_ids = getattr(dataset, "feature_ids", None) or {}
        try:
            # Total count over all values; None values count as zero.
            n_features = sum(len(ids or []) for ids in feature_ids.values())
        except Exception:
            # Best-effort count: an unexpected structure reports zero.
            n_features = 0
        print(f"Features (total across types): {n_features}")

    print(f"Found {len(signatures)} signatures")
except Exception as exc:
    # Any failure above switches the notebook to demo mode with empty inputs.
    datasets, signatures = [], []
    demo_mode = True
    demo_mode_banner()
    display_error("Failed to load datasets/signatures from API", details=repr(exc))

## Map Features to KEGG Pathways and Compute Enrichment

In [None]:
# Map features to pathways + compute enrichment (best-effort).
# Produces `df_enrichment`: one row per enriched pathway, or an empty frame
# when nothing is enriched or the enrichment step fails.
import numpy as np

# Pick an input feature set from a real signature if possible
feature_names = []
feature_types = set()

if not (demo_mode or client is None) and signatures:
    try:
        # Only the first listed signature is used as the input feature set.
        sig = client.signatures.get(signatures[0].id)
        for c in getattr(sig, "components", []) or []:
            if getattr(c, "feature_name", None):
                feature_names.append(c.feature_name)

            raw_type = getattr(c, "feature_type", None)
            if raw_type is None:
                # Skip instead of recording the literal string "None".
                continue
            # NOTE(review): feature_type may be an Enum rather than a plain
            # string (not visible from this notebook). An Enum member's str()
            # is typically "ClassName.MEMBER", which would never match the
            # supported type names below, so prefer `.value` when it exists
            # and normalise case/whitespace.
            ft = str(getattr(raw_type, "value", raw_type)).strip().lower()
            if ft:
                feature_types.add(ft)
    except Exception as e:
        display_error("Failed to load signature components", details=repr(e))

# Fallback feature list for demo mode
if not feature_names:
    feature_names = ["BRCA1", "TP53", "EGFR", "MYC", "PIK3CA"]

# Map platform feature types to pathway mapper inputs where possible
# (KEGG mapper supports gene/protein/metabolite)
input_feature_types = {t for t in feature_types if t in {"gene", "protein", "metabolite"}}
if not input_feature_types:
    # No recognised type: default to genes, matching the fallback feature list.
    input_feature_types = {"gene"}

# Duplicated feature names collapse here.
input_features = set(feature_names)
print(f"Using {len(input_features)} input features; types={sorted(input_feature_types)}")

try:
    # Imported lazily so that a missing package is reported through
    # display_error below instead of aborting the cell.
    from amprenta_rag.analysis.pathway.enrichment import perform_pathway_enrichment

    results = perform_pathway_enrichment(
        input_features=input_features,
        input_feature_types=set(input_feature_types),
        pathway_sources=["KEGG"],
        p_value_threshold=0.2,
    )

    enrichment_rows = []
    for r in results:
        # Read each attribute defensively so a single incomplete result does
        # not abort the whole table.
        pathway = getattr(r, "pathway", None)
        adj_p = getattr(r, "adjusted_p_value", None)
        # -log10 is only defined for positive p-values; missing, zero, NaN or
        # negative values are recorded as None rather than inf/NaN.
        neg_log_adj_p = -np.log10(adj_p) if adj_p is not None and adj_p > 0 else None
        enrichment_rows.append(
            {
                "pathway": getattr(pathway, "id", None),
                "name": getattr(pathway, "name", None),
                "input_features": getattr(r, "input_features", None),
                "pathway_size": getattr(r, "pathway_size", None),
                "adjusted_p_value": adj_p,
                "-log10(adj_p)": neg_log_adj_p,
                "enrichment_ratio": getattr(r, "enrichment_ratio", None),
            }
        )

    df_enrichment = pd.DataFrame(enrichment_rows)
    if df_enrichment.empty:
        print("No enriched pathways found (or mapping returned none).")
    else:
        # Most significant pathways first.
        df_enrichment = df_enrichment.sort_values("adjusted_p_value")
        display(df_enrichment.head(15))

except Exception as e:
    # Keep `df_enrichment` defined (empty) so downstream cells can test it.
    df_enrichment = pd.DataFrame()
    display_error("Pathway enrichment failed", details=repr(e))

## Enrichment Results

In [None]:
# Show the enrichment table built earlier in `df_enrichment`, limited to the
# known result columns that are actually present.
if df_enrichment is not None and not getattr(df_enrichment, "empty", True):
    cols = [
        wanted
        for wanted in (
            "pathway",
            "name",
            "input_features",
            "pathway_size",
            "adjusted_p_value",
            "-log10(adj_p)",
            "enrichment_ratio",
        )
        if wanted in df_enrichment.columns
    ]
    display(df_enrichment[cols].head(20))
else:
    print("No enrichment results to display.")

## Visualize Enriched Pathways

In [None]:
# Bar chart of the top enriched pathways.
# Uses the "-log10(adj_p)" column that the enrichment cell writes into
# `df_enrichment`, and skips plotting when there is nothing to draw.
score_col = "-log10(adj_p)"

if (
    df_enrichment is None
    or getattr(df_enrichment, "empty", True)
    or not {"pathway", score_col}.issubset(df_enrichment.columns)
):
    print("No enrichment results to plot.")
else:
    # The score column can hold None (object dtype); coerce it to numeric and
    # drop rows that cannot be drawn before taking the first ten rows.
    top_pathways = (
        df_enrichment.assign(
            **{score_col: pd.to_numeric(df_enrichment[score_col], errors="coerce")}
        )
        .dropna(subset=["pathway", score_col])
        .head(10)
    )

    if top_pathways.empty:
        print("No enrichment results to plot.")
    else:
        fig, ax = plt.subplots(figsize=(10, 6))
        # `palette` without `hue` is deprecated in seaborn 0.13, so the y
        # variable is also assigned to `hue`; dodge=False keeps full-width
        # bars on older seaborn releases.
        sns.barplot(
            data=top_pathways,
            x=score_col,
            y="pathway",
            hue="pathway",
            dodge=False,
            palette="viridis",
            ax=ax,
        )
        # Older seaborn versions add a legend that only repeats the y labels.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
        ax.set_xlabel("-log10(adjusted p-value)")
        ax.set_ylabel("Pathway")
        ax.set_title("Top Enriched Pathways")
        fig.tight_layout()
        plt.show()