# Pathway Enrichment Analysis

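This notebook demonstrates a pathway enrichment analysis workflow using features and signatures from the RAG platform. The feature-to-pathway mapping and the p-values computed below are mock placeholders that illustrate the steps; they are not real enrichment results.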

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from amprenta_rag.client import RAGClient
import os

# Build the API client. The endpoint comes from the API_URL environment
# variable when it is set; otherwise the default below is used.
DEFAULT_API_URL = 'http://host.docker.internal:8000'
api_url = os.getenv('API_URL', DEFAULT_API_URL)
client = RAGClient(api_url=api_url)
# The message echoes the configured URL; whether the constructor itself
# contacts the server is not visible from this notebook.
print(f'Connected to {api_url}')

## Load Features and Signatures

In [None]:
# Request datasets (limit=10) and summarise the first one when the result is non-empty
datasets = client.datasets.list(limit=10)
if datasets:
    dataset = datasets[0]
    print(f"Dataset: {dataset.name}")
    # A dataset object without a `feature_ids` attribute is reported as 0 features
    feature_total = len(getattr(dataset, 'feature_ids', ()))
    print(f"Features: {feature_total}")

# Request signatures (limit=5) and report how many came back
signatures = client.signatures.list(limit=5)
signature_total = len(signatures)
print(f"Found {signature_total} signatures")

## Map Features to KEGG Pathways

In [None]:
# Illustrative (mock) gene-to-pathway lookup.
# In production, this would call amprenta_rag.analysis.pathway_enrichment
feature_names = ["BRCA1", "TP53", "EGFR", "MYC", "PIK3CA"]

# Hand-written mapping from each feature to a list of KEGG pathway IDs.
# NOTE(review): assignments are placeholders — verify against KEGG before reuse.
pathway_mapping = {
    "BRCA1": ["hsa03440"],   # Homologous recombination
    "TP53": ["hsa04115"],    # p53 signaling pathway
    "EGFR": ["hsa04012"],    # ErbB signaling pathway
    "MYC": ["hsa04068"],     # FoxO signaling pathway
    "PIK3CA": ["hsa04151"],  # PI3K-Akt signaling pathway
}

# Print one indented line per feature, in the order the features are listed above
print("Feature to pathway mapping:")
for gene in feature_names:
    print(f"  {gene}: {pathway_mapping[gene]}")

## Calculate Enrichment

In [None]:
from scipy.stats import fisher_exact  # not called yet; intended for the real calculation
import numpy as np

# Fixed seed: the p-values below are random placeholders, and seeding keeps the
# table (and the chart built from it) identical on every Restart & Run All.
ENRICHMENT_SEED = 42
rng = np.random.default_rng(ENRICHMENT_SEED)

# Example enrichment calculation (Fisher's exact test)
# In practice, you'd compare your feature set against a background.
# Step 1: count how many mapped features land in each pathway.
pathway_counts = {}
for pathways in pathway_mapping.values():
    for pathway in pathways:
        pathway_counts[pathway] = pathway_counts.get(pathway, 0) + 1

# Step 2: attach a mock p-value to every pathway.
enrichment_results = []
for pathway, count in pathway_counts.items():
    # Mock p-value drawn uniformly from [0.001, 0.1)  # Replace with real calculation
    p_value = rng.uniform(0.001, 0.1)
    enrichment_results.append({
        "pathway": pathway,
        "count": count,
        "p_value": p_value,
        "-log10(p)": -np.log10(p_value)
    })

# Step 3: tabulate, most significant first. Naming the columns explicitly keeps
# the sort from raising KeyError when no pathways were mapped (empty result list).
df_enrichment = pd.DataFrame(
    enrichment_results,
    columns=["pathway", "count", "p_value", "-log10(p)"],
).sort_values("p_value")
print(df_enrichment)

## Visualize Enriched Pathways

In [None]:
# Horizontal bar chart of the first ten rows of df_enrichment
# (the preceding cell sorts that frame by ascending p-value).
top_pathways = df_enrichment.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` — confirm against the installed seaborn version.
sns.barplot(data=top_pathways, x="-log10(p)", y="pathway", palette="viridis", ax=ax)
ax.set_xlabel("-log10(p-value)")
ax.set_ylabel("Pathway")
ax.set_title("Top Enriched Pathways")
fig.tight_layout()
plt.show()