# Comprehensive Enrichment Analysis

This notebook performs enrichment analysis using multiple methods:
- **GO Enrichment**: Gene Ontology enrichment using GOATOOLS
- **GO Slim Enrichment**: Gene Ontology Slim enrichment using GOATOOLS
- **FYPO Enrichment**: Fission Yeast Phenotype Ontology enrichment using GOATOOLS
- **STRING Enrichment**: Protein-protein interaction network enrichment using STRING API

## Input Requirements
Your input TSV file should contain at least these columns:
- `Systematic ID`: Gene systematic identifier
- `Name`: Gene name  
- `FYPOviability`: Viability information from FYPO
- `DeletionLibrary_essentiality`: Essentiality from deletion library
- `Cluster`: Cluster assignment for genes

## Output
- Enrichment tables and interactive Altair plots for each method
- Excel file with high-coverage results
- Interactive visualizations for data exploration


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import altair as alt
import requests
import io
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# GOATOOLS imports for GO and FYPO enrichment
from goatools.obo_parser import GODag
from goatools.anno.gaf_reader import GafReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.go_enrichment import GOEnrichmentStudy
from goatools.mapslim import mapslim

# Set up Altair for better visualization
alt.data_transformers.enable('json')
alt.renderers.enable('default')

print("Libraries imported successfully!")


Libraries imported successfully!


## Configuration and File Paths

Set up the paths to your data files and reference databases. Modify these paths according to your setup.


In [25]:
# Configuration - Modify these paths according to your setup
CONFIG = {
    # Input data file (TSV format)
    # 'input_file': '../../results/HD_DIT_HAP/20_gene_level_clustering/manual_cluster_labels_based_on_kmeans36.tsv',  # Update this path
    'input_file': '/data/c/yangyusheng_optimized/DIT_HAP_pipeline/tmp/curve_fitting_test/gene_level_clustering/gene_level_clustering_12clusters.tsv',  # Update this path
    
    # Reference database paths
    'go_obo_file': '../../resources/pombase_data/2025-06-01/ontologies_and_associations/go-basic.obo',
    'go_gaf_file': '../../resources/pombase_data/2025-06-01/ontologies_and_associations/go_style_gaf.tsv', 
    'go_slim_file': '../../resources/pombase_data/2025-06-01/ontologies_and_associations/goslim_pombe.obo',
    'fypo_obo_file': '../../resources/pombase_data/2025-06-01/ontologies_and_associations/fypo-simple.obo',
    'bp_slim_file': '../../resources/pombase_data/2025-06-01/ontologies_and_associations/bp_goslim_pombe_ids_and_names.tsv',
    'mf_slim_file': '../../resources/pombase_data/2025-06-01/ontologies_and_associations/mf_goslim_pombe_ids_and_names.tsv',
    'cc_slim_file': '../../resources/pombase_data/2025-06-01/ontologies_and_associations/cc_goslim_pombe_ids_and_names.tsv',
    # 'fypo_gaf_file': '../../resources/pombase_data/2025-06-01/ontologies_and_associations/pombase.gaf',
    
    # Output paths
    # 'output_dir': '../../results/HD_DIT_HAP/20_gene_level_clustering/',
    # 'excel_output': '../../results/HD_DIT_HAP/20_gene_level_clustering/comprehensive_enrichment_results.xlsx',

    'output_dir': '/data/c/yangyusheng_optimized/DIT_HAP_pipeline/tmp/curve_fitting_test/gene_level_clustering/',
    'excel_output': '/data/c/yangyusheng_optimized/DIT_HAP_pipeline/tmp/curve_fitting_test/gene_level_clustering/comprehensive_enrichment_results.xlsx',
    
    # Analysis parameters
    'fdr_threshold': 0.05,
    'coverage_threshold': 0.5,
    'exclude_clusters': ['Miscellaneous']  # Clusters to exclude from analysis
}

print("Configuration loaded successfully!")
print(f"Input file: {CONFIG['input_file']}")
print(f"Output directory: {CONFIG['output_dir']}")
print(f"FDR threshold: {CONFIG['fdr_threshold']}")
print(f"Coverage threshold: {CONFIG['coverage_threshold']}")


Configuration loaded successfully!
Input file: /data/c/yangyusheng_optimized/DIT_HAP_pipeline/tmp/curve_fitting_test/gene_level_clustering/gene_level_clustering_12clusters.tsv
Output directory: /data/c/yangyusheng_optimized/DIT_HAP_pipeline/tmp/curve_fitting_test/gene_level_clustering/
FDR threshold: 0.05
Coverage threshold: 0.5


## Helper Functions

These functions handle the core enrichment analysis logic for different methods.


In [6]:
def load_GO_data(obo_file, gaf_file):
    """
    Load GO ontology and gene associations.
    
    Parameters:
    -----------
    obo_file : str or Path
        Path to the OBO file containing ontology definitions
    gaf_file : str or Path  
        Path to the GAF file containing gene associations
        
    Returns:
    --------
    tuple : (GODag, dict)
        GO ontology DAG and namespace-to-associations dictionary
    """
    try:
        # Load GO ontology
        print(f"Loading ontology from: {obo_file}")
        godag = GODag(str(obo_file))
        
        # Load gene associations
        print(f"Loading gene associations from: {gaf_file}")
        gaf_reader = GafReader(str(gaf_file))
        
        # Group associations by namespace
        ns2assoc = {}
        for ns in ['BP', 'MF', 'CC']:  # Biological Process, Molecular Function, Cellular Component
            ns2assoc[ns] = gaf_reader.get_id2gos_nss(namespace=ns, godag=godag)
            
        print(f"Loaded {len(godag)} GO terms")
        print(f"Namespaces: {list(ns2assoc.keys())}")
        
        return godag, ns2assoc
        
    except Exception as e:
        print(f"Error loading GO data: {e}")
        return None, None

def GOEA(query_genes, bg_genes, godag, ns2assoc, propagate_counts=False, **kwargs):
    """
    Perform GO Enrichment Analysis.
    
    Parameters:
    -----------
    query_genes : list
        List of query gene IDs
    bg_genes : list
        List of background gene IDs
    godag : GODag
        GO ontology DAG
    ns2assoc : dict
        Namespace-to-associations dictionary
        
    Returns:
    --------
    list : Enrichment results
    """
    try:
        # Initialize enrichment study
        goeaobj = GOEnrichmentStudyNS(
            bg_genes,
            ns2assoc,
            godag,
            propagate_counts=propagate_counts,
            alpha=0.05,
            methods=['fdr_bh']
        )
        
        # Run enrichment analysis
        goea_results = goeaobj.run_study(query_genes)
        
        return goea_results
        
    except Exception as e:
        print(f"Error in GOEA: {e}")
        return []

def format_GOEA_results(goea_results, cluster_name=""):
    """
    Format GOEA results into a pandas DataFrame.
    
    Parameters:
    -----------
    goea_results : list
        Results from GOEA function
    cluster_name : str
        Name of the cluster being analyzed
        
    Returns:
    --------
    pd.DataFrame : Formatted results
    """
    if not goea_results:
        return pd.DataFrame()
        
    try:
        results_data = []
        for result in goea_results:
            results_data.append({
                'GO': result.GO,
                'NS': result.NS,
                'name': result.name,
                'enrichment': result.enrichment,
                'p_uncorrected': result.p_uncorrected,
                'p_fdr_bh': result.p_fdr_bh,
                'study_count': result.study_count,
                'pop_count': result.pop_count,
                'study_n': result.study_n,
                'pop_n': result.pop_n,
                'ratio_in_study': result.ratio_in_study,
                'ratio_in_pop': result.ratio_in_pop,
                'study_items': ','.join(result.study_items) if result.study_items else '',
                'pop_items': ','.join(result.pop_items) if result.pop_items else '',
                'cluster': cluster_name
            })
            
        df = pd.DataFrame(results_data)
        return df
        
    except Exception as e:
        print(f"Error formatting GOEA results: {e}")
        return pd.DataFrame()

print("GO analysis functions loaded successfully!")


GO analysis functions loaded successfully!


In [4]:
def parse_string_enrichment(query_genes, bg_genes, max_retries=3, retry_delay=5):
    """
    Perform STRING enrichment analysis using the STRING API.
    
    Parameters:
    -----------
    query_genes : list
        List of query gene IDs
    bg_genes : list
        List of background gene IDs
    max_retries : int
        Maximum number of retry attempts
    retry_delay : int
        Delay between retries in seconds
        
    Returns:
    --------
    pd.DataFrame : STRING enrichment results
    """
    import time
    
    try:
        # Get STRING API version and URL
        output_format = "tsv"
        string_api_url = "https://string-db.org/api"
        
        # Step 1: Get STRING IDs for background genes
        method = "get_string_ids"
        species = "4896"  # S. pombe species ID
        
        params = {
            "identifiers": "\r".join(bg_genes),
            "species": species,
            "limit": 1,
            "echo_query": 1,
            "caller_identity": "enrichment_analysis"
        }
        
        print("Getting STRING IDs for background genes...")
        request_url = "/".join([string_api_url, output_format, method])
        
        for attempt in range(max_retries):
            try:
                response = requests.post(request_url, data=params)
                response.raise_for_status()
                break
            except requests.exceptions.RequestException as e:
                if attempt < max_retries - 1:
                    print(f"Attempt {attempt + 1} failed, retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    raise e
        
        # Parse STRING IDs
        bg_string_data = pd.read_csv(io.StringIO(response.text), sep="\t")
        bg_string_ids = bg_string_data['stringId'].tolist()
        
        print(f"Found {len(bg_string_ids)} STRING IDs for background")
        
        # Step 2: Perform enrichment analysis
        method = "enrichment"
        params = {
            "identifiers": "\r".join(query_genes),
            "species": species,
            "background_string_identifiers": "%0d".join(bg_string_ids),
            "caller_identity": "enrichment_analysis"
        }
        
        print("Performing STRING enrichment analysis...")
        request_url = "/".join([string_api_url, output_format, method])
        
        for attempt in range(max_retries):
            try:
                response = requests.post(request_url, data=params)
                response.raise_for_status()
                break
            except requests.exceptions.RequestException as e:
                if attempt < max_retries - 1:
                    print(f"Attempt {attempt + 1} failed, retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    raise e
        
        # Parse results
        enrichment_df = pd.read_csv(io.StringIO(response.text), sep="\t")
        
        print(f"STRING enrichment completed: {len(enrichment_df)} results")
        return enrichment_df
        
    except Exception as e:
        print(f"Error in STRING enrichment: {e}")
        return pd.DataFrame()

def create_enrichment_plot(data, title, x_col='p_fdr_bh', y_col='name', 
                          color_col='p_fdr_bh', size_col='study_count', column = None):
    """
    Create an interactive Altair plot for enrichment results.
    
    Parameters:
    -----------
    data : pd.DataFrame
        Enrichment results data
    title : str
        Plot title
    x_col : str
        Column for x-axis (typically p-value)
    y_col : str
        Column for y-axis (typically term names)
    color_col : str
        Column for color encoding
    size_col : str
        Column for size encoding
        
    Returns:
    --------
    alt.Chart : Altair chart object
    """
    if data.empty:
        print(f"No data available for {title}")
        return alt.Chart().mark_text(text="No significant results")
    
    # Prepare data
    plot_data = data.copy()
    plot_data['-log10(FDR)'] = -np.log10(plot_data[x_col] + 1e-300)  # Add small value to avoid log(0)
    
    # Create scatter plot
    scatter = alt.Chart(plot_data).mark_circle().encode(
        x=alt.X(f'{x_col}:N', title=f'{x_col}', axis=alt.Axis(grid=True)),
        y=alt.Y(f'{y_col}:N', sort=alt.EncodingSortField(field=x_col, order='ascending'), 
                title='Enriched Terms', axis=alt.Axis(grid=True)),
        color=alt.Color(f'{color_col}:N', title=f'{color_col}', axis=alt.Axis(grid=True)),
        size=alt.Size(f'{size_col}:Q', title=f'{size_col}', axis=alt.Axis(grid=True)),
        tooltip=[y_col, x_col, size_col, color_col],
        column=column
    )
    
    return scatter

print("STRING and plotting functions loaded successfully!")


STRING and plotting functions loaded successfully!


## Data Loading and Preprocessing

Load the input data and prepare it for enrichment analysis.


In [9]:
# Load input data
print("Loading input data...")
try:
    # Load the input TSV file
    input_data = pd.read_csv(CONFIG['input_file'], sep='\t')
    print(f"Loaded {len(input_data)} genes from {CONFIG['input_file']}")
    
    # Display basic info about the dataset
    print(f"\nDataset shape: {input_data.shape}")
    print(f"Columns: {list(input_data.columns)}")
    
    # Check required columns
    required_columns = ['Systematic ID', 'Name', 'FYPOviability', 'DeletionLibrary_essentiality']
    cluster_columns = [col for col in input_data.columns if col.startswith('Clustering_')]
    print(f"Cluster columns: {cluster_columns}")
    missing_columns = [col for col in required_columns if col not in input_data.columns]
    
    if missing_columns:
        print(f"Warning: Missing required columns: {missing_columns}")
    else:
        print("All required columns present!")
    
    # Display first few rows
    print(f"\nFirst 5 rows:")
    display(input_data.head())
    
except Exception as e:
    print(f"Error loading input data: {e}")
    print("Please check the file path and format in CONFIG['input_file']")
    input_data = None


Loading input data...
Loaded 4519 genes from /data/c/yangyusheng_optimized/DIT_HAP_pipeline/tmp/curve_fitting_test/gene_level_clustering/gene_level_clustering_12clusters.tsv

Dataset shape: (4519, 29)
Columns: ['Systematic ID', 'Name', 'FYPOviability', 'DeletionLibrary_essentiality', 'time_points', 'Status', 'A', 'um', 'lam', 'R2', 'RMSE', 'normalized_RMSE', 't0', 't1', 't2', 't3', 't4', 't0_fitted', 't1_fitted', 't2_fitted', 't3_fitted', 't4_fitted', 't0_residual', 't1_residual', 't2_residual', 't3_residual', 't4_residual', 'Clustering_using_A_um_lam', 'Clustering_using_um_lam']
Cluster columns: ['Clustering_using_A_um_lam', 'Clustering_using_um_lam']
All required columns present!

First 5 rows:


Unnamed: 0,Systematic ID,Name,FYPOviability,DeletionLibrary_essentiality,time_points,Status,A,um,lam,R2,...,t2_fitted,t3_fitted,t4_fitted,t0_residual,t1_residual,t2_residual,t3_residual,t4_residual,Clustering_using_A_um_lam,Clustering_using_um_lam
0,SPAC1002.02,pom34,viable,V,"[0.0, 2.352, 5.588, 9.104, 12.48]",Success,0.084,-0.005,0.002,-0.46,...,0.0,0.0,0.0,-0.006,0.029,-0.148,0.0,-0.265,0,8
1,SPAC1002.03c,gls2,viable,V,"[0.0, 2.352, 5.588, 9.104, 12.48]",Success,0.254,0.025,-0.0,0.253,...,0.138,0.2,0.231,-0.017,0.104,-0.164,0.187,-0.09,0,8
2,SPAC1002.04c,taf11,inviable,E,"[0.0, 2.352, 5.588, 9.104, 12.48]",Success,8.688,0.73,2.648,0.997,...,2.166,4.663,6.515,-0.06,0.192,-0.192,0.144,-0.053,6,3
3,SPAC1002.05c,jmj2,viable,V,"[0.0, 2.352, 5.588, 9.104, 12.48]",Success,0.259,-0.017,0.058,-0.747,...,0.0,0.0,0.0,-0.018,0.047,-0.121,-0.147,-0.123,0,8
4,SPAC1002.06c,bqt2,viable,V,"[0.0, 2.352, 5.588, 9.104, 12.48]",Success,-0.133,-0.022,-0.0,0.207,...,-0.108,-0.128,-0.132,0.009,-0.073,-0.038,-0.02,0.079,0,8


In [10]:
cluster_genes = {}

# Prepare data for enrichment analysis
if input_data is not None:
    # Get background genes (all genes in the dataset)
    bg_genes = input_data['Systematic ID'].dropna().unique().tolist()
    print(f"Background genes: {len(bg_genes)}")
    
    # Get clusters for analysis (excluding specified clusters)
    for cluster_column in cluster_columns:
        available_clusters = input_data[cluster_column].dropna().unique().tolist()
        analysis_clusters = [c for c in available_clusters if c not in CONFIG['exclude_clusters']]
        cluster_genes[cluster_column] = {}
    
        for cluster in analysis_clusters:
            cluster_data = input_data[input_data[cluster_column] == cluster]
            genes = cluster_data['Systematic ID'].dropna().unique().tolist()
            cluster_genes[cluster_column][cluster] = genes
            print(f"Cluster {cluster_column} {cluster}: {len(genes)} genes")
        
    print(f"\nReady for enrichment analysis with {len(analysis_clusters)} clusters")
else:
    print("Cannot proceed without input data. Please fix the data loading issue first.")


Background genes: 4519
Cluster Clustering_using_A_um_lam 0: 1683 genes
Cluster Clustering_using_A_um_lam 6: 348 genes
Cluster Clustering_using_A_um_lam 9: 260 genes
Cluster Clustering_using_A_um_lam 1: 265 genes
Cluster Clustering_using_A_um_lam 10: 468 genes
Cluster Clustering_using_A_um_lam 4: 331 genes
Cluster Clustering_using_A_um_lam 3: 296 genes
Cluster Clustering_using_A_um_lam 2: 345 genes
Cluster Clustering_using_A_um_lam 5: 124 genes
Cluster Clustering_using_A_um_lam 7: 221 genes
Cluster Clustering_using_A_um_lam 8: 90 genes
Cluster Clustering_using_A_um_lam 11: 88 genes
Cluster Clustering_using_um_lam 8: 1617 genes
Cluster Clustering_using_um_lam 3: 311 genes
Cluster Clustering_using_um_lam 2: 227 genes
Cluster Clustering_using_um_lam 7: 155 genes
Cluster Clustering_using_um_lam 11: 245 genes
Cluster Clustering_using_um_lam 1: 452 genes
Cluster Clustering_using_um_lam 0: 336 genes
Cluster Clustering_using_um_lam 6: 249 genes
Cluster Clustering_using_um_lam 4: 302 genes
Clust

## GO Enrichment Analysis

Perform Gene Ontology enrichment analysis using GOATOOLS.


In [11]:
# Load GO data and perform enrichment analysis
if input_data is not None and cluster_genes:
    print("=" * 50)
    print("GO ENRICHMENT ANALYSIS")
    print("=" * 50)
    
    # Load GO ontology and associations
    go_dag, go_ns2assoc = load_GO_data(CONFIG['go_obo_file'], CONFIG['go_gaf_file'])
    
    if go_dag is not None and go_ns2assoc is not None:
        # Perform GO enrichment for each cluster
        go_results = {}
        go_enrichment_summary = pd.DataFrame()
        for cluster_column, genes in cluster_genes.items():
            go_results[cluster_column] = {}
            for cluster, query_genes in genes.items():
                print(f"\nAnalyzing cluster: {cluster_column} - {cluster} ({len(query_genes)} genes)")
            
                # Perform GOEA
                goea_results = GOEA(query_genes, bg_genes, go_dag, go_ns2assoc)
                
                # Format results
                formatted_results = format_GOEA_results(goea_results, cluster)
                
                if not formatted_results.empty:
                    # Filter significant results
                    significant = formatted_results[formatted_results['p_fdr_bh'] < CONFIG['fdr_threshold']]
                    go_results[cluster_column][cluster] = significant
                    print(f"  Found {len(significant)} significant GO terms")
                else:
                    print(f"  No significant results for cluster {cluster}")
            
            # Combine all results
            if go_results:
                go_enrichment_df = pd.concat(go_results[cluster_column].values(), ignore_index=True)
                print(f"\nTotal GO enrichment results: {len(go_enrichment_df)}")
                
                # Display summary
                print(f"Results by namespace:")
                for ns in go_enrichment_df['NS'].unique():
                    count = len(go_enrichment_df[go_enrichment_df['NS'] == ns])
                    print(f"  {ns}: {count} terms")
            else:
                go_enrichment_df = pd.DataFrame()
                print("No significant GO enrichment results found.")
            go_enrichment_df["Method"] = cluster_column
            go_enrichment_summary = pd.concat([go_enrichment_summary, go_enrichment_df], ignore_index=True)
    else:
        go_enrichment_summary = pd.DataFrame()
        print("Could not load GO data. Skipping GO enrichment analysis.")
else:
    go_enrichment_summary = pd.DataFrame()
    print("Skipping GO enrichment analysis - no valid input data or clusters.")


GO ENRICHMENT ANALYSIS
Loading ontology from: ../../resources/pombase_data/2025-06-01/ontologies_and_associations/go-basic.obo
../../resources/pombase_data/2025-06-01/ontologies_and_associations/go-basic.obo: fmt(1.2) rel(2025-05-31) 43,448 Terms
Loading gene associations from: ../../resources/pombase_data/2025-06-01/ontologies_and_associations/go_style_gaf.tsv
HMS:0:00:00.964291  49,924 annotations READ: ../../resources/pombase_data/2025-06-01/ontologies_and_associations/go_style_gaf.tsv 
Loaded 43448 GO terms
Namespaces: ['BP', 'MF', 'CC']

Analyzing cluster: Clustering_using_A_um_lam - 0 (1683 genes)

Load BP Ontology Enrichment Analysis ...
 98%  4,409 of  4,519 population items found in association

Load CC Ontology Enrichment Analysis ...
 98%  4,409 of  4,519 population items found in association

Load MF Ontology Enrichment Analysis ...
 98%  4,409 of  4,519 population items found in association

Runing BP Ontology Analysis: current study set of 1683 IDs.
 96%  1,615 of  1,683 

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ff4cbd1a0c0>>
Traceback (most recent call last):
  File "/data/a/yangyusheng/miniforge3/envs/bioinformatics/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


100%    265 of    265 study items found in association
100%    265 of    265 study items found in population(4519)
Calculating 4,524 uncorrected p-values using fisher_scipy_stats
   4,524 terms are associated with  4,409 of  4,519 population items
     883 terms are associated with    265 of    265 study items
  METHOD fdr_bh:
      25 GO terms found significant (< 0.05=alpha) ( 24 enriched +   1 purified): statsmodels fdr_bh
     133 study items associated with significant GO IDs (enriched)
       2 study items associated with significant GO IDs (purified)

Runing CC Ontology Analysis: current study set of 265 IDs.
100%    265 of    265 study items found in association
100%    265 of    265 study items found in population(4519)
Calculating 4,524 uncorrected p-values using fisher_scipy_stats
   4,524 terms are associated with  4,409 of  4,519 population items
     883 terms are associated with    265 of    265 study items
  METHOD fdr_bh:
      25 GO terms found significant (< 0.05=alp

In [17]:
go_enrichment_summary["coverage_frac"] = go_enrichment_summary["study_count"]/go_enrichment_summary["pop_count"]
go_enrichment_summary.to_csv(Path(CONFIG["output_dir"])/"go_enrichment_summary_for_manual_cluster_labels.tsv", sep="\t", index=False)

In [15]:
# Create GO enrichment visualization
if not go_enrichment_summary.empty:
    print("Creating GO enrichment visualization...")
    
    # Create plot for each namespace
    go_plots = []
    for ns in go_enrichment_summary['NS'].unique():
        ns_data = go_enrichment_summary[go_enrichment_summary['NS'] == ns].copy()
        
        # Sort by p-value and take top 20 results
        ns_data = ns_data.sort_values('p_fdr_bh').head(20)
        
        if not ns_data.empty:
            plot = create_enrichment_plot(
                ns_data, 
                title=f"GO {ns} Enrichment",
                x_col='cluster',
                y_col='name',
                color_col='p_fdr_bh',
                size_col='study_count',
                column="Method"
            )
            go_plots.append(plot)
    
    # Display plots
    if go_plots:
        print("GO Enrichment Results:")
        for i, plot in enumerate(go_plots):
            ns = go_enrichment_df['NS'].unique()[i]
            print(f"\n{ns} Namespace:")
            display(plot)
    
    # Display top results table
    print("\nTop 10 GO Enrichment Results:")
    top_go = go_enrichment_df.sort_values('p_fdr_bh').head(10)[
        ['cluster', 'NS', 'name', 'p_fdr_bh', 'study_count', 'pop_count']
    ]
    display(top_go)
    
else:
    print("No GO enrichment results to visualize.")


Creating GO enrichment visualization...
GO Enrichment Results:

CC Namespace:


SchemaValidationError: Multiple errors were found.

Error 1: `Color` has no parameter named 'axis'

    Existing parameter names are:
    shorthand      bin         legend   timeUnit   
    aggregate      condition   scale    title      
    bandPosition   field       sort     type       

    See the help for `Color` to read the full description of these parameters

Error 2: `Size` has no parameter named 'axis'

    Existing parameter names are:
    shorthand      bin         legend   timeUnit   
    aggregate      condition   scale    title      
    bandPosition   field       sort     type       

    See the help for `Size` to read the full description of these parameters

alt.Chart(...)


BP Namespace:


SchemaValidationError: Multiple errors were found.

Error 1: `Color` has no parameter named 'axis'

    Existing parameter names are:
    shorthand      bin         legend   timeUnit   
    aggregate      condition   scale    title      
    bandPosition   field       sort     type       

    See the help for `Color` to read the full description of these parameters

Error 2: `Size` has no parameter named 'axis'

    Existing parameter names are:
    shorthand      bin         legend   timeUnit   
    aggregate      condition   scale    title      
    bandPosition   field       sort     type       

    See the help for `Size` to read the full description of these parameters

alt.Chart(...)


MF Namespace:


SchemaValidationError: Multiple errors were found.

Error 1: `Color` has no parameter named 'axis'

    Existing parameter names are:
    shorthand      bin         legend   timeUnit   
    aggregate      condition   scale    title      
    bandPosition   field       sort     type       

    See the help for `Color` to read the full description of these parameters

Error 2: `Size` has no parameter named 'axis'

    Existing parameter names are:
    shorthand      bin         legend   timeUnit   
    aggregate      condition   scale    title      
    bandPosition   field       sort     type       

    See the help for `Size` to read the full description of these parameters

alt.Chart(...)


Top 10 GO Enrichment Results:


Unnamed: 0,cluster,NS,name,p_fdr_bh,study_count,pop_count
90,3,BP,mitochondrial translation,9.106041e-36,53,91
108,3,BP,mitochondrial translation,9.106041e-36,53,91
126,3,BP,mitochondrial translation,9.106041e-36,53,91
96,3,CC,mitochondrial large ribosomal subunit,4.912771e-16,24,38
132,3,CC,mitochondrial large ribosomal subunit,4.912771e-16,24,38
114,3,CC,mitochondrial large ribosomal subunit,4.912771e-16,24,38
414,4,BP,cytoplasmic translation,6.517849e-13,35,105
306,4,BP,cytoplasmic translation,6.517849e-13,35,105
360,4,BP,cytoplasmic translation,6.517849e-13,35,105
123,3,MF,structural constituent of ribosome,7.770187e-13,41,144


## GO Slim Enrichment Analysis

Perform Gene Ontology Slim enrichment analysis for a more focused view of biological processes.


In [None]:
# Load GO data and perform enrichment analysis
if input_data is not None and cluster_genes:
    print("=" * 50)
    print("GO ENRICHMENT ANALYSIS")
    print("=" * 50)
    
    # Load GO ontology and associations
    go_dag, go_ns2assoc = load_GO_data(CONFIG['go_obo_file'], CONFIG['go_gaf_file'])
    
    if go_dag is not None and go_ns2assoc is not None:
        # Perform GO enrichment for each cluster
        go_results_for_slim = {}
        go_enrichment_summary_for_slim = pd.DataFrame()
        for cluster_column, genes in cluster_genes.items():
            go_results_for_slim[cluster_column] = {}
            for cluster, query_genes in genes.items():
                print(f"\nAnalyzing cluster: {cluster_column} - {cluster} ({len(query_genes)} genes)")
            
                # Perform GOEA
                goea_results = GOEA(query_genes, bg_genes, go_dag, go_ns2assoc, propagate_counts=True)
                
                # Format results
                formatted_results = format_GOEA_results(goea_results, cluster)
                
                if not formatted_results.empty:
                    # Filter significant results
                    significant = formatted_results[formatted_results['p_fdr_bh'] < CONFIG['fdr_threshold']]
                    go_results_for_slim[cluster_column][cluster] = significant
                    print(f"  Found {len(significant)} significant GO terms")
                else:
                    print(f"  No significant results for cluster {cluster}")
            
            # Combine all results
            if go_results_for_slim:
                go_enrichment_df_for_slim = pd.concat(go_results_for_slim[cluster_column].values(), ignore_index=True)
                print(f"\nTotal GO enrichment results: {len(go_enrichment_df_for_slim)}")
                
                # Display summary
                print(f"Results by namespace:")
                for ns in go_enrichment_df_for_slim['NS'].unique():
                    count = len(go_enrichment_df_for_slim[go_enrichment_df_for_slim['NS'] == ns])
                    print(f"  {ns}: {count} terms")
            else:
                go_enrichment_df_for_slim = pd.DataFrame()
                print("No significant GO enrichment results found.")
            go_enrichment_df_for_slim["Method"] = cluster_column
            go_enrichment_summary_for_slim = pd.concat([go_enrichment_summary_for_slim, go_enrichment_df_for_slim], ignore_index=True)
    else:
        go_enrichment_summary_for_slim = pd.DataFrame()
        print("Could not load GO data. Skipping GO enrichment analysis.")
else:
    go_enrichment_summary_for_slim = pd.DataFrame()
    print("Skipping GO enrichment analysis - no valid input data or clusters.")


bp_slims = pd.read_csv(CONFIG['bp_slim_file'], sep="\t", header=None, names=["GO", "Name"])
mf_slims = pd.read_csv(CONFIG['mf_slim_file'], sep="\t", header=None, names=["GO", "Name"])
cc_slims = pd.read_csv(CONFIG['cc_slim_file'], sep="\t", header=None, names=["GO", "Name"])
go_slim_ids = list(set(bp_slims["GO"].tolist()) | set(mf_slims["GO"].tolist()) | set(cc_slims["GO"].tolist()))
go_slim_enrichment_df = go_enrichment_df_for_slim.query("GO in @go_slim_ids").copy()
go_slim_enrichment_df.to_csv(Path(CONFIG['output_dir']) / "go_slim_enrichment_results.tsv", sep="\t", index=False)

GO ENRICHMENT ANALYSIS
Loading ontology from: ../../resources/pombase_data/2025-06-01/ontologies_and_associations/go-basic.obo
../../resources/pombase_data/2025-06-01/ontologies_and_associations/go-basic.obo: fmt(1.2) rel(2025-05-31) 43,448 Terms
Loading gene associations from: ../../resources/pombase_data/2025-06-01/ontologies_and_associations/go_style_gaf.tsv
HMS:0:00:00.984079  49,924 annotations READ: ../../resources/pombase_data/2025-06-01/ontologies_and_associations/go_style_gaf.tsv 
Loaded 43448 GO terms
Namespaces: ['BP', 'MF', 'CC']

Analyzing cluster: Clustering_using_A_um_lam - 0 (1683 genes)

Load BP Ontology Enrichment Analysis ...
Propagating term counts up: is_a
 98%  4,409 of  4,519 population items found in association

Load CC Ontology Enrichment Analysis ...
Propagating term counts up: is_a
 98%  4,409 of  4,519 population items found in association

Load MF Ontology Enrichment Analysis ...
Propagating term counts up: is_a
 98%  4,409 of  4,519 population items found

In [40]:
# Create GO Slim enrichment visualization
if not go_slim_enrichment_df.empty:
    print("Creating GO Slim enrichment visualization...")
    
    # Create plot for each namespace
    go_slim_plots = []
    for ns in go_slim_enrichment_df['NS'].unique():
        ns_data = go_slim_enrichment_df[go_slim_enrichment_df['NS'] == ns].copy()
        
        # Sort by p-value and take top 20 results
        ns_data = ns_data.sort_values('p_fdr_bh').head(20)
        
        if not ns_data.empty:
            plot = create_enrichment_plot(
                ns_data, 
                title=f"GO Slim {ns} Enrichment",
                x_col='cluster',
                y_col='name',
                color_col='p_fdr_bh',
                size_col='study_count'
            )
            go_slim_plots.append(plot)
    
    # Display plots
    if go_slim_plots:
        print("GO Slim Enrichment Results:")
        for i, plot in enumerate(go_slim_plots):
            ns = go_slim_enrichment_df['NS'].unique()[i]
            print(f"\n{ns} Namespace:")
            display(plot)
    
    # Display top results table
    print("\nTop 10 GO Slim Enrichment Results:")
    top_go_slim = go_slim_enrichment_df.sort_values('p_fdr_bh').head(10)[
        ['cluster', 'NS', 'name', 'p_fdr_bh', 'study_count', 'pop_count']
    ]
    display(top_go_slim)
    
else:
    print("No GO Slim enrichment results to visualize.")


Creating GO Slim enrichment visualization...


TypeError: argument of type 'NoneType' is not iterable

## FYPO Enrichment Analysis

Perform Fission Yeast Phenotype Ontology enrichment analysis using GOATOOLS.


In [None]:
# FYPO enrichment analysis
if input_data is not None and cluster_genes:
    print("=" * 50)
    print("FYPO ENRICHMENT ANALYSIS")
    print("=" * 50)
    
    # Load FYPO ontology and associations
    fypo_dag, fypo_ns2assoc = load_GO_data(CONFIG['fypo_obo_file'], CONFIG['fypo_gaf_file'])
    
    if fypo_dag is not None and fypo_ns2assoc is not None:
        # Perform FYPO enrichment for each cluster
        fypo_results = {}
        
        for cluster, query_genes in cluster_genes.items():
            print(f"\nAnalyzing cluster: {cluster} ({len(query_genes)} genes)")
            
            # Perform FYPO enrichment (using the same GOEA function as it works with any OBO ontology)
            fypo_enrichment_results = GOEA(query_genes, bg_genes, fypo_dag, fypo_ns2assoc)
            
            # Format results
            formatted_results = format_GOEA_results(fypo_enrichment_results, cluster)
            
            if not formatted_results.empty:
                # Filter significant results
                significant = formatted_results[formatted_results['p_fdr_bh'] < CONFIG['fdr_threshold']]
                fypo_results[cluster] = significant
                print(f"  Found {len(significant)} significant FYPO terms")
            else:
                print(f"  No significant results for cluster {cluster}")
        
        # Combine all results
        if fypo_results:
            fypo_enrichment_df = pd.concat(fypo_results.values(), ignore_index=True)
            print(f"\nTotal FYPO enrichment results: {len(fypo_enrichment_df)}")
            
            # Display summary
            print(f"Results by namespace:")
            for ns in fypo_enrichment_df['NS'].unique():
                count = len(fypo_enrichment_df[fypo_enrichment_df['NS'] == ns])
                print(f"  {ns}: {count} terms")
        else:
            fypo_enrichment_df = pd.DataFrame()
            print("No significant FYPO enrichment results found.")
    else:
        fypo_enrichment_df = pd.DataFrame()
        print("Could not load FYPO data. Skipping FYPO enrichment analysis.")
else:
    fypo_enrichment_df = pd.DataFrame()
    print("Skipping FYPO enrichment analysis - no valid input data or clusters.")


In [None]:
# Create FYPO enrichment visualization
if not fypo_enrichment_df.empty:
    print("Creating FYPO enrichment visualization...")
    
    # FYPO usually has a single namespace, but let's handle multiple just in case
    fypo_plots = []
    for ns in fypo_enrichment_df['NS'].unique():
        ns_data = fypo_enrichment_df[fypo_enrichment_df['NS'] == ns].copy()
        
        # Sort by p-value and take top 20 results
        ns_data = ns_data.sort_values('p_fdr_bh').head(20)
        
        if not ns_data.empty:
            plot = create_enrichment_plot(
                ns_data, 
                title=f"FYPO {ns} Enrichment",
                x_col='p_fdr_bh',
                y_col='name',
                color_col='cluster',
                size_col='study_count'
            )
            fypo_plots.append(plot)
    
    # Display plots
    if fypo_plots:
        print("FYPO Enrichment Results:")
        for i, plot in enumerate(fypo_plots):
            ns = fypo_enrichment_df['NS'].unique()[i]
            print(f"\n{ns} Namespace:")
            display(plot)
    
    # Display top results table
    print("\nTop 10 FYPO Enrichment Results:")
    top_fypo = fypo_enrichment_df.sort_values('p_fdr_bh').head(10)[
        ['cluster', 'NS', 'name', 'p_fdr_bh', 'study_count', 'pop_count']
    ]
    display(top_fypo)
    
else:
    print("No FYPO enrichment results to visualize.")


## STRING Enrichment Analysis

Perform protein-protein interaction network enrichment analysis using the STRING database API.


In [None]:
# STRING enrichment analysis
if input_data is not None and cluster_genes:
    print("=" * 50)
    print("STRING ENRICHMENT ANALYSIS")
    print("=" * 50)
    
    # Perform STRING enrichment for each cluster
    string_results = {}
    
    for cluster, query_genes in cluster_genes.items():
        print(f"\nAnalyzing cluster: {cluster} ({len(query_genes)} genes)")
        
        # Perform STRING enrichment
        string_enrichment_result = parse_string_enrichment(query_genes, bg_genes)
        
        if not string_enrichment_result.empty:
            # Add cluster information
            string_enrichment_result['cluster'] = cluster
            
            # Filter significant results (using FDR column from STRING)
            if 'fdr' in string_enrichment_result.columns:
                significant = string_enrichment_result[string_enrichment_result['fdr'] < CONFIG['fdr_threshold']]
            else:
                # Fallback to using a different p-value column if available
                p_value_cols = [col for col in string_enrichment_result.columns if 'p' in col.lower()]
                if p_value_cols:
                    significant = string_enrichment_result[string_enrichment_result[p_value_cols[0]] < CONFIG['fdr_threshold']]
                else:
                    significant = string_enrichment_result
            
            string_results[cluster] = significant
            print(f"  Found {len(significant)} significant STRING enrichments")
        else:
            print(f"  No STRING enrichment results for cluster {cluster}")
    
    # Combine all results
    if string_results:
        string_enrichment_df = pd.concat(string_results.values(), ignore_index=True)
        print(f"\nTotal STRING enrichment results: {len(string_enrichment_df)}")
        
        # Display summary by category
        if 'category' in string_enrichment_df.columns:
            print(f"Results by category:")
            for cat in string_enrichment_df['category'].unique():
                count = len(string_enrichment_df[string_enrichment_df['category'] == cat])
                print(f"  {cat}: {count} terms")
    else:
        string_enrichment_df = pd.DataFrame()
        print("No significant STRING enrichment results found.")
else:
    string_enrichment_df = pd.DataFrame()
    print("Skipping STRING enrichment analysis - no valid input data or clusters.")


In [None]:
# Create STRING enrichment visualization
if not string_enrichment_df.empty:
    print("Creating STRING enrichment visualization...")
    
    # Determine the appropriate columns for visualization
    p_val_col = 'fdr' if 'fdr' in string_enrichment_df.columns else 'p_value'
    term_col = 'term' if 'term' in string_enrichment_df.columns else 'description'
    
    # Create plots by category if available
    if 'category' in string_enrichment_df.columns:
        string_plots = []
        for cat in string_enrichment_df['category'].unique():
            cat_data = string_enrichment_df[string_enrichment_df['category'] == cat].copy()
            
            # Sort by p-value and take top 20 results
            cat_data = cat_data.sort_values(p_val_col).head(20)
            
            if not cat_data.empty:
                # Create plot with appropriate column names
                plot = create_enrichment_plot(
                    cat_data, 
                    title=f"STRING {cat} Enrichment",
                    x_col=p_val_col,
                    y_col=term_col,
                    color_col='cluster',
                    size_col='number_of_genes' if 'number_of_genes' in cat_data.columns else 'inputGenes'
                )
                string_plots.append(plot)
        
        # Display plots
        if string_plots:
            print("STRING Enrichment Results:")
            for i, plot in enumerate(string_plots):
                cat = string_enrichment_df['category'].unique()[i]
                print(f"\n{cat} Category:")
                display(plot)
    else:
        # Single plot if no categories
        plot_data = string_enrichment_df.sort_values(p_val_col).head(20)
        plot = create_enrichment_plot(
            plot_data, 
            title="STRING Enrichment",
            x_col=p_val_col,
            y_col=term_col,
            color_col='cluster',
            size_col='number_of_genes' if 'number_of_genes' in plot_data.columns else 'inputGenes'
        )
        print("STRING Enrichment Results:")
        display(plot)
    
    # Display top results table
    print("\nTop 10 STRING Enrichment Results:")
    display_cols = ['cluster', 'category', term_col, p_val_col]
    if 'number_of_genes' in string_enrichment_df.columns:
        display_cols.append('number_of_genes')
    elif 'inputGenes' in string_enrichment_df.columns:
        display_cols.append('inputGenes')
    
    # Only include columns that exist
    display_cols = [col for col in display_cols if col in string_enrichment_df.columns]
    
    top_string = string_enrichment_df.sort_values(p_val_col).head(10)[display_cols]
    display(top_string)
    
else:
    print("No STRING enrichment results to visualize.")


## Results Summary and Export

Combine and summarize all enrichment results, then export to Excel for further analysis.


In [None]:
# Calculate coverage fractions and create high-coverage results
results_summary = {}

# Process GO results
if not go_enrichment_df.empty:
    go_enrichment_df['coverage_frac'] = go_enrichment_df['study_count'] / go_enrichment_df['pop_count']
    high_coverage_go = go_enrichment_df[go_enrichment_df['coverage_frac'] > CONFIG['coverage_threshold']].copy()
    results_summary['GO'] = {
        'total_results': len(go_enrichment_df),
        'high_coverage_results': len(high_coverage_go),
        'namespaces': go_enrichment_df['NS'].unique().tolist()
    }
    print(f"GO Enrichment: {len(go_enrichment_df)} total, {len(high_coverage_go)} high-coverage")
else:
    high_coverage_go = pd.DataFrame()
    results_summary['GO'] = {'total_results': 0, 'high_coverage_results': 0, 'namespaces': []}

# Process GO Slim results
if not go_slim_enrichment_df.empty:
    go_slim_enrichment_df['coverage_frac'] = go_slim_enrichment_df['study_count'] / go_slim_enrichment_df['pop_count']
    high_coverage_go_slim = go_slim_enrichment_df[go_slim_enrichment_df['coverage_frac'] > CONFIG['coverage_threshold']].copy()
    results_summary['GO_Slim'] = {
        'total_results': len(go_slim_enrichment_df),
        'high_coverage_results': len(high_coverage_go_slim),
        'namespaces': go_slim_enrichment_df['NS'].unique().tolist()
    }
    print(f"GO Slim Enrichment: {len(go_slim_enrichment_df)} total, {len(high_coverage_go_slim)} high-coverage")
else:
    high_coverage_go_slim = pd.DataFrame()
    results_summary['GO_Slim'] = {'total_results': 0, 'high_coverage_results': 0, 'namespaces': []}

# Process FYPO results
if not fypo_enrichment_df.empty:
    fypo_enrichment_df['coverage_frac'] = fypo_enrichment_df['study_count'] / fypo_enrichment_df['pop_count']
    high_coverage_fypo = fypo_enrichment_df[fypo_enrichment_df['coverage_frac'] > CONFIG['coverage_threshold']].copy()
    results_summary['FYPO'] = {
        'total_results': len(fypo_enrichment_df),
        'high_coverage_results': len(high_coverage_fypo),
        'namespaces': fypo_enrichment_df['NS'].unique().tolist()
    }
    print(f"FYPO Enrichment: {len(fypo_enrichment_df)} total, {len(high_coverage_fypo)} high-coverage")
else:
    high_coverage_fypo = pd.DataFrame()
    results_summary['FYPO'] = {'total_results': 0, 'high_coverage_results': 0, 'namespaces': []}

# Process STRING results
if not string_enrichment_df.empty:
    # Calculate coverage fraction for STRING (if we have the necessary columns)
    if 'number_of_genes' in string_enrichment_df.columns and 'number_of_genes_in_background' in string_enrichment_df.columns:
        string_enrichment_df['coverage_frac'] = string_enrichment_df['number_of_genes'] / string_enrichment_df['number_of_genes_in_background']
    else:
        string_enrichment_df['coverage_frac'] = 1.0  # Default if we can't calculate
    
    high_coverage_string = string_enrichment_df[string_enrichment_df['coverage_frac'] > CONFIG['coverage_threshold']].copy()
    
    categories = string_enrichment_df['category'].unique().tolist() if 'category' in string_enrichment_df.columns else []
    results_summary['STRING'] = {
        'total_results': len(string_enrichment_df),
        'high_coverage_results': len(high_coverage_string),
        'categories': categories
    }
    print(f"STRING Enrichment: {len(string_enrichment_df)} total, {len(high_coverage_string)} high-coverage")
else:
    high_coverage_string = pd.DataFrame()
    results_summary['STRING'] = {'total_results': 0, 'high_coverage_results': 0, 'categories': []}

# Display overall summary
print("\n" + "=" * 50)
print("ENRICHMENT ANALYSIS SUMMARY")
print("=" * 50)
for method, stats in results_summary.items():
    print(f"{method}:")
    print(f"  Total results: {stats['total_results']}")
    print(f"  High-coverage results: {stats['high_coverage_results']}")
    if 'namespaces' in stats:
        print(f"  Namespaces: {', '.join(stats['namespaces'])}")
    if 'categories' in stats:
        print(f"  Categories: {', '.join(stats['categories'][:5])}" + ("..." if len(stats['categories']) > 5 else ""))
    print()


In [None]:
# Export results to Excel
print("Exporting results to Excel...")

try:
    # Create output directory if it doesn't exist
    output_path = Path(CONFIG['excel_output'])
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    # Export to Excel with multiple sheets
    with pd.ExcelWriter(CONFIG['excel_output'], engine='openpyxl') as writer:
        # Export all results
        if not go_enrichment_df.empty:
            go_enrichment_df.to_excel(writer, sheet_name='GO_All', index=False)
        if not high_coverage_go.empty:
            high_coverage_go.to_excel(writer, sheet_name='GO_HighCoverage', index=False)
            
        if not go_slim_enrichment_df.empty:
            go_slim_enrichment_df.to_excel(writer, sheet_name='GOSlim_All', index=False)
        if not high_coverage_go_slim.empty:
            high_coverage_go_slim.to_excel(writer, sheet_name='GOSlim_HighCoverage', index=False)
            
        if not fypo_enrichment_df.empty:
            fypo_enrichment_df.to_excel(writer, sheet_name='FYPO_All', index=False)
        if not high_coverage_fypo.empty:
            high_coverage_fypo.to_excel(writer, sheet_name='FYPO_HighCoverage', index=False)
            
        if not string_enrichment_df.empty:
            string_enrichment_df.to_excel(writer, sheet_name='STRING_All', index=False)
        if not high_coverage_string.empty:
            high_coverage_string.to_excel(writer, sheet_name='STRING_HighCoverage', index=False)
        
        # Create summary sheet
        summary_df = pd.DataFrame.from_dict(results_summary, orient='index')
        summary_df.to_excel(writer, sheet_name='Summary', index=True)
    
    print(f"Results exported successfully to: {CONFIG['excel_output']}")
    
    # Also save individual CSV files for easy access
    csv_output_dir = Path(CONFIG['output_dir']) / 'enrichment_csv'
    csv_output_dir.mkdir(parents=True, exist_ok=True)
    
    for name, df in [
        ('GO_enrichment', go_enrichment_df),
        ('GO_slim_enrichment', go_slim_enrichment_df), 
        ('FYPO_enrichment', fypo_enrichment_df),
        ('STRING_enrichment', string_enrichment_df)
    ]:
        if not df.empty:
            csv_path = csv_output_dir / f'{name}.csv'
            df.to_csv(csv_path, index=False)
            print(f"Saved {name} to: {csv_path}")
    
    print(f"CSV files saved to: {csv_output_dir}")
    
except Exception as e:
    print(f"Error exporting results: {e}")
    print("Results are still available in the notebook variables for manual export.")


## Usage Instructions

### How to Use This Notebook

1. **Configure Paths**: Update the `CONFIG` dictionary with your specific file paths
2. **Run All Cells**: Execute all cells in order to perform the complete analysis
3. **Modify Parameters**: Adjust FDR thresholds and coverage thresholds as needed
4. **Add/Remove Clusters**: Modify the `exclude_clusters` list to control which clusters are analyzed

### Customization Options

- **FDR Threshold**: Change `CONFIG['fdr_threshold']` to be more or less stringent
- **Coverage Threshold**: Adjust `CONFIG['coverage_threshold']` for high-coverage filtering
- **Visualization**: Modify the `create_enrichment_plot` function for different plot styles
- **Export Format**: Add additional export formats in the results export section

### Troubleshooting

- **File Not Found**: Check that all paths in CONFIG point to existing files
- **No Results**: Try relaxing the FDR threshold or checking gene ID format compatibility
- **STRING API Issues**: The STRING API may be temporarily unavailable; results will be skipped automatically
- **Memory Issues**: For large datasets, consider processing clusters individually

### Output Files

- **Excel File**: Contains all results in separate sheets for easy browsing
- **CSV Files**: Individual CSV files for each enrichment method
- **Interactive Plots**: Displayed in the notebook for exploration


In [None]:
go_term_features = pd.read_csv("/data/c/yangyusheng_optimized/DIT_HAP_pipeline/results/HD_DIT_HAP/21_similarity_analysis_for_terms/go_curve_features_df.csv")
A_features = go_term_features.query("feature_name == 'A'").rename(columns={"features": "A"})
um_features = go_term_features.query("feature_name == 'um'").rename(columns={"features": "um"})
lam_features = go_term_features.query("feature_name == 'lam'").rename(columns={"features": "lam"})

shared_columns = ["GO_term", "GO_name", "GO_namespace", "GO_definition", "n_genes", "n_covered_genes", "coverage"]
go_term_features = pd.merge(A_features[shared_columns+["A"]], um_features[shared_columns+["um"]], on=shared_columns, how="left")
go_term_features = pd.merge(go_term_features, lam_features[shared_columns+["lam"]], on=shared_columns, how="left")

go_term_features.to_csv("/data/c/yangyusheng_optimized/DIT_HAP_pipeline/results/HD_DIT_HAP/21_similarity_analysis_for_terms/go_curve_features_df_with_features.csv", index=False)

In [None]:
go_enrichment_summary = go_enrichment_summary.merge(go_term_features, left_on="GO", right_on="GO_term", how="left")

NameError: name 'go_term_features' is not defined

In [None]:
plots = []

for ns in go_enrichment_summary['NS'].unique():
    plot = alt.Chart(go_enrichment_summary.query(f"NS == '{ns}' and cluster != 1 and enrichment=='e'")).mark_circle().encode(
        x=alt.X("cluster", title="Cluster"),
        y=alt.Y("name", title="GO Term", sort=alt.SortField(field="cluster", order="descending")).axis(grid=True, labelLimit=1000),
        color=alt.Color("p_fdr_bh", title="p-value", scale=alt.Scale(scheme="reds")),
        size=alt.Size("coverage_frac", title="Coverage Fraction"),
        column=alt.Column("NS", title="Namespace")
    )

    plots.append(plot)

# alt.hconcat(plots).resolve_scale(color='shared', size='shared')

In [None]:
go_enrichment_summary["coverage_frac"] = go_enrichment_summary["study_count"] / go_enrichment_summary["pop_count"]

In [None]:
method_orders = [
    "cluster_hierarchical_agg_curve_params",
    "cluster_kmeans_curve_params",
    "cluster_hierarchical_agg_um_lam_only",
    "cluster_kmeans_um_lam_only",
    "cluster_hierarchical_agg_fitted_plus_params",
    "cluster_kmeans_fitted_plus_params",
    "cluster_hierarchical_agg_raw_plus_params",
    "cluster_kmeans_raw_plus_params",
    "cluster_hierarchical_agg_fitted_fitness",
    "cluster_kmeans_fitted_fitness",
    "cluster_hierarchical_agg_raw_fitness",
    "cluster_kmeans_raw_fitness"
]

In [None]:
concated_GOEA_results_plot = []
for NS, NS_df in go_enrichment_summary.query("enrichment == 'e'").groupby("NS"):
    chart = alt.Chart(NS_df).mark_circle().encode(
        x=alt.X("cluster:N", axis=alt.Axis(grid=True)),
        y=alt.Y(
                    "name:N",
                    axis=alt.Axis(grid=True, labelLimit=500, title=""),
                    sort=alt.EncodingSortField(
                        field="cluster", order="ascending"),
                ),
        color=alt.Color("p_fdr_bh:Q").scale(
                    scheme="yelloworangered", reverse=True),
        tooltip=NS_df.columns.tolist(),
        size="coverage_frac",
        column=alt.Column("Method", sort=method_orders))
    concated_GOEA_results_plot.append(chart)
alt.vconcat(*concated_GOEA_results_plot).save("../../results/HD_DIT_HAP/20_gene_level_clustering/GOEA_results_plot.html")