# Set-up

In [21]:
import os
import sys
import yaml
import logging
import mudata
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Change this path to wherever you have cloned the repo locally
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation')

from src.evaluation import (
    compute_categorical_association,
    compute_geneset_enrichment,
    compute_trait_enrichment,
    compute_perturbation_association,
    compute_explained_variance_ratio,
    compute_motif_enrichment
)
from src.evaluation.enrichment_trait import process_enrichment_data

In [2]:
# I/O paths
path_config = "/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_30/evaluation_pipeline.yml"

# Parse the pipeline YAML inside a context manager so the file handle is
# closed as soon as the config has been read (a bare open() leaks it).
with open(path_config) as config_file:
    config = yaml.safe_load(config_file)

## I/O

In [13]:
# Select the 'io' section of the loaded pipeline config; the bare name on the
# last line displays it as the cell output.
io_config = config['io']
io_config

{'path_mdata': '/cellar/users/aklie/opt/gene_program_evaluation/examples/inference/iPSC_EC/cNMF/cNMF_30_0.2_gene_names.h5mu',
 'path_out': '/cellar/users/aklie/opt/gene_program_evaluation/app/examples/evaluation/iPSC_EC/cNMF_30',
 'data_key': 'rna',
 'prog_key': 'cNMF'}

In [14]:
# Load mdata
# Read the MuData file stored under the 'path_mdata' entry of the io config;
# the trailing bare name displays the loaded object as the cell output.
path_mdata = io_config['path_mdata']
mdata = mudata.read(path_mdata)
mdata

  utils.warn_names_duplicates("var")


In [18]:
# Take the program and data keys from the io config instead of repeating the
# literals here, so the notebook cannot drift from the pipeline YAML.
prog_key = io_config['prog_key']
data_key = io_config['data_key']

# Gene set enrichment testing

In [24]:
from src.evaluation.enrichment_geneset import get_geneset, get_program_gene_loadings, perform_prerank, perform_fisher_enrich
import gseapy as gp

In [54]:
# Run GSEA
def perform_prerank(
    loadings, 
    geneset, 
    n_jobs=1, 
    low_cutoff=-np.inf,
    hi_cutoff=np.inf,
    **kwargs
):
    """Run GSEA prerank on each gene program in the loadings matrix.
    
    Parameters
    ----------
    loadings : pd.DataFrame
        DataFrame of gene loadings with one column per gene program. Each
        column is handed to ``gp.prerank`` as the ranking (``rnk``);
        presumably the index holds gene names -- confirm against
        ``get_program_gene_loadings``.
    geneset
        Gene sets to test, forwarded unchanged to
        ``gp.prerank(gene_sets=...)``.
    n_jobs : int
        Number of parallel jobs, forwarded to ``gp.prerank`` as ``threads``.
    low_cutoff : float
        Loadings must be strictly greater than this value to be included in
        the analysis.
    hi_cutoff : float
        Loadings must be strictly less than this value to be included in the
        analysis.
    **kwargs
        Extra keyword arguments forwarded to ``gp.prerank``.

    Returns
    -------
    pd.DataFrame
        The per-program ``res2d`` tables stacked in column order with a fresh
        integer index. ``Gene %`` is converted to float, ``Tag %`` is split
        into integer ``tag_before`` / ``tag_after`` columns, and ``Name`` is
        renamed to ``program_name``. An empty DataFrame is returned when
        ``loadings`` has no columns.
    """
    # Filtering is skipped entirely when both cutoffs are left at their
    # defaults, so NaN/inf loadings are only dropped when a cutoff is set.
    apply_cutoffs = low_cutoff != -np.inf or hi_cutoff != np.inf

    # Collect per-program tables and concatenate once at the end rather than
    # growing a DataFrame inside the loop.
    per_program = []
    for program in tqdm(loadings.columns, desc='Running GSEA', unit='programs'):

        ranking = loadings[program]
        if apply_cutoffs:
            ranking = ranking[(ranking > low_cutoff) & (ranking < hi_cutoff)]

        # Run GSEA prerank; copy so the result object's own frame is not mutated
        res = gp.prerank(rnk=ranking, gene_sets=geneset, threads=n_jobs, **kwargs).res2d.copy()

        # 'Gene %' arrives as a string with a trailing '%'
        res['Gene %'] = res['Gene %'].apply(lambda x: float(x[:-1]))
        # 'Tag %' arrives as '<before>/<after>'
        res['tag_before'] = res['Tag %'].apply(lambda x: int(x.split('/')[0]))
        res['tag_after'] = res['Tag %'].apply(lambda x: int(x.split('/')[1]))
        res = res.drop(columns=['Tag %'])

        if 'Name' in res.columns:
            # Replace the generic "prerank" label with the program identifier.
            # Positional access avoids a KeyError when the index has no label 0,
            # and the emptiness guard covers result tables with no rows.
            if not res.empty and res['Name'].iloc[0] == "prerank":
                res['Name'] = program
            res = res.rename(columns={'Name': 'program_name'})
        elif 'program_name' not in res.columns:
            # No 'Name' column at all: still tag the rows so the sort below works.
            res['program_name'] = program

        per_program.append(res.sort_values(['program_name', 'FDR q-val']))

    if not per_program:
        return pd.DataFrame()
    return pd.concat(per_program, ignore_index=True)

In [55]:
# Select the 'gene_set_enrichment' section of the pipeline config; the bare
# name on the last line displays it as the cell output.
gene_set_enrichment_config = config['gene_set_enrichment']
gene_set_enrichment_config

{'prog_nam': None,
 'organism': 'human',
 'libraries': ['Reactome_2022', 'GO_Biological_Process_2023'],
 'method': 'fisher',
 'database': 'enrichr',
 'loading_rank_thresh': 300,
 'n_jobs': -1,
 'inplace': False,
 'user_geneset': None,
 'max_size': 500,
 'min_size': 5}

In [56]:
# Fetch the Reactome gene sets. Organism and database are read from the
# gene-set-enrichment config (as the loadings cell already does for organism)
# instead of being repeated as literals; the library stays explicit because
# this variable is specifically the Reactome collection.
reactome = get_geneset(
    organism=gene_set_enrichment_config['organism'],
    library="Reactome_2022",
    database=gene_set_enrichment_config['database'],
)

INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:Library is already downloaded in: /cellar/users/aklie/.cache/gseapy/Enrichr.Reactome_2022.gmt, use local file
INFO:root:0002 gene_sets have been filtered out when max_size=2000 and min_size=0
INFO:root:Library is already downloaded in: /cellar/users/aklie/.cache/gseapy/Enrichr.Reactome_2022.gmt, use local file
INFO:root:0002 gene_sets have been filtered out when max_size=2000 and min_size=0


In [19]:
# Extract the program-by-gene loadings used for enrichment, with the program
# selection and organism taken from the gene-set-enrichment config.
loadings = get_program_gene_loadings(
    mdata,
    data_key=data_key,
    prog_key=prog_key,
    prog_nam=gene_set_enrichment_config['prog_nam'],
    organism=gene_set_enrichment_config['organism'],
)

In [49]:
# Scratch check of the loading filter: for every program column keep only the
# loadings strictly between min_value and max_value. Only the last program's
# filtered values remain in temp_loadings after the loop.
min_value, max_value = 0.0, np.inf
for i in tqdm(loadings.columns, desc='Running GSEA', unit='programs'):
    temp_loadings = loadings[i][loadings[i].gt(min_value) & loadings[i].lt(max_value)]


Running GSEA:   0%|          | 0/30 [00:00<?, ?programs/s]

In [60]:
# Run prerank GSEA for every program against the Reactome gene sets, keeping
# only loadings strictly greater than zero.
pre_res = perform_prerank(
    loadings=loadings,
    geneset=reactome,
    low_cutoff=0,
    n_jobs=gene_set_enrichment_config['n_jobs'],
)

Running GSEA:   0%|          | 0/30 [00:00<?, ?programs/s]

(3406,)
(3298,)


The order of those genes will be arbitrary, which may produce unexpected results.


(3663,)
(2555,)
(3047,)
(3562,)
(3017,)
(3503,)
(2737,)
(2470,)
(2788,)
(3158,)
(3162,)
(2616,)


The order of those genes will be arbitrary, which may produce unexpected results.


(3078,)
(2376,)
(3085,)


The order of those genes will be arbitrary, which may produce unexpected results.


(2790,)
(3125,)
(3286,)
(3047,)
(3295,)
(3034,)
(3110,)
(3036,)
(2910,)
(2754,)
(2767,)
(3316,)
(2787,)


Exception ignored in atexit callback: <bound method InteractiveShell.atexit_operations of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x155551779e10>>
Traceback (most recent call last):
  File "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3917, in atexit_operations
    self._atexit_once()
  File "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3900, in _atexit_once
    self.history_manager.end_session()
  File "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle/lib/python3.10/site-packages/IPython/core/history.py", line 583, in end_session
    self.writeout_cache()
  File "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle/lib/python3.10/site-packages/decorator.py", line 232, in fun
    return caller(func, *(extras + args), **kw)
  File "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle/lib/python3.10/site-pac