# Set-up

In [31]:
import os
import sys
import yaml
import logging
import mudata
import pandas as pd

# Change path to wherever you have repo locally
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation')

from src.evaluation import (
    compute_categorical_association,
    compute_geneset_enrichment,
    compute_trait_enrichment,
    compute_perturbation_association,
    compute_explained_variance_ratio,
    compute_motif_enrichment
)
from src.evaluation.enrichment_trait import process_enrichment_data

In [32]:
# I/O paths
path_config = "/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_30/evaluation_pipeline.yml"
# Parse the pipeline configuration. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open(path_config) as config_file:
    config = yaml.safe_load(config_file)

## I/O

In [33]:
# Pull the I/O section out of the pipeline config and display it
io_config = config["io"]
io_config

{'path_mdata': '/cellar/users/aklie/opt/gene_program_evaluation/examples/inference/iPSC_EC/cNMF/cNMF_30_0.2_gene_names.h5mu',
 'path_out': '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_30',
 'data_key': 'rna',
 'prog_key': 'cNMF'}

In [34]:
# Read the file referenced by the I/O config with mudata and display the result
path_mdata = io_config["path_mdata"]
mdata = mudata.read(path_mdata)
mdata

  utils.warn_names_duplicates("var")


In [35]:
# Make path_out directory
path_out = io_config['path_out']
# exist_ok=True creates the directory (and any missing parents) only when needed,
# which avoids the check-then-create race of a separate os.path.exists() test.
# If path_out exists but is a regular file this now fails loudly here rather than
# later when the first result is written.
os.makedirs(path_out, exist_ok=True)

In [36]:
# Update cNMF key to cNMF_30 and save
old_prog_key = io_config['prog_key']
# The new modality key is the name of the output directory. normpath strips a
# trailing separator first; without it basename() would return an empty string.
prog_key = os.path.basename(os.path.normpath(path_out))
# Rename the program modality. The membership guard keeps this cell re-runnable:
# after the first run the old key is gone, so an unconditional pop() would raise
# KeyError even though the modality is already stored under the new key.
if old_prog_key in mdata.mod:
    mdata.mod[prog_key] = mdata.mod.pop(old_prog_key)
elif prog_key not in mdata.mod:
    raise KeyError(f"Neither '{old_prog_key}' nor '{prog_key}' found in mdata.mod")
mdata.update()
# Persist the renamed object next to the evaluation outputs
mdata.write(os.path.join(path_out, 'eval.h5mu'))



In [37]:
# Look up the configured data key and display it
data_key = io_config["data_key"]
data_key

'rna'

In [38]:
# Set up logging to print to console and also to file in path_out (evaluation_pipeline.log) with overwrite
log_path = os.path.join(path_out, 'evaluation_pipeline.log')
# force=True removes and closes any handlers already attached to the root logger
# before installing the ones below. Without it basicConfig() is a silent no-op
# whenever the root logger is already configured (e.g. by an imported library or
# a previous run of this cell), so neither the log file nor the format applies.
# mode='w' truncates an existing log file, so no separate removal step is needed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_path, mode='w'),
        logging.StreamHandler(),
    ],
    force=True,
)

## Run categorical association

In [39]:
# Categorical association: run the test, then write both returned tables to path_out
categorical_association_config = config['categorical_association']
results_df, posthoc_df = compute_categorical_association(
    mdata,
    prog_key=prog_key,
    **categorical_association_config,
)
# Both tables are written as tab-separated files without the index
for table, filename in (
    (results_df, 'categorical_association_results.txt'),
    (posthoc_df, 'categorical_association_posthoc.txt'),
):
    table.to_csv(os.path.join(path_out, filename), sep='\t', index=False)

INFO:root:Performing tests at single-cell level. Significance will likely be inflated


Testing sample association:   0%|          | 0/30 [00:00<?, ?programs/s]

Identifying differential sample:   0%|          | 0/30 [00:00<?, ?programs/s]

## Run perturbation association

In [40]:
# Run perturbation association
# Work on a shallow copy so that removing "groupby_key" does not mutate `config`;
# popping from the original dict makes this cell raise KeyError when it is re-run.
perturbation_assocation_config = dict(config['perturbation_association'])
# A missing "groupby_key" is treated like an explicit null: no grouping.
groupby_key = perturbation_assocation_config.pop("groupby_key", None)
if groupby_key is not None:
    logging.info("groupby_key provided, running perturbation association on each group")
    data_key = io_config["data_key"]
    group_results = []
    for group in mdata[data_key].obs[groupby_key].unique():
        # Restrict the data to the observations belonging to this group
        mdata_ = mdata[mdata[data_key].obs[groupby_key] == group]
        test_stats_df = compute_perturbation_association(
            mdata_,
            prog_key=prog_key,
            **perturbation_assocation_config,
        )
        test_stats_df.to_csv(os.path.join(path_out, f'perturbation_association_results_{group}.txt'), sep='\t', index=False)
        # Label only this group's rows. Assigning the column on the accumulated
        # frame after concatenation would overwrite the label of every earlier
        # group, leaving all rows tagged with the current group.
        group_results.append(test_stats_df.assign(group=group))

    # Save combined results (concatenate once instead of growing a frame per iteration)
    perturbation_assocation_df = pd.concat(group_results) if group_results else pd.DataFrame()
    perturbation_assocation_df.to_csv(os.path.join(path_out, 'perturbation_association_results.txt'), sep='\t', index=False)
else:
    logging.info("No groupby_key provided, running perturbation association on full dataset")
    perturbation_assocation_df = compute_perturbation_association(
        mdata,
        prog_key=prog_key,
        **perturbation_assocation_config,
    )
    perturbation_assocation_df.to_csv(os.path.join(path_out, 'perturbation_association_results.txt'), sep='\t', index=False)

INFO:root:No groupby_key provided, running perturbation association on full dataset


  0%|          | 0/299 [00:00<?, ?targets/s]

## Run gene set enrichment analysis

In [41]:
# Gene-set enrichment
# Copy the config section before removing 'libraries' so `config` itself is left
# intact; popping from the original dict makes a re-run of this cell raise KeyError.
gene_set_enrichment_config = dict(config['gene_set_enrichment'])
libraries = gene_set_enrichment_config.pop('libraries')
data_key = io_config['data_key']
library_results = []
for library in libraries:
    logging.info(f'Running gene-set enrichment analysis for {library}')
    pre_res = compute_geneset_enrichment(
        mdata,
        prog_key=prog_key,
        data_key=data_key,
        library=library,
        **gene_set_enrichment_config,
    )

    # Map the method-specific result columns onto one shared set of names
    if gene_set_enrichment_config["method"] == "fisher":
        pre_res = pre_res.rename(columns={"Term": "term", "P-value": "pval", "Adjusted P-value": "adj_pval", "Odds Ratio": "effect_size", "Genes": "genes"})
    elif gene_set_enrichment_config["method"] == "gsea":
        pre_res = pre_res.rename(columns={"Term": "term", "NOM p-val": "pval", "FDR q-val": "adj_pval", "NES": "effect_size", "Lead_genes": "genes"})

    # Save results
    pre_res['library'] = library
    pre_res.to_csv(os.path.join(path_out, f'geneset_enrichment_{library}.txt'), sep='\t', index=False)
    library_results.append(pre_res)

# Save combined results: concatenate once rather than growing a frame per library.
# An empty library list still yields an (empty) combined file, as before.
gene_set_enrichment_df = pd.concat(library_results) if library_results else pd.DataFrame()
gene_set_enrichment_df.to_csv(os.path.join(path_out, 'geneset_enrichment.txt'), sep='\t', index=False)

INFO:root:Running gene-set enrichment analysis for Reactome_2022
INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:Library is already downloaded in: /cellar/users/aklie/.cache/gseapy/Enrichr.Reactome_2022.gmt, use local file
INFO:root:0031 gene_sets have been filtered out when max_size=500 and min_size=5


Running Fisher enrichment:   0%|          | 0/30 [00:00<?, ?programs/s]

INFO:root:Running gene-set enrichment analysis for GO_Biological_Process_2023
INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:Library is already downloaded in: /cellar/users/aklie/.cache/gseapy/Enrichr.GO_Biological_Process_2023.gmt, use local file
INFO:root:0014 gene_sets have been filtered out when max_size=500 and min_size=5


Running Fisher enrichment:   0%|          | 0/30 [00:00<?, ?programs/s]

In [42]:
# Display the full path of the combined gene-set enrichment file inside path_out
os.path.join(path_out, 'geneset_enrichment.txt')

'/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_30/geneset_enrichment.txt'

## Run trait enrichment analysis

In [43]:
# Run trait enrichment, process and save results
trait_enrichment_config = config['trait_enrichment']
pre_res_trait = compute_trait_enrichment(
    mdata,
    prog_key=prog_key,
    data_key=data_key,
    gwas_data=trait_enrichment_config['gwas_data'],
    prog_nam=trait_enrichment_config['prog_nam'],
    library=trait_enrichment_config['library'],
    n_jobs=trait_enrichment_config['n_jobs'],
    inplace=trait_enrichment_config['inplace'],
    key_column=trait_enrichment_config['key_column'],
    gene_column=trait_enrichment_config['gene_column'],
    method=trait_enrichment_config['method'],
    loading_rank_thresh=trait_enrichment_config['loading_rank_thresh'],
)
# Map the method-specific result columns onto one shared set of names.
# The "gsea" branch previously referenced a misspelled variable name and raised
# NameError whenever the configured method was not "fisher".
if trait_enrichment_config["method"] == "fisher":
    pre_res_trait = pre_res_trait.rename(columns={"Term": "term", "P-value": "pval", "Adjusted P-value": "adj_pval", "Odds Ratio": "effect_size", "Genes": "genes"})
elif trait_enrichment_config["method"] == "gsea":
    pre_res_trait = pre_res_trait.rename(columns={"Term": "term", "NOM p-val": "pval", "FDR q-val": "adj_pval", "NES": "effect_size", "Lead_genes": "genes"})
# Write the renamed enrichment table before any further processing
pre_res_trait.to_csv(os.path.join(path_out, 'trait_enrichment.txt'), sep='\t', index=False)
# Post-process the enrichment table together with the metadata named in the
# config; all column names are taken from the config
data = process_enrichment_data(
    enrich_res=pre_res_trait,
    metadata=trait_enrichment_config['metadata'],
    pval_col=trait_enrichment_config["pval_col"],
    enrich_geneset_id_col=trait_enrichment_config["enrich_geneset_id_col"],
    metadata_geneset_id_col=trait_enrichment_config["metadata_geneset_id_col"],
    color_category_col=trait_enrichment_config["color_category_col"],
    program_name_col=trait_enrichment_config["program_name_col"],
    annotation_cols=trait_enrichment_config["annotation_cols"],
)
data.to_csv(os.path.join(path_out, "trait_enrichment_processed.txt"), sep='\t', index=False)

Running Fisher enrichment:   0%|          | 0/30 [00:00<?, ?programs/s]

## Run motif enrichment analysis

In [44]:
# Motif enrichment: compute the three result tables and write each one to path_out
motif_enrichment_config = config['motif_enrichment']
motif_match_df, motif_count_df, motif_enrichment_df = compute_motif_enrichment(
    mdata,
    prog_key=prog_key,
    data_key=data_key,
    **motif_enrichment_config,
)
# Output file name -> table; written in this order as tab-separated files without index
motif_outputs = {
    'motif_enrichment_matches.txt': motif_match_df,
    'motif_enrichment_counts.txt': motif_count_df,
    'motif_enrichment.txt': motif_enrichment_df,
}
for filename, table in motif_outputs.items():
    table.to_csv(os.path.join(path_out, filename), sep='\t', index=False)

  utils.warn_names_duplicates("var")


Matching motifs to sequences:   0%|          | 0/8 [00:00<?, ?motifs/s]

Motif scanning:   0%|          | 0/3 [00:00<?, ?genes/s]

Motif scanning:   0%|          | 0/3 [00:00<?, ?genes/s]

Motif scanning:   0%|          | 0/3 [00:00<?, ?genes/s]

Motif scanning:   0%|          | 0/3 [00:00<?, ?genes/s]

Motif scanning:   0%|          | 0/3 [00:00<?, ?genes/s]

Motif scanning:   0%|          | 0/3 [00:00<?, ?genes/s]

Motif scanning:   0%|          | 0/3 [00:00<?, ?genes/s]

Motif scanning:   0%|          | 0/3 [00:00<?, ?genes/s]

Computing motif enrichment:   0%|          | 0/5 [00:00<?, ?motifs/s]

## Run explained variance

In [45]:
# Explained variance: compute the ratio table and write it to path_out
explained_variance_config = config['explained_variance']
# Collect the keyword arguments first; the config entries are passed through as-is
explained_variance_kwargs = dict(
    prog_key=prog_key,
    data_key=data_key,
    **explained_variance_config,
)
explained_variance_ratio = compute_explained_variance_ratio(mdata, **explained_variance_kwargs)
# Unlike the other result tables, this one is written with its index
explained_variance_path = os.path.join(path_out, 'explained_variance_ratio.txt')
explained_variance_ratio.to_csv(explained_variance_path, sep='\t')

  utils.warn_names_duplicates("var")


Computing explained variance:   0%|          | 0/30 [00:00<?, ?programs/s]

## Software versions

In [46]:
import joblib
import numpy as np
import pandas as pd
import scipy
import sklearn
import scikit_posthocs as posthocs
import gseapy
import pymemesuite

# Packages whose installed version is read from their __version__ attribute,
# keyed by the name used in the output file
versioned_modules = {
    'mudata': mudata,
    'joblib': joblib,
    'scipy': scipy,
    'numpy': np,
    'pandas': pd,
    'scikit-learn': sklearn,
    'scikit-posthocs': posthocs,
    'gseapy': gseapy,
    'pymemesuite': pymemesuite,
}

# The pipeline's own version is a fixed string; the rest come from the modules above
pipeline_versions = {'gene_program_evaluation': '0.0.1'}
for package_name, module in versioned_modules.items():
    pipeline_versions[package_name] = module.__version__

versions = {"evaluation_pipeline_versions": pipeline_versions}

# Write the version record as YAML next to the other outputs
versions_path = os.path.join(path_out, 'software_versions.yml')
with open(versions_path, 'w') as versions_file:
    yaml.dump(versions, versions_file)

## Run post-flight checks

In [47]:
# Flag any missing files
expected_files = [
    "trait_enrichment_processed.txt",
    "software_versions.yml",
    "trait_enrichment.txt",
    "perturbation_association_results.txt",
    "motif_enrichment_matches.txt",
    "motif_enrichment_counts.txt",
    "motif_enrichment.txt",
    "geneset_enrichment.txt",
    "explained_variance_ratio.txt",
    "categorical_association_results.txt",
    "categorical_association_posthoc.txt"
]
for file in expected_files:
    if not os.path.exists(os.path.join(path_out, file)):
        logging.error(f"Missing file: {file}")

Exception ignored in atexit callback: <bound method InteractiveShell.atexit_operations of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x155551779e10>>
Traceback (most recent call last):
  File "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3917, in atexit_operations
    self._atexit_once()
  File "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3900, in _atexit_once
    self.history_manager.end_session()
  File "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle/lib/python3.10/site-packages/IPython/core/history.py", line 583, in end_session
    self.writeout_cache()
  File "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle/lib/python3.10/site-packages/decorator.py", line 232, in fun
    return caller(func, *(extras + args), **kw)
  File "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle/lib/python3.10/site-pac

# DONE!

---