# Set-up

In [1]:
import os
import sys
import yaml
import logging
import mudata
import pandas as pd

# Change this path to wherever you have cloned the repo locally
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation')

from src.evaluation import (
    compute_categorical_association,
    compute_geneset_enrichment,
    compute_trait_enrichment,
    compute_perturbation_association,
    compute_explained_variance_ratio,
    compute_motif_enrichment
)
from src.evaluation.enrichment_trait import process_enrichment_data

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
# I/O paths
# YAML file holding the settings for every evaluation step run in this notebook.
path_config = "/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/evaluation_pipeline.yml"
# Open the file in a context manager so the handle is closed as soon as the
# YAML has been parsed, rather than being left open until garbage collection.
with open(path_config) as config_file:
    config = yaml.safe_load(config_file)

## I/O

In [3]:
# Select the 'io' section of the pipeline config; the bare expression on the
# last line displays it as the cell output.
io_config = config['io']
io_config

{'path_mdata': '/cellar/users/aklie/opt/gene_program_evaluation/examples/inference/iPSC_EC/cNMF/cNMF_30_0.2_gene_names.h5mu',
 'path_out': '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF/cNMF_30',
 'data_key': 'rna',
 'prog_key': 'cNMF'}

In [4]:
# Load mdata
# Read the file at io_config['path_mdata'] with mudata.read; the bare `mdata`
# on the last line displays the loaded object as the cell output.
path_mdata = io_config['path_mdata']
mdata = mudata.read(path_mdata)
mdata

  utils.warn_names_duplicates("var")


In [5]:
# Make path_out directory
path_out = io_config['path_out']
# exist_ok=True creates the directory (and any missing parents) in one call and
# is a no-op when it already exists, avoiding the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(path_out, exist_ok=True)

In [6]:
# Update cNMF key to cNMF_30 and save
# The program modality is renamed from the key stored in the config to the
# name of the output directory, then the MuData object is written to path_out.
old_prog_key = io_config['prog_key']
# normpath strips a trailing separator so a path_out ending in '/' does not
# produce an empty program key.
prog_key = os.path.basename(os.path.normpath(path_out))
if old_prog_key in mdata.mod:
    # Move the modality under its new key and refresh the MuData object.
    mdata.mod[prog_key] = mdata.mod.pop(old_prog_key)
    mdata.update()
elif prog_key not in mdata.mod:
    # Neither the old nor the new key is present: nothing sensible to rename.
    raise KeyError(
        f"Neither '{old_prog_key}' nor '{prog_key}' is a modality of mdata"
    )
# else: the modality already carries the new key (the cell was re-run), so the
# rename is skipped instead of failing on the missing old key.
mdata.write(os.path.join(path_out, f'{prog_key}.h5mu'))





In [7]:
# Get data key
# Key taken from the I/O config and passed as `data_key` to the evaluation
# calls below; the bare expression displays it.
data_key= io_config['data_key']
data_key

'rna'

## Run categorical association

In [8]:
# Run categorical association and save results
# For each key listed under 'categorical_keys', compute the association and
# write the two returned tables as tab-separated files in path_out.
categorical_assocation_config = config['categorical_association']
categorical_keys = categorical_assocation_config['categorical_keys']
for key in categorical_keys:
    # The whole config section (including its 'categorical_keys' entry) is
    # forwarded as keyword arguments alongside the current key.
    association_outputs = compute_categorical_association(
        mdata,
        prog_key=prog_key,
        categorical_key=key,
        **categorical_assocation_config,
    )
    results_df, posthoc_df = association_outputs

    # Both files share the same stem and differ only in their suffix.
    association_stem = os.path.join(path_out, f'{prog_key}_{key}_association')
    results_df.to_csv(f'{association_stem}_results.txt', sep='\t', index=False)
    posthoc_df.to_csv(f'{association_stem}_posthoc.txt', sep='\t', index=False)

INFO:root:Performing tests at single-cell level. Significance will likely be inflated


Testing sample association:   0%|          | 0/30 [00:00<?, ?programs/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub[k] = df_sub[k].cat.remove_unused_categories()
INFO:root:Running jamboree specific version of posthoc with pearsonr, this is not yet integrated into the main pipeline


Identifying differential sample:   0%|          | 0/30 [00:00<?, ?programs/s]

## Run perturbation association

In [9]:
# Select the 'perturbation_association' section of the pipeline config; the
# bare expression on the last line displays it as the cell output.
perturbation_assocation_config = config['perturbation_association']
perturbation_assocation_config

{'groupby_key': 'sample',
 'collapse_targets': True,
 'pseudobulk': False,
 'reference_targets': ['non-targeting'],
 'n_jobs': -1,
 'inplace': False}

In [10]:
# Run perturbation association
# When the config supplies a groupby_key, the test is run separately on each
# group of cells and one file is written per group; otherwise it is run once on
# the full dataset. Results are saved as tab-separated files in path_out.

# Work on a shallow copy of the config: popping 'groupby_key' from the shared
# dict made this cell raise KeyError when it was run a second time.
perturbation_kwargs = dict(perturbation_assocation_config)
groupby_key = perturbation_kwargs.pop("groupby_key", None)

target_type = "gene" if perturbation_assocation_config["collapse_targets"] else "guide"
data_key = io_config["data_key"]

if groupby_key is not None:
    logging.info("groupby_key provided, running perturbation association on each group")
    # The grouping column does not change inside the loop, so look it up once.
    group_labels = mdata[data_key].obs[groupby_key]
    for group in group_labels.unique():
        # Restrict the data to the cells belonging to the current group.
        mdata_ = mdata[group_labels == group]
        test_stats_df = compute_perturbation_association(
            mdata_,
            prog_key=prog_key,
            data_key=data_key,
            **perturbation_kwargs,
        )
        test_stats_df.to_csv(os.path.join(path_out, f'{prog_key}_{target_type}_{groupby_key}_{group}_perturbation_association.txt'), sep='\t', index=False)
else:
    logging.info("No groupby_key provided, running perturbation association on full dataset")
    perturbation_assocation_df = compute_perturbation_association(
        mdata,
        prog_key=prog_key,
        data_key=data_key,
        **perturbation_kwargs,
    )
    perturbation_assocation_df.to_csv(os.path.join(path_out, f'{prog_key}_{target_type}_perturbation_association.txt'), sep='\t', index=False)

INFO:root:groupby_key provided, running perturbation association on each group
  utils.warn_names_duplicates("var")


Testing perturbation association:   0%|          | 0/298 [00:00<?, ?targets/s]

  utils.warn_names_duplicates("var")


Testing perturbation association:   0%|          | 0/298 [00:00<?, ?targets/s]

  utils.warn_names_duplicates("var")


Testing perturbation association:   0%|          | 0/298 [00:00<?, ?targets/s]

  utils.warn_names_duplicates("var")


Testing perturbation association:   0%|          | 0/298 [00:00<?, ?targets/s]

## Run gene set enrichment analysis

In [11]:
# Gene-set enrichment
# Run the enrichment once per library listed in the config, harmonise the
# result column names, and save one tab-separated file per library.
gene_set_enrichment_config = config['gene_set_enrichment']
libraries = gene_set_enrichment_config['libraries']
# Build the keyword arguments without 'libraries' instead of popping it from
# the shared config, so re-running this cell does not fail with KeyError.
geneset_kwargs = {
    option: value
    for option, value in gene_set_enrichment_config.items()
    if option != 'libraries'
}
geneset_method = geneset_kwargs["method"]
data_key = io_config['data_key']

# Per-method mapping from the raw result columns to a common naming scheme.
# Methods not listed here are saved with their columns unchanged.
geneset_column_renames = {
    "fisher": {"Term": "term", "P-value": "pval", "Adjusted P-value": "adj_pval", "Odds Ratio": "enrichment", "Genes": "genes"},
    "gsea": {"Term": "term", "NOM p-val": "pval", "FDR q-val": "adj_pval", "NES": "enrichment", "Lead_genes": "genes"},
}

for library in libraries:
    logging.info(f'Running gene-set enrichment analysis for {library}')
    pre_res = compute_geneset_enrichment(
        mdata,
        prog_key=prog_key,
        data_key=data_key,
        library=library,
        **geneset_kwargs,
    )

    rename_map = geneset_column_renames.get(geneset_method)
    if rename_map is not None:
        pre_res = pre_res.rename(columns=rename_map)

    # Save results
    pre_res.to_csv(os.path.join(path_out, f'{prog_key}_{library}_{geneset_method}_geneset_enrichment.txt'), sep='\t', index=False)

INFO:root:Running gene-set enrichment analysis for Reactome_2022
INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:Library is already downloaded in: /cellar/users/aklie/.cache/gseapy/Enrichr.Reactome_2022.gmt, use local file
INFO:root:0031 gene_sets have been filtered out when max_size=500 and min_size=5


Running Fisher enrichment:   0%|          | 0/30 [00:00<?, ?programs/s]

INFO:root:Running gene-set enrichment analysis for GO_Biological_Process_2023
INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:Library is already downloaded in: /cellar/users/aklie/.cache/gseapy/Enrichr.GO_Biological_Process_2023.gmt, use local file
INFO:root:0014 gene_sets have been filtered out when max_size=500 and min_size=5


Running Fisher enrichment:   0%|          | 0/30 [00:00<?, ?programs/s]

## Run trait enrichment analysis

In [12]:
# Run trait enrichment, process and save results
# Three stages: compute the enrichment, harmonise the result column names for
# the configured method, then pass the table through process_enrichment_data
# and write it to path_out.
trait_enrichment_config = config['trait_enrichment']
trait_method = trait_enrichment_config["method"]

# Options forwarded to compute_trait_enrichment; each keyword has the same
# name as its config entry.
compute_option_names = (
    'gwas_data',
    'prog_nam',
    'library',
    'n_jobs',
    'inplace',
    'key_column',
    'gene_column',
    'method',
    'loading_rank_thresh',
)
pre_res_trait = compute_trait_enrichment(
    mdata,
    prog_key=prog_key,
    data_key=data_key,
    **{name: trait_enrichment_config[name] for name in compute_option_names},
)

# Column renames per method; any other method leaves the columns untouched.
trait_column_renames = {
    "fisher": {"Term": "term", "P-value": "pval", "Adjusted P-value": "adj_pval", "Odds Ratio": "enrichment", "Genes": "genes"},
    "gsea": {"Term": "term", "NOM p-val": "pval", "FDR q-val": "adj_pval", "NES": "enrichment", "Lead_genes": "genes"},
}
if trait_method in trait_column_renames:
    pre_res_trait = pre_res_trait.rename(columns=trait_column_renames[trait_method])

# Options forwarded to process_enrichment_data, again named like their
# config entries.
process_option_names = (
    'metadata',
    'pval_col',
    'enrich_geneset_id_col',
    'metadata_geneset_id_col',
    'color_category_col',
    'program_name_col',
    'annotation_cols',
)
data = process_enrichment_data(
    enrich_res=pre_res_trait,
    **{name: trait_enrichment_config[name] for name in process_option_names},
)

trait_out_name = f"{prog_key}_{trait_enrichment_config['library']}_{trait_method}_trait_enrichment.txt"
data.to_csv(os.path.join(path_out, trait_out_name), sep='\t', index=False)

Running Fisher enrichment:   0%|          | 0/30 [00:00<?, ?programs/s]

## Run motif enrichment analysis

In [13]:
# Select the 'motif_enrichment' section of the pipeline config; the bare
# expression on the last line displays it as the cell output.
motif_enrichment_config = config['motif_enrichment']
motif_enrichment_config

{'motif_file': '/cellar/users/aklie/opt/gene_program_evaluation/src/tests/test_data/motifs.meme',
 'seq_file': '/cellar/users/aklie/data/ref/genomes/hg38/hg38.fa',
 'loci_files': ['/cellar/users/aklie/opt/gene_program_evaluation/examples/datasets/iPSC_EC/EnhancerPredictions_D0_2024_09_07.txt',
  '/cellar/users/aklie/opt/gene_program_evaluation/examples/datasets/iPSC_EC/EnhancerPredictions_sample_D1_2024_09_07.txt',
  '/cellar/users/aklie/opt/gene_program_evaluation/examples/datasets/iPSC_EC/EnhancerPredictions_sample_D2_2024_09_07.txt',
  '/cellar/users/aklie/opt/gene_program_evaluation/examples/datasets/iPSC_EC/EnhancerPredictions_sample_D3_2024_09_07.txt'],
 'names': ['D0', 'sample_D1', 'sample_D2', 'sample_D3'],
 'output_loc': None,
 'window': 1000,
 'threshold': 0.001,
 'eps': 0.001,
 'reverse_complement': True,
 'sig': 0.05,
 'num_genes': None,
 'correlation': 'pearsonr',
 'n_jobs': -1,
 'inplace': False}

In [14]:
# Run motif enrichment and save results
# Each loci file is paired with the name at the same position in 'names'; for
# every pair the three returned tables are written as tab-separated files.
loci_files = motif_enrichment_config['loci_files']
names = motif_enrichment_config['names']
# zip() stops at the shorter list, so a length mismatch would silently skip
# loci files (or names). Fail loudly instead.
if len(loci_files) != len(names):
    raise ValueError(
        f"motif_enrichment config lists {len(loci_files)} loci_files but {len(names)} names; "
        "they must have the same length"
    )
correlation = motif_enrichment_config["correlation"]
for loci_file, name in zip(loci_files, names):
    logging.info(f'Running motif enrichment analysis for {loci_file}')
    # The whole config section is forwarded as keyword arguments alongside the
    # current loci file.
    motif_match_df, motif_count_df, motif_enrichment_df = compute_motif_enrichment(
        mdata,
        prog_key=prog_key,
        data_key=data_key,
        loci_file=loci_file,
        **motif_enrichment_config,
    )
    # NOTE(review): some configured names already begin with 'sample_', so the
    # files come out as '..._sample_sample_D1_...'. File names are kept as they
    # were; confirm whether the doubled prefix is intended.
    motif_stem = os.path.join(path_out, f'{prog_key}_enhancer_test_{correlation}_sample_{name}')
    motif_match_df.to_csv(f'{motif_stem}_motif_match.txt', sep='\t', index=False)
    motif_count_df.to_csv(f'{motif_stem}_motif_count.txt', sep='\t', index=False)
    motif_enrichment_df.to_csv(f'{motif_stem}_motif_enrichment.txt', sep='\t', index=False)

INFO:root:Running motif enrichment analysis for /cellar/users/aklie/opt/gene_program_evaluation/examples/datasets/iPSC_EC/EnhancerPredictions_D0_2024_09_07.txt


  utils.warn_names_duplicates("var")


Number of matching genes: 4420
Number of loci: 15692
There are 54366 significant motif matches.


Computing motif enrichment:   0%|          | 0/8 [00:00<?, ?motifs/s]

INFO:root:Running motif enrichment analysis for /cellar/users/aklie/opt/gene_program_evaluation/examples/datasets/iPSC_EC/EnhancerPredictions_sample_D1_2024_09_07.txt
  utils.warn_names_duplicates("var")


Number of matching genes: 4420
Number of loci: 15997
There are 49357 significant motif matches.


Computing motif enrichment:   0%|          | 0/8 [00:00<?, ?motifs/s]

INFO:root:Running motif enrichment analysis for /cellar/users/aklie/opt/gene_program_evaluation/examples/datasets/iPSC_EC/EnhancerPredictions_sample_D2_2024_09_07.txt
  utils.warn_names_duplicates("var")


Number of matching genes: 4420
Number of loci: 16164
There are 53283 significant motif matches.


Computing motif enrichment:   0%|          | 0/8 [00:00<?, ?motifs/s]

INFO:root:Running motif enrichment analysis for /cellar/users/aklie/opt/gene_program_evaluation/examples/datasets/iPSC_EC/EnhancerPredictions_sample_D3_2024_09_07.txt
  utils.warn_names_duplicates("var")


Number of matching genes: 4420
Number of loci: 16263
There are 50764 significant motif matches.


Computing motif enrichment:   0%|          | 0/8 [00:00<?, ?motifs/s]

## Run explained variance

In [15]:
# Run explained variance
# Compute the explained-variance table, relabel it with the program names and
# a single descriptive column, and save it (index included) to path_out.
explained_variance_config = config['explained_variance']
explained_variance_ratio = compute_explained_variance_ratio(
    mdata,
    prog_key=prog_key,
    data_key=data_key,
    **explained_variance_config,
)

# Relabel: one value column, rows indexed by the program modality's var index.
explained_variance_ratio.columns = ["variance_explained_ratio"]
program_index = mdata.mod[prog_key].var.index
explained_variance_ratio.index = program_index
explained_variance_ratio.index.name = 'program_name'

explained_variance_path = os.path.join(path_out, f'{prog_key}_variance_explained_ratio.txt')
explained_variance_ratio.to_csv(explained_variance_path, sep='\t', index=True)

  utils.warn_names_duplicates("var")


Computing explained variance:   0%|          | 0/30 [00:00<?, ?programs/s]

## Software versions

In [17]:
# Save software versions
# Record the versions of the packages imported here in a YAML file next to the
# results so the run can be traced back to its environment.
import joblib
import numpy as np
import scipy
import sklearn
import statsmodels
import scikit_posthocs as posthocs
import gseapy
import tangermeme

versions = {
    "evaluation_pipeline_versions": {
        # Hard-coded here rather than read from the package.
        'gene_program_evaluation': '0.0.1',
        'numpy': np.__version__,
        'pandas': pd.__version__,
        'mudata': mudata.__version__,
        'scipy': scipy.__version__,
        'scikit-learn': sklearn.__version__,
        'scikit-posthocs': posthocs.__version__,
        'statsmodels': statsmodels.__version__,
        # joblib was imported above but its version was never written out.
        'joblib': joblib.__version__,
        'gseapy': gseapy.__version__,  # gene set enrichment analysis
        'tangermeme': tangermeme.__version__,  # motif enrichment analysis
    }
}

with open(os.path.join(path_out, 'software_versions.yml'), 'w') as f:
    yaml.dump(versions, f)

# DONE!

---