In [89]:
import os
import glob
import sys
import numpy as np
import pandas as pd
import collections
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation')
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation/app/')
import mudata
import scanpy as sc

from utils import count, count_unique
from utils import load_config

from parse import (
    parse_loadings,
    parse_obs_memberships,
    parse_software_versions
)

In [90]:
# Pipeline outputs
path_config = "/cellar/users/aklie/opt/gene_program_evaluation/examples/report/Endothelial/cNMF/report.yaml"

In [91]:
config = load_config(path_config)

In [92]:
# Parse config for paths
path_evaluation_outs = config["path_evaluation_outs"]
path_mdata = config["path_mdata"]
path_evaluation_config = config["path_evaluation_config"]
path_report_out = config["path_report_out"]

In [93]:
# Parse config for other parameters
data_key = config["data_key"]
prog_keys = config["prog_keys"]
categorical_keys = config["categorical_keys"] if config["categorical_keys"] else []
continuous_keys = config["continuous_keys"] if config["continuous_keys"] else []
annotations_loc = config["annotations_loc"]

In [94]:
# Load evaluation config
evaluation_config = load_config(path_evaluation_config)
evaluation_config

{'io': {'path_mdata': '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/cNMF.h5mu',
  'path_out': '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF',
  'data_key': 'rna',
  'prog_key': 'cNMF'},
 'categorical_association': {'categorical_keys': ['batch'],
  'pseudobulk_key': None,
  'test': 'pearsonr',
  'mode': 'one_vs_all',
  'n_jobs': -1,
  'inplace': False},
 'perturbation_association': {'groupby_key': None,
  'collapse_targets': True,
  'pseudobulk': False,
  'reference_targets': ['non-targeting'],
  'n_jobs': -1,
  'inplace': False},
 'gene_set_enrichment': {'prog_nam': None,
  'organism': 'human',
  'libraries': ['Reactome_2022', 'GO_Biological_Process_2023'],
  'method': 'fisher',
  'database': 'enrichr',
  'n_top': 500,
  'low_cutoff': 0.0,
  'n_jobs': -1,
  'inplace': False,
  'user_geneset': None,
  'max_size': 500,
  'min_size': 5},
 'trait_enrichment': {'gwas_data': '/cellar/users/aklie/opt/gene_program_

In [95]:
# Log configuration
for key, value in config.items():
    print((f"{key}: {value}"))

path_evaluation_outs: ['/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF']
path_evaluation_config: /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/evaluation_pipeline.yml
path_mdata: /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/cNMF.h5mu
path_report_out: /cellar/users/aklie/opt/gene_program_evaluation/examples/report/Endothelial/cNMF
prog_keys: ['cNMF']
data_key: rna
categorical_keys: ['batch', 'sample']
continuous_keys: ['n_counts']
annotations_loc: annotations.csv


## Load mudata

In [96]:
mdata = mudata.read_h5mu(path_mdata)
mdata.mod = collections.OrderedDict(sorted(mdata.mod.items()))
mdata



## `method` and `n_components`

In [97]:
def parse_methods(mdata, data_key="rna"):
    methods = {}
    n_components = {}
    for key in mdata.mod.keys():
        method_split = key.split("_")
        if len(method_split) > 1:
            method = "_".join(method_split[:-1])
        else:
            method = method_split[0]
        if method != data_key:
            methods[key] = method
            n_components[key] = mdata.mod[key].X.shape[1]
    return methods, n_components

In [98]:
res_methods, res_ks = parse_methods(mdata, data_key=data_key)

In [99]:
res_methods[prog_keys[0]]

'cNMF'

In [100]:
res_ks[prog_keys[0]]

60

## `loadings`

In [103]:
res_loadings = parse_loadings(mdata, data_key=data_key)

In [104]:
res_loadings[prog_keys[0]]

gene_name,FAM87B:ENSG00000177757,FAM41C:ENSG00000230368,SAMD11:ENSG00000187634,NOC2L:ENSG00000188976,KLHL17:ENSG00000187961,PLEKHN1:ENSG00000187583,PERM1:ENSG00000187642,HES4:ENSG00000188290,ISG15:ENSG00000187608,AGRN:ENSG00000188157,...,MT-ATP8:ENSG00000228253,MT-ATP6:ENSG00000198899,MT-CO3:ENSG00000198938,MT-ND3:ENSG00000198840,MT-ND4L:ENSG00000212907,MT-ND4:ENSG00000198886,MT-ND5:ENSG00000198786,MT-ND6:ENSG00000198695,MT-CYB:ENSG00000198727,MAFIP:ENSG00000274847
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,5.860408e-07,0.0001419481,5.4e-05,4.9e-05,0.0,5.3e-05,9.3e-05,0.0,...,0.0,0.0,0.0,0.0,5.2e-05,0.0,0.0,0.0,0.0,4.115031e-06
2,0.0,6.6e-05,0.0,2.356824e-05,2.2e-05,0.0,0.0,0.0,0.000118,0.0,...,0.000151,0.000232,0.0,0.0,0.000259,0.000157,0.000102,0.0,8.4e-05,3.391089e-05
3,0.0,1.5e-05,0.0,4.494347e-05,6.3e-05,0.0,0.0,0.000291,5.7e-05,7.4e-05,...,7.6e-05,0.000199,0.000203,0.000136,0.000185,0.000203,0.000198,0.000291,0.000165,0.0
4,0.0,0.0,5.437769e-05,0.0001329747,1.6e-05,0.00017,0.0,9e-05,0.00015,0.0,...,0.0,0.0,0.0,0.0,0.000169,2.9e-05,0.000139,0.000193,0.0,0.0
5,0.0,0.0,0.0,9.711936e-05,0.000325,7.4e-05,0.0,0.000399,0.008099,0.000147,...,4.2e-05,0.000125,0.00012,0.000137,0.000125,0.000169,0.000105,3e-06,0.000147,0.0
6,0.0,0.0,0.0,0.0001611588,0.0,4e-06,2.142176e-05,0.0,6.7e-05,0.0,...,0.000225,0.000114,0.000101,0.000103,0.000187,0.000133,0.000106,3.3e-05,9.1e-05,0.0
7,0.0,1.8e-05,0.0,0.000158323,0.0,3.1e-05,0.0,3.2e-05,0.0,4.2e-05,...,0.0,0.000728,0.000932,0.000963,0.0,0.001051,0.0,0.0,0.001114,0.0
8,0.0,0.0,0.0,2.932366e-05,0.0,0.0,0.0,7.4e-05,4.9e-05,3.2e-05,...,0.0,7.9e-05,0.000318,0.000304,0.000256,0.000202,0.000245,0.0,0.00036,3.530803e-05
9,0.0,0.0,0.0,0.0001292109,3e-05,5.1e-05,0.0,0.0,4.4e-05,6.8e-05,...,0.0,0.00021,0.000344,0.0,0.0,0.000297,0.0,0.0,9.9e-05,3.722996e-05
10,0.0,1e-06,5.961796e-05,0.0,0.0,1e-05,0.0,0.003982,0.000262,0.000337,...,6.6e-05,8.5e-05,8.2e-05,7.9e-05,0.000118,9.8e-05,0.000113,7.5e-05,6.4e-05,0.0


## `obs_membership`

In [105]:
res_obs_membership = parse_obs_memberships(mdata, data_key=data_key)

In [106]:
res_obs_membership[prog_keys[0]]

program_name,1,2,3,4,5,6,7,8,9,10,...,51,52,53,54,55,56,57,58,59,60
obs_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"('ANKEF1:GAAGGGACATCATTCACGCCT:AAACCCAAGAAGTCAT-scRNAseq_2kG_11AMDox_1',)",121.749060,64.133135,8.107442,301.635912,11.174598,0.000000,0.000000,97.718911,336.144644,1.291447,...,233.636156,447.831548,139.534562,0.000000,0.000000,20.765844,143.199367,290.182025,34.604045,628.739760
"('MTRR:GTGGTCCTGGGTACCGAGCAT:AAACCCAAGAGGACTC-scRNAseq_2kG_11AMDox_1',)",116.857174,58.951640,0.000000,24.340045,0.291422,0.000000,0.000000,0.000000,0.000000,3.313101,...,67.100028,415.237080,106.003152,0.000000,0.000000,124.525017,175.423327,289.857663,156.831588,221.619807
"('JAG1:GATGCGCCCTGCCCGGCGTGC:AAACCCACAATCGCAT-scRNAseq_2kG_11AMDox_1',)",0.000000,0.000000,9.599494,0.000000,0.000000,0.000000,223.030173,0.000000,136.979013,0.000000,...,532.648754,204.271224,256.087966,0.000000,0.000000,225.720921,315.902555,221.630012,191.158832,1508.063771
"('GOLPH3L:GGAAGTTTGTGCTCTCTGCG:AAACCCACACCAGCGT-scRNAseq_2kG_11AMDox_1',)",0.000000,122.133827,10.861754,105.185642,0.000000,0.000000,0.000000,94.525288,9.906529,0.000000,...,199.618592,348.723283,118.126613,75.682063,23.327307,46.885769,254.726283,231.582575,376.122800,932.057594
"('ARHGEF15-TSS2:GACCTACTGCAGAGTTAGGG:AAACCCACATTAAGCC-scRNAseq_2kG_11AMDox_1',)",541.275596,37.484053,0.000000,754.398716,0.000000,1.170416,0.000000,95.477637,264.667966,16.275740,...,391.251173,344.427816,0.000000,346.156259,145.217304,195.982947,79.628793,265.616580,204.519899,901.975519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"('ARHGAP26:GCTCGGAAGTGCGGACTATCG:TTTGTTGCAGTTGCGC-scRNAseq_2kG_2PMDox2_7',)",18.207947,77.236697,37.510989,179.563481,0.000000,0.000000,264.594245,55.222566,58.927143,3.485854,...,196.284818,383.096991,97.975362,47.863879,141.223926,78.169581,48.317410,65.962411,54.581000,0.000000
"('Enhancer-BP-etc-chr7-150704624-150705124:GCAAGTGGGCGCACAAAGGGA:TTTGTTGGTATCACGT-scRNAseq_2kG_2PMDox2_7',)",176.165401,0.000000,3.781613,88.219071,0.000000,0.000000,522.186737,337.551912,0.000000,5.781014,...,0.000000,210.987869,27.231758,15.451982,220.460153,28.465382,374.087279,277.836118,217.102964,166.132388
"('ERGIC3:GCATGGGGACCGGCCGGAAAG:TTTGTTGGTTCAGCGC-scRNAseq_2kG_2PMDox2_7',)",128.595762,0.000000,33.366939,299.690733,22.669654,0.000000,904.535637,92.267307,0.000000,0.000000,...,73.585602,551.063049,0.000000,79.799175,364.806463,42.103565,327.637684,343.052591,409.746455,260.209294
"('MMRN2:GTCTTCAAGTTAAATCTCAGG:TTTGTTGTCTATCACT-scRNAseq_2kG_2PMDox2_7',)",47.865646,0.000000,12.960127,29.554090,16.559151,1.112656,45.628822,17.997079,39.754875,5.333006,...,190.243596,70.161919,19.528940,0.000000,115.283159,277.144962,166.382661,75.062468,217.111685,31.075039


## `categorical_associations`

In [107]:
def parse_categorical_associations(dirs, prog_keys):
    categorical_associations_results = {}
    categorical_associations_posthoc = {}
    for dir, prog_key in zip(dirs, prog_keys):
        
        # Initialize dictionaries 
        categorical_associations_results[prog_key] = {}
        categorical_associations_posthoc[prog_key] = {}

        # Load association results
        categorical_association_files = glob.glob(os.path.join(dir, f"{prog_key}_*_association_results.txt"))
        for categorical_association_file in categorical_association_files:
            categorical_association_df = pd.read_csv(categorical_association_file, sep="\t")
            categorical_key = categorical_association_file.split(f"{prog_key}_")[1].split("_association_results.txt")[0]
            categorical_associations_results[prog_key][categorical_key] = categorical_association_df
        
        # Load association posthoc results
        categorical_association_posthoc_files = glob.glob(os.path.join(dir, f"{prog_key}_*_association_posthoc.txt"))
        for categorical_association_posthoc_file in categorical_association_posthoc_files:
            categorical_association_posthoc_df = pd.read_csv(categorical_association_posthoc_file, sep="\t")
            categorical_key = categorical_association_posthoc_file.split(f"{prog_key}_")[1].split("_association_posthoc.txt")[0]
            categorical_associations_posthoc[prog_key][categorical_key] = categorical_association_posthoc_df
            
    return categorical_associations_results, categorical_associations_posthoc

In [108]:
res_categorical_associations_results, res_categorical_associations_posthoc = parse_categorical_associations(path_evaluation_outs, prog_keys)

In [109]:
res_categorical_associations_results[prog_keys[0]]["batch"]

Unnamed: 0,program_name,batch_max_pearsonr_stat,batch_max_pearsonr_pval
0,1,0.1706,0.0
1,2,0.055254,1.277865e-144
2,3,0.033283,1.256675e-53
3,4,0.23908,0.0
4,5,0.023984,1.1446270000000002e-28
5,6,0.014689,1.029003e-11
6,7,0.268933,0.0
7,8,0.090685,0.0
8,9,0.649342,0.0
9,10,0.011966,3.002763e-08


In [22]:
res_categorical_associations_posthoc[prog_keys[0]]["batch"]

KeyError: 'batch'

## `perturbation_association_results`

In [60]:
def parse_perturbation_associations(
    dirs, 
    prog_keys,
    stratification_key=None
):
    perturbation_associations = {}
    for dir, prog_key in zip(dirs, prog_keys):
        if prog_key not in perturbation_associations:
            perturbation_associations[prog_key] = {}
        perturbation_associations[prog_key]["results"] = {}
        perturbation_associations[prog_key]["gene_guides"] = []
        perturbation_associations[prog_key]["stratification_keys"] = []
        perturbation_associations[prog_key]["level_keys"] = []
        perturbation_association_files = glob.glob(os.path.join(dir, f"{prog_key}_*_perturbation_association.txt"))
        for perturbation_association_file in perturbation_association_files:
            gene_guide = perturbation_association_file.split(f"{prog_key}_")[1].split("_")[0]
            if not stratification_key:
                curr_stratification_key = "global"
                curr_level_key = "global"
            else:
                curr_stratification_key = stratification_key
                curr_level_key = perturbation_association_file.split(f"{prog_key}_{gene_guide}_{curr_stratification_key}_")[1].split("_perturbation_association.txt")[0]
            print(f"Gene/guide: {gene_guide}, Stratification key: {curr_stratification_key}, Level key: {curr_level_key}")
            df = pd.read_csv(perturbation_association_file, sep="\t")
            perturbation_associations[prog_key]["results"][f"{gene_guide}_{curr_stratification_key}_{curr_level_key}"] = df
            perturbation_associations[prog_key]["gene_guides"].append(gene_guide)
            perturbation_associations[prog_key]["stratification_keys"].append(curr_stratification_key)
            perturbation_associations[prog_key]["level_keys"].append(curr_level_key)
    return perturbation_associations

In [61]:
pertubation_associations = parse_perturbation_associations(
    path_evaluation_outs, 
    prog_keys, 
    stratification_key=evaluation_config["perturbation_association"]["groupby_key"]
)

Gene/guide: gene, Stratification key: global, Level key: global


In [62]:
first_gene_guide = pertubation_associations[prog_keys[0]]["gene_guide"][0]
first_stratification_key = pertubation_associations[prog_keys[0]]["stratification_key"][0]
first_level_key = pertubation_associations[prog_keys[0]]["level_key"][0]
res_pertubation_association = pertubation_associations[prog_keys[0]]["results"][f"{first_gene_guide}_{first_stratification_key}_{first_level_key}"]
res_pertubation_association.head()

Unnamed: 0,target_name,program_name,stat,pval
0,ABCC10,5,0.57549,0.00015102
1,ABCG4,56,0.23005,5.3652e-05
2,ABHD8,22,0.45855,2.9076e-07
3,ACAD11,15,0.64903,6.9167e-08
4,ACAD11,30,0.59919,4.8005e-12


In [63]:
pertubation_associations[prog_keys[0]]["results"].keys()

dict_keys(['gene_global_global'])

In [113]:
pertubation_associations[prog_keys[0]].keys()

dict_keys(['results', 'gene_guide', 'stratification_key', 'level_key'])

## `explained_variance`

In [64]:
def parse_explained_variance(dirs, prog_keys):
    explained_variance_ratios = {}
    for dir, prog_key in zip(dirs, prog_keys):
        try:
            explained_variance_ratio_file = os.path.join(dir, f"{prog_key}_variance_explained_ratio.txt")
            df = pd.read_csv(explained_variance_ratio_file, sep="\t")
            explained_variance_ratios[prog_key] = df
        except FileNotFoundError:
            print(f"File not found: {explained_variance_ratio_file}")
    return explained_variance_ratios

In [65]:
res_explained_variance = parse_explained_variance(path_evaluation_outs, prog_keys)

In [66]:
res_explained_variance[prog_keys[0]]

Unnamed: 0,program_name,variance_explained_ratio
0,1,0.018046
1,2,0.026766
2,3,0.013374
3,4,0.029387
4,5,0.002182
5,6,0.001648
6,7,0.023513
7,8,0.014861
8,9,0.0297
9,10,0.001846


## `geneset_enrichments`

In [67]:
def parse_geneset_enrichments(dirs, prog_keys):
    geneset_enrichments = {}
    for dir, prog_key in zip(dirs, prog_keys):
        if prog_key not in geneset_enrichments:
            geneset_enrichments[prog_key] = {}
        geneset_enrichments[prog_key]["results"] = {}
        geneset_enrichments[prog_key]["libraries"] = []
        geneset_enrichments[prog_key]["methods"] = []
        geneset_enrichment_files = glob.glob(os.path.join(dir, f"{prog_key}_*_geneset_enrichment.txt"))
        for gene_set_enrichment_file in geneset_enrichment_files:
            method = gene_set_enrichment_file.split("_geneset_enrichment.txt")[0].split("_")[-1]
            library = gene_set_enrichment_file.split(f"{prog_key}_")[1].split(f"_{method}_geneset_enrichment.txt")[0]
            print(f"Library: {library}, Method: {method}")
            df = pd.read_csv(gene_set_enrichment_file, sep="\t")
            geneset_enrichments[prog_key]["results"][f"{library}_{method}"] = df
            geneset_enrichments[prog_key]["libraries"].append(library)
            geneset_enrichments[prog_key]["methods"].append(method)
    return geneset_enrichments

In [68]:
geneset_enrichments = parse_geneset_enrichments(path_evaluation_outs, prog_keys)

Library: Reactome_2022, Method: fisher
Library: GO_Biological_Process_2023, Method: fisher


In [69]:
geneset_enrichments[prog_keys[0]]["results"].keys()

dict_keys(['Reactome_2022_fisher', 'GO_Biological_Process_2023_fisher'])

In [70]:
# Find all unique libraries in list in one line
list(set(geneset_enrichments[prog_keys[0]]["libraries"]))

['GO_Biological_Process_2023', 'Reactome_2022']

In [71]:
first_method = geneset_enrichments[prog_keys[0]]["methods"][0]
first_library = geneset_enrichments[prog_keys[0]]["libraries"][0]
res_geneset_enrichments = geneset_enrichments[prog_keys[0]]["results"][f"{first_library}_{first_method}"]
res_geneset_enrichments.head()

Unnamed: 0,Gene_set,Term,P-value,Adjusted P-value,Odds Ratio,Combined Score,Genes,program_name,overlap_numerator,overlap_denominator
0,gs_ind_0,2-LTR Circle Formation R-HSA-164843,0.136777,0.441518,9.708661,19.314409,HMGA1,1,1,8
1,gs_ind_0,ABC Transporter Disorders R-HSA-5619084,0.165154,0.467384,2.494413,4.492128,UBC;PSMA7;PSMD8,1,3,77
2,gs_ind_0,ABC-family Proteins Mediated Transport R-HSA-3...,0.038397,0.344295,2.988335,9.741281,PSMD8;PSMA7;EIF2S3;UBC;EIF2S2,1,5,102
3,gs_ind_0,ADORA2B Mediated Anti-Inflammatory Cytokine Pr...,0.911332,0.934492,0.620191,0.057583,ADM2,1,1,131
4,gs_ind_0,ALK Mutants Bind TKIs R-HSA-9700645,0.198016,0.497816,6.59937,10.687067,NPM1,1,1,12


## `motif_enrichments`

In [72]:
def parse_motif_enrichments(
    dirs, 
    prog_keys,
    stratification_key=None
):
    motif_enrichments = {}
    for dir, prog_key in zip(dirs, prog_keys):
        if prog_key not in motif_enrichments:
            motif_enrichments[prog_key] = {}
        motif_enrichments[prog_key]["results"] = {}
        motif_enrichments[prog_key]["E_P_types"] = []
        motif_enrichments[prog_key]["databases"] = []
        motif_enrichments[prog_key]["test_types"] = []
        motif_enrichments[prog_key]["stratification_keys"] = []
        motif_enrichments[prog_key]["level_keys"] = []
        motif_enrichment_files = glob.glob(os.path.join(dir, f"{prog_key}_*_motif_enrichment.txt"))
        for motif_enrichment_file in motif_enrichment_files:
            E_P_type = motif_enrichment_file.split(f"{prog_key}_")[1].split("_")[0]
            database = motif_enrichment_file.split(f"{prog_key}_{E_P_type}_")[1].split("_")[0]
            test_type = motif_enrichment_file.split(f"{prog_key}_{E_P_type}_{database}_")[1].split("_")[0]
            if not stratification_key:
                curr_stratification_key = "global"
                curr_level_key = "global"
            else:
                curr_level_key = motif_enrichment_file.split(f"{prog_key}_{E_P_type}_{database}_{test_type}_{curr_stratification_key}_")[1].split("_motif_enrichment.txt")[0]
                curr_stratification_key = stratification_key
            print(f"E_P_type: {E_P_type}, Database: {database}, Test type: {test_type}, Stratification key: {curr_stratification_key}, Level key: {curr_level_key}")
            df = pd.read_csv(motif_enrichment_file, sep="\t")
            motif_enrichments[prog_key]["results"][f"{E_P_type}_{database}_{test_type}_{curr_stratification_key}_{curr_level_key}"] = df
            motif_enrichments[prog_key]["E_P_types"].append(E_P_type)
            motif_enrichments[prog_key]["databases"].append(database)
            motif_enrichments[prog_key]["test_types"].append(test_type)
            motif_enrichments[prog_key]["stratification_keys"].append(curr_stratification_key)
            motif_enrichments[prog_key]["level_keys"].append(curr_level_key)
    
    return motif_enrichments

In [73]:
res_motif_enrichment = parse_motif_enrichments(
    path_evaluation_outs, 
    prog_keys,
    stratification_key=evaluation_config["motif_enrichment"]["groupby_key"]
)

E_P_type: promoter, Database: hocomoco, Test type: ttest, Stratification key: global, Level key: global
E_P_type: enhancer, Database: hocomoco, Test type: ttest, Stratification key: global, Level key: global


In [75]:
first_E_P_type = res_motif_enrichment[prog_keys[0]]["E_P_types"][0]
first_database = res_motif_enrichment[prog_keys[0]]["databases"][0]
first_test_type = res_motif_enrichment[prog_keys[0]]["test_types"][0]
first_stratification_key = res_motif_enrichment[prog_keys[0]]["stratification_keys"][0]
first_level_key = res_motif_enrichment[prog_keys[0]]["level_keys"][0]
res_motif_enrichment[prog_keys[0]]["results"][f"{first_E_P_type}_{first_database}_{first_test_type}_{first_stratification_key}_{first_level_key}"]

Unnamed: 0,program_name,motif,stat,pval
0,1,AHR,1.594955,0.044631
1,10,AHR,1.242518,0.351685
2,11,AHR,0.901666,0.681555
3,12,AHR,1.204339,0.446282
4,13,AHR,0.949163,0.829277
...,...,...,...,...
40615,6,ZSCA4,0.742726,0.234976
40616,60,ZSCA4,1.113110,0.668214
40617,7,ZSCA4,0.842898,0.488940
40618,8,ZSCA4,0.982363,0.936049


In [76]:
res_motif_enrichment[prog_keys[0]]["results"].keys()

dict_keys(['promoter_hocomoco_ttest_global_global', 'enhancer_hocomoco_ttest_global_global'])

## `trait_enrichments`

In [110]:
# Same as geneset enrichmetns
def parse_trait_enrichments(dirs, prog_keys):
    trait_enrichments = {}
    for dir, prog_key in zip(dirs, prog_keys):
        if prog_key not in trait_enrichments:
            trait_enrichments[prog_key] = {}
        trait_enrichments[prog_key]["results"] = {}
        trait_enrichments[prog_key]["databases"] = []
        trait_enrichments[prog_key]["methods"] = []
        trait_enrichment_files = glob.glob(os.path.join(dir, f"{prog_key}_*_trait_enrichment.txt"))
        for trait_enrichment_file in trait_enrichment_files:
            method = trait_enrichment_file.split("_trait_enrichment.txt")[0].split("_")[-1]
            database = trait_enrichment_file.split(f"{prog_key}_")[1].split(f"_{method}_trait_enrichment.txt")[0]
            print(f"Database: {database}, Method: {method}")
            df = pd.read_csv(trait_enrichment_file, sep="\t")
            trait_enrichments[prog_key]["results"][f"{database}_{method}"] = df
            trait_enrichments[prog_key]["databases"].append(database)
            trait_enrichments[prog_key]["methods"].append(method)
    return trait_enrichments

In [111]:
trait_enrichments = parse_trait_enrichments(path_evaluation_outs, prog_keys)

Database: OT_GWAS, Method: fisher


In [112]:
first_method = trait_enrichments[prog_keys[0]]["methods"][0]
first_database= trait_enrichments[prog_keys[0]]["databases"][0]
res_trait_enrichments = trait_enrichments[prog_keys[0]]["results"][f"{first_database}_{first_method}"]
res_trait_enrichments.head()

Unnamed: 0,term,adj_pval,trait_efos,trait_category,program_name,enrichment,trait_reported,genes,study_id,pmid,-log10(adj_pval)
0,GO_0036273,0.020498,GO_0036273,biological process,19,8.017121,Statin medication,IRF2BP2;PNPLA3;LIPG;HMGCR;LDLR;PIR;VEGFA,FINNGEN_R6_RX_STATIN,,1.688285
1,GO_0036273,0.057469,GO_0036273,biological process,18,4.473323,Statin medication,MAFB;CDKN2B;TRIB1;HHEX;VEGFA,FINNGEN_R6_RX_STATIN,,1.24057
2,EFO_0004875,0.086676,EFO_0004875,biological process,36,1.894045,Highest math class taken (MTAG) [MTAG],TET2;NR2F2;CTTNBP2NL;PHIP;RYBP;ZBTB38;PURA;ZSW...,GCST006568,PMID:30038396,1.062102
3,GO_0006306,0.115787,GO_0006306,biological process,47,2.561764,DNA methylation variation (age effect),FRY;DACH1;NFIA;TNR;ZNF467;PLCB1;EEA1;SVIL;ATP9A,GCST006660,PMID:30348214,0.936342
4,EFO_0004337,0.116656,EFO_0004337,biological process,36,3.542221,General cognitive ability,TET2;RBMS1;ATF7IP;FOXO3;NR1D2;JMJD1C,GCST006269,PMID:29844566,0.933092


In [358]:
res_trait_enrichments.query(f"program_name == '0'")

Unnamed: 0,term,adj_pval,trait_efos,trait_category,program_name,trait_reported,genes,study_id,pmid,-log10(adj_pval)


## `software_versions`

In [80]:
res_software_versions = parse_software_versions(path_evaluation_outs)

In [81]:
res_software_versions[prog_keys[0]]

{'evaluation_pipeline_versions': {'gene_program_evaluation': '0.0.1',
  'gseapy': '1.1.3',
  'mudata': '0.2.3',
  'numpy': '1.26.2',
  'pandas': '1.5.3',
  'scikit-learn': '1.3.2',
  'scikit-posthocs': '0.9.0',
  'scipy': '1.11.4',
  'statsmodels': '0.14.0',
  'tangermeme': '0.3.0'}}

# `parse`

In [82]:
def parse(
    mdata,
    dirs,
    data_key="rna",
    perturbation_association_stratification_key=None,
    motif_enrichment_stratification_key=None
):
    methods, n_components = parse_methods(mdata, data_key)
    loadings = parse_loadings(mdata, data_key)
    obs_memberships = parse_obs_memberships(mdata, data_key)
    categorical_associations_results, categorical_associations_posthoc = parse_categorical_associations(dirs, methods.keys())
    perturbation_associations = parse_perturbation_associations(dirs, methods.keys(), perturbation_association_stratification_key)
    geneset_enrichments = parse_geneset_enrichments(dirs, methods.keys())
    trait_enrichments = parse_trait_enrichments(dirs, methods.keys())
    motif_enrichments = parse_motif_enrichments(dirs, methods.keys(), motif_enrichment_stratification_key)
    explained_variance_ratios = parse_explained_variance(dirs, methods.keys())
    software_versions = parse_software_versions(dirs)
    return {
        "methods": methods,
        "n_components": n_components,
        "loadings": loadings,
        "obs_memberships": obs_memberships,
        "categorical_associations_results": categorical_associations_results,
        "categorical_associations_posthoc": categorical_associations_posthoc,
        "perturbation_associations": perturbation_associations,
        "geneset_enrichments": geneset_enrichments,
        "trait_enrichments": trait_enrichments,
        "motif_enrichments": motif_enrichments,
        "explained_variance_ratios": explained_variance_ratios,
        "software_versions": software_versions,
    }

In [83]:
results = parse(
    mdata,
    path_evaluation_outs,
    data_key=data_key,
    perturbation_association_stratification_key=evaluation_config["perturbation_association"]["groupby_key"],
    motif_enrichment_stratification_key=evaluation_config["motif_enrichment"]["groupby_key"]
)

Gene/guide: gene, Stratification key: global, Level key: global
Library: Reactome_2022, Method: fisher
Library: GO_Biological_Process_2023, Method: fisher
Database: OT_GWAS, Method: fisher
E_P_type: promoter, Database: hocomoco, Test type: ttest, Stratification key: global, Level key: global
E_P_type: enhancer, Database: hocomoco, Test type: ttest, Stratification key: global, Level key: global


In [85]:
default_run = "cNMF"

In [86]:
results.keys()

dict_keys(['methods', 'n_components', 'loadings', 'obs_memberships', 'categorical_associations_results', 'categorical_associations_posthoc', 'perturbation_associations', 'geneset_enrichments', 'trait_enrichments', 'motif_enrichments', 'explained_variance_ratios', 'software_versions'])

In [87]:
results["explained_variance_ratios"]

{'cNMF':     program_name  variance_explained_ratio
 0              1                  0.018046
 1              2                  0.026766
 2              3                  0.013374
 3              4                  0.029387
 4              5                  0.002182
 5              6                  0.001648
 6              7                  0.023513
 7              8                  0.014861
 8              9                  0.029700
 9             10                  0.001846
 10            11                  0.003365
 11            12                  0.012964
 12            13                  0.008912
 13            14                  0.000281
 14            15                  0.015150
 15            16                  0.003249
 16            17                  0.003378
 17            18                  0.004095
 18            19                  0.007991
 19            20                  0.005638
 20            21                  0.003457
 21            22       

In [88]:
results["motif_enrichments"][prog_key]["results"].keys()

NameError: name 'prog_key' is not defined

# DONE!

---