In [233]:
import os
import glob
import sys
import numpy as np
import pandas as pd
import collections
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation')
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation/app/')
import mudata
import scanpy as sc

from utils import count, count_unique
from utils import load_config

from parse import (
    parse_methods,
    parse_loadings,
    parse_obs,
    parse_obs_memberships,
    parse_software_versions
)

In [186]:
# Pipeline outputs
path_config = "/cellar/users/aklie/opt/gene_program_evaluation/examples/report/iPSC_EC/cNMF/cNMF_30/report.yaml"

In [187]:
config = load_config(path_config)

In [188]:
# Parse config for paths
path_evaluation_outs = config["path_evaluation_outs"]
path_mdata = config["path_mdata"]
path_evaluation_config = config["path_evaluation_config"]
path_report_out = config["path_report_out"]

In [189]:
# Parse config for other parameters
data_key = config["data_key"]
prog_keys = config["prog_keys"]
categorical_keys = config["categorical_keys"] if config["categorical_keys"] else []
continuous_keys = config["continuous_keys"] if config["continuous_keys"] else []
annotations_loc = config["annotations_loc"]

In [190]:
# Load evaluation config
evaluation_config = load_config(path_evaluation_config)
evaluation_config

{'io': {'path_mdata': '/cellar/users/aklie/opt/gene_program_evaluation/examples/inference/iPSC_EC/cNMF/cNMF_30_0.2_gene_names.h5mu',
  'path_out': '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF/cNMF_30',
  'data_key': 'rna',
  'prog_key': 'cNMF'},
 'categorical_association': {'categorical_keys': ['sample'],
  'pseudobulk_key': None,
  'test': 'pearsonr',
  'mode': 'one_vs_all',
  'n_jobs': -1,
  'inplace': False},
 'perturbation_association': {'groupby_key': 'sample',
  'collapse_targets': True,
  'pseudobulk': False,
  'reference_targets': ['non-targeting'],
  'n_jobs': -1,
  'inplace': False},
 'gene_set_enrichment': {'prog_nam': None,
  'organism': 'human',
  'libraries': ['Reactome_2022', 'GO_Biological_Process_2023'],
  'method': 'fisher',
  'database': 'enrichr',
  'n_top': 500,
  'low_cutoff': 0.0,
  'n_jobs': -1,
  'inplace': False,
  'user_geneset': None,
  'max_size': 500,
  'min_size': 5},
 'trait_enrichment': {'gwas_data': '/cellar/users/a

In [191]:
# Log configuration
for key, value in config.items():
    print((f"{key}: {value}"))

path_evaluation_outs: ['/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF/cNMF_30']
path_evaluation_config: /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF/cNMF_30/evaluation_pipeline.yml
path_mdata: /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF/cNMF_30/cNMF_30.h5mu
path_report_out: /cellar/users/aklie/opt/gene_program_evaluation/examples/report/iPSC_EC/cNMF/cNMF_30
prog_keys: ['cNMF_30']
data_key: rna
categorical_keys: ['sample', 'leiden']
continuous_keys: ['n_counts']
annotations_loc: annotations.csv


## Load mudata

In [156]:
mdata = mudata.read_h5mu(path_mdata)
mdata.mod = collections.OrderedDict(sorted(mdata.mod.items()))
mdata

  utils.warn_names_duplicates("var")


## `method` and `n_components`

In [201]:
def parse_methods(mdata, data_key="rna"):
    methods = {}
    n_components = {}
    for key in mdata.mod.keys():
        method_split = key.split("_")
        if len(method_split) > 1:
            method = "_".join(method_split[:-1])
        else:
            method = method_split[0]
        if method != data_key:
            methods[key] = method
            n_components[key] = mdata.mod[key].X.shape[1]
    return methods, n_components

In [202]:
res_methods, res_ks = parse_methods(mdata, data_key=data_key)

In [203]:
res_methods[prog_keys[0]]

'cNMF'

In [204]:
res_ks[prog_keys[0]]

30

## `loadings`

In [205]:
res_loadings = parse_loadings(mdata, data_key=data_key)

In [206]:
res_loadings[prog_keys[0]]

gene_name,SEMA3F,CFTR,CYP51A1,HECW1,KLHL13,CASP10,CFLAR,TFPI,MTMR7,SLC7A2,...,AC026316.5,AL591485.1,AL162417.1,AL390957.1,LINC02478,AL033530.1,AL162718.1,AC111006.1,AL136419.1,AC007846.2
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000267,0.0001066837,0.0,0.000881,0.000513,8.4e-05,0.0,0.0,0.000722,0.000409,...,0.0,0.0,6.1e-05,0.0,0.0,0.0,0.0,6.4e-05,1.845254e-05,3.9e-05
1,0.0,0.0,0.000607,0.0,0.0,9.4e-05,0.001517148,0.001669,0.0,2.3e-05,...,0.0,0.0,0.000223,0.000127,0.000181,0.0,0.000173,2.5e-05,3.621885e-05,0.0
2,0.000199,0.0,0.0,0.000215,0.000155,0.0,1.138748e-05,0.0,0.000174,0.000238,...,4.5e-05,0.000142,6e-05,6.8e-05,9.7e-05,0.000254,2e-06,0.0002,7.164031e-05,8.4e-05
3,0.0,0.0,0.001982,0.0,0.000225,6.1e-05,4.756013e-05,0.0,0.000258,0.000399,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9e-05,2.844309e-05,0.0
4,0.00113,0.0,0.0,9.9e-05,0.000232,7.7e-05,0.0,0.0,0.00017,0.000401,...,0.0,0.0,7.2e-05,0.0,5e-06,0.0,0.0,0.0,3.507478e-05,0.0
5,0.0,0.0,0.001291,0.0,0.000113,8.4e-05,8.444412e-07,0.0,0.00016,0.0,...,0.000238,0.000141,0.0,0.00018,3e-05,0.000161,0.0,0.000321,6.921149e-06,0.0
6,0.0,4.375732e-05,0.000295,1.9e-05,0.000436,0.0,0.0,0.0,0.00015,0.000641,...,0.0,9e-06,0.0,0.00037,0.0,0.000571,0.0,4.1e-05,7.933604e-05,0.0
7,0.000161,0.0002328258,0.0,0.000813,0.000122,0.000161,0.0,0.0,0.000581,0.000402,...,0.00029,3.4e-05,0.0,0.0,0.0,0.000137,0.0,4.7e-05,6.838258e-06,0.000145
8,0.0,0.0,0.001347,0.0,0.000524,8.7e-05,0.0001198437,0.0,0.0,3.4e-05,...,0.0,0.0,0.000195,0.0,0.0,0.0,0.0,9.8e-05,0.0001055024,0.0
9,0.000115,0.0,0.001269,0.0,0.0,0.00025,0.00153344,0.002689,0.0,0.0,...,0.0,3.9e-05,0.0,0.000147,6.5e-05,6e-06,0.000333,0.0,2.424239e-05,0.0


## `obs_membership`

In [None]:
res_obs_membership = parse_obs_memberships(mdata, data_key=data_key)

In [None]:
res_obs_membership[prog_keys[0]]

program_name,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
obs_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01_01_06__s1,0.307338,0.000000,0.121418,0.135263,0.140723,0.000000,0.013140,0.030572,0.021220,0.008757,...,0.022937,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
01_01_17__s1,0.231799,0.014621,0.051004,0.140972,0.154207,0.002151,0.018437,0.066379,0.000000,0.012373,...,0.000000,0.000000,0.000000,0.000000,0.016740,0.028391,0.000000,0.014055,0.017937,0.013680
01_01_29__s1,0.294956,0.000000,0.000000,0.154108,0.169964,0.000000,0.152439,0.063943,0.066985,0.000000,...,0.000000,0.006544,0.000000,0.011999,0.008542,0.014532,0.033448,0.000000,0.000000,0.005936
01_01_34__s1,0.119799,0.000000,0.090509,0.072802,0.067988,0.000000,0.027995,0.232284,0.014503,0.004474,...,0.000000,0.000000,0.000000,0.160598,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
01_01_55__s1,0.079256,0.000000,0.132262,0.155498,0.123116,0.000000,0.103695,0.160777,0.000000,0.009275,...,0.000000,0.009231,0.002798,0.000000,0.059831,0.000000,0.005614,0.000000,0.011201,0.007107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48_95_94__s3,0.000000,0.327238,0.024796,0.000000,0.000000,0.000000,0.000000,0.072411,0.000000,0.302006,...,0.000000,0.000000,0.000000,0.000000,0.012733,0.000000,0.000000,0.000000,0.000000,0.000000
48_96_21__s3,0.001956,0.326830,0.005394,0.049566,0.000000,0.000000,0.000000,0.000000,0.016738,0.333939,...,0.020051,0.007999,0.000000,0.000000,0.007264,0.000000,0.000000,0.024587,0.008488,0.000000
48_96_35__s3,0.000000,0.427634,0.044526,0.048376,0.008330,0.000000,0.000000,0.000000,0.004547,0.224355,...,0.000000,0.000000,0.000000,0.016253,0.000000,0.000000,0.000000,0.006801,0.001932,0.000000
48_96_42__s3,0.000000,0.412141,0.000000,0.000000,0.000000,0.000000,0.000000,0.033835,0.000000,0.157821,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.014767,0.000000,0.051600,0.000000,0.006799


## `categorical_associations`

In [None]:
def parse_categorical_associations(dirs, prog_keys):
    categorical_associations_results = {}
    categorical_associations_posthoc = {}
    for dir, prog_key in zip(dirs, prog_keys):
        
        # Initialize dictionaries 
        categorical_associations_results[prog_key] = {}
        categorical_associations_posthoc[prog_key] = {}

        # Load association results
        categorical_association_files = glob.glob(os.path.join(dir, f"{prog_key}_*_association_results.txt"))
        for categorical_association_file in categorical_association_files:
            categorical_association_df = pd.read_csv(categorical_association_file, sep="\t")
            categorical_key = categorical_association_file.split(f"{prog_key}_")[1].split("_association_results.txt")[0]
            categorical_associations_results[prog_key][categorical_key] = categorical_association_df
        
        # Load association posthoc results
        categorical_association_posthoc_files = glob.glob(os.path.join(dir, f"{prog_key}_*_association_posthoc.txt"))
        for categorical_association_posthoc_file in categorical_association_posthoc_files:
            categorical_association_posthoc_df = pd.read_csv(categorical_association_posthoc_file, sep="\t")
            categorical_key = categorical_association_posthoc_file.split(f"{prog_key}_")[1].split("_association_posthoc.txt")[0]
            categorical_associations_posthoc[prog_key][categorical_key] = categorical_association_posthoc_df
            
    return categorical_associations_results, categorical_associations_posthoc

In [None]:
res_categorical_associations_results, res_categorical_associations_posthoc = parse_categorical_associations(path_evaluation_outs, prog_keys)

In [None]:
res_categorical_associations_results[prog_keys[0]]["sample"]

Unnamed: 0,sample_kruskall_wallis_stat,sample_kruskall_wallis_pval
0,68816.470381,0.0
1,34189.544565,0.0
2,17886.375794,0.0
3,17172.102945,0.0
4,28717.366511,0.0
5,41017.283842,0.0
6,24156.117876,0.0
7,51317.719854,0.0
8,16029.489627,0.0
9,28237.512002,0.0


In [None]:
res_categorical_associations_posthoc[prog_keys[0]]["sample"]

Unnamed: 0,sample_D0_pearsonr_stat,sample_D0_pearsonr_pval,sample_D0_pearsonr_adj_pval,sample_D0_pearsonr_log2FC,sample_sample_D1_pearsonr_stat,sample_sample_D1_pearsonr_pval,sample_sample_D1_pearsonr_adj_pval,sample_sample_D1_pearsonr_log2FC,sample_sample_D2_pearsonr_stat,sample_sample_D2_pearsonr_pval,sample_sample_D2_pearsonr_adj_pval,sample_sample_D2_pearsonr_log2FC,sample_sample_D3_pearsonr_stat,sample_sample_D3_pearsonr_pval,sample_sample_D3_pearsonr_adj_pval,sample_sample_D3_pearsonr_log2FC
0,0.870169,0.0,0.0,3.6224,-0.412455,0.0,0.0,-2.795765,-0.3458,0.0,0.0,-3.154612,-0.512979,0.0,0.0,-3.659611
1,-0.458007,0.0,0.0,-3.753409,-0.270776,0.0,0.0,-3.648265,-0.255153,0.0,0.0,-2.632564,0.732122,0.0,0.0,3.985067
2,0.32526,0.0,0.0,0.782145,0.072408,9.370198e-82,9.370198e-82,0.225597,-0.073127,2.36316e-83,2.36316e-83,-0.264392,-0.354213,0.0,0.0,-1.170326
3,0.340553,0.0,0.0,0.776468,0.125554,2.408202e-239,2.408202e-239,0.350606,-0.130546,9.460306e-259,9.460306e-259,-0.471049,-0.372004,0.0,0.0,-1.17617
4,-0.388457,0.0,0.0,-3.024323,0.840549,0.0,0.0,3.648192,-0.237996,0.0,0.0,-1.889081,-0.343107,0.0,0.0,-2.945606
5,0.451908,0.0,0.0,1.22783,0.031683,4.366861e-15,4.366861e-15,0.109374,-0.229251,0.0,0.0,-1.209603,-0.365807,0.0,0.0,-1.545153
6,0.261422,0.0,0.0,0.680059,0.186126,0.0,0.0,0.5456,-0.151744,0.0,0.0,-0.659965,-0.352161,0.0,0.0,-1.46961
7,0.622893,0.0,0.0,2.22457,-0.252181,0.0,0.0,-1.347491,-0.245848,0.0,0.0,-1.812631,-0.377141,0.0,0.0,-2.276364
8,0.34213,0.0,0.0,0.769623,0.1021,1.263029e-156,1.263029e-156,0.285176,-0.149152,0.0,0.0,-0.548311,-0.34005,0.0,0.0,-1.026575
9,-0.409822,0.0,0.0,-3.045149,-0.252858,0.0,0.0,-2.70432,-0.224408,0.0,0.0,-1.666923,0.666162,0.0,0.0,3.058629


## `perturbation_association_results`

In [340]:
def parse_perturbation_associations(
    dirs, 
    prog_keys,
    stratification_key=None
):
    perturbation_associations = {}
    for dir, prog_key in zip(dirs, prog_keys):
        if prog_key not in perturbation_associations:
            perturbation_associations[prog_key] = {}
        perturbation_associations[prog_key]["results"] = {}
        perturbation_associations[prog_key]["gene_guide"] = []
        perturbation_associations[prog_key]["stratification_key"] = []
        perturbation_associations[prog_key]["level_key"] = []
        perturbation_association_files = glob.glob(os.path.join(dir, f"{prog_key}_*_perturbation_association.txt"))
        for perturbation_association_file in perturbation_association_files:
            gene_guide = perturbation_association_file.split(f"{prog_key}_")[1].split("_")[0]
            if not stratification_key:
                stratification_key = perturbation_association_file.split(f"{prog_key}_{gene_guide}_")[1].split("_")[0]
            level_key = perturbation_association_file.split(f"{prog_key}_{gene_guide}_{stratification_key}_")[1].split("_perturbation_association.txt")[0]
            print(f"Gene/guide: {gene_guide}, Stratification key: {stratification_key}, Level key: {level_key}")
            df = pd.read_csv(perturbation_association_file, sep="\t")
            perturbation_associations[prog_key]["results"][f"{gene_guide}_{stratification_key}_{level_key}"] = df
            perturbation_associations[prog_key]["gene_guide"].append(gene_guide)
            perturbation_associations[prog_key]["stratification_key"].append(stratification_key)
            perturbation_associations[prog_key]["level_key"].append(level_key)
    return perturbation_associations

In [341]:
pertubation_associations = parse_perturbation_associations(
    path_evaluation_outs, 
    prog_keys, 
    stratification_key=evaluation_config["perturbation_association"]["groupby_key"]
)

Gene/guide: gene, Stratification key: sample, Level key: sample_D1
Gene/guide: gene, Stratification key: sample, Level key: sample_D2
Gene/guide: gene, Stratification key: sample, Level key: D0
Gene/guide: gene, Stratification key: sample, Level key: sample_D3


In [342]:
first_gene_guide = pertubation_associations[prog_keys[0]]["gene_guide"][0]
first_stratification_key = pertubation_associations[prog_keys[0]]["stratification_key"][0]
first_level_key = pertubation_associations[prog_keys[0]]["level_key"][0]
res_pertubation_association = pertubation_associations[prog_keys[0]]["results"][f"{first_gene_guide}_{first_stratification_key}_{first_level_key}"]
res_pertubation_association.head()

Unnamed: 0,target_name,program_name,ref_mean,test_mean,log2FC,stat,pval,adj_pval
0,ACAA1,0,0.011169,0.011796,0.078747,327883.0,0.332131,0.815073
1,ACAA1,1,0.004502,0.00435,-0.049623,327927.5,0.301497,0.797924
2,ACAA1,3,0.082368,0.078486,-0.069645,306678.5,0.336355,0.815883
3,ACAA1,5,0.261522,0.255209,-0.035254,308393.5,0.416436,0.857467
4,ACAA1,7,0.017068,0.015546,-0.134803,311624.5,0.573162,0.915457


## `explained_variance`

In [211]:
def parse_explained_variance(dirs, prog_keys):
    explained_variance_ratios = {}
    for dir, prog_key in zip(dirs, prog_keys):
        try:
            explained_variance_ratio_file = os.path.join(dir, f"{prog_key}_variance_explained_ratio.txt")
            df = pd.read_csv(explained_variance_ratio_file, sep="\t")
            explained_variance_ratios[prog_key] = df
        except FileNotFoundError:
            print(f"File not found: {explained_variance_ratio_file}")
    return explained_variance_ratios

In [212]:
res_explained_variance = parse_explained_variance(path_evaluation_outs, prog_keys)

In [213]:
res_explained_variance[prog_keys[0]]

Unnamed: 0,program_name,variance_explained_ratio
0,0,-0.061864
1,1,-0.061864
2,2,-0.061864
3,3,-0.061864
4,4,-0.061864
5,5,-0.061864
6,6,-0.061864
7,7,-0.061864
8,8,-0.061864
9,9,-0.061864


## `geneset_enrichments`

In [290]:
def parse_geneset_enrichments(dirs, prog_keys):
    geneset_enrichments = {}
    for dir, prog_key in zip(dirs, prog_keys):
        if prog_key not in geneset_enrichments:
            geneset_enrichments[prog_key] = {}
        geneset_enrichments[prog_key]["results"] = {}
        geneset_enrichments[prog_key]["libraries"] = []
        geneset_enrichments[prog_key]["methods"] = []
        geneset_enrichment_files = glob.glob(os.path.join(dir, f"{prog_key}_*_geneset_enrichment.txt"))
        for gene_set_enrichment_file in geneset_enrichment_files:
            method = gene_set_enrichment_file.split("_geneset_enrichment.txt")[0].split("_")[-1]
            library = gene_set_enrichment_file.split(f"{prog_key}_")[1].split(f"_{method}_geneset_enrichment.txt")[0]
            print(f"Library: {library}, Method: {method}")
            df = pd.read_csv(gene_set_enrichment_file, sep="\t")
            geneset_enrichments[prog_key]["results"][f"{library}_{method}"] = df
            geneset_enrichments[prog_key]["libraries"].append(library)
            geneset_enrichments[prog_key]["methods"].append(method)
    return geneset_enrichments

In [291]:
geneset_enrichments = parse_geneset_enrichments(path_evaluation_outs, prog_keys)

Library: Reactome_2022, Method: fisher
Library: GO_Biological_Process_2023, Method: fisher


In [298]:
geneset_enrichments[prog_keys[0]]["results"].keys()

dict_keys(['Reactome_2022_fisher', 'GO_Biological_Process_2023_fisher'])

In [297]:
# Find all unique libraries in list in one line
list(set(geneset_enrichments[prog_keys[0]]["libraries"]))

['GO_Biological_Process_2023', 'Reactome_2022']

In [292]:
first_method = geneset_enrichments[prog_keys[0]]["methods"][0]
first_library = geneset_enrichments[prog_keys[0]]["libraries"][0]
res_geneset_enrichments = geneset_enrichments[prog_keys[0]]["results"][f"{first_library}_{first_method}"]
res_geneset_enrichments.head()

Unnamed: 0,program_name,term,pval,adj_pval,enrichment,Combined Score,genes,overlap_numerator,overlap_denominator
0,0,Neuronal System R-HSA-112316,1.60057e-12,1.310867e-09,4.558044,123.799495,GABRA3;CACNA2D3;PRKCA;CACNA1A;SLC1A3;NRXN1;ADC...,38,386
1,0,Transmission Across Chemical Synapses R-HSA-11...,3.196046e-08,1.308781e-05,4.34058,74.913054,GABRA3;CACNA2D3;PRKCA;CACNA1A;SLC1A3;ADCY5;PRK...,24,246
2,0,Chondroitin Sulfate/Dermatan Sulfate Metabolis...,5.738173e-07,0.0001280062,9.850353,141.558982,CHSY1;CHST9;CHST11;VCAN;GPC4;DSE;UST;XYLT1;GPC...,10,50
3,0,Integration Of Energy Metabolism R-HSA-163685,6.251831e-07,0.0001280062,6.083571,86.905161,TKT;ITPR2;PRKCA;PRKAR1B;ITPR3;GNG4;CACNA1A;CAC...,14,105
4,0,Neurexins And Neuroligins R-HSA-6794361,2.048844e-06,0.0003356007,8.392759,109.930332,DLGAP1;NLGN1;DLGAP2;NLGN4Y;NRXN1;NRXN3;NLGN4X;...,10,57


## `motif_enrichments`

In [310]:
def parse_motif_enrichments(
    dirs, 
    prog_keys,
    stratification_key=None
):
    motif_enrichments = {}
    for dir, prog_key in zip(dirs, prog_keys):
        if prog_key not in motif_enrichments:
            motif_enrichments[prog_key] = {}
        motif_enrichments[prog_key]["results"] = {}
        motif_enrichments[prog_key]["E_P_types"] = []
        motif_enrichments[prog_key]["databases"] = []
        motif_enrichments[prog_key]["test_types"] = []
        motif_enrichments[prog_key]["stratification_keys"] = []
        motif_enrichments[prog_key]["level_keys"] = []
        motif_enrichment_files = glob.glob(os.path.join(dir, f"{prog_key}_*_motif_enrichment.txt"))
        for motif_enrichment_file in motif_enrichment_files:
            E_P_type = motif_enrichment_file.split(f"{prog_key}_")[1].split("_")[0]
            database = motif_enrichment_file.split(f"{prog_key}_{E_P_type}_")[1].split("_")[0]
            test_type = motif_enrichment_file.split(f"{prog_key}_{E_P_type}_{database}_")[1].split("_")[0]
            if not stratification_key:
                stratification_key = motif_enrichment_file.split(f"{prog_key}_{E_P_type}_{database}_{test_type}_")[1].split("_")[0]
            level_key = motif_enrichment_file.split(f"{prog_key}_{E_P_type}_{database}_{test_type}_{stratification_key}_")[1].split("_motif_enrichment.txt")[0]
            print(f"E_P_type: {E_P_type}, Database: {database}, Test type: {test_type}, Stratification key: {stratification_key}, Level key: {level_key}")
            df = pd.read_csv(motif_enrichment_file, sep="\t")
            motif_enrichments[prog_key]["results"][f"{E_P_type}_{database}_{test_type}_{stratification_key}_{level_key}"] = df
            motif_enrichments[prog_key]["E_P_types"].append(E_P_type)
            motif_enrichments[prog_key]["databases"].append(database)
            motif_enrichments[prog_key]["test_types"].append(test_type)
            motif_enrichments[prog_key]["stratification_keys"].append(stratification_key)
            motif_enrichments[prog_key]["level_keys"].append(level_key)
    
    return motif_enrichments

In [311]:
res_motif_enrichment = parse_motif_enrichments(
    path_evaluation_outs, 
    prog_keys,
    stratification_key=evaluation_config["motif_enrichment"]["groupby_key"]
)

E_P_type: enhancer, Database: test, Test type: pearsonr, Stratification key: sample, Level key: sample_D2
E_P_type: enhancer, Database: test, Test type: pearsonr, Stratification key: sample, Level key: sample_D3
E_P_type: enhancer, Database: test, Test type: pearsonr, Stratification key: sample, Level key: D0
E_P_type: enhancer, Database: test, Test type: pearsonr, Stratification key: sample, Level key: sample_D1


In [312]:
first_E_P_type = res_motif_enrichment[prog_keys[0]]["E_P_types"][0]
first_database = res_motif_enrichment[prog_keys[0]]["databases"][0]
first_test_type = res_motif_enrichment[prog_keys[0]]["test_types"][0]
first_stratification_key = res_motif_enrichment[prog_keys[0]]["stratification_keys"][0]
first_level_key = res_motif_enrichment[prog_keys[0]]["level_keys"][0]
res_motif_enrichment[prog_keys[0]]["results"][f"{first_E_P_type}_{first_database}_{first_test_type}_{first_stratification_key}_{first_level_key}"]

Unnamed: 0,motif,stat,pval,program_name,adj_pval
0,AHR.H12CORE.0.P.B,0.009386,0.488420,0,0.545213
1,PAX3.H12CORE.0.PS.A,0.007182,0.596011,0,0.650194
2,ALX1.H12CORE.0.SM.B,-0.005934,0.661362,0,0.711780
3,ALX3.H12CORE.0.SM.B,-0.004940,0.715365,0,0.763056
4,FOXF1.H12CORE.0.P.C,-0.003446,0.799215,0,0.837605
...,...,...,...,...,...
235,ALX1.H12CORE.0.SM.B,0.061342,0.000006,9,0.000056
236,PAX3.H12CORE.0.PS.A,0.057666,0.000020,9,0.000151
237,ALX3.H12CORE.0.SM.B,0.054948,0.000049,9,0.000282
238,AHRR.H12CORE.0.P.C,0.051545,0.000141,9,0.000703


## `trait_enrichments`

In [336]:
# Same as geneset enrichmetns
def parse_trait_enrichments(dirs, prog_keys):
    trait_enrichments = {}
    for dir, prog_key in zip(dirs, prog_keys):
        if prog_key not in trait_enrichments:
            trait_enrichments[prog_key] = {}
        trait_enrichments[prog_key]["results"] = {}
        trait_enrichments[prog_key]["databases"] = []
        trait_enrichments[prog_key]["methods"] = []
        trait_enrichment_files = glob.glob(os.path.join(dir, f"{prog_key}_*_trait_enrichment.txt"))
        for trait_enrichment_file in trait_enrichment_files:
            method = trait_enrichment_file.split("_trait_enrichment.txt")[0].split("_")[-1]
            database = trait_enrichment_file.split(f"{prog_key}_")[1].split(f"_{method}_trait_enrichment.txt")[0]
            print(f"Database: {database}, Method: {method}")
            df = pd.read_csv(trait_enrichment_file, sep="\t")
            trait_enrichments[prog_key]["results"][f"{database}_{method}"] = df
            trait_enrichments[prog_key]["databases"].append(database)
            trait_enrichments[prog_key]["methods"].append(method)
    return trait_enrichments

In [337]:
trait_enrichments = parse_trait_enrichments(path_evaluation_outs, prog_keys)

Database: OT_GWAS, Method: fisher


In [339]:
first_method = trait_enrichments[prog_keys[0]]["methods"][0]
first_database= trait_enrichments[prog_keys[0]]["databases"][0]
res_trait_enrichments = trait_enrichments[prog_keys[0]]["results"][f"{first_database}_{first_method}"]
res_trait_enrichments.head()

Unnamed: 0,term,adj_pval,trait_efos,trait_category,program_name,trait_reported,genes,study_id,pmid,-log10(adj_pval)
0,EFO_0004875,2.8248450000000005e-17,EFO_0004875,biological process,0,Highest math class taken (MTAG) [MTAG],CACNA2D3;OSBPL3;PRKG2;DLG2;PRKCA;DMC1;CD47;LUZ...,GCST006568,PMID:30038396,16.549005
1,EFO_0004875,2.8248450000000005e-17,EFO_0004875,biological process,1,Highest math class taken (MTAG) [MTAG],CACNA2D3;OSBPL3;PRKG2;DLG2;PRKCA;DMC1;CD47;LUZ...,GCST006568,PMID:30038396,16.549005
2,EFO_0004875,2.8248450000000005e-17,EFO_0004875,biological process,10,Highest math class taken (MTAG) [MTAG],CACNA2D3;OSBPL3;PRKG2;DLG2;PRKCA;DMC1;CD47;LUZ...,GCST006568,PMID:30038396,16.549005
3,EFO_0004875,2.8248450000000005e-17,EFO_0004875,biological process,11,Highest math class taken (MTAG) [MTAG],CACNA2D3;OSBPL3;PRKG2;DLG2;PRKCA;DMC1;CD47;LUZ...,GCST006568,PMID:30038396,16.549005
4,EFO_0004875,2.8248450000000005e-17,EFO_0004875,biological process,12,Highest math class taken (MTAG) [MTAG],CACNA2D3;OSBPL3;PRKG2;DLG2;PRKCA;DMC1;CD47;LUZ...,GCST006568,PMID:30038396,16.549005


## `software_versions`

In [235]:
res_software_versions = parse_software_versions(path_evaluation_outs)

In [236]:
res_software_versions[prog_keys[0]]

{'evaluation_pipeline_versions': {'gene_program_evaluation': '0.0.1',
  'gseapy': '1.1.3',
  'joblib': '1.3.2',
  'mudata': '0.2.3',
  'numpy': '1.26.2',
  'pandas': '1.5.3',
  'pymemesuite': '0.1.0-a2',
  'scikit-learn': '1.3.2',
  'scikit-posthocs': '0.9.0',
  'scipy': '1.11.4'}}

# `parse`

In [313]:
def parse(
    mdata,
    dirs,
    data_key="rna",
    perturbation_association_stratification_key=None,
    motif_enrichment_stratification_key=None
):
    methods, n_components = parse_methods(mdata, data_key)
    loadings = parse_loadings(mdata, data_key)
    obs_memberships = parse_obs_memberships(mdata, data_key)
    categorical_associations_results, categorical_associations_posthoc = parse_categorical_associations(dirs, methods.keys())
    perturbation_associations = parse_perturbation_associations(dirs, methods.keys(), perturbation_association_stratification_key)
    geneset_enrichments = parse_geneset_enrichments(dirs, methods.keys())
    trait_enrichments = parse_trait_enrichments(dirs, methods.keys())
    motif_enrichments = parse_motif_enrichments(dirs, methods.keys(), motif_enrichment_stratification_key)
    explained_variance_ratios = parse_explained_variance(dirs, methods.keys())
    software_versions = parse_software_versions(dirs)
    return {
        "methods": methods,
        "n_components": n_components,
        "loadings": loadings,
        "obs_memberships": obs_memberships,
        "categorical_associations_results": categorical_associations_results,
        "categorical_associations_posthoc": categorical_associations_posthoc,
        "perturbation_associations": perturbation_associations,
        "geneset_enrichments": geneset_enrichments,
        "trait_enrichments": trait_enrichments,
        "motif_enrichments": motif_enrichments,
        "explained_variance_ratios": explained_variance_ratios,
        "software_versions": software_versions,
    }

In [314]:
results = parse(
    mdata,
    path_evaluation_outs,
    data_key=data_key,
    perturbation_association_stratification_key=evaluation_config["perturbation_association"]["groupby_key"],
    motif_enrichment_stratification_key=evaluation_config["motif_enrichment"]["groupby_key"]
)

/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF/cNMF_30/cNMF_30_gene_sample_sample_D1_perturbation_association.txt
Gene/guide: gene, Stratification key: sample, Level key: sample_D1
/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF/cNMF_30/cNMF_30_gene_sample_sample_D2_perturbation_association.txt
Gene/guide: gene, Stratification key: sample, Level key: sample_D2
/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF/cNMF_30/cNMF_30_gene_sample_D0_perturbation_association.txt
Gene/guide: gene, Stratification key: sample, Level key: D0
/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF/cNMF_30/cNMF_30_gene_sample_sample_D3_perturbation_association.txt
Gene/guide: gene, Stratification key: sample, Level key: sample_D3
Library: Reactome_2022, Method: fisher
Library: GO_Biological_Process_2023, Method: fisher
Database: OT_GWAS, Method: fisher
E_P_type: enhancer, Database:

In [315]:
default_run = "cNMF_30"

In [316]:
results.keys()

dict_keys(['methods', 'n_components', 'loadings', 'obs_memberships', 'categorical_associations_results', 'categorical_associations_posthoc', 'perturbation_associations', 'geneset_enrichments', 'trait_enrichments', 'motif_enrichments', 'explained_variance_ratios', 'software_versions'])

In [317]:
results["explained_variance_ratios"]

{'cNMF_30':     program_name  variance_explained_ratio
 0              0                 -0.061864
 1              1                 -0.061864
 2              2                 -0.061864
 3              3                 -0.061864
 4              4                 -0.061864
 5              5                 -0.061864
 6              6                 -0.061864
 7              7                 -0.061864
 8              8                 -0.061864
 9              9                 -0.061864
 10            10                 -0.061864
 11            11                 -0.061864
 12            12                 -0.061864
 13            13                 -0.061864
 14            14                 -0.061864
 15            15                 -0.061864
 16            16                 -0.061864
 17            17                 -0.061864
 18            18                 -0.061864
 19            19                 -0.061864
 20            20                 -0.061864
 21            21    

In [321]:
results["motif_enrichments"][prog_key]["results"].keys()

dict_keys(['enhancer_test_pearsonr_sample_sample_D2', 'enhancer_test_pearsonr_sample_sample_D3', 'enhancer_test_pearsonr_sample_D0', 'enhancer_test_pearsonr_sample_sample_D1'])

# DONE!

---