In [18]:
import os
import sys
# Change path to wherever you have repo locally
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation')

from src.evaluation import *
from src.evaluation.enrichment_trait import process_enrichment_data

import mudata
import numpy as np
import pandas as pd

from scipy import stats
from statsmodels.stats.multitest import fdrcorrection

In [2]:
# Read mudata with cNMF and RNA
mdata = mudata.read('/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/cNMF.h5mu')
mdata



In [15]:
# Redo correlation tests
correls_df = []
for topic in mdata['cNMF'].var_names:
    for batch_ in mdata['cNMF'].obs.batch.unique():
        bool_ = mdata['cNMF'].obs.batch==batch_

        r, pval = stats.pearsonr(mdata['cNMF'][:, topic].X.toarray().flatten(),
                                 bool_)
    
        _, adj_pval = fdrcorrection([pval])
        log2FC = np.log2(mdata['cNMF'][:, topic].X.toarray().flatten()[bool_].mean() /
                         mdata['cNMF'][:, topic].X.toarray().flatten()[~bool_].mean())

        correls_df.append([topic, batch_, r, pval, adj_pval[0], log2FC])
correls_df = pd.DataFrame(correls_df, columns=['topic', 'batch', 'pearsonr', 'pval', 'adj_pval', 'log2FC'])
correls_df

Unnamed: 0,topic,batch,pearsonr,pval,adj_pval,log2FC
0,1,11AMDox,0.170600,0.000000e+00,0.000000e+00,0.680129
1,1,2PMDox1,-0.083032,0.000000e+00,0.000000e+00,-0.352810
2,1,2PMDox2,-0.070304,4.532597e-233,4.532597e-233,-0.278402
3,2,11AMDox,0.055254,1.277865e-144,1.277865e-144,0.363984
4,2,2PMDox1,0.016959,4.024909e-15,4.024909e-15,0.107524
...,...,...,...,...,...,...
175,59,2PMDox1,-0.268220,0.000000e+00,0.000000e+00,-0.658530
176,59,2PMDox2,0.023672,5.708263e-28,5.708263e-28,0.050693
177,60,11AMDox,0.581869,0.000000e+00,0.000000e+00,2.558836
178,60,2PMDox1,-0.237496,0.000000e+00,0.000000e+00,-1.502954


In [16]:
# Create eval pipeline output for correlations
results_df = pd.DataFrame(index=mdata['cNMF'].var_names, 
                          columns=['batch_max_pearsonr_stat', 
                                   'batch_max_pearsonr_pval'])

for topic_ in results_df.index.values:
    maxidx = correls_df.loc[correls_df.topic==topic_,'pearsonr'].idxmax()
    results_df.loc[topic_] = correls_df.loc[maxidx, ['pearsonr', 'pval']].values
results_df['batch_max_pearsonr_pval'] = results_df['batch_max_pearsonr_pval'].astype(float)
results_df.index.name='program_name'

# File name format scheme {prog_key}_{categorical_key}_association_results.txt
# Column format scheme {categorical_key}_{test}_* 
results_df.to_csv('/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/cNMF_batch_association_results.txt', sep='\t', index=True)

In [17]:
posthoc_df = correls_df.pivot(index='topic', columns='batch', values=['pearsonr', 'pval', 'adj_pval', 'log2FC'])
posthoc_df.columns = [f"batch_{batch}_pearsonr_stat" if metric == 'pearsonr' 
                   else f"batch_{batch}_pearsonr_{metric}" for metric, batch in posthoc_df.columns]
posthoc_df = posthoc_df.reset_index().rename(columns={'topic': 'program_name'})

# File name format scheme {prog_key}_{categorical_key}_association_posthoc.txt
# Column format scheme {categorical_key}_{test}_*
posthoc_df.to_csv('/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/cNMF_batch_association_posthoc.txt', sep='\t', index=False)

In [20]:
# Redo enrichments

# Gene-set enrichment
pre_res = compute_geneset_enrichment(mdata, prog_key='cNMF', data_key='rna', prog_name=None,
                                     organism='human', library='Reactome_2022', method="fisher",
                                     database='enrichr', n_top=300, n_jobs=-1, 
                                     inplace=False, user_geneset=None)
pre_res = pre_res.rename(columns={"Term": "term", "P-value": "pval", "Adjusted P-value": "adj_pval", "Odds Ratio": "enrichment", "Genes": "genes"})
pre_res.to_csv('/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/cNMF_Reactome_2022_fisher_geneset_enrichment.txt', sep='\t', index=False)

# GO Term enrichment
pre_res = compute_geneset_enrichment(mdata, prog_key='cNMF', data_key='rna', prog_name=None,
                                     organism='human', library='GO_Biological_Process_2023', method="fisher",
                                     database='enrichr', n_top=300, n_jobs=-1, 
                                     inplace=False, user_geneset=None)
pre_res = pre_res.rename(columns={"Term": "term", "P-value": "pval", "Adjusted P-value": "adj_pval", "Odds Ratio": "enrichment", "Genes": "genes"})
pre_res.to_csv('/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/cNMF_GO_Biological_Process_2023_fisher_geneset_enrichment.txt', sep='\t', index=False)

# Run trait enrichment
pre_res_trait = compute_trait_enrichment(mdata, gwas_data='/cellar/users/aklie/opt/gene_program_evaluation/smk/resources/OpenTargets_L2G_Filtered.csv.gz', 
                                        prog_key='cNMF', prog_name=None, data_key='rna', 
                                        library='OT_GWAS', n_jobs=-1, inplace=False, 
                                        key_column='trait_efos', gene_column='gene_name', 
                                        method='fisher', n_top=300)
pre_res_trait = pre_res_trait.rename(columns={"Term": "term", "P-value": "pval", "Adjusted P-value": "adj_pval", "Odds Ratio": "enrichment", "Genes": "genes"})
res = process_enrichment_data(
    enrich_res=pre_res_trait,
    metadata="/cellar/users/aklie/opt/gene_program_evaluation/smk/resources/OpenTargets_L2G_Filtered.csv.gz",
    pval_col="adj_pval",
    enrich_geneset_id_col="term",
    metadata_geneset_id_col="trait_efos",
    color_category_col="trait_category",
    program_name_col="program_name",
    annotation_cols=["enrichment", "trait_reported", "genes", "study_id", "pmid"]
)
res.to_csv('/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/cNMF_OT_GWAS_fisher_trait_enrichment.txt', sep='\t', index=False)

# File format scheme {prog_key}_{library}_{test}_enrichment.txt


INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:Library is already downloaded in: /cellar/users/aklie/.cache/gseapy/Enrichr.Reactome_2022.gmt, use local file
INFO:root:0002 gene_sets have been filtered out when max_size=2000 and min_size=0


Running Fisher enrichment:   0%|          | 0/60 [00:00<?, ?programs/s]

INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:Library is already downloaded in: /cellar/users/aklie/.cache/gseapy/Enrichr.GO_Biological_Process_2023.gmt, use local file
INFO:root:0001 gene_sets have been filtered out when max_size=2000 and min_size=0


Running Fisher enrichment:   0%|          | 0/60 [00:00<?, ?programs/s]

Running Fisher enrichment:   0%|          | 0/60 [00:00<?, ?programs/s]

NameError: name 'res_trait' is not defined

In [27]:
# Reformat perturbation outputs
perturbation_results = pd.read_excel('/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/41586_2024_7022_MOESM3_ESM.xlsx', sheet_name='Suppl.Table.12')
perturbation_results = perturbation_results.loc[:,['Perturbation', 'ProgramsRegulated', 'log2FC', 'p.value', 'AcrossProgramsFDR']]

test_stats_df = []
for idx in perturbation_results.index.values:
    target_name = perturbation_results.loc[idx, 'Perturbation']
    progs = perturbation_results.loc[idx, 'ProgramsRegulated'].split('|')
    log2fcs = perturbation_results.loc[idx, 'log2FC'].split('|')
    pvals = perturbation_results.loc[idx, 'p.value'].split('|')
    adj_pvals = perturbation_results.loc[idx, 'AcrossProgramsFDR'].split('|')
    for i in range(len(progs)):
        test_stats_df.append([target_name, progs[i], log2fcs[i], log2fcs[i], pvals[i], adj_pvals[i]])

test_stats_df = pd.DataFrame(test_stats_df, 
                             columns=['target_name', 'program_name', 'stat', 'log2FC', 'pval', 'adj_pval'])
test_stats_df['program_name'] = test_stats_df['program_name'].apply(lambda x: x.replace('K60_', ''))

# File format scheme {prog_key}_{level_key}_perturbation_association.txt
test_stats_df.to_csv('/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/cNMF_gene_perturbation_association.txt', sep='\t', index=False)

In [26]:
# Reformat motif enrichment outputs
motif_enrichment = pd.read_excel('/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/41586_2024_7022_MOESM4_ESM.xlsx', sheet_name='Suppl.Table.24')

for typ in ['Promoter', 'Enhancer']:
    motif_enrichment_df = motif_enrichment.loc[motif_enrichment.EPType==typ, 
                                                    ['ProgramID', 'TFMotif', 'Enrichment', 'PValue', 'FDR']]
    motif_enrichment_df.columns = ['program_name', 'motif', 'stat', 'pval', 'adj_pval']
    motif_enrichment_df['program_name'] = motif_enrichment_df['program_name'].apply(lambda x: x.replace('K60_', ''))

    # File format scheme {prog_key}_{eptype}_{database}_{test}_enrichment.txt
    motif_enrichment_df.to_csv('/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/Endothelial/cNMF/cNMF_{}_hocomoco_ttest_motif_enrichment.txt'.format(typ.lower()), sep='\t', index=False)


In [24]:
# Reformat explained variance
variance_explained = pd.read_csv('metrics.varianceExplained.df.txt', sep='\t')
variance_explained = variance_explained.loc[:,['ProgramID', 'VarianceExplained']]
variance_explained.columns = ['program_name', 'variance_explained_ratio']
variance_explained['program_name'] = variance_explained['program_name'].apply(lambda x: x.replace('K60_', ''))

# File format key {prog_key}_variance_explained_ratio.txt
variance_explained.to_csv('cNMF_variance_explained_ratio.txt', sep='\t', index=False)