In [1]:
import os
import sys
# Change path to wherever you have repo locally
sys.path.append('../../gene_network_evaluation/')

from src.evaluation import *

import mudata
import pandas as pd

from scipy import stats

In [2]:
# Load the MuData container that carries both the cNMF programs and the RNA modality.
mdata = mudata.read(
    '../../../../data/TeloHAEC_Perturb-seq_2kG/2kG.library_K60_kangh.h5mu'
)
# Bare trailing expression so the notebook renders the object's summary.
mdata



In [5]:
# Redo correlation tests: Pearson correlation between each program's
# per-observation score and a boolean indicator of membership in each batch.
cnmf = mdata['cNMF']
batch_labels = cnmf.obs['batch']
batch_levels = batch_labels.unique()

# Collect plain records first and build the DataFrame once at the end.
correl_records = []
for topic in cnmf.var_names:
    # The score vector depends only on the topic, so slice (and densify) it
    # once here instead of once per batch.
    topic_X = cnmf[:, topic].X
    # Tolerate both sparse and already-dense storage of .X.
    if hasattr(topic_X, 'toarray'):
        topic_X = topic_X.toarray()
    topic_scores = topic_X.flatten()

    for batch_ in batch_levels:
        # Membership indicator for this batch, one entry per observation.
        in_batch = (batch_labels == batch_).to_numpy()
        r, pval = stats.pearsonr(topic_scores, in_batch)
        correl_records.append([topic, batch_, r, pval])

correls_df = pd.DataFrame(correl_records, columns=['topic', 'batch', 'r', 'pval'])
correls_df

Unnamed: 0,topic,batch,r,pval
0,1,11AMDox,0.170600,0.000000e+00
1,1,2PMDox1,-0.083032,0.000000e+00
2,1,2PMDox2,-0.070304,4.532597e-233
3,2,11AMDox,0.055254,1.277865e-144
4,2,2PMDox1,0.016959,4.024909e-15
...,...,...,...,...
175,59,2PMDox1,-0.268220,0.000000e+00
176,59,2PMDox2,0.023672,5.708263e-28
177,60,11AMDox,0.581869,0.000000e+00
178,60,2PMDox1,-0.237496,0.000000e+00


In [6]:
# Create eval pipeline output for correlations: for every program keep the
# batch with the largest Pearson r, together with that test's p-value.
# NOTE(review): the selection uses the signed maximum of r, not the largest
# absolute value -- confirm this is the intended definition of "max".
program_names = mdata['cNMF'].var_names

# Row label (in correls_df) of the best-correlated batch for each topic.
best_idx = correls_df.groupby('topic', sort=False)['r'].idxmax()
best_rows = correls_df.loc[best_idx]

# Every program must have at least one correlation row; fail loudly otherwise
# instead of silently writing empty results for it.
missing_programs = program_names.difference(best_rows['topic'])
if len(missing_programs) > 0:
    raise ValueError('No correlation results for programs: {}'.format(list(missing_programs)))

results_df = (
    best_rows
    .set_index('topic')[['r', 'pval']]
    .rename(columns={'r': 'batch_max_pearsonr_stat',
                     'pval': 'batch_max_pearsonr_pval'})
    # Align to the program order of the cNMF modality.
    .reindex(program_names)
    # Keep both the statistic and the p-value numeric (not object dtype).
    .astype(float)
)
results_df.index.name = 'program_name'

# File name format scheme {prog_key}_{categorical_key}_association_results.txt
# Column format scheme {categorical_key}_{test}_*
results_df.to_csv('cNMF_batch_association_results.txt', sep='\t', index=True)


In [8]:
# Redo enrichments
# File format scheme {prog_key}_{library}_{test}_enrichment.txt

# Arguments shared by every enrichment run below, kept in one place so the
# gene-set, GO-term and trait analyses cannot drift apart.
# NOTE(review): `prog_nam` is spelled exactly as in the original calls --
# confirm it matches the parameter name in src.evaluation.
shared_kwargs = dict(prog_key='cNMF', data_key='rna', prog_nam=None,
                     method='fisher', loading_rank_thresh=300,
                     n_jobs=-1, inplace=False)

# Gene-set enrichment (Reactome) and GO Term enrichment differ only in the
# Enrichr library, so run both through the same call.
for library in ['Reactome_2022', 'GO_Biological_Process_2023']:
    pre_res = compute_geneset_enrichment(mdata, organism='human', library=library,
                                         database='enrichr', user_geneset=None,
                                         **shared_kwargs)
    pre_res.to_csv('cNMF_{}_fisher_enrichment.txt'.format(library), sep='\t', index=False)

# Run trait enrichment
pre_res_trait = compute_trait_enrichment(mdata, gwas_data='../../../gene_program_evaluation/gene_network_evaluation/smk/resources/OpenTargets_L2G_Filtered.csv.gz',
                                         library='OT_GWAS',
                                         key_column='trait_efos', gene_column='gene_name',
                                         **shared_kwargs)
pre_res_trait.to_csv('cNMF_OT_GWAS_fisher_enrichment.txt', sep='\t', index=False)


INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:0002 gene_sets have been filtered out when max_size=2000 and min_size=0


Running Fisher enrichment:   0%|          | 0/60 [00:00<?, ?programs/s]

INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:0001 gene_sets have been filtered out when max_size=2000 and min_size=0


Running Fisher enrichment:   0%|          | 0/60 [00:00<?, ?programs/s]

Running Fisher enrichment:   0%|          | 0/60 [00:00<?, ?programs/s]

In [15]:
# Reformat perturbation outputs: each input row lists, for one perturbation,
# '|'-separated programs with matching '|'-separated log2FC and p-values;
# unpack that into one output row per (perturbation, program) pair.
perturbation_results = pd.read_excel('41586_2024_7022_MOESM3_ESM.xlsx', sheet_name='Suppl.Table.12')
perturbation_results = perturbation_results.loc[:, ['Perturbation', 'ProgramsRegulated', 'log2FC', 'p.value']]

# Rows with no regulated program listed have nothing to unpack, so skip them.
regulated_rows = perturbation_results.dropna(subset=['ProgramsRegulated'])

test_stats_records = []
for target_name, progs_cell, log2fc_cell, pval_cell in regulated_rows.itertuples(index=False, name=None):
    # str() first: a cell holding a single value may be read from Excel as a
    # number rather than text, and numbers have no .split().
    progs = str(progs_cell).split('|')
    log2fcs = str(log2fc_cell).split('|')
    pvals = str(pval_cell).split('|')

    # The three lists are matched by position; refuse to guess if they disagree.
    if not (len(progs) == len(log2fcs) == len(pvals)):
        raise ValueError(
            'Mismatched entry counts for perturbation {!r}: {} programs, {} log2FC, {} p-values'.format(
                target_name, len(progs), len(log2fcs), len(pvals)))

    for prog, log2fc, pval in zip(progs, log2fcs, pvals):
        # Drop the 'K60_' tag so program names match the other output files.
        test_stats_records.append([target_name, prog.replace('K60_', ''), log2fc, pval])

# 'stat' and 'pval' are kept as the text found in the table, so the written
# file reproduces the source values verbatim.
test_stats_df = pd.DataFrame(test_stats_records,
                             columns=['target_name', 'program_name', 'stat', 'pval'])

# File format scheme {prog_key}_{level_key}_perturbation_association.txt
test_stats_df.to_csv('cNMF_gene_perturbation_association.txt', sep='\t', index=False)

In [17]:
# Reformat motif enrichment outputs: write one table per element type
# (promoter / enhancer) using the shared program_name/motif/stat/pval columns.
motif_enrichment = pd.read_excel('41586_2024_7022_MOESM4_ESM.xlsx', sheet_name='Suppl.Table.24')

# Source column -> output column; the dict order fixes the output column order.
motif_column_map = {'ProgramID': 'program_name', 'TFMotif': 'motif',
                    'Enrichment': 'stat', 'PValue': 'pval'}

for typ in ['Promoter', 'Enhancer']:
    # Build a fresh frame through rename/assign rather than assigning into a
    # filtered slice, which avoids pandas' chained-assignment warning.
    motif_enrichment_df = (
        motif_enrichment
        .loc[motif_enrichment['EPType'] == typ, list(motif_column_map)]
        .rename(columns=motif_column_map)
        .assign(program_name=lambda df: df['program_name'].str.replace('K60_', '', regex=False))
    )

    # File format scheme {prog_key}_{eptype}_{database}_{test}_enrichment.txt
    motif_enrichment_df.to_csv('cNMF_{}_hocomoco_ttest_enrichment.txt'.format(typ.lower()), sep='\t', index=False)


In [24]:
# Reformat explained variance into a two-column table:
# program_name, variance_explained_ratio.
variance_explained = (
    pd.read_csv('metrics.varianceExplained.df.txt', sep='\t')
    .loc[:, ['ProgramID', 'VarianceExplained']]
    .rename(columns={'ProgramID': 'program_name',
                     'VarianceExplained': 'variance_explained_ratio'})
)
# Remove the 'K60_' tag from every program identifier.
variance_explained['program_name'] = variance_explained['program_name'].map(
    lambda program_id: program_id.replace('K60_', '')
)

# File format key {prog_key}_variance_explained_ratio.txt
variance_explained.to_csv('cNMF_variance_explained_ratio.txt', sep='\t', index=False)