# Set-up

In [1]:
import os
import sys
import yaml
import logging
import mudata
import pandas as pd

# Change path to wherever you have repo locally
sys.path.append('/oak/stanford/groups/engreitz/Users/ymo/Tools/cNMF_benchmarking/cNMF_benchmarking_pipeline')

from Evaluation.src import (
    compute_categorical_association,
    compute_geneset_enrichment,
    compute_trait_enrichment,
    compute_perturbation_association,
    #compute_explained_variance_ratio,
    compute_motif_enrichment
)
from Evaluation.src.enrichment_trait import process_enrichment_data

  from .autonotebook import tqdm as notebook_tqdm


# categorical association, geneset enrichment, GO term enrichment, trait enrichment

In [5]:

folder = "/oak/stanford/groups/engreitz/Users/ymo/NMF_re-inplementing/Results/torch-cNMF_evaluation/100k_cells_10iter_torch_mu_batch"
mdata_guide = "/oak/stanford/groups/engreitz/Users/ymo/Tools/cNMF_benchmarking/cNMF_benchmarking_pipeline/Evaluation/Resources/mdata_guide.h5mu"

def assign_guide(mdatam,file):

    # read mdata with guide
    mdata_guide = mudata.read(file)

    mdata['cNMF'].uns["guide_names"] = mdata_guide["cNMF_100"].uns["guide_names"]
    mdata['cNMF'].uns["guide_targets"] = mdata_guide["cNMF_100"].uns["guide_targets"]
    mdata['cNMF'].obsm["guide_assignment"] = mdata_guide["cNMF_100"].obsm["guide_assignment"]


for k in [30, 60, 80, 100, 200, 250, 300]:  

    os.makedirs(f"{folder}/Eval/{k}", exist_ok=True)

    output_folder = f"{folder}/Eval/{k}"

    # Load mdata
    mdata = mudata.read('{}/adata/cNMF_{}_2_0.h5mu'.format(folder,k))
    assign_guide(mdata, mdata_guide) 


    # Run categorical assocation
    results_df, posthoc_df = compute_categorical_association(mdata, prog_key='cNMF', categorical_key='sample', 
                                                            pseudobulk_key=None, test='dunn', n_jobs=-1, inplace=False)

    results_df.to_csv('{}/{}_categorical_association_results.txt'.format(output_folder,k), sep='\t', index=False) # This was made wide form to insert into .var of the program anndata.
    posthoc_df.to_csv('{}/{}_categorical_association_posthoc.txt'.format(output_folder,k), sep='\t', index=False)

 
    # Run perturbation assocation
    for samp in mdata['rna'].obs['sample'].unique():
        mdata_ = mdata[mdata['rna'].obs['sample']==samp]
        test_stats_df = compute_perturbation_association(mdata_, prog_key='cNMF', 
                                                        collapse_targets=True,
                                                        pseudobulk=False,
                                                        reference_targets=('non-targeting'),
                                                        n_jobs=-1, inplace=False)

        test_stats_df.to_csv('{}/{}_perturbation_association_results_{}.txt'.format(folder,k,samp), sep='\t', index=False)
  

    # Gene-set enrichment
    pre_res = compute_geneset_enrichment(mdata, prog_key='cNMF', data_key='rna', prog_nam=None,
                                        organism='human', library='Reactome_2022', method="fisher",
                                        database='enrichr', loading_rank_thresh=300, n_jobs=-1, 
                                        inplace=False, user_geneset=None)
    pre_res.to_csv('{}/{}_geneset_enrichment.txt'.format(output_folder,k), sep='\t', index=False)

    # GO Term enrichment
    pre_res = compute_geneset_enrichment(mdata, prog_key='cNMF', data_key='rna', prog_nam=None,
                                        organism='human', library='GO_Biological_Process_2023', method="fisher",
                                        database='enrichr', loading_rank_thresh=300, n_jobs=-1, 
                                        inplace=False, user_geneset=None)
    pre_res.to_csv('{}/{}_GO_term_enrichment.txt'.format(output_folder,k), sep='\t', index=False)

    # Run trait enrichment
    pre_res_trait = compute_trait_enrichment(mdata, gwas_data='/oak/stanford/groups/engreitz/Users/ymo/Tools/cNMF_benchmarking/cNMF_benchmarking_pipeline/Evaluation/Resources/OpenTargets_L2G_Filtered.csv.gz', 
                                            prog_key='cNMF', prog_nam=None, data_key='rna', 
                                            library='OT_GWAS', n_jobs=-1, inplace=False, 
                                            key_column='trait_efos', gene_column='gene_name', 
                                            method='fisher', loading_rank_thresh=300)
    pre_res_trait.to_csv('{}/{}_trait_enrichment.txt'.format(output_folder,k), sep='\t', index=False)

    

INFO:root:Performing tests at single-cell level. Significance will likely be inflated
Testing sample association: 100%|██████████| 30/30 [00:03<00:00,  9.59programs/s]
Identifying differential sample: 100%|██████████| 30/30 [00:03<00:00,  7.85programs/s]
INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:0002 gene_sets have been filtered out when max_size=2000 and min_size=0
Running Fisher enrichment: 100%|██████████| 30/30 [00:09<00:00,  3.14programs/s]
INFO:root:Downloading and generating Enrichr library gene sets...
INFO:root:0001 gene_sets have been filtered out when max_size=2000 and min_size=0
Running Fisher enrichment: 100%|██████████| 30/30 [00:25<00:00,  1.17programs/s]
Running Fisher enrichment: 100%|██████████| 30/30 [00:02<00:00, 13.33programs/s]
INFO:root:Performing tests at single-cell level. Significance will likely be inflated
Testing sample association: 100%|██████████| 60/60 [00:06<00:00,  9.43programs/s]
Identifying differential sample: 100%|

# Motif Enrichment (compliation working on progress)

In [None]:
# # Format files

# Thresholds
# score_thresh_abc_e2g_enhancer = 0.015
# score_thresh_abc_e2g_promoter = 0.8

# for i in range(4):
#     e2g = pd.read_csv('scE2G_links/EnhancerPredictionsAllPutative.ForVariantOverlap.shrunk150bp_D{}.tsv'.format(i), sep='\t')

#     e2g_enhancers = e2g.loc[(e2g['class']!='promoter') &\
#                             (e2g['ABC.Score']>score_thresh_abc_e2g_enhancer)]
#     e2g_enhancers = e2g_enhancers.loc[:,['chr', 'start', 'end', 'name', 'class', 'ABC.Score', 'TargetGene']]
#     e2g_enhancers.columns = [        'chromosome', 'start', 'end', 'seq_name', 'seq_class', 'seq_score', 'gene_name']

#     e2g_enhancers.to_csv('scE2G_links/EnhancerPredictionsAllPutative.ForVariantOverlap.shrunk150bp_D{}_enhancer.tsv'.format(i), 
#                           sep='\t', index=False)

#     e2g_promoters = e2g.loc[(e2g['class']=='promoter') &\
#                             (e2g['ABC.Score']>score_thresh_abc_e2g_promoter)]
#     e2g_promoters = e2g_promoters.loc[:,['chr', 'start', 'end', 'name', 'class', 'ABC.Score', 'TargetGene']]
#     e2g_promoters.columns = ['chromosome', 'start', 'end', 'seq_name', 'seq_class', 'seq_score', 'gene_name']

#     e2g_promoters.to_csv('scE2G_links/EnhancerPredictionsAllPutative.ForVariantOverlap.shrunk150bp_D{}_promoter.tsv'.format(i), 
#                         sep='\t', index=False)

In [None]:
# # Run in script


fimo_thresh_enhancer = 1e-6
fimo_thresh_promoter = 1e-4

# Load program data
mdata = mudata.read('/oak/stanford/groups/engreitz/Users/ymo/NMF_re-inplementing/Results/torch-cNMF_evaluation/090425_100k_cells_10iter_torch_mu_online/adata/cNMF_30_2.0.h5mu')
output_path = '/oak/stanford/groups/engreitz/Users/ymo/NMF_re-inplementing/Results/torch-cNMF_evaluation/090425_100k_cells_10iter_torch_mu_online/Eval/30'


In [4]:
mdata['rna']

AnnData object with n_obs × n_vars = 92284 × 5451
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'leiden'
    obsm: 'X_pca', 'X_umap'
    layers: 'norm10k'

In [25]:
import mygene

def rename_gene(mdata):

    mg = mygene.MyGeneInfo()
    gene_list = mdata['cNMF'].uns['var_names'].tolist()
    annotations = mg.querymany(gene_list, scopes='ensembl.gene', fields='symbol', species='human')

    # Process the results to create mapping
    gene_dict = {}
    
    for item in annotations:
        if 'symbol' in item:
            gene_dict[item['query']] = item['symbol']

    mdata['cNMF'].uns['var_names'] = [gene_dict.get(x, x) for x in mdata['cNMF'].uns['var_names']]

INFO:biothings.client:querying 1-1000 ...
INFO:httpx:HTTP Request: POST https://mygene.info/v3/query/ "HTTP/1.1 200 OK"
INFO:biothings.client:querying 1001-2000 ...
INFO:httpx:HTTP Request: POST https://mygene.info/v3/query/ "HTTP/1.1 200 OK"
INFO:biothings.client:querying 2001-3000 ...
INFO:httpx:HTTP Request: POST https://mygene.info/v3/query/ "HTTP/1.1 200 OK"
INFO:biothings.client:querying 3001-4000 ...
INFO:httpx:HTTP Request: POST https://mygene.info/v3/query/ "HTTP/1.1 200 OK"
INFO:biothings.client:querying 4001-5000 ...
INFO:httpx:HTTP Request: POST https://mygene.info/v3/query/ "HTTP/1.1 200 OK"
INFO:biothings.client:querying 5001-5451 ...
INFO:httpx:HTTP Request: POST https://mygene.info/v3/query/ "HTTP/1.1 200 OK"
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [17]:
mdata.var_names_make_unique()  # if you need unique names first
mdata.var.index = mdata.var['gene_name']

In [26]:
mdata['cNMF'].uns['var_names']

['SEMA3F',
 'CFTR',
 'CYP51A1',
 'HECW1',
 'KLHL13',
 'CASP10',
 'CFLAR',
 'TFPI',
 'MTMR7',
 'SLC7A2',
 'ARF5',
 'PLXND1',
 'CAMKK1',
 'ST7',
 'CALCR',
 'SLC25A5',
 'THSD7A',
 'ACSM3',
 'SPPL2B',
 'TSPOAP1',
 'CROT',
 'ITGA3',
 'CRLF1',
 'OSBPL7',
 'TAC1',
 'CACNA1G',
 'TNFRSF12A',
 'DLX6',
 'MAP3K9',
 'RALA',
 'ETV1',
 'USH1C',
 'GTF2IRD1',
 'ARHGAP44',
 'UPP2',
 'PROM1',
 'CEACAM21',
 'NOS2',
 'GAS7',
 'CACNA2D2',
 'E2F2',
 'CDKL5',
 'NADK',
 'ADAM22',
 'CYB561',
 'AASS',
 'CRY1',
 'ST3GAL1',
 'REV3L',
 'TENM1',
 'PAX7',
 'ETV7',
 'CD9',
 'GIPR',
 'STAB1',
 'SLC6A13',
 'IDS',
 'ZNF200',
 'LRRC23',
 'BTK',
 'SCMH1',
 'FYN',
 'HIVEP2',
 'LYPLA2',
 'SLC6A7',
 'TSPAN9',
 'ABHD5',
 'ANOS1',
 'PLAUR',
 'ANLN',
 'MAP4K3',
 'GABRA3',
 'PRICKLE3',
 'CLDN11',
 'GPRC5A',
 'MAMLD1',
 'ACP3',
 'MDH1',
 'CCDC88C',
 'ISL1',
 'SLC38A5',
 'RUFY3',
 'CNTN1',
 'WWTR1',
 'ATP1A2',
 'ZNF582',
 'SNAI2',
 'HGF',
 'ADGRA2',
 'SAMD4A',
 'PLEKHB1',
 'NRXN3',
 'CPS1',
 'FHL1',
 'NLRP2',
 'SLC45A4',
 'GRAMD1B'

In [27]:


# Run motif enrichment and save results
os.makedirs(output_path, exist_ok=True)
for i in range(4):
    for class_, thresh in [('enhancer', fimo_thresh_enhancer), 
                           ('promoter', fimo_thresh_promoter)]:

        loci_file = '/oak/stanford/groups/engreitz/Users/ymo/Tools/cNMF_benchmarking/cNMF_benchmarking_pipeline/Evaluation/Resources/scE2G_links/EnhancerPredictionsAllPutative.ForVariantOverlap.shrunk150bp_D{}_{}.tsv'.format(i, class_)
        motif_match_df, motif_count_df, motif_enrichment_df = compute_motif_enrichment(
            mdata, 
            prog_key='cNMF',
            data_key='rna',
            motif_file='/oak/stanford/groups/engreitz/Users/ymo/Tools/cNMF_benchmarking/cNMF_benchmarking_pipeline/Evaluation/Resources/hocomoco_meme.meme',
            seq_file='/oak/stanford/groups/engreitz/Users/ymo/Tools/cNMF_benchmarking/cNMF_benchmarking_pipeline/Evaluation/Resources/hg38.fa',
            loci_file=loci_file,
            window=1000,
            sig=thresh,
            eps=1e-4,
            n_top=2000,
            n_jobs=-1,
            inplace=False
        )

        motif_match_df.to_csv(os.path.join(output_path, f'cNMF_{class_}_pearson_topn2000_sample_D{i}_motif_match.txt'), sep='\t', index=False)
        motif_count_df.to_csv(os.path.join(output_path, f'cNMF_{class_}_pearson_topn2000_sample_D{i}_motif_count.txt'), sep='\t', index=False)
        motif_enrichment_df.to_csv(os.path.join(output_path, f'cNMF_{class_}_pearson_topn2000_sample_D{i}_motif_enrichment.txt'), sep='\t', index=False)

Number of matching genes: 3844
['FAM41C' 'LINC01128' 'LINC00115' ... 'SPRY3' 'VAMP7' 'IL9R']
Number of loci: 19853
