In [1]:
!pip install anndata regdiffusion numpy==1.23.5
# I need anndata to load the h5ad file. Installing within the Docker...

Collecting anndata
  Downloading anndata-0.11.4-py3-none-any.whl (144 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.5/144.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting regdiffusion
  Downloading regdiffusion-0.1.0-py2.py3-none-any.whl (29 kB)
Collecting natsort
  Downloading natsort-8.4.0-py3-none-any.whl (38 kB)
Collecting packaging>=24.2
  Downloading packaging-25.0-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting array-api-compat!=1.5,>1.4
  Downloading array_api_compat-1.11.2-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Collecting scanpy
  Downloading scanpy-1.11.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting 

In [1]:
# Fix OPENBLAS Warnings
# These variables are exported before numpy (and the other numerical libraries) are imported below.
import os
param_n_workers = 32 # We have 112 CPUs/cores, each process will automatically be associated to a different CPU by the OS scheduler
param_threads_per_worker = 2 # We have 2 threads per CPU on SVEN (hyper-threading). See lscpu command. Note: Here they are not used apparently. Setting to 1 or 2 gives similar c. time
# Every threading backend gets the same budget: workers x threads per worker.
os.environ.update({
    thread_variable: f"{param_n_workers * param_threads_per_worker}"
    for thread_variable in (
        'OPENBLAS_NUM_THREADS',
        'MKL_NUM_THREADS',
        'OMP_NUM_THREADS',
        'NUMEXPR_MAX_THREADS',
    )
})

# import dependencies
import ast # For reading frozenset as strings
import pickle
from datetime import datetime

import anndata as ad # Needed by the "Load ex_matrix" cell (ad.read_h5ad)
import numpy as np
import pandas as pd
import pytz
import regdiffusion as rd # For replacing grnboost2 which is slow as hell and bugs (stalls) when there are more than ~1.5B elements in the matrix

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2
from distributed import Client, LocalCluster

from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.binarization import binarize
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

from scipy.sparse import csr_matrix
from scipy.io import mmwrite, mmread

In [2]:
# Base directories: every path below is derived from these three constants, so the
# whole notebook can be pointed at another location by editing DATA_DIR only.
DATA_DIR = "/data/gardeux/Neuro_Droso_ND75KD/data"
KELLIS_DIR = f"{DATA_DIR}/Kellis_2024"
MULTI_REGION_DIR = f"{KELLIS_DIR}/Gene Expression (snRNAseq - 10x) processed, multi-region"

# [Input] H5ad file to use
#EXPRESSION_H5AD_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/Kellis_2024/Gene Expression (snRNAseq - 10x) processed/PFC427_raw_data.h5ad' # From Synapse
EXPRESSION_H5AD_FNAME = f"{MULTI_REGION_DIR}/all_brain_regions_filt_preprocessed_scanpy_fullmatrix.h5ad" # From Synapse
METADATA_H5AD_FNAME = f"{MULTI_REGION_DIR}/all_brain_regions_filt_preprocessed_scanpy_norm.final_noMB.cell_labels.tsv"
# [Output]
param_mtx_file = f"{MULTI_REGION_DIR}/all_brain_regions_filt_preprocessed_scanpy_fullmatrix_excitatory.neurons.only.mtx"
param_features_file = f"{MULTI_REGION_DIR}/features.tsv"
param_barcodes_file = f"{MULTI_REGION_DIR}/barcodes.tsv"
param_matrix_pickle = f"{MULTI_REGION_DIR}/ex_matrix.pkl"

# [Input] Transcription factors list (SCENIC step 1: GRNBoost2)
f_tfs = f"{DATA_DIR}/allTFs_hg38.txt" # From https://resources.aertslab.org/cistarget/tf_lists/
# Derive list of Transcription Factors(TF)
tf_names = load_tf_names(f_tfs)

# [Output] Adjacency matrix (SCENIC step 1: GRNBoost2)
adj_matrix = f"{KELLIS_DIR}/Kellis_region_adj.csv"

# [Input] Ranking databases (SCENIC step 2-3: cisTarget)
f_db_names = [f"{DATA_DIR}/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather"] # From pySCENIC db: https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/
# Alternatively: f_db_names = ["/data/gardeux/Neuro_Droso_ND75KD/data/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather"]
# One ranking database object per feather file, named after the file itself
dbs = [RankingDatabase(fname=f_name, name=os.path.basename(f_name)) for f_name in f_db_names]

# [Input] Motif databases (SCENIC step 2-3: cisTarget)
# NOTE: f_motif_path (input annotation table) and f_motifs_path (output of prune2df, below) differ by one letter only.
f_motif_path = f"{DATA_DIR}/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl" # From pySCENIC db: https://resources.aertslab.org/cistarget/motif2tf/

# [Output] Regulons (SCENIC step 2-3: cisTarget)
f_motifs_path = f"{KELLIS_DIR}/Kellis_region_motifs.tsv"
f_modules_pickle = f"{KELLIS_DIR}/Kellis_region_modules.pkl"
f_modules_path = f"{KELLIS_DIR}/Kellis_region_modules.tsv"
f_regulons_path = f"{KELLIS_DIR}/Kellis_region_regulons.tsv"
f_regulons_pickle = f"{KELLIS_DIR}/Kellis_region_regulons.pkl"
f_regulons_aucell_path = f"{KELLIS_DIR}/Kellis_region_regulons_aucell.tsv"
f_regulons_binarized_aucell_path = f"{KELLIS_DIR}/Kellis_region_regulons_aucell_binarized.tsv"
f_regulons_binarization_thresholds_aucell_path = f"{KELLIS_DIR}/Kellis_region_regulons_aucell_binarization_thresholds.tsv"

Load ex_matrix

In [3]:
# [Input] Load expression matrix from H5ad file
f_h5ad = ad.read_h5ad(EXPRESSION_H5AD_FNAME)
f_gene_names = f_h5ad.var_names.tolist()  # Gene names (all genes of the H5ad file)
f_cell_names = f_h5ad.obs_names.tolist()   # Cell names

# Restrict matrix to feather genes
ranking_feather = pd.read_feather(f_db_names[0])
gene_in_feather = f_h5ad.var_names.isin(ranking_feather.columns)  # one boolean per gene
overlap_values = f_h5ad.var_names[gene_in_feather].unique()

# The genes are selected on the sparse matrix itself, *before* the DataFrame is built.
# Selecting rows with .loc on the already-built genes x cells sparse DataFrame gives the
# same content but was the step that "takes forever".
# Rows keep the order of the H5ad file (if a gene name were duplicated, every occurrence is kept).
ex_matrix = pd.DataFrame.sparse.from_spmatrix(
    f_h5ad.X[:, np.flatnonzero(gene_in_feather)].T,
    index=f_h5ad.var_names[gene_in_feather].tolist(),
    columns=f_cell_names,
)

ex_matrix # 18587 (out of XXX) genes x 1612073 cells for region data  # 20653 (out of 33538) genes x 2663736 cells for cell_type data

Unnamed: 0,AG_AAACCCACAGATAAAC-1,AG_AAACGAAAGGCCACCT-1,AG_AAACGAACACAAATAG-1,AG_AAACGAATCCACAGGC-1,AG_AAACGCTCAAACACGG-1,AG_AAACGCTCAGAATCGG-1,AG_AAACGCTTCTGTTCAT-1,AG_AAAGGGCAGCTAATGA-1,AG_AAAGGGCTCGCTTGAA-1,AG_AAAGGTACAGACCCGT-1,...,TH_TTTGACTGTGCCTAAT-47,TH_TTTGATCAGCAAATGT-47,TH_TTTGGAGAGCTAGATA-47,TH_TTTGGAGGTCTCCCTA-47,TH_TTTGGAGTCATTTCGT-47,TH_TTTGGTTAGTACAGCG-47,TH_TTTGGTTGTTACAGCT-47,TH_TTTGTTGCACCTCTGT-47,TH_TTTGTTGGTATGCTAC-47,TH_TTTGTTGGTCGGATTT-47
OR4F5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SAMD11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOC2L,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C21orf58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PCNT,2.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DIP2A,0.0,1.0,1.0,11.0,0.0,1.0,1.0,1.0,5.0,0.0,...,2.0,0.0,0.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0
S100B,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [4]:
# Safety check: the columns of ex_matrix must be the cells of the H5ad file, in the same order.
# An assert (instead of a displayed boolean) stops "Run All" right here if this is not the case.
assert (f_h5ad.obs.index == ex_matrix.columns).all(), "ex_matrix columns do not match the H5ad cell names"

True

In [5]:
# Import metadata from file
# The first column of the file is read as a temporary index, then replaced by the
# "barcode" column, so that rows are keyed by cell barcode.
metadata = (
    pd.read_csv(METADATA_H5AD_FNAME, index_col=0, sep="\t", low_memory=False)
    .set_index("barcode")
)
metadata

Unnamed: 0_level_0,U1,U2,rind,region,projid,is.doublet,col,tspcol,hcluster,hcelltype,hsubclass,major.celltype,minor.celltype,neuronal.layer,inh.subtype,neuronal.exttype,full.exttype,cell_type_high_resolution
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AG_AACCATGTCATTGCGA-1,-2.428250,8.324310,AG.1,AG,50410319,False,#E31A1C,#E31A1C80,,Ast,,Ast,Ast,,,,Ast,Ast DPP10
AG_AACGGGATCGAGATGG-1,-5.414470,12.086900,AG.1,AG,50410319,False,#E31A1C,#E31A1C80,,Ast,,Ast,Ast,,,,Ast,Ast GRM3
AG_AACGTCAAGCGTGAAC-1,-2.404900,10.881300,AG.1,AG,50410319,False,#E31A1C,#E31A1C80,,Ast,,Ast,Ast,,,,Ast,Ast GRM3
AG_AAGCGAGAGGACAAGA-1,-5.492280,13.338700,AG.1,AG,50410319,False,#E31A1C,#E31A1C80,,Ast,,Ast,Ast,,,,Ast,Ast GRM3
AG_AAGGTAACATGGGATG-1,-3.846350,12.626400,AG.1,AG,50410319,False,#E31A1C,#E31A1C80,,Ast,,Ast,Ast,,,,Ast,Ast GRM3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TH_TTGCCTGGTACATACC-47,0.669809,5.040410,TH.47,TH,94430339,False,#FED9A6,#FED9A680,choroid_plexus_epithelial_cells,Vasc/Epithelia,,Vasc/Epithelia,CPEC,,,,CPEC,CPEC
TH_TTGGGATTCAACACGT-47,5.291230,0.209286,TH.47,TH,94430339,False,#FED9A6,#1F78B480,endothelial_cells,Vasc/Epithelia,,Vasc/Epithelia,End,,,,End,End
TH_TTGGGTAAGAGAGAAC-47,3.646170,0.957303,TH.47,TH,94430339,False,#FED9A6,#FED9A680,fibroblasts,Vasc/Epithelia,,Vasc/Epithelia,Fib,,,,Fib,Fib
TH_TTGTGTTGTCACGACC-47,0.820165,4.782530,TH.47,TH,94430339,False,#FED9A6,#FED9A680,choroid_plexus_epithelial_cells,Vasc/Epithelia,,Vasc/Epithelia,CPEC,,,,CPEC,CPEC


In [6]:
# Filter ex_matrix object: keep only the cells (columns) that have an entry in the metadata
cells_with_metadata = ex_matrix.columns.isin(metadata.index)
ex_matrix = ex_matrix.loc[:, cells_with_metadata]
ex_matrix

Unnamed: 0,AG_AAACCCACAGATAAAC-1,AG_AAACGAAAGGCCACCT-1,AG_AAACGAACACAAATAG-1,AG_AAACGAATCCACAGGC-1,AG_AAACGCTCAAACACGG-1,AG_AAACGCTCAGAATCGG-1,AG_AAACGCTTCTGTTCAT-1,AG_AAAGGGCAGCTAATGA-1,AG_AAAGGGCTCGCTTGAA-1,AG_AAAGGTACAGACCCGT-1,...,TH_TTTGACTGTGCCTAAT-47,TH_TTTGATCAGCAAATGT-47,TH_TTTGGAGAGCTAGATA-47,TH_TTTGGAGGTCTCCCTA-47,TH_TTTGGAGTCATTTCGT-47,TH_TTTGGTTAGTACAGCG-47,TH_TTTGGTTGTTACAGCT-47,TH_TTTGTTGCACCTCTGT-47,TH_TTTGTTGGTATGCTAC-47,TH_TTTGTTGGTCGGATTT-47
OR4F5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SAMD11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOC2L,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C21orf58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PCNT,2.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DIP2A,0.0,1.0,1.0,11.0,0.0,1.0,1.0,1.0,5.0,0.0,...,2.0,0.0,0.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0
S100B,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [7]:
# Reorder: sort both objects by cell barcode so that they can be compared position by position
ex_matrix = ex_matrix.reindex(columns=sorted(ex_matrix.columns))
metadata = metadata.sort_index()

In [8]:
# Safety check: metadata rows and ex_matrix columns must now list the same cells in the same order.
# An assert (instead of a displayed boolean) stops "Run All" right here if this is not the case.
assert (metadata.index == ex_matrix.columns).all(), "metadata rows do not match ex_matrix columns"

True

In [9]:
# Subset to only Excitatory neurons
# Boolean flag per cell, indexed by barcode like the columns of ex_matrix
is_excitatory = metadata["major.celltype"] == "Exc"
ex_matrix = ex_matrix.loc[:, is_excitatory]
ex_matrix # 18587 rows (genes) × 436014 columns (cells)

Unnamed: 0,AG_AAACCCAAGAAATTCG-40,AG_AAACCCAAGACAACTA-20,AG_AAACCCAAGACCACGA-45,AG_AAACCCAAGACCTCCG-24,AG_AAACCCAAGAGAGGTA-18,AG_AAACCCAAGAGGTCAC-45,AG_AAACCCAAGAGTGACC-41,AG_AAACCCAAGATGAACT-38,AG_AAACCCAAGATTAGCA-14,AG_AAACCCAAGATTAGCA-23,...,TH_TTTGTTGTCAACCCGG-41,TH_TTTGTTGTCACCCTCA-12,TH_TTTGTTGTCACTTGTT-17,TH_TTTGTTGTCCAATCTT-29,TH_TTTGTTGTCCAATCTT-32,TH_TTTGTTGTCGAACCAT-29,TH_TTTGTTGTCGAACCAT-32,TH_TTTGTTGTCGGAGTAG-32,TH_TTTGTTGTCTCGTTTA-9,TH_TTTGTTGTCTTGAACG-22
OR4F5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SAMD11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOC2L,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C21orf58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PCNT,6.0,2.0,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,6.0,0.0,5.0,1.0,2.0,0.0
DIP2A,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,4.0,1.0,0.0,9.0,0.0,6.0,4.0,0.0,0.0
S100B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Checkpoint: save the filtered expression matrix (uncomment the second line to reload it instead)
pd.to_pickle(ex_matrix, param_matrix_pickle) # ~24Gb RAM
#ex_matrix = pd.read_pickle(param_matrix_pickle)

# SCENIC steps

## STEP 1: Gene regulatory network inference, and generation of co-expression modules

### 1.a. GRN inference using the GRNBoost2 algorithm

In the initial phase of the pySCENIC pipeline, the single-cell expression profiles are used to infer co-expression modules.

Run GRNboost from arboreto to infer co-expression modules

The arboreto package is used for this phase of the pipeline.

*Output:* List of adjacencies between a TF and its targets.

Run RegDiffusionTrainer instead of the (slow) GRNBoost2 algorithm. See https://tuftsbcb.github.io/RegDiffusion/downstream_with_pyscenic.html

In [None]:
# Prepare dataset for RegDiffusionTrainer (needs to be logged, cells as rows)
ex_matrix_log = np.log(ex_matrix.T + 1.0) # Transpose, then log(x + 1)
# Flag the genes (columns) that are 0 in every cell, then drop them
all_zero_genes = (ex_matrix_log == 0).all()
ex_matrix_log = ex_matrix_log.loc[:, ~all_zero_genes]
ex_matrix_log

In [None]:
# Train RegDiffusion on the log-transformed matrix (cells as rows), converted to a numpy array; runs on CPU.
# NOTE(review): no random seed is set in this notebook before training — results are presumably not reproducible run to run; confirm.
rd_trainer = rd.RegDiffusionTrainer(ex_matrix_log.to_numpy(), device="cpu")
rd_trainer.train()

Extract edges from GRN

In [None]:
# Now we focus on edges with weight > 50 percentile. 
grn = rd_trainer.get_grn(
    ex_matrix_log.columns,  # gene_names to recover non-expressed genes
    top_gene_percentile = 50,
)

# Here for each gene, we are going to extract all edges,
# and rename the three columns to the names used in the rest of the notebook
adjacencies = grn.extract_edgelist(k = -1, workers = param_n_workers).set_axis(
    ['TF', 'target', 'importance'], axis=1
)

# check edgelist.  
adjacencies

Restrict the adjacencies to known TFs, sort them by importance, then save them (or read them back from the saved file)

In [None]:
# Restrict to tf names, then sort by importance (strongest edges first)
source_is_tf = adjacencies['TF'].isin(tf_names)
adjacencies = adjacencies.loc[source_is_tf].sort_values(by='importance', ascending=False)
adjacencies

In [None]:
# Number of distinct TFs that have at least one edge, compared to the size of the TF list
print(adjacencies["TF"].nunique(), "unique TF-modules were found ( out of",len(tf_names),").")

In [None]:
# Number of distinct target genes, compared to the number of genes (rows) in ex_matrix
print(adjacencies["target"].nunique(), "unique targets were found ( out of",len(ex_matrix.index),").")

In [None]:
# Sanity check: every remaining source gene is in the TF list
adjacencies["TF"].isin(tf_names).all()

In [11]:
# Checkpoint: write the adjacencies as CSV without the row index (uncomment the read line to reload instead)
adjacencies.to_csv(path_or_buf=adj_matrix, sep=',', index=False)
#adjacencies = pd.read_csv(adj_matrix, sep=',', na_filter=False) # If na_filter=True, the nan gene is detected as NaN
adjacencies

Unnamed: 0,TF,target,importance
0,KDM5D,USP9Y,1.472000e+01
1,RORB,POU6F2,1.433600e+01
2,KDM5D,NLGN4Y,1.426000e+01
3,KDM5D,UTY,1.394000e+01
4,POU6F2,RORB,1.379000e+01
...,...,...,...
14023224,POU2F3,ZNF19,1.900000e-06
14023225,BARX1,TAAR8,1.850000e-06
14023226,CENPB,FAM57A,1.100000e-06
14023227,EBF3,MMP12,6.600000e-07


## STEP 2-3: Regulon prediction aka cisTarget

*Output:* List of regulons (a TF and its motif-supported target genes).

### 2.a. Running regulon prediction using cisTarget

Here, we use the --mask_dropouts option, which affects how the correlation between TF and target genes is calculated during module creation. It is important to note that prior to pySCENIC v0.9.18, the default behavior was to mask dropouts, while in v0.9.18 and later, the correlation is performed using the entire set of cells (including those with zero expression). When using the modules_from_adjacencies function directly in python instead of via the command line, the rho_mask_dropouts option can be used to control this.

**Note:** I kept the same parameters as when I ran pySCENIC on our own dataset. It produces 1618 regulons when using the TF list from Aerts.

In [12]:
# Build the co-expression modules from the adjacencies; the expression matrix is passed with cells as rows
modules = list(
    modules_from_adjacencies(
        adjacencies,
        ex_matrix.T,
        rho_mask_dropouts=True,  # rho_mask_dropouts=True
        keep_only_activating=True,
    )
)

  ex_mtx = ex_mtx.T[~ex_mtx.columns.duplicated(keep="first")].T.astype(float)

2025-05-15 09:04:59,849 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [True].

2025-05-15 09:48:52,690 - pyscenic.utils - INFO - Creating modules.


In [97]:
# Checkpoint: save the modules (uncomment the second line to reload them instead)
pd.to_pickle(obj=modules, filepath_or_buffer=f_modules_pickle)
#modules = pd.read_pickle(f_modules_pickle)

In [13]:
# Summarise every module as one row of a table, then save it as TSV.
#
# Two changes compared to filling a pre-allocated frame with `modules_df[col].iloc[j] = value`:
#  * rows are collected as plain dicts and the DataFrame is built once (the chained
#    assignment is slow, and has no effect at all when pandas Copy-on-Write is enabled);
#  * the two parts of `context` are told apart by their value instead of by their
#    position: the context is an unordered set (the prune2df table shows both
#    "(activating, weight>75.0%, ...)" and "(top50perTarget, activating, ...)"), so
#    positional indexing could swap the "TFTargetGenesCorrelation" and "Context" columns.
module_rows = []
for module in modules:
    # Direction of the TF-target correlation, whichever position it has in the set
    direction = next((c for c in module.context if c in ("activating", "repressing")), None)
    # Whatever is left describes how the targets were selected (e.g. "weight>75.0%")
    selection = ",".join(sorted(c for c in module.context if c != direction))
    module_rows.append({
        "Regulon": module.name,
        "TF": module.transcription_factor,
        "TFTargetGenesCorrelation": direction,
        "NbMarkers": len(set(module.gene2weight)),
        "Context": selection,
        "NES": module.score,
        "Markers": ','.join(module.gene2weight),
    })

modules_df = pd.DataFrame(
    module_rows,
    columns=["Regulon", "TF", "TFTargetGenesCorrelation", "NbMarkers", "Context", "NES", "Markers"],
)

# Biggest modules first
modules_df = modules_df.sort_values(by='NbMarkers', ascending=False)
modules_df.to_csv(f_modules_path, index=False, sep = "\t")
modules_df

Unnamed: 0,Regulon,TF,TFTargetGenesCorrelation,NbMarkers,Context,NES,Markers
486,Regulon for PHTF1,PHTF1,activating,7383,weight>75.0%,0.0,"STK39,UNC80,DYNC1I1,SPOCK1,ASAP1,BICD1,SUPT3H,..."
406,Regulon for NAP1L1,NAP1L1,activating,7283,weight>75.0%,0.0,"RTN4,PREPL,CD47,TSC22D1,FBXL17,NPTN,IDS,MARCH6..."
756,Regulon for YWHAZ,YWHAZ,activating,7277,weight>75.0%,0.0,"YWHAG,CALM1,IDS,SNAP25,UCHL1,SERINC1,ATP1B1,CA..."
823,Regulon for ZKSCAN1,ZKSCAN1,activating,7191,weight>75.0%,0.0,"PAFAH1B1,WSB1,NDFIP1,HGSNAT,SPTAN1,GIPC1,PCMTD..."
550,Regulon for RBFOX2,RBFOX2,activating,7187,weight>75.0%,0.0,"RIMS1,TRIM9,NBEA,GPHN,RASAL2,DOCK3,HERC1,NLGN1..."
...,...,...,...,...,...,...,...
4620,ZNF624,ZNF624,activating,20,top5perTarget,0.0,"ALX4,ESPL1,HEMK1,HIST1H4I,LGALS2,NOTCH4,NUPR1,..."
4913,GLI3,GLI3,activating,20,top10perTarget,0.0,"C17orf64,DSG4,ESPNL,FBXL7,FLT1,GJA1,HEY2,HHEX,..."
3871,BRCA1,BRCA1,activating,20,top5perTarget,0.0,"CALCR,CD300LF,CT62,CYP2A7,DEFB116,FAM111B,GJD4..."
4487,ZNF169,ZNF169,activating,20,top5perTarget,0.0,"C17orf78,CCKAR,KCNG4,KLK1,KRT222,LILRB5,LRRC45..."


In [14]:
# Number of distinct TFs that have at least one module, compared to the size of the TF list
print(modules_df["TF"].nunique(), "unique TF-modules were found ( out of",len(tf_names),"). Modules with less than 20 markers were filtered out.")

1676 unique TF-modules were found ( out of 1892 ). Modules with less than 20 markers were filtered out.


In [38]:
# Time the cisTarget pruning step (wall clock, Paris time)
paris_tz = pytz.timezone('Europe/Paris')
start = datetime.now(paris_tz)
print("Start time:", start.strftime("%H:%M:%S"))

df = prune2df(
    dbs,
    modules,
    f_motif_path,
    num_workers=param_n_workers,
    weighted_recovery=False,
    rank_threshold = 1500,
    nes_threshold=3,
    motif_similarity_fdr=0.001,
    auc_threshold=0.05,
    filter_for_annotation=False,
)

end = datetime.now(paris_tz)
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))
# Note: 30mn30 with num_workers=64

# Save the enriched-motif table as TSV, then display it
df.to_csv(f_motifs_path, sep = "\t")
df

Start time: 13:29:51
End time: 13:57:40
Running time: 0:27:48.897491


Unnamed: 0_level_0,Unnamed: 1_level_0,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment
Unnamed: 0_level_1,Unnamed: 1_level_1,AUC,NES,MotifSimilarityQvalue,OrthologousIdentity,Annotation,Context,TargetGenes,RankAtMax
TF,MotifID,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
ABL1,cisbp__M01745,0.052753,3.045009,,,,"(activating, weight>75.0%, hg38_500bp_up_100bp...","[(CACNA1D, 3.967), (PTPN21, 1.539), (FOXK1, 2....",1207
ABL1,hdpi__ZNF207,0.053048,3.091019,,,,"(activating, weight>75.0%, hg38_500bp_up_100bp...","[(PTPN21, 1.539), (CACNA1D, 3.967), (UBE2E3, 2...",1428
ABL1,kznf__ZNF100_Imbeault2017_RP_RCADE,0.054408,3.302745,,,,"(activating, weight>75.0%, hg38_500bp_up_100bp...","[(ZFHX3, 1.278), (MED13L, 2.855), (TTBK1, 1.94...",1468
ABL1,kznf__ZNF571_Imbeault2017_OM_RCADE,0.056786,3.673179,,,,"(activating, weight>75.0%, hg38_500bp_up_100bp...","[(ZFHX3, 1.278), (GAS6, 1.701), (DIS3L, 1.497)...",1492
ABL1,metacluster_151.1,0.053208,3.115963,,,,"(activating, weight>75.0%, hg38_500bp_up_100bp...","[(ZFHX3, 1.278), (TMOD1, 1.501), (AUTS2, 3.791...",1461
...,...,...,...,...,...,...,...,...,...
ZXDC,taipale_tf_pairs__TEAD4_ELK1_RMATWCCGGAWRN_CAP_repr,0.044465,3.284867,,,,"(top50perTarget, activating, hg38_500bp_up_100...","[(ARMCX5, 1.529), (FAF2, 2.516), (TFG, 2.662),...",1426
ZXDC,taipale_tf_pairs__TEAD4_ERG_RSCGGAAATRCC_CAP,0.046339,3.625273,,,,"(top50perTarget, activating, hg38_500bp_up_100...","[(TFG, 2.662), (FAM91A1, 2.004), (CCDC73, 1.27...",1338
ZXDC,taipale_tf_pairs__TEAD4_ERG_RSCGGAAATRCC_CAP_repr,0.043549,3.118488,,,,"(top50perTarget, activating, hg38_500bp_up_100...","[(TFG, 2.662), (CCDC73, 1.271), (AP1M1, 2.805)...",1229
ZXDC,taipale_tf_pairs__TEAD4_ETV1_RSCGGAAATRCM_CAP,0.045596,3.490335,,,,"(top50perTarget, activating, hg38_500bp_up_100...","[(JPH3, 2.732), (AP1M1, 2.805), (TFG, 2.662), ...",1111


In [None]:
# # Checkpoint
# # Reading back the data from tsv
# ## 1. Read with multi-index headers
# df = pd.read_csv(f_motifs_path, sep="\t", header=[0, 1], index_col=[0, 1])
# ## 2. Transform the "Context" frozensets (string) into actual frozensets
# def parse_frozenset_string(s):
#     if isinstance(s, str) and s.startswith('frozenset'):
#         return frozenset(ast.literal_eval(s[len('frozenset('):-1]))
#     return s
# df.loc[:, ('Enrichment', 'Context')] = df.loc[:, ('Enrichment', 'Context')].apply(parse_frozenset_string)
# ## 3. Transform "TargetGenes" from string to list of tuples
# def parse_list_of_tuples(s):
#     if isinstance(s, str) and s.startswith('[') and s.endswith(']'):
#         return ast.literal_eval(s)
#     return s  # if it's already a list or something else
# df.loc[:, ('Enrichment', 'TargetGenes')] = df.loc[:, ('Enrichment', 'TargetGenes')].apply(parse_list_of_tuples)

In [16]:
# Number of distinct TFs present in the 'TF' level of the row index
print(len(df.index.get_level_values('TF').unique()), "regulons were kept, after pruning")

1676 regulons were kept, after pruning


In [17]:
# Look for main regulons: is ATF4 among the TFs of the enriched-motif table?
print("ATF4", "ATF4" in df.index.get_level_values('TF'), sep="\t")

ATF4	True


In [92]:
print("Size of Dataframe:", len(df))
# Flag the rows whose 'TargetGenes' list is empty
has_no_targets = df[('Enrichment', 'TargetGenes')].map(len) == 0
# Keep every other row
df_filtered = df.loc[~has_no_targets]
print("Size of Dataframe:", len(df_filtered))

Size of Dataframe: 242008
Size of Dataframe: 242008


These "modules" are then combined into regulons, by taking the top NES for each TF (for main Motif, and final score of regulon). All genes are bundled together.

In [94]:
# This dataframe can then be converted to regulons.
# Input is df_filtered, i.e. the enriched-motif table without the rows that have an empty 'TargetGenes' list.
regulons = df2regulons(df_filtered)

Create regulons from a dataframe of enriched features.
Additional columns saved: []


In [5]:
# Checkpoint: save the regulons (uncomment the second line to reload them instead)
pd.to_pickle(obj=regulons, filepath_or_buffer=f_regulons_pickle)
#regulons = pd.read_pickle(f_regulons_pickle)

In [6]:
# Summarise every regulon as one row of a table, then save it as TSV.
#
# Rows are collected as plain dicts and the DataFrame is built once: filling a
# pre-allocated frame with `regulon_df[col].iloc[j] = value` is a chained assignment,
# which is slow and has no effect at all when pandas Copy-on-Write is enabled.
regulon_rows = []
for regulon in regulons:
    # Fixing order of set: the context is unordered, so put the entry that is
    # not a ".png" logo file name first and the logo file name second.
    context_items = list(regulon.context)
    ordered_context = (
        [c for c in context_items if not c.endswith(".png")]
        + [c for c in context_items if c.endswith(".png")]
    )
    regulon_rows.append({
        "Regulon": regulon.name,
        "TF": regulon.transcription_factor,
        "TFTargetGenesCorrelation": ordered_context[0],
        "NbMarkers": len(set(regulon.gene2weight)),
        # Full URL of the motif logo
        "Motif": "https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/logos/" + ordered_context[1],
        "NES": regulon.score,
        "Markers": ','.join(regulon.gene2weight),
    })

regulon_df = pd.DataFrame(
    regulon_rows,
    columns=["Regulon", "TF", "TFTargetGenesCorrelation", "NbMarkers", "Motif", "NES", "Markers"],
)

# Biggest regulons first
regulon_df = regulon_df.sort_values(by='NbMarkers', ascending=False)
regulon_df.to_csv(f_regulons_path, index=False, sep = "\t")
regulon_df

Unnamed: 0,Regulon,TF,TFTargetGenesCorrelation,NbMarkers,Motif,NES,Markers
1421,ZNF483(+),ZNF483,activating,4473,https://resources.aertslab.org/cistarget/motif...,5.801521,"ZNF384,TPRKB,NR1H2,TMEM167A,CD2BP2,ING4,MRPS22..."
404,HLTF(+),HLTF,activating,4467,https://resources.aertslab.org/cistarget/motif...,5.700365,"ZNF329,LYPLAL1,SMARCAD1,ATXN7L3B,XRCC5,SLC39A1..."
763,PKM(+),PKM,activating,4401,https://resources.aertslab.org/cistarget/motif...,5.866966,"FANCC,ING4,XRCC5,CD2BP2,RMDN1,RPS16,ITGB1,TAF1..."
340,GOT1(+),GOT1,activating,4248,https://resources.aertslab.org/cistarget/motif...,7.365905,"ING4,XRCC5,CD2BP2,RMDN1,RPS16,ATG16L1,ITGB1,TA..."
1337,ZNF302(+),ZNF302,activating,4232,https://resources.aertslab.org/cistarget/motif...,7.988816,"ZNF384,ING4,XRCC5,MRPS22,RMDN1,ICA1L,RPS16,ATG..."
...,...,...,...,...,...,...,...
1203,ZFP42(+),ZFP42,activating,21,https://resources.aertslab.org/cistarget/motif...,5.496516,"HNMT,CD40,PCYT1B,METTL7A,OTUD6B,DDX3Y,CFLAR,TB..."
803,PRDM13(+),PRDM13,activating,20,https://resources.aertslab.org/cistarget/motif...,4.56084,"LMNB1,UTP18,CD63,GRID2,SWI5,KCNQ5,SCX,ALDH2,PI..."
273,FOXA1(+),FOXA1,activating,19,https://resources.aertslab.org/cistarget/motif...,5.370463,"TMED3,CBLN4,ZIC4,OTUD1,PRPF39,HMGB2,OTX2,PPP2R..."
417,HMX2(+),HMX2,activating,19,https://resources.aertslab.org/cistarget/motif...,5.623844,"LAP3,GLIPR1L1,BCL6,ARMCX6,HMX2,PTCHD4,KDELC2,D..."


In [7]:
# Look for main regulons: is ATF4 among the TFs of the regulon table?
print("ATF4", "ATF4" in regulon_df["TF"].to_numpy(), sep="\t")

ATF4	True


## Phase III: Cellular regulon enrichment matrix (aka AUCell)

Characterize the different cells in a single-cell transcriptomics experiment by the enrichment of the regulons. Enrichment of a regulon is measured as the Area Under the recovery Curve (AUC) of the genes that define this regulon.

In [8]:
# Score every regulon in every cell; the expression matrix is passed with cells as rows
auc_mtx = aucell(ex_matrix.T, regulons, num_workers=param_n_workers)
# Save the cells x regulons AUC matrix as TSV, then display it
auc_mtx.to_csv(path_or_buf=f_regulons_aucell_path, sep = "\t")
auc_mtx

Regulon,A1CF(+),ABCF2(+),ABL1(+),ACAA1(+),ACO1(+),ADARB1(+),ADNP(+),ADNP2(+),AEBP2(+),AFF4(+),...,ZSCAN31(+),ZSCAN32(+),ZSCAN4(+),ZSCAN5A(+),ZSCAN5B(+),ZSCAN9(+),ZSWIM1(+),ZXDA(+),ZXDB(+),ZXDC(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AG_AAACCCAAGAAATTCG-40,0.000000,0.000000,0.111381,0.046440,0.057841,0.106420,0.092009,0.057382,0.136745,0.100157,...,0.209130,0.004279,0.000000,0.108665,0.029518,0.016580,0.000000,0.004353,0.009289,0.086191
AG_AAACCCAAGACAACTA-20,0.006954,0.000000,0.107523,0.046954,0.061225,0.105570,0.085729,0.055504,0.129114,0.096673,...,0.194486,0.009664,0.000000,0.113975,0.029550,0.029860,0.001542,0.000000,0.011777,0.084910
AG_AAACCCAAGACCACGA-45,0.000000,0.000000,0.099369,0.045400,0.054026,0.095106,0.091095,0.068265,0.123867,0.087801,...,0.190174,0.006838,0.003007,0.092645,0.029550,0.039422,0.023296,0.000000,0.035742,0.078027
AG_AAACCCAAGACCTCCG-24,0.012406,0.000000,0.100276,0.048365,0.065367,0.089820,0.091513,0.062329,0.120494,0.094450,...,0.204802,0.003040,0.000000,0.099655,0.029486,0.011591,0.036723,0.000000,0.050545,0.083830
AG_AAACCCAAGAGAGGTA-18,0.000000,0.021556,0.046888,0.046760,0.032266,0.057617,0.071412,0.069911,0.046740,0.058532,...,0.108122,0.014803,0.000000,0.034022,0.000000,0.047291,0.111867,0.011336,0.206744,0.051408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TH_TTTGTTGTCGAACCAT-29,0.014178,0.016261,0.066749,0.049662,0.059866,0.070655,0.066665,0.060826,0.085823,0.066215,...,0.077486,0.017941,0.000000,0.067812,0.028853,0.016630,0.018839,0.014913,0.047919,0.057455
TH_TTTGTTGTCGAACCAT-32,0.000000,0.015809,0.101726,0.058764,0.088517,0.108142,0.097378,0.087396,0.125052,0.092149,...,0.130148,0.013061,0.000000,0.102266,0.029136,0.065132,0.036737,0.009365,0.047544,0.085926
TH_TTTGTTGTCGGAGTAG-32,0.009451,0.008017,0.105020,0.057755,0.095472,0.105466,0.097088,0.080059,0.124002,0.091010,...,0.135483,0.012452,0.000000,0.103822,0.029658,0.044185,0.037709,0.004874,0.035561,0.087462
TH_TTTGTTGTCTCGTTTA-9,0.000000,0.011756,0.105672,0.063946,0.095210,0.106240,0.091940,0.077622,0.123149,0.084674,...,0.115650,0.021266,0.000000,0.108039,0.029550,0.083885,0.017304,0.004103,0.009722,0.082298


In [3]:
# Checkpoint to regenerate the object from the file
# The "Cell" column becomes the row index, and the column axis is named "Regulon" again
auc_mtx = (
    pd.read_csv(f_regulons_aucell_path, sep = "\t", index_col = "Cell")
    .rename_axis(columns="Regulon")
)
auc_mtx

Regulon,A1CF(+),ABCF2(+),ABL1(+),ACAA1(+),ACO1(+),ADARB1(+),ADNP(+),ADNP2(+),AEBP2(+),AFF4(+),...,ZSCAN31(+),ZSCAN32(+),ZSCAN4(+),ZSCAN5A(+),ZSCAN5B(+),ZSCAN9(+),ZSWIM1(+),ZXDA(+),ZXDB(+),ZXDC(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AG_AAACCCAAGAAATTCG-40,0.000000,0.000000,0.111381,0.046440,0.057841,0.106420,0.092009,0.057382,0.136745,0.100157,...,0.209130,0.004279,0.000000,0.108665,0.029518,0.016580,0.000000,0.004353,0.009289,0.086191
AG_AAACCCAAGACAACTA-20,0.006954,0.000000,0.107523,0.046954,0.061225,0.105570,0.085729,0.055504,0.129114,0.096673,...,0.194486,0.009664,0.000000,0.113975,0.029550,0.029860,0.001542,0.000000,0.011777,0.084910
AG_AAACCCAAGACCACGA-45,0.000000,0.000000,0.099369,0.045400,0.054026,0.095106,0.091095,0.068265,0.123867,0.087801,...,0.190174,0.006838,0.003007,0.092645,0.029550,0.039422,0.023296,0.000000,0.035742,0.078027
AG_AAACCCAAGACCTCCG-24,0.012406,0.000000,0.100276,0.048365,0.065367,0.089820,0.091513,0.062329,0.120494,0.094450,...,0.204802,0.003040,0.000000,0.099655,0.029486,0.011591,0.036723,0.000000,0.050545,0.083830
AG_AAACCCAAGAGAGGTA-18,0.000000,0.021556,0.046888,0.046760,0.032266,0.057617,0.071412,0.069911,0.046740,0.058532,...,0.108122,0.014803,0.000000,0.034022,0.000000,0.047291,0.111867,0.011336,0.206744,0.051408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TH_TTTGTTGTCGAACCAT-29,0.014178,0.016261,0.066749,0.049662,0.059866,0.070655,0.066665,0.060826,0.085823,0.066215,...,0.077486,0.017941,0.000000,0.067812,0.028853,0.016630,0.018839,0.014913,0.047919,0.057455
TH_TTTGTTGTCGAACCAT-32,0.000000,0.015809,0.101726,0.058764,0.088517,0.108142,0.097378,0.087396,0.125052,0.092149,...,0.130148,0.013061,0.000000,0.102266,0.029136,0.065132,0.036737,0.009365,0.047544,0.085926
TH_TTTGTTGTCGGAGTAG-32,0.009451,0.008017,0.105020,0.057755,0.095472,0.105466,0.097088,0.080059,0.124002,0.091010,...,0.135483,0.012452,0.000000,0.103822,0.029658,0.044185,0.037709,0.004874,0.035561,0.087462
TH_TTTGTTGTCTCGTTTA-9,0.000000,0.011756,0.105672,0.063946,0.095210,0.106240,0.091940,0.077622,0.123149,0.084674,...,0.115650,0.021266,0.000000,0.108039,0.029550,0.083885,0.017304,0.004103,0.009722,0.082298


In [15]:
# Time the binarization step (wall clock, Paris time)
paris_tz = pytz.timezone('Europe/Paris')
start = datetime.now(paris_tz)
print("Start time:", start.strftime("%H:%M:%S"))

# The result is indexed below: [0] is used as the binarized matrix, [1] as the per-regulon thresholds
auc_mtx_bin = binarize(auc_mtx, seed=42, num_workers=param_n_workers)

end = datetime.now(paris_tz)
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: 08h40mn46 with num_workers=32

Start time: 22:04:01
End time: 06:44:48
Running time: 8:40:46.868378


In [16]:
# Second element of the binarize() result: one threshold per regulon. Save it as TSV, then display it.
binarization_thresholds = auc_mtx_bin[1]
binarization_thresholds.to_csv(path_or_buf=f_regulons_binarization_thresholds_aucell_path, sep = "\t")
binarization_thresholds

Regulon
A1CF(+)      0.002720
ABCF2(+)     0.012674
ABL1(+)      0.060816
ACAA1(+)     0.059583
ACO1(+)      0.094483
               ...   
ZSCAN9(+)    0.041063
ZSWIM1(+)    0.031915
ZXDA(+)      0.002884
ZXDB(+)      0.100295
ZXDC(+)      0.062062
Length: 1676, dtype: float64

In [17]:
# First element of the binarize() result: the binarized cells x regulons matrix.
# The guard makes this cell safe to run twice: once auc_mtx_bin is already the matrix,
# indexing it with [0] again would look up a column named 0 instead.
if not isinstance(auc_mtx_bin, pd.DataFrame):
    auc_mtx_bin = auc_mtx_bin[0]
auc_mtx_bin.to_csv(f_regulons_binarized_aucell_path, sep = "\t")
auc_mtx_bin

Regulon,A1CF(+),ABCF2(+),ABL1(+),ACAA1(+),ACO1(+),ADARB1(+),ADNP(+),ADNP2(+),AEBP2(+),AFF4(+),...,ZSCAN31(+),ZSCAN32(+),ZSCAN4(+),ZSCAN5A(+),ZSCAN5B(+),ZSCAN9(+),ZSWIM1(+),ZXDA(+),ZXDB(+),ZXDC(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AG_AAACCCAAGAAATTCG-40,0,0,1,0,0,1,0,0,1,1,...,1,0,0,1,0,0,0,1,0,1
AG_AAACCCAAGACAACTA-20,1,0,1,0,0,1,0,0,1,1,...,1,0,0,1,0,0,0,0,0,1
AG_AAACCCAAGACCACGA-45,0,0,1,0,0,1,0,0,1,1,...,1,0,0,1,0,0,0,0,0,1
AG_AAACCCAAGACCTCCG-24,1,0,1,0,0,1,0,0,1,1,...,1,0,0,1,0,0,1,0,0,1
AG_AAACCCAAGAGAGGTA-18,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TH_TTTGTTGTCGAACCAT-29,1,1,1,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
TH_TTTGTTGTCGAACCAT-32,0,1,1,0,0,1,0,1,1,1,...,0,0,0,1,0,1,1,1,0,1
TH_TTTGTTGTCGGAGTAG-32,1,0,1,0,1,1,0,1,1,1,...,1,0,0,1,0,1,1,1,0,1
TH_TTTGTTGTCTCGTTTA-9,0,0,1,1,1,1,0,0,1,1,...,0,1,0,1,0,1,0,1,0,1


In [18]:
# Number of cells in which the ATF4 regulon is flagged as active
auc_mtx_bin["ATF4(+)"].sum()

20628

In [19]:
# Binarization threshold of the ATF4 regulon
binarization_thresholds['ATF4(+)']

0.05723547513734781

In [20]:
# Cross-check: number of cells whose ATF4 AUC is strictly above the threshold
(auc_mtx["ATF4(+)"] > binarization_thresholds.loc['ATF4(+)']).sum()

20628