# GSE243639 - Martirosyan - pySCENIC pipeline (Embedded version)

**Author:** Vincent Gardeux

**Date Created:** 05/03/2024

**Date Modified:** 05/03/2024

# Libraries

In [2]:
!pip install polars
# I need polars to load the csv fast enough. Installing within the Docker...

Collecting polars
  Downloading polars-1.24.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m103.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: polars
Successfully installed polars-1.24.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Fix OPENBLAS warnings: cap the BLAS/OpenMP thread pools through their environment
# variables (set here, ahead of the numpy import further down in this cell).
import os
default_n_threads = 12
for _thread_env_var in ('OPENBLAS_NUM_THREADS', 'MKL_NUM_THREADS', 'OMP_NUM_THREADS'):
    os.environ[_thread_env_var] = str(default_n_threads)

# import dependencies
import pandas as pd
import numpy as np
import polars as pl
import pickle
import pytz

from datetime import datetime
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2
from distributed import Client, LocalCluster

from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.binarization import binarize
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

# Parameters

In [7]:
# Configuration cell: declares every input/output path of the pipeline and loads the
# objects derived directly from them (expression matrix, TF list, ranking databases).
# All names defined here are notebook globals used by the cells below.

# [Input] Count matrix file to use
EXPRESSION_MTX_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_2024/GSE243639_Filtered_count_table.csv.gz' # From https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE243639

# Open CSV count matrix (read with polars; the first column holds the gene names,
# the remaining column headers are the cell names)
f_m = pl.read_csv(EXPRESSION_MTX_FNAME)
f_gene_names = f_m[:, 0].to_list()  # First column as list of strings
f_cell_names = f_m.columns[1:]      # Column names (excluding first column)

# Create expression matrix as a pandas DataFrame, genes as rows and cells as columns.
# The downstream calls (grnboost2, modules_from_adjacencies, aucell) receive its transpose.
ex_matrix = pd.DataFrame(f_m[:, 1:], columns = f_cell_names, index = f_gene_names) # 33537 genes x 83484 cells

# [Input] Transcription factors list (SCENIC step 1: GRNBoost2)
f_tfs = "/data/gardeux/Neuro_Droso_ND75KD/data/allTFs_hg38.txt" # From https://resources.aertslab.org/cistarget/tf_lists/
# Derive list of Transcription Factors(TF)
tf_names = load_tf_names(f_tfs)

# [Output] Adjacency matrix (SCENIC step 1: GRNBoost2)
# NOTE: despite its name, adj_matrix is the *path* of the CSV file, not the matrix itself.
adj_matrix = "/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_2024/Martirosyan_adj.csv"

# [Input] Ranking databases (SCENIC step 2-3: cisTarget)
f_db_names = ["/data/gardeux/Neuro_Droso_ND75KD/data/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather"] # From pySCENIC db: https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/
# Alternatively: f_db_names = ["/data/gardeux/Neuro_Droso_ND75KD/data/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather"]
# One RankingDatabase object per feather file, named after the file's basename
dbs = [RankingDatabase(fname=f_name, name=os.path.basename(f_name)) for f_name in f_db_names]

# [Input] Motif databases (SCENIC step 2-3: cisTarget)
# NOTE: f_motif_path (singular) is the *input* motif annotation table; do not confuse it
# with f_motifs_path (plural) below, which is the *output* file of the enriched motifs.
f_motif_path = "/data/gardeux/Neuro_Droso_ND75KD/data/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl" # From pySCENIC db: https://resources.aertslab.org/cistarget/motif2tf/

# [Output] Regulons (SCENIC step 2-3: cisTarget)
f_motifs_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_2024/Martirosyan_motifs.tsv"
f_modules_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_2024/Martirosyan_modules.tsv"
f_regulons_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_2024/Martirosyan_regulons.tsv"
f_regulons_aucell_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_2024/Martirosyan_regulons_aucell.tsv"
f_regulons_binarized_aucell_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_2024/Martirosyan_regulons_aucell_binarized.tsv"
f_regulons_binarization_thresholds_aucell_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_2024/Martirosyan_regulons_aucell_binarization_thresholds.tsv"

# Keep only the genes of the expression matrix that also appear among the columns
# of the first ranking database (feather file).
ranking_feather = pd.read_feather(f_db_names[0])
gene_in_ranking_db = ex_matrix.index.isin(ranking_feather.columns)
overlap_values = ex_matrix.index[gene_in_ranking_db].unique()
ex_matrix = ex_matrix.loc[overlap_values, :]

ex_matrix

# SCENIC steps

## STEP 1: Gene regulatory network inference, and generation of co-expression modules

### 1.a. GRN inference using the GRNBoost2 algorithm

In the initial phase of the pySCENIC pipeline, the single-cell expression profiles are used to infer co-expression modules.

Run GRNboost from arboreto to infer co-expression modules

The arboreto package is used for this phase of the pipeline.

*Output:* List of adjacencies between a TF and its targets.

Run GRNBoost2 algorithm

In [15]:
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

# Prepare the multithreading: a local Dask cluster plus a client connected to it.
# Both are used as context managers so that the workers (and the dashboard port) are
# released even when the many-hour inference raises or is interrupted; closing them
# only after a successful run would leave them alive on failure.
with LocalCluster(name='grn_call', dashboard_address=":12345", n_workers=default_n_threads, threads_per_worker=8) as cluster:
    with Client(cluster) as client:
        # Here I run the function within the package (no CLI).
        # The genes x cells matrix is transposed so that cells are the rows.
        adjacencies = grnboost2(expression_data=ex_matrix.transpose(), tf_names=tf_names, seed=42, verbose=True, client_or_address=client)

# At this point the client and then the cluster have been shut down (in that order)

end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: Takes ~18hrs52mn with n_workers=12, threads_per_worker=8

Start time: 13:20:48
preparing dask client
parsing input
creating dask graph
12 partitions
computing dask graph
not shutting down client, client was created externally
finished
End time: 08:12:56
Running time: 18:52:07.946933


Save the adjacencies matrix (or, by uncommenting the `read_csv` line, read it back in from a previous run)

In [16]:
# Persist the GRNBoost2 adjacencies so that the long inference step does not have to be repeated
adjacencies.to_csv(adj_matrix, sep=',', index=False)
# To resume from the saved file instead, uncomment the line below.
# na_filter=False is required: with na_filter=True, the nan gene is detected as NaN.
#adjacencies = pd.read_csv(adj_matrix, sep=',', na_filter=False)
adjacencies

Unnamed: 0,TF,target,importance
1422,TRIB3,C1orf43,3.427943e+02
1015,NKX6-2,FTH1,3.197987e+02
951,FEZ1,CRYAB,3.188863e+02
951,FEZ1,S100B,2.995253e+02
1179,FOS,HSPB1,2.934377e+02
...,...,...,...
699,HDX,HS3ST2,1.262339e-19
1409,POLI,GRM1,1.075685e-19
1618,ZNF260,GRM1,7.377732e-20
1821,ZNF280B,HS3ST2,4.700777e-20


## STEP 2-3: Regulon prediction aka cisTarget

*Output:* List of regulons (each TF together with its motif-supported target genes).

### 2.a. Running regulon prediction using cisTarget

Here, we use the --mask_dropouts option, which affects how the correlation between TF and target genes is calculated during module creation. It is important to note that prior to pySCENIC v0.9.18, the default behavior was to mask dropouts, while in v0.9.18 and later, the correlation is performed using the entire set of cells (including those with zero expression). When using the modules_from_adjacencies function directly in python instead of via the command line, the rho_mask_dropouts option can be used to control this.

**Note:** I kept the same parameters as when I ran pySCENIC on our own dataset. It produces 1618 regulons when using the TF list from Aerts.

In [17]:
# Build the co-expression modules from the adjacencies.
# Dropouts are masked when computing the TF-target correlation (rho_mask_dropouts=True)
# and only the activating modules are kept.
modules = list(
    modules_from_adjacencies(
        adjacencies,
        ex_matrix.T,
        rho_mask_dropouts=True,
        keep_only_activating=True,
    )
)


2025-03-06 08:45:57,568 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [True].

2025-03-06 09:03:03,014 - pyscenic.utils - INFO - Creating modules.


In [18]:
# Summarise every co-expression module as one row of a table, sort it by module size
# and save it as TSV.
#
# The rows are collected first and the DataFrame is built in a single call. Filling a
# pre-allocated frame with `modules_df[col].iloc[j] = value` is chained assignment: it
# raises warnings on recent pandas and silently writes nothing under Copy-on-Write.
_MODULE_COLUMNS = ["Regulon", "TF", "TFTargetGenesCorrelation", "NbMarkers", "Context", "NES", "Markers"]
# Labels that describe the direction of the TF-target correlation inside a module context
_CORRELATION_LABELS = ("activating", "repressing")


def _module_to_row(module):
    """Return the summary row (dict keyed by the table's column names) of one module.

    `module.context` is treated as an unordered collection of two labels: the
    correlation direction and the name of the rule that created the module. Its
    iteration order is not guaranteed, so the labels are ordered explicitly
    (correlation label first) instead of being read by position.
    """
    # sorted() is stable and False < True: the correlation label comes first
    ordered_context = sorted(module.context, key=lambda label: label not in _CORRELATION_LABELS)
    return {
        "Regulon": module.name,
        "TF": module.transcription_factor,
        "TFTargetGenesCorrelation": ordered_context[0],
        "NbMarkers": len(set(module.gene2weight)),
        "Context": ordered_context[1],
        "NES": module.score,
        "Markers": ','.join(list(module.gene2weight)),
    }


# `columns=` keeps the expected header even when there are no modules at all
modules_df = pd.DataFrame([_module_to_row(module) for module in modules], columns=_MODULE_COLUMNS)

modules_df = modules_df.sort_values(by='NbMarkers', ascending=False)
modules_df.to_csv(f_modules_path, index=False, sep = "\t")
modules_df

Unnamed: 0,Regulon,TF,TFTargetGenesCorrelation,NbMarkers,Context,NES,Markers
1086,Regulon for ZBTB20,ZBTB20,activating,9942,weight>75.0%,0.0,"DDX17,NTM,DST,PCDH9,NCAM2,PPP2R2B,MAGI2,NPAS3,..."
329,Regulon for GTF2I,GTF2I,activating,9234,weight>75.0%,0.0,"HNRNPA2B1,LUC7L3,SRRM2,PLCG2,DDX17,SON,HP1BP3,..."
971,Regulon for TCF4,TCF4,activating,9233,weight>75.0%,0.0,"DDX17,ZBTB20,KMT2C,SPATA6,ZSWIM6,HNRNPA2B1,BAZ..."
967,Regulon for TCF12,TCF12,activating,9037,weight>75.0%,0.0,"PLCG2,SGK1,ELMO1,PTPRJ,CCNH,SIK3,ARAP2,FCHSD2,..."
7371,ZBTB20,ZBTB20,activating,8994,top50perTarget,0.0,"A2M-AS1,A2ML1,A4GALT,AAAS,AACS,AADAT,AAGAB,AAK..."
...,...,...,...,...,...,...,...
4973,MBTPS2,MBTPS2,activating,20,top5perTarget,0.0,"ARHGAP19-SLIT1,ARHGEF16,CHIT1,EFCAB6-AS1,FAM98..."
1872,Regulon for FOXP3,FOXP3,activating,20,weight>90.0%,0.0,"RRS1-AS1,ZNF324B,BHMT,ZC3H18,SIM2,ZNF671,RNASE..."
5889,SALL2,SALL2,activating,20,top10perTarget,0.0,"ADGRL3-AS1,ART4,ERVH48-1,FOXL1,HP,IDO2,KCNAB1-..."
5930,SOX13,SOX13,activating,20,top10perTarget,0.0,"ADAM21,ALOXE3,ARHGAP31-AS1,DCST2,HSPA1L,LAMB3,..."


In [19]:
# Report how many distinct TFs have at least one module, relative to the size of the input TF list
n_module_tfs = modules_df["TF"].nunique()
n_input_tfs = len(tf_names)
print(n_module_tfs, "unique TF-modules were found ( out of", n_input_tfs, "). Modules with less than 20 markers were filtered out.")

1618 unique TF-modules were found ( out of 1892 ). Modules with less than 20 markers were filtered out.


In [20]:
paris_tz = pytz.timezone('Europe/Paris')
start = datetime.now(paris_tz)
print("Start time:", start.strftime("%H:%M:%S"))

# cisTarget step: motif enrichment of the modules against the ranking databases,
# using the motif annotation table f_motif_path.
df = prune2df(
    dbs,
    modules,
    f_motif_path,
    num_workers=default_n_threads,
    weighted_recovery=False,
    rank_threshold=1500,
    nes_threshold=3,
    motif_similarity_fdr=0.001,
    auc_threshold=0.05,
    filter_for_annotation=False,
)

end = datetime.now(paris_tz)
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))
# Note: ~36 min with num_workers=12

# Save the table of enriched motifs
df.to_csv(f_motifs_path, sep="\t")
#df

Start time: 10:50:28
End time: 11:26:01
Running time: 0:35:32.368071


In [21]:
# Count the distinct TFs present in the 'TF' level of the enriched-motif table index
tfs_after_pruning = df.index.get_level_values('TF').unique()
print(len(tfs_after_pruning), "regulons were kept, after pruning")

1618 regulons were kept, after pruning


In [23]:
# Look for main regulons: is the TF of interest present in the enriched-motif table?
tf_of_interest = "ATF4"
enriched_tfs = df.index.get_level_values('TF').values
print(tf_of_interest, tf_of_interest in enriched_tfs, sep="\t")

ATF4	True


In [24]:
# Remove the enriched motifs that have no target gene at all.
#
# A positional boolean mask replaces the former row-by-row loop, which
#  - read rows with an integer key (`series[j]`) on a label-indexed Series, relying on a
#    deprecated positional fallback of pandas, and
#  - dropped rows by index label, which would also remove every other row sharing that
#    label if the index is not unique.
print("Size of Dataframe:", len(df))
enrichment = df["Enrichment"]
# True for the rows whose TargetGenes collection is non-empty (plain array -> positional mask)
has_target_genes = (enrichment["TargetGenes"].map(len) > 0).to_numpy()
# Kept for backward compatibility: index labels of the rows that are filtered out
drop_indexes = list(enrichment.index[~has_target_genes])
df_filtered = enrichment[has_target_genes]
print("Size of Dataframe:", len(df_filtered))

Size of Dataframe: 285023
Size of Dataframe: 285023


These "modules" are then combined into regulons, by taking the top NES for each TF (for main Motif, and final score of regulon). All genes are bundled together.

In [25]:
# Convert the filtered enrichment table into regulon objects.
# `regulons` is consumed below, both to build the summary table and as input of aucell.
regulons = df2regulons(df_filtered)

Create regulons from a dataframe of enriched features.
Additional columns saved: []


In [26]:
# Summarise every regulon as one row of a table, sort it by regulon size and save it as TSV.
#
# The rows are collected first and the DataFrame is built in a single call. Filling a
# pre-allocated frame with `regulon_df[col].iloc[j] = value` is chained assignment: it
# raises warnings on recent pandas and silently writes nothing under Copy-on-Write.
_REGULON_COLUMNS = ["Regulon", "TF", "TFTargetGenesCorrelation", "NbMarkers", "Motif", "NES", "Markers"]
_MOTIF_LOGO_BASE_URL = "https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/logos/"


def _regulon_to_row(regulon):
    """Return the summary row (dict keyed by the table's column names) of one regulon.

    `regulon.context` is a set holding two labels: the motif logo file name (*.png)
    and the correlation direction. Sets have no guaranteed order, so the labels are
    ordered explicitly: the non-logo label first, the logo file name last.
    """
    # sorted() is stable and False < True: the "*.png" entry ends up last
    ordered_context = sorted(regulon.context, key=lambda label: label.endswith(".png"))
    return {
        "Regulon": regulon.name,
        "TF": regulon.transcription_factor,
        "TFTargetGenesCorrelation": ordered_context[0],
        "NbMarkers": len(set(regulon.gene2weight)),
        "Motif": _MOTIF_LOGO_BASE_URL + ordered_context[1],
        "NES": regulon.score,
        "Markers": ','.join(list(regulon.gene2weight)),
    }


# `columns=` keeps the expected header even when there are no regulons at all
regulon_df = pd.DataFrame([_regulon_to_row(regulon) for regulon in regulons], columns=_REGULON_COLUMNS)

regulon_df = regulon_df.sort_values(by='NbMarkers', ascending=False)
regulon_df.to_csv(f_regulons_path, index=False, sep = "\t")
regulon_df

Unnamed: 0,Regulon,TF,TFTargetGenesCorrelation,NbMarkers,Motif,NES,Markers
333,GTF2I(+),GTF2I,activating,6313,https://resources.aertslab.org/cistarget/motif...,5.911895,"ZNF384,FANCC,ING4,XRCC5,CD2BP2,MRPS22,RMDN1,TP..."
982,TCF12(+),TCF12,activating,5307,https://resources.aertslab.org/cistarget/motif...,4.792579,"ZNF384,FANCC,ING4,XRCC5,CD2BP2,MRPS22,RMDN1,TP..."
523,MBNL2(+),MBNL2,activating,5305,https://resources.aertslab.org/cistarget/motif...,5.155327,"ZNF528,NPLOC4,ANKMY1,ZNF578,KNOP1,COG3,SLC25A4..."
602,NF1(+),NF1,activating,5019,https://resources.aertslab.org/cistarget/motif...,9.724204,"VPS25,ZNF384,NOSIP,TPRKB,NR1H2,TMEM167A,CCAR2,..."
838,RUFY3(+),RUFY3,activating,4999,https://resources.aertslab.org/cistarget/motif...,6.258469,"ADPRHL1,ZNF528,NPLOC4,ANKMY1,COG3,SLC25A46,ENT..."
...,...,...,...,...,...,...,...
445,IRX4(+),IRX4,activating,34,https://resources.aertslab.org/cistarget/motif...,4.299165,"SLC25A33,MNT,NOD1,GEMIN8,TTYH1,RHOBTB3,CRIM1,C..."
748,POU5F1B(+),POU5F1B,activating,34,https://resources.aertslab.org/cistarget/motif...,5.366329,"USP2,RAB23,RFX3,ANKRD9,C19orf12,AP5B1,MAPKAPK5..."
280,FOXN4(+),FOXN4,activating,24,https://resources.aertslab.org/cistarget/motif...,4.978754,"RHPN1,NOL9,LINS1,NUP43,RAB23,CCHCR1,TRMO,ZNF61..."
427,IL21(+),IL21,activating,20,https://resources.aertslab.org/cistarget/motif...,5.355578,"CARHSP1,ZNF621,CTSF,MFSD4A,C7orf26,GPSM1,AMER2..."


In [27]:
# Look for main regulons: check the final regulon table. The enriched-motif table `df`
# was already checked before the conversion and has not changed since, so testing it
# again could not reveal a TF lost while building the regulons.
print("ATF4", "ATF4" in regulon_df["TF"].values, sep="\t")

ATF4	True


## Phase III: Cellular regulon enrichment matrix (aka AUCell)

Characterize the different cells in a single-cell transcriptomics experiment by the enrichment of the regulons. Enrichment of a regulon is measured as the AUC of the recovery curve of the genes that define this regulon.

In [28]:
# Compute the AUCell enrichment of every regulon in every cell (the expression matrix
# is passed transposed), then save the resulting matrix as TSV.
auc_mtx = aucell(
    ex_matrix.T,
    regulons,
    num_workers=default_n_threads,
)
auc_mtx.to_csv(f_regulons_aucell_path, sep="\t")
auc_mtx

Regulon,A1CF(+),ABCF2(+),ABL1(+),ACAA1(+),ACO1(+),ADARB1(+),ADNP(+),ADNP2(+),AEBP2(+),AFF4(+),...,ZSCAN31(+),ZSCAN32(+),ZSCAN4(+),ZSCAN5A(+),ZSCAN5B(+),ZSCAN9(+),ZSWIM1(+),ZXDA(+),ZXDB(+),ZXDC(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
s.0096_AAACCCAAGTACGAGC.1,0.009444,0.000000,0.016985,0.020391,0.014743,0.019363,0.020046,0.026854,0.049865,0.044838,...,0.125639,0.000970,0.003486,0.015148,0.000000,0.007632,0.006456,0.030326,0.001873,0.042951
s.0096_AAACCCACACAGCGCT.1,0.038672,0.015596,0.028162,0.022865,0.015782,0.053353,0.025826,0.023695,0.079487,0.048850,...,0.037522,0.001408,0.016239,0.022776,0.005076,0.005935,0.000000,0.016553,0.003451,0.055055
s.0096_AAACCCACAGATAAAC.1,0.017409,0.004160,0.017414,0.016577,0.016254,0.038260,0.022193,0.021379,0.048196,0.043602,...,0.151181,0.003682,0.018529,0.019038,0.000811,0.004950,0.000000,0.012621,0.000938,0.039366
s.0096_AAACCCAGTCCGGATC.1,0.003260,0.000000,0.023215,0.007211,0.024733,0.026583,0.024318,0.019403,0.059641,0.045727,...,0.121864,0.003158,0.000000,0.023101,0.000000,0.006347,0.000007,0.000000,0.011583,0.035897
s.0096_AAACCCAGTCTCTCAC.1,0.001742,0.000000,0.016950,0.008251,0.013327,0.031233,0.026638,0.020131,0.070626,0.053468,...,0.133321,0.000701,0.004510,0.015616,0.000268,0.001830,0.002371,0.000000,0.002785,0.038052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s.0165_TTTGGTTTCCGCCTAT.1,0.010713,0.012080,0.024873,0.028892,0.035310,0.031963,0.032451,0.028849,0.045464,0.043862,...,0.136846,0.005627,0.000000,0.024579,0.024518,0.014054,0.001580,0.016065,0.010628,0.042145
s.0165_TTTGGTTTCTCGTTTA.1,0.028060,0.005654,0.026379,0.021684,0.028926,0.021650,0.025040,0.019235,0.031623,0.046157,...,0.055129,0.006968,0.019898,0.019863,0.004099,0.010392,0.003524,0.024260,0.009819,0.032126
s.0165_TTTGTTGCAAGTTCGT.1,0.006004,0.009323,0.027725,0.026777,0.017165,0.028448,0.027875,0.031789,0.054856,0.053299,...,0.087144,0.009686,0.033508,0.019337,0.003146,0.010351,0.000052,0.016027,0.005154,0.045834
s.0165_TTTGTTGCATGTTCAG.1,0.002247,0.000000,0.024487,0.034188,0.009337,0.027734,0.029019,0.015694,0.050122,0.047428,...,0.134554,0.012868,0.006077,0.013639,0.074850,0.000802,0.012456,0.003955,0.012222,0.034423


In [1]:
# Checkpoint to regenerate the object from the file.
# Optional: uncomment the lines below to reload the AUCell matrix from f_regulons_aucell_path
# instead of recomputing it (e.g. after a kernel restart).
#auc_mtx = pd.read_csv(f_regulons_aucell_path, sep = "\t", index_col = "Cell")
#auc_mtx.columns.name = "Regulon"
#auc_mtx

In [29]:
paris_tz = pytz.timezone('Europe/Paris')
start = datetime.now(paris_tz)
print("Start time:", start.strftime("%H:%M:%S"))

# Binarize the AUCell matrix. The result holds two objects: the binarized matrix
# (element 0) and the per-regulon thresholds (element 1), unpacked in the next cells.
auc_mtx_bin = binarize(
    auc_mtx,
    seed=42,
    num_workers=default_n_threads,
)

end = datetime.now(paris_tz)
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: 04h21mn56 with num_workers=12

Start time: 13:28:23
End time: 17:50:20
Running time: 4:21:56.228980


In [30]:
# binarize() returned (binarized matrix, thresholds). A later cell rebinds `auc_mtx_bin`
# to the matrix alone, after which `auc_mtx_bin[1]` would look up a column named 1 and
# fail. Only unpack while the original tuple is still there, so that this cell can be
# re-run at any time; otherwise the thresholds extracted earlier are reused.
if isinstance(auc_mtx_bin, tuple):
    binarization_thresholds = auc_mtx_bin[1]
binarization_thresholds.to_csv(f_regulons_binarization_thresholds_aucell_path, sep = "\t")
binarization_thresholds

Regulon
A1CF(+)      0.022189
ABCF2(+)     0.013607
ABL1(+)      0.037242
ACAA1(+)     0.038034
ACO1(+)      0.044103
               ...   
ZSCAN9(+)    0.018699
ZSWIM1(+)    0.042951
ZXDA(+)      0.014804
ZXDB(+)      0.014456
ZXDC(+)      0.059073
Length: 1618, dtype: float64

In [31]:
# Replace the (binarized matrix, thresholds) tuple returned by binarize() with the
# binarized matrix itself. The isinstance guard makes the cell idempotent: running it
# a second time used to evaluate `auc_mtx_bin[0]` on the DataFrame (column lookup)
# and raise a KeyError.
if isinstance(auc_mtx_bin, tuple):
    auc_mtx_bin = auc_mtx_bin[0]
auc_mtx_bin.to_csv(f_regulons_binarized_aucell_path, sep = "\t")
auc_mtx_bin

Regulon,A1CF(+),ABCF2(+),ABL1(+),ACAA1(+),ACO1(+),ADARB1(+),ADNP(+),ADNP2(+),AEBP2(+),AFF4(+),...,ZSCAN31(+),ZSCAN32(+),ZSCAN4(+),ZSCAN5A(+),ZSCAN5B(+),ZSCAN9(+),ZSWIM1(+),ZXDA(+),ZXDB(+),ZXDC(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
s.0096_AAACCCAAGTACGAGC.1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
s.0096_AAACCCACACAGCGCT.1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
s.0096_AAACCCACAGATAAAC.1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
s.0096_AAACCCAGTCCGGATC.1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
s.0096_AAACCCAGTCTCTCAC.1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s.0165_TTTGGTTTCCGCCTAT.1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
s.0165_TTTGGTTTCTCGTTTA.1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
s.0165_TTTGTTGCAAGTTCGT.1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,0
s.0165_TTTGTTGCATGTTCAG.1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [39]:
# Number of cells in which the ATF4 regulon is flagged (sum of the 0/1 column)
auc_mtx_bin["ATF4(+)"].sum()

3886

In [40]:
# Binarization threshold of the ATF4 regulon (compared against the AUCell values in the next cell)
binarization_thresholds.loc['ATF4(+)']

0.06801822607244995

In [41]:
# Cross-check: counting the cells whose AUCell value exceeds the threshold should give
# the same number as summing the binarized column.
atf4_threshold = binarization_thresholds.loc['ATF4(+)']
(auc_mtx["ATF4(+)"] > atf4_threshold).sum()

3886