# Emma Pan Neuro (Control + ND75KD) - pySCENIC pipeline (Embedded version)

**Author:** Vincent Gardeux

**Date Created:** 03/06/2024

# Libraries

In [7]:
# Fix OPENBLAS Warnings
import os
default_n_threads = 12
os.environ['OPENBLAS_NUM_THREADS'] = f"{default_n_threads}"
os.environ['MKL_NUM_THREADS'] = f"{default_n_threads}"
os.environ['OMP_NUM_THREADS'] = f"{default_n_threads}"

# import dependencies
import pandas as pd
import numpy as np
import h5py
import pickle
import pytz

from datetime import datetime
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2
from distributed import Client, LocalCluster

from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.binarization import binarize
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

# Parameters

In [8]:
# [Input] Fixed gene annotation (for SCENIC feather file compatibility)
genome_data = pd.read_csv("/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/fixed_gene_annotation.tsv", sep = "\t", na_filter=False)

# [Input] Loom file to use
f_loom_path_scenic = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated.loom"
# Open Loom file in reading mode
f = h5py.File(f_loom_path_scenic, 'r')
f_m = f["/matrix"][:,:]
f_gene_names = f["/row_attrs/Gene"].asstr()[:]
f_ensembl_names = f["/row_attrs/Accession"].asstr()[:]
f_cell_names = f["/col_attrs/CellID"].asstr()[:]
f.close()

# Update gene names using fixing table
ensembl_to_gene_dict = genome_data.set_index('Ensembl')['Name'].to_dict()
f_gene_names_updated = np.array([
    ensembl_to_gene_dict[ensembl] if ensembl in ensembl_to_gene_dict else gene
    for gene, ensembl in zip(f_gene_names, f_ensembl_names)
])

# Create expression matrix
ex_matrix = pd.DataFrame(f_m, columns = f_cell_names, index = f_gene_names_updated)
ex_matrix = ex_matrix.loc[f_gene_names_updated, :]

# [Input] Transcription factors list (SCENIC step 1: GRNBoost2)
#f_tfs = "/data/genome/drosophila_melanogaster/cistopic_flybase_r6.02/allTFs_dmel.txt" # From pySCENIC github
f_tfs = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/FBgg0000745_TF_Flybase.filtered.txt" # From Flybase
# Derive list of Transcription Factors(TF)
tf_names = load_tf_names(f_tfs)

# [Output] Adjacency matrix (SCENIC step 1: GRNBoost2)
adj_matrix = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated_adj.csv"

# [Input] Ranking databases (SCENIC step 2-3: cisTarget)
f_db_names = ["/data/genome/drosophila_melanogaster/cistopic_flybase_r6.02/mc_v10_clust/dm6_v10_clust.genes_vs_motifs.rankings.feather"]
dbs = [RankingDatabase(fname=fname, name=os.path.basename(fname)) for fname in f_db_names]

# [Input] Motif databases (SCENIC step 2-3: cisTarget)
f_motif_path = "/data/genome/drosophila_melanogaster/cistopic_flybase_r6.02/mc_v10_clust/motifs-v10-nr.flybase-m0.00001-o0.0.tbl"

# [Output] Regulons (SCENIC step 2-3: cisTarget)
f_motifs_path = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated_motifs.tsv"
f_modules_path = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated_modules.tsv"
f_regulons_path = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated_regulons.tsv"
f_regulons_aucell_path = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated_regulons_aucell.tsv"
f_regulons_binarized_aucell_path = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated_regulons_aucell_binarized.tsv"
f_regulons_binarization_thresholds_aucell_path = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated_regulons_aucell_binarization_thresholds.tsv"

# Restrict matrix to feather genes
ranking_feather = pd.read_feather(f_db_names[0])
overlap_values = ex_matrix.index[pd.Series(ex_matrix.index).isin(ranking_feather.columns)].unique()
ex_matrix = ex_matrix.loc[overlap_values, :]

ex_matrix

Unnamed: 0,AAACCCAAGGTGATAT-1_ctrl,AAACCCACAAATAGCA-1_ctrl,AAACCCACAACAAAGT-1_ctrl,AAACCCACACTCATAG-1_ctrl,AAACCCACAGAGAGGG-1_ctrl,AAACCCACAGCCTATA-1_ctrl,AAACCCAGTACCTTCC-1_ctrl,AAACCCAGTACTGCCG-1_ctrl,AAACCCAGTGTTCGTA-1_ctrl,AAACCCAGTTCAGGTT-1_ctrl,...,TTTGTTGCAAGCACCC-1_ndkd,TTTGTTGCAGTTACCA-1_ndkd,TTTGTTGGTGAGATAT-1_ndkd,TTTGTTGGTGTCACAT-1_ndkd,TTTGTTGGTTAAGAAC-1_ndkd,TTTGTTGGTTTGGAGG-1_ndkd,TTTGTTGTCCGTTTCG-1_ndkd,TTTGTTGTCCTTCAGC-1_ndkd,TTTGTTGTCGAACGCC-1_ndkd,TTTGTTGTCGTGTTCC-1_ndkd
gfzf,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Osi24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
msps,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
CG6013,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
CR44091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mitf,0.0,0.0,0.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,...,5.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
Pur-alpha,3.0,6.0,3.0,1.0,0.0,15.0,11.0,3.0,0.0,0.0,...,2.0,3.0,5.0,1.0,1.0,2.0,1.0,6.0,1.0,0.0
gw,2.0,1.0,6.0,0.0,0.0,6.0,2.0,3.0,1.0,1.0,...,3.0,4.0,6.0,0.0,4.0,2.0,2.0,6.0,8.0,0.0
CR44027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# SCENIC steps

## STEP 1: Gene regulatory network inference, and generation of co-expression modules

### 1.a. GRN inference using the GRNBoost2 algorithm

In the initial phase of the pySCENIC pipeline the single cell expression profiles are used to infer co-expression modules from.

Run GRNboost from arboreto to infer co-expression modules

The arboreto package is used for this phase of the pipeline.

*Output:* List of adjacencies between a TF and its targets.

Run GRNBoost2 algorithm

In [9]:
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

# Prepare the multithreading
cluster = LocalCluster(name='grn_call', dashboard_address=":12345", n_workers=default_n_threads, threads_per_worker=8)
client = Client(cluster)

# Here I run the function within the package (no CLI)
adjacencies = grnboost2(expression_data=ex_matrix.transpose(), tf_names=tf_names, seed=42, verbose=True, client_or_address=client)
            
# Shutting down cluster
client.close()
cluster.close()
    
end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))
# Note: ~1h40 with n_workers=12, threads_per_worker=[not set] => default to 16 threads
# Note: ~2h10 with n_workers=12, threads_per_worker=8
# Note: 53mn with n_workers=12, threads_per_worker=8 => With reduced matrix

Start time: 14:29:31
preparing dask client
parsing input
creating dask graph
12 partitions
computing dask graph
not shutting down client, client was created externally
finished
End time: 15:15:18
Running time: 0:45:46.592788


Read in the adjacencies matrix

In [10]:
adjacencies.to_csv(adj_matrix, index=False, sep=',')
#adjacencies = pd.read_csv(adj_matrix, sep=',', na_filter=False) # If na_filter=True, the nan gene is detected as NaN
adjacencies

Unnamed: 0,TF,target,importance
395,bi,CR32773,8.560182e+02
478,CG9650,CR44357,4.424130e+02
573,salm,salr,4.024105e+02
491,br,Mur2B,3.896239e+02
474,Vsx2,Vsx1,3.817732e+02
...,...,...,...
300,rgr,Sox21a,2.229785e-17
125,REPTOR,CG9416,1.459898e-17
239,kni,CG30271,1.115953e-17
498,mamo,CCAP,5.907672e-20


## STEP 2-3: Regulon prediction aka cisTarget

*Output:* List of adjacencies between a TF and its targets.

### 2.a. Running regulon prediction using cisTarget

Here, we use the --mask_dropouts option, which affects how the correlation between TF and target genes is calculated during module creation. It is important to note that prior to pySCENIC v0.9.18, the default behavior was to mask dropouts, while in v0.9.18 and later, the correlation is performed using the entire set of cells (including those with zero expression). When using the modules_from_adjacencies function directly in python instead of via the command line, the rho_mask_dropouts option can be used to control this.

**Note:** Here I tried with and without the `rho_mask_dropouts=True` option. Without it, it generates 46 regulons. With it, 81 regulons. So I keep it.

**Update:** Now 99 regulons with fixed gene names, and `rho_mask_dropouts=True` option

**Update 2:** Now 108 regulons with fixed gene names, restricted to feather genes, and `rho_mask_dropouts=True` option. It contains dati and crc, but not repo anymore :(

**Update 3:** Now 764 regulons with filter_for_annotation=False ( no filtering on pruning )

**Update 4:** Now 579 regulons when using TF list from Flybase

In [11]:
modules = list(modules_from_adjacencies(adjacencies, ex_matrix.transpose(), rho_mask_dropouts=True, keep_only_activating=True)) # rho_mask_dropouts=True


2024-05-15 13:15:29,670 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [True].

2024-05-15 13:15:52,271 - pyscenic.utils - INFO - Creating modules.


In [12]:
modules_df = pd.DataFrame(index = range(0, len(modules)), columns = ("Regulon", "TF", "TFTargetGenesCorrelation", "NbMarkers", "Context", "NES", "Markers"))
for j in range(0, len(modules)):
    # Setting values
    context = list(modules[j].context)
    modules_df["Regulon"].iloc[j] = modules[j].name
    modules_df["TF"].iloc[j] = modules[j].transcription_factor
    modules_df["TFTargetGenesCorrelation"].iloc[j] = context[0]
    modules_df["NbMarkers"].iloc[j] = len(set(modules[j].gene2weight))
    modules_df["Context"].iloc[j] = context[1]
    modules_df["NES"].iloc[j] = modules[j].score
    modules_df["Markers"].iloc[j] = ','.join(list(modules[j].gene2weight))

modules_df = modules_df.sort_values(by='NbMarkers', ascending=False)
modules_df.to_csv(f_modules_path, index=False, sep = "\t")
modules_df

Unnamed: 0,Regulon,TF,TFTargetGenesCorrelation,NbMarkers,Context,NES,Markers
3104,lola,lola,activating,7496,top50perTarget,0.0,"128up,14-3-3epsilon,14-3-3zeta,140up,18w,26-29..."
3003,bun,bun,activating,7319,top50perTarget,0.0,"128up,14-3-3epsilon,14-3-3zeta,18w,26-29-p,2mi..."
463,Regulon for lola,lola,activating,7142,weight>75.0%,0.0,"roX1,Cam,ps,14-3-3zeta,Atpalpha,sbb,Vha16-1,Rp..."
362,Regulon for bun,bun,activating,7041,weight>75.0%,0.0,"CG42613,zfh2,Rbfox1,ct,lola,RapGAP1,sbb,NFAT,S..."
2939,Pdp1,Pdp1,activating,6842,top50perTarget,0.0,"14-3-3epsilon,14-3-3zeta,140up,18w,26-29-p,2mi..."
...,...,...,...,...,...,...,...
1773,CG2889,CG2889,top5perTarget,20,activating,0.0,"CG10026,CG13164,CG14069,CG14327,CG17217,CG3106..."
1761,CG17612,CG17612,top5perTarget,20,activating,0.0,"CG15725,CG16716,CG16979,CG17068,CG3740,CG4159,..."
1730,CG10366,CG10366,top5perTarget,20,activating,0.0,"BCAS2,CG10347,CG11975,CG13151,CG14882,CG17118,..."
1789,CG4328,CG4328,top5perTarget,20,activating,0.0,"CG11997,CG13807,CG32150,CG33253,CG42564,CG4415..."


In [13]:
print(modules_df.TF.nunique(), "unique TF-modules were found ( out of",len(tf_names),"). Modules with less than 20 markers were filtered out.")

579 unique TF-modules were found ( out of 631 ). Modules with less than 20 markers were filtered out.


In [14]:
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

df = prune2df(dbs, modules, f_motif_path, num_workers=default_n_threads, weighted_recovery=False, rank_threshold = 1500, nes_threshold=3, motif_similarity_fdr=0.001, auc_threshold=0.05, filter_for_annotation=False)
    
end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))
# Note: 5mn37 with num_workers=48
# Note: 5mn29 with num_workers=12
# Note: 5mn33 with num_workers=10
# Note: 5mn37 with num_workers=8 ** optimal? 12mn with filter_for_annotation=False)
# Note: 8mn40 with num_workers=6
# Note: 9mn53 with num_workers=4
# Note: 30mn24 with num_workers=1

df.to_csv(f_motifs_path,sep = "\t")
#df

Start time: 15:17:15
End time: 15:28:45
Running time: 0:11:30.094269


In [15]:
print(len(set(df.index.get_level_values('TF').values)), "regulons were kept, after pruning")

579 regulons were kept, after pruning


In [16]:
# Look for main regulons
print("pros", "pros" in df.index.get_level_values('TF').values, sep="\t")
print("dati", "dati" in df.index.get_level_values('TF').values, sep="\t")
print("scro", "scro" in df.index.get_level_values('TF').values, sep="\t")
print("crc", "crc" in df.index.get_level_values('TF').values, sep="\t")
print("repo", "repo" in df.index.get_level_values('TF').values, sep="\t")
print("sima", "sima" in df.index.get_level_values('TF').values, sep="\t")

pros	True
dati	True
scro	True
crc	True
repo	True
sima	True


In [17]:
print("Size of Dataframe:", len(df))
drop_indexes = []
for j in range(0, len(df)):
    # Setting values
    if(len(df["Enrichment"]["TargetGenes"][j]) == 0): drop_indexes.append(df.index[j])
df_filtered = df["Enrichment"].drop(index=drop_indexes)
print("Size of Dataframe:", len(df_filtered))

Size of Dataframe: 26009
Size of Dataframe: 26009


These "modules" are then combined into regulons, by taking the top NES for each TF (for main Motif, and final score of regulon). All genes are bundled together.

In [18]:
# This dataframe can then be converted to regulons.
regulons = df2regulons(df_filtered)

Create regulons from a dataframe of enriched features.
Additional columns saved: []


In [19]:
regulon_df = pd.DataFrame(index = range(0, len(regulons)), columns = ("Regulon", "TF", "TFTargetGenesCorrelation", "NbMarkers", "Motif", "NES", "Markers"))
for j in range(0, len(regulons)):
    # Fixing order of set
    context = list(regulons[j].context)
    if(context[0].endswith(".png")):
        tmp = context[0]
        context[0] = context[1]
        context[1] = tmp
    # Setting values
    regulon_df["Regulon"].iloc[j] = regulons[j].name
    regulon_df["TF"].iloc[j] = regulons[j].transcription_factor
    regulon_df["TFTargetGenesCorrelation"].iloc[j] = context[0]
    regulon_df["NbMarkers"].iloc[j] = len(set(regulons[j].gene2weight))
    regulon_df["Motif"].iloc[j] = "https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/logos/" + context[1]
    regulon_df["NES"].iloc[j] = regulons[j].score
    regulon_df["Markers"].iloc[j] = ','.join(list(regulons[j].gene2weight))

regulon_df = regulon_df.sort_values(by='NbMarkers', ascending=False)
regulon_df.to_csv(f_regulons_path, index=False, sep = "\t")
regulon_df

Unnamed: 0,Regulon,TF,TFTargetGenesCorrelation,NbMarkers,Motif,NES,Markers
471,lola(+),lola,activating,4254,https://resources.aertslab.org/cistarget/motif...,5.945286,"CG17698,CR45736,p120ctn,jim,AGO3,Myo81F,CG4578..."
301,Pdp1(+),Pdp1,activating,3819,https://resources.aertslab.org/cistarget/motif...,5.439625,"CG17698,CR45736,p120ctn,jim,AGO3,Myo81F,CG1320..."
430,fs(1)h(+),fs(1)h,activating,3814,https://resources.aertslab.org/cistarget/motif...,6.649265,"CG17698,p120ctn,jim,Myo81F,CG45781,kl-3,CG1320..."
382,crol(+),crol,activating,3688,https://resources.aertslab.org/cistarget/motif...,9.530519,"CG17698,CR45736,p120ctn,jim,CG13204,CG41378,kl..."
368,bun(+),bun,activating,3633,https://resources.aertslab.org/cistarget/motif...,4.974753,"CG17698,CR45736,p120ctn,jim,AGO3,Myo81F,CG4578..."
...,...,...,...,...,...,...,...
348,amos(+),amos,activating,20,https://resources.aertslab.org/cistarget/motif...,4.08343,"Nlg3,Pdp1,CG30463,CadN,LPCAT,Gyc88E,shakB,Frq1..."
76,CG15696(+),CG15696,activating,17,https://resources.aertslab.org/cistarget/motif...,3.91921,"Syt12,CG1275,lt,flw,alpha-Man-Ia,CG42747,Mid1,..."
353,ase(+),ase,activating,14,https://resources.aertslab.org/cistarget/motif...,3.430874,"jim,zld,CG34354,CG13293,CG30116,CG34357,CG1576..."
203,Doc3(+),Doc3,activating,13,https://resources.aertslab.org/cistarget/motif...,5.210102,"Syx7,Axud1,CG44247,CG14015,esn,HmgD,Rpn13,CG29..."


In [20]:
# Look for main regulons
print("pros", "pros" in regulon_df["TF"].values, sep="\t")
print("dati", "dati" in regulon_df["TF"].values, sep="\t")
print("scro", "scro" in regulon_df["TF"].values, sep="\t")
print("crc", "crc" in regulon_df["TF"].values, sep="\t")
print("repo", "repo" in regulon_df["TF"].values, sep="\t")
print("sima", "sima" in regulon_df["TF"].values, sep="\t")

pros	True
dati	True
scro	True
crc	True
repo	True
sima	True


## Phase III: Cellular regulon enrichment matrix (aka AUCell)

Characterize the different cells in a single-cell transcriptomics experiment by the enrichment of the regulons. Enrichment of a regulon is measures as AUC of the recovery curve of the genes that define this regulon.

In [21]:
auc_mtx = aucell(ex_matrix.transpose(), regulons, num_workers=default_n_threads)
auc_mtx.to_csv(f_regulons_aucell_path, sep = "\t")
auc_mtx

Regulon,ATbp(+),Abd-B(+),Adf1(+),Aef1(+),Antp(+),Asciz(+),Atf-2(+),Atf3(+),Atf6(+),Awh(+),...,vri(+),vvl(+),wdn(+),wek(+),woc(+),z(+),zf30C(+),zfh1(+),zfh2(+),zld(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGGTGATAT-1_ctrl,0.025016,0.051157,0.009165,0.217876,0.005565,0.043619,0.004420,0.058608,0.065610,0.060508,...,0.112104,0.048170,0.045674,0.008630,0.035798,0.020901,0.040082,0.051026,0.073097,0.054074
AAACCCACAAATAGCA-1_ctrl,0.048197,0.072628,0.004738,0.093205,0.024699,0.043178,0.050244,0.027497,0.183466,0.106904,...,0.081442,0.063286,0.016767,0.001272,0.007462,0.057980,0.024195,0.050381,0.187626,0.134823
AAACCCACAACAAAGT-1_ctrl,0.048374,0.055401,0.010618,0.095788,0.007051,0.140935,0.012873,0.032550,0.233749,0.132549,...,0.077538,0.050187,0.012419,0.014976,0.016292,0.060347,0.019741,0.055351,0.156163,0.068243
AAACCCACACTCATAG-1_ctrl,0.056667,0.080954,0.005031,0.128824,0.007040,0.029652,0.015074,0.039802,0.155645,0.171546,...,0.091319,0.063301,0.019340,0.006113,0.026780,0.052161,0.024468,0.041457,0.196490,0.076487
AAACCCACAGAGAGGG-1_ctrl,0.013838,0.008583,0.014718,0.283570,0.012601,0.032273,0.034665,0.067232,0.045371,0.070239,...,0.164186,0.060007,0.047588,0.019446,0.062699,0.024874,0.031490,0.047543,0.079467,0.016080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTTTGGAGG-1_ndkd,0.015443,0.029266,0.004485,0.228821,0.018392,0.022867,0.077920,0.163073,0.080727,0.160639,...,0.194089,0.102957,0.042985,0.067163,0.096124,0.055005,0.036921,0.118856,0.202013,0.022660
TTTGTTGTCCGTTTCG-1_ndkd,0.044308,0.037830,0.006690,0.127690,0.016011,0.031436,0.044347,0.040877,0.179631,0.126737,...,0.097696,0.066936,0.019406,0.011160,0.027647,0.043161,0.025456,0.066068,0.169896,0.093657
TTTGTTGTCCTTCAGC-1_ndkd,0.043543,0.066490,0.014648,0.079371,0.010937,0.016735,0.007212,0.043082,0.173221,0.687419,...,0.079111,0.055287,0.017202,0.002248,0.021235,0.074826,0.017163,0.057514,0.150466,0.080344
TTTGTTGTCGAACGCC-1_ndkd,0.045354,0.045479,0.011098,0.149059,0.011069,0.018916,0.018311,0.060819,0.169757,0.185152,...,0.103534,0.069396,0.015553,0.010884,0.051322,0.053105,0.017973,0.093731,0.251925,0.080254


In [86]:
# Checkpoint to regenerate the object from the file
#auc_mtx = pd.read_csv("/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/old/Pan_neuro_both_reannotated_GFP_curated_reintegrated_regulons_aucell.tsv", sep = "\t", index_col = "Cell")
auc_mtx = pd.read_csv(f_regulons_aucell_path, sep = "\t", index_col = "Cell")
auc_mtx.columns.name = "Regulon"
auc_mtx

Regulon,ATbp(+),Abd-B(+),Adf1(+),Aef1(+),Antp(+),Asciz(+),Atf-2(+),Atf3(+),Atf6(+),Awh(+),...,vri(+),vvl(+),wdn(+),wek(+),woc(+),z(+),zf30C(+),zfh1(+),zfh2(+),zld(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGGTGATAT-1_ctrl,0.025016,0.051157,0.009165,0.217876,0.005565,0.043619,0.004420,0.058608,0.065610,0.060508,...,0.112104,0.048170,0.045674,0.008630,0.035798,0.020901,0.040082,0.051026,0.073097,0.054074
AAACCCACAAATAGCA-1_ctrl,0.048197,0.072628,0.004738,0.093205,0.024699,0.043178,0.050244,0.027497,0.183466,0.106904,...,0.081442,0.063286,0.016767,0.001272,0.007462,0.057980,0.024195,0.050381,0.187626,0.134823
AAACCCACAACAAAGT-1_ctrl,0.048374,0.055401,0.010618,0.095788,0.007051,0.140935,0.012873,0.032550,0.233749,0.132549,...,0.077538,0.050187,0.012419,0.014976,0.016292,0.060347,0.019741,0.055351,0.156163,0.068243
AAACCCACACTCATAG-1_ctrl,0.056667,0.080954,0.005031,0.128824,0.007040,0.029652,0.015074,0.039802,0.155645,0.171546,...,0.091319,0.063301,0.019340,0.006113,0.026780,0.052161,0.024468,0.041457,0.196490,0.076487
AAACCCACAGAGAGGG-1_ctrl,0.013838,0.008583,0.014718,0.283570,0.012601,0.032273,0.034665,0.067232,0.045371,0.070239,...,0.164186,0.060007,0.047588,0.019446,0.062699,0.024874,0.031490,0.047543,0.079467,0.016080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTTTGGAGG-1_ndkd,0.015443,0.029266,0.004485,0.228821,0.018392,0.022867,0.077920,0.163073,0.080727,0.160639,...,0.194089,0.102957,0.042985,0.067163,0.096124,0.055005,0.036921,0.118856,0.202013,0.022660
TTTGTTGTCCGTTTCG-1_ndkd,0.044308,0.037830,0.006690,0.127690,0.016011,0.031436,0.044347,0.040877,0.179631,0.126737,...,0.097696,0.066936,0.019406,0.011160,0.027647,0.043161,0.025456,0.066068,0.169896,0.093657
TTTGTTGTCCTTCAGC-1_ndkd,0.043543,0.066490,0.014648,0.079371,0.010937,0.016735,0.007212,0.043082,0.173221,0.687419,...,0.079111,0.055287,0.017202,0.002248,0.021235,0.074826,0.017163,0.057514,0.150466,0.080344
TTTGTTGTCGAACGCC-1_ndkd,0.045354,0.045479,0.011098,0.149059,0.011069,0.018916,0.018311,0.060819,0.169757,0.185152,...,0.103534,0.069396,0.015553,0.010884,0.051322,0.053105,0.017973,0.093731,0.251925,0.080254


In [87]:
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

auc_mtx_bin = binarize(auc_mtx, seed = 42, num_workers=default_n_threads)

end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: 23mn12 with num_workers=12

Start time: 11:01:44
End time: 11:19:28
Running time: 0:17:43.246747


In [88]:
binarization_thresholds = auc_mtx_bin[1]
binarization_thresholds.to_csv(f_regulons_binarization_thresholds_aucell_path, sep = "\t")
binarization_thresholds

Regulon
ATbp(+)     0.070662
Abd-B(+)    0.137902
Adf1(+)     0.024422
Aef1(+)     0.256438
Antp(+)     0.046179
              ...   
z(+)        0.083907
zf30C(+)    0.041259
zfh1(+)     0.097270
zfh2(+)     0.242251
zld(+)      0.134415
Length: 579, dtype: float64

In [89]:
auc_mtx_bin = auc_mtx_bin[0]
auc_mtx_bin.to_csv(f_regulons_binarized_aucell_path, sep = "\t")
auc_mtx_bin

Regulon,ATbp(+),Abd-B(+),Adf1(+),Aef1(+),Antp(+),Asciz(+),Atf-2(+),Atf3(+),Atf6(+),Awh(+),...,vri(+),vvl(+),wdn(+),wek(+),woc(+),z(+),zf30C(+),zfh1(+),zfh2(+),zld(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGGTGATAT-1_ctrl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCCACAAATAGCA-1_ctrl,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
AAACCCACAACAAAGT-1_ctrl,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
AAACCCACACTCATAG-1_ctrl,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
AAACCCACAGAGAGGG-1_ctrl,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTTTGGAGG-1_ndkd,0,0,0,0,0,0,1,1,0,0,...,1,0,0,0,1,0,0,1,0,0
TTTGTTGTCCGTTTCG-1_ndkd,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
TTTGTTGTCCTTCAGC-1_ndkd,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
TTTGTTGTCGAACGCC-1_ndkd,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [90]:
auc_mtx_bin.sum(numeric_only=True, axis=0).loc['scro(+)'] # 733 original (764 regulons, Aerts). 716 ok

716

In [91]:
binarization_thresholds.loc['scro(+)'] # 0.3359530388214861 original (764 regulons, Aerts). 0.2968568380139414

0.2968568380139414

In [92]:
auc_mtx_bin.sum(numeric_only=True, axis=0).loc['sima(+)'] # 1067 original (764 regulons, Aerts). 1212 ok

1212

In [93]:
binarization_thresholds.loc['sima(+)'] # 0.21179422980092633 original (764 regulons, Aerts). 0.18676990882955136

0.18676990882955136

In [94]:
auc_mtx_bin.sum(numeric_only=True, axis=0).loc['crc(+)'] # 1136 original (764 regulons, Aerts). 1036 a bit too many?

1036

In [95]:
binarization_thresholds.loc['crc(+)'] # 0.21042246303356155 original (764 regulons, Aerts). 0.23696722508584317

0.23696722508584317

In [96]:
auc_mtx_bin.sum(numeric_only=True, axis=0).loc['dati(+)'] # 11733 original (764 regulons, Aerts). 11631 ok

11631

In [97]:
binarization_thresholds.loc['dati(+)'] # 0.25299783006969817 original (764 regulons, Aerts). 0.2806370838941694

0.2806370838941694

In [98]:
auc_mtx_bin.sum(numeric_only=True, axis=0).loc['pros(+)'] # 18482 original (764 regulons, Aerts). 18548 ok

18548

In [99]:
binarization_thresholds.loc['pros(+)'] # 0.11976533052607628 original (764 regulons, Aerts). 0.13361937192963938

0.13361937192963938

In [100]:
auc_mtx_bin.sum(numeric_only=True, axis=0).loc['repo(+)'] # 906 original (764 regulons, Aerts). 920 ok

920

In [101]:
binarization_thresholds.loc['repo(+)'] # 0.10907608108082668 original (764 regulons, Aerts). 0.11348899982841607

0.11348899982841607

In [113]:
sum(auc_mtx["crc(+)"] > 0.238)

975

In [None]:
sum(auc_mtx["crc(+)"] > binarization_thresholds.loc['crc(+)'])

In [103]:
sum(auc_mtx_bin["crc(+)"])

1036