# Jeffries Human Brain Aging - pySCENIC pipeline (Embedded version)

**Author:** Vincent Gardeux

**Date Created:** 2025-11-17

**Date Modified:** 2025-11-21

In [1]:
# Fix OPENBLAS Warnings
import os
param_n_workers = 24 # We have 112 CPUs/cores, each process will automatically be associated to a different CPU by the OS scheduler
param_threads_per_worker=1 # We have 2 threads per CPU on SVEN (hyper-threading). See lscpu command. Note: Here they are not used apparently. Setting to 1 or 2 gives similar c. time
os.environ['OPENBLAS_NUM_THREADS'] = f"{param_n_workers * param_threads_per_worker}"
os.environ['MKL_NUM_THREADS'] = f"{param_n_workers * param_threads_per_worker}"
os.environ['OMP_NUM_THREADS'] = f"{param_n_workers * param_threads_per_worker}"
os.environ['NUMEXPR_MAX_THREADS'] = f"{param_n_workers * param_threads_per_worker}"

# import dependencies
import pandas as pd
import numpy as np
import anndata as ad
import regdiffusion as rd # For replacing grnboost2 which is slow as hell and bugs (stalls) when there are more than ~1.5B elements in the matrix
import ast # For reading frozenset as strings
import pickle
import pytz

from datetime import datetime
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2
from distributed import Client, LocalCluster

from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.binarization import binarize
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

In [2]:
# [Input] H5ad file to use
EXPRESSION_H5AD_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pfc.clean.h5ad' # Built from pfc.clean.rds. Downloaded from https://publications.wenglab.org/SomaMut/Jeffries_Yu_BrainAging_2025/

# [Input] Transcription factors list (SCENIC step 1: GRNBoost2)
f_tfs = "/data/gardeux/Neuro_Droso_ND75KD/data/allTFs_hg38.txt" # From https://resources.aertslab.org/cistarget/tf_lists/
# Derive list of Transcription Factors(TF)
tf_names = load_tf_names(f_tfs)

# [Output] Adjacency matrix (SCENIC step 1: GRNBoost2)
adj_matrix = "/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pySCENIC/Jeffries_HBAging_adj.csv"

# [Input] Ranking databases (SCENIC step 2-3: cisTarget)
f_db_names = ["/data/gardeux/Neuro_Droso_ND75KD/data/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather", "/data/gardeux/Neuro_Droso_ND75KD/data/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather"] # From pySCENIC db: https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/
dbs = [RankingDatabase(fname=f_name, name=os.path.basename(f_name)) for f_name in f_db_names]

# [Input] Motif databases (SCENIC step 2-3: cisTarget)
f_motif_path = "/data/gardeux/Neuro_Droso_ND75KD/data/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl" # From pySCENIC db: https://resources.aertslab.org/cistarget/motif2tf/

# [Output] Regulons (SCENIC step 2-3: cisTarget)
f_motifs_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pySCENIC/Jeffries_HBAging_motifs.tsv"
f_modules_pickle = "/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pySCENIC/Jeffries_HBAging_modules.pkl"
f_modules_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pySCENIC/Jeffries_HBAging_modules.tsv"
f_regulons_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pySCENIC/Jeffries_HBAging_regulons.tsv"
f_regulons_pickle = "/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pySCENIC/Jeffries_HBAging_regulons.pkl"
f_regulons_aucell_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pySCENIC/Jeffries_HBAging_regulons_aucell.tsv"
f_regulons_binarized_aucell_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pySCENIC/Jeffries_HBAging_regulons_aucell_binarized.tsv"
f_regulons_binarization_thresholds_aucell_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pySCENIC/Jeffries_HBAging_regulons_aucell_binarization_thresholds.tsv"

Load ex_matrix

In [3]:
# [Input] Load expression matrix from H5ad file
f_h5ad = ad.read_h5ad(EXPRESSION_H5AD_FNAME)
f_gene_names = f_h5ad.var_names.tolist()  # Gene names
f_cell_names = f_h5ad.obs_names.tolist()   # Cell names
ex_matrix = pd.DataFrame.sparse.from_spmatrix(f_h5ad.X.T, index=f_gene_names, columns=f_cell_names)

# Restrict matrix to feather genes
ranking_feather = pd.read_feather(f_db_names[0])
overlap_values = ex_matrix.index[pd.Series(ex_matrix.index).isin(ranking_feather.columns)].unique()
ex_matrix = ex_matrix.loc[overlap_values, :] # This step takes forever

ex_matrix # 18099 genes x 367317 cells

Unnamed: 0,0950_240109_AAACCCAAGACATCCT,0950_240109_AAACCCACACCGTACG,0950_240109_AAACCCAGTAATCAGA,0950_240109_AAACCCATCACTCTTA,0950_240109_AAACGAAAGGTTACCT,0950_240109_AAACGAAAGTATGACA,0950_240109_AAACGAACAGCGATTT,0950_240109_AAACGAATCCATCGTC,0950_240109_AAACGAATCTCATGCC,0950_240109_AAACGCTCAATTGCGT,...,6052_200709_TTTGGTTCATGTACGT,6052_200709_TTTGGTTCATGTCGTA,6052_200709_TTTGGTTGTACCGTCG,6052_200709_TTTGGTTGTATTGACC,6052_200709_TTTGGTTGTCAGTCCG,6052_200709_TTTGGTTTCGACACTA,6052_200709_TTTGGTTTCGCAGTTA,6052_200709_TTTGTTGCAAGCGATG,6052_200709_TTTGTTGCAAGGTCAG,6052_200709_TTTGTTGCAGCCGTTG
FAM87B,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.00000
LINC01128,0.0,0.392156,0.000000,0.0,0.0,0.743607,0.000000,0.201210,0.0,0.248353,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.26238,0.439308,0.00000
LINC00115,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.248919,0.0,0.00000,0.000000,0.00000
FAM41C,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.00000
SAMD11,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ARSA,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.386462,0.0,0.0,0.248919,0.0,0.00000,0.000000,2.52105
SHANK3,0.0,0.392156,0.000000,0.0,0.0,0.000000,0.000000,0.201210,0.0,0.248353,...,1.005767,0.000000,0.535117,0.0,0.0,0.000000,0.0,0.00000,0.439308,0.00000
ACR,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.00000
RABL2B,0.0,0.000000,0.912676,0.0,0.0,0.743607,0.711225,0.512011,0.0,0.248353,...,0.000000,1.122504,0.386462,0.0,0.0,0.448062,0.0,0.00000,0.439308,0.00000


In [4]:
# Safety check
(f_h5ad.obs.index == ex_matrix.columns).all()

True

# SCENIC steps

## STEP 1: Gene regulatory network inference, and generation of co-expression modules

### 1.a. GRN inference using the GRNBoost2 algorithm

In the initial phase of the pySCENIC pipeline the single cell expression profiles are used to infer co-expression modules from.

Run GRNboost from arboreto to infer co-expression modules

The arboreto package is used for this phase of the pipeline.

*Output:* List of adjacencies between a TF and its targets.



**Option 1:** Run GRNBoost2 algorithm

```
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

# Prepare the multithreading
cluster = LocalCluster(name='grn_call', dashboard_address=":12345", n_workers=param_n_workers, threads_per_worker=param_threads_per_worker)
client = Client(cluster)

# Here I run the function within the package (no CLI)
adjacencies = grnboost2(expression_data=ex_matrix.transpose(), tf_names=tf_names, seed=42, verbose=True, client_or_address=client)
            
# Shutting down cluster
client.close()
cluster.close()
    
end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: Takes ~50mn with n_workers=12, threads_per_worker=2 => With reduced matrix to overlapping genes between .feather file and our matrix
```

Here it stalls for several days at step "creating dask graph"...

I think there is a bug in grnboost2 which makes it incompatible with big matrices i.e. impossible to deal with >~1.5B elements in the matrix. Here we have 6.6B...

So I'll use regdiffusion instead.

**Option 2:** Run regdiffusion specific code
### * * REGDIFFUSION SPECIFIC CODE * * ###

Run RegDiffusionTrainer instead of the (slow) GRNBoost2 algorithm. See https://tuftsbcb.github.io/RegDiffusion/downstream_with_pyscenic.html

In [5]:
# Prepare dataset for RegDiffusionTrainer (needs to be logged, cells as rows)
ex_matrix_log = np.log(ex_matrix.transpose() + 1.0) # Transpose and log
ex_matrix_log = ex_matrix_log.loc[:, ~(ex_matrix_log == 0).all()] # Drop columns where all values are 0
ex_matrix_log

Unnamed: 0,FAM87B,LINC01128,LINC00115,FAM41C,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,...,ODF3B,SYCE3,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3,ACR,RABL2B,MAFIP
0950_240109_AAACCCAAGACATCCT,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
0950_240109_AAACCCACACCGTACG,0.0,0.330854,0.000000,0.0,0.0,0.000000,0.330854,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.330854,0.000000,0.330854,0.0,0.000000,0.0
0950_240109_AAACCCAGTAATCAGA,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.648503,0.0
0950_240109_AAACCCATCACTCTTA,0.0,0.000000,0.000000,0.0,0.0,0.427147,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.263534,0.000000,0.000000,0.0,0.000000,0.0
0950_240109_AAACGAAAGGTTACCT,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6052_200709_TTTGGTTTCGACACTA,0.0,0.000000,0.222278,0.0,0.0,0.222278,0.000000,0.0,0.0,0.563255,...,0.0,0.0,0.000000,0.222278,0.222278,0.222278,0.000000,0.0,0.370226,0.0
6052_200709_TTTGGTTTCGCAGTTA,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0
6052_200709_TTTGTTGCAAGCGATG,0.0,0.232999,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.650413,...,0.0,0.0,0.232999,0.385283,0.581375,0.000000,0.000000,0.0,0.000000,0.0
6052_200709_TTTGTTGCAAGGTCAG,0.0,0.364162,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.809910,...,0.0,0.0,0.000000,0.217999,0.471828,0.000000,0.364162,0.0,0.364162,0.0


In [6]:
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

rd_trainer = rd.RegDiffusionTrainer(ex_matrix_log.to_numpy(), device="cpu")
rd_trainer.train()

end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: Takes ~2h03 with n_workers=24, threads_per_worker=1

Start time: 15:24:35


Training loss: 0.229, Change on Adj: -0.000: 100%|██████████| 1000/1000 [1:53:25<00:00,  6.81s/it] 

End time: 17:28:26
Running time: 2:03:50.485112





Extract edges from GRN

In [7]:
# Now we focus on edges with weight > 50 percentile. 
grn = rd_trainer.get_grn(ex_matrix_log.columns, top_gene_percentile = 50) # gene_names to recover non-expressed genes

# Here for each gene, we are going to extract all edges
adjacencies = grn.extract_edgelist(k = -1, workers = param_n_workers)
adjacencies.columns = ['TF', 'target', 'importance']

# check edgelist.  
adjacencies

Unnamed: 0,TF,target,importance
0,LINC01128,RECK,0.698242
1,HES4,RECK,1.079102
2,ISG15,RECK,0.502441
3,AGRN,RECK,1.448242
4,C1orf159,RECK,0.873535
...,...,...,...
161806126,LMF2,ACR,0.782715
161806127,SYCE3,ACR,0.647461
161806128,CPT1B,ACR,0.758789
161806129,CHKB,ACR,0.399170


Read in the adjacencies matrix

In [8]:
# Restrict to tf names
adjacencies = adjacencies[adjacencies['TF'].isin(tf_names)]
# Sort by importance
adjacencies = adjacencies.sort_values(by='importance', ascending=False)
adjacencies

Unnamed: 0,TF,target,importance
28163320,ZBTB20,QKI,14.484375
116562555,CREB5,ST18,14.343750
116563923,ZNF536,ST18,14.117188
59568148,ZNF536,SPOCK3,14.046875
56032499,CREB5,KCNH8,14.031250
...,...,...,...
6190741,TP53,LINC01619,0.375000
76112989,ZNF274,KCNJ8,0.375000
68833582,ZNF697,C5orf58,0.375000
3824026,EIF5A2,SHISA5,0.375000


### * * / REGDIFFUSION SPECIFIC CODE * * ###

Here is the end of the regdiffusion specific code. If grnboost2 worked, then you can resume here.

In [9]:
print(adjacencies.TF.nunique(), "unique TF-modules were found ( out of",len(tf_names),").")

1657 unique TF-modules were found ( out of 1892 ).


In [10]:
print(adjacencies.target.nunique(), "unique targets were found ( out of",len(ex_matrix.index),").")

18099 unique targets were found ( out of 18099 ).


In [11]:
adjacencies.TF.isin(tf_names).all()

True

In [12]:
adjacencies.to_csv(adj_matrix, index=False, sep=',')

In [13]:
# Checkpoint to recover from file (eventually)
#adjacencies = pd.read_csv(adj_matrix, sep=',', na_filter=False) # If na_filter=True, the nan gene is detected as NaN
adjacencies

Unnamed: 0,TF,target,importance
28163320,ZBTB20,QKI,14.484375
116562555,CREB5,ST18,14.343750
116563923,ZNF536,ST18,14.117188
59568148,ZNF536,SPOCK3,14.046875
56032499,CREB5,KCNH8,14.031250
...,...,...,...
6190741,TP53,LINC01619,0.375000
76112989,ZNF274,KCNJ8,0.375000
68833582,ZNF697,C5orf58,0.375000
3824026,EIF5A2,SHISA5,0.375000


## STEP 2-3: Regulon prediction aka cisTarget

*Output:* List of adjacencies between a TF and its targets.

### 2.a. Running regulon prediction using cisTarget

Here, we use the --mask_dropouts option, which affects how the correlation between TF and target genes is calculated during module creation. It is important to note that prior to pySCENIC v0.9.18, the default behavior was to mask dropouts, while in v0.9.18 and later, the correlation is performed using the entire set of cells (including those with zero expression). When using the modules_from_adjacencies function directly in python instead of via the command line, the rho_mask_dropouts option can be used to control this.

**Note:** In the following code, I use `rho_mask_dropouts=True` for keeping the old R behaviour. It also produces slightly more regulons.

<span color="red">**Issue:** This function `modules_from_adjacencies` is inherently bugged. As it's supposed to run on a single thread but bypass all my specifications to run on all available cores. I don't know how to fix this behaviour and for the function to limit the number of cores</span>

In [14]:
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

modules = list(modules_from_adjacencies(adjacencies, ex_matrix.transpose(), rho_mask_dropouts=True, keep_only_activating=True))

end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: Takes ~37mn with n_workers=24, threads_per_worker=1. BUT USES ALL 112 cores!!

Start time: 17:34:13


  ex_mtx = ex_mtx.T[~ex_mtx.columns.duplicated(keep="first")].T.astype(float)

2025-11-25 16:46:02,168 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [True].

2025-11-25 17:08:47,587 - pyscenic.utils - INFO - Creating modules.


End time: 18:11:48
Running time: 0:37:35.136483


In [15]:
modules_df = pd.DataFrame(index = range(0, len(modules)), columns = ("Regulon", "TF", "TFTargetGenesCorrelation", "NbMarkers", "Context", "NES", "Markers"))
for j in range(0, len(modules)):
    # Setting values
    context = list(modules[j].context)
    modules_df["Regulon"].iloc[j] = modules[j].name
    modules_df["TF"].iloc[j] = modules[j].transcription_factor
    modules_df["TFTargetGenesCorrelation"].iloc[j] = context[0]
    modules_df["NbMarkers"].iloc[j] = len(set(modules[j].gene2weight))
    modules_df["Context"].iloc[j] = context[1]
    modules_df["NES"].iloc[j] = modules[j].score
    modules_df["Markers"].iloc[j] = ','.join(list(modules[j].gene2weight))

modules_df = modules_df.sort_values(by='NbMarkers', ascending=False)
modules_df.to_csv(f_modules_path, index=False, sep = "\t")
modules_df

Unnamed: 0,Regulon,TF,TFTargetGenesCorrelation,NbMarkers,Context,NES,Markers
805,Regulon for RBFOX2,RBFOX2,activating,7458,weight>75.0%,0.0,"NRG3,PLCB1,CACNA1C,FAM155A,DLGAP1,FGF14,PTPRG,..."
506,Regulon for LRRFIP1,LRRFIP1,activating,7360,weight>75.0%,0.0,"DLGAP1,CACNA1A,MYT1L,FGF12,PTPRN2,GRIK2,SNTG1,..."
950,Regulon for SRRM3,SRRM3,activating,7352,weight>75.0%,0.0,"DLGAP2,MEG3,KSR2,MYT1L,CACNA1B,MTUS2,GRIN2B,SR..."
650,Regulon for NPAS2,NPAS2,activating,7345,weight>75.0%,0.0,"NEBL,FAT3,CACNB2,PTPRG,FLRT2,FMN2,ABLIM1,SORBS..."
1110,Regulon for ZBTB20,ZBTB20,activating,7323,weight>75.0%,0.0,"QKI,NCKAP5,FBXL7,NEAT1,PHLPP1,PTGDS,DOCK1,CDH2..."
...,...,...,...,...,...,...,...
4988,TBPL2,TBPL2,top5perTarget,20,activating,0.0,"ABCA13,ARGFX,ASCL2,C1orf116,CCR7,CEP55,CPLX4,C..."
5118,ZNF540,ZNF540,top5perTarget,20,activating,0.0,"CNTN4-AS1,CYYR1-AS1,DLGAP1-AS1,DNAJC9,ECT2L,KC..."
5120,ZNF556,ZNF556,top5perTarget,20,activating,0.0,"C1orf229,C9orf135,CHST5,CRX,F2RL3,FAM27E3,FCRL..."
4657,HOXB3,HOXB3,top5perTarget,20,activating,0.0,"ACTG2,BCL2A1,C10orf113,CLCF1,CLECL1,FOXN3-AS1,..."


In [16]:
print(modules_df.TF.nunique(), "unique TF-modules were found ( out of",len(tf_names),"). Modules with less than 20 markers were filtered out.")

1657 unique TF-modules were found ( out of 1892 ). Modules with less than 20 markers were filtered out.


**Note:** In the following code, I use `filter_for_annotation=False` for deactivating the pruning/filtering and producing all possible regulons. This may not be the more conservative behaviour, so feel free to deactivate it with `filter_for_annotation=True` and then play with the filtering parameters: 
- `weighted_recovery=False`
- `rank_threshold=1500`
- `nes_threshold=3`
- `motif_similarity_fdr=0.001`
- `auc_threshold=0.05`

Of course, even if these parameters are set, they will have no impact on the result if `filter_for_annotation=False`

In [17]:
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

df = prune2df(dbs, modules, f_motif_path, num_workers=param_n_workers, weighted_recovery=False, rank_threshold = 1500, nes_threshold=3, motif_similarity_fdr=0.001, auc_threshold=0.05, filter_for_annotation=False)
    
end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: Takes ~30mn with n_workers=24, threads_per_worker=1.

Start time: 18:11:56
End time: 18:42:13
Running time: 0:30:17.043665


In [18]:
df.to_csv(f_motifs_path, sep = "\t")
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment
Unnamed: 0_level_1,Unnamed: 1_level_1,AUC,NES,MotifSimilarityQvalue,OrthologousIdentity,Annotation,Context,TargetGenes,RankAtMax
TF,MotifID,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
A1CF,hocomoco__SMAD3_HUMAN.H11MO.0.B,0.044219,3.185378,,,,(hg38_500bp_up_100bp_down_full_tx_v10_clust.ge...,"[(PCSK6-AS1, 0.98388671875), (KNCN, 0.92626953...",1342
A1CF,hocomoco__TBX3_HUMAN.H11MO.0.C,0.054417,4.654768,,,,(hg38_500bp_up_100bp_down_full_tx_v10_clust.ge...,"[(LST1, 0.93896484375), (MYL1, 0.9873046875), ...",1417
A1CF,swissregulon__mm__Snai2,0.045009,3.299153,,,,(hg38_500bp_up_100bp_down_full_tx_v10_clust.ge...,"[(RUFY4, 1.0263671875), (KRT3, 0.9072265625), ...",1352
A1CF,tfdimers__MD00166,0.045747,3.405623,,,,(hg38_500bp_up_100bp_down_full_tx_v10_clust.ge...,"[(MSGN1, 0.90283203125), (FAM41C, 0.9287109375...",567
A1CF,transfac_pro__M01721,0.044050,3.161089,,,,(hg38_500bp_up_100bp_down_full_tx_v10_clust.ge...,"[(LST1, 0.93896484375), (FOXE3, 1.001953125), ...",1430
...,...,...,...,...,...,...,...,...,...
ZXDC,taipale_tf_pairs__ETV5_FOXI1_TGTTGNCGGAWRN_CAP,0.049217,3.126896,,,,"(top50perTarget, hg38_10kbp_up_10kbp_down_full...","[(TRA2A, 2.474609375), (ELAC2, 1.2568359375), ...",298
ZXDC,taipale_tf_pairs__GCM1_ELK1_RTGCGGGCGGAAGTN_CAP_1,0.051025,3.382381,,,,"(top50perTarget, hg38_10kbp_up_10kbp_down_full...","[(UFL1, 1.7509765625), (CCDC18-AS1, 2.65820312...",1471
ZXDC,transfac_pro__M05505,0.050867,3.360039,,,,"(top50perTarget, hg38_10kbp_up_10kbp_down_full...","[(OTUD6B, 1.279296875), (LINC00426, 0.96240234...",931
ZXDC,transfac_pro__M05696,0.048908,3.083264,,,,"(top50perTarget, hg38_10kbp_up_10kbp_down_full...","[(CARS, 1.642578125), (ZNF83, 1.732421875), (T...",1365


In [19]:
# # Checkpoint
# # Reading back the data from tsv
# ## 1. Read with multi-index headers
# df = pd.read_csv(f_motifs_path, sep="\t", header=[0, 1], index_col=[0, 1])
# ## 2. Transform the "Context" frozensets (string) into actual frozensets
# def parse_frozenset_string(s):
#     if isinstance(s, str) and s.startswith('frozenset'):
#         return frozenset(ast.literal_eval(s[len('frozenset('):-1]))
#     return s
# df.loc[:, ('Enrichment', 'Context')] = df.loc[:, ('Enrichment', 'Context')].apply(parse_frozenset_string)
# ## 3. Transform "TargetGenes" from string to list of tuples
# def parse_list_of_tuples(s):
#     if isinstance(s, str) and s.startswith('[') and s.endswith(']'):
#         return ast.literal_eval(s)
#     return s  # if it's already a list or something else
# df.loc[:, ('Enrichment', 'TargetGenes')] = df.loc[:, ('Enrichment', 'TargetGenes')].apply(parse_list_of_tuples)

In [20]:
print(len(set(df.index.get_level_values('TF').values)), "regulons were kept, after pruning")

1657 regulons were kept, after pruning


In [21]:
# Look for main regulons I'm looking for
print("ATF4", "ATF4" in df.index.get_level_values('TF').values, sep="\t")

ATF4	True


In [22]:
print("Size of Dataframe:", len(df))
# Check which rows have empty lists in 'TargetGenes'
mask = df[('Enrichment', 'TargetGenes')].apply(lambda x: len(x) == 0)
# Drop those rows
df_filtered = df.loc[~mask]
print("Size of Dataframe:", len(df_filtered))

Size of Dataframe: 612300
Size of Dataframe: 612300


These "modules" are then combined into regulons, by taking the top NES for each TF (for main Motif, and final score of regulon). All genes are bundled together.

In [23]:
# This dataframe can then be converted to regulons.
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

regulons = df2regulons(df_filtered)

end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: Takes ~11mn with n_workers=24, threads_per_worker=1.

Start time: 18:43:08
Create regulons from a dataframe of enriched features.
Additional columns saved: []
End time: 18:54:13
Running time: 0:11:04.124414


In [24]:
regulon_df = pd.DataFrame(index = range(0, len(regulons)), columns = ("Regulon", "TF", "TFTargetGenesCorrelation", "NbMarkers", "Motif", "NES", "Markers"))
for j in range(0, len(regulons)):
    # Fixing order of set
    context = list(regulons[j].context)
    if(context[0].endswith(".png")):
        tmp = context[0]
        context[0] = context[1]
        context[1] = tmp
    # Setting values
    regulon_df["Regulon"].iloc[j] = regulons[j].name
    regulon_df["TF"].iloc[j] = regulons[j].transcription_factor
    regulon_df["TFTargetGenesCorrelation"].iloc[j] = context[0]
    regulon_df["NbMarkers"].iloc[j] = len(set(regulons[j].gene2weight))
    regulon_df["Motif"].iloc[j] = "https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/logos/" + context[1]
    regulon_df["NES"].iloc[j] = regulons[j].score
    regulon_df["Markers"].iloc[j] = ','.join(list(regulons[j].gene2weight))

regulon_df = regulon_df.sort_values(by='NbMarkers', ascending=False)
regulon_df.to_csv(f_regulons_path, index=False, sep = "\t")
regulon_df

Unnamed: 0,Regulon,TF,TFTargetGenesCorrelation,NbMarkers,Motif,NES,Markers
726,PKM(+),PKM,activating,5354,https://resources.aertslab.org/cistarget/motif...,8.106719,"VPS25,FKBP1A,NOSIP,TSSC4,NR1H2,TMEM167A,CCAR2,..."
50,ATF6B(+),ATF6B,activating,5352,https://resources.aertslab.org/cistarget/motif...,7.044777,"FANCC,ING4,RMDN1,TPRKB,CHERP,ICA1L,TAF13,DFFA,..."
652,NPDC1(+),NPDC1,activating,5336,https://resources.aertslab.org/cistarget/motif...,6.432085,"FKBP1A,NOSIP,TPRKB,NR1H2,TMEM167A,CCAR2,CHERP,..."
1639,ZSCAN18(+),ZSCAN18,activating,4975,https://resources.aertslab.org/cistarget/motif...,7.8027,"ZNF528,NPLOC4,KNOP1,COG3,TBRG1,MCRS1,ZKSCAN1,K..."
103,CELF5(+),CELF5,activating,4868,https://resources.aertslab.org/cistarget/motif...,5.526488,"MED13L,LYRM1,TTBK1,GAS6,GMEB1,STK40,CUL2,NR3C1..."
...,...,...,...,...,...,...,...
99,CEBPE(+),CEBPE,activating,181,https://resources.aertslab.org/cistarget/motif...,6.39176,"IKZF1,CD33,CSF2RB,PIK3CG,CEACAM21,PAQR5,TAGLN2..."
256,FEZF1(+),FEZF1,activating,145,https://resources.aertslab.org/cistarget/motif...,5.726437,"GPC6-AS1,MAP4K1,QKI,C1QB,PAPPA-AS1,ADAM20,AJUB..."
59,BARHL1(+),BARHL1,activating,102,https://resources.aertslab.org/cistarget/motif...,6.352232,"TMCO4,CDCA7L,ELOVL2-AS1,FAM167A-AS1,AARD,NFIA-..."
452,ISL2(+),ISL2,activating,89,https://resources.aertslab.org/cistarget/motif...,5.140866,"KRTAP5-11,C11orf52,TMEM213,LCN12,IKZF1,PRTN3,Z..."


In [25]:
# Look for main regulons
print("ATF4", "ATF4" in regulon_df.TF.values, sep="\t")

ATF4	True


## Phase III: Cellular regulon enrichment matrix (aka AUCell)

Characterize the different cells in a single-cell transcriptomics experiment by the enrichment of the regulons. Enrichment of a regulon is measures as AUC of the recovery curve of the genes that define this regulon.

In [26]:
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

auc_mtx = aucell(ex_matrix.transpose(), regulons, num_workers=param_n_workers)

end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: Takes ~57mn with n_workers=24, threads_per_worker=1.

Start time: 18:54:15
End time: 19:51:23
Running time: 0:57:08.591517


In [27]:
auc_mtx.to_csv(f_regulons_aucell_path, sep = "\t")
auc_mtx

Regulon,A1CF(+),ABCF2(+),ABL1(+),ACAA1(+),ACO1(+),ADARB1(+),ADNP(+),ADNP2(+),AEBP2(+),AFF4(+),...,ZSCAN29(+),ZSCAN30(+),ZSCAN31(+),ZSCAN32(+),ZSCAN4(+),ZSCAN5A(+),ZSCAN5B(+),ZSCAN9(+),ZSWIM1(+),ZXDC(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0950_240109_AAACCCAAGACATCCT,0.000000,0.024658,0.091146,0.024932,0.069350,0.052157,0.091689,0.061605,0.050848,0.077550,...,0.014441,0.076373,0.036029,0.003729,0.003734,0.046107,0.008721,0.019971,0.000264,0.078344
0950_240109_AAACCCACACCGTACG,0.000000,0.021395,0.055565,0.023055,0.041589,0.089148,0.065912,0.059249,0.144269,0.078498,...,0.000000,0.075599,0.077817,0.001798,0.000000,0.064721,0.000000,0.001653,0.000000,0.062055
0950_240109_AAACCCAGTAATCAGA,0.000000,0.014502,0.064801,0.021895,0.094315,0.102142,0.086684,0.073949,0.135545,0.076370,...,0.001155,0.074548,0.044383,0.002775,0.000000,0.066575,0.000000,0.002680,0.000000,0.068417
0950_240109_AAACCCATCACTCTTA,0.000000,0.014540,0.059694,0.019924,0.047204,0.095392,0.072920,0.064572,0.152613,0.083096,...,0.000513,0.074198,0.084286,0.002856,0.000000,0.070507,0.000000,0.002861,0.000000,0.063124
0950_240109_AAACGAAAGGTTACCT,0.006155,0.021028,0.047185,0.025566,0.070592,0.075773,0.061729,0.055901,0.108271,0.062131,...,0.004033,0.043920,0.050377,0.005897,0.004318,0.058772,0.001879,0.003744,0.000434,0.049658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6052_200709_TTTGGTTTCGACACTA,0.000000,0.010994,0.067159,0.024528,0.103783,0.089819,0.078756,0.062876,0.125679,0.067219,...,0.004425,0.083836,0.032565,0.002889,0.000000,0.066071,0.000000,0.004196,0.000639,0.078374
6052_200709_TTTGGTTTCGCAGTTA,0.008716,0.027432,0.047810,0.037411,0.042183,0.061342,0.055731,0.048982,0.088286,0.055796,...,0.008468,0.047633,0.045425,0.006315,0.007149,0.055412,0.005595,0.008613,0.000942,0.050324
6052_200709_TTTGTTGCAAGCGATG,0.000000,0.021211,0.051496,0.031873,0.037237,0.093770,0.061524,0.060291,0.152333,0.072751,...,0.000000,0.065064,0.069909,0.000788,0.000000,0.067151,0.000000,0.000644,0.000000,0.056401
6052_200709_TTTGTTGCAAGGTCAG,0.000000,0.018819,0.054186,0.031962,0.062210,0.090237,0.063198,0.059095,0.145092,0.074155,...,0.000000,0.068237,0.063098,0.000899,0.001783,0.077281,0.003254,0.005008,0.000000,0.068445


In [28]:
# Checkpoint to regenerate the object from the file
#auc_mtx = pd.read_csv(f_regulons_aucell_path, sep = "\t", index_col = "Cell")
#auc_mtx.columns.name = "Regulon"
#auc_mtx

In [None]:
start = datetime.now(pytz.timezone('Europe/Paris'))
print("Start time:", start.strftime("%H:%M:%S"))

auc_mtx_bin = binarize(auc_mtx, seed = 42, num_workers=param_n_workers)

end = datetime.now(pytz.timezone('Europe/Paris'))
print("End time:", end.strftime("%H:%M:%S"))
print("Running time:", (end - start))

# Note: 08h40mn46 with num_workers=32 # Note: Takes ~57mn with n_workers=24, threads_per_worker=1.

Start time: 20:02:58


In [None]:
import numpy as np

# columns with all-constant values
constant_cols = [c for c in auc_mtx.columns 
                 if np.allclose(auc_mtx[c].values, auc_mtx[c].values[0])]

print("Number of constant columns:", len(constant_cols))

In [None]:
binarization_thresholds = auc_mtx_bin[1]
binarization_thresholds.to_csv(f_regulons_binarization_thresholds_aucell_path, sep = "\t")
binarization_thresholds

In [None]:
auc_mtx_bin = auc_mtx_bin[0]
auc_mtx_bin.to_csv(f_regulons_binarized_aucell_path, sep = "\t")
auc_mtx_bin

In [None]:
sum(auc_mtx_bin["ATF4(+)"])

In [None]:
binarization_thresholds.loc['ATF4(+)']

In [None]:
sum(auc_mtx["ATF4(+)"] > binarization_thresholds.loc['ATF4(+)'])