# Emma Pan Neuro (Control + ND75KD) - pySCENIC pipeline (CLI version)

**Author:** Vincent Gardeux

**Date Created:** 03/01/2024

# Libraries

In [2]:
# import dependencies
import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
from MulticoreTSNE import MulticoreTSNE as TSNE

# Parameters

In [28]:
# Parameters for scanpy
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)
sc.settings.njobs = 32 # Set maximum number of jobs for Scanpy (also used in GRNBoost2)

# [Input] Loom file to use
f_loom_path_scenic = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated.loom"

# [Input] Transcription factors list (SCENIC step 1: GRNBoost2)
f_tfs = "/data/genome/drosophila_melanogaster/cistopic_flybase_r6.02/allTFs_dmel.txt" # drosophila

# [Output] Adjacency matrix (SCENIC step 1: GRNBoost2)
adj_matrix = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated_adj.csv"

# [Input] Ranking databases (SCENIC step 2-3: cisTarget)
f_db_names = "/data/genome/drosophila_melanogaster/cistopic_flybase_r6.02/mc_v10_clust/dm6_v10_clust.genes_vs_motifs.rankings.feather"

# [Input] Motif databases (SCENIC step 2-3: cisTarget)
f_motif_path = "/data/genome/drosophila_melanogaster/cistopic_flybase_r6.02/mc_v10_clust/motifs-v10nr_clust-nr.flybase-m0.001-o0.0.tbl"

# [Output] Regulons (SCENIC step 2-3: cisTarget)
reg_matrix = "/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis/Pan_neuro_both_reannotated_GFP_curated_reintegrated_reg.csv"


-----
anndata     0.10.3
scanpy      1.9.3
-----
MulticoreTSNE       NA
PIL                 8.2.0
argcomplete         NA
asttokens           NA
astunparse          1.6.3
awkward             2.3.1
awkward_cpp         NA
backcall            0.2.0
cairo               1.21.0
cffi                1.14.5
cloudpickle         2.2.0
colorama            0.4.6
comm                0.1.3
cycler              0.10.0
cython_runtime      NA
cytoolz             0.10.1
dask                2021.06.0
dateutil            2.8.2
debugpy             1.6.7
decorator           4.4.2
dill                0.3.4
entrypoints         0.3
exceptiongroup      1.1.3
executing           1.2.0
fsspec              2023.4.0
future_fstrings     NA
google              NA
h5py                3.7.0
igraph              0.10.4
ipykernel           6.25.0
ipython_genutils    0.2.0
ipywidgets          7.6.3
isal                1.5.2
jedi                0.18.0
joblib              1.2.0
kiwisolver          1.3.1
leidenalg           0.9.

# SCENIC steps

## STEP 1: Gene regulatory network inference, and generation of co-expression modules

### 1.a. GRN inference using the GRNBoost2 algorithm

For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system. We use the counts matrix (without log transformation or further processing) from the loom file we wrote earlier. 

*Output:* List of adjacencies between a TF and its targets.

Run GRNBoost2 algorithm

In [26]:
# Here I run the CLI version from the Docker
!docker run -it --rm --name pySCENIC \
    -v /data/genome/drosophila_melanogaster/cistopic_flybase_r6.02:/cistopic_flybase_r6.02 \
    -v /home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis:/analysis \
    -w /analysis aertslab/pyscenic:0.12.1 \
    pyscenic grn \
    {f_loom_path_scenic.replace("/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/", "/")} \
    {f_tfs.replace("/data/genome/drosophila_melanogaster/", "/")} \
    -o {adj_matrix.replace("/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/", "/")} \
    --num_workers {sc.settings.njobs}

# Note: It takes ~2h to complete


2024-03-06 11:00:06,775 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.

2024-03-06 11:00:23,383 - pyscenic.cli.pyscenic - INFO - Inferring regulatory networks.
preparing dask client
parsing input
creating dask graph
32 partitions
computing dask graph
not shutting down client, client was created externally
finished

2024-03-06 12:44:45,916 - pyscenic.cli.pyscenic - INFO - Writing results to file.


Read in the adjacencies matrix

In [48]:
adjacencies = pd.read_csv(adj_matrix, sep=',')
#adjacencies.to_csv(adj_matrix.replace(".csv", ".tsv"), index=False, sep='\t')
adjacencies

Unnamed: 0,TF,target,importance
0,bi,lncRNA:CR32773,7.387206e+02
1,CG9650,lncRNA:CR44357,5.064458e+02
2,trv,CG34354,4.570765e+02
3,CG34354,trv,4.545700e+02
4,br,Mur2B,4.179730e+02
...,...,...,...
4016712,Dgp-1,Hug,7.302217e-20
4016713,rump,CG8401,3.989110e-20
4016714,crol,Capa,3.451499e-20
4016715,pho,Ilp3,1.136389e-20


## STEP 2-3: Regulon prediction aka cisTarget from CLI

For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.

*Output:* List of adjacencies between a TF and its targets.

### 2.a. Running regulon prediction using cisTarget

Here, we use the --mask_dropouts option, which affects how the correlation between TF and target genes is calculated during module creation. It is important to note that prior to pySCENIC v0.9.18, the default behavior was to mask dropouts, while in v0.9.18 and later, the correlation is performed using the entire set of cells (including those with zero expression). When using the modules_from_adjacencies function directly in python instead of via the command line, the rho_mask_dropouts option can be used to control this.

In [49]:
# Here I run the CLI version from the Docker
!docker run -it --rm --name pySCENIC \
    -v /data/genome/drosophila_melanogaster/cistopic_flybase_r6.02:/cistopic_flybase_r6.02 \
    -v /home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/analysis:/analysis \
    -w /analysis aertslab/pyscenic:0.12.1 \
    pyscenic ctx \
    {adj_matrix.replace("/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/", "/")} \
    {f_db_names.replace("/data/genome/drosophila_melanogaster/", "/")} \
    --annotations_fname {f_motif_path.replace("/data/genome/drosophila_melanogaster/", "/")} \
    --expression_mtx_fname {f_loom_path_scenic.replace("/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/", "/")} \
    --output {reg_matrix.replace("/home/gardeux/SVRAW1/gardeux/2023-04-19_Emma_snRNAseq/", "/")} \
    #--mask_dropouts \
    --num_workers 12
            
# Note: It takes ~2mn to complete


2024-03-06 19:00:57,846 - pyscenic.cli.pyscenic - INFO - Creating modules.

2024-03-06 19:00:59,393 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.

2024-03-06 19:01:21,750 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-03-06 19:03:25,003 - pyscenic.utils - INFO - Creating modules.

2024-03-06 19:06:17,636 - pyscenic.cli.pyscenic - INFO - Loading databases.

2024-03-06 19:06:17,724 - pyscenic.cli.pyscenic - INFO - Calculating regulons.

2024-03-06 19:06:17,725 - pyscenic.prune - INFO - Using 112 workers.

2024-03-06 19:06:17,725 - pyscenic.prune - INFO - Using 112 workers.

2024-03-06 19:06:18,584 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(1): database loaded in memory.

2024-03-06 19:06:18,584 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(1): database loaded in memory.

2024-03-06 19:06:18,746 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_moti


2024-03-06 19:06:19,914 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(7): motif annotations loaded in memory.

2024-03-06 19:06:19,914 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(7): motif annotations loaded in memory.

2024-03-06 19:06:19,926 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(6): motif annotations loaded in memory.

2024-03-06 19:06:19,926 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(6): motif annotations loaded in memory.
terminate called after throwing an instance of 'std::system_error'
  what():  Resource temporarily unavailable
terminate called after throwing an instance of 'std::system_error'
  what():  Resource temporarily unavailable

2024-03-06 19:06:19,984 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(10): motif annotations loaded in memory.

2024-03-06 19:06:19,984 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rank






















































































































































































































































2024-03-06 19:06:35,423 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(12): All regulons derived.

2024-03-06 19:06:35,423 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(12): All regulons derived.

2024-03-06 19:06:35,424 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(12): Done.

2024-03-06 19:06:35,424 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(12): Done.






2024-03-06 19:06:35,654 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(5): All regulons derived.

2024-03-06 19:06:35,654 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(5): All regulons derived.

2024-03-06 19:06:35,655 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(5): Done.

2024-03-06 19:06:35,655 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(5): Done.



2024-03-06 19:06:35,788 - pyscenic.prune - INFO - Worker 












2024-03-06 19:06:38,045 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(14): All regulons derived.

2024-03-06 19:06:38,045 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(14): All regulons derived.

2024-03-06 19:06:38,047 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(14): Done.

2024-03-06 19:06:38,047 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(14): Done.





2024-03-06 19:06:38,863 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(1): All regulons derived.

2024-03-06 19:06:38,863 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(1): All regulons derived.

2024-03-06 19:06:38,869 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(1): Done.

2024-03-06 19:06:38,869 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(1): Done.


2024-03-06 19:06:39,035 - pyscenic.prune - INFO - Worker dm6





2024-03-06 19:06:41,833 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(15): All regulons derived.

2024-03-06 19:06:41,833 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(15): All regulons derived.

2024-03-06 19:06:41,838 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(15): Done.

2024-03-06 19:06:41,838 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(15): Done.


2024-03-06 19:06:42,378 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(6): All regulons derived.

2024-03-06 19:06:42,378 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(6): All regulons derived.

2024-03-06 19:06:42,380 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(6): Done.

2024-03-06 19:06:42,380 - pyscenic.prune - INFO - Worker dm6_v10_clust.genes_vs_motifs.rankings(6): Done.

2024-03-06 19:06:42,763 - pyscenic.prune - INFO - Worker dm6_v10_clust.

In [45]:
regulons = pd.read_csv(reg_matrix, sep=',')
regulons

EmptyDataError: No columns to parse from file

In [39]:
regulons = pd.read_csv(reg_matrix, sep=',')
regulons

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Enrichment,Enrichment.1,Enrichment.2,Enrichment.3,Enrichment.4,Enrichment.5,Enrichment.6,Enrichment.7
0,,,AUC,NES,MotifSimilarityQvalue,OrthologousIdentity,Annotation,Context,TargetGenes,RankAtMax
1,TF,MotifID,,,,,,,,
2,Atf3,stark__TGANTCA,0.05681504924880987,4.666138672306601,3.99369e-06,0.068314,gene is orthologous to ENSG00000162772 in H. s...,"frozenset({'weight>75.0%', 'dm6_v10_clust.gene...","[('CG42748', 1.12564711607615), ('Gclm', 1.078...",457
3,Atf3,metacluster_148.1,0.049568211457107415,3.173405446015911,0.0,0.307087,motif is annotated for orthologous gene ENSG00...,"frozenset({'weight>75.0%', 'dm6_v10_clust.gene...","[('Ac13E', 0.7477660264325211), ('Dh31-R', 1.1...",4988
4,Atf3,metacluster_157.2,0.05377344487208033,4.039616588484887,0.0,0.384,motif is annotated for orthologous gene ENSG00...,"frozenset({'weight>75.0%', 'dm6_v10_clust.gene...","[('CG42784', 1.511065204057486), ('Fim', 3.418...",652
...,...,...,...,...,...,...,...,...,...,...
279,srp,metacluster_117.4,0.06427245428554983,5.9103326489220915,0.0,1.0,gene is directly annotated,"frozenset({'activating', 'dm6_v10_clust.genes_...","[('Ac13E', 15.015438165840512), ('CG17646', 9....",4765
280,srp,metacluster_117.7,0.05220225646142333,3.774395080426182,0.0,1.0,gene is directly annotated,"frozenset({'activating', 'dm6_v10_clust.genes_...","[('Idh', 9.23347691913659), ('mbc', 6.78963765...",4725
281,srp,metacluster_169.2,0.0685249055010841,6.66284478184673,0.0,1.0,gene is directly annotated,"frozenset({'activating', 'dm6_v10_clust.genes_...","[('dpp', 2.5538793133476654), ('for', 24.07611...",4996
282,tj,tfdimers__MD00092,0.09257796501082441,3.113835207142027,2.71523e-07,0.391026,motif similar to tfdimers__MD00162 ('M01721_fo...,"frozenset({'activating', 'dm6_v10_clust.genes_...","[('msi', 4.820953696086392), ('bru2', 4.166019...",635
