In [None]:
# Import dependencies
import os, glob, re, pickle
from functools import partial
from collections import OrderedDict
import operator as op
from cytoolz import compose

import numpy as np
import pandas as pd
import scanpy as sc
import scipy as sp
import loompy as lp
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import anndata

from pyscenic.export import export2loom, add_scenic_metadata
from pyscenic.utils import load_motifs
from pyscenic.transform import df2regulons
from pyscenic.aucell import aucell
from pyscenic.binarization import binarize
from pyscenic.rss import regulon_specificity_scores
from pyscenic.plotting import plot_binarization, plot_rss

from IPython.display import HTML, display

# Print date and time:
import datetime
e = datetime.datetime.now()
print ("Current date and time = %s" % e)

wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks"
os.chdir( wdir )

# folder structures
SCVI_FOLDERNAME = "foetal/results/scVI/"
RESULTS_FOLDERNAME = "foetal/results/Scenic/"
FIGURES_FOLDERNAME = "foetal/results/Scenic/"
AUXILLIARIES_FOLDERNAME = "../files/auxilliaries/"

# Create the output folders. exist_ok=True makes this idempotent and avoids the
# check-then-create race of testing os.path.exists() first.
for output_dir in (RESULTS_FOLDERNAME, FIGURES_FOLDERNAME):
    os.makedirs(output_dir, exist_ok=True)

# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

# Set other settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

In [None]:
DATASET_ID = "foetal"

ADJACENCIES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.adjacencies.tsv'.format(DATASET_ID))
MOTIFS_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.motifs.csv'.format(DATASET_ID))
REGULONS_DAT_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulons.dat'.format(DATASET_ID))
AUCELL_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.auc.csv'.format(DATASET_ID))
BIN_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.bin.csv'.format(DATASET_ID))
THR_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.thresholds.csv'.format(DATASET_ID))
ANNDATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format(DATASET_ID))
LOOM_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.scenic.loom'.format(DATASET_ID))
RES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulon_mat.csv'.format(DATASET_ID))


def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Tighten the layout of `fig` and write it to `folder`/`fname` in SVG format.

    fname:  file name of the image (joined onto `folder`).
    fig:    figure object providing `tight_layout()` and `savefig()`.
    folder: destination directory; defaults to FIGURES_FOLDERNAME.
    """
    fig.tight_layout()
    destination = os.path.join(folder, fname)
    fig.savefig(destination, format='svg')

# Downloading files

Pick and download from https://resources.aertslab.org/cistarget/:

### v9

In [None]:
!wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc9nr/gene_based/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.genes_vs_motifs.rankings.feather

In [None]:
!wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc9nr/gene_based/hg38__refseq-r80__500bp_up_and_100bp_down_tss.mc9nr.genes_vs_motifs.rankings.feather

In [None]:
!wget https://resources.aertslab.org/cistarget/motif2tf/motifs-v9-nr.hgnc-m0.001-o0.0.tbl

In [None]:
!wget https://resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt

### v10

In [None]:
!wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather

In [None]:
!wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather

In [None]:
!wget https://resources.aertslab.org/cistarget/motif2tf/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl

In [None]:
# transcription factors list from https://github.com/aertslab/SCENICprotocol/blob/master/example/
HUMAN_TFS_FNAME = os.path.join(AUXILLIARIES_FOLDERNAME, 'allTFs_hg38.txt')
# Ranking databases. Downloaded from cisTargetDB: https://resources.aertslab.org/cistarget/
RANKING_DBS_FNAMES = list(map(lambda fn: os.path.join(AUXILLIARIES_FOLDERNAME, fn),
                        ['hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather',
                        'hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather']))
# Motif annotations. Downloaded from cisTargetDB: https://resources.aertslab.org/cistarget/
MOTIF_ANNOTATIONS_FNAME = os.path.join(AUXILLIARIES_FOLDERNAME, 'motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl')

# SCENIC Loom File Prep

In [None]:
adata=sc.read_h5ad(os.path.join(SCVI_FOLDERNAME, 'dev_scANVI.h5ad'))
adata

In [None]:
print(adata.var_names)
print(adata.obs_names)

In [None]:
# check counts:
print(adata.X[1:5,1:5])

In [None]:
print(adata.layers['counts'][1:5,1:5])

In [None]:
adata_age_dict = {}
for pcw in adata.obs.age.unique():
    subset = adata[adata.obs['age']==pcw].copy()
    adata_age_dict[pcw] = subset
    print(f'{subset.n_obs}, {subset.n_vars}')
    #loomdata=sc.AnnData(subset.layers['counts'],obs=subset.obs,var=subset.var)
    #loomdata.write_loom(os.path.join(RESULTS_FOLDERNAME, f'dev_scenic_input_{pcw}.loom'))
    
#del loomdata
del subset

In [None]:
adata_age_dict

In [None]:
INPUT_LOOM_FNAME = os.path.join(SCVI_FOLDERNAME, 'dev_scenicinput.loom')

# SCENIC

### Checking parameters for AUCELL (step 3)

"It is important to check that most cells have a substantial fraction of expressed/detected genes in the calculation of the AUC. The following histogram gives an idea of the distribution and allows selection of an appropriate threshold. In this plot, a few thresholds are highlighted, with the number of genes selected shown in red text and the corresponding percentile in parentheses)." "See the relevant section in the R tutorial (https://scenic.aertslab.org/scenic_paper/tutorials/AUCell.html#build-gene-expression-rankings-for-each-cell) for more information."

The information obtained from this plot can be used to set appropriate thresholds for filtering out low-quality cells or genes from downstream analysis. For example, cells with very low numbers of detected genes may be considered low-quality and removed from the dataset. Conversely, genes that are detected in only a small number of cells may also be filtered out as potential noise.

In [None]:
# One histogram per age subset. The loop variable is deliberately NOT called `adata`,
# so the full dataset loaded earlier is not silently replaced by the last subset.
for age, age_adata in adata_age_dict.items():
    # Number of genes with non-zero expression in each cell.
    # np.asarray(...).ravel() flattens the (n_cells, 1) matrix that sparse inputs return
    # and also works for dense arrays and single-cell subsets.
    nGenesDetectedPerCell = pd.Series(np.asarray((age_adata.X > 0).sum(axis=1)).ravel())

    # Candidate thresholds: 1st, 5th, 10th, 50th percentile and the maximum.
    percentiles = nGenesDetectedPerCell.quantile([0.01, 0.05, 0.10, 0.50, 1])
    print(percentiles)

    fig, ax = plt.subplots(1, 1, figsize=(8, 5), dpi=150)
    # histplot replaces the deprecated distplot; 'fd' = Freedman-Diaconis bin width.
    sns.histplot(nGenesDetectedPerCell, bins='fd', kde=False, ax=ax)
    for quantile, x in percentiles.items():
        ax.axvline(x=x, ymin=0, ymax=1, color='red')
        ax.text(x=x, y=ax.get_ylim()[1], s=f'{int(x)} ({quantile*100}%)', color='red', rotation=30, size='x-small',rotation_mode='anchor' )
    ax.set_title(f'# of genes detected per cell ({age})')
    ax.set_xlabel('# of genes')
    ax.set_ylabel('# of cells')
    fig.tight_layout()

The `--auc_threshold` value sets the fraction of each ranking that is used when computing the Area Under the recovery Curve (AUC): only the top-ranked fraction of genes (0.05, i.e. the top 5%, by default) is taken into account when scoring how strongly a gene set is enriched. It is not a minimum AUC score that a gene must reach. A lower value restricts the calculation to fewer top-ranked genes, while a higher value includes more of the ranking. The appropriate value depends on how many genes are detected per cell (see the histogram above) and on the quality of the data.

During the SCENIC workflow, AUCell uses the AUC to score the activity of each regulon in each individual cell. Genes are ranked by expression within a cell, and the AUC measures how strongly the regulon's target genes are concentrated among the top-ranked genes of that cell. A regulon with a high AUC in a cell has many of its target genes among that cell's most highly expressed genes and is therefore considered active in that cell.

In general, it is recommended to use a threshold that strikes a balance between sensitivity and specificity in identifying significant genes. Setting a low threshold value will result in more genes being identified as significant, but may also increase the risk of false positives. Setting a high threshold value will reduce the number of false positives, but may also lead to false negatives and miss important genes.

One approach to selecting an appropriate threshold value is to consider the distribution of AUC scores across all genes in the dataset. If the distribution is bimodal, with one peak representing non-significant genes and another peak representing significant genes, the threshold can be set at the valley between the two peaks. However, if the distribution is unimodal or irregular, other methods can be used.

- By using the setting for `--auc_threshold` of 0.05, we see that 507 genes are selected for the rankings based on the plot above.

# ALL JOBS COMPUTATIONALLY HEAVY, RUN ON CCB CLUSTER 
(JUMBO NODE)

#### STEP 1: Network inference based on GRNBoost2:

In [None]:
!pyscenic grn {INPUT_LOOM_FNAME} {HUMAN_TFS_FNAME} \
-o {ADJACENCIES_FNAME} \
--seed 4000 \
--num_workers 40
-q

alternative used on the CCB cluster:

In [None]:
arboreto_with_multiprocessing.py {INPUT_LOOM_FNAME} {HUMAN_TFS_FNAME} \
-o {ADJACENCIES_FNAME} \
--num_workers 8 \
--method grnboost2 \
--seed 4000

#### STEP2: Regulon prediction (cisTarget):

In [None]:
DBS_PARAM = ' '.join(RANKING_DBS_FNAMES)
DBS_PARAM

In [None]:
!pyscenic ctx {ADJACENCIES_FNAME} {DBS_PARAM} \ 
--annotations_fname {MOTIF_ANNOTATIONS_FNAME} \ 
--expression_mtx_fname {INPUT_LOOM_FNAME} \ 
--output {MOTIFS_FNAME} \
--auc_threshold 0.05

#### STEP3: AUCELL: CHECK APPROPRIATE AUC_THRESHOLD TO SET BEFORE RUNNING (see "Checking parameters for AUCELL" above)

In [None]:
!pyscenic aucell {INPUT_LOOM_FNAME} \
{MOTIF_ANNOTATIONS_FNAME} \ 
--output {LOOM_FNAME} \
--auc_threshold 0.05

# 20w Tendons

# Checking motifs

In [None]:
DATASET_ID = "20w_foetal"

ADJACENCIES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.adjacencies.tsv'.format(DATASET_ID))
MOTIFS_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.motifs.csv'.format(DATASET_ID))
REGULONS_DAT_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulons.dat'.format(DATASET_ID))
AUCELL_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.auc.csv'.format(DATASET_ID))
BIN_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.bin.csv'.format(DATASET_ID))
THR_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.thresholds.csv'.format(DATASET_ID))
ANNDATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format(DATASET_ID))
LOOM_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}_005AUC.scenic.loom'.format(DATASET_ID))
RES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulon_mat.csv'.format(DATASET_ID))

In [None]:
df_motifs = load_motifs(MOTIFS_FNAME)

In [None]:
df_motifs.head()

In [None]:
def derive_regulons(motifs, db_names=('hg38__refseq-r80__10kb_up_and_down_tss.mc9nr',
                                      'hg38__refseq-r80__500bp_up_and_100bp_down_tss.mc9nr')):
    """
    Turn a table of enriched motifs into a list of regulons named after their TF.

    Parameters
    ----------
    motifs : pandas.DataFrame
        Enriched-motif table (e.g. the result of `load_motifs`). It is not modified;
        a copy is filtered instead.
    db_names : tuple of str
        A row is kept only if its Context contains at least one of these names.
        NOTE(review): the defaults name the mc9nr (v9) databases, whereas
        RANKING_DBS_FNAMES in this notebook lists v10 files -- pass matching names
        when calling, otherwise the filter may leave no rows.

    Returns
    -------
    list
        Regulons with at least 10 genes, each renamed to its transcription factor.
    """
    # Work on a copy so the caller's dataframe keeps its original column index; the
    # first column level is dropped only when there is more than one level, so an
    # already-flattened table is accepted as well.
    motifs = motifs.copy()
    if motifs.columns.nlevels > 1:
        motifs.columns = motifs.columns.droplevel(0)

    def contains(*elems):
        def f(context):
            return any(elem in context for elem in elems)
        return f

    # For the creation of regulons we only keep the 10-species databases and the activating modules. We also remove the
    # enriched motifs for the modules that were created using the method 'weight>50.0%' (because these modules are not part
    # of the default settings of modules_from_adjacencies anymore).
    # dtype=bool: the np.bool alias was removed in NumPy 1.24.
    keep_row = (
        np.fromiter(map(compose(op.not_, contains('weight>50.0%')), motifs.Context), dtype=bool)
        & np.fromiter(map(contains(*db_names), motifs.Context), dtype=bool)
        & np.fromiter(map(contains('activating'), motifs.Context), dtype=bool)
    )
    motifs = motifs[keep_row]

    # We build regulons only using enriched motifs with a NES of 3.0 or higher; we take only directly annotated TFs or TF annotated
    # for an orthologous gene into account; and we only keep regulons with at least 10 genes.
    directly_annotated = motifs['Annotation'] == 'gene is directly annotated'
    orthologous_annotated = (motifs['Annotation'].str.startswith('gene is orthologous to')
                             & motifs['Annotation'].str.endswith('which is directly annotated for motif'))
    selected = motifs[(motifs['NES'] >= 3.0) & (directly_annotated | orthologous_annotated)]
    regulons = [r for r in df2regulons(selected) if len(r) >= 10]

    # Rename regulons, i.e. remove suffix.
    return [r.rename(r.transcription_factor) for r in regulons]

In [None]:
#regulons = derive_regulons(df_motifs)
#len(regulons)

In [None]:
regulons = df2regulons(df_motifs)

In [None]:
# Pickle these regulons.
with open(REGULONS_DAT_FNAME, 'wb') as f:
    pickle.dump(regulons, f)

# Visualization of SCENIC's AUC matrix

In [None]:
#lf = lp.connect(os.path.join(RESULTS_FOLDERNAME, '..._005AUC.scenic.loom'))
# Only reading here, so open read-only; the context manager closes the file even if a
# read fails part-way through.
with lp.connect(LOOM_FNAME, mode='r', validate=False) as lf:
    # Per-cell regulon AUC values, indexed by cell ID.
    auc_mtx = pd.DataFrame( lf.ca.RegulonsAUC, index=lf.ca.CellID)
    #exprMat = pd.DataFrame( lf[:,:], index=lf.ra.Gene, columns=lf.ca.CellID)
    # NOTE: this replaces the `regulons` list built with df2regulons above by the
    # loom row attribute.
    regulons = lf.ra.Regulons
    #res=pd.concat([pd.Series(r.tolist(),index=regulons.dtype.names) for r in regulons],axis=1)
    #res.columns=lf.row_attrs["Gene"]
    #res.to_csv(RES_FNAME)

In [None]:
# Pickle these regulons.
# NOTE(review): by this point `regulons` has been reassigned to lf.ra.Regulons (the loom row
# attribute), so this overwrites REGULONS_DAT_FNAME, which was written earlier from the
# df2regulons() list, with a different kind of object -- confirm this is intended.
with open(REGULONS_DAT_FNAME, 'wb') as f:
    pickle.dump(regulons, f)

In [None]:
auc_mtx

In [None]:
sns.clustermap(auc_mtx, figsize=(12,12))

## Regulon Binarization

In [None]:
#%timeit bin_mtx, thresholds = binarize(auc_mtx.iloc[:10, :10]) 
#%timeit bin_mtx, thresholds = binarize(auc_mtx.iloc[:100, :10]) 
#%timeit bin_mtx, thresholds = binarize(auc_mtx.iloc[:1000, :10]) 

#%%time #takes about 40 min
# Binarize the AUC matrix and persist both results so the (slow) computation does not
# have to be repeated; the next cell reloads them from these files.
bin_mtx, thresholds = binarize(auc_mtx)
bin_mtx.to_csv(BIN_MTX_FNAME) 
# The threshold column is named 'threshold' so it can be read back by that name.
thresholds.to_frame().rename(columns={0:'threshold'}).to_csv(THR_FNAME)

In [None]:
bin_mtx = pd.read_csv(BIN_MTX_FNAME, index_col=0)
thresholds = pd.read_csv(THR_FNAME, index_col=0).threshold

In [None]:
bin_mtx

In [None]:
thresholds

In [None]:
# Create heatmap with binarized regulon activity.

def palplot(pal, names, colors=None, size=1):
    """
    Draw a palette as one row of colour swatches with a text label centred on each.

    pal:    sequence of colours, one swatch per entry.
    names:  labels written on the swatches, in order.
    colors: text colour per label; every label is drawn in 'k' when omitted.
    size:   scale factor; the figure is (len(pal) * size) wide and `size` high.

    Returns the created figure.
    """
    n_swatches = len(pal)
    label_colors = colors if colors is not None else ['k'] * n_swatches

    fig, axis = plt.subplots(1, 1, figsize=(n_swatches * size, size))
    # A 1 x n image whose i-th pixel takes the i-th palette colour.
    swatch_row = np.arange(n_swatches).reshape(1, n_swatches)
    axis.imshow(swatch_row,
                cmap=mpl.colors.ListedColormap(list(pal)),
                interpolation="nearest", aspect="auto")

    # Ticks sit on the swatch borders; their labels are blanked out.
    axis.set_xticks(np.arange(n_swatches) - .5)
    axis.set_yticks([-.5, .5])
    axis.set_xticklabels([])
    axis.set_yticklabels([])

    for position, (label, text_color) in enumerate(zip(names, label_colors)):
        axis.text(0.0 + position, 0.0, label, color=text_color,
                  horizontalalignment='center', verticalalignment='center')
    return fig

In [None]:
adata_age_dict['20w'].obs['C_scANVI'].value_counts()

In [None]:
adata = adata_age_dict['20w'].copy()

In [None]:
adata.uns['cell_type_colors']

In [None]:
import matplotlib as mpl
#import colorcet as cc

# Sorted unique cell-type labels found in adata.obs['C_scANVI'].
cats = sorted(list(set(adata.obs['C_scANVI'])))
# NOTE(review): the palette is read from uns['cell_type_colors'] but paired positionally
# with the sorted 'C_scANVI' labels -- confirm both use the same labels in the same order.
colors = sns.color_palette(adata.uns['cell_type_colors'], n_colors=len(cats)) #alt palette = 'bright'
# label -> colour
colorsd = dict( zip( cats, colors ))
# One colour per cell, in adata.obs row order.
colormap = [ colorsd[x] for x in adata.obs['C_scANVI'] ]

# cell ID -> cell-type label
cell_id2cell_type_lut =adata.obs['C_scANVI'].to_dict()
# Two-colour palette, used below as the cmap of the binarized regulon heatmap.
bw_palette = sns.xkcd_palette(["white", "black"])

In [None]:
sns.set()
sns.set_style("whitegrid")
fig = palplot(bw_palette, ['OFF', 'ON'], ['k', 'w'])
savesvg('20w_legend_on_off.svg', fig)

sns.set()
sns.set(font_scale=1.0)
fig = palplot(colors, cats, size=2.5)
savesvg('20w_legend_celltypes.svg', fig)

#sns.set()
#sns.set(font_scale=1.0)
#fig = palplot(sns.color_palette(COLORS), adata.obs['CellType'].dtype.categories, size=3.0)

In [None]:
sns.set()
sns.set(font_scale=1.0)
sns.set_style("ticks", {"xtick.minor.size": 1, "ytick.minor.size": 0.1})
g = sns.clustermap(bin_mtx.T, 
               col_colors=auc_mtx.index.map(cell_id2cell_type_lut).map(colorsd),
               cmap=bw_palette, figsize=(20,20))
g.ax_heatmap.set_xticklabels([])
g.ax_heatmap.set_xticks([])
g.ax_heatmap.set_xlabel('Cells')
g.ax_heatmap.set_ylabel('Regulons')
g.ax_col_colors.set_yticks([0.5])
g.ax_col_colors.set_yticklabels(['Cell Type'])
g.cax.set_visible(False)
g.fig.savefig(os.path.join(FIGURES_FOLDERNAME, 'binarizedregulonheatmap_20w.jpeg'), format='jpeg')

In [None]:
## For subsets:

# genes = set(adata.obs.index)
# bin_mtx_sm = bin_mtx[bin_mtx.index.isin(genes)]
# bin_mtx_sm

# auc_mtx_sm = auc_mtx[auc_mtx.index.isin(genes)]
# auc_mtx_sm

In [None]:
## need to half the dataframe!
bin_mtx_clustered = bin_mtx.T.copy()
bin_mtx_clustered.rename(columns=adata.obs['C_scANVI'].to_dict(), inplace=True)
# bin_mtx_clustered.iloc[g.dendrogram_row.reordered_ind, g.dendrogram_col.reordered_ind].to_excel(os.path.join(RESULTS_FOLDERNAME, 'Binarized_regulon_activity_1.xlsx'))

In [None]:
bin_mtx_clustered

In [None]:
bin_mtx_clustered_fibro = bin_mtx_clustered.loc[:,['COL3A1hi LUMhi Fibroblasts', 'THBS4hi Fibroblasts', 'FMODhi KERAhi Fibroblasts']]
bin_mtx_clustered_fibro.to_excel(os.path.join(RESULTS_FOLDERNAME, 'Binarized_regulon_activity_fibroblasts2.xlsx'))
bin_mtx_clustered_fibro

In [None]:
# figure out how to easily look up highly expr. regulons for each cell type...
# bin_mtx_clustered.loc[:, 'COL3A1hi LUMhi Fibroblasts']

### Clustermap with changing cell colors for grouping similar cells together (optional)

In [None]:
# altered_col = sns.color_palette(
#     ['#0072b2',
#  '#018700',
#  '#ffa52f',
#  '#97ff00', # fibro A
#  '#97ff00',
#  '#97ff00',
#  '#97ff00',
#  '#000000', #grem1 '#b500ff'
#  '#FFFFFF', #hsphi
#  '#afa5ff',
#  '#000000',
#  '#00fdcf', #'#9a6900'
#  '#d60000',
#  '#d3008c',
#  '#fdf490',
#  '#b500ff'])# '#0072b2'
# altered_col

# cats = sorted(list(set(adata.obs['CellType'])))
# colors = altered_col
# colorsd = dict( zip( cats, colors ))
# colormap = [ colorsd[x] for x in adata.obs['CellType'] ]

In [None]:
# sns.set()
# sns.set_style("whitegrid")
# fig = palplot(bw_palette, ['OFF', 'ON'], ['k', 'w'])
# savesvg('legend_on_off.svg', fig)

# sns.set()
# sns.set(font_scale=1.0)
# fig = palplot(colors, cats, size=2.5)
# savesvg('legend_celltypes_noUnassigned_fibrogrouped.svg', fig)

In [None]:
# sns.set()
# sns.set(font_scale=1.0)
# sns.set_style("ticks", {"xtick.minor.size": 1, "ytick.minor.size": 0.1})
# g = sns.clustermap(bin_mtx_sm.T, 
#                col_colors=auc_mtx_sm.index.map(cell_id2cell_type_lut).map(colorsd),
#                cmap=bw_palette, figsize=(20,20))
# g.ax_heatmap.set_xticklabels([])
# g.ax_heatmap.set_xticks([])
# g.ax_heatmap.set_xlabel('Cells')
# g.ax_heatmap.set_ylabel('Regulons')
# g.ax_col_colors.set_yticks([0.5])
# g.ax_col_colors.set_yticklabels(['Cell Type'])
# g.cax.set_visible(False)
# g.fig.savefig(os.path.join(FIGURES_FOLDERNAME, 'binarizedregulonheatmap_fibrogrouped.png'), format='png')

## Plotting binarization of individual regulons:

In [None]:
# fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(8, 4), dpi=100)

# plot_binarization(auc_mtx, 'NR2F2(+)', thresholds['NR2F2(+)'], ax=ax1)
# plot_binarization(auc_mtx, 'SPI1(+)', thresholds['SPI1(+)'], ax=ax2)
# plot_binarization(auc_mtx, 'HOXD8(+)', thresholds['HOXD8(+)'], ax=ax3)
# plot_binarization(auc_mtx, 'ATF3(+)', thresholds['ATF3(+)'], ax=ax4)
# plot_binarization(auc_mtx, 'E2F8(+)', thresholds['E2F8(+)'], ax=ax5)
# plot_binarization(auc_mtx, 'TLL1(+)', thresholds['TLL1(+)'], ax=ax6)
# plot_binarization(auc_mtx, 'PAX3(+)', thresholds['PAX3(+)'], ax=ax7)
# plot_binarization(auc_mtx, 'ZNF713(+)', thresholds['ZNF713(+)'], ax=ax8)

# plt.tight_layout()

## Regulon specificity score (RSS) across predicted cell types

In [None]:
rss = regulon_specificity_scores(auc_mtx, adata.obs['C_scANVI'])
rss

In [None]:
from adjustText import adjust_text

cats = sorted(list(set(adata.obs['C_scANVI'])))

# Panels are laid out 7 per row. At least the original 2 rows are kept, and rows are
# added when there are more than 14 cell types (a fixed 2x7 grid would raise an error).
n_cols = 7
n_rows = max(2, -(-len(cats) // n_cols))  # ceiling division without importing math

fig = plt.figure(figsize=(22, 4 * n_rows)) #(width, height)
for num, c in enumerate(cats, start=1):
    x = rss.T[c]
    ax = fig.add_subplot(n_rows, n_cols, num) #(rows, columns, panel index)
    plot_rss(rss, c, top_n=5, max_n=None, ax=ax)
    # Pad the y-range by 5% of the score spread on both sides.
    y_pad = (x.max() - x.min()) * 0.05
    ax.set_ylim(x.min() - y_pad, x.max() + y_pad)
    for t in ax.texts:
        t.set_fontsize(12)
    ax.set_ylabel('')
    ax.set_xlabel('')
    # Nudge overlapping regulon labels apart and connect them to their points.
    adjust_text(ax.texts, autoalign='xy', ha='right', va='bottom', 
                arrowprops=dict(arrowstyle='-',color='lightgrey'), precision=0.001 )
 
# Shared axis captions for the whole figure.
fig.text(0.5, 0.0, 'Regulon', ha='center', va='center', size='x-large')
fig.text(0.00, 0.5, 'Regulon specificity score (RSS)', ha='center', va='center', rotation='vertical', size='x-large')
plt.tight_layout()
# These rcParams are global: they affect figures created after this cell.
plt.rcParams.update({
    'figure.autolayout': True,
        'figure.titlesize': 'large' ,
        'axes.labelsize': 'medium',
        'axes.titlesize':'large',
        'xtick.labelsize':'medium',
        'ytick.labelsize':'medium'
        })
# Save before showing so the file is written from the fully drawn figure.
savesvg('CellTypeRSS_dev20pcw.svg', fig)
plt.show()

In [None]:
topreg = []
for i,c in enumerate(cats):
    topreg.extend(
        list(rss.T[c].sort_values(ascending=False)[:5].index)
    )
topreg = list(set(topreg))

In [None]:
# Column-wise z-score of every regulon's AUC (population standard deviation, ddof=0).
# Done in one vectorised step instead of inserting columns one at a time, which is
# slower and fragments the frame when there are many regulons.
auc_mtx_Z = (auc_mtx - auc_mtx.mean()) / auc_mtx.std(ddof=0)
#auc_mtx_Z.sort_index(inplace=True)

In [None]:
sns.set(font_scale=1.2)
# Look the colours up by cell ID so they follow the row order of auc_mtx_Z; the
# precomputed `colormap` list is in adata.obs order, which need not match.
row_colors = [colorsd[cell_id2cell_type_lut[cell_id]] for cell_id in auc_mtx_Z.index]
g = sns.clustermap(auc_mtx_Z[topreg], annot=False,  square=False,  linecolor='gray',
    yticklabels=False, xticklabels=True, vmin=-2, vmax=6, row_colors=row_colors,
    cmap="YlGnBu", figsize=(21,16) )
g.cax.set_visible(True)
g.ax_heatmap.set_ylabel('')
g.ax_heatmap.set_xlabel('')
# The format must match the .svg extension (it was 'tiff', producing a TIFF file named .svg).
g.fig.savefig(os.path.join(FIGURES_FOLDERNAME, 'Z-score_heatmap_dev20pcw_top5.svg'), format='svg')

### Creating a regulon h5ad object

In [None]:
# Read-only access is sufficient; the context manager closes the file even on error.
with lp.connect(LOOM_FNAME, mode='r', validate=False) as lf:
    auc_mtx = pd.DataFrame( lf.ca.RegulonsAUC, index=lf.ca.CellID)

In [None]:
adata=sc.read_h5ad(os.path.join(SCVI_FOLDERNAME, 'dev_scANVI.h5ad'))
adata_age_dict = {}
for pcw in adata.obs.age.unique():
    subset = adata[adata.obs['age']==pcw].copy()
    adata_age_dict[pcw] = subset
    print(f'{subset.n_obs}, {subset.n_vars}')
    #loomdata=sc.AnnData(subset.layers['counts'],obs=subset.obs,var=subset.var)
    #loomdata.write_loom(os.path.join(RESULTS_FOLDERNAME, f'dev_scenic_input_{pcw}.loom'))
    
#del loomdata
del subset

In [None]:
adata_genes = adata_age_dict['20w'].copy()
adata_genes.X = adata_genes.layers['log1p_norm'].copy()

In [None]:
adata=sc.AnnData(auc_mtx,
           obs=adata_genes.obs,uns=adata_genes.uns,obsm=adata_genes.obsm)

In [None]:
adata.var_names_make_unique()
adata_genes.var_names_make_unique()

In [None]:
# Keep only regulons whose TF is also a gene in the expression object.
genes = adata_genes.var_names
regulon_tfs = [name.split("(")[0] for name in adata.var_names]
common = genes[genes.isin(regulon_tfs)]

# .copy() turns the subset views into real objects so layers and X can be assigned safely.
adata = adata[:, common + "(+)"].copy()
adata_genes = adata_genes[:, common].copy()

# Keep the unweighted AUC values for reference.
adata.layers["auc_init"] = adata.X.copy()

# Weights = TF expression capped at 1.
weights = adata_genes.X.copy()
weights[weights > 1] = 1
# Multiply element-wise with a dense array; .toarray() is only needed (and only
# available) when the expression matrix is sparse.
weights_dense = weights.toarray() if hasattr(weights, "toarray") else np.asarray(weights)
adata.X = adata.X * weights_dense
adata.layers["weights"] = weights
adata

In [None]:
df_motifs = load_motifs(MOTIFS_FNAME)
regulons = df2regulons(df_motifs)
len(regulons)
#regulons = derive_regulons(df_motifs)

In [None]:
add_scenic_metadata(adata_genes, auc_mtx, regulons)
adata_genes.write_h5ad(ANNDATA_FNAME)

In [None]:
#data.obsm['_scvi_extra_categorical_covs'] = np.array(regdata.obsm['_scvi_extra_categorical_covs'])
#data.obsm['_scvi_extra_continuous_covs'] = np.array(regdata.obsm['_scvi_extra_continuous_covs'])
#data.write_loom("results/scvi_3207/regulons_3207.loom", write_obsm_varm=True)

## AUCell Clustering based on regulon activity

In [None]:
adata_genes = sc.read_h5ad(ANNDATA_FNAME)
adata_genes

In [None]:
#embedding_pca_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['X', 'Y']], index=adata.obs_names)
# columns must be a flat list (a nested list builds a MultiIndex), and the index is taken
# from the same object that supplies the coordinates.
umap = pd.DataFrame(adata_genes.obsm['X_umap'], columns=['X', 'Y'], index=adata_genes.obs_names)

In [None]:
umap.head()

In [None]:
pd.DataFrame(adata_genes.obsm['X_aucell'])

In [None]:
sc.pl.umap(adata_genes)

In [None]:
sc.pp.neighbors(adata_genes, use_rep = 'X_aucell', n_neighbors=30, metric='correlation')
sc.tl.umap(adata_genes)
sc.pl.umap(adata_genes)

In [None]:
sc.tl.tsne(adata_genes, n_pcs = 50, use_rep = 'X_aucell')

In [None]:
sc.pl.umap(adata)

In [None]:
sc.tl.tsne(adata, n_pcs = 50)

In [None]:
sc.pl.embedding(adata_genes, basis='X_tsne', color='C_scANVI', frameon=False, legend_fontsize=7)

In [None]:
sc.pl.embedding(adata,color=["C_scANVI"],
                legend_loc='right margin', 
                title="Cell Type - scANVI TSNE", 
                basis='X_tsne', show=False, frameon=False)
sc.pl.embedding(adata_genes,
                color=["C_scANVI"],
                title="Cell Type - SCENIC AUCell TSNE", 
                basis='X_tsne', legend_loc='right margin', 
                show=False, frameon=False)
#savesvg("ScenicAUCELLandNormalUMAPs_celltype.svg", fig)

In [None]:
#embedding_aucell_tsne = pd.DataFrame(adata_genes.obsm['X_tsne'], columns=[['X', 'Y']], index=adata.obs_names)
# Flat column labels so that embedding_aucell_umap['X'] / ['Y'] are plain Series; the index
# comes from the same object that supplies the coordinates.
embedding_aucell_umap = pd.DataFrame(adata_genes.obsm['X_umap'], columns=['X', 'Y'], index=adata_genes.obs_names)

In [None]:
adata_genes.write_h5ad(ANNDATA_FNAME)

## Z-scoring (Alternative to RSS)
To find cell-type-specific regulators we use a Z-score (i.e. the average AUCell scores for the cells of a given type are standardized using the overall average AUCell scores and their standard deviation).

In [None]:
df_obs = adata_genes.obs

# Numeric obs columns whose name starts with 'Regulon(' (presumably the per-cell scores
# added by add_scenic_metadata -- verify).
signature_column_names = [
    col for col in list(df_obs.select_dtypes('number').columns)
    if col.startswith('Regulon(')
]
df_scores = df_obs[signature_column_names + ['C_scANVI']]

# Z = (mean score within a cell type - overall mean) / overall standard deviation
overall_mean = df_obs[signature_column_names].mean()
overall_std = df_obs[signature_column_names].std()
celltype_mean = df_scores.groupby(by='C_scANVI').mean()
z_scores = (celltype_mean - overall_mean) / overall_std

# Long format: one row per (cell type, regulon) pair.
df_results = z_scores.stack().reset_index().rename(columns={'level_1': 'regulon', 0:'Z'})
# Strip the leading 'Regulon(' and the trailing ')' from the column names.
df_results['regulon'] = [name[8:-1] for name in df_results.regulon]
df_results[(df_results.Z >= 3.0)].sort_values('Z', ascending=False).head()

In [None]:
df_heatmap = pd.pivot_table(data=df_results[df_results.Z >= 3.0].sort_values('Z', ascending=False),
                           index='C_scANVI', columns='regulon', values='Z')
#df_heatmap.drop(index='Myocyte', inplace=True) # We leave out Myocyte because many TFs are highly enriched (becuase of small number of cells).
fig, ax1 = plt.subplots(1, 1, figsize=(15, 10))
sns.heatmap(df_heatmap, ax=ax1, annot=True, fmt=".1f", linewidths=.7, cbar=False, square=True, linecolor='gray', 
            cmap="YlGnBu", annot_kws={"size": 6})
ax1.set_ylabel('')
savesvg('Z-score_regulons_matrixplot_devcombined20pcw.svg', fig)

In [None]:
df_results.sort_values('Z', ascending=False).groupby(by='C_scANVI').head(2)

In [None]:
# Put the AUC rows in the same cell order as df_obs before attaching it as .obs;
# sorting the matrix by cell ID while leaving df_obs in its own order would pair
# AUC values with the metadata of the wrong cells.
aucell_adata = sc.AnnData(X=auc_mtx.loc[df_obs.index])
aucell_adata.obs = df_obs
# Names of the regulons whose score exceeds 3.0.
names = [r.name for r in regulons if r.score > 3.0]
sc.pl.stacked_violin(aucell_adata, names, groupby='cell_type',
          #save=' - GSE115978 - regulons.svg'
                    )

# Check motifs

In [None]:
adjacencies = pd.read_csv(ADJACENCIES_FNAME, index_col=False, sep='\t')
adjacencies.head()

In [None]:
#lf = lp.connect(os.path.join(RESULTS_FOLDERNAME, 'devcombined_05AUC.scenic.loom'))
# LOOM_FNAME is the '*_005AUC.scenic.loom' file. The connection is only read from and is
# left open here because the next cell reads lf.ra.Regulons and then closes it.
lf = lp.connect(LOOM_FNAME, mode='r', validate=False )
# Expression matrix as cells x genes; required by modules_from_adjacencies further down.
exprMat = pd.DataFrame( lf[:,:], index=lf.ra.Gene, columns=lf.ca.CellID).T

In [None]:
# create a dictionary of regulons: regulon name -> list of member genes
regulon_membership = pd.DataFrame(lf.ra.Regulons, index=lf.ra.Gene)
# .items() replaces .iteritems(), which was removed in pandas 2.0.
regulons = {
    name: list(members[members == 1].index.values)
    for name, members in regulon_membership.items()
}
lf.close()

In [None]:
regulons['ALX4(+)']

In [None]:
from pyscenic.utils import modules_from_adjacencies
modules = list(modules_from_adjacencies(adjacencies, exprMat))

In [None]:
tf = 'SOX2'
tf_mods = [ x for x in modules if x.transcription_factor==tf ]

for i,mod in enumerate( tf_mods ):
    print( f'{tf} module {str(i)}: {len(mod.genes)} genes' )
print( f'{tf} regulon: {len(regulons[tf+"(+)"])} genes' )

In [None]:
for i,mod in enumerate( tf_mods ):
    with open( os.path.join(RESULTS_FOLDERNAME, tf+'_module_'+str(i)+'.txt'), 'w') as f:
        for item in mod.genes:
            f.write("%s\n" % item)
            
with open( os.path.join(RESULTS_FOLDERNAME, tf+'_regulon.txt'), 'w') as f:
    for item in regulons[tf+"(+)"]:
        f.write("%s\n" % item)

## Exporting to SCope
https://scope.aertslab.org/#/98d21e95-3a9d-48be-a6e7-32f73f00f30f/*/welcome

In [None]:
adata=sc.read_h5ad(os.path.join(SCVI_FOLDERNAME, '{}.h5ad'.format('combined_scVIintegrated_zinb')))
adata

In [None]:
import json
import zlib
import base64
import umap
from MulticoreTSNE import MulticoreTSNE as TSNE

# scenic output
lf = lp.connect(LOOM_FNAME, mode='r+', validate=False )
meta = json.loads(zlib.decompress(base64.b64decode( lf.attrs.MetaData )))
auc_mtx = pd.DataFrame( lf.ca.RegulonsAUC, index=lf.ca.CellID)
regulons = lf.ra.Regulons

In [None]:
## If you need to run UMAP or TSNE again:
## UMAP
# runUmap = umap.UMAP(n_neighbors=10, min_dist=0.4, metric='correlation').fit_transform
# dr_umap = runUmap( auc_mtx )
# dr_umap = pd.DataFrame(dr_umap, columns=['X', 'Y'], index=auc_mtx.index) #.to_csv( "results/scvi_3207/scenic_umap.txt", sep='\t')

## tSNE
# tsne = TSNE( n_jobs=20 )
# dr_tsne = tsne.fit_transform( auc_mtx )
# dr_tsne = pd.DataFrame(dr_tsne, columns=['X', 'Y'], index=auc_mtx.index) #.to_csv( "results/scvi_3207/scenic_tsne.txt", sep='\t')

# #dr_umap = pd.read_csv( 'scenic_umap.txt', sep='\t', header=0, index_col=0 )
# #dr_tsne = pd.read_csv( 'scenic_tsne.txt', sep='\t', header=0, index_col=0 )

In [None]:
# Fix regulon objects to display properly: insert "_" before "(" in every regulon name,
# e.g. "ATF3(+)" -> "ATF3_(+)".
# regex=False matches "(" literally. The previous pattern '\(' relied on regex matching,
# which is no longer the default in pandas >= 2.0, and is an invalid escape sequence
# in a normal string literal.
auc_mtx.columns = auc_mtx.columns.str.replace('(', '_(', regex=False)
regulons.dtype.names = tuple(name.replace("(", "_(") for name in regulons.dtype.names)
# regulon thresholds: apply the same renaming to each entry's 'regulon' field
rt = meta['regulonThresholds']
for entry in rt:
    entry['regulon'] = entry.get('regulon').replace("(", "_(")

In [None]:
adata.obsm

In [None]:
# Concatenate embeddings (tSNE, UMAP, etc.)
#tsneDF = pd.DataFrame(adata.obsm['X_tsne'], columns=['_X', '_Y'])

# First coordinate (column 0) of every embedding exported to SCope.
# Other obsm keys previously toggled here: 'X_draw_graph_fr', 'X_diffmap', 'X_pca_harmony'.
x_obsm_keys = ['X_scVI', 'latent_gene_encoding', 'X_umap', 'X_pca']
x_coordinates = [
    pd.DataFrame(adata.obsm[key], index=adata.obs.index)[0]
    for key in x_obsm_keys
]
# SCENIC AUC-based UMAP (embedding_aucell_umap is not defined in this cell).
#x_coordinates.append(embedding_aucell_tsne['X'])
x_coordinates.append(embedding_aucell_umap['X'])

# The former placeholder frame built from lf.ca.CellID was overwritten immediately,
# so it is dropped; the result is indexed by the union of the inputs' indexes.
Embeddings_X = pd.concat(x_coordinates, sort=False, axis=1, join='outer')

# Column labels '1'..'N' are the embedding ids declared in metaJson['embeddings'];
# deriving them from the column count keeps them valid when embeddings are toggled.
Embeddings_X.columns = [str(position + 1) for position in range(Embeddings_X.shape[1])]

In [None]:
# Second coordinate (column 1) of every embedding exported to SCope, in the same
# order as the first-coordinate table so that column labels refer to the same embedding.
# Other obsm keys previously toggled here: 'X_draw_graph_fr', 'X_diffmap', 'X_pca_harmony'.
y_obsm_keys = ['X_scVI', 'latent_gene_encoding', 'X_umap', 'X_pca']
y_coordinates = [
    pd.DataFrame(adata.obsm[key], index=adata.obs.index)[1]
    for key in y_obsm_keys
]
# SCENIC AUC-based UMAP (embedding_aucell_umap is not defined in this cell).
#y_coordinates.append(embedding_aucell_tsne['Y'])
y_coordinates.append(embedding_aucell_umap['Y'])

# The former placeholder frame built from lf.ca.CellID was overwritten immediately,
# so it is dropped; the result is indexed by the union of the inputs' indexes.
Embeddings_Y = pd.concat(y_coordinates, sort=False, axis=1, join='outer')

# Column labels '1'..'N' are the embedding ids declared in metaJson['embeddings'].
Embeddings_Y.columns = [str(position + 1) for position in range(Embeddings_Y.shape[1])]

In [None]:
# Metadata
### metadata
# Dictionary that is later serialised into the loom file's "MetaData" attribute.
metaJson = {}

# Embedding ids 1..5 are the column labels of Embeddings_X / Embeddings_Y.
metaJson['embeddings'] = [
#     {
#         "id": -1,
#         "name": f"Scanpy t-SNE (highly variable genes)"
#     },
#     {
#         "id": 1,
#         "name": f"Scanpy Draw Graph (Diffmap base)"
#     },
#     {
#         "id": 2,
#         "name": f"Scanpy Diffusion Map"
#     },
#     {
#         "id": 3,
#         "name": f"Scanpy PCA Harmony"
#     },
    {
        "id": 1,
        "name": "scVI  (highly variable genes)"
    },
    {
        "id": 2,
        "name": "scVI Latent Gene Encoding"
    },
    {
        "id": 3,
        "name": "Scanpy UMAP  (highly variable genes)"
    },
    {
        "id": 4,
        "name": "Scanpy PC1/PC2"
    },
#     {
#         "id": 6,
#         "name": "SCENIC AUC t-SNE"
#     },
    {
        "id": 5,
        "name": "SCENIC AUC UMAP"
    },
]

# Single clustering (id 0); its cluster list is filled in below from 'louvain_0.8'.
metaJson["clusterings"] = [{
            "id": 0,
            "group": "Scanpy",
            "name": "Scanpy louvain default resolution",
            "clusters": [],
        }]

metaJson["metrics"] = [
        {
            "name": "nUMI"
        }, {
            "name": "nGene"
        }, {
            "name": "Percent_mito"
        }
]

# `np.str` was only an alias of the builtin `str` and no longer exists in current
# NumPy releases, so the builtin is used for the string conversion.
metaJson["annotations"] = [
#     {
#         "name": "Louvain_0.8",
#         "values": list(set( adata.obs['louvain_0.8'].astype(str) ))
#     },
#     {
#         "name": "Leiden_0.6",
#         "values": list(set( adata.obs['leiden_0.6'].astype(str) ))
#     },
    {
        # NOTE(review): the column-attribute cell stores this annotation as "CellType",
        # read from adata.obs['CellType'], while this entry is named "Cell Type" and reads
        # adata.obs['cell_type'] -- confirm which obs column exists and whether the names must match.
        "name": "Cell Type",
        "values": list(set( adata.obs['cell_type'].astype(str) ))
    },
    {
        "name": "Phase",
        "values": list(set(adata.obs['phase'].values))
    },
    {
        "name": "Sex",
        "values": list(set(adata.obs['sex'].values))
    },
    {
        "name": "Libbatch",
        "values": list(set(adata.obs['libbatch'].values))
    },
    {
        # NOTE(review): the column-attribute cell uses "Sample" / adata.obs['sample'],
        # this entry uses "Samples" / adata.obs['ident'] -- confirm which is intended.
        "name": "Samples",
        "values": list(set(adata.obs['ident'].values))
    }
]

# SCENIC regulon thresholds:
metaJson["regulonThresholds"] = rt

# One placeholder entry per cluster id 0..max(label); descriptions are numbered from 1.
n_clusters = max(int(label) for label in adata.obs['louvain_0.8']) + 1
metaJson['clusterings'][0]['clusters'].extend(
    {'id': cluster_id, 'description': f'Unannotated Cluster {cluster_id + 1}'}
    for cluster_id in range(n_clusters)
)

# Per-cell cluster labels; the column name "0" is the clustering id declared above.
clusterings = pd.DataFrame()
clusterings["0"] = adata.obs['louvain_0.8'].values.astype(np.int64)

In [None]:
def dfToNamedMatrix(df):
    """Convert a DataFrame into a 1-D NumPy structured ("named") array.

    Every row becomes one record and every column one named field that keeps
    the column's dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. Column labels are converted to field names with
        ``str()`` because NumPy field names must be strings.

    Returns
    -------
    numpy.ndarray
        Structured array of shape ``(len(df),)`` with one field per column.
    """
    dtyp = np.dtype([(str(name), col_dtype) for name, col_dtype in df.dtypes.items()])
    # Read the rows column by column instead of through `df.values`: for frames with
    # mixed dtypes `df.values` first upcasts everything to one common dtype
    # (e.g. int64 -> float64), which silently loses precision for large integers.
    rows = list(df.itertuples(index=False, name=None))
    return np.array(rows, dtype=dtyp)

In [None]:
# Column (per-cell) attributes of the loom file to be written.
# NOTE(review): CellID and the obs-derived attributes follow adata.obs order, while
# RegulonsAUC comes from the pySCENIC loom file -- this assumes both list the cells
# in the same order; confirm.
obs_to_loom = [
    ("nUMI", 'n_counts'),
    ("nGene", 'n_genes'),
    ("Louvain_0.8", 'louvain_0.8'),
    ("Leiden_0.6", 'leiden_0.6'),
    ("CellType", 'CellType'),
    ("Sex", 'sex'),
    ("Sample", 'sample'),
    ("Phase", 'phase'),
    ("Libbatch", 'libbatch'),
    ("Percent_mito", 'pct_counts_mt'),
]

col_attrs = {"CellID": np.array(adata.obs.index)}
for loom_name, obs_column in obs_to_loom:
    col_attrs[loom_name] = np.array(adata.obs[obs_column].values)

# Tables stored as structured arrays.
# NOTE(review): the only definition of tsneDF visible in the embeddings cell is
# commented out -- make sure it exists before running this cell.
col_attrs.update({
    "Embedding": dfToNamedMatrix(tsneDF),
    "Embeddings_X": dfToNamedMatrix(Embeddings_X),
    "Embeddings_Y": dfToNamedMatrix(Embeddings_Y),
    "RegulonsAUC": dfToNamedMatrix(auc_mtx),
    "Clusterings": dfToNamedMatrix(clusterings),
})
col_attrs["ClusterID"] = np.array(adata.obs['CellType'].values)

# Row (per-gene) attributes: gene names from the pySCENIC loom plus the regulon table.
row_attrs = {
    "Gene": lf.ra.Gene,
    "Regulons": regulons,
}

# compress the metadata field:
# JSON text -> zlib -> base64, stored as an ASCII string.
metadata_json = json.dumps(metaJson)
compressed_metadata = base64.b64encode(zlib.compress(metadata_json.encode('ascii'))).decode('ascii')

# File-level attributes.
attrs = {
    "title": "sampleTitle",
    "MetaData": compressed_metadata,
    "Genome": 'hg38',
    "SCopeTreeL1": "",
    "SCopeTreeL2": "",
    "SCopeTreeL3": ""
}

In [None]:
# Write the new loom file: the full main matrix of the pySCENIC loom (lf[:, :]) plus
# the row, column and file attributes assembled above.
# NOTE(review): the output path is hard-coded and is not under RESULTS_FOLDERNAME;
# confirm the target directory exists and that this is the intended location.
scope_loom_path = 'cellranger/results/scope_dev20_cr14812.loom'
lp.create(
    filename=scope_loom_path,
    layers=lf[:, :],
    row_attrs=row_attrs,
    col_attrs=col_attrs,
    file_attrs=attrs,
)
lf.close() # close original pyscenic loom file