In [None]:
# Import dependencies
import os, glob, re, pickle
from functools import partial
from collections import OrderedDict
import operator as op
from cytoolz import compose

import numpy as np
import pandas as pd
import scanpy as sc
import scipy as sp
import loompy as lp
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import anndata

from pyscenic.export import export2loom, add_scenic_metadata
from pyscenic.utils import load_motifs
from pyscenic.transform import df2regulons
from pyscenic.aucell import aucell
from pyscenic.binarization import binarize
from pyscenic.rss import regulon_specificity_scores
from pyscenic.plotting import plot_binarization, plot_rss

from IPython.display import HTML, display

# Print date and time:
import datetime
e = datetime.datetime.now()
print ("Current date and time = %s" % e)

wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks/adult/"
os.chdir( wdir )

# folder structures
INPUT_FOLDERNAME = "annotation/results/"
RESULTS_FOLDERNAME = "scenic/results/"
FIGURES_FOLDERNAME = "scenic/figures/"
AUXILLIARIES_FOLDERNAME = "../../files/auxilliaries/"

# Create the output folders if they are missing. `exist_ok=True` avoids the
# check-then-create race of `os.path.exists` followed by `os.makedirs`.
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)
os.makedirs(FIGURES_FOLDERNAME, exist_ok=True)

# Set folder for saving figures into
sc.settings.figdir = FIGURES_FOLDERNAME

DATASET_ID = "adult_quad"

ADJACENCIES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.adjacencies.tsv'.format(DATASET_ID))
MOTIFS_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.motifs.csv'.format(DATASET_ID))
REGULONS_DAT_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulons.dat'.format(DATASET_ID))
AUCELL_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.auc.csv'.format(DATASET_ID))
BIN_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.bin.csv'.format(DATASET_ID))
THR_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.thresholds.csv'.format(DATASET_ID))
ANNDATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format(DATASET_ID))
LOOM_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.scenic.loom'.format(DATASET_ID))
RES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulon_mat.csv'.format(DATASET_ID))

def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Write a figure to disk as an SVG (vector) image.

    fname:  file name (joined onto ``folder``) of the image to write.
    fig:    figure object; its ``tight_layout()`` is applied before saving.
    folder: destination directory, defaults to ``FIGURES_FOLDERNAME``.
    """
    target_path = os.path.join(folder, fname)
    fig.tight_layout()
    fig.savefig(target_path, format='svg')

# Set other settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

In [None]:
adata = sc.read_h5ad(os.path.join(INPUT_FOLDERNAME, 'adultdev_combined_scANVI.h5ad'))
adata.var_names_make_unique()
adata

In [None]:
adata = adata[adata.obs['grouptype'] == 'Adult_Quad'].copy()
adata

In [None]:
sc.pp.filter_genes(adata, min_counts=50, inplace=True)

In [None]:
print(adata.X[0:10,0:10])
adata.layers["counts"] = adata.X.copy()

In [None]:
sc.pp.normalize_total(adata, target_sum=None, inplace=True)
print(adata.X[0:10,0:10])

In [None]:
sc.pp.log1p(adata)
print(adata.X[0:10, 0:10])
adata.layers["log1p_norm"] = adata.X.copy()

In [None]:
data = pd.read_csv(os.path.join(INPUT_FOLDERNAME, "Barcodes_and_celltypes_for_Alina.csv"), index_col='barcodes')
data

In [None]:
matched_data = adata.obs.join(data['cluster_id'], how='inner')
matched_data

In [None]:
adata.obs.index = adata.obs.index.astype(str)
data.index = data.index.astype(str)

missing_barcodes = set(data.index) - set(adata.obs.index)
if len(missing_barcodes) > 0:
    print("Warning: Some barcodes from the CSV are not present in the AnnData object.")


In [None]:
adata.obs['annotations_new'] = matched_data['cluster_id']

In [None]:
sc.pl.umap(adata, color='annotations_new', frameon=False, legend_loc='on data', 
           legend_fontsize=4,
          save=f'{DATASET_ID}_new_annotations.svg'
          )

In [None]:
adata.obs['annotations_orig_full'] = adata.obs['annotations_orig_full'].apply(lambda x: '_'.join(x.split('_')[2:]) if len(x.split('_')) > 2 else '')

In [None]:
sc.pl.umap(adata, color='leiden_fibros', frameon=False, legend_loc='on data', legend_fontsize=6,
          save=f'{DATASET_ID}_leiden_fibros.svg'
          )

In [None]:
sc.pl.umap(adata, color='tendon_status', frameon=False, legend_loc='on data', legend_fontsize=6,
          save=f'{DATASET_ID}_status.svg'
          )

# Downloading files

Pick and download from https://resources.aertslab.org/cistarget/:

In [None]:
!wget https://resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt

### v10

In [None]:
!wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather

In [None]:
!wget https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather

In [None]:
!wget https://resources.aertslab.org/cistarget/motif2tf/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl

In [None]:
# transcription factors list from https://github.com/aertslab/SCENICprotocol/blob/master/example/
HUMAN_TFS_FNAME = os.path.join(AUXILLIARIES_FOLDERNAME, 'allTFs_hg38.txt')
# Ranking databases. Downloaded from cisTargetDB: https://resources.aertslab.org/cistarget/
RANKING_DBS_FNAMES = list(map(lambda fn: os.path.join(AUXILLIARIES_FOLDERNAME, fn),
                        ['v10/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather',
                        'v10/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather']))
# Motif annotations. Downloaded from cisTargetDB: https://resources.aertslab.org/cistarget/
MOTIF_ANNOTATIONS_FNAME = os.path.join(AUXILLIARIES_FOLDERNAME, 'v10/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl')

In [None]:
RANKING_DBS_FNAMES

# SCENIC Loom File Prep

In [None]:
print(adultquad.var_names)
print(adultquad.obs_names)

In [None]:
print(adultquad.X[1:5,1:5])
print(adultach.X[1:5,1:5])

In [None]:
adatadict = {'adult_quad': adultquad,
            'adult_ach': adultach}
adatadict

In [None]:
# Write one loom file per dataset.
# NOTE: the loop variable is deliberately NOT called `adata`, so that the
# notebook-level `adata` object is not silently overwritten by this loop.
for name, ad in adatadict.items():
    # The loom matrix is written with genes as rows ("Gene" row attribute) and
    # cells as columns ("CellID" column attribute), hence the transpose.
    expr_genes_by_cells = ad.X.transpose()

    # create basic row and column attributes for the loom file:
    row_attrs = {
        "Gene": np.array(ad.var_names),
    }
    col_attrs = {
        "CellID": np.array(ad.obs_names),
        # number of genes with a non-zero value in each cell
        "nGene": np.array(np.sum(expr_genes_by_cells > 0, axis=0)).flatten(),
        # sum of the matrix values in each cell
        "nUMI": np.array(np.sum(expr_genes_by_cells, axis=0)).flatten(),
    }
    lp.create(os.path.join(RESULTS_FOLDERNAME, f'{name}.scenic.loom'), expr_genes_by_cells, row_attrs, col_attrs)

# SCENIC

### Checking parameters for AUCELL (step 3)

"It is important to check that most cells have a substantial fraction of expressed/detected genes in the calculation of the AUC. The following histogram gives an idea of the distribution and allows selection of an appropriate threshold. In this plot, a few thresholds are highlighted, with the number of genes selected shown in red text and the corresponding percentile in parentheses)." "See the relevant section in the R tutorial (https://scenic.aertslab.org/scenic_paper/tutorials/AUCell.html#build-gene-expression-rankings-for-each-cell) for more information."

The information obtained from this plot can be used to set appropriate thresholds for filtering out low-quality cells or genes from downstream analysis. For example, cells with very low numbers of detected genes may be considered low-quality and removed from the dataset. Conversely, genes that are detected in only a small number of cells may also be filtered out as potential noise.

In [None]:
# Histogram of the number of detected genes per cell for every dataset, with
# selected percentiles marked in red.
# NOTE: the loop variable is deliberately NOT called `adata`, so that the
# notebook-level `adata` object is not overwritten by this loop.
for name, ad in adatadict.items():
    # Count, for each cell (row), the genes with a non-zero value.
    nGenesDetectedPerCell = pd.DataFrame(np.sum(ad.X>0, axis=1)).squeeze()

    percentiles = nGenesDetectedPerCell.quantile([0.01, 0.05, 0.10, 0.50, 1])
    print(percentiles)

    fig, ax = plt.subplots(1, 1, figsize=(8, 5), dpi=150)
    # `sns.distplot` is deprecated; `histplot` with stat='count' and kde=False
    # draws the same plain count histogram.
    sns.histplot(nGenesDetectedPerCell, bins='fd', stat='count', kde=False, ax=ax)
    for quantile, n_genes in percentiles.items():
        ax.axvline(x=n_genes, ymin=0, ymax=1, color='red')
        ax.text(x=n_genes, y=ax.get_ylim()[1], s=f'{int(n_genes)} ({quantile*100}%)', color='red', rotation=30, size='x-small', rotation_mode='anchor')
    ax.set_title('# of genes detected per cell')
    ax.set_xlabel('# of genes')
    ax.set_ylabel('# of cells')
    fig.tight_layout()

The `--auc_threshold` value sets the fraction of each cell's ranked genes (ranked by expression, highest first) that is taken into account when calculating the Area Under the recovery Curve (AUC); the default of 0.05 corresponds to the top 5% of the ranking. It is not a minimum AUC score that a gene must reach: it controls how deep into each cell's expression ranking the enrichment of a gene signature is evaluated. A lower value restricts the calculation to the most highly expressed genes of each cell, while a higher value includes more of the ranking. The choice of value will depend on the specific research question and the quality of the data, in particular on how many genes are detected per cell (see the histogram above).

During the SCENIC workflow, the AUC score is used to assess the enrichment of each gene signature (regulon) in each individual cell. The AUC score reflects how strongly the genes of a signature are concentrated among the top-ranked (most highly expressed) genes of that cell. Cells with high AUC scores for a regulon are considered to have that regulon active, and its transcription factor is likely to play an important role in the biological process or pathway represented by the signature.

In general, it is recommended to use an `--auc_threshold` value that strikes a balance between sensitivity and specificity. Setting a high value takes more of each cell's ranking into account, so more signature genes can contribute to the AUC, but lowly expressed or undetected genes (whose ranks are largely arbitrary) add noise and may increase the risk of false positives. Setting a low value reduces this noise, but may also lead to false negatives by missing signature genes that are expressed at moderate levels.

Note that `--auc_threshold` is distinct from the per-regulon activity thresholds that are used later for binarization. For those, one approach to selecting an appropriate threshold value is to consider the distribution of a regulon's AUC scores across all cells in the dataset. If the distribution is bimodal, with one peak representing cells in which the regulon is inactive and another peak representing cells in which it is active, the threshold can be set at the valley between the two peaks. However, if the distribution is unimodal or irregular, other methods can be used.

- By using the setting for `--auc_threshold` of 0.05, we see that 507 genes are selected for the rankings based on the plot above.

### ALL JOBS COMPUTATIONALLY HEAVY, RUN ON CCB CLUSTER 
(JUMBO NODE)

#### STEP 1: Network inference based on GRNBoost2:

In [None]:
!pyscenic grn {LOOM_FNAME} {HUMAN_TFS_FNAME} \
-o {ADJACENCIES_FNAME} \
--seed 4000 \
--num_workers 40

alternative if dask doesn't work:

In [None]:
!arboreto_with_multiprocessing.py {LOOM_FNAME} {HUMAN_TFS_FNAME} \
-o {ADJACENCIES_FNAME} \
--num_workers 8 \
--method grnboost2 \
--seed 4000

#### STEP2: Regulon prediction (cisTarget):

In [None]:
DBS_PARAM = ' '.join(RANKING_DBS_FNAMES)
DBS_PARAM

In [None]:
!pyscenic ctx {ADJACENCIES_FNAME} {DBS_PARAM}\
--annotations_fname {MOTIF_ANNOTATIONS_FNAME}\
--expression_mtx_fname {LOOM_FNAME}\
--output {MOTIFS_FNAME} \
--auc_threshold 0.05

#### STEP3: AUCELL: CHECK APPROPRIATE AUC_THRESHOLD TO SET BEFORE RUNNING (see "Checking parameters for AUCELL" above)

In [None]:
!pyscenic aucell {LOOM_FNAME} {MOTIFS_FNAME}\
--output {LOOM_FNAME_OUT}\
--auc_threshold 0.05

### Checking motifs

In [None]:
df_motifs = load_motifs(MOTIFS_FNAME)
df_motifs.head()

In [None]:
#regulons = df2regulons(df_motifs)
# Pickle these regulons.
#with open(REGULONS_DAT_FNAME, 'wb') as f:
#    pickle.dump(regulons, f)

# SCENIC ANALYSIS

## Visualization of SCENIC's AUC matrix

In [None]:
LOOM_FNAME_OUT = os.path.join(RESULTS_FOLDERNAME, '{}_01AUC.scenic.loom'.format(DATASET_ID))

In [None]:
# Read the AUC matrix and the regulon membership table from the AUCell loom.
# The context manager closes the file even if reading one of the attributes
# raises, so the loom is never left open/locked.
with lp.connect(LOOM_FNAME_OUT, mode='r+', validate=False) as lf:
    # cells x regulons AUC values, indexed by cell ID
    auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)
    #exprMat = pd.DataFrame( lf[:,:], index=lf.ra.Gene, columns=lf.ca.CellID)
    regulons = lf.ra.Regulons
    #res=pd.concat([pd.Series(r.tolist(),index=regulons.dtype.names) for r in regulons],axis=1)
    #res.columns=lf.row_attrs["Gene"]
    #res.to_csv(RES_FNAME)

In [None]:
# Pickle these regulons.
with open(REGULONS_DAT_FNAME, 'wb') as f:
    pickle.dump(regulons, f)

In [None]:
auc_mtx

## Regulon Binarization

In [None]:
%%time 
bin_mtx, thresholds = binarize(auc_mtx)
bin_mtx.to_csv(BIN_MTX_FNAME) 
thresholds.to_frame().rename(columns={0:'threshold'}).to_csv(THR_FNAME)

In [None]:
bin_mtx = pd.read_csv(BIN_MTX_FNAME, index_col=0)
thresholds = pd.read_csv(THR_FNAME, index_col=0).threshold

In [None]:
bin_mtx

In [None]:
thresholds

In [None]:
# Create heatmap with binarized regulon activity.
def palplot(pal, names, colors=None, size=1):
    """
    Draw a palette as a single row of colour swatches, each labelled with a name.

    pal:    sequence of colours, one swatch per entry.
    names:  labels written at the centre of the swatches (paired in order).
    colors: text colour for each label; black ('k') for every label when None.
    size:   height of the figure; its width is ``len(pal) * size``.

    Returns the created matplotlib figure.
    """
    n_swatches = len(pal)
    if colors is None:
        colors = ['k'] * n_swatches

    fig, ax = plt.subplots(1, 1, figsize=(n_swatches * size, size))
    swatch_row = np.arange(n_swatches).reshape(1, n_swatches)
    ax.imshow(swatch_row,
              cmap=mpl.colors.ListedColormap(list(pal)),
              interpolation="nearest", aspect="auto")

    # Ticks sit on the swatch borders; tick labels are blanked out.
    ax.set_xticks(np.arange(n_swatches) - .5)
    ax.set_yticks([-.5, .5])
    ax.set_xticklabels([])
    ax.set_yticklabels([])

    position = 0
    for label, text_color in zip(names, colors):
        ax.text(0.0+position, 0.0, label, color=text_color,
                horizontalalignment='center', verticalalignment='center')
        position += 1
    return fig

In [None]:
adata = sc.read_h5ad(os.path.join(INPUT_FOLDERNAME, 'adultdev_cellhint.h5ad'))
adata.var_names_make_unique()
adata

In [None]:
# Keep only the healthy and ruptured adult quadriceps cells. `.copy()` turns the
# AnnData view into an independent object (as done for the earlier subset in
# this notebook), so later cells do not operate on a view of the full dataset.
adata = adata[adata.obs['grouptype'].isin(['Adult_Quad_Healthy', 'Adult_Quad_Rupture'])].copy()
adata

In [None]:
adata.uns['annotations_upd2_colors']

In [None]:
unscolors = 'annotations_upd2_colors'
groupcolors = 'annotations_upd2'

In [None]:
import matplotlib as mpl
#import colorcet as cc

cats = sorted(list(set(adata.obs[groupcolors])))
colors = sns.color_palette(adata.uns[unscolors], n_colors=len(cats)) #alt palette = 'bright'
colorsd = dict( zip( cats, colors ))
colormap = [ colorsd[x] for x in adata.obs[groupcolors] ]

cell_id2cell_type_lut =adata.obs[groupcolors].to_dict()
bw_palette = sns.xkcd_palette(["white", "black"])

In [None]:
sns.set()
sns.set_style("whitegrid")
fig = palplot(bw_palette, ['OFF', 'ON'], ['k', 'w'])
savesvg(f'{DATASET_ID}_legend_on_off.svg', fig)

sns.set()
sns.set(font_scale=1.0)
fig = palplot(colors, cats, size=2.5)
savesvg(f'{DATASET_ID}_legend_celltypes.svg', fig)

#sns.set()
#sns.set(font_scale=1.0)
#fig = palplot(sns.color_palette(COLORS), adata.obs['CellType'].dtype.categories, size=3.0)

In [None]:
# Convert the index of auc_mtx to string, to ensure all operations are on strings
auc_mtx.index = auc_mtx.index.map(str)

# Map the index to cell types using a Series for better control and avoid implicit MultiIndex conversion
cell_types_series = pd.Series(auc_mtx.index.map(cell_id2cell_type_lut), index=auc_mtx.index)

# Now map the cell types to colors, handling missing keys properly
mapped_colors = cell_types_series.map(lambda x: colorsd.get(x, (1, 1, 1)))  # Using a default color of white for missing keys

# Report cells whose label has no entry in `colorsd` (they were given the white
# placeholder). The check is done on the labels rather than by searching for the
# white tuple: testing a tuple with `in` against an object array is unreliable,
# and `Series.iteritems()` no longer exists in pandas >= 2.0.
missing_mask = ~cell_types_series.isin(list(colorsd))
if missing_mask.any():
    missing_keys = cell_types_series.index[missing_mask].tolist()
    print("Missing keys for these index entries:", missing_keys)

In [None]:
sns.set()
sns.set(font_scale=1.0)
sns.set_style("ticks", {"xtick.minor.size": 1, "ytick.minor.size": 0.1})
g = sns.clustermap(bin_mtx.T, 
               col_colors=mapped_colors,
               cmap=bw_palette, figsize=(20,20))
g.ax_heatmap.set_xticklabels([])
g.ax_heatmap.set_xticks([])
g.ax_heatmap.set_xlabel('Cells')
g.ax_heatmap.set_ylabel('Regulons')
g.ax_col_colors.set_yticks([0.5])
g.ax_col_colors.set_yticklabels(['Cell Type'])
g.cax.set_visible(False)
g.fig.savefig(os.path.join(FIGURES_FOLDERNAME, f'{DATASET_ID}_binarizedregulonheatmap_upd.jpeg'), format='jpeg')

In [None]:
bin_mtx_healthy = bin_mtx.loc[adata.obs[adata.obs['tendon_status'] == 'Healthy'].index]
bin_mtx_ruptured = bin_mtx.loc[adata.obs[adata.obs['tendon_status'] == 'Rupture'].index]
bin_mtx_ruptured

In [None]:
sns.set()
sns.set(font_scale=1.0)
sns.set_style("ticks", {"xtick.minor.size": 1, "ytick.minor.size": 0.1})
g = sns.clustermap(bin_mtx_healthy.T, 
               col_colors=mapped_colors,
               cmap=bw_palette, figsize=(20,20))
g.ax_heatmap.set_xticklabels([])
g.ax_heatmap.set_xticks([])
g.ax_heatmap.set_xlabel('Cells')
g.ax_heatmap.set_ylabel('Regulons')
g.ax_col_colors.set_yticks([0.5])
g.ax_col_colors.set_yticklabels(['Cell Type'])
g.cax.set_visible(False)
g.fig.savefig(os.path.join(FIGURES_FOLDERNAME, f'{DATASET_ID}_binarizedregulonheatmap_upd_healthyonly.jpeg'), format='jpeg')

In [None]:
sns.set()
sns.set(font_scale=1.0)
sns.set_style("ticks", {"xtick.minor.size": 1, "ytick.minor.size": 0.1})
g = sns.clustermap(bin_mtx_ruptured.T, 
               col_colors=mapped_colors,
               cmap=bw_palette, figsize=(20,20))
g.ax_heatmap.set_xticklabels([])
g.ax_heatmap.set_xticks([])
g.ax_heatmap.set_xlabel('Cells')
g.ax_heatmap.set_ylabel('Regulons')
g.ax_col_colors.set_yticks([0.5])
g.ax_col_colors.set_yticklabels(['Cell Type'])
g.cax.set_visible(False)
g.fig.savefig(os.path.join(FIGURES_FOLDERNAME, f'{DATASET_ID}_binarizedregulonheatmap_upd_ruptureonly.jpeg'), format='jpeg')

In [None]:
# Export the binarised regulon activity separately for each tendon status.
# (Previously the healthy matrix was written to both files.)
bin_mtx_healthy.to_csv(os.path.join(RESULTS_FOLDERNAME, 'quad_binarised_healthy.csv'))
bin_mtx_ruptured.to_csv(os.path.join(RESULTS_FOLDERNAME, 'quad_binarised_ruptured.csv'))

In [None]:
cats = sorted(list(set(adata.obs['tendon_status'])))
colors = sns.color_palette(adata.uns['tendon_status_colors'], n_colors=len(cats)) #alt palette = 'bright'
colorsd = dict( zip( cats, colors ))
colormap = [ colorsd[x] for x in adata.obs['tendon_status'] ]

cell_id2cell_type_lut =adata.obs['tendon_status'].to_dict()
bw_palette = sns.xkcd_palette(["white", "black"])

In [None]:
sns.set()
sns.set(font_scale=1.0)
fig = palplot(colors, cats, size=2.5)
savesvg(f'{DATASET_ID}_legend_tendon_status.svg', fig)

In [None]:
# Convert the index of auc_mtx to string, to ensure all operations are on strings
auc_mtx.index = auc_mtx.index.map(str)

# Map the index to the current grouping labels using a Series for better control and avoid implicit MultiIndex conversion
cell_types_series = pd.Series(auc_mtx.index.map(cell_id2cell_type_lut), index=auc_mtx.index)

# Now map the labels to colors, handling missing keys properly
mapped_colors = cell_types_series.map(lambda x: colorsd.get(x, (1, 1, 1)))  # Using a default color of white for missing keys

# Report cells whose label has no entry in `colorsd` (they were given the white
# placeholder). The check is done on the labels rather than by searching for the
# white tuple: testing a tuple with `in` against an object array is unreliable,
# and `Series.iteritems()` no longer exists in pandas >= 2.0.
missing_mask = ~cell_types_series.isin(list(colorsd))
if missing_mask.any():
    missing_keys = cell_types_series.index[missing_mask].tolist()
    print("Missing keys for these index entries:", missing_keys)

In [None]:
sns.set()
sns.set(font_scale=1.0)
sns.set_style("ticks", {"xtick.minor.size": 1, "ytick.minor.size": 0.1})
g = sns.clustermap(bin_mtx.T, 
               col_colors=mapped_colors,
               cmap=bw_palette, figsize=(20,20))
g.ax_heatmap.set_xticklabels([])
g.ax_heatmap.set_xticks([])
g.ax_heatmap.set_xlabel('Cells')
g.ax_heatmap.set_ylabel('Regulons')
g.ax_col_colors.set_yticks([0.5])
g.ax_col_colors.set_yticklabels(['Ruptured'])
g.cax.set_visible(False)
g.fig.savefig(os.path.join(FIGURES_FOLDERNAME, f'{DATASET_ID}_binarizedregulonheatmap_tendon_status.jpeg'), format='jpeg')

In [None]:
# Regulons x cells matrix whose column labels are replaced by each cell's
# annotation; cells without an annotation keep their barcode as label.
cell_id_to_type = adata.obs['annotations_upd2'].to_dict()
bin_mtx_clustered = bin_mtx.T.copy()
bin_mtx_clustered.rename(columns=cell_id_to_type, inplace=True)

# Number of cells per cell type in which each regulon is ON.
# (`groupby(axis=1)` is deprecated; grouping the transposed frame is equivalent.)
regulon_presence_summary = bin_mtx_clustered.T.groupby(level=0).sum().T

# Drop columns that are not cell types, i.e. barcodes that had no annotation,
# instead of relying on a hard-coded list of barcodes.
known_cell_types = set(cell_id_to_type.values())
unannotated = [label for label in regulon_presence_summary.columns if label not in known_cell_types]
if unannotated:
    print("Dropping cells without an annotation:", unannotated)
regulon_presence_summary = regulon_presence_summary.drop(columns=unannotated)
regulon_presence_summary

In [None]:
bin_mtx_clustered.to_excel(os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}_binarized_regulon_activity_newannotations.xlsx'))

In [None]:
regulon_presence_summary.to_csv(os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}_binarized_regulon_summaryperct.csv'))

In [None]:
def OP_regulon_clustermap(regulon_presence_summary, condition, normalization_type=None, size=(12, 25)):
    """
    Draw, save and show a hierarchically clustered heatmap of regulon values.

    Row labels are split between the left and right y-axes (alternating rows)
    so that neighbouring labels do not overlap.

    regulon_presence_summary: DataFrame passed to ``sns.clustermap``; its index
        supplies the row labels (regulons), its columns the x-axis entries.
    condition: tag appended to the output file name
        ``{DATASET_ID}_activated_regulon_abundance_clustermap_{condition}.svg``
        in ``FIGURES_FOLDERNAME``. Path separators in the tag are replaced by
        ``_`` so that labels such as ``'A/B'`` do not point into a
        non-existent sub-folder.
    normalization_type: ``'z-score'`` selects a diverging colormap centred on
        0; any other value uses the sequential ``'Oranges'`` colormap.
    size: figure size handed to ``sns.clustermap``.
    """
    # Determine the colormap based on normalization type
    if normalization_type == 'z-score':
        cmap, center_val = 'RdBu_r', 0    # Blue-White-Red, centred on zero
    else:
        cmap, center_val = 'Oranges', None  # Default colormap

    # Create the clustermap
    g = sns.clustermap(regulon_presence_summary, method='average', metric='euclidean',
                       cmap=cmap, center=center_val, linewidths=.5, figsize=size,
                       row_cluster=True, col_cluster=True,
                       cbar_kws={"shrink": .5, "pad": 0.05})

    # Adjust color bar position
    g.cax.set_position([1, .2, .02, .45])

    # Row labels in the order produced by the clustering
    row_order = g.dendrogram_row.reordered_ind
    regulons = regulon_presence_summary.index[row_order]

    g.ax_row_dendrogram.set_visible(False)

    # Tick positions pointing at the centre of each row
    centered_ticks = [x + 0.5 for x in range(len(regulons))]

    # Every other label goes on the primary y-axis ...
    g.ax_heatmap.set_yticks(centered_ticks[::2])
    g.ax_heatmap.set_yticklabels(regulons[::2], rotation=0, fontsize=10)

    # ... and the remaining labels on a secondary y-axis on the right side
    ax2 = g.ax_heatmap.twinx()
    ax2.set_yticks(centered_ticks[1::2])
    ax2.set_yticklabels(regulons[1::2], rotation=0, fontsize=10)
    ax2.set_ylim(g.ax_heatmap.get_ylim())  # keep both axes aligned

    # Ensure labels are visible and adjust their alignment
    ax2.yaxis.set_label_position("right")
    ax2.yaxis.tick_right()

    # Rotate column labels for readability
    plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=90, fontsize=10)

    # Adding labels to the axes
    g.ax_heatmap.set_ylabel('Regulons', fontsize=12, labelpad=10)
    g.ax_heatmap.yaxis.set_label_position('left')  # Explicitly position y-axis label on the left

    plt.grid(False)

    g.ax_heatmap.set_xlabel('Cell Types', fontsize=12)

    # A path separator inside the tag would be interpreted as a directory.
    safe_condition = str(condition).replace('/', '_').replace(os.sep, '_')
    g.savefig(os.path.join(FIGURES_FOLDERNAME, f'{DATASET_ID}_activated_regulon_abundance_clustermap_{safe_condition}.svg'))

    # Show the plot
    plt.show()

In [None]:
## PROPORTIONAL NORMALISATION:
cell_counts = adata.obs['annotations_upd2'].value_counts().to_dict()
normalized_regulon_presence = regulon_presence_summary.copy()
for column in normalized_regulon_presence.columns:
    normalized_regulon_presence[column] /= cell_counts[column]
normalized_regulon_presence

In [None]:
# Get the list of regulon names
regulons = normalized_regulon_presence.index.tolist()

# Separate odd and even indexed regulons
odd_labels = regulons[0::2]   # Odd-indexed regulons
even_labels = regulons[1::2]  # Even-indexed regulons

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(12, 20))  # Adjust size as needed

# Create a heatmap
sns.heatmap(normalized_regulon_presence, annot=False, 
            cmap='Oranges', linewidths=.5, 
            cbar_kws={"shrink": 0.5, "pad": 0.15}, ax=ax)

# Calculate midpoints for the odd and even indexed rows
odd_ticks = [i + 0.5 for i in range(len(regulons)) if i % 2 == 0]
even_ticks = [i + 0.5 for i in range(len(regulons)) if i % 2 == 1]

# Set regulon names on the primary y-axis (left)
ax.set_yticks(odd_ticks)
ax.set_yticklabels(odd_labels, rotation=0, fontsize=10)

# Create a secondary y-axis for the right side
ax2 = ax.twinx()

# Set regulon names on the secondary y-axis (right)
ax2.set_yticks(even_ticks)
ax2.set_yticklabels(even_labels, rotation=0, fontsize=10)
ax2.set_ylim(ax.get_ylim())  # Ensure the secondary y-axis aligns with the primary y-axis

# Correct alignment of right labels
ax2.yaxis.set_label_position("right")  # Ensure labels appear on the right side
ax2.yaxis.tick_right()

# Add labels and title
ax.set_title('Activated Regulon Abundance Across Cell Types')
ax.set_xlabel('Cell Types')
ax.set_ylabel('Regulons')
#ax2.set_ylabel('Regulons', rotation=270, va="bottom")

savesvg(f'{DATASET_ID}_activated_regulon_abundance_heatmap.svg', fig)

# Show the plot
plt.grid(False)
plt.show()

In [None]:
OP_regulon_clustermap(normalized_regulon_presence, 'All')

In [None]:
# Z-scoring:
normalized_regulon_presence_z = (normalized_regulon_presence - normalized_regulon_presence.mean()) / normalized_regulon_presence.std()
normalized_regulon_presence_z

In [None]:
# Get the list of regulon names
regulons = normalized_regulon_presence_z.index.tolist()

# Separate odd and even indexed regulons
odd_labels = regulons[0::2]   # Odd-indexed regulons
even_labels = regulons[1::2]  # Even-indexed regulons

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(12, 20))  # Adjust size as needed

# Create a heatmap
sns.heatmap(normalized_regulon_presence_z, annot=False, 
            cmap='RdBu_r', center=0, linewidths=.5, 
            cbar_kws={"shrink": 0.5, "pad": 0.15}, ax=ax)

# Calculate midpoints for the odd and even indexed rows
odd_ticks = [i + 0.5 for i in range(len(regulons)) if i % 2 == 0]
even_ticks = [i + 0.5 for i in range(len(regulons)) if i % 2 == 1]

# Set regulon names on the primary y-axis (left)
ax.set_yticks(odd_ticks)
ax.set_yticklabels(odd_labels, rotation=0, fontsize=10)

# Create a secondary y-axis for the right side
ax2 = ax.twinx()

# Set regulon names on the secondary y-axis (right)
ax2.set_yticks(even_ticks)
ax2.set_yticklabels(even_labels, rotation=0, fontsize=10)
ax2.set_ylim(ax.get_ylim())  # Ensure the secondary y-axis aligns with the primary y-axis

# Correct alignment of right labels
ax2.yaxis.set_label_position("right")  # Ensure labels appear on the right side
ax2.yaxis.tick_right()

# Add labels and title
ax.set_title('Activated Regulon Abundance Across Cell Types')
ax.set_xlabel('Cell Types')
ax.set_ylabel('Regulons')
#ax2.set_ylabel('Regulons', rotation=270, va="bottom")

plt.grid(False)
savesvg(f'{DATASET_ID}_activated_regulon_abundance_heatmap_zscore.svg', fig)

# Show the plot
plt.show()

In [None]:
OP_regulon_clustermap(normalized_regulon_presence_z, 'All-Z', 'z-score')

In [None]:
# Regulons x healthy cells, with each cell's barcode replaced by its annotation.
bin_mtx_clustered = bin_mtx_healthy.T.copy()
bin_mtx_clustered.rename(columns=adata.obs['annotations_upd2'].to_dict(), inplace=True)

# Number of *healthy* cells per cell type. The proportion must be taken over the
# cells of this subset; dividing by the counts of all cells (healthy + ruptured)
# would underestimate the fraction of cells with an active regulon.
cells_per_type = bin_mtx_clustered.columns.value_counts()

# Cells per cell type with the regulon ON, then the fraction of cells.
# (`groupby(axis=1)` is deprecated; grouping the transposed frame is equivalent.)
regulon_presence_summary = bin_mtx_clustered.T.groupby(level=0).sum().T
normalized_regulon_presence = regulon_presence_summary.div(cells_per_type, axis='columns')

In [None]:
OP_regulon_clustermap(normalized_regulon_presence, 'Healthy')

In [None]:
normalized_regulon_presence_z = (normalized_regulon_presence - normalized_regulon_presence.mean()) / normalized_regulon_presence.std()
normalized_regulon_presence_z

In [None]:
OP_regulon_clustermap(normalized_regulon_presence_z, 'Healthy_Z', 'z-score')

In [None]:
# Regulons x ruptured cells, with each cell's barcode replaced by its annotation.
bin_mtx_clustered = bin_mtx_ruptured.T.copy()
bin_mtx_clustered.rename(columns=adata.obs['annotations_upd2'].to_dict(), inplace=True)

# Number of *ruptured* cells per cell type. The proportion must be taken over the
# cells of this subset; dividing by the counts of all cells (healthy + ruptured)
# would underestimate the fraction of cells with an active regulon.
cells_per_type = bin_mtx_clustered.columns.value_counts()

# Cells per cell type with the regulon ON, then the fraction of cells.
# (`groupby(axis=1)` is deprecated; grouping the transposed frame is equivalent.)
regulon_presence_summary = bin_mtx_clustered.T.groupby(level=0).sum().T
normalized_regulon_presence = regulon_presence_summary.div(cells_per_type, axis='columns')

In [None]:
normalized_regulon_presence

In [None]:
OP_regulon_clustermap(normalized_regulon_presence, 'Ruptured')

In [None]:
normalized_regulon_presence_z = (normalized_regulon_presence - normalized_regulon_presence.mean()) / normalized_regulon_presence.std()
normalized_regulon_presence_z

In [None]:
OP_regulon_clustermap(normalized_regulon_presence_z, 'Ruptured_Z', 'z-score')

In [None]:
## Plotting binarization of individual regulons:

# fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(8, 4), dpi=100)
# plot_binarization(auc_mtx, 'NR2F2(+)', thresholds['NR2F2(+)'], ax=ax1)
# plot_binarization(auc_mtx, 'SPI1(+)', thresholds['SPI1(+)'], ax=ax2)
# plot_binarization(auc_mtx, 'HOXD8(+)', thresholds['HOXD8(+)'], ax=ax3)
# plot_binarization(auc_mtx, 'ATF3(+)', thresholds['ATF3(+)'], ax=ax4)
# plot_binarization(auc_mtx, 'E2F8(+)', thresholds['E2F8(+)'], ax=ax5)
# plot_binarization(auc_mtx, 'TLL1(+)', thresholds['TLL1(+)'], ax=ax6)
# plot_binarization(auc_mtx, 'PAX3(+)', thresholds['PAX3(+)'], ax=ax7)
# plot_binarization(auc_mtx, 'ZNF713(+)', thresholds['ZNF713(+)'], ax=ax8)
# plt.tight_layout()

# Comparing regulons across conditions

In [None]:
# Regulons x ruptured cells, with each cell's barcode replaced by its annotation.
bin_mtx_clustered = bin_mtx_ruptured.T.copy()
bin_mtx_clustered.rename(columns=adata.obs['annotations_upd2'].to_dict(), inplace=True)

# Fraction of ruptured cells of each cell type in which the regulon is ON.
# The denominator is the number of cells of that type in this subset.
# (`groupby(axis=1)` is deprecated; grouping the transposed frame is equivalent.)
cells_per_type = bin_mtx_clustered.columns.value_counts()
regulon_presence_summary = bin_mtx_clustered.T.groupby(level=0).sum().T
normalized_regulon_presence = regulon_presence_summary.div(cells_per_type, axis='columns')

# Z-score within each cell type (column), i.e. across regulons.
normalized_regulon_presence_z_ruptured = (normalized_regulon_presence - normalized_regulon_presence.mean()) / normalized_regulon_presence.std()
normalized_regulon_presence_z_ruptured

In [None]:
# Regulons x healthy cells, with each cell's barcode replaced by its annotation.
bin_mtx_clustered = bin_mtx_healthy.T.copy()
bin_mtx_clustered.rename(columns=adata.obs['annotations_upd2'].to_dict(), inplace=True)

# Fraction of healthy cells of each cell type in which the regulon is ON.
# The denominator is the number of cells of that type in this subset.
# (`groupby(axis=1)` is deprecated; grouping the transposed frame is equivalent.)
cells_per_type = bin_mtx_clustered.columns.value_counts()
regulon_presence_summary = bin_mtx_clustered.T.groupby(level=0).sum().T
normalized_regulon_presence = regulon_presence_summary.div(cells_per_type, axis='columns')

# Z-score within each cell type (column), i.e. across regulons.
normalized_regulon_presence_z_healthy = (normalized_regulon_presence - normalized_regulon_presence.mean()) / normalized_regulon_presence.std()
normalized_regulon_presence_z_healthy

In [None]:
# Get the overlapping columns
columns_healthy = set(normalized_regulon_presence_z_healthy.columns)
columns_ruptured = set(normalized_regulon_presence_z_ruptured.columns)
overlapping_columns = columns_healthy.intersection(columns_ruptured)
len(overlapping_columns)

In [None]:
overlapping_columns

In [None]:
for ct in overlapping_columns:
    try:
        print(f"Processing {ct}...")
        fibroblast_healthy = normalized_regulon_presence_z_healthy[ct].copy()
        fibroblast_ruptured = normalized_regulon_presence_z_ruptured[ct].copy()
        fibroblast_healthy.name = 'Healthy'
        fibroblast_ruptured.name = 'Ruptured'
        combined_data = pd.concat([fibroblast_healthy, fibroblast_ruptured], axis=1)
        OP_regulon_clustermap(combined_data, ct, 'z-score', size=(6,20))
    except Exception as e:
        print(f"Failed to process {ct}: {e}")

In [None]:
normalized_regulon_presence_z_healthy.columns

In [None]:
normalized_regulon_presence_z_healthy.to_csv(os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}_zscore_byct_healthy.csv'))
normalized_regulon_presence_z_ruptured.to_csv(os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}_zscore_byct_ruptured.csv'))

In [None]:
for ct in ['MERTKhi LYVE1hi macrophages', 'Lymphatic ECs', 'T cells', 'Granulocytes', 'vSMCs', 'FBLNhi fibroblasts']:
    try:
        print(f"Processing {ct}...")
        fibroblast_healthy = normalized_regulon_presence_z_healthy[ct].copy()
        fibroblast_ruptured = normalized_regulon_presence_z_ruptured[ct].copy()
        fibroblast_healthy.name = 'Healthy'
        fibroblast_ruptured.name = 'Ruptured'
        combined_data = pd.concat([fibroblast_healthy, fibroblast_ruptured], axis=1)
        OP_regulon_clustermap(combined_data, ct, 'z-score', size=(6,20))
    except Exception as e:
        print(f"Failed to process {ct}: {e}")

In [None]:
for ct in ['VCANhi DCs/monocytes']:
    try:
        print(f"Processing {ct}...")
        fibroblast_healthy = normalized_regulon_presence_z_healthy[ct].copy()
        fibroblast_ruptured = normalized_regulon_presence_z_ruptured[ct].copy()
        fibroblast_healthy.name = 'Healthy'
        fibroblast_ruptured.name = 'Ruptured'
        combined_data = pd.concat([fibroblast_healthy, fibroblast_ruptured], axis=1)
        OP_regulon_clustermap(combined_data, 'VCANhiDCsmonocytes', 'z-score', size=(6,20))
    except Exception as e:
        print(f"Failed to process {ct}: {e}")

In [None]:
for ct in ['Dividing fibroblasts / mural cells']:
    try:
        print(f"Processing {ct}...")
        fibroblast_healthy = normalized_regulon_presence_z_healthy[ct].copy()
        fibroblast_ruptured = normalized_regulon_presence_z_ruptured[ct].copy()
        fibroblast_healthy.name = 'Healthy'
        fibroblast_ruptured.name = 'Ruptured'
        combined_data = pd.concat([fibroblast_healthy, fibroblast_ruptured], axis=1)
        OP_regulon_clustermap(combined_data, 'Dividingfibroblasts_muralcells', 'z-score', size=(6,20))
    except Exception as e:
        print(f"Failed to process {ct}: {e}")

## Regulon specificity score (RSS) across predicted cell types

In [None]:
filtered_auc_mtx = auc_mtx[auc_mtx.index.isin(adata.obs.index)]
filtered_auc_mtx

In [None]:
rss = regulon_specificity_scores(filtered_auc_mtx, adata.obs['annotations_upd2'])
rss.T

In [None]:
from adjustText import adjust_text

# One RSS panel per cell type, laid out on a grid and saved as a single SVG.
cats = sorted(set(adata.obs['annotations_upd2']))

# Grid shape: 5 columns; at least the original 6 rows, more when there are
# over 30 cell types (a fixed 6x5 grid would make add_subplot raise).
rss_grid_cols = 5
rss_grid_rows = max(6, -(-len(cats) // rss_grid_cols))  # ceiling division

fig = plt.figure(figsize=(14, 20)) #(width, length)
for num, c in enumerate(cats, start=1):
    x = rss.T[c]
    ax = fig.add_subplot(rss_grid_rows, rss_grid_cols, num) #(rows, columns)
    plot_rss(rss, c, top_n=5, max_n=None, ax=ax)
    # Pad the y-range by 5 % of the data span on both sides.
    pad = (x.max() - x.min()) * 0.05
    ax.set_ylim(x.min() - pad, x.max() + pad)
    for t in ax.texts:
        t.set_fontsize(12)
    ax.set_ylabel('')
    ax.set_xlabel('')
    # Nudge the regulon labels apart so they do not overlap.
    adjust_text(ax.texts, autoalign='xy', ha='right', va='bottom',
                arrowprops=dict(arrowstyle='-',color='lightgrey'), precision=0.001 )

# Shared axis captions for the whole figure.
fig.text(0.5, 0.0, 'Regulon', ha='center', va='center', size='x-large')
fig.text(0.00, 0.5, 'Regulon specificity score (RSS)', ha='center', va='center', rotation='vertical', size='x-large')
plt.tight_layout()
# NOTE(review): this rcParams update runs after the figure has been built, so it
# presumably only affects figures created by later cells -- confirm intent.
plt.rcParams.update({
    'figure.autolayout': True,
    'figure.titlesize': 'large',
    'axes.labelsize': 'medium',
    'axes.titlesize': 'large',
    'xtick.labelsize': 'medium',
    'ytick.labelsize': 'medium',
})
plt.show()
savesvg(f'{DATASET_ID}_CellTypeRSS_annotupd.svg', fig)

In [None]:
# Clustermap of the full RSS table (transposed), tagged 'RSS_all'.
OP_regulon_clustermap(rss.T, 'RSS_all')

In [None]:
# Five highest-RSS regulons for the last cell type in `cats`.
# The original read the loop variable `c` left over from another cell; the
# cell type is now named explicitly so the result does not depend on that.
rss.T[cats[-1]].sort_values(ascending=False)[:5]

In [None]:
# Collect the five highest-RSS regulons of every cell type.
topreg = []
for c in cats:
    topreg.extend(rss.T[c].sort_values(ascending=False)[:5].index)
# De-duplicate while keeping first-seen order, so the list (and anything
# indexed with it) is the same on every run; list(set(...)) is not.
topreg = list(dict.fromkeys(topreg))

# Z-scoring

In [None]:
# Per-regulon z-score of the AUCell matrix: subtract each column's mean and
# divide by its population standard deviation (ddof=0). Computed in one
# vectorised step instead of inserting the columns one at a time.
auc_mtx_Z = (filtered_auc_mtx - filtered_auc_mtx.mean()) / filtered_auc_mtx.std(ddof=0)

In [None]:
# Display the z-scored AUCell matrix.
auc_mtx_Z

In [None]:
# Clustered heatmap of the z-scored AUCell values, restricted to the top
# regulons in `topreg`; rows are annotated with `colormap`.
sns.set(font_scale=1.2)
g = sns.clustermap(
    auc_mtx_Z[topreg],
    cmap="YlGnBu", vmin=-2, vmax=6,
    row_colors=colormap,
    xticklabels=True, yticklabels=False,
    annot=False, square=False, linecolor='gray',
    figsize=(25,20),
)
g.cax.set_visible(True)
# Blank out both axis captions of the heatmap panel.
g.ax_heatmap.set(xlabel='', ylabel='')
g.fig.savefig(os.path.join(FIGURES_FOLDERNAME, f'{DATASET_ID}_Z-score_heatmap_annotationsnew.svg'), bbox_inches='tight')

# Creating a regulon h5ad object

In [None]:
# Display the AnnData summary.
adata

In [None]:
# Replace X with a copy of the 'counts' layer and peek at its top-left 10x10 corner.
counts_layer = adata.layers['counts']
adata.X = counts_layer.copy()
print(adata.X[:10, :10])

In [None]:
# Drop genes with fewer than 50 total counts (modifies `adata` in place).
sc.pp.filter_genes(adata, min_counts=50, inplace=True)

In [None]:
# Library-size normalisation followed by log1p, both applied in place on adata.X.
sc.pp.normalize_total(adata, target_sum=None, inplace=True)
sc.pp.log1p(adata)
print(adata.X[0:10, 0:10])
# Keep a copy of the normalised, log-transformed matrix as its own layer.
adata.layers["log1p_norm"] = adata.X.copy()

In [None]:
# Flag the 2,500 most variable genes, scale adata.X, run PCA, and plot the
# PCA embedding coloured by the 'annotations_upd2' labels.
sc.pp.highly_variable_genes(adata, flavor='cell_ranger', n_top_genes=2500)
sc.pp.scale(adata)
sc.tl.pca(adata)
sc.pl.pca(adata, color='annotations_upd2')

In [None]:
# t-SNE on the first 30 components of 'X_pca', plotted with on-data labels and saved as SVG.
sc.tl.tsne(adata, n_pcs = 30, use_rep = 'X_pca')
sc.pl.embedding(adata, basis='X_tsne', color='annotations_upd2', 
                legend_loc='on data',
                frameon=False, legend_fontsize=5, save=f'{DATASET_ID}_tSNE_original.svg')

In [None]:
# Plot the stored UMAP twice: once with labels drawn on the data (small font),
# once with the default legend. Each variant is saved under its own file name.
for save_name, legend_opts in (
    (f'{DATASET_ID}_UMAP_fulloriginal.svg', dict(legend_loc='on data', legend_fontsize=5)),
    (f'{DATASET_ID}_UMAP_fulloriginal2.svg', dict()),
):
    sc.pl.embedding(adata, basis='X_umap', color='annotations_upd2',
                    frameon=False, save=save_name, **legend_opts)

In [None]:
# Keep a copy of the current UMAP coordinates under a separate obsm key.
adata.obsm['X_umap_orig'] = adata.obsm['X_umap'].copy()

In [None]:
import umap

# Recompute a 2-D UMAP directly from adata.X, write the coordinates to a TSV
# file, and use them as the 'X_umap' embedding of `adata`.
# NOTE(review): no random_state is passed to UMAP, so the layout is presumably
# not reproducible between runs -- confirm whether a fixed seed is wanted.
runUmap = umap.UMAP(n_neighbors=20, min_dist=0.4, metric='correlation').fit_transform
dr_umap = runUmap( adata.X )
dr_umap = pd.DataFrame(dr_umap, columns=['X', 'Y'], index=adata.obs.index)
dr_umap.to_csv(os.path.join(RESULTS_FOLDERNAME, "quadoriginal_umap.txt"), sep='\t')
# Store plain coordinates (n_obs x 2 array) rather than the DataFrame, which is
# the usual form for an embedding in obsm and what positional slicing expects.
adata.obsm['X_umap'] = dr_umap.to_numpy()

# Variant 1: labels drawn on the data.
sc.pl.embedding(adata, basis='X_umap', color='annotations_upd2',
                legend_loc='on data',
                frameon=False, legend_fontsize=5, save=f'{DATASET_ID}_UMAP_quadoriginal.svg')
# Variant 2: default legend. The original reused the file name of variant 1 and
# so overwrote it; the '2' suffix follows the naming of the other plot pairs.
sc.pl.embedding(adata, basis='X_umap', color='annotations_upd2',
                frameon=False,
                save=f'{DATASET_ID}_UMAP_quadoriginal2.svg'
               )

In [None]:
# Save the gene-level AnnData to the results folder.
adata.write(os.path.join(RESULTS_FOLDERNAME, 'adult_quads_genes.h5ad'))

In [None]:
# Independent gene-level copy whose X holds the log1p-normalised values
# (taken from the copy's own 'log1p_norm' layer, which equals the source layer).
adata_genes = adata.copy()
adata_genes.X = adata_genes.layers['log1p_norm'].copy()

In [None]:
# Drop the name `adata`; it is re-bound to the regulon-level object in a later cell.
del adata

In [None]:
# Create REGULON ADATA: cells x regulons (AUCell values), carrying over the
# cell annotations, unstructured data and embeddings of the gene-level object.
# The AUC rows are first put in the same cell order as `adata_genes`, because
# `filtered_auc_mtx` keeps the row order of `auc_mtx`, which is not guaranteed
# to match; .loc raises a KeyError if any cell is missing from the AUC matrix.
adata = sc.AnnData(
    filtered_auc_mtx.loc[adata_genes.obs_names],
    obs=adata_genes.obs,
    uns=adata_genes.uns,
    obsm=adata_genes.obsm,
)
adata

In [None]:
# Display the variable (regulon) table of the regulon-level AnnData.
adata.var

In [None]:
# De-duplicate variable names on both the regulon-level and gene-level objects.
for annotated in (adata, adata_genes):
    annotated.var_names_make_unique()

In [None]:
# Restrict both objects to transcription factors present in each: a gene is
# kept when its name equals the part of a regulon name before the first "(".
genes = adata_genes.var_names
common = genes[genes.isin([regulon_name.split("(")[0] for regulon_name in adata.var_names])]
# Regulon columns are addressed as "<TF>(+)"; gene columns by the bare TF name.
adata = adata[:, common + "(+)"]
adata_genes = adata_genes[:, common]

# Disabled experiment, kept for reference: weight AUC values by capped expression.
#adata.layers["auc_init"]=adata.X.copy()
#weights=adata_genes.X.copy()
#weights[weights>1]=1
#adata.X=adata.X*weights.A
#adata.layers["weights"]=weights
#adata

In [None]:
# Load the motif enrichment table and convert it into regulon objects;
# the cell displays how many regulons were obtained.
df_motifs = load_motifs(MOTIFS_FNAME)
regulons = df2regulons(df_motifs)
len(regulons)

In [None]:
# Attach the AUCell results and regulon metadata to the gene-level object.
# The AUC rows are re-ordered to the cell order of `adata_genes` first:
# `filtered_auc_mtx` keeps the row order of `auc_mtx`, and
# NOTE(review): pySCENIC appears to store the matrix values positionally
# (as obsm['X_aucell']), so a differing order would mislabel cells -- confirm.
add_scenic_metadata(adata_genes, filtered_auc_mtx.loc[adata_genes.obs_names], regulons)
adata_genes

In [None]:
# Persist the annotated gene-level object and echo where it was written.
adata_genes.write_h5ad(ANNDATA_FNAME)
print(ANNDATA_FNAME)

In [None]:
#data.write_loom("results/scvi_3207/regulons_3207.loom", write_obsm_varm=True)

## AUCell Clustering based on regulon activity

In [None]:
# Reload the gene-level object from disk and display its summary.
adata_genes = sc.read_h5ad(ANNDATA_FNAME)
adata_genes

In [None]:
# UMAP coloured by the 'Regulon(TWIST1(+))' and 'Regulon(SOX4(+))' columns.
sc.pl.umap(adata_genes,color=['Regulon(TWIST1(+))', 'Regulon(SOX4(+))'], frameon=False)

In [None]:
# Tabular copy of the stored UMAP coordinates, one row per cell.
# `columns` is a flat list: the nested [['X', 'Y']] form builds a one-level
# MultiIndex instead of two plain column labels. np.asarray makes the values
# positional whether obsm holds an array or a DataFrame.
original_umap = pd.DataFrame(
    np.asarray(adata_genes.obsm['X_umap']),
    columns=['X', 'Y'],
    index=adata_genes.obs_names,
)

In [None]:
# Run t-SNE on the 'X_aucell' representation. copy=True returns a new object,
# so only its 'X_tsne' result is copied back under a separate key and the
# temporary object is discarded.
aucelltsne = sc.tl.tsne(adata_genes, n_pcs = 30, use_rep = 'X_aucell', copy=True)
adata_genes.obsm['X_tsne_aucell'] = aucelltsne.obsm['X_tsne']
del aucelltsne

In [None]:
# Plot the AUCell-based t-SNE twice: once with labels drawn on the data (small
# font), once with the default legend. Each variant has its own file name.
for save_name, legend_opts in (
    (f'{DATASET_ID}_tSNE_AUCell.svg', dict(legend_loc='on data', legend_fontsize=5)),
    (f'{DATASET_ID}_tSNE_AUCell2.svg', dict()),
):
    sc.pl.embedding(adata_genes, basis='X_tsne_aucell', color='annotations_upd2',
                    frameon=False, save=save_name, **legend_opts)

In [None]:
# 2-D UMAP of the AUCell matrix; coordinates are written to a TSV file and
# attached to `adata_genes`.
# NOTE(review): no random_state is passed to UMAP, so the layout is presumably
# not reproducible between runs -- confirm whether a fixed seed is wanted.
runUmap = umap.UMAP(n_neighbors=10, min_dist=0.4, metric='correlation').fit_transform
dr_umap = runUmap( filtered_auc_mtx )
dr_umap = pd.DataFrame(dr_umap, columns=['X', 'Y'], index=filtered_auc_mtx.index)
dr_umap.to_csv(os.path.join(RESULTS_FOLDERNAME, "aucell_umap.txt"), sep='\t')
# Re-order the rows to the cell order of `adata_genes` (the AUC matrix keeps
# its own row order) and store plain coordinates rather than a DataFrame.
# .loc raises a KeyError if any cell is missing from the embedding.
adata_genes.obsm['X_umap_aucell'] = dr_umap.loc[adata_genes.obs_names].to_numpy()

In [None]:
# Plot the AUCell-based UMAP twice: once with labels drawn on the data (small
# font), once with the default legend. Each variant has its own file name.
for save_name, legend_opts in (
    (f'{DATASET_ID}_UMAP_AUCell.svg', dict(legend_loc='on data', legend_fontsize=5)),
    (f'{DATASET_ID}_UMAP_AUCell2.svg', dict()),
):
    sc.pl.embedding(adata_genes, basis='X_umap_aucell', color='annotations_upd2',
                    frameon=False, save=save_name, **legend_opts)

In [None]:
# Remove the two '_scvi_extra_*' entries from obsm.
# pop(..., None) instead of `del` keeps the cell re-runnable: it does nothing
# when a key has already been removed or was never present.
for scvi_key in ('_scvi_extra_categorical_covs', '_scvi_extra_continuous_covs'):
    adata_genes.obsm.pop(scvi_key, None)

In [None]:
# Display the gene-level AnnData summary.
adata_genes

In [None]:
# Overwrite the saved gene-level object with the current in-memory state.
adata_genes.write_h5ad(ANNDATA_FNAME)

## Z-scoring (Alternative to RSS)
To find cell-type-specific regulators we use a Z-score (i.e. the average AUCell score for the cells of a given type is standardized using the overall average AUCell score and its standard deviation).

In [None]:
# Z-score of each regulon per cell type: (mean within the cell type - overall
# mean) / overall standard deviation, reshaped to long form.
df_obs = adata_genes.obs
# Numeric obs columns that hold regulon activities ('Regulon(<name>)').
signature_column_names = [
    column
    for column in df_obs.select_dtypes('number').columns
    if column.startswith('Regulon(')
]
df_scores = df_obs[signature_column_names + ['annotations_upd2']]
df_results = (
    (
        df_scores.groupby(by='annotations_upd2').mean()
        - df_obs[signature_column_names].mean()
    )
    / df_obs[signature_column_names].std()
).stack().reset_index().rename(columns={'level_1': 'regulon', 0:'Z'})
# Strip the 'Regulon(' prefix (8 characters) and the closing ')'.
df_results['regulon'] = [label[8:-1] for label in df_results.regulon]
df_results[(df_results.Z >= 3.0)].sort_values('Z', ascending=False).head()

In [None]:
# Heatmap of cell type x regulon for all pairs with Z >= 2.0; cells carry the
# Z value to one decimal.
df_heatmap = pd.pivot_table(
    data=df_results[df_results.Z >= 2.0].sort_values('Z', ascending=False),
    values='Z',
    index='annotations_upd2',
    columns='regulon',
)
#df_heatmap.drop(index='Myocyte', inplace=True) # We leave out Myocyte because many TFs are highly enriched
fig, ax1 = plt.subplots(1, 1, figsize=(25, 15))
sns.heatmap(
    df_heatmap,
    ax=ax1,
    cmap="YlGnBu",
    annot=True, fmt=".1f", annot_kws={"size": 6},
    linewidths=.7, linecolor='gray',
    square=True, cbar=False,
)
ax1.set_ylabel('')
savesvg(f'{DATASET_ID}_Z-score_regulons_matrixplot.svg', fig)

In [None]:
# The two highest-Z regulons for each cell type.
df_results.sort_values('Z', ascending=False).groupby(by='annotations_upd2').head(2)

In [None]:
# Stacked violin plot of AUCell values per cell type for regulons with score > 3.0.
# The AUC rows are put in the same cell order as `df_obs` before it is attached
# as obs. The original sorted the matrix by index and then assigned the
# unsorted annotation table, which can pair cells with the wrong labels.
aucell_adata = sc.AnnData(X=filtered_auc_mtx.loc[df_obs.index])
aucell_adata.obs = df_obs
# NOTE(review): expects `regulons` to be the list of regulon objects (with
# .name and .score), not the dict built from the loom file in a later cell.
names = [regulon.name for regulon in regulons if regulon.score > 3.0]
sc.pl.stacked_violin(aucell_adata, names, groupby='annotations_upd2',
          #save=' - GSE115978 - regulons.svg'
                    )

In [None]:
# Mean TWIST1 regulon activity per (cell type, tendon status), drawn as one
# small bar chart per cell type.
grouped_data = adata_genes.obs.groupby(['annotations_upd2', 'tendon_status'])
mean_activity = grouped_data['Regulon(TWIST1(+))'].mean()
mean_activity_df = mean_activity.reset_index()

g = sns.FacetGrid(mean_activity_df, col='annotations_upd2', col_wrap=4, height=3)
# The FacetGrid methods return the grid itself, so they can be chained.
(
    g.map_dataframe(sns.barplot, x='tendon_status', y='Regulon(TWIST1(+))', dodge=False)
     .set_axis_labels("Tendon Status", "Mean Regulon Activity")
     .set_titles("{col_name}")
)

# Leave head-room at the top, add the legend, and switch off grid lines.
g.fig.subplots_adjust(top=0.9)
g.add_legend(title='Cell Types')
for ax in g.axes.flat:
    ax.grid(False)

plt.show()

In [None]:
# Distribution of TWIST1 regulon activity per annotated cell type.
plot_data = adata_genes.obs[['annotations_upd2', 'Regulon(TWIST1(+))']]

# Create a boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(x='annotations_upd2', y='Regulon(TWIST1(+))', data=plot_data)
# The x axis is the cell-type annotation, not tendon status, so the title
# names what is actually plotted.
plt.title('TWIST1 Regulon Activity Across Cell Types')
plt.ylabel('Regulon Activity')
plt.show()

# Check motifs

In [None]:
# Load the tab-separated adjacency table and show its first rows.
adjacencies = pd.read_csv(ADJACENCIES_FNAME, index_col=False, sep='\t')
adjacencies.head()

In [None]:
# Read the expression matrix (cells x genes) and the regulon membership from
# the SCENIC loom file.
#lf = lp.connect(os.path.join(RESULTS_FOLDERNAME, 'devcombined_05AUC.scenic.loom'))
# The context manager closes the connection even if reading fails part-way.
with lp.connect(LOOM_FNAME_OUT, mode='r+', validate=False) as lf:
    exprMat = pd.DataFrame( lf[:,:], index=lf.ra.Gene, columns=lf.ca.CellID).T
    regulon_membership = pd.DataFrame(lf.ra.Regulons, index=lf.ra.Gene)

# Map each regulon name to the genes flagged 1 in its membership column.
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
# Note: this re-binds `regulons` from the earlier list of regulon objects to a
# dict of gene lists.
regulons = {
    regulon_name: list(flags[flags == 1].index.values)
    for regulon_name, flags in regulon_membership.items()
}

In [None]:
# Number of regulons read from the loom file.
len(regulons)

In [None]:
# Write the genes of every regulon to its own text file, one gene per line,
# under "<results>/<DATASET_ID>_regulons/<regulon>.txt".
regulon_dir = os.path.join(RESULTS_FOLDERNAME, f"{DATASET_ID}_regulons")
# The sub-directory is not created anywhere else; without this, open() fails
# on a fresh results folder.
os.makedirs(regulon_dir, exist_ok=True)
for regulon, genes in regulons.items():
    filename = os.path.join(regulon_dir, f"{regulon}.txt")
    print(f'Writing for {regulon}')
    with open(filename, 'w') as file:
        file.writelines(f"{gene}\n" for gene in genes)

In [None]:
# Genes in the TWIST1(+) regulon.
regulons['TWIST1(+)']

In [None]:
# Genes in the SOX4(+) regulon.
regulons['SOX4(+)']

In [None]:
# group adjacencies into modules where each module represents a collection of target genes 
# that are regulated by a common transcription factor:

from pyscenic.utils import modules_from_adjacencies
# Materialise the modules into a list so they can be filtered repeatedly below.
modules = list(modules_from_adjacencies(adjacencies, exprMat))

In [None]:
# Report the size of every co-expression module of one transcription factor,
# followed by the size of its regulon.
tf = 'TWIST1'
tf_mods = list(filter(lambda module: module.transcription_factor == tf, modules))

for i, mod in enumerate(tf_mods):
    n_module_genes = len(mod.genes)
    print( f'{tf} module {str(i)}: {len(mod.genes)} genes' )
print( f'{tf} regulon: {len(regulons[tf+"(+)"])} genes' )

In [None]:
# Write each module of the selected transcription factor, and then its
# regulon, to a text file in the results folder (one gene per line).
for i, mod in enumerate(tf_mods):
    module_path = os.path.join(RESULTS_FOLDERNAME, tf+'_module_'+str(i)+'.txt')
    with open(module_path, 'w') as f:
        f.writelines("%s\n" % item for item in mod.genes)

regulon_path = os.path.join(RESULTS_FOLDERNAME, tf+'_regulon.txt')
with open(regulon_path, 'w') as f:
    f.writelines("%s\n" % item for item in regulons[tf+"(+)"])

In [None]:
# Display the filtered AUCell matrix.
filtered_auc_mtx