# Questions
- Which TFs make it into a regulon, and which do not?

- TFBS database
- level of expression 
- additional criteria? 

In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import collections
from natsort import natsorted

from scipy import stats
from scipy import sparse
from sklearn.decomposition import PCA
from umap import UMAP
from statsmodels.stats.multitest import multipletests

from matplotlib.colors import LinearSegmentedColormap

from scroutines.config_plots import *
from scroutines import powerplots # .config_plots import *
from scroutines import pnmf
from scroutines import basicu
from scroutines.gene_modules import GeneModules  

import atac_utils

In [None]:
outdirfig = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/figures/250521"
!mkdir -p $outdirfig

In [None]:
ddir = '/u/home/f/f7xiesnm/v1_multiome/juyoun/' 
!ls $ddir

In [None]:
# Path of the SCENIC regulon table inside `ddir`.
f = ddir+'regulons_l23alltime_trimmed_cleaned_bigtable.csv'
# scenic metadata
df_scenic = pd.read_csv(f, index_col=0)
# Keep only rows whose `signs` label is "+_+" or "-_+"; rows with any other
# sign combination are dropped and never used further down.
df_scenic = df_scenic[df_scenic['signs'].isin(["+_+", "-_+"])]
df_scenic


In [None]:
# One row per (TF, signs, Consensus_name) group; `.first()` keeps the first
# row of each group for the signature names and the `is_extended` flag.
df_reg = df_scenic.groupby(['TF', 'signs', 'Consensus_name']).first()[['Gene_signature_name', 'Region_signature_name', 'is_extended']].sort_values('TF')

# Sorted unique TFs / target genes / regions over all retained rows.
scenic_tfs = np.sort(df_scenic['TF'].unique())
scenic_genes = np.sort(df_scenic['Gene'].unique()) # .shape
scenic_regions = np.sort(df_scenic['Region'].unique()) # .shape

# Counts of regulons, TFs, genes and regions (printed in that order).
num_reg    = len(df_reg)
num_tf     = len(scenic_tfs)
num_gene = len(scenic_genes)
num_region = len(scenic_regions)
print(num_reg, num_tf, num_gene, num_region)
df_reg

In [None]:
df_reg.reset_index()['signs'].value_counts()

In [None]:
def sign2word(sign):
    """Translate a one-character sign into a word.

    Args:
        sign: '+' or '-'.

    Returns:
        'positive' for '+', 'negative' for '-'.

    Raises:
        ValueError: if `sign` is anything else. (Previously such input fell
            through to `return word` with `word` unbound.)
    """
    if sign == '+':
        return 'positive'
    if sign == '-':
        return 'negative'
    raise ValueError(f"sign must be '+' or '-', got {sign!r}")

In [None]:
# Presentation table of regulons: flatten the group index first, then derive
# the display columns from the flattened frame.
df_reg_viz = df_reg.reset_index()

# The signature names end in "(<count><one letter>)"; keep the text after the
# last '(' minus its final two characters.
for new_col, src_col in [('num target genes', 'Gene_signature_name'),
                         ('num target regions', 'Region_signature_name')]:
    df_reg_viz[new_col] = [name.split('(')[-1][:-2] for name in df_reg_viz[src_col]]

# First character of `signs` is spelled out as positive / negative.
df_reg_viz['TF-gene correlation'] = [sign2word(s[0]) for s in df_reg_viz['signs']]
# df_reg_viz['region-gene correlation'] = third character of `signs` (not exported)

# Regulon name = consensus name without its last two characters.
df_reg_viz['regulon name'] = [name[:-2] for name in df_reg_viz['Consensus_name']]

viz_columns = ['regulon name', 'TF', 'num target genes', 'num target regions', 'TF-gene correlation', 'is_extended']
df_reg_viz = df_reg_viz[viz_columns].sort_values('regulon name')
df_reg_viz


In [None]:
# Long-format table of regulon links; the regulon name is the consensus name
# without its last two characters.
df_scenic_viz = df_scenic.assign(
    **{'Regulon name': [name[:-2] for name in df_scenic['Consensus_name']]}
)
link_columns = ['Regulon name', 'TF', 'Region', 'Gene']
df_scenic_viz = df_scenic_viz[link_columns].sort_values('Regulon name')
df_scenic_viz

In [None]:
df_reg_viz.to_csv('/u/home/f/f7xiesnm/v1_multiome/table_s5_part1.csv')
df_scenic_viz.to_csv('/u/home/f/f7xiesnm/v1_multiome/table_s5_part2.csv')

# prep RNA data

In [None]:
# RNA data
# Per-cell archetype scores, indexed by cell name.
scores_abc = pd.read_csv("/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/scores_l23abc.csv", 
                         index_col=0,
                        )
scores_abc['scores_c-a'] = scores_abc['scores_c'] - scores_abc['scores_a']

# `anndata.read` is a deprecated alias of `read_h5ad`; call the real reader.
adata = anndata.read_h5ad("/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_l23.h5ad")
# Work from the matrix stored in `.raw` from here on.
adata.X = adata.raw.X

# Attach each score column to the cells, aligned on the cell index.
for score_col in ['scores_a', 'scores_b', 'scores_c', 'scores_c-a']:
    adata.obs[score_col] = scores_abc.loc[adata.obs.index, score_col].copy()

adata

In [None]:
# Per-cell sample label. The time label drops the last character of the
# sample name and any 'DR' tag (e.g. 'P12DRa' -> 'P12').
sample_labels = adata.obs['Sample'].values
time_labels = [s[:-1].replace('DR', '') for s in sample_labels]

adata.obs['sample'] = sample_labels #
adata.obs['time']   = time_labels

# Naturally sorted unique samples, split by whether the name contains "DR".
uniq_samples = natsorted(np.unique(sample_labels))
nr_samples = [s for s in uniq_samples if "DR" not in s]
dr_samples = [s for s in uniq_samples if "DR" in s]

# Unique values of the existing `cond` column, naturally sorted.
uniq_conds = np.array(natsorted(np.unique(adata.obs['cond'].values)))
print(uniq_conds)

In [None]:
# remove mitochondrial genes (names starting with 'mt-')
adata = adata[:,~adata.var.index.str.contains(r'^mt-')]
# remove sex genes (only the gene named exactly 'Xist' is dropped)
adata = adata[:,~adata.var.index.str.contains(r'^Xist$')]

# filter genes
cond = np.ravel((adata.X>0).sum(axis=0)) > 10 # expressed in more than 10 cells
# .copy() makes the filtered subset its own object rather than a slice of the old one
adata = adata[:,cond].copy()

In [None]:
# counts
x = adata.X
# row sums of the count matrix, one per cell
cov = np.ravel(np.sum(x, axis=1))
genes = adata.var.index.values

# CP10k
# NOTE(review): a cell with zero total counts would make 1/cov infinite here.
xn = (sparse.diags(1/cov).dot(x))*1e4

# log2(CP10k+1)
# Only the stored entries (`.data`) are transformed — assumes `xn` is a scipy
# sparse matrix; implicit zeros stay zero, which matches log2(0+1) = 0.
xln = xn.copy()
xln.data = np.log2(xln.data+1)

# Dense copies are kept as layers and read by the profiling cells below.
adata.layers[    'norm'] = np.array(xn.todense())
adata.layers[ 'lognorm'] = np.array(xln.todense())

In [None]:
# adata.obs['sample'].unique()
import re

todo_conds = [
    'P12DR', 'P14DR', 'P17DR', 'P21DR',
    'P6', 'P8', 'P10', 'P12', 'P14', 'P17', 'P21', 
]
todo_samps = [
    'P12DRa', 'P12DRb',
    'P14DRa', 'P14DRb',
    'P17DRa', 'P17DRb',
    'P21DRa', 'P21DRb',
    'P6a', 'P6b', 'P6c', 
    'P8a', 'P8b', 'P8c', 
    'P10a', 'P10b', 
    'P12a', 'P12b', 'P12c', 
    'P14a', 'P14b',
    'P17a', 'P17b', 
    'P21a', 'P21b', 
]

cond2condcode = {
    'P12DR': 0, 
    'P14DR': 1,
    'P17DR': 2,
    'P21DR': 3,
    'P6':    4,
    'P8':    5,
    'P10':   6,
    'P12':   7,
    'P14':   8,
    'P17':   9,
    'P21':  10,
}
todo_conds_t = np.array([int(re.sub(r'[a-zA-Z]', '', a)) for a in todo_conds])
todo_samps_t = np.array([int(re.sub(r'[a-zA-Z]', '', a)) for a in todo_samps])
print(todo_conds_t)
print(todo_samps_t)

def mean_over_samples(mmat_res_samp):
    """Average replicate samples into conditions (25 samples -> 11 conditions).

    Args:
        mmat_res_samp: array whose first axis has length 25; the row ranges
            used below follow the grouping of `todo_samps` by condition.

    Returns:
        Float array with first axis of length 11 (same trailing shape as the
        input), where row k is the mean over the replicate rows of condition k.
    """
    assert mmat_res_samp.shape[0] == 25

    # [start, stop) rows of the replicates of each condition; the last group
    # runs to the end of the array.
    replicate_spans = [
        (0, 2), (2, 4), (4, 6), (6, 8),
        (8, 11), (11, 14), (14, 16), (16, 19), (19, 21), (21, 23), (23, None),
    ]

    cond_means = np.zeros((len(replicate_spans),) + tuple(mmat_res_samp.shape[1:]))
    for k, (start, stop) in enumerate(replicate_spans):
        cond_means[k] = np.mean(mmat_res_samp[start:stop], axis=0)

    return cond_means

def transform_bigredmat(bigmat):
    """Flatten a (cond, type, feature) array to feature rows and z-score them.

    (samp, type, gene) -> (gene, samp*type); the first 4*5 columns (the first
    four conditions times five types) are moved to the END of each row.

    Returns:
        fmat: (feature, cond*type) values in the rotated column order.
        zmat: `fmat` z-scored along each row.
    """
    n_feature = bigmat.shape[-1]
    flat = bigmat.reshape(-1, n_feature).T

    # CHANGED COLUMN ORDER!! leading block goes last
    n_lead = 4*5
    fmat = np.concatenate([flat[:, n_lead:], flat[:, :n_lead]], axis=1)

    return fmat, zscore(fmat, axis=1)

In [None]:
%%time


# Per-sample RNA profiles, expressed as log2 fold change against the mean
# over all cells. Two groupings per sample: `n_type` rank bins along the
# C-minus-A score, and the three archetype groups A / B / C.
offset = 1
n_type = 5
frac_archetypal_cells_viz = 0.2

# Baseline: log2 of the mean normalized expression over all cells.
mat = adata.layers['norm'][...]
gexp_l23baseline = np.log2(np.mean(mat, axis=0)*1e2+offset) # CP10k -> CPM

# (sample, group, gene)
bigmat_nfd = np.zeros((len(todo_samps), n_type, adata.shape[1]))
bigmat_abc = np.zeros((len(todo_samps),      3, adata.shape[1]))

for i, samp in enumerate(todo_samps):
    print(samp)
    
    # get sub
    adatasub = adata[adata.obs['sample']==samp]
    n_cells = adatasub.shape[0]
    
    # get A vs C 
    # ranks are computed within this sample only
    ranks_ac = adatasub.obs['scores_c-a'].rank()
    ranks_b  = adatasub.obs['scores_b'].rank()
    
    # per type
    # equal-sized bins along the C-minus-A rank; bin 0 holds the lowest ranks
    cells_type_nfd = pd.qcut(ranks_ac, n_type, labels=False)
    for j in range(n_type):
        mat_j = adatasub[cells_type_nfd==j].layers['norm'][...]
        mmat_j = np.log2(np.mean(mat_j, axis=0)*1e2+offset) - gexp_l23baseline # CP10k -> CPM
        bigmat_nfd[i,j] = mmat_j
    
    # A, B, C
    num_archetypal_cells_viz = int(n_cells*frac_archetypal_cells_viz)
    
    # A: lowest-ranked fraction on C-minus-A; C: highest-ranked fraction on
    # C-minus-A; B: highest-ranked fraction on the B score.
    precond_a = ranks_ac <= num_archetypal_cells_viz
    precond_c = ranks_ac > adatasub.shape[0] - num_archetypal_cells_viz
    precond_b = ranks_b  > adatasub.shape[0] - num_archetypal_cells_viz
    
    # A cell is kept for a group only if it meets that group's precondition
    # and neither of the other two.
    cond_a = np.all([ precond_a, ~precond_b, ~precond_c], axis=0)
    cond_b = np.all([~precond_a,  precond_b, ~precond_c], axis=0)
    cond_c = np.all([~precond_a, ~precond_b,  precond_c], axis=0)
    
    # NOTE: `cond` here rebinds the notebook-level name used for the gene filter.
    for j, cond in enumerate([cond_a, cond_b, cond_c]):
        mat_j = adatasub[cond].layers['norm'][...]
        mmat_j = np.log2(np.mean(mat_j, axis=0)*1e2+offset) - gexp_l23baseline # CP10k -> CPM
        bigmat_abc[i,j] = mmat_j


In [None]:
redmat_abc = mean_over_samples(bigmat_abc)

In [None]:
bigmat_abc.shape, redmat_abc.shape, genes.shape

# prep region data

In [None]:
%%time
# Load one ATAC peak matrix per condition, in `todo_conds` order, so that
# `adatas_pk[k]` corresponds to condition code k in `cond2condcode`.
n_atac_total = 0
adatas_pk = []
# for cond in sample_conditions:
for cond in todo_conds:
    print(cond)
    f = f'/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/pmat_l23scenic_v2_{cond}.h5ad'
    adata_pk = sc.read(f)
    adatas_pk.append(adata_pk)
    # len() of the loaded object is added to the running total of ATAC cells
    n_atac_total += len(adata_pk)
    print(adata_pk)
    # break
print(n_atac_total)

In [None]:
%%time

# Count the cells present in both the RNA object and the ATAC object of the
# matching condition, summed over samples.
n_cells_overlap = 0
for i, samp in enumerate(todo_samps):
    # condition name = sample name without its last character
    cond_name = samp[:-1]
    cond_code = cond2condcode[cond_name]
    print(samp, cond_name, cond_code)
    
    # get ATAC
    # NOTE: `adata_pk` stays bound to the last sample's ATAC object after the
    # loop; the next cell reads it to define `regionset`.
    adata_pk = adatas_pk[cond_code]
    cells_atac = adata_pk.obs.index.values
    
    # get sub
    adatasub = adata[adata.obs['sample']==samp]
    cells_rna = adatasub.obs.index.values
    
    cells_overlap = np.intersect1d(cells_rna, cells_atac)
    n_cells_overlap += len(cells_overlap)

print(n_cells_overlap)

In [None]:
regionset = adata_pk.var.index.values
regionset.shape

In [None]:
%%time
# ATAC baseline: stack the cells of all conditions, scale each cell to a
# total of 1e6, and take log2 of the per-region mean.
mat = []
for adata_pk in adatas_pk:
    mat.append(np.array(adata_pk.X.todense()))
# vstack assumes every condition has the same region columns — TODO confirm
mat = np.vstack(mat)
mat = mat/np.sum(mat, axis=1).reshape(-1,1)*1e6
# NOTE: `offset` is not set in this cell; it is whatever an earlier cell assigned.
atac_l23baseline = np.log2(np.mean(mat, axis=0)+offset)

In [None]:
%%time

# Per-sample ATAC profiles, as log2 fold change against `atac_l23baseline`.
# Cells are grouped by their RNA-derived scores (same rules as the RNA
# profiles) and then restricted to cells that also exist in the ATAC object.
offset = 1
n_type = 5
frac_archetypal_cells_viz = 0.2
# (sample, group, region)
bigmatatac_nfd = np.zeros((len(todo_samps), n_type, len(regionset)))
bigmatatac_abc = np.zeros((len(todo_samps),      3, len(regionset)))

for i, samp in enumerate(todo_samps):
    cond_name = samp[:-1]
    cond_code = cond2condcode[cond_name]
    print(samp, cond_name, cond_code)
    
    # get ATAC
    adata_pk = adatas_pk[cond_code]
    cells_atac = adata_pk.obs.index.values
    
    # get sub
    adatasub = adata[adata.obs['sample']==samp]
    cells_rna = adatasub.obs.index.values
    n_cells = adatasub.shape[0]
    
    # get A vs C 
    # ranks use all RNA cells of the sample, not only those with ATAC data
    ranks_ac = adatasub.obs['scores_c-a'].rank()
    ranks_b  = adatasub.obs['scores_b'].rank()
    
    # per type
    cells_type_nfd = pd.qcut(ranks_ac, n_type, labels=False)
    for j in range(n_type):
        cond_j = cells_type_nfd==j
        cells_j = np.intersect1d(cells_rna[cond_j], cells_atac)
        mat_j = np.array(adata_pk[cells_j].X.todense()) 
        # per-cell scaling to a total of 1e6 before averaging
        # NOTE(review): an empty `cells_j` would give a NaN profile here.
        mat_j = mat_j/np.sum(mat_j, axis=1).reshape(-1,1)*1e6
        mmat_j = np.log2(np.mean(mat_j, axis=0)+offset) - atac_l23baseline
        bigmatatac_nfd[i,j] = mmat_j
    
    # A, B, C
    num_archetypal_cells_viz = int(n_cells*frac_archetypal_cells_viz)
    
    precond_a = ranks_ac <= num_archetypal_cells_viz
    precond_c = ranks_ac > adatasub.shape[0] - num_archetypal_cells_viz
    precond_b = ranks_b  > adatasub.shape[0] - num_archetypal_cells_viz
    
    # keep a cell for a group only if it meets that group's precondition alone
    cond_a = np.all([ precond_a, ~precond_b, ~precond_c], axis=0)
    cond_b = np.all([~precond_a,  precond_b, ~precond_c], axis=0)
    cond_c = np.all([~precond_a, ~precond_b,  precond_c], axis=0)
    
        
    # get A, B, C 
    # (cells_a / cells_b / cells_c are not read again in this cell; the loop
    # below recomputes the same intersections)
    cells_a = np.intersect1d(cells_rna[cond_a], cells_atac)
    cells_b = np.intersect1d(cells_rna[cond_b], cells_atac)
    cells_c = np.intersect1d(cells_rna[cond_c], cells_atac)
    
    for j, cond_j in enumerate([cond_a, cond_b, cond_c]):
        cells_j = np.intersect1d(cells_rna[cond_j], cells_atac)
        mat_j = np.array(adata_pk[cells_j].X.todense()) 
        mat_j = mat_j/np.sum(mat_j, axis=1).reshape(-1,1)*1e6
        mmat_j = np.log2(np.mean(mat_j, axis=0)+offset) - atac_l23baseline
        bigmatatac_abc[i,j] = mmat_j
        

In [None]:
print(bigmat_nfd.shape) # cond, type, gene
print(bigmat_abc.shape) # cond, type, gene

print(bigmatatac_nfd.shape) # cond, type, gene
print(bigmatatac_abc.shape) # cond, type, gene

# profile regulons

In [None]:
def prep_regulon(reg_tf, reg_name):
    """Collect the A/B/C profiles of one regulon: TF, target genes, target regions.

    Reads the notebook globals `df_scenic`, `genes`, `regionset`,
    `bigmat_abc` and `bigmatatac_abc`.

    Args:
        reg_tf: TF gene name, e.g. 'Meis2'.
        reg_name: `Consensus_name` value identifying the regulon,
            e.g. 'Meis2_+_+'.

    Returns:
        (reg_genes, reg_regions, bigmat_abc_ig_list, redmat_abc_ig_list).
        Both lists hold three arrays in the order
        [TF, mean over target genes, mean over target regions]: the first
        per sample, the second averaged per condition via `mean_over_samples`.

    Raises:
        AssertionError: if the TF, a target gene or a target region cannot
            be located.
    """

    df_this_reg = df_scenic[df_scenic['Consensus_name']==reg_name]
    reg_genes = df_this_reg['Gene'].unique()
    reg_regions = df_this_reg['Region'].unique()
    print(reg_genes.shape, reg_regions.shape)

    #
    tf_idx   = basicu.get_index_from_array(genes, [reg_tf])[0]
    gene_idx  = basicu.get_index_from_array(genes, reg_genes)
    region_idx = basicu.get_index_from_array(regionset, reg_regions)
    # -1 is treated as "not found" (same convention as the two checks below);
    # without this check a missing TF would silently select the last gene.
    assert tf_idx != -1, f"TF {reg_tf!r} not found in `genes`"
    assert np.all(region_idx != -1)
    assert np.all(gene_idx != -1)

    ftrs_x = bigmat_abc[:,:,tf_idx]
    ftrs_y = np.mean(bigmat_abc[:,:,gene_idx], axis=-1)
    ftrs_z = np.mean(bigmatatac_abc[:,:,region_idx], axis=-1)
    # print(ftrs_x.shape, ftrs_y.shape, ftrs_z.shape)

    bigmat_abc_ig_list = [ftrs_x, ftrs_y, ftrs_z]
    redmat_abc_ig_list = [mean_over_samples(x) for x in bigmat_abc_ig_list]

    return reg_genes, reg_regions, bigmat_abc_ig_list, redmat_abc_ig_list


class Regulon:
    """Profiles of one regulon (TF, target genes, target regions) with a plot."""

    def __init__(self, reg_tf, reg_name):
        """Load genes, regions and A/B/C profiles through `prep_regulon`."""
        (self.reg_genes,
         self.reg_regions,
         self.bigmat_abc,
         self.redmat_abc) = prep_regulon(reg_tf, reg_name)

        self.reg_tf   = reg_tf
        self.reg_name = reg_name

    def plot(self, fig=None, axs=None, output=None):
        """Draw three panels: TF, mean target genes, mean target regions.

        Per-sample values are drawn as open circles and per-condition means
        as lines, one colour per group (C0, C1, C2). Only samples from index 8
        on and conditions from index 4 on are drawn; the leading ones are
        deliberately left out.

        Args:
            fig: existing figure; when None a new 1x3 figure is created.
            axs: three axes to draw on (required when `fig` is given).
            output: if not None, path passed to `powerplots.savefig_autodate`.
        """
        if fig is None:
            fig, axs = plt.subplots(1, 3, figsize=(3*3,1*4), sharex=True) #, sharey=True)

        group_colors = ['C0', 'C1', 'C2']
        for panel in range(3):
            ax = axs[panel]
            per_sample = self.bigmat_abc[panel]
            per_cond = self.redmat_abc[panel]

            # individual samples
            for k, color in enumerate(group_colors):
                ax.plot(todo_samps_t[8:], per_sample[8:,k], 'o', markersize=5, fillstyle='none', color=color)

            # condition means
            for k, color in enumerate(group_colors):
                ax.plot(todo_conds_t[4:], per_cond[4:,k], '-', color=color)

            ax.grid(False)
            sns.despine(ax=ax)
            ax.set_xticks([6,14,21])
            ax.set_xticklabels(["P6","14","21"])

        axs[0].set_ylabel('log2(FC)')
        axs[0].set_title(f'{self.reg_name}')
        axs[1].set_title(f'{len(self.reg_genes)} genes')
        axs[2].set_title(f'{len(self.reg_regions)} regions')

        if output is not None:
            powerplots.savefig_autodate(fig, output)
    

In [None]:
# All regulon names, plus boolean masks for the two sign classes
# (names ending in '+_+' and names ending in '-_+').
all_regulons = df_reg.reset_index()['Consensus_name'].values
cond_pp = np.array([name.endswith('+_+') for name in all_regulons])
cond_mp = np.array([name.endswith('-_+') for name in all_regulons])

print(all_regulons.shape)
print(cond_pp.sum(), cond_mp.sum())

In [None]:
for reg_name in all_regulons[:2]:
    tf = reg_name.split('_')[0]
    reg = Regulon(tf, reg_name)
    reg.plot()

# all regulons 

In [None]:
from sklearn.cluster import KMeans

def mean_shape(vec):
    """Return the centre of mass of `vec` along its index axis.

    Negative entries are clipped to zero and the remainder is normalized to
    sum to one, so the result is the weighted mean position (0-based).
    """
    # (min-max scaling was the alternative; clipping at zero is what is used)
    weights = np.clip(vec, 0, None)
    weights = weights/np.sum(weights)

    positions = np.arange(len(vec))
    return positions.dot(weights)

def organize_zmat(zmat, fmat, redmat, title='', n_geneset_clsts=5, genes=None, timeorder_start=4, timeorder_end=None, n_init=10):
    """Cluster the rows of `zmat` with k-means and order clusters by timing.

    NOTE: THE ORDER OF CONDITIONS DIFFERS between (zmat, fmat) and redmat.
    `redmat` is sliced with `timeorder_start:timeorder_end` along its
    condition axis to skip its leading conditions, whereas `zmat`/`fmat`
    built by `transform_bigredmat` have those leading columns moved to the end.

    Args:
        zmat: (row, feature) matrix that is clustered.
        fmat: matrix with the same row order as `zmat`; only reordered.
        redmat: (cond, type, row) array used to rank clusters in time.
        title: stored unchanged in the result.
        n_geneset_clsts: number of k-means clusters.
        genes: row labels. Defaults to the row indices 0..n-1; a list or
            tuple is converted to an array so it can be fancy-indexed.
        timeorder_start, timeorder_end: slice of redmat's condition axis used
            for the time sketch.
        n_init: forwarded to KMeans.

    Returns:
        dict with keys 'title', 'order', 'zmat', 'fmat', 'genes', 'clst',
        'time_sketches', 'geneset_list'; rows are sorted by renamed cluster id,
        where cluster 0 has the earliest centre of mass (`mean_shape`).
    """
    # The original default (None) failed at `genes[geneset_order]`.
    if genes is None:
        genes = np.arange(len(zmat))
    elif isinstance(genes, (list, tuple)):
        genes = np.asarray(genes)

    method = KMeans(n_clusters=n_geneset_clsts, n_init=n_init, random_state=0)
    geneset_clst = method.fit_predict(zmat)

    # average over genes per geneset and cell clusters - leave genesets and conditions there
    time_sketches = []
    for i in range(n_geneset_clsts):
        time_sketch = np.mean(redmat[:,:,geneset_clst==i], axis=2) # mean over genes
        time_sketch = np.max(time_sketch, axis=1) # max over cell types
        time_sketches.append(time_sketch)
    time_sketches = np.vstack(time_sketches)[:,timeorder_start:timeorder_end] # n_geneset_clsts, n_cond (selected conditions only)

    # rank clusters by the centre of mass of their time sketch, then relabel
    # each row's cluster id by that rank
    clst_order = np.argsort([mean_shape(time_sketch) for time_sketch in time_sketches]) 
    geneset_clst_renamed = pd.Series({clst: i for i, clst in enumerate(clst_order)}).reindex(geneset_clst).values
    geneset_order = np.argsort(geneset_clst_renamed)
    
    # reorder 
    genes_ordered = genes[geneset_order]
    clsts_ordered = geneset_clst_renamed[geneset_order]
    zmat_ordered = zmat[geneset_order] 
    fmat_ordered = fmat[geneset_order] 
    
    # gene list per group
    geneset_list = [genes_ordered[clsts_ordered==i] for i in range(n_geneset_clsts)]
    
    res = {
        'title': title,
        'order': geneset_order,
        'zmat':  zmat_ordered,
        'fmat':  fmat_ordered,
        'genes': genes_ordered,
        'clst':  clsts_ordered,
        'time_sketches':  time_sketches[clst_order],
        'geneset_list': geneset_list,
    }
    return res



In [None]:
# Per-regulon profiles for every regulon: x = TF expression, y = mean over
# target genes, z = mean over target regions; once for the rank bins (nfd)
# and once for the A/B/C groups.
n_regu = len(all_regulons)
n_samp = len(todo_samps)
n_type = bigmat_nfd.shape[1] # 3 - ABC / 5 - cell type
n_type_abc = bigmat_abc.shape[1] # 3 - ABC / 5 - cell type

# (sample, group, regulon)
bigreg_x = np.zeros((n_samp,n_type,n_regu))
bigreg_y = np.zeros((n_samp,n_type,n_regu))
bigreg_z = np.zeros((n_samp,n_type,n_regu))

bigreg_abc_x = np.zeros((n_samp,n_type_abc,n_regu))
bigreg_abc_y = np.zeros((n_samp,n_type_abc,n_regu))
bigreg_abc_z = np.zeros((n_samp,n_type_abc,n_regu))

all_reg_tfs = []
all_reg_genes = []
all_reg_regions = []

for i, reg_name in enumerate(all_regulons):
    # TF name = text before the first underscore of the regulon name
    reg_tf = reg_name.split('_')[0]
    
    df_this_reg = df_scenic[df_scenic['Consensus_name']==reg_name]
    reg_genes = df_this_reg['Gene'].unique()
    reg_regions = df_this_reg['Region'].unique()

    #
    tf_idx   = basicu.get_index_from_array(genes, [reg_tf])[0]
    gene_idx  = basicu.get_index_from_array(genes, reg_genes)
    region_idx = basicu.get_index_from_array(regionset, reg_regions)
    # -1 is treated as "not found" (same convention as the two checks below);
    # without this check a missing TF would silently select the last gene.
    assert tf_idx != -1, f"TF {reg_tf!r} not found in `genes`"
    assert np.all(region_idx != -1)
    assert np.all(gene_idx != -1)
    
    all_reg_tfs.append(reg_tf)
    all_reg_genes.append(reg_genes)
    all_reg_regions.append(reg_regions)

    bigreg_x[:,:,i] = bigmat_nfd[:,:,tf_idx]
    bigreg_y[:,:,i] = np.mean(bigmat_nfd[:,:,gene_idx], axis=-1)
    bigreg_z[:,:,i] = np.mean(bigmatatac_nfd[:,:,region_idx], axis=-1)
    
    # ABC
    bigreg_abc_x[:,:,i] = bigmat_abc[:,:,tf_idx]
    bigreg_abc_y[:,:,i] = np.mean(bigmat_abc[:,:,gene_idx], axis=-1)
    bigreg_abc_z[:,:,i] = np.mean(bigmatatac_abc[:,:,region_idx], axis=-1)
    

print(bigreg_x.shape, bigreg_y.shape, bigreg_z.shape)
print(bigreg_abc_x.shape, bigreg_abc_y.shape, bigreg_abc_z.shape)

# ++ regulons

In [None]:
# Select the '+_+' regulons, average replicate samples into conditions, then
# flatten to (regulon, cond*type) matrices: raw values (freg_*) and row-wise
# z-scores (zreg_*).
regulons = all_regulons[cond_pp]
redreg_x = mean_over_samples(bigreg_x)[:,:,cond_pp]
redreg_y = mean_over_samples(bigreg_y)[:,:,cond_pp] 
redreg_z = mean_over_samples(bigreg_z)[:,:,cond_pp] 

freg_x, zreg_x = transform_bigredmat(redreg_x)
freg_y, zreg_y = transform_bigredmat(redreg_y)
freg_z, zreg_z = transform_bigredmat(redreg_z)

print(redreg_x.shape)   # cond, type, gene
print(freg_x.shape)   # gene, cond*type
print(zreg_x.shape)   # gene, cond*type

In [None]:
res_y = organize_zmat(np.hstack([zreg_y, zreg_x, zreg_z,]),
                      np.hstack([freg_y, freg_x, freg_z,]),
                      # np.stack([redreg_y, redreg_z, redreg_x,], axis=0),
                      redreg_y, # determine the order
                      title='combined', 
                      genes=regulons, 
                      n_geneset_clsts=7,
                      timeorder_start=4,
                      timeorder_end=None, # 4+7,
                      n_init=20,
                     )

In [None]:
order = res_y['order']
title = res_y['title']
zmat  = res_y['zmat']
clsts = res_y['clst']
genes_this = res_y['genes']

In [None]:
# Three side-by-side heatmaps (TF, target genes, target regions) with rows in
# the clustered order `order`; saved as heatmap_pp_regulons.pdf.
fig, axs = plt.subplots(1,3, figsize=(5*3,12))
for i, zmat_this in enumerate([zreg_x, zreg_y, zreg_z]):
    ax = axs[i]
    zmat_this = zmat_this[order]
    title = ['TF', 'Genes', 'Regions'][i]
    # regulon names are shown on the first panel only
    if i == 0:
        yticklabels=regulons[order]
    else:
        yticklabels=False
        
    sns.heatmap(zmat_this, cmap='coolwarm', cbar_kws=dict(shrink=0.5, orientation='horizontal', pad=0.05), 
                yticklabels=yticklabels,
                vmax=3, vmin=-3,
                rasterized=True,
                ax=ax)
    # only the first `n_type` columns get tick labels
    ax.set_xticks(0.5+np.arange(n_type))
    # ax.set_yticks(0.5+np.arange(len(zmat)))
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=8, rotation=0)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=8, rotation=0)
    ax.set_title(title, pad=30)
    
    # white horizontal lines at cumulative cluster sizes separate the clusters;
    # white vertical lines every 5 columns separate the conditions
    ax.hlines(np.cumsum(np.unique(clsts, return_counts=True)[1]), 0, 55, color='white', linewidth=1)
    ax.vlines(np.arange(0,55,5), 0, len(zmat), color='white', linewidth=1)
    # ax.vlines(3*5, 0, len(zmat), color='black', linewidth=1)
    # black line after the first 7 condition blocks (todo_conds[4:]), i.e.
    # before the 4 blocks of todo_conds[:4]
    ax.vlines(7*5, 0, len(zmat), color='black', linewidth=1)

    ax.grid(False)
    # condition names above each 5-column block
    # NOTE: this inner loop rebinds `i` and `cond`; `i` is not read again
    # before the outer loop reassigns it.
    for i, cond in enumerate(np.hstack([todo_conds[4:], todo_conds[:4]])):
        # ax.axvline(condcode*5, color='k', linestyle='--', linewidth=1)
        ax.text(i*5, -0.5, f'{cond}', fontsize=8, va='bottom')

fig.tight_layout()
output = os.path.join(outdirfig, f'heatmap_pp_regulons.pdf')
powerplots.savefig_autodate(fig, output)
plt.show()

# -+ regulons

In [None]:
# Same preparation as for the '+_+' regulons, now for the '-_+' mask.
# NOTE: this rebinds `regulons`, `redreg_*`, `freg_*` and `zreg_*`; any later
# cell reading these names sees the '-_+' selection.
regulons = all_regulons[cond_mp]
redreg_x = mean_over_samples(bigreg_x)[:,:,cond_mp]
redreg_y = mean_over_samples(bigreg_y)[:,:,cond_mp] 
redreg_z = mean_over_samples(bigreg_z)[:,:,cond_mp] 

freg_x, zreg_x = transform_bigredmat(redreg_x)
freg_y, zreg_y = transform_bigredmat(redreg_y)
freg_z, zreg_z = transform_bigredmat(redreg_z)

print(redreg_x.shape)   # cond, type, gene
print(freg_x.shape)   # gene, cond*type
print(zreg_x.shape)   # gene, cond*type

In [None]:
res_y = organize_zmat(np.hstack([zreg_y, zreg_x, zreg_z,]),
                      np.hstack([freg_y, freg_x, freg_z,]),
                      # np.stack([redreg_y, redreg_z, redreg_x,], axis=0),
                      redreg_y, # determine the order
                      title='combined', 
                      genes=regulons, 
                      n_geneset_clsts=8,
                      timeorder_start=4,
                      timeorder_end=None, # 4+7,
                      n_init=20,
                     )

In [None]:
order = res_y['order']
title = res_y['title']
zmat  = res_y['zmat']
clsts = res_y['clst']
genes_this = res_y['genes']

In [None]:
# Three side-by-side heatmaps (TF, target genes, target regions) with rows in
# the clustered order `order`; saved as heatmap_mp_regulons.pdf.
fig, axs = plt.subplots(1,3, figsize=(5*3,12))
for i, zmat_this in enumerate([zreg_x, zreg_y, zreg_z]):
    ax = axs[i]
    zmat_this = zmat_this[order]
    title = ['TF', 'Genes', 'Regions'][i]
    # regulon names are shown on the first panel only
    if i == 0:
        yticklabels=regulons[order]
    else:
        yticklabels=False
        
    sns.heatmap(zmat_this, cmap='coolwarm', cbar_kws=dict(shrink=0.5, orientation='horizontal', pad=0.05), 
                yticklabels=yticklabels,
                vmax=3, vmin=-3,
                rasterized=True,
                ax=ax)
    # only the first `n_type` columns get tick labels
    ax.set_xticks(0.5+np.arange(n_type))
    # ax.set_yticks(0.5+np.arange(len(zmat)))
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=8, rotation=0)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=8, rotation=0)
    ax.set_title(title, pad=30)
    
    # white horizontal lines at cumulative cluster sizes separate the clusters;
    # white vertical lines every 5 columns separate the conditions
    ax.hlines(np.cumsum(np.unique(clsts, return_counts=True)[1]), 0, 55, color='white', linewidth=1)
    ax.vlines(np.arange(0,55,5), 0, len(zmat), color='white', linewidth=1)
    # ax.vlines(3*5, 0, len(zmat), color='black', linewidth=1)
    # black line after the first 7 condition blocks (todo_conds[4:]), i.e.
    # before the 4 blocks of todo_conds[:4]
    ax.vlines(7*5, 0, len(zmat), color='black', linewidth=1)

    ax.grid(False)
    # condition names above each 5-column block
    # NOTE: this inner loop rebinds `i` and `cond`; `i` is not read again
    # before the outer loop reassigns it.
    for i, cond in enumerate(np.hstack([todo_conds[4:], todo_conds[:4]])):
        # ax.axvline(condcode*5, color='k', linestyle='--', linewidth=1)
        ax.text(i*5, -0.5, f'{cond}', fontsize=8, va='bottom')

fig.tight_layout()
output = os.path.join(outdirfig, f'heatmap_mp_regulons.pdf')
powerplots.savefig_autodate(fig, output)
plt.show()

# Average profiles 

In [None]:
# Cluster-average heatmaps: one row per cluster (M1, M2, ...), one panel each
# for TF, target genes and target regions.
fig, axs = plt.subplots(1,3, figsize=(6*3,5))
for i, zmat_this in enumerate([zreg_x, zreg_y, zreg_z]):
    ax = axs[i]
    zmat_this = zmat_this[order]
    title = ['TF', 'Genes', 'Regions'][i]
    # module names are shown on the first panel only
    if i == 0:
        yticklabels=[f"M{m+1}" for m in range(1+np.max(clsts))] #regulons[order]
    else:
        yticklabels=False
        
    # mean z-score profile of the rows in each cluster
    ctrds = np.array([np.mean(zmat_this[clsts==m], axis=0) for m in range(1+np.max(clsts))])
    sns.heatmap(ctrds, cmap='coolwarm', cbar_kws=dict(shrink=0.5, orientation='horizontal', pad=0.05), 
                yticklabels=yticklabels,
                vmax=3, vmin=-3,
                rasterized=True,
                ax=ax)
    ax.set_xticks(0.5+np.arange(n_type))
    ax.set_yticks(0.5+np.arange(len(ctrds)))
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=10, rotation=0)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=10, rotation=0)
    ax.set_title(title, pad=30)
    
    # Rows are clusters here, so dividers go between consecutive rows. (The
    # cumulative cluster sizes used for the per-regulon heatmaps fall outside
    # or at the wrong place of this much shorter axis.)
    ax.hlines(np.arange(1, len(ctrds)), 0, 55, color='white', linewidth=1)
    ax.vlines(np.arange(0,55,5), 0, len(ctrds), color='white', linewidth=1)
    ax.vlines(7*5, 0, len(ctrds), color='black', linewidth=1)

    ax.grid(False)
    # condition names above each 5-column block (separate index so the outer
    # loop variable is not clobbered)
    for k, cond in enumerate(np.hstack([todo_conds[4:], todo_conds[:4]])):
        # ax.axvline(condcode*5, color='k', linestyle='--', linewidth=1)
        ax.text(k*5, -0.5, f'{cond}', fontsize=10, va='bottom')

fig.tight_layout()
# output = os.path.join(outfigdir, f'heatmap_{title[0]}.pdf')
# powerplots.savefig_autodate(fig, output)
plt.show()

In [None]:
# save this results
# `genes_this` and `clsts` come from the same clustering result, so names and
# cluster ids always line up. Re-deriving the names as
# all_regulons[cond_pp][order] is only correct while `order` still belongs to
# the '+_+' clustering, which is no longer true once the '-_+' cells have run.
df_out = pd.DataFrame()
df_out['regulon'] = genes_this
df_out['clst'] = clsts
df_out


In [None]:
# output = '/u/home/f/f7xiesnm/v1_multiome/res/regulon_clusters_250515.csv'
# df_out.to_csv(output)

In [None]:
# Time courses of the A/B/C groups averaged over the regulons of one cluster
# ("module"), for TFs, target genes and target regions.
# NOTE(review): the `cond_pp` mask is combined with `order` / `clsts`; this is
# only consistent if those two still come from the '+_+' clustering — confirm
# the execution order before trusting the figure.
for j in range(np.max(clsts)+1):
    
    # mean over the regulons of cluster j -> (sample, group)
    plot_x = np.mean(bigreg_abc_x[:,:,cond_pp][:,:,order][:,:,clsts==j], axis=2)
    plot_y = np.mean(bigreg_abc_y[:,:,cond_pp][:,:,order][:,:,clsts==j], axis=2)
    plot_z = np.mean(bigreg_abc_z[:,:,cond_pp][:,:,order][:,:,clsts==j], axis=2)

    redplot_x = mean_over_samples(plot_x)
    redplot_y = mean_over_samples(plot_y)
    redplot_z = mean_over_samples(plot_z)
    
    # TFs / gene lists / region lists of the regulons in this cluster
    x_elements = np.array(all_reg_tfs, dtype=object)[cond_pp][order][clsts==j]
    y_elements = np.array(all_reg_genes, dtype=object)[cond_pp][order][clsts==j]
    z_elements = np.array(all_reg_regions, dtype=object)[cond_pp][order][clsts==j]
    
    # number of distinct TFs / genes / regions, shown in the panel titles
    num_x = len(np.unique(np.hstack(x_elements)))
    num_y = len(np.unique(np.hstack(y_elements)))
    num_z = len(np.unique(np.hstack(z_elements)))
    
    fig, axs = plt.subplots(1, 3, figsize=(3*3,1*4), sharex=True) #, sharey=True)
    for i in range(3):
        ax = axs[i]
        bigmat_mean_ig = [plot_x, plot_y, plot_z][i]
        redmat_mean_ig = [redplot_x, redplot_y, redplot_z][i]

        # samples 8.. (non-DR): open circles
        ax.plot(todo_samps_t[8:], bigmat_mean_ig[8:,0], 'o', markersize=5, fillstyle='none', color='C0')
        ax.plot(todo_samps_t[8:], bigmat_mean_ig[8:,1], 'o', markersize=5, fillstyle='none', color='C1')
        ax.plot(todo_samps_t[8:], bigmat_mean_ig[8:,2], 'o', markersize=5, fillstyle='none', color='C2')

        # samples 0..7 (DR): open squares, half transparent
        ax.plot(todo_samps_t[:8], bigmat_mean_ig[:8,0], 's', markersize=5, fillstyle='none', color='C0', alpha=0.5)
        ax.plot(todo_samps_t[:8], bigmat_mean_ig[:8,1], 's', markersize=5, fillstyle='none', color='C1', alpha=0.5)
        ax.plot(todo_samps_t[:8], bigmat_mean_ig[:8,2], 's', markersize=5, fillstyle='none', color='C2', alpha=0.5)

        # condition means 4.. (non-DR): solid lines
        ax.plot(todo_conds_t[4:], redmat_mean_ig[4:,0], '-', color='C0')
        ax.plot(todo_conds_t[4:], redmat_mean_ig[4:,1], '-', color='C1')
        ax.plot(todo_conds_t[4:], redmat_mean_ig[4:,2], '-', color='C2')

        # condition means 0..3 (DR): half-transparent lines
        ax.plot(todo_conds_t[:4], redmat_mean_ig[:4,0], '-', color='C0', alpha=0.5)
        ax.plot(todo_conds_t[:4], redmat_mean_ig[:4,1], '-', color='C1', alpha=0.5)
        ax.plot(todo_conds_t[:4], redmat_mean_ig[:4,2], '-', color='C2', alpha=0.5)

        ax.grid(False)
        ax.set_xticks([6,10,14,17,21])
        ax.set_xticklabels(["P6","10","14","17","21"])
        sns.despine(ax=ax)

    # axs[0].set_xlabel('P')
    axs[0].set_ylabel('log2(FC in CPM)')
    axs[0].set_title(f'M{j+1} TFs (n={num_x})', fontsize=15)
    axs[1].set_title(f'M{j+1} Genes (n={num_y})', fontsize=15)
    axs[2].set_title(f'M{j+1} Regions (n={num_z})', fontsize=15)
    # output = os.path.join(outfigdir, 'abc_degs_signals_over_time_withDR.pdf') 
    # powerplots.savefig_autodate(fig, output)
    fig.tight_layout()
    plt.show()
    # only the first cluster is drawn; remove the break to plot all of them
    break

In [None]:
# Regulons of cluster 7, named from the same clustering result as `clsts`.
# (Indexing all_regulons[cond_pp] with `order` mixes the '+_+' mask with
# whatever clustering `order` currently belongs to; the '+_+' run uses 7
# clusters, so a cluster 7 only exists for the 8-cluster '-_+' run.)
selected_regulons = genes_this[clsts==7]
for reg_name in selected_regulons:
    tf = reg_name.split('_')[0]
    reg = Regulon(tf, reg_name)
    reg.plot()
    # preview the first regulon only
    break

In [None]:
# Panel figure for a hand-picked list of '+_+' regulons: one column per
# regulon, rows = TF / target genes / target regions (the three axes that
# `Regulon.plot` fills).
# selected_regulons = [reg for reg in all_regulons if reg.startswith('Egr')]
# selected_regulons = [reg for reg in all_regulons if reg.startswith('Npas4') or reg.startswith('Arnt2')]
# selected_regulons = ['Meis2_+_+', 'Satb1_+_+', 'Tcf12_+_+', 'Sox5_+_+', 'Npas4_+_+', 'Fos_+_+', ] #'Egr1_+_+', 'Atf6_+_+']
selected_regulons = ['Meis2_+_+', 'Rfx3_+_+', 'Tcf12_+_+', 'Jdp2_+_+', 'Fosl2_+_+', 'Npas4_+_+', 'Satb2_+_+', 'Nfatc2_+_+', 'Ar_+_+', ]
n = len(selected_regulons)

fig, axss = plt.subplots(3, n, figsize=(n*3,3*4))
for j, reg_name in enumerate(selected_regulons):
    tf = reg_name.split('_')[0]
    reg = Regulon(tf, reg_name)
    # output = os.path.join(outdirfig, f'regulon_{reg_name}.pdf') 
    # per-regulon saving is disabled; the whole panel is saved once below
    output = None
    reg.plot(fig=fig, axs=axss[:,j], output=output)
        
# keep the y label on the first column only
for ax in axss.flat:
    ax.set_ylabel('')
for ax in axss[:,0]:
    ax.set_ylabel('log2(FC)')
    
fig.tight_layout()
output = os.path.join(outdirfig, f'regulon_dynamics_activators_figs5.pdf') 
powerplots.savefig_autodate(fig, output)

In [None]:

# Panel figure for a hand-picked list of '-_+' regulons: one column per
# regulon, rows = TF / target genes / target regions (the three axes that
# `Regulon.plot` fills).
# selected_regulons = [reg for reg in all_regulons if reg.startswith('Egr')]
# selected_regulons = [reg for reg in all_regulons if reg.startswith('Npas4') or reg.startswith('Arnt2')]
# selected_regulons = ['Meis2_+_+', 'Satb1_+_+', 'Tcf12_+_+', 'Sox5_+_+', 'Npas4_+_+', 'Fos_+_+', ] #'Egr1_+_+', 'Atf6_+_+']
selected_regulons = ['Meis2_-_+', 'Nfib_-_+', 'Tcf12_-_+', 'Satb1_-_+', 'Fos_-_+', 'Arnt2_-_+', 'Satb2_-_+', 'Cux2_-_+', 'Ar_-_+', ]
n = len(selected_regulons)

fig, axss = plt.subplots(3, n, figsize=(n*3,3*4))
for j, reg_name in enumerate(selected_regulons):
    tf = reg_name.split('_')[0]
    reg = Regulon(tf, reg_name)
    # output = os.path.join(outdirfig, f'regulon_{reg_name}.pdf') 
    # per-regulon saving is disabled; the whole panel is saved once below
    output = None
    reg.plot(fig=fig, axs=axss[:,j], output=output)
        
# keep the y label on the first column only
for ax in axss.flat:
    ax.set_ylabel('')
for ax in axss[:,0]:
    ax.set_ylabel('log2(FC)')
    
fig.tight_layout()
output = os.path.join(outdirfig, f'regulon_dynamics_repressor_figs6.pdf') 
powerplots.savefig_autodate(fig, output)

In [None]:
# MP regulons
# Clustering of the '-_+' regulons, stored under `_mp` names so that
# `order` / `clsts` / `genes_this` are not touched by this cell.
# NOTE: `regulons`, `redreg_*`, `freg_*` and `zreg_*` ARE rebound here to the
# '-_+' selection.
regulons = all_regulons[cond_mp]
redreg_x = mean_over_samples(bigreg_x)[:,:,cond_mp] 
redreg_y = mean_over_samples(bigreg_y)[:,:,cond_mp] 
redreg_z = mean_over_samples(bigreg_z)[:,:,cond_mp] 

freg_x, zreg_x = transform_bigredmat(redreg_x)
freg_y, zreg_y = transform_bigredmat(redreg_y)
freg_z, zreg_z = transform_bigredmat(redreg_z)

print(redreg_x.shape)   # cond, type, gene
print(freg_x.shape)   # gene, cond*type
print(zreg_x.shape)   # gene, cond*type

# Cluster on the concatenated z-scores (genes, TF, regions); the cluster
# order in time is decided from the target-gene profiles (`redreg_y`).
res_y_mp = organize_zmat(
                      np.hstack([zreg_y, zreg_x, zreg_z,]),
                      np.hstack([freg_y, freg_x, freg_z,]),
                      # np.stack([redreg_y, redreg_z, redreg_x,], axis=0),
                      redreg_y, # determine the order
                      title='combined', 
                      genes=regulons, 
                      n_geneset_clsts=8,
                      timeorder_start=4,
                      timeorder_end=None, # 4+7,
                      n_init=20,
                     )

order_mp = res_y_mp['order']
title_mp = res_y_mp['title']
zmat_mp  = res_y_mp['zmat']
clsts_mp = res_y_mp['clst']
genes_this_mp = res_y_mp['genes']

# Temporal vs DR changes

In [None]:
from sklearn.metrics import r2_score

In [None]:
def get_delt_delv(big_y):
    """Reduce one feature to a pair of contrasts ``(del_t, del_v)``.

    ``big_y`` is first collapsed with ``mean_over_samples`` (original note:
    ``big_y.shape = (25,)``); the result is then indexed by condition.

    Returns
    -------
    del_t
        Mean of ``red_y[10] - red_y[6]``.
    del_v
        Mean over four paired contrasts ``red_y[k] - red_y[7 + k]``,
        k = 3, 2, 1, 0.

    NOTE(review): downstream plots label ``del_t`` as log2(P21/P10) and
    ``del_v`` as log2(DR/NR), so indices 0-3 are presumably the DR
    conditions matching indices 7-10 -- confirm against the condition order.
    """
    red_y = mean_over_samples(big_y)

    # single contrast between condition 7+3 and condition 3+3
    del_t = np.mean(red_y[7+3] - red_y[3+3])

    # one contrast per pair (k, 7+k), listed from k=3 down to k=0
    del_varr = np.array(
        [np.mean(-red_y[7+k] + red_y[k]) for k in (3, 2, 1, 0)]
    )
    # average the four contrasts (a max-|value| summary was tried and dropped)
    del_v = np.mean(del_varr)

    return del_t, del_v
    



In [None]:
# Shapes of the inputs, for a quick visual check.
print(bigmat_abc.shape, bigmatatac_abc.shape)
print(bigreg_abc_x.shape, bigreg_abc_y.shape, bigreg_abc_z.shape)

nreg = len(all_regulons)
print(nreg)

# dels[k, 0, i] = del_t and dels[k, 1, i] = del_v of regulon i,
# with k = 0, 1, 2 for the x, y, z matrices.
dels = np.zeros((3, 2, nreg))  # XYZ, TV, NREG
for i in range(nreg):
    for k, bigreg in enumerate((bigreg_abc_x, bigreg_abc_y, bigreg_abc_z)):
        # average over axis 1 before computing the two contrasts
        ftrs = np.mean(bigreg[:, :, i], axis=1)
        dels[k, :, i] = get_delt_delv(ftrs)

dels.shape

In [None]:
# 8 colours taken from seaborn's 'tab20' palette; handed to plot_del as the
# per-cluster colours.
palette = sns.color_palette('tab20', 8)
palette

In [None]:
def plot_del(cond=None, order=None, clsts=None, palette=None, title=None):
    """Scatter ``dels[i, 0]`` against ``dels[i, 1]`` for each regulon layer.

    One panel is drawn per row of the global ``dels`` array (titled 'TFs',
    'Genes', 'Regions').  Every regulon is drawn in light gray together with
    a least-squares line; the Spearman correlation goes into the panel title.
    When ``cond`` is given, the selected regulons are drawn on top, coloured
    by cluster, and those at the x extremes are labelled with their name
    minus the last 4 characters.

    Parameters
    ----------
    cond : optional
        Selector applied to ``dels[i, 0]``, ``dels[i, 1]`` and
        ``all_regulons`` (callers pass ``cond_pp`` / ``cond_mp``).  ``None``
        draws the gray background only.
    order : optional
        Reordering applied after ``cond``.  Needed when ``cond`` is given.
    clsts : optional
        Cluster id of each selected regulon, aligned with the reordered
        selection; cluster ``j`` is labelled ``M{j+1}``.  Needed when
        ``cond`` is given.
    palette : sequence of colours, optional
        One colour per cluster id; clusters ``0 .. len(palette)-1`` are
        drawn.  Defaults to 8 'tab20' colours.
    title : str, optional
        Inserted into the figure title; a generic title is used when ``None``.
    """
    titles = ['TFs', 'Genes', 'Regions']

    if cond is not None and palette is None:
        # without this fallback palette[j] would fail on None
        palette = sns.color_palette('tab20', 8)

    fig, axs = plt.subplots(1,3,figsize=(6*3,6))
    for i, ax in enumerate(axs):
        _x = dels[i,0]
        _y = dels[i,1]
        xmin = np.min(_x)
        xmax = np.max(_x)
        r, _ = stats.spearmanr(_x, _y)
        slope, intercept = np.polyfit(_x, _y, 1)
        xbase = np.linspace(xmin, xmax,5)
        ybase = slope*xbase + intercept
        ax.scatter(_x, _y, s=5, color='lightgray')

        if cond is not None:
            _x = dels[i,0][cond][order]
            _y = dels[i,1][cond][order]
            _t = all_regulons[cond][order]

            # one scatter call per cluster so each gets its own colour and label
            for j, color in enumerate(palette):
                in_clst = clsts==j
                ax.scatter(_x[in_clst],
                           _y[in_clst],s=10, label=f'M{j+1}', color=color)

            # label the lowest 5% and the top 30% of the full x range
            cond_text = np.logical_or(
                (_x < xmin+(xmax-xmin)*0.05),
                (_x > xmin+(xmax-xmin)*0.70),
            )
            for __x, __y, __t in zip(
                _x[cond_text]+0.1,  # nudge the text right of its marker
                _y[cond_text],
                _t[cond_text],
            ):
                ax.text(__x, __y, __t[:-4], fontsize=10)

        ax.plot(xbase, ybase, '--k', linewidth=1, zorder=0)
        ax.axvline(0, color='gray',  linewidth=1, zorder=0)
        ax.axhline(0, color='gray',  linewidth=1, zorder=0)
        ax.grid(False)
        sns.despine(ax=ax)
        ax.set_ylabel('log2(DR/NR)')
        ax.set_xlabel('log2(P21/P10)')
        ax.set_title(f'{titles[i]} r={r:.2f}', fontsize=15)

    if title is None:
        fig.suptitle('Regulons (TF, Genes, Regions)', fontsize=14)
    else:
        fig.suptitle(f'Regulons ({title})', fontsize=14)

    fig.tight_layout()
    plt.show()

In [None]:
# '++' regulons; `order` / `clsts` are defined outside this section
# (presumably the '++' counterparts of order_mp / clsts_mp -- confirm).
plot_del(cond_pp, order, clsts, palette, '++')
# '-+' regulons, ordered and coloured with the organize_zmat result for cond_mp.
plot_del(cond_mp, order_mp, clsts_mp, palette, '-+')

# Combined positive and negative 

In [None]:
# '++' and '-+' regulons in one scatter per layer (row of `dels`):
# x = dels[i, 0], y = dels[i, 1].
titles = ['TFs', 'Genes', 'Regions']
s=30

fig, axs = plt.subplots(1,3,figsize=(5*3,6))
for i, ax in enumerate(axs):
    _x = dels[i,0]
    _y = dels[i,1]
    _t = all_regulons

    # rank correlation for the title, least-squares line for the overlay
    r, _ = stats.spearmanr(_x, _y)
    slope, intercept = np.polyfit(_x, _y, 1)

    xmin = np.min(_x)
    xmax = np.max(_x)
    # draw the fitted line 5% of the x range beyond both ends
    xbase = np.linspace(xmin-0.05*(xmax-xmin), xmax+0.05*(xmax-xmin),5)
    ybase = slope*xbase + intercept
    # r2_score takes (y_true, y_pred): score the fit against y, not x
    r2 = r2_score(_y, _x*slope+intercept)

    ax.scatter(_x[cond_pp], _y[cond_pp], color='C1', s=s, facecolor='none', marker='o', label='++')
    ax.scatter(_x[cond_mp], _y[cond_mp], color='k', s=s, facecolor='none', marker='s', label='-+')

    ax.plot(xbase, ybase, '--', color='k', linewidth=2)
    ax.axvline(0, color='gray',  linewidth=1, zorder=0)
    ax.axhline(0, color='gray',  linewidth=1, zorder=0)
    ax.grid(False)
    sns.despine(ax=ax)
    ax.set_xlabel('log2(P21/P10)')
    ax.set_title(f'{titles[i]}\ny={slope:.2f}x+({intercept:.2f}); r={r:.2f}', fontsize=15)

    ax.legend()

# the y label is shown on the first panel only
axs[0].set_ylabel('log2(DR/NR)')

fig.suptitle('Regulons (TF, Genes, Regions)', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# TF names per highlight group; groups a / b / c map to colours C0 / C1 / C2.
highlights_b = [
    'Npas4', 'Arnt2', 'Fos', 'Fosl2', 'Fosb',
    'Jun', 'Junb', 'Egr1', 'Egr2',
]
highlights_a = ['Meis2', 'Nfib', 'Rfx3']
highlights_c = ['Tcf12', 'Satb1', 'Jdp2']
# TFs whose names are written next to the plot
highlights_text = ['Npas4', 'Fosb', 'Fos', 'Fosl2', 'Junb', 'Smad3']


def _tf_in(names):
    """Mask over all_regulons: regulon name minus its last 4 chars is in `names`."""
    return np.array([reg[:-4] in names for reg in all_regulons])


cond_text = _tf_in(highlights_text)
cond_a = _tf_in(highlights_a)
cond_b = _tf_in(highlights_b)
cond_c = _tf_in(highlights_c)
# group code per regulon: 0 = none, 1 = a, 2 = b, 3 = c
cond_abc = 1*cond_a + 2*cond_b + 3*cond_c

_group_colors = np.array(['gray', 'C0', 'C1', 'C2'])
colors = _group_colors[cond_abc.astype(int)]

titles = ['TFs', 'Target Genes', 'Target Regions']
s = 30

# One panel per layer (row of `dels`); markers take the per-regulon entries
# of `colors`, and highlighted TF names are written at the right margin.
fig, axs = plt.subplots(1,3,figsize=(5*3,6))
for i, ax in enumerate(axs):
    _x = dels[i,0]
    _y = dels[i,1]
    _t = all_regulons

    # rank correlation for the title, least-squares line for the overlay
    r, _ = stats.spearmanr(_x, _y)
    slope, intercept = np.polyfit(_x, _y, 1)

    xmin = np.min(_x)
    xmax = np.max(_x)
    # draw the fitted line 5% of the x range beyond both ends
    xbase = np.linspace(xmin-0.05*(xmax-xmin), xmax+0.05*(xmax-xmin),5)
    ybase = slope*xbase + intercept
    # r2_score takes (y_true, y_pred): score the fit against y, not x
    r2 = r2_score(_y, _x*slope+intercept)

    ax.scatter(_x[cond_pp], _y[cond_pp], color=colors[cond_pp], s=s, facecolor='none', marker='o', label='++')
    ax.scatter(_x[cond_mp], _y[cond_mp], color=colors[cond_mp], s=s, facecolor='none', marker='s', label='-+')

    # names go at a fixed x just right of the data, at each point's y
    for __x, __y, __t in zip(
        _x[cond_text],
        _y[cond_text],
        _t[cond_text],
    ):
        ax.text(xmax+0.05*(xmax-xmin), __y, __t[:-4], fontsize=10)

    ax.plot(xbase, ybase, '--', color='k', linewidth=2)
    ax.axvline(0, color='gray',  linewidth=1, zorder=0)
    ax.axhline(0, color='gray',  linewidth=1, zorder=0)
    ax.grid(False)
    sns.despine(ax=ax)
    ax.set_xlabel('log2(P21/P10)')
    ax.set_title(f'{titles[i]}\ny={slope:.2f}x+({intercept:.2f}); r={r:.2f}', fontsize=15)

    ax.legend()

# the y label is shown on the first panel only
axs[0].set_ylabel('log2(DR/NR)')

fig.suptitle('Regulons (TF, Genes, Regions)', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# DR/NR change of each TF (x = dels[0, 1]) against the DR/NR change of its
# target genes (y = dels[1, 1]).
titles = ['TFs', 'Genes', 'Regions']
# Marker size is set here so the figure does not depend on which cell last
# assigned `s`; 30 is the value left by the preceding cell in top-down order.
s = 30

fig, ax = plt.subplots(1,1,figsize=(6*1,6))
_x = dels[0,1]
_y = dels[1,1]
_t = all_regulons

# label the regulons in the bottom and top 10% of y
cond_text = np.logical_or(
    _y < np.percentile(_y, 10),
    _y > np.percentile(_y, 90)
)
for __x, __y, __t in zip(
    _x[cond_text]+0.02,  # nudge the text right of its marker
    _y[cond_text],
    _t[cond_text],
):
    ax.text(__x, __y, __t[:-4], fontsize=10)


ax.scatter(_x[cond_pp], _y[cond_pp], color='C1', s=s, facecolor='none', marker='o', label='++')
ax.scatter(_x[cond_mp], _y[cond_mp], color='k', s=s, facecolor='none', marker='s', label='-+')

ax.axvline(0, color='gray',  linewidth=1, zorder=0)
ax.axhline(0, color='gray',  linewidth=1, zorder=0)
ax.grid(False)
sns.despine(ax=ax)
ax.set_xlabel('log2(DR/NR) TF')
ax.set_ylabel('log2(DR/NR) gene')

fig.suptitle('Regulons (TF, Genes, Regions)', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
def plot_del_highlight(cond_sel, marker):
    """Scatter all regulons in gray and overlay a coloured subset.

    One panel is drawn per row of the global ``dels`` array (titled 'TFs',
    'Genes', 'Regions') with x = ``dels[i, 0]`` and y = ``dels[i, 1]``.
    The '++' and '-+' regulons form a light-gray background; the regulons
    selected by ``cond_sel`` are drawn on top with a black edge and a
    'coolwarm' fill given by ``dels[0, 0]`` (the x value of the first panel).
    Selected regulons below the 10th or above the 95th percentile of the
    selected y values are labelled with their name minus the last 4 chars.

    Parameters
    ----------
    cond_sel
        Selector applied to the ``dels`` rows and to ``all_regulons``
        (callers pass ``cond_pp`` / ``cond_mp``).
    marker
        Matplotlib marker used for the highlighted points.
    """
    titles = ['TFs', 'Genes', 'Regions']
    s = 100
    _t = all_regulons
    _x0 = dels[0,0]  # colour values, identical for all three panels

    fig, axs = plt.subplots(1,3,figsize=(5*3,5))
    for i, ax in enumerate(axs):
        _x = dels[i,0]
        _y = dels[i,1]

        # least-squares line over all regulons, spanning the x range
        slope, intercept = np.polyfit(_x, _y, 1)
        xbase = np.linspace(np.min(_x), np.max(_x),5)
        ybase = slope*xbase + intercept

        ax.scatter(_x[cond_pp], _y[cond_pp], color='lightgray', s=s, facecolor='none', marker='o', label='++')
        ax.scatter(_x[cond_mp], _y[cond_mp], color='lightgray', s=s, facecolor='none', marker='s', label='-+')

        # `c` sets the face colours, so no facecolor is passed: matplotlib
        # gives `c` precedence and ignores a facecolor given alongside it
        ax.scatter(_x[cond_sel], _y[cond_sel], c=_x0[cond_sel], s=s, marker=marker, cmap='coolwarm', edgecolor='k')

        _ysel = _y[cond_sel]
        cond_text = np.logical_or(
            _ysel < np.percentile(_ysel, 10),
            _ysel > np.percentile(_ysel, 95)
        )
        for __x, __y, __t in zip(
            _x[cond_sel][cond_text]+0.1,  # nudge the text right of its marker
            _ysel[cond_text],
            _t[cond_sel][cond_text],
        ):
            ax.text(__x, __y, __t[:-4], fontsize=10)

        ax.plot(xbase, ybase, '--k', linewidth=1, zorder=0)
        ax.axvline(0, color='gray',  linewidth=1, zorder=0)
        ax.axhline(0, color='gray',  linewidth=1, zorder=0)
        ax.grid(False)
        sns.despine(ax=ax)
        ax.set_xlabel('log2(P21/P10)')
        ax.set_title(f'{titles[i]}')

    # the y label is shown on the first panel only
    axs[0].set_ylabel('log2(DR/NR)')

    fig.tight_layout()
    plt.show()

In [None]:
# Highlight the '++' regulons with circles, then the '-+' regulons with
# squares -- the same markers each group has in the gray background.
plot_del_highlight(cond_pp, 'o')
plot_del_highlight(cond_mp, 's')

# Color by ABC regulon list

In [None]:
# Activator regulons (Odds ratio > 2), one list per group a / b / c.
# A regulon may appear in more than one group.
aps = [
    'Arid5b_+_+', 'Meis2_+_+', 'Nfib_+_+', 'Pbx1_+_+', 'Rfx3_+_+',
    'Sox5_+_+', 'Tcf4_+_+', 'Barx2_+_+', 'Etv6_+_+', 'Foxo1_+_+',
    'Nfatc2_+_+', 'Nfia_+_+', 'Tcfl5_+_+', 'Zbtb20_+_+',
]
bps = [
    'Egr1_+_+', 'Egr4_+_+', 'Fos_+_+', 'Fosb_+_+', 'Fosl2_+_+',
    'Irf2_+_+', 'Junb_+_+', 'Npas4_+_+', 'Smad3_+_+', 'Sox5_+_+',
    'Barx2_+_+', 'Cpeb1_+_+', 'Etv6_+_+', 'Mxi1_+_+', 'Tfdp2_+_+',
]
cps = [
    'Hlf_+_+', 'Jdp2_+_+', 'Tcf12_+_+', 'Cux1_+_+',
    'Cux2_+_+', 'Peg3_+_+', 'Rbpj_+_+', 'Satb1_+_+',
]

# Repressor regulons, same a / b / c grouping.
ans = [
    'Egr1_-_+', 'Etv5_-_+', 'Pou3f1_-_+', 'Pou3f2_-_+',
    'Tcf12_-_+', 'Cux1_-_+', 'Cux2_-_+', 'Hlf_-_+',
    'Lcorl_-_+', 'Mxi1_-_+', 'Pknox2_-_+', 'Satb1_-_+',
]
bns = [
    'Klf7_-_+', 'Pou3f1_-_+', 'Pou3f2_-_+',
    'Sox11_-_+', 'Zfp57_-_+', 'Zbtb18_-_+',
]
cns = [
    'Glis3_-_+', 'Meis2_-_+', 'Nfib_-_+', 'Pbx1_-_+', 'Sox11_-_+',
    'Sox5_-_+', 'Zfp57_-_+', 'Ikzf2_-_+', 'Lhx2_-_+', 'Zbtb18_-_+',
]

# For activators, then repressors: total entries followed by the number of
# distinct regulons (the difference is the overlap between groups).
for groups in ((aps, bps, cps), (ans, bns, cns)):
    merged = [reg for grp in groups for reg in grp]
    print(len(merged))
    print(np.unique(merged).size)

In [None]:
# Full regulon names (sign suffix included) to annotate.
highlights_text = ['Fos_+_+', 'Fos_-_+']
cond_text = np.array([reg in highlights_text for reg in all_regulons])

# Membership masks over all_regulons: activator groups a / b / c ...
cond_ap, cond_bp, cond_cp = (
    np.array([reg in grp for reg in all_regulons])
    for grp in (aps, bps, cps)
)
# ... and repressor groups, taken from their own curated lists.
cond_an, cond_bn, cond_cn = (
    np.array([reg in grp for reg in all_regulons])
    for grp in (ans, bns, cns)
)

# NOTE(review): cond_abc is not assigned in this cell, so this uses the
# value left by an earlier cell.
_group_colors = np.array(['gray', 'C0', 'C1', 'C2'])
colors = _group_colors[cond_abc.astype(int)]

titles = ['TFs', 'Target Genes', 'Target Regions']
s = 40

# One panel per layer (row of `dels`).  All regulons are drawn in silver
# ('v' for '-+', 'o' for '++'); members of groups a / b / c are outlined
# in C0 / C1 / C2.
fig, axs = plt.subplots(1,3,figsize=(5*3,5))
for i, ax in enumerate(axs):
    _x = dels[i,0]
    _y = dels[i,1]
    _t = all_regulons

    # rank correlation for the title, least-squares line for the overlay
    r, _ = stats.spearmanr(_x, _y)
    slope, intercept = np.polyfit(_x, _y, 1)

    xmin = np.min(_x)
    xmax = np.max(_x)
    # draw the fitted line 5% of the x range beyond both ends
    xbase = np.linspace(xmin-0.05*(xmax-xmin), xmax+0.05*(xmax-xmin),5)
    ybase = slope*xbase + intercept
    # r2_score takes (y_true, y_pred): score the fit against y, not x
    r2 = r2_score(_y, _x*slope+intercept)

    ax.scatter(_x[cond_mp], _y[cond_mp], color='silver', s=s, marker='v', label='-', rasterized=True)
    ax.scatter(_x[cond_pp], _y[cond_pp], color='silver', s=s, marker='o', label='+', rasterized=True)

    ax.scatter(_x[cond_an], _y[cond_an], color='C0', s=s, marker='v', label='a-', facecolor='none', rasterized=True )
    ax.scatter(_x[cond_bn], _y[cond_bn], color='C1', s=s, marker='v', label='b-', facecolor='none', rasterized=True )
    ax.scatter(_x[cond_cn], _y[cond_cn], color='C2', s=s, marker='v', label='c-', facecolor='none', rasterized=True )

    ax.scatter(_x[cond_ap], _y[cond_ap], color='C0', s=s, marker='o', label='a+', facecolor='none', rasterized=True )
    ax.scatter(_x[cond_bp], _y[cond_bp], color='C1', s=s, marker='o', label='b+', facecolor='none', rasterized=True )
    ax.scatter(_x[cond_cp], _y[cond_cp], color='C2', s=s, marker='o', label='c+', facecolor='none', rasterized=True )

    ax.plot(xbase, ybase, '--', color='k', linewidth=2, zorder=1)
    ax.axvline(0, color='gray',  linewidth=1, zorder=0)
    ax.axhline(0, color='gray',  linewidth=1, zorder=0)
    ax.grid(False)
    sns.despine(ax=ax)
    ax.set_xlabel('log2(P21/P10)')
    ax.set_title(f'{titles[i]}\ny={slope:.2f}x+({intercept:.2f}); r={r:.2f}', fontsize=15)

# the y label is shown on the first panel only
axs[0].set_ylabel('log2(DR/NR)')

fig.tight_layout()
plt.show()

In [None]:
# Full regulon names (sign suffix included) to annotate.
highlights_text = [
    'Fos_+_+',
    'Fos_-_+',
]

cond_text = np.array([reg in highlights_text for reg in all_regulons])

# Activator regulons in groups a / b / c.
cond_ap = np.array([reg in aps for reg in all_regulons])
cond_bp = np.array([reg in bps for reg in all_regulons])
cond_cp = np.array([reg in cps for reg in all_regulons])

# Repressor ('_-_+') regulons whose activator counterpart ('_+_+', same TF)
# is in group a / b / c.  Only the 4-character sign suffix is swapped, and
# only for names ending in '_-_+': a blanket replace('-', '+') would also
# match the activator regulons themselves and would alter any '-' inside
# the TF name.
cond_an = np.array([reg.endswith('_-_+') and (reg[:-4] + '_+_+') in aps for reg in all_regulons])
cond_bn = np.array([reg.endswith('_-_+') and (reg[:-4] + '_+_+') in bps for reg in all_regulons])
cond_cn = np.array([reg.endswith('_-_+') and (reg[:-4] + '_+_+') in cps for reg in all_regulons])

# NOTE(review): cond_abc is not assigned in this cell, so this uses the
# value left by an earlier cell.
colors = np.array(['gray', 'C0', 'C1', 'C2'])[cond_abc.astype(int)]

titles = ['TFs', 'Target Genes', 'Target Regions']
s=40

# One panel per layer (row of `dels`).  All regulons are drawn in silver
# ('v' for '-+', 'o' for '++'); the regulons selected by the cond_a* /
# cond_b* / cond_c* masks are outlined in C0 / C1 / C2.
fig, axs = plt.subplots(1,3,figsize=(5*3,5))
for i, ax in enumerate(axs):
    _x = dels[i,0]
    _y = dels[i,1]
    _t = all_regulons

    # rank correlation for the title, least-squares line for the overlay
    r, _ = stats.spearmanr(_x, _y)
    slope, intercept = np.polyfit(_x, _y, 1)

    xmin = np.min(_x)
    xmax = np.max(_x)
    # draw the fitted line 5% of the x range beyond both ends
    xbase = np.linspace(xmin-0.05*(xmax-xmin), xmax+0.05*(xmax-xmin),5)
    ybase = slope*xbase + intercept
    # r2_score takes (y_true, y_pred): score the fit against y, not x
    r2 = r2_score(_y, _x*slope+intercept)

    ax.scatter(_x[cond_mp], _y[cond_mp], color='silver', s=s, marker='v', label='-', rasterized=True)
    ax.scatter(_x[cond_pp], _y[cond_pp], color='silver', s=s, marker='o', label='+', rasterized=True)

    ax.scatter(_x[cond_an], _y[cond_an], color='C0', s=s, marker='v', label='a-', facecolor='none', rasterized=True )
    ax.scatter(_x[cond_bn], _y[cond_bn], color='C1', s=s, marker='v', label='b-', facecolor='none', rasterized=True )
    ax.scatter(_x[cond_cn], _y[cond_cn], color='C2', s=s, marker='v', label='c-', facecolor='none', rasterized=True )

    ax.scatter(_x[cond_ap], _y[cond_ap], color='C0', s=s, marker='o', label='a+', facecolor='none', rasterized=True )
    ax.scatter(_x[cond_bp], _y[cond_bp], color='C1', s=s, marker='o', label='b+', facecolor='none', rasterized=True )
    ax.scatter(_x[cond_cp], _y[cond_cp], color='C2', s=s, marker='o', label='c+', facecolor='none', rasterized=True )

    ax.plot(xbase, ybase, '--', color='k', linewidth=2, zorder=1)
    ax.axvline(0, color='gray',  linewidth=1, zorder=0)
    ax.axhline(0, color='gray',  linewidth=1, zorder=0)
    ax.grid(False)
    sns.despine(ax=ax)
    ax.set_xlabel('log2(P21/P10)')
    ax.set_title(f'{titles[i]}\ny={slope:.2f}x+({intercept:.2f}); r={r:.2f}', fontsize=15)

# the y label is shown on the first panel only
axs[0].set_ylabel('log2(DR/NR)')

fig.tight_layout()
plt.show()

In [None]:
# TF names (no sign suffix) to mark in black and annotate.
highlights_text = [
    'Npas4', 'Smad3', 'Fos', 'Fosb', 'Fosl2', 'Junb',
    'Meis2', 'Tcf12',
]

# A regulon matches when its name minus the last 4 characters is in the
# list; the match is then split by regulon sign.
cond_text = np.array([reg[:-4] in highlights_text for reg in all_regulons])
cond_textp = np.logical_and(cond_text, cond_pp)
cond_textn = np.logical_and(cond_text, cond_mp)

# NOTE(review): cond_abc is not assigned in this cell, so this uses the
# value left by an earlier cell.
_group_colors = np.array(['gray', 'C0', 'C1', 'C2'])
colors = _group_colors[cond_abc.astype(int)]

titles = ['TFs', 'Target Genes', 'Target Regions']
s = 40

# One panel per layer (row of `dels`).  All regulons are drawn in silver
# ('v' for '-+', 'o' for '++'); the highlighted TFs are drawn in black and
# labelled at their own position.
fig, axs = plt.subplots(1,3,figsize=(5*3,5))
for i, ax in enumerate(axs):
    _x = dels[i,0]
    _y = dels[i,1]
    _t = all_regulons

    # rank correlation for the title, least-squares line for the overlay
    r, _ = stats.spearmanr(_x, _y)
    slope, intercept = np.polyfit(_x, _y, 1)

    xmin = np.min(_x)
    xmax = np.max(_x)
    # draw the fitted line 5% of the x range beyond both ends
    xbase = np.linspace(xmin-0.05*(xmax-xmin), xmax+0.05*(xmax-xmin),5)
    ybase = slope*xbase + intercept
    # r2_score takes (y_true, y_pred): score the fit against y, not x
    r2 = r2_score(_y, _x*slope+intercept)

    ax.scatter(_x[cond_mp], _y[cond_mp], color='silver', s=s, marker='v', label='-', rasterized=True)
    ax.scatter(_x[cond_pp], _y[cond_pp], color='silver', s=s, marker='o', label='+', rasterized=True)

    ax.scatter(_x[cond_textn], _y[cond_textn], color='k', s=s, marker='v', label='a-', rasterized=True )
    ax.scatter(_x[cond_textp], _y[cond_textp], color='k', s=s, marker='o', label='a+', rasterized=True )

    # label text is the regulon name minus its last 4 characters
    for __x, __y, __t in zip(
        _x[cond_text],
        _y[cond_text],
        _t[cond_text],
    ):
        ax.text(__x, __y, __t[:-4], fontsize=10)

    ax.plot(xbase, ybase, '--', color='k', linewidth=2, zorder=1)
    ax.axvline(0, color='gray',  linewidth=1, zorder=0)
    ax.axhline(0, color='gray',  linewidth=1, zorder=0)
    ax.grid(False)
    sns.despine(ax=ax)
    ax.set_xlabel('log2(P21/P10)')
    ax.set_title(f'{titles[i]}\ny={slope:.2f}x+({intercept:.2f}); r={r:.2f}', fontsize=15)

# the y label is shown on the first panel only
axs[0].set_ylabel('log2(DR/NR)')

fig.tight_layout()
plt.show()

In [None]:

# Build the Meis2 '+_+' regulon object and show its reg_genes attribute.
a = Regulon('Meis2', 'Meis2_+_+')
a.reg_genes

In [None]:
# List the attributes and methods available on the Regulon instance.
dir(a)