In [None]:
import scipy.io
import numpy as np
import pandas as pd
import scanpy as sc
import h5py

import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import collections
from natsort import natsorted

from scipy import stats
from scipy import sparse
from sklearn.decomposition import PCA
from umap import UMAP

from matplotlib.colors import LinearSegmentedColormap
from statsmodels.stats.multitest import multipletests

from scroutines.config_plots import *
from scroutines import powerplots # .config_plots import *
from scroutines import pnmf
from scroutines import basicu
from scroutines.gene_modules import GeneModules  

import atac_utils

In [None]:
# Directory that receives every figure saved by this notebook.
outfigdir = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/figures/250415'
# Create it (and any missing parents) with the standard library instead of a
# shell escape, so the cell is plain Python and does not depend on `mkdir`.
os.makedirs(outfigdir, exist_ok=True)

In [None]:
# Table of per-cell archetype scores; the first CSV column becomes the index.
scores_abc_path = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/scores_l23abc.csv"
scores_abc = pd.read_csv(scores_abc_path, index_col=0)

# Contrast column: C score minus A score.
scores_abc['scores_c-a'] = scores_abc['scores_c'].sub(scores_abc['scores_a'])
scores_abc

In [None]:
%%time
# Load the RNA AnnData object. `anndata.read` is a deprecated alias of
# `anndata.read_h5ad`; call the supported reader directly.
adata = anndata.read_h5ad("/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_l23.h5ad")
adata

In [None]:
# Overwrite the working matrix with the matrix stored in `adata.raw`.
# NOTE(review): assumes adata.raw has the same variables as adata — confirm.
adata.X = adata.raw.X

In [None]:
# Copy each score column onto the cells, aligned on the obs index.
for score_col in ('scores_a', 'scores_b', 'scores_c', 'scores_c-a'):
    adata.obs[score_col] = scores_abc.loc[adata.obs.index, score_col].copy()


def _strip_nr(label):
    """Remove every 'NR' substring from a condition label (e.g. P6NR -> P6)."""
    return label.replace('NR', '')


# fix cond incompatibility # P6NR vs P6
adata.obs['cond'] = adata.obs['cond'].apply(_strip_nr)

In [None]:
# Per-cell sample label, and a time label obtained by dropping the last
# character of the sample name and then any 'DR' substring.
sample_labels = adata.obs['Sample'].values
time_labels = [label[:-1].replace('DR', '') for label in sample_labels]

adata.obs['time']   = time_labels
adata.obs['sample'] = sample_labels

# Naturally sorted unique samples, partitioned by whether the name contains 'DR'.
uniq_samples = natsorted(np.unique(sample_labels))
nr_samples, dr_samples = [], []
for samp in uniq_samples:
    (dr_samples if "DR" in samp else nr_samples).append(samp)

cond_values = adata.obs['cond'].values
uniq_conds = np.array(natsorted(np.unique(cond_values)))
print(uniq_conds)

In [None]:
# Mapping from condition code to condition name, as provided by atac_utils.
# Later cells use the codes to index `adatas_pk` and rows of the result tensors.
condcode2cond = atac_utils.CONDCODE_TO_COND
condcode2cond

In [None]:
# Inverse lookup: condition name -> condition code.
cond2condcode = dict(zip(condcode2cond.values(), condcode2cond.keys()))
cond2condcode

In [None]:
# Condition names in the iteration order of condcode2cond, and their count.
sample_conditions = [cond for cond in condcode2cond.values()]
n_cond = len(sample_conditions)
sample_conditions

In [None]:
%%time
# Read one peak-matrix AnnData per condition; list position follows the
# iteration order of condcode2cond.
adatas_pk = []
for cond_code, cond_name in condcode2cond.items():
    adata_pk = sc.read(
        f'/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/pmat_l23concensus_{cond_name}.h5ad'
    )
    print(adata_pk)
    adatas_pk.append(adata_pk)

# SHUFFLE

In [None]:
%%time

# Shuffled-label control for the archetype (A / B / C) differential
# accessibility test. For each condition the archetype masks are computed from
# the RNA scores as usual, but the cell names they are applied to are permuted
# first, so the three groups contain randomly chosen cells.

num_archetypal_cells = 300

offset = 1
SHUFFLE = True
SHUFFLE_SEED = 0  # fixed seed so the shuffled control gives the same result on every run
rng = np.random.default_rng(SHUFFLE_SEED)

l2fc_th = np.log2(2)
l2fc_th_s = np.log2(1.2)
alpha_th = 0.05

n_cond = len(sample_conditions)
n_peak = adatas_pk[0].shape[1]
qs_tensor   = np.zeros((n_cond,3,n_peak))  # 3 represents 3 pairwise comparisons (ca, ba, bc)
l2fc_tensor = np.zeros((n_cond,3,n_peak))

print(qs_tensor.shape, l2fc_tensor.shape)

for cond_code, cond_name in condcode2cond.items():
    # RNA cells of this condition
    adatasub = adata[adata.obs['cond']==cond_name]
    cells_rna = adatasub.obs.index.values

    # rank cells along the C-minus-A contrast and along the B score
    ranks_ac = adatasub.obs['scores_c-a'].rank()
    ranks_b  = adatasub.obs['scores_b'].rank()

    # lowest-ranked cells -> A candidates; highest-ranked -> C (contrast) / B (B score)
    precond_a = ranks_ac <= num_archetypal_cells
    precond_c = ranks_ac > adatasub.shape[0] - num_archetypal_cells
    precond_b = ranks_b  > adatasub.shape[0] - num_archetypal_cells

    # keep only cells that are a candidate for exactly one archetype
    cond_a = np.all([ precond_a, ~precond_b, ~precond_c], axis=0)
    cond_b = np.all([~precond_a,  precond_b, ~precond_c], axis=0)
    cond_c = np.all([~precond_a, ~precond_b,  precond_c], axis=0)

    # ATAC peak matrix of this condition
    adata_pk = adatas_pk[cond_code]
    cells_atac = adata_pk.obs.index.values

    # SHUFFLE: permute the cell names so the masks above select random cells
    if SHUFFLE:
        cells_rna = rng.permutation(cells_rna)

    # cells of each group that also have ATAC data
    cells_a = np.intersect1d(cells_rna[cond_a], cells_atac)
    cells_b = np.intersect1d(cells_rna[cond_b], cells_atac)
    cells_c = np.intersect1d(cells_rna[cond_c], cells_atac)

    # dense cell-by-peak count matrices
    mat_a = np.array(adata_pk[cells_a].X.todense())
    mat_b = np.array(adata_pk[cells_b].X.todense())
    mat_c = np.array(adata_pk[cells_c].X.todense())

    # size norm (CPM) between cells
    mat_a = mat_a/np.sum(mat_a, axis=1).reshape(-1,1)*1e6
    mat_b = mat_b/np.sum(mat_b, axis=1).reshape(-1,1)*1e6
    mat_c = mat_c/np.sum(mat_c, axis=1).reshape(-1,1)*1e6

    logmat_a = np.log2(mat_a+offset)
    logmat_b = np.log2(mat_b+offset)
    logmat_c = np.log2(mat_c+offset)

    # log2 fold change of the group means (offset added before the log)
    l2fc_ca = np.log2((np.mean(mat_c, axis=0)+offset)) - np.log2(np.mean(mat_a, axis=0)+offset)
    l2fc_ba = np.log2((np.mean(mat_b, axis=0)+offset)) - np.log2(np.mean(mat_a, axis=0)+offset)
    l2fc_bc = np.log2((np.mean(mat_b, axis=0)+offset)) - np.log2(np.mean(mat_c, axis=0)+offset)

    # per-peak t-tests on the log-transformed values
    ts, ps_ca = stats.ttest_ind(logmat_c, logmat_a)
    ts, ps_ba = stats.ttest_ind(logmat_b, logmat_a)
    ts, ps_bc = stats.ttest_ind(logmat_b, logmat_c)

    # Benjamini-Hochberg correction; NaN p-values are treated as 1
    rs, qs_ca, _, _ = multipletests(np.nan_to_num(ps_ca, nan=1).reshape(-1,), method='fdr_bh')
    rs, qs_ba, _, _ = multipletests(np.nan_to_num(ps_ba, nan=1).reshape(-1,), method='fdr_bh')
    rs, qs_bc, _, _ = multipletests(np.nan_to_num(ps_bc, nan=1).reshape(-1,), method='fdr_bh')

    # per-archetype q-value: the better of its two comparisons
    qs_a   = np.minimum(qs_ca, qs_ba)
    qs_c   = np.minimum(qs_ca, qs_bc)
    qs_b   = np.minimum(qs_ba, qs_bc)

    # larger of the two log2 fold changes in favour of each archetype
    l2fc_a = np.max([-l2fc_ca, -l2fc_ba], axis=0)
    l2fc_c = np.max([ l2fc_ca, -l2fc_bc], axis=0)
    l2fc_b = np.max([ l2fc_ba,  l2fc_bc], axis=0)

    # smaller of the two log2 fold changes in favour of each archetype
    l2fc_as = np.min([-l2fc_ca, -l2fc_ba], axis=0)
    l2fc_cs = np.min([ l2fc_ca, -l2fc_bc], axis=0)
    l2fc_bs = np.min([ l2fc_ba,  l2fc_bc], axis=0)

    # a peak is called for an archetype when both fold changes favour it, the
    # larger one exceeds l2fc_th and the better q-value is below alpha_th;
    # B additionally requires the smaller fold change to exceed l2fc_th_s
    cond_sig_a = np.all([-l2fc_ca > 0, -l2fc_ba > 0, l2fc_a > l2fc_th, qs_a < alpha_th], axis=0)
    cond_sig_c = np.all([ l2fc_ca > 0, -l2fc_bc > 0, l2fc_c > l2fc_th, qs_c < alpha_th], axis=0)
    cond_sig_b = np.all([ l2fc_ba > 0,  l2fc_bc > 0, l2fc_b > l2fc_th, qs_b < alpha_th, l2fc_bs > l2fc_th_s], axis=0)

    print(cond_code, cond_name, len(mat_a), len(mat_b), len(mat_c),
          cond_sig_a.sum(), cond_sig_c.sum(), cond_sig_b.sum())

    # store the raw pairwise statistics; row = cond_code
    l2fc_tensor[cond_code, 0] = l2fc_ca
    l2fc_tensor[cond_code, 1] = l2fc_ba
    l2fc_tensor[cond_code, 2] = l2fc_bc

    qs_tensor[cond_code, 0] = qs_ca
    qs_tensor[cond_code, 1] = qs_ba
    qs_tensor[cond_code, 2] = qs_bc


In [None]:
# Thresholds used by the significance calls in the following cells.
l2fc_th = np.log2(2)      # compared against the larger of the two log2 fold changes
l2fc_th_s = np.log2(1.2)  # compared against the smaller one (applied to B calls only)
alpha_th = 0.05           # cutoff on the BH-adjusted q-values

In [None]:
# Split the (condition, comparison, peak) tensors into one (condition, peak)
# array per pairwise comparison; comparison order is ca, ba, bc.
qs_ca, qs_ba, qs_bc = (qs_tensor[:, k, :] for k in range(3))
l2fc_ca, l2fc_ba, l2fc_bc = (l2fc_tensor[:, k, :] for k in range(3))

In [None]:
# Collapse the three pairwise comparisons (ca, ba, bc) into per-archetype calls.
# Inputs are the per-comparison arrays sliced from qs_tensor / l2fc_tensor.
qs_a   = np.minimum(qs_ca, qs_ba) # the better of the two
qs_c   = np.minimum(qs_ca, qs_bc) # the better of the two
qs_b   = np.minimum(qs_ba, qs_bc) # the better of the two

# larger of the two log2 fold changes in favour of each archetype
l2fc_a = np.maximum(-l2fc_ca, -l2fc_ba)
l2fc_c = np.maximum( l2fc_ca, -l2fc_bc)
l2fc_b = np.maximum( l2fc_ba,  l2fc_bc)

# smaller of the two log2 fold changes in favour of each archetype
l2fc_as = np.minimum(-l2fc_ca, -l2fc_ba)
l2fc_cs = np.minimum( l2fc_ca, -l2fc_bc)
l2fc_bs = np.minimum( l2fc_ba,  l2fc_bc)

# Both fold changes must favour the archetype, the larger one must exceed
# l2fc_th and the better q-value must be below alpha_th. B additionally needs
# the smaller fold change above l2fc_th_s.
cond_sig_a = np.all([-l2fc_ca > 0, -l2fc_ba > 0, l2fc_a > l2fc_th, qs_a < alpha_th], axis=0)
cond_sig_c = np.all([ l2fc_ca > 0, -l2fc_bc > 0, l2fc_c > l2fc_th, qs_c < alpha_th], axis=0)
# cond_sig_a = np.all([-l2fc_ca > 0, -l2fc_ba > 0, l2fc_a > l2fc_th, qs_a < alpha_th, l2fc_as > l2fc_th_s], axis=0)
# cond_sig_c = np.all([ l2fc_ca > 0, -l2fc_bc > 0, l2fc_c > l2fc_th, qs_c < alpha_th, l2fc_cs > l2fc_th_s], axis=0)
cond_sig_b = np.all([ l2fc_ba > 0,  l2fc_bc > 0, l2fc_b > l2fc_th, qs_b < alpha_th, l2fc_bs > l2fc_th_s], axis=0)

# counts_x[k] = number of peaks called in exactly k conditions. A fixed-length
# bincount keeps entry k at position k even when some k never occurs, so
# dropping element 0 always removes the "called in no condition" bin.
n_cond_tested = cond_sig_a.shape[0]
counts_a = np.bincount(cond_sig_a.sum(axis=0), minlength=n_cond_tested+1)
counts_c = np.bincount(cond_sig_c.sum(axis=0), minlength=n_cond_tested+1)
counts_b = np.bincount(cond_sig_b.sum(axis=0), minlength=n_cond_tested+1)

print('num A genes for each cond:\t', cond_sig_a.sum(axis=1), np.any(cond_sig_a, axis=0).sum())
print('num C genes for each cond:\t', cond_sig_c.sum(axis=1), np.any(cond_sig_c, axis=0).sum())
print('num B genes for each cond:\t', cond_sig_b.sum(axis=1), np.any(cond_sig_b, axis=0).sum())

print('num A genes in num conds:\t',  counts_a[1:])
print('num C genes in num conds:\t',  counts_c[1:])
print('num B genes in num conds:\t',  counts_b[1:])

# REAL

In [None]:
%%time

# Archetype (A / B / C) differential accessibility test on the real labels.
# For every condition: pick archetypal cells from the RNA scores, pull their
# ATAC peak counts, CPM-normalise, and run three pairwise comparisons
# (C vs A, B vs A, B vs C). Results are written into qs_tensor / l2fc_tensor.

num_archetypal_cells = 300

offset = 1  # added before every log2; also read by later cells
SHUFFLE = False

l2fc_th = np.log2(2)
l2fc_th_s = np.log2(1.2)
alpha_th = 0.05

n_cond = len(sample_conditions)
n_peak = adatas_pk[0].shape[1]
qs_tensor   = np.zeros((n_cond,3,n_peak))  # 3 represents 3 pairwise comparisons (ca, ba, bc)
l2fc_tensor = np.zeros((n_cond,3,n_peak))

print(qs_tensor.shape, l2fc_tensor.shape)

for cond_code, cond_name in condcode2cond.items():
    # get sub
    adatasub = adata[adata.obs['cond']==cond_name]
    cells_rna = adatasub.obs.index.values
    
    # get A vs C 
    # ranks along the C-minus-A contrast and along the B score
    ranks_ac = adatasub.obs['scores_c-a'].rank()
    ranks_b  = adatasub.obs['scores_b'].rank()
    
    # lowest ranks -> A candidates; highest ranks -> C (contrast) / B (B score)
    precond_a = ranks_ac <= num_archetypal_cells
    precond_c = ranks_ac > adatasub.shape[0] - num_archetypal_cells
    precond_b = ranks_b  > adatasub.shape[0] - num_archetypal_cells
    
    # keep only cells that are a candidate for exactly one archetype
    cond_a = np.all([ precond_a, ~precond_b, ~precond_c], axis=0)
    cond_b = np.all([~precond_a,  precond_b, ~precond_c], axis=0)
    cond_c = np.all([~precond_a, ~precond_b,  precond_c], axis=0)
    
    # get ATAC
    # NOTE(review): indexes the list by cond_code, which assumes the codes are
    # the list positions 0..n_cond-1 — confirm against CONDCODE_TO_COND.
    adata_pk = adatas_pk[cond_code]
    cells_atac = adata_pk.obs.index.values
        
    # SHUFFLE
    # only taken when SHUFFLE is True: permutes the cell names so the masks
    # above select random cells
    if SHUFFLE:
        cells_rna = cells_rna[np.random.choice(len(cells_rna), size=len(cells_rna), replace=False)]
        
    # get A, B, C 
    # restricted to cells that also have ATAC data
    cells_a = np.intersect1d(cells_rna[cond_a], cells_atac)
    cells_b = np.intersect1d(cells_rna[cond_b], cells_atac)
    cells_c = np.intersect1d(cells_rna[cond_c], cells_atac)
        
    # peak size (500bp)
    # dense cell-by-peak matrices
    mat_a = np.array(adata_pk[cells_a].X.todense()) 
    mat_b = np.array(adata_pk[cells_b].X.todense()) 
    mat_c = np.array(adata_pk[cells_c].X.todense()) 
    
    # size norm (CPM) between cells
    mat_a = mat_a/np.sum(mat_a, axis=1).reshape(-1,1)*1e6
    mat_b = mat_b/np.sum(mat_b, axis=1).reshape(-1,1)*1e6
    mat_c = mat_c/np.sum(mat_c, axis=1).reshape(-1,1)*1e6
    
    logmat_a = np.log2(mat_a+offset)
    logmat_b = np.log2(mat_b+offset)
    logmat_c = np.log2(mat_c+offset)
    
    # differential peaks: log2 fold change of the group means (offset added before the log)
    l2fc_ca = np.log2((np.mean(mat_c, axis=0)+offset)) - np.log2(np.mean(mat_a, axis=0)+offset)
    l2fc_ba = np.log2((np.mean(mat_b, axis=0)+offset)) - np.log2(np.mean(mat_a, axis=0)+offset)
    l2fc_bc = np.log2((np.mean(mat_b, axis=0)+offset)) - np.log2(np.mean(mat_c, axis=0)+offset)
    
    # per-peak t-tests on the log-transformed values
    ts, ps_ca = stats.ttest_ind(logmat_c, logmat_a)
    ts, ps_ba = stats.ttest_ind(logmat_b, logmat_a)
    ts, ps_bc = stats.ttest_ind(logmat_b, logmat_c)
    
    # Benjamini-Hochberg correction; NaN p-values are treated as 1
    rs, qs_ca, _, _ = multipletests(np.nan_to_num(ps_ca, nan=1).reshape(-1,), method='fdr_bh')
    rs, qs_ba, _, _ = multipletests(np.nan_to_num(ps_ba, nan=1).reshape(-1,), method='fdr_bh')
    rs, qs_bc, _, _ = multipletests(np.nan_to_num(ps_bc, nan=1).reshape(-1,), method='fdr_bh')
    
    # sig_cond
    qs_a   = np.minimum(qs_ca, qs_ba) # the better of the two
    qs_c   = np.minimum(qs_ca, qs_bc) # the better of the two
    qs_b   = np.minimum(qs_ba, qs_bc) # the better of the two

    l2fc_a = np.max([-l2fc_ca, -l2fc_ba], axis=0) # larger of the two fold changes
    l2fc_c = np.max([ l2fc_ca, -l2fc_bc], axis=0) # larger of the two fold changes
    l2fc_b = np.max([ l2fc_ba,  l2fc_bc], axis=0) # larger of the two fold changes
    
    l2fc_as = np.min([-l2fc_ca, -l2fc_ba], axis=0) # smaller of the two fold changes
    l2fc_cs = np.min([ l2fc_ca, -l2fc_bc], axis=0) # smaller of the two fold changes
    l2fc_bs = np.min([ l2fc_ba,  l2fc_bc], axis=0) # smaller of the two fold changes

    # these per-condition calls feed only the progress print below; the
    # thresholds are re-applied to the saved tensors in a later cell
    cond_sig_a = np.all([-l2fc_ca > 0, -l2fc_ba > 0, l2fc_a > l2fc_th, qs_a < alpha_th], axis=0)
    cond_sig_c = np.all([ l2fc_ca > 0, -l2fc_bc > 0, l2fc_c > l2fc_th, qs_c < alpha_th], axis=0)
    cond_sig_b = np.all([ l2fc_ba > 0,  l2fc_bc > 0, l2fc_b > l2fc_th, qs_b < alpha_th, l2fc_bs > l2fc_th_s], axis=0)
    
    print(cond_code, cond_name, len(mat_a), len(mat_b), len(mat_c), 
          cond_sig_a.sum(), cond_sig_c.sum(), cond_sig_b.sum())
    
    # save this
    l2fc_tensor[cond_code, 0] = l2fc_ca
    l2fc_tensor[cond_code, 1] = l2fc_ba
    l2fc_tensor[cond_code, 2] = l2fc_bc
    
    qs_tensor[cond_code, 0] = qs_ca
    qs_tensor[cond_code, 1] = qs_ba
    qs_tensor[cond_code, 2] = qs_bc
    

In [None]:
# %%time
# fout1 = '/u/home/f/f7xiesnm/v1_multiome/DAR_l23abc_qs_250411.npy'
# fout2 = '/u/home/f/f7xiesnm/v1_multiome/DAR_l23abc_l2fc_250411.npy'
# fout3 = '/u/home/f/f7xiesnm/v1_multiome/DAR_l23abc_cpm_tensor_250411.npy'

# np.save(fout1, qs_tensor)
# np.save(fout2, l2fc_tensor)

# check results and stats

In [None]:
# Thresholds used by the significance calls in the following cells.
l2fc_th = np.log2(2)      # compared against the larger of the two log2 fold changes
l2fc_th_s = np.log2(1.2)  # compared against the smaller one (applied to B calls only)
alpha_th = 0.05           # cutoff on the BH-adjusted q-values

In [None]:
# Split the (condition, comparison, peak) tensors into one (condition, peak)
# array per pairwise comparison; comparison order is ca, ba, bc.
qs_ca, qs_ba, qs_bc = (qs_tensor[:, k, :] for k in range(3))
l2fc_ca, l2fc_ba, l2fc_bc = (l2fc_tensor[:, k, :] for k in range(3))

In [None]:
# Collapse the three pairwise comparisons (ca, ba, bc) into per-archetype calls.
# Inputs are the per-comparison arrays sliced from qs_tensor / l2fc_tensor.
qs_a   = np.minimum(qs_ca, qs_ba) # the better of the two
qs_c   = np.minimum(qs_ca, qs_bc) # the better of the two
qs_b   = np.minimum(qs_ba, qs_bc) # the better of the two

# larger of the two log2 fold changes in favour of each archetype
l2fc_a = np.maximum(-l2fc_ca, -l2fc_ba)
l2fc_c = np.maximum( l2fc_ca, -l2fc_bc)
l2fc_b = np.maximum( l2fc_ba,  l2fc_bc)

# smaller of the two log2 fold changes in favour of each archetype
l2fc_as = np.minimum(-l2fc_ca, -l2fc_ba)
l2fc_cs = np.minimum( l2fc_ca, -l2fc_bc)
l2fc_bs = np.minimum( l2fc_ba,  l2fc_bc)

# Both fold changes must favour the archetype, the larger one must exceed
# l2fc_th and the better q-value must be below alpha_th. B additionally needs
# the smaller fold change above l2fc_th_s.
cond_sig_a = np.all([-l2fc_ca > 0, -l2fc_ba > 0, l2fc_a > l2fc_th, qs_a < alpha_th], axis=0)
cond_sig_c = np.all([ l2fc_ca > 0, -l2fc_bc > 0, l2fc_c > l2fc_th, qs_c < alpha_th], axis=0)
# cond_sig_a = np.all([-l2fc_ca > 0, -l2fc_ba > 0, l2fc_a > l2fc_th, qs_a < alpha_th, l2fc_as > l2fc_th_s], axis=0)
# cond_sig_c = np.all([ l2fc_ca > 0, -l2fc_bc > 0, l2fc_c > l2fc_th, qs_c < alpha_th, l2fc_cs > l2fc_th_s], axis=0)
cond_sig_b = np.all([ l2fc_ba > 0,  l2fc_bc > 0, l2fc_b > l2fc_th, qs_b < alpha_th, l2fc_bs > l2fc_th_s], axis=0)

# counts_x[k] = number of peaks called in exactly k conditions. A fixed-length
# bincount keeps entry k at position k even when some k never occurs, so
# dropping element 0 always removes the "called in no condition" bin.
n_cond_tested = cond_sig_a.shape[0]
counts_a = np.bincount(cond_sig_a.sum(axis=0), minlength=n_cond_tested+1)
counts_c = np.bincount(cond_sig_c.sum(axis=0), minlength=n_cond_tested+1)
counts_b = np.bincount(cond_sig_b.sum(axis=0), minlength=n_cond_tested+1)

print('num A genes for each cond:\t', cond_sig_a.sum(axis=1), np.any(cond_sig_a, axis=0).sum())
print('num C genes for each cond:\t', cond_sig_c.sum(axis=1), np.any(cond_sig_c, axis=0).sum())
print('num B genes for each cond:\t', cond_sig_b.sum(axis=1), np.any(cond_sig_b, axis=0).sum())

print('num A genes in num conds:\t',  counts_a[1:])
print('num C genes in num conds:\t',  counts_c[1:])
print('num B genes in num conds:\t',  counts_b[1:])

In [None]:
# Sorted peak names per archetype: called in any condition / in all conditions.
# Peak names are read from adatas_pk[0] (the same object n_peak is taken from)
# rather than from whichever `adata_pk` the last executed loop left behind.
peak_names = adatas_pk[0].var.index.values

any_a = np.any(cond_sig_a, axis=0)
any_b = np.any(cond_sig_b, axis=0)
any_c = np.any(cond_sig_c, axis=0)

a_any = np.sort(peak_names[any_a])
a_all = np.sort(peak_names[np.all(cond_sig_a, axis=0)])

c_any = np.sort(peak_names[any_c])
c_all = np.sort(peak_names[np.all(cond_sig_c, axis=0)])
# peaks called for A in some condition and for C in some condition
ac_overlap = np.sort(peak_names[np.logical_and(any_a, any_c)])

b_any = np.sort(peak_names[any_b])
b_all = np.sort(peak_names[np.all(cond_sig_b, axis=0)])

# union over the three archetypes
abc_any = np.sort(peak_names[any_a | any_b | any_c])

print('a any', a_any.shape)
print('a all', a_all.shape)

print('c any', c_any.shape)
print('c all', c_all.shape)
print('ac overlap', ac_overlap.shape)

print('b any', b_any.shape)
print('b all', b_all.shape)

print('abc any', abc_any.shape)

In [None]:
# Condition names as an array so they can be indexed by arrays of condition codes.
sample_conditions = np.array([*condcode2cond.values()])
n_cond = len(sample_conditions)

# Hard-coded code groups for the NR and DR series, with matching time points.
nr_condcodes  = np.array([0,1,2,3,5,7,9])
dr_condcodes  = np.array([4,6,8,10])

nr_times = [6,8,10,12,14,17,21]
dr_times =        [12,14,17,21]
times = nr_times

nr_conditions = sample_conditions[nr_condcodes]
dr_conditions = sample_conditions[dr_condcodes]

print(sample_conditions)
print(nr_condcodes, nr_conditions)
print(dr_condcodes, dr_conditions)

In [None]:
# One panel per archetype: number of called regions per NR condition over time.
fig, axs = plt.subplots(1,3, figsize=(3*3,1*4), sharex=True, sharey=True)

num_uniq_a = np.any(cond_sig_a, axis=0).sum()
num_uniq_b = np.any(cond_sig_b, axis=0).sum()
num_uniq_c = np.any(cond_sig_c, axis=0).sum()

panels = [
    ('A', cond_sig_a, 'C0', num_uniq_a),
    ('B', cond_sig_b, 'C1', num_uniq_b),
    ('C', cond_sig_c, 'C2', num_uniq_c),
]
for ax, (arch, cond_sig, color, n_uniq) in zip(axs, panels):
    ax.plot(times, cond_sig.sum(axis=1)[nr_condcodes],
            '-o', color=color, fillstyle='none', label=f'{arch} DARs (unique n = {n_uniq:,})')
    ax.grid(False, axis='x')
    sns.despine(ax=ax)
    ax.set_title(f'{arch} regions')

# the y label goes on the first panel; ticks set on the last panel apply to all (sharex)
axs[0].set_ylabel('number of DARs')
axs[2].set_xticks([6,10,14,17,21])

fig.tight_layout()
output = os.path.join(outfigdir, 'abc_dars_over_time_v0.pdf') 
powerplots.savefig_autodate(fig, output)
plt.show()

In [None]:
# Single-panel summary: number of A, C and B regions per NR condition over time,
# with a zero reference line standing for the shuffled-label control.
fig, ax = plt.subplots(1,1, figsize=(1*4,1*4)) # , sharex=True)

num_uniq_a = np.sum(np.any(cond_sig_a, axis=0))
num_uniq_b = np.sum(np.any(cond_sig_b, axis=0))
num_uniq_c = np.sum(np.any(cond_sig_c, axis=0))

ax.plot(times, cond_sig_a.sum(axis=1)[nr_condcodes], 
        '-o', color='C0', fillstyle='none', label=f'A DARs (unique n = {num_uniq_a:,})') 
ax.plot(times, cond_sig_c.sum(axis=1)[nr_condcodes], 
        '-o', color='C2', fillstyle='none', label=f'C DARs (unique n = {num_uniq_c:,})') 
ax.plot(times, cond_sig_b.sum(axis=1)[nr_condcodes],
        '-o', color='C1', fillstyle='none', label=f'B DARs (unique n = {num_uniq_b:,})')

# Draw the control line once: three identical copies only produced three
# duplicate legend entries on top of each other.
# NOTE(review): the zeros are hard-coded, not read from the shuffled run —
# confirm they match the output of the shuffle cells.
ax.plot(times, [0]*len(times),'--o', color='gray', fillstyle='none', label='random DARs (n=0)', zorder=0)

ax.set_ylabel('number of DARs')
# ax.set_ylim(ymin=-500, ymax=12500)
ax.set_xticks(times)
ax.set_xticklabels(times)
ax.grid(False)
sns.despine(ax=ax)
ax.legend(bbox_to_anchor=(1,1))


fig.tight_layout()
output = os.path.join(outfigdir, 'abc_dars_over_time.pdf') 
powerplots.savefig_autodate(fig, output)

plt.show()

# get the full matrix (CPM - 5 fold) 

In [None]:
# %%time
# n_type = 5

# atac_tensor = np.zeros((n_cond, n_type, n_peak))
# for i, cond in enumerate(sample_conditions):
#     # get sub
#     adatasub = adata[adata.obs['cond']==cond]
#     cells_rna = adatasub.obs.index.values
#     # x = adatasub.obsm['pca_p17on'][...,0]
#     x = adatasub.obs['scores_c-a'].values
#     cells_rna_type = pd.qcut(x, n_type, labels=False)
    
#     # get ATAC
#     adata_pk = adatas_pk[i]
#     cells_atac = adata_pk.obs.index.values
    
#     # per type
#     for j in range(n_type):
#         cells_j = np.intersect1d(cells_rna[cells_rna_type==j], cells_atac)
#         atac_tensor[i,j] = np.array(np.sum(adata_pk[cells_j].X, axis=0))
        
# # CPM
# atac_tensor_cov = np.sum(atac_tensor, axis=2)
# atac_tensor = atac_tensor/np.expand_dims(atac_tensor_cov, axis=2)*1e6 # .reshape(n_cond, n_type, -1) # *1e6
# print(atac_tensor.shape)
# np.sum(atac_tensor, axis=2)

In [None]:
# np.save(fout3, atac_tensor)

# check effect size 

In [None]:
# adata.obs['sample'].unique()
import re

# Conditions in plotting/processing order: the four DR conditions first,
# then the seven remaining time points.
todo_conds = [
    'P12DR', 'P14DR', 'P17DR', 'P21DR',
    'P6', 'P8', 'P10', 'P12', 'P14', 'P17', 'P21',
]
# Samples in the same order (condition name plus one trailing replicate letter).
todo_samps = [
    'P12DRa', 'P12DRb', 'P14DRa', 'P14DRb',
    'P17DRa', 'P17DRb', 'P21DRa', 'P21DRb',
    'P6a', 'P6b', 'P6c',
    'P8a', 'P8b', 'P8c',
    'P10a', 'P10b',
    'P12a', 'P12b', 'P12c',
    'P14a', 'P14b',
    'P17a', 'P17b',
    'P21a', 'P21b',
]

# Numeric part of each name: delete every ASCII letter, parse what is left.
_letters = re.compile(r'[a-zA-Z]')
todo_conds_t = np.array([int(_letters.sub('', name)) for name in todo_conds])
todo_samps_t = np.array([int(_letters.sub('', name)) for name in todo_samps])
print(todo_conds_t)
print(todo_samps_t)

def mean_over_samples(mmat_res_samp):
    """Average 25 per-sample slices into 11 per-condition slices.

    The first axis of `mmat_res_samp` must have length 25. Consecutive runs of
    that axis (sizes 2, 2, 2, 2, 3, 3, 2, 3, 2, 2, 2) are averaged, giving a
    float array whose first axis has length 11 and whose remaining axes are
    unchanged.
    """
    assert mmat_res_samp.shape[0] == 25

    # start/stop positions of the consecutive replicate groups on axis 0
    edges = [0, 2, 4, 6, 8, 11, 14, 16, 19, 21, 23, 25]

    mmat_res_samp_mean = np.zeros((11,) + tuple(mmat_res_samp.shape[1:]))
    for k, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
        mmat_res_samp_mean[k] = np.mean(mmat_res_samp[lo:hi], axis=0)

    return mmat_res_samp_mean

def transform_bigredmat(bigmat, n_lead_conds=4, n_types=5):
    """Flatten a (condition, type, feature) array into feature-by-column matrices.

    All leading axes are flattened into columns (condition-major), one row per
    feature. The first ``n_lead_conds * n_types`` columns are then moved to the
    end, so the leading conditions come last in the returned matrices.

    Parameters
    ----------
    bigmat : ndarray
        Array whose last axis indexes features.
    n_lead_conds : int, default 4
        Number of leading conditions whose columns are moved to the end.
    n_types : int, default 5
        Number of columns per condition. The defaults reproduce the original
        hard-coded ``4*5`` split; pass the real type count if it is not 5.

    Returns
    -------
    fmat : ndarray
        (feature, column) matrix with the reordered columns.
    zmat : ndarray
        Row-wise z-score of `fmat`.
    """
    fmat = bigmat.reshape(-1, bigmat.shape[-1]).T
    split = n_lead_conds * n_types
    fmat = np.hstack([fmat[:,split:], fmat[:,:split]]) # CHANGED COLUMN ORDER!!
    zmat = zscore(fmat, axis=1)
    
    return fmat, zmat

In [None]:
# Per-peak mask for each archetype: called in at least one condition.
cond_sig_a_any, cond_sig_b_any, cond_sig_c_any = (
    np.any(cond_sig, axis=0) for cond_sig in (cond_sig_a, cond_sig_b, cond_sig_c)
)

In [None]:
%%time
# Baseline accessibility: stack the cells of all conditions, CPM-normalise each
# cell, then take log2 of the per-peak mean (plus `offset`).
dense_blocks = []
for cond_code in condcode2cond.keys():
    adata_pk = adatas_pk[cond_code]
    dense_blocks.append(np.array(adata_pk.X.todense()))
mat = np.vstack(dense_blocks)

cell_depth = np.sum(mat, axis=1).reshape(-1,1)
mat = mat/cell_depth*1e6
atac_l23baseline = np.log2(np.mean(mat, axis=0)+offset)

In [None]:
# Display the offset currently in the kernel; it was set in the DAR loop cell
# and is reused above and below for every log2 transform.
offset

In [None]:
%%time

# Per-sample accessibility profiles relative to the all-cell baseline:
#   bigmat_nfd[sample, bin, peak]       - n_type quantile bins of the C-minus-A rank
#   bigmat_abc[sample, archetype, peak] - archetypal A, B, C cells
n_type = 5
frac_archetypal_cells_viz = 0.2
bigmat_nfd = np.zeros((len(todo_samps), n_type, mat.shape[1]))
bigmat_abc = np.zeros((len(todo_samps),      3, mat.shape[1]))


def _log2cpm_vs_baseline(adata_pk, cells):
    """Return log2(mean CPM + offset) over `cells`, minus `atac_l23baseline`."""
    mat_j = np.array(adata_pk[cells].X.todense())
    mat_j = mat_j/np.sum(mat_j, axis=1).reshape(-1,1)*1e6
    return np.log2(np.mean(mat_j, axis=0)+offset) - atac_l23baseline


for i, samp in enumerate(todo_samps):
    # condition name = sample name without its last character
    cond_name = samp[:-1]
    cond_code = cond2condcode[cond_name]
    print(samp, cond_name, cond_code)

    # ATAC peak matrix of the sample's condition
    adata_pk = adatas_pk[cond_code]
    cells_atac = adata_pk.obs.index.values

    # RNA cells of this sample
    adatasub = adata[adata.obs['sample']==samp]
    cells_rna = adatasub.obs.index.values
    n_cells = adatasub.shape[0]

    # ranks along the C-minus-A contrast and along the B score
    ranks_ac = adatasub.obs['scores_c-a'].rank()
    ranks_b  = adatasub.obs['scores_b'].rank()

    # quantile bins of the contrast rank
    cells_type_nfd = pd.qcut(ranks_ac, n_type, labels=False)
    for j in range(n_type):
        in_bin = np.asarray(cells_type_nfd == j)
        cells_j = np.intersect1d(cells_rna[in_bin], cells_atac)
        bigmat_nfd[i,j] = _log2cpm_vs_baseline(adata_pk, cells_j)

    # archetypal cells: a fixed fraction of the sample instead of a fixed count
    num_archetypal_cells_viz = int(n_cells*frac_archetypal_cells_viz)

    precond_a = ranks_ac <= num_archetypal_cells_viz
    precond_c = ranks_ac > adatasub.shape[0] - num_archetypal_cells_viz
    precond_b = ranks_b  > adatasub.shape[0] - num_archetypal_cells_viz

    # keep only cells that are a candidate for exactly one archetype
    cond_a = np.all([ precond_a, ~precond_b, ~precond_c], axis=0)
    cond_b = np.all([~precond_a,  precond_b, ~precond_c], axis=0)
    cond_c = np.all([~precond_a, ~precond_b,  precond_c], axis=0)

    # The original also built cells_a/cells_b/cells_c here and never used them;
    # the intersection is computed once per group below instead.
    for j, cond_j in enumerate([cond_a, cond_b, cond_c]):
        cells_j = np.intersect1d(cells_rna[cond_j], cells_atac)
        bigmat_abc[i,j] = _log2cpm_vs_baseline(adata_pk, cells_j)

In [None]:
print(bigmat_abc.shape) # sample (len(todo_samps)), archetype (A, B, C), peak

In [None]:
# For each region set (A, B, C): mean over its peaks, leaving (sample, archetype).
bigmat_abc_ig_list = [
    np.mean(bigmat_abc[:,:,region_mask], axis=-1)
    for region_mask in (cond_sig_a_any, cond_sig_b_any, cond_sig_c_any)
]

# The same summaries averaged over replicate samples -> (condition, archetype).
redmat_abc_ig_list = list(map(mean_over_samples, bigmat_abc_ig_list))

In [None]:
# Mean signal of each region set (one panel each) in A / B / C archetypal cells
# over time. The first 8 samples / 4 conditions are the DR ones and are skipped.
fig, axs = plt.subplots(1, 3, figsize=(3*3,1*3), sharex=True, sharey=True)
archetype_colors = ['C0', 'C1', 'C2']
for i, ax in enumerate(axs):
    bigmat_mean_ig = bigmat_abc_ig_list[i]
    redmat_mean_ig = redmat_abc_ig_list[i]

    # individual samples as open markers
    for j, color in enumerate(archetype_colors):
        ax.plot(todo_samps_t[8:], bigmat_mean_ig[8:,j], 'o', markersize=5, fillstyle='none', color=color)

    # per-condition means as lines
    for j, color in enumerate(archetype_colors):
        ax.plot(todo_conds_t[4:], redmat_mean_ig[4:,j], '-', color=color)

    ax.grid(False)
    ax.set_xticks([6,10,14,17,21])
    sns.despine(ax=ax)

axs[0].set_xlabel('Postnatal day (P)')
axs[0].set_ylabel('Chromatin acc.\nlog2(archetype / baseline)')
for ax, arch, n_uniq in zip(axs, 'ABC', (num_uniq_a, num_uniq_b, num_uniq_c)):
    ax.set_title(f'{arch} regions\nn={n_uniq:,}')
output = os.path.join(outfigdir, 'abc_degs_signals_over_time.pdf') 
powerplots.savefig_autodate(fig, output)

plt.show()

In [None]:
# Same panels as the previous figure, with the DR samples / conditions (first 8
# samples, first 4 conditions) overlaid as half-transparent squares and lines.
fig, axs = plt.subplots(1, 3, figsize=(3*3,1*3), sharex=True, sharey=True)
archetype_colors = ['C0', 'C1', 'C2']
for i, ax in enumerate(axs):
    bigmat_mean_ig = bigmat_abc_ig_list[i]
    redmat_mean_ig = redmat_abc_ig_list[i]

    # individual samples: NR as circles, DR as squares
    for j, color in enumerate(archetype_colors):
        ax.plot(todo_samps_t[8:], bigmat_mean_ig[8:,j], 'o', markersize=5, fillstyle='none', color=color)
    for j, color in enumerate(archetype_colors):
        ax.plot(todo_samps_t[:8], bigmat_mean_ig[:8,j], 's', markersize=5, fillstyle='none', color=color, alpha=0.5)

    # per-condition means: NR solid, DR half-transparent
    for j, color in enumerate(archetype_colors):
        ax.plot(todo_conds_t[4:], redmat_mean_ig[4:,j], '-', color=color)
    for j, color in enumerate(archetype_colors):
        ax.plot(todo_conds_t[:4], redmat_mean_ig[:4,j], '-', color=color, alpha=0.5)

    ax.grid(False)
    ax.set_xticks([6,10,14,17,21])
    sns.despine(ax=ax)

axs[0].set_xlabel('Postnatal day (P)')
axs[0].set_ylabel('Chromatin acc.\nlog2(archetype / baseline)')
for ax, arch, n_uniq in zip(axs, 'ABC', (num_uniq_a, num_uniq_b, num_uniq_c)):
    ax.set_title(f'{arch} regions\nn={n_uniq:,}')
output = os.path.join(outfigdir, 'abc_degs_signals_over_time_withDR.pdf') 
powerplots.savefig_autodate(fig, output)
plt.show()

In [None]:
# Condition-level quantile-bin profiles and their flattened / z-scored forms.
redmat_nfd = mean_over_samples(bigmat_nfd)            # cond, type, peak
fmat_nfd, zmat_nfd = transform_bigredmat(redmat_nfd)  # peak, cond*type (both)
for arr in (redmat_nfd, fmat_nfd, zmat_nfd):
    print(arr.shape)

# Rows of the flattened matrices restricted to each region set.
region_masks = (cond_sig_a_any, cond_sig_b_any, cond_sig_c_any)
fmat_nfd_ag, fmat_nfd_bg, fmat_nfd_cg = (fmat_nfd[mask] for mask in region_masks)
zmat_nfd_ag, zmat_nfd_bg, zmat_nfd_cg = (zmat_nfd[mask] for mask in region_masks)

print(fmat_nfd_ag.shape, zmat_nfd_ag.shape)

In [None]:
# Condition-level tensor restricted (on the last axis) to each region set.
redmat_nfd_ag, redmat_nfd_bg, redmat_nfd_cg = (
    redmat_nfd[:,:,mask] for mask in (cond_sig_a_any, cond_sig_b_any, cond_sig_c_any)
)
redmat_nfd_ag.shape


In [None]:
from sklearn.cluster import KMeans

def mean_shape(vec):
    """Return the centroid position of the non-negative part of `vec`.

    Negative entries are clipped to zero, the result is normalised to sum to
    one, and the weighted mean of the positions 0..len(vec)-1 is returned.
    """
    weights = np.clip(vec, 0, None)
    weights = weights/np.sum(weights)

    positions = np.arange(len(vec))
    return positions.dot(weights)

def organize_zmat(zmat, fmat, redmat, title='', n_geneset_clsts=5, genes=None):
    """Cluster features with k-means and order the clusters by their timing.

    Rows of ``zmat`` are clustered into ``n_geneset_clsts`` groups. For each
    group a "time sketch" is built from ``redmat`` (mean over the group's
    features, then max over axis 1), and groups are renumbered by the centre
    of mass of that sketch (see ``mean_shape``), earliest first. All
    per-feature outputs are then sorted by the renumbered group label.

    NOTE ON CONDITION ORDER: ``redmat`` and (``zmat``, ``fmat``) do not use
    the same condition order. Here the first 4 conditions of ``redmat`` are
    dropped to keep the NR conditions, i.e. ``redmat`` is treated as DR-first.
    NOTE(review): the plotting cells label the ``zmat`` columns NR-first; an
    earlier version of this note stated the opposite for both, so confirm
    against ``transform_bigredmat``.

    Parameters
    ----------
    zmat : ndarray, (n_features, n_columns)
        Matrix that is clustered row-wise.
    fmat : ndarray, (n_features, ...)
        Companion matrix reordered exactly like ``zmat``.
    redmat : ndarray, indexed as [condition, type, feature]
        Source of the per-cluster time sketches.
    title : str
        Passed through unchanged in the result.
    n_geneset_clsts : int
        Number of k-means clusters.
    genes : array-like or None
        Feature names, one per row of ``zmat``. When None, the row indices
        ``0 .. n_features - 1`` are used as names.

    Returns
    -------
    dict
        ``title``; ``order`` (row permutation applied); ``zmat``, ``fmat``,
        ``genes``, ``clst`` (all reordered; ``clst`` holds the renumbered
        labels); ``time_sketches`` (one row per renumbered cluster);
        ``geneset_list`` (feature names per renumbered cluster).

    Raises
    ------
    ValueError
        If ``genes`` is given but its length differs from ``len(zmat)``.
    """
    n_features = len(zmat)
    if genes is None:
        # the advertised default used to crash at genes[geneset_order]
        genes = np.arange(n_features)
    elif isinstance(genes, (list, tuple)):
        genes = np.asarray(genes)  # plain sequences cannot be fancy-indexed
    if len(genes) != n_features:
        raise ValueError(
            f"genes has {len(genes)} entries but zmat has {n_features} rows"
        )

    method = KMeans(n_clusters=n_geneset_clsts, n_init=10, random_state=0)
    geneset_clst = method.fit_predict(zmat)

    # one sketch per raw cluster: mean over its features, then max over axis 1
    time_sketches = []
    for i in range(n_geneset_clsts):
        time_sketch = np.mean(redmat[:,:,geneset_clst==i], axis=2) # mean over genes
        time_sketch = np.max(time_sketch, axis=1) # max over cell types
        time_sketches.append(time_sketch)
    time_sketches = np.vstack(time_sketches)[:,4:] # n_geneset_clsts, n_cond (select NR only)

    # rank raw clusters by where the mass of their sketch sits along time
    clst_order = np.argsort([mean_shape(time_sketch) for time_sketch in time_sketches])
    # map raw label -> rank, then apply it to every feature
    geneset_clst_renamed = pd.Series({clst: i for i, clst in enumerate(clst_order)}).reindex(geneset_clst).values
    geneset_order = np.argsort(geneset_clst_renamed)

    # reorder every per-feature output with the same permutation
    genes_ordered = genes[geneset_order]
    clsts_ordered = geneset_clst_renamed[geneset_order]
    zmat_ordered = zmat[geneset_order]
    fmat_ordered = fmat[geneset_order]

    # feature names per renumbered cluster
    geneset_list = []
    for i in range(n_geneset_clsts):
        geneset_list.append(genes_ordered[clsts_ordered==i])

    res = {
        'title': title,
        'order': geneset_order,
        'zmat':  zmat_ordered,
        'fmat':  fmat_ordered,
        'genes': genes_ordered,
        'clst':  clsts_ordered,
        'time_sketches':  time_sketches[clst_order],
        'geneset_list': geneset_list,
    }
    return res

In [None]:
# Feature names of adata_pk as a plain array; used below to label clustered rows.
regions = adata_pk.var.index.values
regions.shape

In [None]:
# Cluster and order each archetype's features (default: 5 k-means clusters);
# `genes` carries the matching region names so results can be exported by name.
res_a = organize_zmat(zmat_nfd_ag, fmat_nfd_ag, redmat_nfd_ag, title='A regions', genes=regions[cond_sig_a_any])
res_b = organize_zmat(zmat_nfd_bg, fmat_nfd_bg, redmat_nfd_bg, title='B regions', genes=regions[cond_sig_b_any])
res_c = organize_zmat(zmat_nfd_cg, fmat_nfd_cg, redmat_nfd_cg, title='C regions', genes=regions[cond_sig_c_any])

In [None]:

# Quick look at the five ordered time sketches per region set; each panel is
# titled with the sketch's centre of mass (the quantity used for ordering).
for res in (res_a, res_c, res_b):
    sketches = res['time_sketches']
    fig, axs = plt.subplots(1, 5, figsize=(15, 3))
    for i, ax in enumerate(axs):
        sketch = sketches[i]
        ax.plot(sketch)
        ax.set_title(f"{mean_shape(sketch):.2f}")


In [None]:
# One large heatmap per region set: rows are features sorted by cluster,
# columns are the zmat columns (blocks of 5 per condition).
for res_this in [res_a, res_c, res_b]:
    order = res_this['order']
    title = res_this['title']
    zmat  = res_this['zmat']
    clsts = res_this['clst']
    genes_this = res_this['genes']

    fig, ax = plt.subplots(figsize=(10,12))

    sns.heatmap(zmat, cmap='coolwarm', cbar_kws=dict(shrink=0.5), 
                yticklabels=5000,
                vmax=3, vmin=-3,
                rasterized=True,
                ax=ax)
    # only the first block of columns gets tick labels
    ax.set_xticks(0.5+np.arange(n_type))
    # ax.set_yticks(0.5+np.arange(len(zmat)))
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=10, rotation=0)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=10, rotation=0)
    ax.set_title(title, pad=30)
    
    # white separators: between clusters (cumulative row counts) and between
    # condition blocks; 55 is the hard-coded column count (11 blocks of 5)
    ax.hlines(np.cumsum(np.unique(clsts, return_counts=True)[1]), 0, 55, color='white', linewidth=1)
    ax.vlines(np.arange(0,55,5), 0, len(zmat), color='white', linewidth=1)
    # black line after the 7th block, where the column labels switch from
    # todo_conds[4:] to todo_conds[:4]
    ax.vlines(7*5, 0, len(zmat), color='black', linewidth=1)

    ax.grid(False)
    # condition names above each block, in the column order used by zmat
    for i, cond in enumerate(np.hstack([todo_conds[4:], todo_conds[:4]])):
        # ax.axvline(condcode*5, color='k', linestyle='--', linewidth=1)
        ax.text(i*5, -0.5, f'{cond}', fontsize=10, va='bottom')

    # file name uses the first letter of the title (A / B / C)
    output = os.path.join(outfigdir, f'heatmap_{title[0]}.pdf')
    powerplots.savefig_autodate(fig, output)
    plt.show()

In [None]:
# inspect which fields organize_zmat returned
res_b.keys()


In [None]:
# row permutation that sorts the B features by cluster label
res_b['order']

In [None]:
# B region names in the sorted order
res_b['genes']

In [None]:
# renumbered cluster label per sorted B region
res_b['clst']

In [None]:
# B regions whose renumbered cluster label is 1 (labels start at 0)
res_b['genes'][res_b['clst']==1]
# df_tosave = pd.DataFrame(list(pd.Series(res['geneset_list'][i]).str.split(pat=':|-'))).sort_values([0,1,2])#.shape #[res['clst']==0]

In [None]:
# same selection as the previous cell, read from the precomputed per-cluster list
res_b['geneset_list'][1]

In [None]:
# cluster label of one specific B region
# (presumably get_index_from_array returns its position in res_b['genes'] - verify)
res_b['clst'][basicu.get_index_from_array(res_b['genes'], ['chr19:48254253-48254754'])]

In [None]:
# cluster label of one specific A region
# (presumably get_index_from_array returns its position in res_a['genes'] - verify)
res_a['clst'][basicu.get_index_from_array(res_a['genes'], ['chr3:141618480-141618981'])]

In [None]:
# cluster label of one specific C region
# (presumably get_index_from_array returns its position in res_c['genes'] - verify)
res_c['clst'][basicu.get_index_from_array(res_c['genes'], ['chr17:52873088-52873589'])]

In [None]:
# Centroid heatmap per region set: one row per cluster (mean of its zmat
# rows), all condition blocks shown. Exploratory; saving is disabled.
for res_this in [res_a, res_c, res_b]:
    order = res_this['order']
    title = res_this['title']
    zmat  = res_this['zmat']
    clsts = res_this['clst']
    genes_this = res_this['genes']

    fig, ax = plt.subplots(figsize=(10,2))

    ctrds = np.array([np.mean(zmat[clsts==i], axis=0) for i in range(5)])
    sns.heatmap(ctrds, cmap='coolwarm', cbar_kws=dict(shrink=0.5), 
                vmax=2, vmin=-2,
                rasterized=True,
                ax=ax)
    ax.set_xticks(0.5+np.arange(n_type))
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=10, rotation=0)
    ax.set_title(title, pad=30)
    
    # Row separators belong at the boundaries between the centroid rows.
    # (They used to be placed at cumulative region counts, which lie far
    # outside a 5-row heatmap and were therefore never visible.)
    ax.hlines(np.arange(1, len(ctrds)), 0, 55, color='white', linewidth=1)
    ax.vlines(np.arange(0,55,5), 0, 5, color='white', linewidth=1)
    # black line after the 7th condition block
    ax.vlines(7*5, 0, 5, color='black', linewidth=1)

    ax.grid(False)
    # condition names above each block, in the column order used by zmat
    for i, cond in enumerate(np.hstack([todo_conds[4:], todo_conds[:4]])):
        # ax.axvline(condcode*5, color='k', linestyle='--', linewidth=1)
        ax.text(i*5, -0.5, f'{cond}', fontsize=10, va='bottom')

    # output = os.path.join(outfigdir, f'heatmap_centroids_{title[0]}.pdf')
    # powerplots.savefig_autodate(fig, output)
    plt.show()
    

In [None]:
# Saved version of the centroid heatmap: the trailing 5*4 columns are dropped
# so that only the first 7 condition blocks (labelled todo_conds[4:]) remain.
for res_this in (res_a, res_c, res_b):
    title, zmat, clsts = res_this['title'], res_this['zmat'], res_this['clst']

    fig, ax = plt.subplots(figsize=(8, 2))

    # one centroid per cluster, then cut the last 20 columns
    ctrds = np.array([np.mean(zmat[clsts == k], axis=0) for k in range(5)])
    ctrds = ctrds[:, :-20]
    sns.heatmap(
        ctrds,
        cmap='coolwarm',
        cbar_kws={'shrink': 0.5},
        vmax=2,
        vmin=-2,
        rasterized=True,
        ax=ax,
    )
    ax.set_xticks(np.arange(n_type) + 0.5)
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=10, rotation=0)
    ax.set_title(title, pad=30)

    # row labels: set letter + cluster number + number of regions
    num_clsts = np.unique(clsts, return_counts=True)[1]
    row_labels = [f'{title[0]}{i+1} ({num_clsts[i]}r)' for i in range(5)]
    ax.set_yticklabels(row_labels, rotation=0)
    ax.vlines(np.arange(0, 35, 5), 0, 5, color='white', linewidth=1)

    ax.grid(False)
    # condition name above each block of 5 columns
    for i, cond in enumerate(todo_conds[4:]):
        ax.text(i*5, -0.5, f'{cond}', fontsize=10, va='bottom')

    output = os.path.join(outfigdir, f'heatmap_centroids_{title[0]}.pdf')
    powerplots.savefig_autodate(fig, output)
    plt.show()
    # break
    

# Profile these modules

In [None]:
# Sampled ages as postnatal days: the full series, and the subset listed for DR.
times = np.array([6,8,10,12,14,17,21])
dr_times = np.array([12,14,17,21])

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop gradients running from black to 'C0' / 'C1' / 'C2'.
colors_a, colors_b, colors_c = (
    [(0.0, 'black'), (1.0, tip)] for tip in ('C0', 'C1', 'C2')
)

# Build one custom colormap per gradient.
cmap_a = LinearSegmentedColormap.from_list('cmap_a', colors_a)
cmap_b = LinearSegmentedColormap.from_list('cmap_b', colors_b)
cmap_c = LinearSegmentedColormap.from_list('cmap_c', colors_c)

# Five-step palette: the three end colours plus two 70/30 blends in between.
_rgba_a, _rgba_b, _rgba_c = (
    np.array(cmap(1.0)) for cmap in (cmap_a, cmap_b, cmap_c)
)
colors_l23 = [
    _rgba_a,
    0.7*_rgba_a + 0.3*_rgba_b,
    _rgba_b,
    0.7*_rgba_b + 0.3*_rgba_c,
    _rgba_c,
]

# Another version with ABC scores
- and replicates

In [None]:
        
# Per-module time courses, one figure per region set and one panel per module.
# Only the later part of the sample/condition axes is drawn here (samples 8+,
# conditions 4+); the DR variants of this figure draw the leading part too.
for suptitle, res_this, cond_sig_i_any in zip(
    ['A_genes', 'C_genes', 'B_genes',],
    [res_a, res_c, res_b,],
    [cond_sig_a_any, cond_sig_c_any, cond_sig_b_any,],
    ):
    order = res_this['order']
    clsts = res_this['clst']
    # restrict to this set's features, then apply the cluster-sorted order
    bigmat_abc_ig_order = bigmat_abc[:,:,cond_sig_i_any][:,:,order]
    
    fig, axs = plt.subplots(1, 5, figsize=(3*5,1*4), sharex=True, sharey=True)
    for i in range(5):
        ax = axs[i]
        num = (clsts==i).sum()

        # module mean per row of bigmat_abc, then collapsed by mean_over_samples
        big_y = np.mean(bigmat_abc_ig_order[:,:,clsts==i], axis=2)
        red_y = mean_over_samples(big_y)

        # markers first, then lines, one colour per column of the score axis
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_samps_t[8:], big_y[8:,col_idx], 'o', markersize=5, fillstyle='none', color=line_color)
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_conds_t[4:], red_y[4:,col_idx], '-', color=line_color)

        ax.grid(False, axis='x')
        ax.set_xticks([6,10,14,17,21])
        sns.despine(ax=ax)
        ax.set_title(f'{suptitle[0]}{i+1}: {num} regions')

    axs[0].set_xlabel('Postnatal day (P)')
    # the plotted features are chromatin regions, not genes
    axs[0].set_ylabel('Chromatin acc.\nlog2(archetype / baseline)')
    # file name kept as-is so existing outputs keep their names
    output = os.path.join(outfigdir, f'gene_groups_abc_v4_{suptitle}.pdf') 
    powerplots.savefig_autodate(fig, output)
    plt.show()

In [None]:
        
# Same per-module time courses, with the DR samples/conditions (the leading
# 8 samples / 4 conditions) overlaid on the same x axis as squares and
# half-transparent lines.
for suptitle, res_this, cond_sig_i_any in zip(
    ['A_genes', 'C_genes', 'B_genes',],
    [res_a, res_c, res_b,],
    [cond_sig_a_any, cond_sig_c_any, cond_sig_b_any,],
    ):
    order = res_this['order']
    clsts = res_this['clst']
    # restrict to this set's features, then apply the cluster-sorted order
    bigmat_abc_ig_order = bigmat_abc[:,:,cond_sig_i_any][:,:,order]
    
    fig, axs = plt.subplots(1, 5, figsize=(3*5,1*4), sharex=True, sharey=True)
    for i in range(5):
        ax = axs[i]
        prop = (clsts == i).sum() / len(clsts) 

        # module mean per row of bigmat_abc, then collapsed by mean_over_samples
        big_y = np.mean(bigmat_abc_ig_order[:,:,clsts==i], axis=2)
        red_y = mean_over_samples(big_y)

        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_samps_t[8:], big_y[8:,col_idx], 'o', markersize=5, fillstyle='none', color=line_color)
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_conds_t[4:], red_y[4:,col_idx], '-', color=line_color)

        # DR
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_samps_t[:8], big_y[:8,col_idx], 's', markersize=5, fillstyle='none', color=line_color)
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_conds_t[:4], red_y[:4,col_idx], '-', color=line_color, alpha=0.5)
        
        ax.grid(False, axis='x')
        ax.set_xticks([6,10,14,17,21])
        sns.despine(ax=ax)
        # no space flag in the format spec: it left a double space in the title
        ax.set_title(f'{suptitle[0]}{i+1}: {prop*100:.1f}%')

    axs[0].set_xlabel('Postnatal day (P)')
    # the plotted features are chromatin regions, not genes
    axs[0].set_ylabel('Chromatin acc.\nlog2(archetype / baseline)')
    # file name kept as-is so existing outputs keep their names
    output = os.path.join(outfigdir, f'gene_groups_abc_v4_{suptitle}_DR.pdf') 
    powerplots.savefig_autodate(fig, output)
    plt.show()

In [None]:
# Per-module time courses with the DR series drawn to the right of the main
# series: DR x positions are shifted by `plot_offset` days and relabelled.
for suptitle, res_this, cond_sig_i_any in zip(
    ['A_genes', 'C_genes', 'B_genes',],
    [res_a, res_c, res_b,],
    [cond_sig_a_any, cond_sig_c_any, cond_sig_b_any,],
    ):
    order = res_this['order']
    clsts = res_this['clst']
    # restrict to this set's features, then apply the cluster-sorted order
    bigmat_abc_ig_order = bigmat_abc[:,:,cond_sig_i_any][:,:,order]
    
    fig, axs = plt.subplots(1, 5, figsize=(5*3,1*4), sharex=True, sharey=True)
    for i in range(5):
        ax = axs[i]
        prop = (clsts == i).sum() / len(clsts) 

        # module mean per row of bigmat_abc, then collapsed by mean_over_samples
        big_y = np.mean(bigmat_abc_ig_order[:,:,clsts==i], axis=2)
        red_y = mean_over_samples(big_y)

        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_samps_t[8:], big_y[8:,col_idx], 'o', markersize=5, fillstyle='none', color=line_color)
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_conds_t[4:], red_y[4:,col_idx], '-', color=line_color)

        # DR
        plot_offset = 12
        # shaded bands mark the same day range (11 to 22) in both halves
        ax.axvspan(12-2+1            , 21+1            , color='orange', alpha=0.1, linewidth=0, zorder=0)
        ax.axvspan(12-2+1+plot_offset, 21+1+plot_offset, color='lightgray', alpha=0.3, linewidth=0, zorder=0)
        
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_samps_t[:8]+plot_offset, big_y[:8,col_idx], 's', markersize=5, fillstyle='none', color=line_color)
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_conds_t[:4]+plot_offset, red_y[:4,col_idx], '--', color=line_color, alpha=1)
        
        ax.grid(False, axis='x')
        # shifted ticks are labelled with their true age
        ax.set_xticks([6, 12, 21, 
                          12+plot_offset, 21+plot_offset])
        ax.set_xticklabels([6, 12, 21, 
                               12, 21])
        sns.despine(ax=ax)
        # no space flag in the format spec: it left a double space in the title
        ax.set_title(f'{suptitle[0]}{i+1}: {prop*100:.1f}%')

    axs[0].set_xlabel('Postnatal day (P)')
    # the plotted features are chromatin regions, not genes
    axs[0].set_ylabel('Chromatin acc.\nlog2(archetype / baseline)')
    fig.tight_layout()
    
    # file name kept as-is so existing outputs keep their names
    output = os.path.join(outfigdir, f'gene_groups_abc_v4_{suptitle}_DR2.pdf') 
    powerplots.savefig_autodate(fig, output)
    plt.show()
    
    # break

# Quantify time vs DR effect
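- Late modules are DR-sensitive.
- Compare the time effect (P21NR − P10NR) with the vision effect (P21DR − P21NR).
- Refine this as the average time effect vs. the average DR effect (DR − NR averaged over the matched ages P12, P14, P17 and P21).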

In [None]:
from sklearn.metrics import r2_score

In [None]:
# dels[0] = time effect, dels[1] = vision (DR) effect;
# axis 1 = region set (A, B, C), axis 2 = the 5 modules of that set
dels = np.zeros((2,3,5))

for j, (res_this, cond_sig_i_any) in enumerate([
        (res_a, cond_sig_a_any),
        (res_b, cond_sig_b_any),
        (res_c, cond_sig_c_any),
    ]):
    order, clsts = res_this['order'], res_this['clst']
    # restrict to this set's features, then apply the cluster-sorted order
    bigmat_abc_ig_order = bigmat_abc[:,:,cond_sig_i_any][:,:,order]

    for i in range(5):
        in_module = (clsts == i)
        big_y = np.mean(bigmat_abc_ig_order[:,:,in_module], axis=2)
        red_y = mean_over_samples(big_y)

        # time effect: row 10 minus row 6, averaged over the ABC columns
        del_t = np.mean(red_y[10] - red_y[6])

        # vision effect at each matched age: rows 0..3 minus rows 7..10,
        # latest age first, each averaged over the ABC columns
        del_varr = np.array([np.mean(red_y[k] - red_y[7+k]) for k in (3, 2, 1, 0)])
        del_v = np.mean(del_varr) # mean over time

        dels[0, j, i] = del_t
        dels[1, j, i] = del_v

In [None]:
# Scatter of time effect (x) against DR effect (y), one point per module,
# with a least-squares line through all 15 points.
_x = dels[0]
_y = dels[1]

_xflat, _yflat = _x.reshape(-1,), _y.reshape(-1,)
# keep the p-value in a named variable rather than clobbering IPython's `_`
r, _pval = stats.pearsonr(_xflat, _yflat)
slope, intercept = np.polyfit(_xflat, _yflat, 1)
xbase = np.linspace(-1.5,1.5,5) 
ybase = slope*xbase + intercept
r2 = r2_score(_yflat, _xflat*slope+intercept)

# for a least-squares line R^2 equals r^2; compare in both directions
assert abs(r**2 - r2) < 1e-3

fig, ax = plt.subplots(figsize=(5,4))
# one scatter call per region set so each set gets the next default colour
for xs, ys in zip(_x, _y):
    ax.scatter(xs, ys)
# label every point with its set letter and module number
for tag, xs, ys in zip('ABC', _x, _y):
    for i, (xt, yt) in enumerate(zip(xs, ys)):
        ax.text(xt, yt+0.01, f'{tag}{i+1}', fontsize=8, va='bottom', ha='center',)
    
ax.plot(xbase, ybase, '--k', linewidth=1, zorder=0)
ax.axvline(0, color='gray',  linewidth=1, zorder=0)
ax.axhline(0, color='gray',  linewidth=1, zorder=0)
ax.grid(False)
sns.despine(ax=ax)
ax.set_ylabel('log2(DR/NR)')
ax.set_xlabel('log2(P21/P10)')
# '+' forces an explicit sign so a positive intercept reads "x+0.10", not "x0.10"
ax.set_title(f'y={slope:.2f}x{intercept:+.2f}; r={r:.2f}', fontsize=15)

output = os.path.join(outfigdir, 'time_vs_dr_linear.pdf') 
powerplots.savefig_autodate(fig, output)
plt.show()

# Check individual regions

In [None]:
def plot_genes(query, query_idx=None):
    """Plot per-feature time courses, with the DR series drawn side by side.

    One panel per entry of ``query``. The main series (samples 8+, conditions
    4+) is drawn at its true x position; the leading 8 samples / 4 conditions
    (marked "DR" below) are shifted right by ``plot_offset`` days and
    relabelled with their true age.

    Parameters
    ----------
    query : sequence of str
        Feature names, used as panel titles.
    query_idx : sequence of int, optional
        Position of each query entry along the last axis of ``bigmat_abc``.
        When omitted it is looked up from the module-level ``regions`` with
        ``basicu.get_index_from_array(regions, query)``, which is how the
        notebook builds it; previously the function silently read a global
        ``query_idx`` that had to be kept in sync by hand.

    Returns
    -------
    None
        The figure is shown, not saved.
    """
    if query_idx is None:
        query_idx = basicu.get_index_from_array(regions, query)

    n = len(query)
    # squeeze=False keeps `axs` an array even for a single-panel figure
    fig, axs = plt.subplots(1, n, figsize=(n*3,1*4), sharex=True, squeeze=False) # , sharey=True)
    axs = axs.ravel()
    for i in range(n):
        ax = axs[i]
        gn = query[i]
        gidx = query_idx[i]

        big_y = bigmat_abc[:,:,gidx]
        red_y = mean_over_samples(big_y) 

        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_samps_t[8:], big_y[8:,col_idx], 'o', markersize=5, fillstyle='none', color=line_color)
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_conds_t[4:], red_y[4:,col_idx], '-', color=line_color)

        # DR
        plot_offset = 12
        # shaded bands mark the same day range (11 to 22) in both halves
        ax.axvspan(12-2+1            , 21+1            , color='orange', alpha=0.1, linewidth=0, zorder=0)
        ax.axvspan(12-2+1+plot_offset, 21+1+plot_offset, color='lightgray', alpha=0.3, linewidth=0, zorder=0)

        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_samps_t[:8]+plot_offset, big_y[:8,col_idx], 's', markersize=5, fillstyle='none', color=line_color)
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_conds_t[:4]+plot_offset, red_y[:4,col_idx], '--', color=line_color, alpha=1)

        ax.grid(False, axis='x')
        # shifted ticks are labelled with their true age
        ax.set_xticks([6, 12, 21, 
                          12+plot_offset, 21+plot_offset])
        ax.set_xticklabels([6, 12, 21, 
                               12, 21])
        sns.despine(ax=ax)
        ax.set_title(gn)

    axs[0].set_xlabel('Postnatal day (P)')
    # the plotted features are chromatin regions, not genes
    axs[0].set_ylabel('Chromatin acc.\nlog2(archetype / baseline)')
    fig.tight_layout()

    # output = os.path.join(outfigdir, f'gene_groups_abc_v4_{suptitle}_DR2.pdf') 
    # powerplots.savefig_autodate(fig, output)
    plt.show()
    
def plot_genes_nr(query, output=None, query_idx=None):
    """Plot per-feature time courses for the main series only.

    One panel per entry of ``query``; only samples 8+ (markers) and
    conditions 4+ (lines) are drawn.

    Parameters
    ----------
    query : sequence of str
        Feature names, used as panel titles.
    output : str, optional
        When given, the figure is saved through
        ``powerplots.savefig_autodate`` before being shown.
    query_idx : sequence of int, optional
        Position of each query entry along the last axis of ``bigmat_abc``.
        When omitted it is looked up from the module-level ``regions`` with
        ``basicu.get_index_from_array(regions, query)``, which is how the
        notebook builds it; previously the function silently read a global
        ``query_idx`` that had to be kept in sync by hand.

    Returns
    -------
    None
    """
    if query_idx is None:
        query_idx = basicu.get_index_from_array(regions, query)

    n = len(query)
    # squeeze=False keeps `axs` an array even for a single-panel figure
    fig, axs = plt.subplots(1, n, figsize=(n*3,1*4), sharex=True, squeeze=False) # , sharey=True)
    axs = axs.ravel()
    for i in range(n):
        ax = axs[i]
        gn = query[i]
        gidx = query_idx[i]

        big_y = bigmat_abc[:,:,gidx]
        red_y = mean_over_samples(big_y) 

        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_samps_t[8:], big_y[8:,col_idx], 'o', markersize=5, fillstyle='none', color=line_color)
        for col_idx, line_color in enumerate(['C0', 'C1', 'C2']):
            ax.plot(todo_conds_t[4:], red_y[4:,col_idx], '-', color=line_color)

        ax.grid(False, axis='x')
        ax.set_xticks([6, 10, 14, 21])
        sns.despine(ax=ax)
        ax.set_title(gn)

    axs[0].set_xlabel('Postnatal day (P)')
    # the plotted features are chromatin regions, not genes
    axs[0].set_ylabel('Chromatin acc.\nlog2(archetype / baseline)')
    fig.tight_layout()

    if output is not None:
        powerplots.savefig_autodate(fig, output)
    plt.show()

In [None]:
# inspect the dimensions of the 3-D matrix indexed by the plotting helpers
bigmat_abc.shape

In [None]:
# Spot-check the first five entries of c_all (defined earlier in the notebook).
query = c_all[:5]
# positions of the queried names within `regions`
# (presumably what get_index_from_array returns - verify)
query_idx = basicu.get_index_from_array(regions, query)
print(query_idx)
# main series only
plot_genes_nr(query)

# main series plus the DR series drawn side by side
plot_genes(query)

# Export BED files for TFBS enrichment analysis

In [None]:
# Split every feature name on ':' or '-' into separate columns
# (e.g. 'chr19:48254253-48254754' gives three fields); values stay strings.
peakset = pd.DataFrame(list(adata_pk.var.index.str.split(pat=':|-'))) # 
peakset

In [None]:
# Write one BED file per (region set, module): 3 sets x 5 modules.
for tag, res in zip(['A', 'B', 'C'], [res_a, res_b, res_c]):
    for i in range(5):
        fout = f'~/v1_multiome/L23{tag}_regions_M{i+1}.bed'
        print(fout)
        df_tosave = (
            # 'chr:start-end' -> three columns (0, 1, 2)
            pd.DataFrame(list(pd.Series(res['geneset_list'][i]).str.split(pat=':|-')))
            # cast coordinates to int so they sort numerically; as strings
            # '100' would sort before '20'
            .astype({1: int, 2: int})
            .sort_values([0,1,2])
        )
        df_tosave.to_csv(fout, sep='\t', header=False, index=False)


In [None]:
# bedtofasta
# for i in *.bed; do j=${i/bed/fasta}; echo $i, $j; bedtools getfasta -fi ~/project-zipursky/v1-bb/v1/results_atac/mm10.fna -bed $i -fo $j; done

In [None]:
# run AME

In [None]:
# report ABC as a whole

In [None]:
# Rows of `peakset` that are flagged anywhere along axis 0 of cond_sig_*.
# NOTE(review): presumably axis 0 of cond_sig_* runs over conditions and this
# mask equals cond_sig_*_any used above - confirm.
peakset_a = peakset[np.any(cond_sig_a, axis=0)]
peakset_b = peakset[np.any(cond_sig_b, axis=0)]
peakset_c = peakset[np.any(cond_sig_c, axis=0)]
print(peakset_a.shape, peakset_b.shape, peakset_c.shape)
peakset_a

In [None]:
# Target paths for the whole-set A / B / C BED files.
f_a = "/u/home/f/f7xiesnm/v1_multiome/atac_fragments/frag_bed/processed_peaks/consensus_peaks_v2_DAR_A.bed" 
f_b = "/u/home/f/f7xiesnm/v1_multiome/atac_fragments/frag_bed/processed_peaks/consensus_peaks_v2_DAR_B.bed" 
f_c = "/u/home/f/f7xiesnm/v1_multiome/atac_fragments/frag_bed/processed_peaks/consensus_peaks_v2_DAR_C.bed" 

# Writing is disabled; uncomment to (re)generate the files above.
# peakset_a.to_csv(f_a, sep='\t', header=False, index=False)
# peakset_b.to_csv(f_b, sep='\t', header=False, index=False)
# peakset_c.to_csv(f_c, sep='\t', header=False, index=False)