In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import collections
from natsort import natsorted

from scipy import stats
from scipy import sparse
from sklearn.decomposition import PCA
from umap import UMAP
from statsmodels.stats.multitest import multipletests

from matplotlib.colors import LinearSegmentedColormap

from scroutines.config_plots import *
from scroutines import powerplots # .config_plots import *
from scroutines import pnmf
from scroutines import basicu
from scroutines.gene_modules import GeneModules  


In [None]:
# Directory that receives every figure written by this notebook.
outfigdir = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/figures/250409"
# Create it (including missing parents) from Python. Unlike `!mkdir`, this does not
# fail when the directory already exists, so the cell survives Restart & Run All.
os.makedirs(outfigdir, exist_ok=True)

# load gene annotation and data

In [None]:
gene_modules = GeneModules()
g, gs, ms = gene_modules.check_genes('Cdh13')
print("\t".join(g))
print("\t".join(gs))
print("\t".join(ms))

In [None]:
genes_alltime_hvgs = np.loadtxt('/u/home/f/f7xiesnm/v1_multiome/l23_alltime_hvgs_n4940.txt', dtype='str')
genes_alltime_hvgs

In [None]:
# Curated L2/3 gene table: one row per gene, with its group (A/B/C) in column 'P17on'.
df = pd.read_csv("/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/cheng21_cell_scrna/res/L23-ABC-genes-n288-n286unq-annot_v3_july8_2024.csv")
genes_l23 = df['gene'].astype(str).values
genes_grp = df['P17on'].astype(str).values

# one gene array per group label
genes_l23a, genes_l23b, genes_l23c = (
    df.loc[df['P17on'] == grp_label, 'gene'].astype(str).values
    for grp_label in ('A', 'B', 'C')
)
print(genes_l23a.shape, genes_l23b.shape, genes_l23c.shape)

# every gene must appear exactly once
assert len(np.unique(genes_l23)) == len(genes_l23)

genes_l23.shape

In [None]:
genes_alltime_hvgs_rm_l23 = genes_alltime_hvgs[~np.isin(genes_alltime_hvgs, genes_l23)]
genes_alltime_hvgs_rm_l23.shape

In [None]:
scores_abc = pd.read_csv("/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/scores_l23abc.csv", 
                         index_col=0,
                        )
scores_abc['scores_c-a'] = scores_abc['scores_c'] - scores_abc['scores_a']
scores_abc

In [None]:
# Load the L2/3 AnnData object. `anndata.read` is a deprecated alias of `read_h5ad`;
# calling the explicit reader keeps this cell working on anndata versions without the alias.
adata = anndata.read_h5ad("/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_l23.h5ad")
adata

In [None]:
adata.X = adata.raw.X

In [None]:
# Copy each archetype score column into adata.obs, aligned on the cell index.
for score_col in ('scores_a', 'scores_b', 'scores_c', 'scores_c-a'):
    adata.obs[score_col] = scores_abc.loc[adata.obs.index, score_col].copy()

In [None]:
sample_labels = adata.obs['Sample'].values
# Drop the last character of each sample name, then remove any 'DR' tag
# (presumably 'P12DRa' -> 'P12'; confirm against the actual 'Sample' values).
time_labels = [label[:-1].replace('DR', '') for label in sample_labels]

adata.obs['sample'] = sample_labels
adata.obs['time']   = time_labels

# naturally sorted unique samples, split by whether the name carries the 'DR' tag
uniq_samples = natsorted(np.unique(sample_labels))
dr_samples = [name for name in uniq_samples if "DR" in name]
nr_samples = [name for name in uniq_samples if "DR" not in name]

uniq_conds = np.array(natsorted(np.unique(adata.obs['cond'].values)))
print(uniq_conds)

In [None]:
# Hard-coded positions into per-condition arrays (e.g. nums_a[nr_idx], nums_a[dr_idx]),
# paired element-wise with the time points below when plotting.
# NOTE(review): these indices assume a specific ordering of the conditions (position 3
# is plotted as DR at time 12, position 4 as non-DR at time 12) -- confirm against the
# printed `uniq_conds` before trusting the plots.
nr_idx = np.array([0,1,2,4,6,8,10])
dr_idx = np.array([3,5,7,9])

# time values plotted on the x axis for the non-DR and DR series, respectively
nr_times = np.array([6,8,10,12,14,17,21])
dr_times = np.array(       [12,14,17,21])

In [None]:
# Drop mitochondrial genes (names starting with 'mt-') and Xist in a single pass.
is_mito = adata.var.index.str.contains(r'^mt-')
is_xist = adata.var.index.str.contains(r'^Xist$')
adata = adata[:, ~(is_mito | is_xist)]

# Keep genes detected (count > 0) in more than 10 cells.
n_cells_expressing = np.ravel((adata.X > 0).sum(axis=0))
cond = n_cells_expressing > 10
adata = adata[:, cond].copy()

In [None]:
# Raw counts, per-cell totals and gene names.
x = adata.X
genes = adata.var.index.values
cov = np.ravel(x.sum(axis=1))

# CP10k: scale every cell (row) by 1e4 / total counts, staying sparse.
xn = (sparse.diags(1/cov) @ x)*1e4

# log2(CP10k+1), applied to the stored (non-zero) entries only; zeros stay zero.
xln = xn.copy()
xln.data = np.log2(1 + xln.data)

# Store both normalizations as dense layers.
adata.layers[    'norm'] = xn.toarray()
adata.layers[ 'lognorm'] = xln.toarray()

In [None]:
genes_idx_alltime_hvgs_rm_l23 = basicu.get_index_from_array(adata.var.index.values, genes_alltime_hvgs_rm_l23)
genes_idx_alltime_hvgs_rm_l23

In [None]:
# Seed NumPy's global RNG so the random gene draws (and the optional shuffle) in the
# loop below are reproducible. The previous `np.random.rand(0)` only produced an empty
# array and left the generator unseeded.
np.random.seed(0)

num_archetypal_cells = 100  # cells taken from each extreme of the score rankings
offset = 1 # CP10k + offset (CPM + 100*offset)
SHUFFLE = False # True permutes cells after selection (see the loop below)

n_pseudo_genes = 35  # random HVGs averaged into each pseudo "B" score
n_repeat = 2         # number of random-gene control repeats per condition

n_cond = len(uniq_conds)
n_gene = adata.shape[1] 

qs_tensor   = np.zeros((n_cond,3,n_gene))  # 3 represents 3 pairwise comparisons (ca, ba, bc)
l2fc_tensor = np.zeros((n_cond,3,n_gene))

cond_sig_bp_tensor   = np.zeros((n_cond,n_repeat,n_gene))  # condition x control repeat x gene

# Per-condition differential expression between archetypal A, B and C cells, plus a
# control in which "B" cells are picked by a score built from random genes.
# NOTE: the loop variable `cond` reuses (and overwrites) the name of the gene-filter
# mask defined in an earlier cell.
for cond_code, cond in enumerate(uniq_conds):
    # get sub
    adatasub = adata[adata.obs['cond']==cond]
    n_cells = adatasub.shape[0]
    
    # get A vs C 
    ranks_ac = adatasub.obs['scores_c-a'].rank()
    ranks_b  = adatasub.obs['scores_b'].rank()
    
    # A = lowest (C - A) ranks, C = highest (C - A) ranks, B = highest B-score ranks
    precond_a = ranks_ac <= num_archetypal_cells
    precond_c = ranks_ac > adatasub.shape[0] - num_archetypal_cells
    precond_b = ranks_b  > adatasub.shape[0] - num_archetypal_cells
    
    # keep only cells that belong to exactly one of the three extreme sets
    cond_a = np.all([ precond_a, ~precond_b, ~precond_c], axis=0)
    cond_b = np.all([~precond_a,  precond_b, ~precond_c], axis=0)
    cond_c = np.all([~precond_a, ~precond_b,  precond_c], axis=0)
    
    # SHUFFLE
    # The masks above are positional arrays computed before this permutation, so after
    # shuffling they pick cells unrelated to their scores (presumably a null control).
    if SHUFFLE:
        adatasub = adatasub[np.random.choice(n_cells, size=n_cells, replace=False)]
    
    # print(precond_a.sum(), 
    #       precond_b.sum(), 
    #       precond_c.sum(),)
    print(cond, cond_a.sum(), cond_b.sum(), cond_c.sum())
    
    adatasub_a = adatasub[cond_a]
    adatasub_b = adatasub[cond_b]
    adatasub_c = adatasub[cond_c]
    
    # DEGs
    mat_a = adatasub_a.layers['norm'][...]
    mat_b = adatasub_b.layers['norm'][...]
    mat_c = adatasub_c.layers['norm'][...]
    
    logmat_a = adatasub_a.layers['lognorm'][...]
    logmat_b = adatasub_b.layers['lognorm'][...]
    logmat_c = adatasub_c.layers['lognorm'][...]
    
    # per-gene two-sample t-tests on log-normalized expression
    ts_ca, ps_ca = stats.ttest_ind(logmat_c, logmat_a)
    ts_ba, ps_ba = stats.ttest_ind(logmat_b, logmat_a)
    ts_bc, ps_bc = stats.ttest_ind(logmat_b, logmat_c)
    
    # NaN p-values are set to 1 before BH correction (original note: NaNs come from genes not expressed)
    _, qs_ca, _, _ = multipletests(np.nan_to_num(ps_ca, nan=1).reshape(-1,), method='fdr_bh')
    _, qs_ba, _, _ = multipletests(np.nan_to_num(ps_ba, nan=1).reshape(-1,), method='fdr_bh')
    _, qs_bc, _, _ = multipletests(np.nan_to_num(ps_bc, nan=1).reshape(-1,), method='fdr_bh')
    
    l2fc_ca = np.log2(np.mean(mat_c, axis=0)+offset) - np.log2(np.mean(mat_a, axis=0)+offset) # log2FC (CP10k as raw counts)
    l2fc_ba = np.log2(np.mean(mat_b, axis=0)+offset) - np.log2(np.mean(mat_a, axis=0)+offset) # log2FC (CP10k as raw counts)
    l2fc_bc = np.log2(np.mean(mat_b, axis=0)+offset) - np.log2(np.mean(mat_c, axis=0)+offset) # log2FC (CP10k as raw counts)
    
    qs_a   = np.minimum(qs_ca, qs_ba) # the better of the two
    qs_c   = np.minimum(qs_ca, qs_bc) # the better of the two
    qs_b   = np.minimum(qs_ba, qs_bc) # the better of the two

    l2fc_a = np.max([-l2fc_ca, -l2fc_ba], axis=0) # larger of the two log2 fold changes (not a mean)
    l2fc_c = np.max([ l2fc_ca, -l2fc_bc], axis=0) # larger of the two log2 fold changes (not a mean)
    l2fc_b = np.max([ l2fc_ba,  l2fc_bc], axis=0) # larger of the two log2 fold changes (not a mean)

    # NOTE: these per-condition masks are not stored; a later cell recomputes them from the tensors
    cond_sig_a = np.all([-l2fc_ca > 0, -l2fc_ba > 0, l2fc_a > 1, qs_a < 0.05], axis=0)
    cond_sig_c = np.all([ l2fc_ca > 0, -l2fc_bc > 0, l2fc_c > 1, qs_c < 0.05], axis=0)
    cond_sig_b = np.all([ l2fc_ba > 0,  l2fc_bc > 0, l2fc_b > 1, qs_b < 0.05], axis=0)
    
    # save this
    l2fc_tensor[cond_code, 0] = l2fc_ca
    l2fc_tensor[cond_code, 1] = l2fc_ba
    l2fc_tensor[cond_code, 2] = l2fc_bc
    
    qs_tensor[cond_code, 0] = qs_ca
    qs_tensor[cond_code, 1] = qs_ba
    qs_tensor[cond_code, 2] = qs_bc
    
    # [0,1] scaled scores
    # per-gene min-max scaling within this condition; 1e-10 avoids division by zero
    mat = adatasub.layers['lognorm'][...]
    mins = np.min(mat, axis=0)
    maxs = np.max(mat, axis=0)
    nmat = (mat - mins)/(maxs-mins+1e-10)
    
    # print(cond, cond_sig_a.sum(), cond_sig_c.sum(), cond_sig_b.sum()) 
    
    # select cells by pseudo scores (B)
    # Control: rank cells by the mean of n_pseudo_genes randomly drawn genes
    # (from genes_idx_alltime_hvgs_rm_l23) and test the top cells against A and C.
    for i in range(n_repeat):
        bpseudo_idx = genes_idx_alltime_hvgs_rm_l23[np.random.choice(len(genes_idx_alltime_hvgs_rm_l23), size=n_pseudo_genes, replace=False)]
        scores_bpseudo = np.mean(nmat[:,bpseudo_idx], axis=1)
        ranks_bp = pd.Series(scores_bpseudo).rank()
        precond_bp = ranks_bp  > adatasub.shape[0] - num_archetypal_cells
        # unlike cond_b, cells overlapping the A or C sets are not excluded here
        cond_bp = precond_bp

        adatasub_bp = adatasub[cond_bp]
        mat_bp = adatasub_bp.layers['norm'][...]
        logmat_bp = adatasub_bp.layers['lognorm'][...]
        ts_bpa, ps_bpa = stats.ttest_ind(logmat_bp, logmat_a)
        ts_bpc, ps_bpc = stats.ttest_ind(logmat_bp, logmat_c)
        _, qs_bpa, _, _ = multipletests(np.nan_to_num(ps_bpa, nan=1).reshape(-1,), method='fdr_bh') # NaN p-values -> 1
        _, qs_bpc, _, _ = multipletests(np.nan_to_num(ps_bpc, nan=1).reshape(-1,), method='fdr_bh') # NaN p-values -> 1
        l2fc_bpa = np.log2(np.mean(mat_bp, axis=0)+offset) - np.log2(np.mean(mat_a, axis=0)+offset) # log2FC (CP10k as raw counts)
        l2fc_bpc = np.log2(np.mean(mat_bp, axis=0)+offset) - np.log2(np.mean(mat_c, axis=0)+offset) # log2FC (CP10k as raw counts)

        qs_bp   = np.minimum(qs_bpa, qs_bpc) # the better of the two
        l2fc_bp = np.max([ l2fc_bpa,  l2fc_bpc], axis=0) # larger of the two log2 fold changes
        l2fc_bps = np.min([ l2fc_bpa,  l2fc_bpc], axis=0) # smaller of the two log2 fold changes
        # NOTE(review): the secondary threshold here is log2(1.1), while the real B call in a
        # later cell uses l2fc_th_s = log2(1.2) -- confirm the mismatch is intended.
        cond_sig_bp = np.all([ l2fc_bpa > 0,  l2fc_bpc > 0, l2fc_bp > 1, qs_bp < 0.05, l2fc_bps > np.log2(1.1)], axis=0)

        cond_sig_bp_tensor[cond_code, i] = cond_sig_bp
        
        # print(cond_sig_bp.sum(), end=',') 
        
    # print("")
    
    


# output, check results and stats

In [None]:
# %%time
# fout1 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/DEG_l23abc_qs_250409.npy'
# fout2 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/DEG_l23abc_l2fc_250409.npy'
# fout3 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/DEG_l23abc_gene_list_250409.csv'

# np.save(fout1, qs_tensor)
# np.save(fout2, l2fc_tensor)

In [None]:
# qs_tensor = np.load(fout1)
# l2fc_tensor = np.load(fout2)

l2fc_th = np.log2(2)
l2fc_th_s = np.log2(1.2)
alpha_th = 0.05

In [None]:
# Axis 1 of both tensors holds the pairwise comparisons in the order (c-a, b-a, b-c);
# unpack each into a (condition x gene) view.
qs_ca, qs_ba, qs_bc = (qs_tensor[:, k, :] for k in range(3))

l2fc_ca, l2fc_ba, l2fc_bc = (l2fc_tensor[:, k, :] for k in range(3))

In [None]:
# Call A / C / B genes per condition from the (condition x gene) q-value and log2FC arrays.
qs_a   = np.minimum(qs_ca, qs_ba) # the better of the two
qs_c   = np.minimum(qs_ca, qs_bc) # the better of the two
qs_b   = np.minimum(qs_ba, qs_bc) # the better of the two

l2fc_a = np.max([-l2fc_ca, -l2fc_ba], axis=0) # larger of the two log2 fold changes (not a mean)
l2fc_c = np.max([ l2fc_ca, -l2fc_bc], axis=0) # larger of the two log2 fold changes (not a mean)
l2fc_b = np.max([ l2fc_ba,  l2fc_bc], axis=0) # larger of the two log2 fold changes (not a mean)


l2fc_as = np.min([-l2fc_ca, -l2fc_ba], axis=0) # smaller of the two log2 fold changes
l2fc_cs = np.min([ l2fc_ca, -l2fc_bc], axis=0) # smaller of the two log2 fold changes
l2fc_bs = np.min([ l2fc_ba,  l2fc_bc], axis=0) # smaller of the two log2 fold changes


# A and C: both fold changes positive, the larger one above l2fc_th, better q below alpha_th.
cond_sig_a = np.all([-l2fc_ca > 0, -l2fc_ba > 0, l2fc_a > l2fc_th, qs_a < alpha_th], axis=0)
cond_sig_c = np.all([ l2fc_ca > 0, -l2fc_bc > 0, l2fc_c > l2fc_th, qs_c < alpha_th], axis=0)
# cond_sig_a = np.all([-l2fc_ca > 0, -l2fc_ba > 0, l2fc_a > l2fc_th, qs_a < alpha_th, l2fc_as > l2fc_th_s], axis=0)
# cond_sig_c = np.all([ l2fc_ca > 0, -l2fc_bc > 0, l2fc_c > l2fc_th, qs_c < alpha_th, l2fc_cs > l2fc_th_s], axis=0)
# B additionally requires the smaller fold change to exceed l2fc_th_s (not applied to A and C).
cond_sig_b = np.all([ l2fc_ba > 0,  l2fc_bc > 0, l2fc_b > l2fc_th, qs_b < alpha_th, l2fc_bs > l2fc_th_s], axis=0)


# Histogram of "in how many conditions is each gene significant".
# np.bincount with minlength always returns one bin per possible value 0..n_conds, so
# counts_x[k] means "genes significant in exactly k conditions" even when some k never
# occurs. np.unique(return_counts=True) drops empty bins, which shifts counts_x[1:] and
# changes its length.
n_conds_tested = cond_sig_a.shape[0]
instances = np.arange(n_conds_tested + 1)
counts_a = np.bincount(cond_sig_a.sum(axis=0), minlength=n_conds_tested + 1)
counts_c = np.bincount(cond_sig_c.sum(axis=0), minlength=n_conds_tested + 1)
counts_b = np.bincount(cond_sig_b.sum(axis=0), minlength=n_conds_tested + 1)

# per-condition gene counts, followed by the number of genes significant in any condition
print('num A genes for each cond:\t', cond_sig_a.sum(axis=1), np.any(cond_sig_a, axis=0).sum())
print('num C genes for each cond:\t', cond_sig_c.sum(axis=1), np.any(cond_sig_c, axis=0).sum())
print('num B genes for each cond:\t', cond_sig_b.sum(axis=1), np.any(cond_sig_b, axis=0).sum())

# genes significant in exactly 1, 2, ..., n_conds conditions (bin 0 is skipped)
print('num A genes in num conds:\t',  counts_a[1:])
print('num C genes in num conds:\t',  counts_c[1:])
print('num B genes in num conds:\t',  counts_b[1:])

In [None]:
# Long-format table of every significant (condition, gene) pair, tagged by archetype.
df_res_all = []

for label, cond_sig in (('A', cond_sig_a), ('C', cond_sig_c), ('B', cond_sig_b)):
    # row/column positions of the True entries in the (condition x gene) mask
    cond_idx, gene_idx = np.nonzero(cond_sig.astype(int))

    df_res = pd.DataFrame({
        'cond': uniq_conds[cond_idx],
        'gene': genes[gene_idx],
    })
    df_res['archetype'] = label
    df_res_all.append(df_res)

# per-archetype row indices are kept as-is, so index values repeat across archetypes
df_res_all = pd.concat(df_res_all)
df_res_all
    

In [None]:
df_res_all.groupby('gene').sum()

In [None]:
# df_res_all.to_csv(fout3, header=True, index=False)

# further check

In [None]:
# Sorted gene names that are significant in any / in all conditions, per archetype.
var_names = adata.var.index.values
sig_any_a, sig_any_c = np.any(cond_sig_a, axis=0), np.any(cond_sig_c, axis=0)

a_any = np.sort(var_names[sig_any_a])
a_all = np.sort(var_names[np.all(cond_sig_a, axis=0)])

c_any = np.sort(var_names[sig_any_c])
c_all = np.sort(var_names[np.all(cond_sig_c, axis=0)])
# genes called for both A and C (in any condition)
ac_overlap = np.sort(var_names[np.logical_and(sig_any_a, sig_any_c)])

b_any = np.sort(var_names[np.any(cond_sig_b, axis=0)])
b_all = np.sort(var_names[np.all(cond_sig_b, axis=0)])

for tag, gene_list in (
    ('a any', a_any), ('a all', a_all),
    ('c any', c_any), ('c all', c_all), ('ac overlap', ac_overlap),
    ('b any', b_any), ('b all', b_all),
):
    print(tag, gene_list.shape)

In [None]:
a_all_annots, a_all_styled, a_all_annots_styled = gene_modules.check_genes(a_all)
c_all_annots, c_all_styled, c_all_annots_styled = gene_modules.check_genes(c_all)
b_all_annots, b_all_styled, b_all_annots_styled = gene_modules.check_genes(b_all)

ac_overlap_annots, ac_overlap_styled, ac_overlap_annots_styled = gene_modules.check_genes(ac_overlap)

print("\t".join(a_all_annots_styled)) # _styled))
print("---"*10) # _styled))
print("\t".join(c_all_annots_styled)) # _styled))
print("---"*10) # _styled))
print("\t".join(b_all_annots_styled)) # _styled))
print("---"*10) # _styled))
print("\t".join(ac_overlap_annots_styled)) # _styled))

In [None]:
# Cumulative number of genes significant in at least k conditions, for A and C.
fig, ax = plt.subplots(figsize=(4,4))

# Derive the number of conditions from the masks instead of hard-coding 11, and rebuild
# the histogram with bincount so there is exactly one bin per k = 1..n (empty bins kept).
# This keeps x and y the same length even when no gene is significant in exactly k conditions.
n_conds_tested = cond_sig_a.shape[0]
n_conds_axis = np.arange(1, 1+n_conds_tested)[::-1]
for sig_label, sig_mask, sig_color in (('A', cond_sig_a, 'C0'), ('C', cond_sig_c, 'C2')):
    n_genes_exactly_k = np.bincount(sig_mask.sum(axis=0), minlength=n_conds_tested+1)[1:]
    # reverse-cumulate: genes significant in >= k conditions
    ax.plot(n_conds_axis, np.cumsum(n_genes_exactly_k[::-1]), '-o', label=sig_label, color=sig_color)
ax.set_ylim(ymin=0)
ax.legend()
# NOTE(review): the masks cover all tested genes, so 'TFs' may be a leftover label -- confirm.
ax.set_ylabel('number of TFs')
ax.set_xlabel('number of time points')
sns.despine(ax=ax)
plt.show()

In [None]:
# Number of A / C / B genes per condition, NR and DR series (left: A and C, right: B).
nums_a = cond_sig_a.sum(axis=1)
nums_c = cond_sig_c.sum(axis=1)
nums_b = cond_sig_b.sum(axis=1)

fig, axs = plt.subplots(1,2,figsize=(4*2,4))

# (per-condition counts, NR label, DR label, color) for each panel
panel_series = [
    [(nums_a, 'A NR', 'A DR', 'C0'), (nums_c, 'C NR', 'C DR', 'C2')],
    [(nums_b, 'B NR', 'B DR', 'C1')],
]
for ax, series_list in zip(axs, panel_series):
    for series_nums, nr_label, dr_label, series_color in series_list:
        ax.plot(nr_times, series_nums[nr_idx], '-o' , fillstyle='none', label=nr_label, color=series_color)
        ax.plot(dr_times, series_nums[dr_idx], '--s', fillstyle='none', label=dr_label, color=series_color)
    ax.set_xticks(nr_times)
    ax.grid(False, axis='x')
    ax.set_ylim(ymin=0)
    ax.legend()
    if ax is axs[0]:
        # only the left panel carries axis labels
        ax.set_ylabel('num. of gene')
        ax.set_xlabel('time (P)')
    sns.despine(ax=ax)


output = os.path.join(outfigdir, 'num_degs_abc_1.pdf')
powerplots.savefig_autodate(fig, output)
plt.show()

In [None]:
# NR-only version: A and C on the left, B on the right, with fixed y ranges.
nums_a = cond_sig_a.sum(axis=1)
nums_c = cond_sig_c.sum(axis=1)
nums_b = cond_sig_b.sum(axis=1)

fig, axs = plt.subplots(1,2,figsize=(3*2,4))
time_ticks = [6,10,14,17,21]

ax = axs[0]
for series_nums, series_label, series_color in ((nums_a, 'A', 'C0'), (nums_c, 'C', 'C2')):
    ax.plot(nr_times, series_nums[nr_idx], '-o' , fillstyle='none', label=series_label, color=series_color)
ax.set_xticks(time_ticks)
ax.grid(False, axis='x')
ax.set_ylim(ymin=0, ymax=110)
ax.set_ylabel('num. of gene')
ax.set_xlabel('time (P)')
sns.despine(ax=ax)

ax = axs[1]
ax.plot(nr_times, nums_b[nr_idx], '-o' , fillstyle='none', label='B', color='C1')
ax.set_xticks(time_ticks)
ax.grid(False, axis='x')
ax.set_ylim(ymin=0, ymax=70)
sns.despine(ax=ax)

fig.tight_layout()
output = os.path.join(outfigdir, 'num_degs_abc_2.pdf')
powerplots.savefig_autodate(fig, output)
plt.show()

In [None]:
# Three-panel NR-only version: one panel each for A, C and B genes.
nums_a = cond_sig_a.sum(axis=1)
nums_b = cond_sig_b.sum(axis=1)
nums_c = cond_sig_c.sum(axis=1)

fig, axs = plt.subplots(1,3,figsize=(3*3,4))

# (per-condition counts, legend label, color, y-axis max, panel title)
panel_specs = [
    (nums_a, 'A', 'C0', 110, 'A genes'),
    (nums_c, 'C', 'C2', 110, 'C genes'),
    (nums_b, 'B', 'C1',  70, 'B genes'),
]
for ax, (panel_nums, panel_label, panel_color, panel_ymax, panel_title) in zip(axs, panel_specs):
    ax.plot(nr_times, panel_nums[nr_idx], '-o' , fillstyle='none', label=panel_label, color=panel_color)
    ax.set_xticks([6,10,14,17,21])
    ax.grid(False, axis='x')
    ax.set_ylim(ymin=0, ymax=panel_ymax)
    ax.set_title(panel_title)
    sns.despine(ax=ax)

# only the first panel carries axis labels
axs[0].set_ylabel('num. of gene')
axs[0].set_xlabel('time (P)')

fig.tight_layout()
# Save under its own name: this cell previously reused 'num_degs_abc_2.pdf', the output
# name of the two-panel figure in the previous cell, and so overwrote that figure.
output = os.path.join(outfigdir, 'num_degs_abc_2_split.pdf')
powerplots.savefig_autodate(fig, output)
plt.show()

In [None]:
# cond_sig_bp_tensor is (condition, repeat, gene); summing over genes gives the number of
# genes called significant in each random-gene control repeat.
nums_bp_trials = np.sum(cond_sig_bp_tensor, axis=2)
nums_bp_mean = np.mean(nums_bp_trials, axis=1)
# NOTE: despite the name, this is 1.96 * (std / sqrt(n_repeat)), i.e. a normal-approximation
# 95% half-width rather than the SEM itself. np.std is called with its default ddof.
nums_bp_sem  = np.std(nums_bp_trials, axis=1)/np.sqrt(n_repeat)*1.96

In [None]:
num_uniq_a = np.sum(np.any(cond_sig_a, axis=0))
num_uniq_b = np.sum(np.any(cond_sig_b, axis=0))
num_uniq_c = np.sum(np.any(cond_sig_c, axis=0))

In [None]:
# NR-only counts; the B panel also shows the random-HVG control (mean with shaded band).
nums_a = cond_sig_a.sum(axis=1)
nums_c = cond_sig_c.sum(axis=1)
nums_b = cond_sig_b.sum(axis=1)

fig, axs = plt.subplots(1,2,figsize=(3*2,4))
time_ticks = [6,10,14,17,21]

ax = axs[0]
for series_nums, series_label, series_color in ((nums_a, 'A', 'C0'), (nums_c, 'C', 'C2')):
    ax.plot(nr_times, series_nums[nr_idx], '-o' , fillstyle='none', label=series_label, color=series_color)
ax.set_xticks(time_ticks)
ax.grid(False, axis='x')
ax.set_ylim(ymin=0, ymax=110)
ax.legend(fontsize='x-small')
ax.set_ylabel('num. of gene')
ax.set_xlabel('time (P)')
sns.despine(ax=ax)

ax = axs[1]
ctrl_mean = nums_bp_mean[nr_idx]
ctrl_band = nums_bp_sem[nr_idx]
ax.plot(nr_times, nums_b[nr_idx], '-o' , fillstyle='none', label='B', color='C1')
ax.plot(nr_times, ctrl_mean, '-o' , fillstyle='none', label='rnd hvg', color='gray')
ax.fill_between(nr_times, ctrl_mean-ctrl_band, ctrl_mean+ctrl_band,
                alpha=0.3, facecolor='gray', edgecolor='none')
ax.set_xticks(time_ticks)
ax.grid(False, axis='x')
ax.set_ylim(ymin=0, ymax=70)
ax.legend(loc='upper left', fontsize='x-small')
sns.despine(ax=ax)

fig.tight_layout()
output = os.path.join(outfigdir, 'num_degs_abc_3.pdf')
powerplots.savefig_autodate(fig, output)
plt.show()

# check effect size 

In [None]:
cond_sig_a_any = np.any(cond_sig_a, axis=0)
cond_sig_b_any = np.any(cond_sig_b, axis=0)
cond_sig_c_any = np.any(cond_sig_c, axis=0)

In [None]:
# adata.obs['sample'].unique()
import re

todo_conds = [
    'P12DR', 'P14DR', 'P17DR', 'P21DR',
    'P6', 'P8', 'P10', 'P12', 'P14', 'P17', 'P21', 
]
todo_samps = [
    'P12DRa', 'P12DRb',
    'P14DRa', 'P14DRb',
    'P17DRa', 'P17DRb',
    'P21DRa', 'P21DRb',
    'P6a', 'P6b', 'P6c', 
    'P8a', 'P8b', 'P8c', 
    'P10a', 'P10b', 
    'P12a', 'P12b', 'P12c', 
    'P14a', 'P14b',
    'P17a', 'P17b', 
    'P21a', 'P21b', 
]
todo_conds_t = np.array([int(re.sub(r'[a-zA-Z]', '', a)) for a in todo_conds])
todo_samps_t = np.array([int(re.sub(r'[a-zA-Z]', '', a)) for a in todo_samps])
print(todo_conds_t)
print(todo_samps_t)

In [None]:
adata.obs['sample'].unique()

In [None]:
%%time

# Baseline: log2 of the mean expression over all cells (CP10k * 1e2 = CPM, plus offset).
mat = adata.layers['norm'][...]
gexp_l23baseline = np.log2(np.mean(mat, axis=0)*1e2+offset) # CP10k -> CPM

# The three gene sets whose mean log-ratio is reported for each archetype.
deg_gene_sets = (cond_sig_a_any, cond_sig_b_any, cond_sig_c_any)

# result: sample x archetypal cell group (A, B, C) x gene set (A, B, C)
mmat_res_samp = np.zeros((len(todo_samps),3,3))
for i, samp in enumerate(todo_samps):
    print(samp)

    # cells of this sample
    adatasub = adata[adata.obs['sample']==samp]
    n_cells = adatasub.shape[0]

    ranks_ac = adatasub.obs['scores_c-a'].rank()
    ranks_b  = adatasub.obs['scores_b'].rank()

    # archetypal cells = the extreme 10% of each ranking
    num_archetypal_cells_viz = int(n_cells*0.1)

    precond_a = ranks_ac <= num_archetypal_cells_viz
    precond_c = ranks_ac > n_cells - num_archetypal_cells_viz
    precond_b = ranks_b  > n_cells - num_archetypal_cells_viz

    # keep cells that fall in exactly one extreme set
    cond_a = ( precond_a & ~precond_b & ~precond_c).values
    cond_b = (~precond_a &  precond_b & ~precond_c).values
    cond_c = (~precond_a & ~precond_b &  precond_c).values

    if SHUFFLE:
        adatasub = adatasub[np.random.choice(n_cells, size=n_cells, replace=False)]

    adatasub_a = adatasub[cond_a]
    adatasub_b = adatasub[cond_b]
    adatasub_c = adatasub[cond_c]

    mat_a = adatasub_a.layers['norm'][...]
    mat_b = adatasub_b.layers['norm'][...]
    mat_c = adatasub_c.layers['norm'][...]

    # per-gene log2(archetype mean / baseline), in CPM with offset
    mmat_a = np.log2(np.mean(mat_a, axis=0)*1e2+offset)-gexp_l23baseline # CP10k -> CPM
    mmat_b = np.log2(np.mean(mat_b, axis=0)*1e2+offset)-gexp_l23baseline
    mmat_c = np.log2(np.mean(mat_c, axis=0)*1e2+offset)-gexp_l23baseline

    # first letter = cell group, second letter = gene set
    aa, ab, ac = (np.mean(mmat_a[gene_set]) for gene_set in deg_gene_sets)
    ba, bb, bc = (np.mean(mmat_b[gene_set]) for gene_set in deg_gene_sets)
    ca, cb, cc = (np.mean(mmat_c[gene_set]) for gene_set in deg_gene_sets)

    mmat_res_samp[i] = [
        [aa, ab, ac],
        [ba, bb, bc],
        [ca, cb, cc],
    ]

In [None]:
todo_samps = [
    'P12DRa', 'P12DRb',
    'P14DRa', 'P14DRb',
    'P17DRa', 'P17DRb',
    'P21DRa', 'P21DRb',
    'P6a', 'P6b', 'P6c', 
    'P8a', 'P8b', 'P8c', 
    'P10a', 'P10b', 
    'P12a', 'P12b', 'P12c', 
    'P14a', 'P14b',
    'P17a', 'P17b', 
    'P21a', 'P21b', 
]

def mean_over_samples(mmat_res_samp, group_sizes=(2, 2, 2, 2, 3, 3, 2, 3, 2, 2, 2)):
    """Average consecutive replicate samples into one entry per condition.

    Parameters
    ----------
    mmat_res_samp : np.ndarray, shape (n_samples, dim1, dim2)
        Per-sample values, with the replicates of each condition stored contiguously
        along axis 0.
    group_sizes : sequence of int, optional
        Number of consecutive samples belonging to each condition. The default
        reproduces the original hard-coded grouping of the 25 samples in
        `todo_samps` into 11 conditions: P12DR, P14DR, P17DR, P21DR (2 each), then
        P6 (3), P8 (3), P10 (2), P12 (3), P14 (2), P17 (2), P21 (2).

    Returns
    -------
    np.ndarray, shape (len(group_sizes), dim1, dim2)
        Mean over the samples of each group.

    Raises
    ------
    AssertionError
        If axis 0 of the input does not match ``sum(group_sizes)``.
    ValueError
        If any group size is not positive.
    """
    group_sizes = np.asarray(group_sizes, dtype=int)
    if np.any(group_sizes <= 0):
        raise ValueError("group_sizes must contain only positive integers")

    dim0, dim1, dim2 = mmat_res_samp.shape
    assert dim0 == group_sizes.sum(), (
        f"expected {group_sizes.sum()} samples along axis 0, got {dim0}"
    )

    # slice boundaries of each group: [0, s0, s0+s1, ...]
    bounds = np.concatenate([[0], np.cumsum(group_sizes)])

    mmat_res_samp_mean = np.zeros((len(group_sizes), dim1, dim2))
    for k, (lo, hi) in enumerate(zip(bounds[:-1], bounds[1:])):
        mmat_res_samp_mean[k] = np.mean(mmat_res_samp[lo:hi], axis=0)

    return mmat_res_samp_mean

mmat_res_samp_mean = mean_over_samples(mmat_res_samp)
mmat_res_samp_mean.shape

In [None]:
# One panel per gene set (A, B, C); within each, one color per archetypal cell group.
# Only entries 8+ of the per-sample arrays and 4+ of the per-condition arrays are drawn
# (the entries without 'DR' in todo_samps / todo_conds).
fig, axs = plt.subplots(1, 3, figsize=(3*3,1*3), sharex=True, sharey=True)
archetype_colors = ('C0', 'C1', 'C2')
for i, ax in enumerate(axs):
    # individual samples as open markers
    for k, arch_color in enumerate(archetype_colors):
        ax.plot(todo_samps_t[8:], mmat_res_samp[8:,k,i], 'o', markersize=5, fillstyle='none', color=arch_color)

    # per-condition means as lines
    for k, arch_color in enumerate(archetype_colors):
        ax.plot(todo_conds_t[4:], mmat_res_samp_mean[4:,k,i], '-', color=arch_color)

    ax.grid(False)
    ax.set_xticks([6,10,14,17,21])
    sns.despine(ax=ax)

axs[0].set_xlabel('Postnatal day (P)')
axs[0].set_ylabel('Gene expr.\nlog2(archetype / baseline)')
for title_ax, set_name, set_size in zip(axs, ('A', 'B', 'C'), (num_uniq_a, num_uniq_b, num_uniq_c)):
    title_ax.set_title(f'{set_name} genes\nn={set_size:,}')
output = os.path.join(outfigdir, 'abc_degs_signals_over_time.pdf') 
powerplots.savefig_autodate(fig, output)

plt.show()

In [None]:
# Same panels as the previous figure, plus the 'DR' entries (first 8 samples / first 4
# conditions) drawn semi-transparent with square markers.
fig, axs = plt.subplots(1, 3, figsize=(3*3,1*3), sharex=True, sharey=True)
archetype_colors = ('C0', 'C1', 'C2')
for i, ax in enumerate(axs):
    # individual samples without 'DR': open circles
    for k, arch_color in enumerate(archetype_colors):
        ax.plot(todo_samps_t[8:], mmat_res_samp[8:,k,i], 'o', markersize=5, fillstyle='none', color=arch_color)

    # individual 'DR' samples: open squares, faded
    for k, arch_color in enumerate(archetype_colors):
        ax.plot(todo_samps_t[:8], mmat_res_samp[:8,k,i], 's', markersize=5, fillstyle='none', color=arch_color, alpha=0.5)

    # per-condition means without 'DR'
    for k, arch_color in enumerate(archetype_colors):
        ax.plot(todo_conds_t[4:], mmat_res_samp_mean[4:,k,i], '-', color=arch_color)

    # per-condition 'DR' means, faded
    for k, arch_color in enumerate(archetype_colors):
        ax.plot(todo_conds_t[:4], mmat_res_samp_mean[:4,k,i], '-', color=arch_color, alpha=0.5)

    ax.grid(False)
    ax.set_xticks([6,10,14,17,21])
    sns.despine(ax=ax)

axs[0].set_xlabel('Postnatal day (P)')
axs[0].set_ylabel('Gene expr.\nlog2(archetype / baseline)')
for title_ax, set_name, set_size in zip(axs, ('A', 'B', 'C'), (num_uniq_a, num_uniq_b, num_uniq_c)):
    title_ax.set_title(f'{set_name} genes\nn={set_size:,}')
output = os.path.join(outfigdir, 'abc_degs_signals_over_time_withDR.pdf') 
powerplots.savefig_autodate(fig, output)
plt.show()

In [None]:
%%time

# Number of equal-sized bins along the (C - A) score ranking.
n_type = 5

# Baseline: log2 of the mean expression over all cells (CP10k * 1e2 = CPM, plus offset).
mat = adata.layers['norm'][...]
gexp_l23baseline = np.log2(mat.mean(axis=0)*1e2+offset) # CP10k -> CPM

# result: sample x score bin x gene
mmat_res = np.zeros((len(todo_samps), n_type, mat.shape[1]))
for i, samp in enumerate(todo_samps):
    print(samp)

    # cells of this sample
    adatasub = adata[adata.obs['sample']==samp]
    n_cells = adatasub.shape[0]

    ranks_ac = adatasub.obs['scores_c-a'].rank()
    ranks_b  = adatasub.obs['scores_b'].rank()  # computed but not used in this cell

    # bin label 0..n_type-1 per cell, from quantiles of the (C - A) rank
    cells_type = pd.qcut(ranks_ac, n_type, labels=False)

    # per-bin log2(mean / baseline) for every gene
    for j in range(n_type):
        mat_j = adatasub[cells_type==j].layers['norm'][...]
        mmat_j = np.log2(mat_j.mean(axis=0)*1e2+offset)-gexp_l23baseline # CP10k -> CPM
        mmat_res[i,j] = mmat_j
        

In [None]:
mmat_res_cond = mean_over_samples(mmat_res)
print(mmat_res.shape) # sample, abc, gene
print(mmat_res_cond.shape) # cond, abc, gene

In [None]:
def _flatten_conds_first_four_last(fmat_cond):
    """Flatten (cond, bin, gene) to (gene, cond*bin) and move the first 4*5 columns
    (the first four conditions) to the end."""
    flat = fmat_cond.reshape(-1, fmat_cond.shape[-1]).T
    n_moved_cols = 4*5
    return np.hstack([flat[:, n_moved_cols:], flat[:, :n_moved_cols]])


# restrict the (cond, bin, gene) profiles to each archetype's gene set
fmat_a = mmat_res_cond[:,:,cond_sig_a_any]
fmat_b = mmat_res_cond[:,:,cond_sig_b_any]
fmat_c = mmat_res_cond[:,:,cond_sig_c_any]

# gene x (condition, bin) matrices, reordered, plus their row-wise z-scores
fmat_flat_a = _flatten_conds_first_four_last(fmat_a)
zmat_flat_a = zscore(fmat_flat_a, axis=1)

fmat_flat_b = _flatten_conds_first_four_last(fmat_b)
zmat_flat_b = zscore(fmat_flat_b, axis=1)

fmat_flat_c = _flatten_conds_first_four_last(fmat_c)
zmat_flat_c = zscore(fmat_flat_c, axis=1)

In [None]:
# sns.clustermap(zmat_flat_a, col_cluster=False, 
#                xticklabels=5,
#                yticklabels=20,
#                cmap='coolwarm', center=0, vmax=3, vmin=-3)

In [None]:
from sklearn.cluster import KMeans

In [None]:
def organize_zmat(zmat, fmat, title='', n_peakset_clsts=5,
                  n_conds=11, n_types=5, n_nr_conds=7):
    """Cluster rows of `zmat` with k-means and order clusters by their peak time.

    The columns of `zmat` are expected to be condition-major: `n_conds` consecutive
    blocks of `n_types` columns, with the first `n_nr_conds` blocks being the
    conditions used for ordering. This is the layout produced by the flattening cell
    above (first 4*5 columns moved to the end) and read back with `.reshape(11,5)` by
    the plotting cells below.

    Parameters
    ----------
    zmat : np.ndarray, shape (n_rows, n_conds * n_types)
        Row-wise z-scored profiles; this is what gets clustered.
    fmat : np.ndarray
        Matching un-scaled profiles; stored in the result unchanged.
    title : str
        Label stored in the result.
    n_peakset_clsts : int
        Number of k-means clusters.
    n_conds, n_types, n_nr_conds : int
        Column layout (see above). Defaults match this notebook: 11 conditions,
        5 score bins, first 7 conditions used for ordering.

    Returns
    -------
    dict with keys 'title', 'zmat', 'fmat', 'nclst', 'clst' (cluster label per row,
    renumbered so that 0 is the earliest-peaking cluster) and 'order' (row order that
    groups rows by renumbered cluster).
    """
    method = KMeans(n_clusters=n_peakset_clsts, n_init=10, random_state=0)
    peakset_clst = method.fit_predict(zmat)

    # cluster centroids in z-score space: (n_clusters, n_conds * n_types)
    zmat_ctrds = np.array([
        np.mean(zmat[peakset_clst == i], axis=0) for i in range(n_peakset_clsts)
    ])
    # Un-flatten with the layout the columns actually have, (cond, type), and keep the
    # first n_nr_conds conditions. The previous `.reshape(-1,5,11)[:,:,4:]` treated the
    # columns as (type, cond) with the ordering conditions last, which mixed bins and
    # conditions and so ordered the clusters from scrambled data.
    zmat_ctrds_nr = zmat_ctrds.reshape(-1, n_conds, n_types)[:, :n_nr_conds, :]

    # peak condition of each centroid after averaging over the score bins
    peak_cond = np.argmax(np.mean(zmat_ctrds_nr, axis=2), axis=1)
    # stable sort keeps tied clusters in a deterministic order
    clst_order = np.argsort(peak_cond, kind='stable')

    # map original k-means label -> rank in clst_order (inverse permutation)
    new_label_of = np.empty(n_peakset_clsts, dtype=int)
    new_label_of[clst_order] = np.arange(n_peakset_clsts)
    peakset_clst_renamed = new_label_of[peakset_clst]
    peakset_order = np.argsort(peakset_clst_renamed)
    
    res = {
        'title': title,
        'zmat': zmat,
        'fmat': fmat,
        'nclst': n_peakset_clsts,
        'clst': peakset_clst_renamed,
        'order': peakset_order,
    }
    return res

In [None]:
res_a = organize_zmat(zmat_flat_a, fmat_flat_a, title='A genes')
res_b = organize_zmat(zmat_flat_b, fmat_flat_b, title='B genes')
res_c = organize_zmat(zmat_flat_c, fmat_flat_c, title='C genes')

In [None]:

# One heatmap per gene set: rows are genes grouped by renumbered cluster, columns are the
# flattened profile columns of `zmat`. These figures are only shown, not saved.
for res_this in [res_a, res_c, res_b]:
    title = res_this['title']
    zmat  = res_this['zmat']
    order = res_this['order']

    fig, ax = plt.subplots(figsize=(15,8))

    sns.heatmap(zmat[order], yticklabels=2000, cmap='coolwarm', cbar_kws=dict(shrink=0.5), 
                vmax=3, vmin=-3,
                ax=ax)
    # NOTE: only the first n_type columns get tick labels; the remaining columns are unlabeled.
    ax.set_xticks(0.5+np.arange(n_type))
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=10, rotation=0)
    ax.set_title(title, pad=30)

    # for i in range(n_cond):
    # for condcode, cond in condcode2cond.items():
    #     ax.axvline(condcode*n_type, color='k', linestyle='--', linewidth=1)
    #     ax.text(condcode*n_type, -0.5, f'{cond}', fontsize=10, va='bottom')

    plt.show()

# Profile these modules

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop gradients from black to the A / B / C colors.
colors_a = [(0.0, 'black'), (1.0, 'C0')]
colors_b = [(0.0, 'black'), (1.0, 'C1')]
colors_c = [(0.0, 'black'), (1.0, 'C2')]

# Build one colormap per gradient.
cmap_a, cmap_b, cmap_c = (
    LinearSegmentedColormap.from_list(cmap_name, cmap_stops)
    for cmap_name, cmap_stops in (('cmap_a', colors_a), ('cmap_b', colors_b), ('cmap_c', colors_c))
)

# RGBA of the bright end of each colormap
rgba_a, rgba_b, rgba_c = (np.array(cm(1.0)) for cm in (cmap_a, cmap_b, cmap_c))

# Five colors along A -> B -> C; entries 1 and 3 are 70/30 blends of their neighbours.
colors_l23 = [
    rgba_a,
    0.7*rgba_a+0.3*rgba_b,
    rgba_b,
    0.7*rgba_b+0.3*rgba_c,
    rgba_c,
]

In [None]:
times = np.array([6,8,10,12,14,17,21])
dr_times = np.array([12,14,17,21])

In [None]:
# nr_condcodes = np.array([4,5,6,7,8,9,10]

In [None]:
# Mean profile of each gene module (k-means cluster) over time, per gene set.
# Rows of the reshaped profile are conditions (first 7 plotted against nr_times, last 4
# against dr_times); columns are the 5 score bins, of which bins 0, 2 and 4 are drawn.
# The last four conditions are shifted by +10 on the x axis so both blocks sit side by side.
for res_this in [res_a, res_c, res_b]:
    title = res_this['title']
    fmat  = res_this['fmat']
    order = res_this['order']
    clsts = res_this['clst']
    n_clsts = res_this['nclst']  # follow the clustering instead of hard-coding 5

    fig, axs = plt.subplots(1,n_clsts,figsize=(6*3,1*3), sharex=True, sharey=True)
    axs = np.atleast_1d(axs)  # plt.subplots returns a bare Axes when n_clsts == 1
    fig.suptitle(f'{title} n={len(fmat):,}', y=1.08, fontsize=18)
    axs[0].set_ylabel('fold change')
    for i in range(n_clsts):
        ax = axs[i]
        prop = (clsts == i).sum() / len(fmat)

        # (condition x bin) mean profile of this module, computed once
        profile = np.mean(fmat[clsts==i], axis=0).reshape(11,5)

        y = profile[:-4]
        for j, (ycol, color) in enumerate(zip(y.T, colors_l23)):
            if j in [0,2,4]:
                ax.plot(nr_times, ycol, '-', color=color)

        y = profile[-4:]
        for j, (ycol, color) in enumerate(zip(y.T, colors_l23)):
            if j in [0,2,4]:
                ax.plot(dr_times+10, ycol, '-', color=color)

        ax.axvline(12, color='k', linestyle='--', linewidth=1.5)
        ax.axvline(22, color='k', linestyle='--', linewidth=1.5)
        sns.despine(ax=ax)
        ax.grid(False, axis='x')
        ax.set_title(f'M{i+1}: {prop*100: .1f}%')
        ax.set_xticks([6,12,21,22,31])
        ax.set_xticklabels([6,12,21,12,21])

    # Save inside the loop with a per-gene-set suffix. Previously the save ran once after
    # the loop, so only the last figure was written.
    # NOTE(review): the 'atac_' prefix is kept from the original file name -- confirm it is intended.
    output = os.path.join(outfigdir, f"atac_groups_abc_v1_{title.replace(' ', '_')}.pdf")
    powerplots.savefig_autodate(fig, output)


In [None]:
# Same module profiles as the previous cell, but the last four conditions are overlaid
# on the same time axis as dashed lines instead of being shifted to the right.
for res_this in [res_a, res_c, res_b]:
    title = res_this['title']
    fmat  = res_this['fmat']
    order = res_this['order']
    clsts = res_this['clst']
    n_clsts = res_this['nclst']  # follow the clustering instead of hard-coding 5

    fig, axs = plt.subplots(1,n_clsts,figsize=(4*3,1*4), sharex=True, sharey=True)
    axs = np.atleast_1d(axs)  # plt.subplots returns a bare Axes when n_clsts == 1
    fig.suptitle(f'{title} n={len(fmat):,}', y=1.08, fontsize=18)
    axs[0].set_ylabel('fold change')
    for i in range(n_clsts):
        ax = axs[i]
        prop = (clsts == i).sum() / len(fmat)

        # (condition x bin) mean profile of this module, computed once
        profile = np.mean(fmat[clsts==i], axis=0).reshape(11,5)

        y = profile[:-4]
        for j, (ycol, color) in enumerate(zip(y.T, colors_l23)):
            if j in [0,2,4]:
                ax.plot(nr_times, ycol, '-', color=color, linewidth=2)

        y = profile[-4:]
        for j, (ycol, color) in enumerate(zip(y.T, colors_l23)):
            if j in [0,2,4]:
                ax.plot(dr_times, ycol, linestyle=(0,(2,1)), color=color, linewidth=2) # alpha=0.5)

        sns.despine(ax=ax)
        ax.grid(False, axis='x')
        ax.set_title(f'M{i+1}: {prop*100: .1f}%')
        ax.set_xticks([6,12,21])

    # Save inside the loop with a per-gene-set suffix. Previously the save ran once after
    # the loop, so only the last figure was written.
    # NOTE(review): the 'atac_' prefix is kept from the original file name -- confirm it is intended.
    output = os.path.join(outfigdir, f"atac_groups_abc_v2_{title.replace(' ', '_')}.pdf")
    powerplots.savefig_autodate(fig, output)

# Another version with ABC scores