In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import collections
from natsort import natsorted

from scipy import stats
from scipy import sparse
from sklearn.decomposition import PCA
from umap import UMAP
from statsmodels.stats.multitest import multipletests

from matplotlib.colors import LinearSegmentedColormap

from scroutines.config_plots import *
from scroutines import powerplots # .config_plots import *
from scroutines import pnmf
from scroutines import basicu
from scroutines.gene_modules import GeneModules  


In [None]:
outfigdir = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/figures/250409"
# Create the figure directory (and any missing parents). Pure Python replaces the
# `!mkdir -p $outfigdir` shell escape: no unquoted shell interpolation, and the cell
# also runs outside IPython.
os.makedirs(outfigdir, exist_ok=True)

# load gene annotation and data

In [None]:
# Gene names read from a plain-text file as a string array; used further down to
# restrict the analysis to these genes (`cond_sig_hvg`).
# NOTE(review): the file name suggests 4940 highly variable genes - confirm against the printed shape.
genes_alltime_hvgs = np.loadtxt('/u/home/f/f7xiesnm/v1_multiome/l23_alltime_hvgs_n4940.txt', dtype='str')
genes_alltime_hvgs.shape

In [None]:
# Look up one example gene in the project's gene-module annotation and print each of
# the three returned lists as a tab-separated line.
# NOTE(review): the meaning of the three lists is defined by GeneModules.check_genes - confirm there.
gene_modules = GeneModules()
g, gs, ms = gene_modules.check_genes('Cdh13')
for hits in (g, gs, ms):
    print("\t".join(hits))

In [None]:
# Load the AnnData object from its .h5ad file. `anndata.read` is a deprecated alias;
# `anndata.read_h5ad` is the supported reader for this format.
adata = anndata.read_h5ad("/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_L56IT.h5ad")
adata

In [None]:
# Sample labels look like 'P12DRa': condition ('P12DR') followed by a one-letter replicate.
sample_labels = adata.obs['Sample'].values

# condition = label without the replicate letter; time = condition without the 'DR' tag
cond_labels = [label[:-1] for label in sample_labels]
time_labels = [cond.replace('DR', '') for cond in cond_labels]

adata.obs['sample'] = sample_labels
adata.obs['time']   = time_labels
adata.obs['cond']   = cond_labels
print(sample_labels)

# naturally sorted sample names, split by presence of the 'DR' tag
uniq_samples = natsorted(np.unique(sample_labels))
nr_samples = [name for name in uniq_samples if "DR" not in name]
dr_samples = [name for name in uniq_samples if "DR" in name]

uniq_conds = np.array(natsorted(np.unique(adata.obs['cond'].values)))
print(uniq_conds)

In [None]:
# Drop genes by name, one pattern at a time:
#   '^mt-'   - mitochondrial genes (names starting with 'mt-')
#   '^Xist$' - the sex-linked gene Xist (exact match)
for pattern in (r'^mt-', r'^Xist$'):
    adata = adata[:, ~adata.var.index.str.contains(pattern)]

In [None]:
# display the AnnData summary after the gene filtering above
adata

# check effect size 

In [None]:
# Conditions and samples to analyse, in a fixed order that `mean_over_samples`
# relies on: the four 'DR' conditions first, then the seven untagged time points.
import re

todo_conds = (
    [f'P{day}DR' for day in (12, 14, 17, 21)]
    + [f'P{day}' for day in (6, 8, 10, 12, 14, 17, 21)]
)

# replicate letters available for each condition
_replicates = {
    'P12DR': 'ab', 'P14DR': 'ab', 'P17DR': 'ab', 'P21DR': 'ab',
    'P6': 'abc', 'P8': 'abc', 'P10': 'ab', 'P12': 'abc',
    'P14': 'ab', 'P17': 'ab', 'P21': 'ab',
}
todo_samps = [cond + rep for cond in todo_conds for rep in _replicates[cond]]


def _day_of(label):
    """Numeric part of a label such as 'P12DRa' -> 12 (all letters removed)."""
    return int(re.sub(r'[a-zA-Z]', '', label))


todo_conds_t = np.array([_day_of(label) for label in todo_conds])
todo_samps_t = np.array([_day_of(label) for label in todo_samps])
print(todo_conds_t)
print(todo_samps_t)

def mean_over_samples(mmat_res_samp):
    """Average replicate rows: 25 samples -> 11 conditions.

    The first axis must hold exactly 25 rows, grouped as consecutive replicates
    with group sizes 2,2,2,2,3,3,2,3,2,2,2 (the order of `todo_samps` in this
    notebook). Each output row is the mean over axis 0 of one group; trailing
    axes are kept unchanged.

    Returns a float array of shape (11, *mmat_res_samp.shape[1:]).
    """
    assert mmat_res_samp.shape[0] == 25

    # first row index of every replicate group, plus the closing bound
    bounds = (0, 2, 4, 6, 8, 11, 14, 16, 19, 21, 23, 25)

    cond_means = np.zeros((11,) + tuple(mmat_res_samp.shape[1:]))
    for k, (lo, hi) in enumerate(zip(bounds[:-1], bounds[1:])):
        cond_means[k] = np.mean(mmat_res_samp[lo:hi], axis=0)

    return cond_means

def transform_bigredmat(bigmat, n_dr_conds=4):
    """Flatten a (cond, type, gene) array to gene-by-column matrices.

    Parameters
    ----------
    bigmat : ndarray
        Array whose last axis is genes; all leading axes are flattened
        (first axis slowest) into columns. Works for both `bigmat` and `redmat`.
    n_dr_conds : int, default 4
        Number of leading entries along the first axis that are moved to the
        END of the column order.

    Returns
    -------
    fmat : ndarray, shape (n_gene, n_columns)
        Flattened values with the COLUMN ORDER CHANGED: the first
        `n_dr_conds` conditions (all of their types) come last.
    zmat : ndarray, same shape
        Row-wise z-score of `fmat` (each gene standardized across columns).

    Notes
    -----
    The number of columns moved is `n_dr_conds * (columns per condition)`,
    derived from the input shape. The previous hard-coded `4*5` was only
    correct for 5 types; with 1 type it left the order unchanged and with
    other counts it split a condition in half. For 5 types the result is
    identical to before.
    """
    n_genes = bigmat.shape[-1]
    fmat = bigmat.reshape(-1, n_genes).T

    # columns belonging to one entry of the first axis (= n_type for a 3-D input)
    n_per_cond = int(np.prod(bigmat.shape[1:-1]))
    n_lead = n_dr_conds * n_per_cond

    fmat = np.hstack([fmat[:, n_lead:], fmat[:, :n_lead]]) # CHANGED COLUMN ORDER!!
    zmat = zscore(fmat, axis=1)

    return fmat, zmat

In [None]:
%%time

# Per-sample pseudobulk: for every sample in `todo_samps`, compute the log2 mean
# expression of each gene relative to the all-cell baseline and store it in
# `bigmat_nfd[sample, 0, gene]`.

# pseudocount added before taking log2
offset = 1

# baseline: log2 of the mean over all cells (per gene), after scaling by 1e2
# NOTE(review): the 'norm' layer is presumably CP10k, so *1e2 gives CPM - confirm
mat = adata.layers['norm'][...]
gexp_l23baseline = np.log2(np.mean(mat, axis=0)*1e2+offset) # CP10k -> CPM

# a single "type" here: all cells of a sample are pooled
n_type = 1
frac_archetypal_cells_viz = 0.2  # not used in this cell
bigmat_nfd = np.zeros((len(todo_samps), n_type, mat.shape[1]))

for i, samp in enumerate(todo_samps):
    print(samp)
    
    # subset to the cells of this sample
    # NOTE(review): a label missing from adata.obs['sample'] gives an empty subset and a NaN row
    adatasub = adata[adata.obs['sample']==samp]
    n_cells = adatasub.shape[0]
    
    # log2 fold change of the sample mean versus the all-cell baseline
    mat_j = adatasub.layers['norm'][...]
    mmat_j = np.log2(np.mean(mat_j, axis=0)*1e2+offset)-gexp_l23baseline # CP10k -> CPM
    bigmat_nfd[i,0] = mmat_j
    

In [None]:
print(bigmat_nfd.shape) # sample, type, gene (one row per entry of todo_samps)

In [None]:
# collapse replicate samples to conditions, then flatten to gene-by-column matrices
redmat_nfd = mean_over_samples(bigmat_nfd)
fmat_nfd, zmat_nfd = transform_bigredmat(redmat_nfd)
print(redmat_nfd.shape) # cond, type, gene
print(fmat_nfd.shape)   # gene, cond*type
print(zmat_nfd.shape)   # gene, cond*type (row-wise z-score of fmat_nfd)

In [None]:
from sklearn.cluster import KMeans
def mean_shape(vec):
    """Centroid position of the positive part of `vec`.

    Negative entries are clipped to 0 (no min-max rescaling), the result is
    normalized to sum to 1, and these weights are dotted with the positions
    0..len(vec)-1. For array input with no positive entry the normalization
    divides by zero and the result is NaN.
    """
    weights = np.clip(vec, 0, None)
    weights = weights/np.sum(weights)

    positions = np.arange(len(vec))
    return positions.dot(weights)

def organize_zmat(zmat, fmat, redmat, title='', n_geneset_clsts=5, genes=None):
    """Cluster genes with KMeans on `zmat` and order the clusters by peak position.

    Parameters
    ----------
    zmat, fmat : ndarray, shape (n_gene, n_columns)
        Gene-by-column matrices; `zmat` is what gets clustered. Rows of both
        are reordered together.
    redmat : ndarray, indexed as (cond, type, gene)
        Used only to summarize each cluster over conditions. The first 4
        entries of its condition axis are dropped from the summary (the
        original note: "select NR only").
    title : str
        Stored unchanged in the result.
    n_geneset_clsts : int
        Number of KMeans clusters.
    genes : array-like of gene names, optional
        One entry per row of `zmat`. Lists/tuples are converted to arrays.
        If omitted, integer row indices are used as gene identifiers
        (previously `None` raised a TypeError).

    Returns
    -------
    dict with keys 'title', 'order' (row permutation), 'zmat', 'fmat',
    'genes', 'clst' (renamed cluster id per ordered row), 'time_sketches'
    (one row per cluster, in the new cluster order) and 'geneset_list'
    (genes of each cluster).

    NOTE: the condition order differs between the inputs. `redmat` keeps its
    first 4 conditions in front (they are sliced off below), whereas
    (zmat, fmat) are expected to come from `transform_bigredmat`, which moves
    those leading conditions to the end.
    """
    if genes is None:
        genes = np.arange(zmat.shape[0])
    elif isinstance(genes, (list, tuple)):
        genes = np.asarray(genes)

    method = KMeans(n_clusters=n_geneset_clsts, n_init=10, random_state=0)
    geneset_clst = method.fit_predict(zmat)

    # average over genes per geneset and cell clusters - leave genesets and conditions there
    time_sketches = []
    for i in range(n_geneset_clsts):
        time_sketch = np.mean(redmat[:,:,geneset_clst==i], axis=2) # mean over genes
        time_sketch = np.max(time_sketch, axis=1) # max over cell types
        time_sketches.append(time_sketch)
    time_sketches = np.vstack(time_sketches)[:,4:] # n_geneset_clsts, n_cond (drop the 4 leading conditions)

    # rank clusters by the centroid of their (clipped) time sketch: early-peaking first
    clst_order = np.argsort([mean_shape(time_sketch) for time_sketch in time_sketches])
    # map original KMeans label -> rank in clst_order, applied per gene
    geneset_clst_renamed = pd.Series({clst: i for i, clst in enumerate(clst_order)}).reindex(geneset_clst).values
    geneset_order = np.argsort(geneset_clst_renamed)

    # reorder
    genes_ordered = genes[geneset_order]
    clsts_ordered = geneset_clst_renamed[geneset_order]
    zmat_ordered = zmat[geneset_order]
    fmat_ordered = fmat[geneset_order]

    # gene list per group
    geneset_list = []
    for i in range(n_geneset_clsts):
        geneset_list.append(genes_ordered[clsts_ordered==i])

    res = {
        'title': title,
        'order': geneset_order,
        'zmat':  zmat_ordered,
        'fmat':  fmat_ordered,
        'genes': genes_ordered,
        'clst':  clsts_ordered,
        'time_sketches':  time_sketches[clst_order],
        'geneset_list': geneset_list,
    }
    return res

In [None]:
def calc_del(red_y):
    """Time effect and DR effect per gene, averaged over the type axis.

    `red_y` is indexed by condition on axis 0 in the `todo_conds` order:
    rows 0-3 are P12DR, P14DR, P17DR, P21DR and rows 4-10 are P6 ... P21.
    Each `red_y[k]` is averaged over its own axis 0 (the type axis for a
    (cond, type, gene) input).

    Returns
    -------
    del_t   : row 10 minus row 6 (P21 - P10), mean over types
    del_v   : mean over the four paired ages of (DR row k) - (row 7+k)
    del_v21 : the P21 pair only (row 3 - row 10), mean over types
    """
    del_t = np.mean(red_y[10] - red_y[6], axis=0)

    # DR minus age-matched untagged condition, ordered P21, P17, P14, P12
    dr_effects = [np.mean(red_y[k] - red_y[7 + k], axis=0) for k in (3, 2, 1, 0)]
    del_v21 = dr_effects[0]
    del_v = np.mean(np.array(dr_effects), axis=0)

    return del_t, del_v, del_v21

def calc_del_typespec(red_y):
    """Time effect and DR effect per type and gene (no averaging over types).

    `red_y` is indexed by condition on axis 0 in the `todo_conds` order:
    rows 0-3 are P12DR, P14DR, P17DR, P21DR and rows 4-10 are P6 ... P21.
    For a (cond, type, gene) input every output has shape (n_type, n_gene).

    Returns
    -------
    del_t   : row 10 minus row 6 (P21 - P10)
    del_v   : mean over the four paired ages of (DR row k) - (row 7+k)
    del_v21 : the P21 pair only (row 3 - row 10)
    """
    del_t = red_y[10] - red_y[6]

    # DR minus age-matched untagged condition, ordered P21, P17, P14, P12
    dr_effects = [red_y[k] - red_y[7 + k] for k in (3, 2, 1, 0)]
    del_v21 = dr_effects[0]
    del_v = np.mean(np.array(dr_effects), axis=0)

    return del_t, del_v, del_v21

# Quantify time vs DR effect
- late are DR sensitive
- time effect (P21NR - P10NR) vs DR effect (P21DR - P21NR)
- refine this as the average time effect vs average DR effect

In [None]:
# numeric part of the untagged condition labels (P6 ... P21)
times = np.array([6, 8, 10, 12, 14, 17, 21])
# the subset of those for which a 'DR' condition exists (P12 onwards); boolean indexing returns a copy
dr_times = times[times >= 12]

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Three black-to-colour ramps built on the default colour cycle (C0, C1, C2)
colors_a = [(0.0, 'black'), (1.0, 'C0')]
colors_b = [(0.0, 'black'), (1.0, 'C1')]
colors_c = [(0.0, 'black'), (1.0, 'C2')]

cmap_a, cmap_b, cmap_c = (
    LinearSegmentedColormap.from_list(name, stops)
    for name, stops in (('cmap_a', colors_a), ('cmap_b', colors_b), ('cmap_c', colors_c))
)

# RGBA at the bright end of each ramp
_top_a, _top_b, _top_c = (np.array(cmap(1.0)) for cmap in (cmap_a, cmap_b, cmap_c))

# five-step palette: a, a/b blend, b, b/c blend, c (blends are 70/30 mixes of the RGBA vectors)
colors_l23 = [
    _top_a,
    0.7*_top_a+0.3*_top_b,
    _top_b,
    0.7*_top_b+0.3*_top_c,
    _top_c,
]

In [None]:
from sklearn.metrics import r2_score

In [None]:
# gene names of the (filtered) AnnData object, in column order of its matrices
genes = adata.var.index.values
genes

In [None]:
# Boolean mask over `genes`: True where the gene is in the HVG list.
# A set gives constant-time membership tests (the original `g in ndarray` scanned the
# whole HVG array for every gene); dtype=bool keeps the mask boolean even if `genes`
# is empty.
_hvg_lookup = set(genes_alltime_hvgs.tolist())
cond_sig_hvg = np.array([g in _hvg_lookup for g in genes], dtype=bool)
cond_sig_hvg

In [None]:
# number of genes in adata that are also in the HVG list
cond_sig_hvg.sum()

In [None]:
# restrict to HVGs along the gene axis: (sample, type, hvg)
big_y = bigmat_nfd[:,:,cond_sig_hvg]
# average replicates: (cond, type, hvg), conditions in `todo_conds` order
red_y = mean_over_samples(big_y)
# per-gene time effect, mean DR effect over the paired ages, and DR effect at P21
del_t, del_v, del_v21 = calc_del(red_y)

In [None]:
# sanity check: big_y is (sample, type, hvg); del_t and del_v have one value per HVG
big_y.shape, del_t.shape, del_v.shape

In [None]:
# Scatter of the time effect (x) against the mean DR effect (y), one point per HVG,
# with a least-squares line and the 50 genes of largest |DR effect| labelled.
_x = del_t
_y = del_v
genes_plot = genes[cond_sig_hvg]

# 50 genes with the largest absolute DR effect, largest first
highlights = genes_plot[np.argsort(np.abs(_y))[::-1][:50]]
print(highlights)

# positions of the highlighted genes within genes_plot
# NOTE(review): presumably returned in the order of `highlights` - confirm in basicu
cond_high = basicu.get_index_from_array(genes_plot, highlights)


n = len(_x)

r, _ = stats.pearsonr(_x, _y)
slope, intercept = np.polyfit(_x, _y, 1)
xbase = np.linspace(-4,5,5) 
ybase = slope*xbase + intercept
r2 = r2_score(_y, _x*slope+intercept)
# for a least-squares line r^2 equals the coefficient of determination; check both directions
assert abs(r**2 - r2) < 1e-3

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(_x, _y, s=5, color='lightgray')
ax.scatter(_x[cond_high], _y[cond_high], s=5, color='C1')
for xc, yc, name in zip(_x[cond_high], _y[cond_high], highlights):
    ax.text(xc, yc, name, fontsize=10)
    
ax.plot(xbase, ybase, '--r', linewidth=1) #, zorder=0)
ax.axvline(0, color='gray',  linewidth=1, zorder=0)
ax.axhline(0, color='gray',  linewidth=1, zorder=0)
ax.grid(False)
sns.despine(ax=ax)
ax.set_ylabel('log2(DR/NR)')
ax.set_xlabel('log2(P21/P10)')
# `+` in the format spec always prints the intercept's sign (previously "y=0.50x0.10")
ax.set_title(f'y={slope:.2f}x{intercept:+.2f}; r={r:.2f}; n={n}', fontsize=15)

# output = os.path.join(outfigdir, f'time_vs_dr_linear.pdf') 
# powerplots.savefig_autodate(fig, output)
plt.show()

In [None]:
# Same scatter as above (time effect vs mean DR effect per HVG), labelling a
# hand-picked gene list instead of the top-50.
genes_plot = genes[cond_sig_hvg]
highlights = np.array([
    'Fos', 'Npas4', 'Baz1a',
    'Nptx2', 'Egr1',
    'Megf11', 'Pcdh15',
    'Fosl2', 'Meis2',
    'Sorcs3', 'Trpc6',
])
# keep only genes that are actually plotted (result is sorted and unique)
highlights = np.intersect1d(genes_plot, highlights)
print(highlights)

# positions of the highlighted genes within genes_plot
# NOTE(review): presumably returned in the order of `highlights` - confirm in basicu
cond_high = basicu.get_index_from_array(genes_plot, highlights)

_x = del_t
_y = del_v

n = len(_x)

r, _ = stats.pearsonr(_x, _y)
slope, intercept = np.polyfit(_x, _y, 1)
xbase = np.linspace(-4,5,5) 
ybase = slope*xbase + intercept
r2 = r2_score(_y, _x*slope+intercept)
# for a least-squares line r^2 equals the coefficient of determination; check both directions
assert abs(r**2 - r2) < 1e-3

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(_x, _y, s=5, color='lightgray')
ax.scatter(_x[cond_high], _y[cond_high], s=5, color='C1')
for xc, yc, name in zip(_x[cond_high], _y[cond_high], highlights):
    ax.text(xc, yc, name, fontsize=10)
    
ax.plot(xbase, ybase, '--r', linewidth=1) #, zorder=0)
ax.axvline(0, color='gray',  linewidth=1, zorder=0)
ax.axhline(0, color='gray',  linewidth=1, zorder=0)
ax.grid(False)
sns.despine(ax=ax)
ax.set_ylabel('log2(DR/NR)')
ax.set_xlabel('log2(P21/P10)')
# `+` in the format spec always prints the intercept's sign (previously "y=0.50x0.10")
ax.set_title(f'y={slope:.2f}x{intercept:+.2f}; r={r:.2f}; n={n}', fontsize=15)

# output = os.path.join(outfigdir, f'time_vs_dr_linear.pdf') 
# powerplots.savefig_autodate(fig, output)
plt.show()

# different types (visualize their gradient)
# read paper on where to include other cells