In [None]:
import scipy.io
import numpy as np
import pandas as pd
import scanpy as sc
import h5py

import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import collections
from natsort import natsorted

from scipy import stats
from scipy import sparse
from sklearn.decomposition import PCA
from umap import UMAP

from matplotlib.colors import LinearSegmentedColormap
from statsmodels.stats.multitest import multipletests

from scroutines.config_plots import *
from scroutines import powerplots # .config_plots import *
from scroutines import pnmf
from scroutines import basicu
from scroutines.gene_modules import GeneModules  

# load data

In [None]:
%%time
# Read the RNA AnnData object and rewrite its obs index.
f_rna = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/L23_allmultiome_proc_P6toP21.h5ad'
adata = sc.read(f_rna)

# For every obs name: keep the text before the first space, then drop its last
# len('-2023') characters.
# NOTE(review): the slice removes five trailing characters unconditionally; it
# presumably strips a '-2023' suffix -- confirm every obs name carries it.
n_trailing = len('-2023')
cells_rna = np.array([
    obs_name.split(' ')[0][:-n_trailing]
    for obs_name in adata.obs.index.values
])
adata.obs.index = cells_rna

# Bare expression so the notebook displays the AnnData summary.
adata

In [None]:
# Condition labels, in the order used to index per-condition arrays in this
# notebook (labels look like postnatal days -- TODO confirm).
sample_conditions = [f'P{day}' for day in (6, 8, 10, 12, 14, 17, 21)]
n_cond = len(sample_conditions)

In [None]:
%%time
# Read one filtered peak-by-cell AnnData per condition and collect them in the
# same order as `sample_conditions`.
adatas_pk = []
for cond in sample_conditions:
    print(cond)

    f = f'/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/{cond}_ATAC_L23_peakbycell_filtered.h5ad'
    adata_pk = sc.read(f)

    # Index cells by the 'cell' column of obs.
    adata_pk.obs = adata_pk.obs.set_index('cell')

    # Show the first three peak names, then the object summary.
    first_peaks = adata_pk.var['peak'].iloc[:3]
    print(first_peaks.values)

    adatas_pk.append(adata_pk)
    print(adata_pk)

In [None]:
# Sizes reused by the cells below.
n_cond = len(sample_conditions)   # number of conditions
n_peak = adatas_pk[0].n_vars      # number of peaks (columns) in the first peak matrix
# Number of entries along the second axis of the ATAC tensor as used by the
# reshape further down (the heatmaps label them 'A', '~', 'B', '~', 'C').
n_type = 5

# check results and stats

In [None]:
%%time
# Input files: fin1 -> q-value array, fin2 -> log2 fold-change array
# (as named by the files themselves).
fin1, fin2 = (
    '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/DAR_qs_abc_p6to21.npy',
    '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/DAR_l2fc_abc_p6to21.npy',
)

In [None]:
# Load the q-value and log2 fold-change arrays from disk.
# NOTE(review): both are indexed below as [condition, comparison, peak] --
# confirm against the script that wrote them.
qs_tensor, l2fc_tensor = np.load(fin1), np.load(fin2)

In [None]:
# Take index 0 along the second axis of both tensors; the next cell reports
# this slice as the A-versus-C comparison.
qs, l2fc = qs_tensor[:, 0, :], l2fc_tensor[:, 0, :]

In [None]:
# Call significant A-vs-C differentially accessible regions (DARs).
# An entry passes when its q-value is below `alpha_th` and its log2 fold change
# exceeds `l2fc_th` in magnitude. Negative fold changes are reported as
# "A > C", positive ones as "C > A".
l2fc_th = np.log2(2)
alpha_th = 0.05
cond_sig_a = np.all([qs < alpha_th, l2fc < -l2fc_th], axis=0)
cond_sig_c = np.all([qs < alpha_th, l2fc >  l2fc_th], axis=0)

# Per-row (per-condition) counts, followed by the number of peaks that pass in
# at least one row.
print('num A > C DARs per cond:\t',    cond_sig_a.sum(axis=1), np.any(cond_sig_a, axis=0).sum())
print('num C > A DARs per cond:\t',    cond_sig_c.sum(axis=1), np.any(cond_sig_c, axis=0).sum())

# Histogram of "in how many conditions is each peak significant".
# np.bincount with minlength gives one slot per possible value 0..n_rows, so
# counts[k] always refers to exactly k conditions. The previous np.unique-based
# version only returned values that actually occur, so dropping its first
# element assumed a 0 was present and shifted positions whenever some count
# was absent.
n_rows = cond_sig_a.shape[0]
instances = np.arange(n_rows + 1)
counts_a = np.bincount(cond_sig_a.sum(axis=0), minlength=n_rows + 1)
counts_c = np.bincount(cond_sig_c.sum(axis=0), minlength=n_rows + 1)

# Skip slot 0 (peaks never significant); entry j is the number of peaks
# significant in exactly j+1 conditions.
print('num A > C DARs in num conds:\t',  counts_a[1:])
print('num C > A DARs in num conds:\t',  counts_c[1:])

In [None]:
# Indices 1 and 2 along the second tensor axis; named here as the B-vs-A and
# B-vs-C comparisons.
qs_ba, qs_bc = qs_tensor[:, 1, :], qs_tensor[:, 2, :]
l2fc_ba, l2fc_bc = l2fc_tensor[:, 1, :], l2fc_tensor[:, 2, :]

# Combined B statistics: element-wise smaller q-value and the average of the
# two log2 fold changes.
# NOTE(review): taking the minimum q-value means only one of the two
# comparisons has to be significant -- confirm this is intended.
qs_b = np.minimum(qs_ba, qs_bc)
l2fc_b = np.stack([l2fc_ba, l2fc_bc]).mean(axis=0)

# Displayed as the cell output.
qs_b.shape, l2fc_b.shape

In [None]:
# Call B-specific DARs: entries where B differs from both A and C in the same
# direction. Both individual fold changes must share a sign, their mean must
# exceed `l2fc_th` in magnitude, and the combined q-value must be below
# `alpha_th`. Positive fold changes are reported as "B > A,C", negative ones
# as "B < A,C".
# Note: this rebinds `l2fc_th` (now log2(1.5)) and `alpha_th`.
l2fc_th = np.log2(1.5)
alpha_th = 0.05
cond_sig_bp = np.all([l2fc_ba > 0, l2fc_bc > 0, l2fc_b >  l2fc_th, qs_b < alpha_th], axis=0)
cond_sig_bn = np.all([l2fc_ba < 0, l2fc_bc < 0, l2fc_b < -l2fc_th, qs_b < alpha_th], axis=0)
cond_sig_b  = np.logical_or(cond_sig_bp, cond_sig_bn)

print('num B > A,C per cond:\t',    cond_sig_bp.sum(axis=1))
print('num B < A,C per cond:\t',    cond_sig_bn.sum(axis=1))

# Histogram of "in how many conditions is each peak significant".
# np.bincount with minlength gives one slot per possible value 0..n_rows, so
# counts[k] always refers to exactly k conditions. The previous np.unique-based
# version only returned values that actually occur, so dropping its first
# element assumed a 0 was present and shifted positions whenever some count
# was absent.
n_rows = cond_sig_bp.shape[0]
instances = np.arange(n_rows + 1)
counts_bp = np.bincount(cond_sig_bp.sum(axis=0), minlength=n_rows + 1)
counts_bn = np.bincount(cond_sig_bn.sum(axis=0), minlength=n_rows + 1)

# Skip slot 0 (peaks never significant); entry j is the number of peaks
# significant in exactly j+1 conditions.
print('num B > A,C DARs in num conds:\t',  counts_bp[1:])
print('num B < A,C DARs in num conds:\t',  counts_bn[1:])

In [None]:
%%time

def _mean_log10_signal(peak_adata, cells, peak_mask):
    """Per-cell mean of log10(1 + X) over the peaks selected by `peak_mask`.

    `peak_adata` is subset to `cells` (rows) and `peak_mask` (columns) before
    the transform; the mean is taken along axis 1. No normalisation is applied.
    """
    selected = peak_adata[cells, peak_mask].X
    return np.mean(np.log10(1 + selected), axis=1)


# For every condition, collect:
#   x    -- first column of obsm['pca_p17on'] for cells present in both modalities
#   y_*  -- mean log10(1 + X) over that condition's A / C / B(+) DARs
#   nums -- [n_A, n_B, n_C] DAR counts for that condition
all_x, all_y_a, all_y_c, all_y_b, all_nums = [], [], [], [], []

for cond_order, cond in enumerate(sample_conditions):
    # RNA cells of this condition and the matching ATAC object.
    adatasub = adata[adata.obs['cond']==cond]
    adata_pk = adatas_pk[cond_order]

    cells_rna = adatasub.obs.index.values
    cells_atac = adata_pk.obs.index.values

    # Cell names found in both modalities (np.intersect1d returns them sorted and unique).
    cells_mult = np.intersect1d(cells_rna, cells_atac)

    mask_a = cond_sig_a[cond_order]
    mask_c = cond_sig_c[cond_order]
    mask_b = cond_sig_bp[cond_order]

    x = adatasub[cells_mult].obsm['pca_p17on'][...,0]
    y_a = _mean_log10_signal(adata_pk, cells_mult, mask_a)
    y_c = _mean_log10_signal(adata_pk, cells_mult, mask_c)

    # Without any B peaks in this condition, fill with NaN placeholders.
    if mask_b.sum() > 0:
        y_b = _mean_log10_signal(adata_pk, cells_mult, mask_b)
    else:
        y_b = np.full(len(y_a), np.nan)

    all_x.append(x)
    all_y_a.append(y_a)
    all_y_c.append(y_c)
    all_y_b.append(y_b)
    all_nums.append([mask_a.sum(), mask_b.sum(), mask_c.sum()])
    

In [None]:
# Scatter of per-cell mean DAR accessibility against the first RNA PC, one
# column per condition: top row A and C peaks, bottom row B peaks.
# The grid is sized from `n_cond` instead of a hard-coded 7, so it follows
# `sample_conditions`. squeeze=False keeps `axs` two-dimensional even for a
# single condition.
fig, axs = plt.subplots(2, n_cond, figsize=(4*n_cond, 4*2), squeeze=False)
for cond_order, cond in enumerate(sample_conditions):
    x = all_x[cond_order]
    y_a = all_y_a[cond_order]
    y_c = all_y_c[cond_order]
    y_b = all_y_b[cond_order]
    num_a, num_b, num_c = all_nums[cond_order]

    # Top row: A and C peaks, with Spearman correlations in the title.
    ax = axs[0][cond_order]
    ax.scatter(x, y_a, s=1, color='C0', label='A peaks', alpha=0.5)
    ax.scatter(x, y_c, s=1, color='C2', label='C peaks', alpha=0.5)
    r_a, p  = stats.spearmanr(x, y_a)
    r_c, p  = stats.spearmanr(x, y_c)
    ax.set_title(f'{cond}\nNa={num_a:,}, r={r_a:.2f}\nNc={num_c:,}, r={r_c:.2f}')
    sns.despine(ax=ax)
    ax.grid(False)

    # Bottom row: B peaks.
    ax = axs[1][cond_order]
    ax.scatter(x, y_b, s=1, color='C1', label='B peaks', alpha=0.5)
    r_b, p  = stats.spearmanr(x, y_b)
    ax.set_title(f'Nb={num_b:,}, r={r_b:.2f}')
    sns.despine(ax=ax)
    ax.grid(False)

    # Nothing to show when y_b holds only NaN placeholders: hide the ticks.
    if np.all(np.isnan(y_b)):
        ax.set_xticks([])
        ax.set_yticks([])

# Axis labels are placed on the top-left panel only.
axs[0][0].set_xlabel('RNA PC1 (P17-21)')
axs[0][0].set_ylabel('ATAC peak accessibility')
fig.tight_layout()
plt.show()

In [None]:
# Line plot of the number of A / C / B DARs per condition. The legend also
# reports how many distinct peaks are significant in at least one condition.
fig, ax = plt.subplots(1, 1, figsize=(8, 4))

num_uniq_a, num_uniq_b, num_uniq_c = (
    np.sum(np.any(sig, axis=0)) for sig in (cond_sig_a, cond_sig_bp, cond_sig_c)
)

cond_positions = np.arange(n_cond)
# Drawn in A, C, B order so the legend entries keep that order.
dar_series = [
    ('A', 'C0', cond_sig_a,  num_uniq_a),
    ('C', 'C2', cond_sig_c,  num_uniq_c),
    ('B', 'C1', cond_sig_bp, num_uniq_b),
]
for name, color, sig, n_unique in dar_series:
    ax.plot(cond_positions, sig.sum(axis=1), '-o', color=color, fillstyle='none',
            label=f'{name} DARs (unique n = {n_unique:,})')

# Flat zero reference line, drawn underneath the data (zorder=0).
ax.plot(cond_positions, [0]*n_cond, '--o', color='gray', fillstyle='none',
        label='random DARs (n=0)', zorder=0)
ax.set_ylim(bottom=-500)

ax.set_xticks(cond_positions)
ax.set_xticklabels(sample_conditions)
ax.set_ylabel('number of DARs')
ax.grid(False)
sns.despine(ax=ax)
ax.legend(bbox_to_anchor=(1,0.6), fontsize=12)

fig.tight_layout()
plt.show()

# get the full matrix

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Load the DAR mean-count array.
# NOTE(review): indexed below as [condition, type, peak] -- confirm against
# the script that wrote it.
fin3 = (
    '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/DAR_mean_counts_p6to21.npy'
)
atac_tensor = np.load(fin3)

In [None]:
# Keep every peak that is an A, C or B(+) DAR in at least one condition.
sig_in_any_class = cond_sig_a | cond_sig_c | cond_sig_bp
cond_sig_selected = sig_in_any_class.any(axis=0)
n_dar = cond_sig_selected.sum()

# Restrict the last tensor axis to those peaks, flatten the first two axes
# into n_cond*n_type columns, and put peaks on the rows.
atac_tensor_sel = atac_tensor[:, :, cond_sig_selected]
mat = np.transpose(atac_tensor_sel.reshape(n_cond*n_type, n_dar))

In [None]:
# Cluster the rows of `mat` (peaks) into `n_peakset_clsts` groups with k-means.
# random_state fixes the centroid initialisation so cluster labels are
# reproducible across kernel restarts. n_init is set explicitly because its
# default differs between scikit-learn releases.
n_peakset_clsts = 30
method = KMeans(n_clusters=n_peakset_clsts, n_init=10, random_state=0)
peakset_clst = method.fit_predict(mat)

In [None]:
# One row per peak cluster: the mean profile of the cluster's peaks, z-scored
# across its columns (population standard deviation).
mat_mean = np.empty((n_peakset_clsts, mat.shape[1]))
for clst, out_row in enumerate(mat_mean):
    profile = mat[peakset_clst == clst].mean(axis=0)
    centered = profile - profile.mean()
    out_row[:] = centered / profile.std()

In [None]:
# Heatmap of the z-scored cluster profiles. Rows are hierarchically clustered,
# columns keep their original order.
clustermap_kwargs = dict(
    col_cluster=False,
    cmap='coolwarm',
    figsize=(8,5),
    xticklabels=False,
    yticklabels=False,
)
g = sns.clustermap(mat_mean, **clustermap_kwargs)
ax = g.ax_heatmap

# Tick labels are placed on the first n_type columns only.
type_labels = ['A', '~', 'B', '~', 'C']
ax.set_xticks(np.arange(n_type) + 0.5)
ax.set_xticklabels(type_labels, fontsize=10)

# Dashed separator and condition name at the left edge of each block of
# n_type columns.
for i in range(n_cond):
    left_edge = i*n_type
    ax.axvline(left_edge, color='k', linestyle='--', linewidth=1)
    ax.text(left_edge, 0, sample_conditions[i])

# ABC peaks

In [None]:
# Repeat the clustering and heatmap separately for A, C and B(+) DARs.
# Note: this rebinds n_peakset_clsts, cond_sig_selected, n_dar,
# atac_tensor_sel, mat, method, peakset_clst and mat_mean on every iteration.
n_peakset_clsts = 10
for cond_sig_, title in zip([cond_sig_a, cond_sig_c, cond_sig_bp], 
                            ['A peaks', 'C peaks', 'B peaks',]):
    
    # Peaks of this class that are significant in at least one condition.
    cond_sig_selected = np.any(cond_sig_, axis=0)
    n_dar = cond_sig_selected.sum()
    atac_tensor_sel = atac_tensor[:,:,cond_sig_selected]
    # Peaks on rows, flattened (first axis x second axis) on columns.
    mat = atac_tensor_sel.reshape(n_cond*n_type, n_dar).T

    # random_state makes the cluster assignment reproducible between runs.
    # n_init is set explicitly because its default differs between
    # scikit-learn releases.
    method = KMeans(n_clusters=n_peakset_clsts, n_init=10, random_state=0)
    peakset_clst = method.fit_predict(mat)

    # Mean profile per cluster, z-scored across columns.
    mat_mean = np.zeros((n_peakset_clsts, mat.shape[1]))
    for i in range(n_peakset_clsts):
        mean_val = np.mean(mat[peakset_clst==i], axis=0) 
        z_mean = (mean_val - np.mean(mean_val))/np.std(mean_val)
        mat_mean[i] = z_mean 

    # Rows are hierarchically clustered; columns keep their original order.
    g = sns.clustermap(mat_mean, col_cluster=False, cmap='coolwarm', figsize=(8,3), xticklabels=False, yticklabels=False)

    ax = g.ax_heatmap
    # Tick labels are placed on the first n_type columns only.
    ax.set_xticks(0.5+np.arange(n_type))
    ax.set_xticklabels(['A', '~', 'B', '~', 'C'], fontsize=10)
    ax.set_title(title, pad=30)

    # Dashed separator and condition name at the left edge of each block of
    # n_type columns.
    for i in range(n_cond):
        ax.axvline(i*n_type, color='k', linestyle='--', linewidth=1)
        ax.text(i*n_type, -0.5, f'{sample_conditions[i]}')