In [None]:
import scipy.io
import numpy as np
import pandas as pd
import scanpy as sc
import h5py

import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import collections
from natsort import natsorted

from scipy import stats
from scipy import sparse
from sklearn.decomposition import PCA
from umap import UMAP

from matplotlib.colors import LinearSegmentedColormap
from statsmodels.stats.multitest import multipletests

from scroutines.config_plots import *
from scroutines import powerplots # .config_plots import *
from scroutines import pnmf
from scroutines import basicu
from scroutines.gene_modules import GeneModules  

In [None]:
%%time
# Read the AnnData object stored at f_rna.
f_rna = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/L23_allmultiome_proc_P6toP21.h5ad'
adata = sc.read(f_rna)

# Rebuild every cell name: keep the text before the first space, then cut off
# as many trailing characters as '-2023' has. The slice is unconditional, so
# this presumably relies on every name ending with that suffix -- TODO confirm.
n_trim = len('-2023')
cells_rna = np.array([name.split(' ')[0][:-n_trim] for name in adata.obs.index.values])
adata.obs.index = cells_rna

# Display the loaded object as the cell output.
adata

In [None]:
# Condition labels, in the order used for every per-condition list and tensor below.
sample_conditions = [f'P{day}' for day in (6, 8, 10, 12, 14, 17, 21)]

In [None]:
%%time
# Read one peak-by-cell AnnData file per condition; adatas_pk ends up in the
# same order as sample_conditions.
adatas_pk = []
for cond in sample_conditions:
    print(cond)

    f = f'/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/{cond}_ATAC_L23_peakbycell_filtered.h5ad'
    adata_pk = sc.read(f)

    # Use the 'cell' column of obs as the row index.
    adata_pk.obs = adata_pk.obs.set_index('cell')

    # Show the first three entries of the 'peak' column, then the object summary.
    print(adata_pk.var['peak'][:3].values)
    adatas_pk.append(adata_pk)
    print(adata_pk)

In [None]:
# Tensor dimensions: one entry per condition along axis 0, one per peak along axis 2.
n_cond = len(sample_conditions)
n_peak = adatas_pk[0].shape[1]  # peak count is read off the first condition's ATAC object
n_type = 5  # passed to pd.qcut further down as the number of quantile bins

# Axis 1 has length 3: it represents the 3 pairwise comparisons (ca, ba, bc).
qs_tensor = np.zeros((n_cond, 3, n_peak))
l2fc_tensor = np.zeros_like(qs_tensor)

print(qs_tensor.shape, l2fc_tensor.shape)

In [None]:
%%time
def _log10p_counts(adata_atac, cells):
    """Return log10(1 + X) for the rows of `adata_atac` selected by `cells`.

    No normalization is applied to X before the transform.
    """
    return np.log10(1 + adata_atac[cells].X[...])


def _compare_groups(mat_x, mat_y):
    """Compare two cell-by-peak matrices of log10(1 + X) values, peak by peak.

    Returns
    -------
    lfc : difference of the column means (mat_x minus mat_y), rescaled from
        log10 to log2 units.
    ps : p-values from `stats.ttest_ind(mat_x, mat_y)`.
    qs : 'fdr_bh'-adjusted p-values from `multipletests`.
    """
    lfc = np.log2(10) * (np.mean(mat_x, axis=0) - np.mean(mat_y, axis=0))  # log2FC
    _, ps = stats.ttest_ind(mat_x, mat_y)
    # NaN p-values (presumably peaks with no signal in either group -- TODO
    # confirm) are set to 1 before the multiple-testing correction.
    _, qs, _, _ = multipletests(np.nan_to_num(ps, nan=1).reshape(-1,), method='fdr_bh')
    return lfc, ps, qs


# For every condition: bin the RNA cells into n_type quantile groups, pick the
# lowest (A), middle (B) and highest (C) bins, and run the three pairwise
# comparisons (ca, ba, bc) on the matching ATAC cells.
for cond_order, cond in enumerate(sample_conditions):
    # get sub
    adatasub = adata[adata.obs['cond'] == cond]
    cells_rna = adatasub.obs.index.values
    x = adatasub.obsm['pca_p17on'][..., 0]
    cells_rna_type = pd.qcut(x, n_type, labels=False)

    # get ATAC
    adata_pk = adatas_pk[cond_order]
    cells_atac = adata_pk.obs.index.values

    # The result tensors are indexed by peak position, so every condition must
    # provide exactly n_peak peaks. Fail early, before the expensive statistics.
    if adata_pk.shape[1] != n_peak:
        raise ValueError(
            f'{cond}: ATAC object has {adata_pk.shape[1]} peaks, expected {n_peak}'
        )

    # get A, B, C
    cond_a = (cells_rna_type == 0)
    cond_b = (cells_rna_type == n_type // 2)
    cond_c = (cells_rna_type == n_type - 1)

    # keep only cells that are present in both the RNA and the ATAC objects
    cells_a = np.intersect1d(cells_rna[cond_a], cells_atac)
    cells_b = np.intersect1d(cells_rna[cond_b], cells_atac)
    cells_c = np.intersect1d(cells_rna[cond_c], cells_atac)

    mat_a = _log10p_counts(adata_pk, cells_a)
    mat_b = _log10p_counts(adata_pk, cells_b)
    mat_c = _log10p_counts(adata_pk, cells_c)

    # DARs A, B, C
    lfc_ca, ps_ca, qs_ca = _compare_groups(mat_c, mat_a)
    lfc_ba, ps_ba, qs_ba = _compare_groups(mat_b, mat_a)
    lfc_bc, ps_bc, qs_bc = _compare_groups(mat_b, mat_c)

    # comparison order along axis 1 is (ca, ba, bc)
    l2fc_tensor[cond_order, 0] = lfc_ca
    l2fc_tensor[cond_order, 1] = lfc_ba
    l2fc_tensor[cond_order, 2] = lfc_bc

    qs_tensor[cond_order, 0] = qs_ca
    qs_tensor[cond_order, 1] = qs_ba
    qs_tensor[cond_order, 2] = qs_bc

    # C vs A: significant and at least a 2-fold change in either direction
    cond_sig_ca = np.all([qs_ca < 0.05,
                          np.abs(lfc_ca) > np.log2(2),
                         ], axis=0)
    num_sig_ca = np.sum(cond_sig_ca)

    # B vs both A and C: same sign in both comparisons, mean change above
    # 1.5-fold, and the smaller of the two q-values below 0.05
    cond_sig_b = np.all([lfc_ba*lfc_bc > 0,
                         np.abs(np.mean([lfc_ba, lfc_bc], axis=0)) > np.log2(1.5),
                         np.minimum(qs_ba, qs_bc) < 0.05,
                        ], axis=0)
    num_sig_b = np.sum(cond_sig_b)
    print(cond, mat_a.shape, mat_b.shape, mat_c.shape, num_sig_ca, num_sig_b)

In [None]:
%%time
# Locations of the stored q-value tensor (fout1) and log2 fold-change tensor (fout2).
fout1, fout2 = (
    '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/DAR_qs_abc_p6to21.npy',
    '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/DAR_l2fc_abc_p6to21.npy',
)

# Writing is disabled; re-enable these two lines to overwrite the files on disk.
# np.save(fout1, qs_tensor)
# np.save(fout2, l2fc_tensor)

# check results and stats

In [None]:
# Rebind qs_tensor / l2fc_tensor to the arrays stored at fout1 / fout2.
# Both files must already exist on disk for this cell to run.
qs_tensor = np.load(file=fout1)
l2fc_tensor = np.load(file=fout2)

In [None]:
# Take comparison index 0 along axis 1 of both tensors (the "ca" slot).
qs, l2fc = qs_tensor[:, 0, :], l2fc_tensor[:, 0, :]

In [None]:
# Thresholds for calling a peak differential between groups A and C.
l2fc_th = np.log2(2)
alpha_th = 0.05

# qs / l2fc are indexed (condition, peak). A peak is called for A when its
# fold change is below -l2fc_th and for C when it is above +l2fc_th, in both
# cases with q < alpha_th.
cond_sig_a = np.all([qs < alpha_th, l2fc < -l2fc_th], axis=0)
cond_sig_c = np.all([qs < alpha_th, l2fc >  l2fc_th], axis=0)

# Per-condition totals, followed by the number of peaks called in at least one condition.
print('num A > C DARs per cond:\t',    cond_sig_a.sum(axis=1), np.any(cond_sig_a, axis=0).sum())
print('num C > A DARs per cond:\t',    cond_sig_c.sum(axis=1), np.any(cond_sig_c, axis=0).sum())

# Tally peaks by the number of conditions in which they are called.
# np.bincount with minlength reserves one slot for every possible value
# 0..n_conditions, so entry k of the printed arrays always means "called in
# exactly k+1 conditions" even when some counts never occur. (np.unique
# skips absent values, and slicing its output with [1:] then misaligns or
# drops entries.)
n_sig_max = cond_sig_a.shape[0]
instances = np.arange(n_sig_max + 1)
counts_a = np.bincount(cond_sig_a.sum(axis=0), minlength=n_sig_max + 1)
counts_c = np.bincount(cond_sig_c.sum(axis=0), minlength=n_sig_max + 1)

print('num A > C DARs in num conds:\t',  counts_a[1:])
print('num C > A DARs in num conds:\t',  counts_c[1:])

In [None]:
# Comparison slots 1 and 2 along axis 1 of the tensors (the "ba" and "bc" slots).
qs_ba, qs_bc = qs_tensor[:, 1, :], qs_tensor[:, 2, :]
l2fc_ba, l2fc_bc = l2fc_tensor[:, 1, :], l2fc_tensor[:, 2, :]

# Combined B statistics: element-wise smaller q-value of the two comparisons,
# and the element-wise average of the two log2 fold changes.
qs_b = np.minimum(qs_ba, qs_bc)
l2fc_b = np.stack([l2fc_ba, l2fc_bc]).mean(axis=0)

# Display both shapes as the cell output.
qs_b.shape, l2fc_b.shape

In [None]:
# Thresholds for calling a peak where B differs from both A and C.
l2fc_th = np.log2(1.5)
alpha_th = 0.05

# "bp": B above both A and C; "bn": B below both. Each call requires the two
# fold changes to share a sign, their mean to pass the threshold, and the
# combined q-value (qs_b) to be below alpha_th.
cond_sig_bp = np.all([l2fc_ba > 0, l2fc_bc > 0, l2fc_b >  l2fc_th, qs_b < alpha_th], axis=0)
cond_sig_bn = np.all([l2fc_ba < 0, l2fc_bc < 0, l2fc_b < -l2fc_th, qs_b < alpha_th], axis=0)

print('num B > A,C per cond:\t',    cond_sig_bp.sum(axis=1))
print('num B < A,C per cond:\t',    cond_sig_bn.sum(axis=1))

# Tally peaks by the number of conditions in which they are called.
# np.bincount with minlength reserves one slot for every possible value
# 0..n_conditions, so entry k of the printed arrays always means "called in
# exactly k+1 conditions" even when some counts never occur. (np.unique
# skips absent values, and slicing its output with [1:] then misaligns or
# drops entries.)
n_sig_max = cond_sig_bp.shape[0]
instances = np.arange(n_sig_max + 1)
counts_bp = np.bincount(cond_sig_bp.sum(axis=0), minlength=n_sig_max + 1)
counts_bn = np.bincount(cond_sig_bn.sum(axis=0), minlength=n_sig_max + 1)

print('num B > A,C DARs in num conds:\t',  counts_bp[1:])
print('num B < A,C DARs in num conds:\t',  counts_bn[1:])

# get the full matrix

In [None]:
%%time
# Mean of the ATAC matrix X for every (condition, quantile bin, peak).
atac_tensor = np.zeros((n_cond, n_type, n_peak))
for i, cond in enumerate(sample_conditions):
    # RNA cells of this condition, binned into n_type quantiles of index 0
    # along the last axis of obsm['pca_p17on']
    adatasub = adata[adata.obs['cond'] == cond]
    cells_rna = adatasub.obs.index.values
    x = adatasub.obsm['pca_p17on'][..., 0]
    cells_rna_type = pd.qcut(x, n_type, labels=False)

    # ATAC object at the same position in adatas_pk
    adata_pk = adatas_pk[i]
    cells_atac = adata_pk.obs.index.values

    # For every bin, average X over the cells found in both RNA and ATAC
    for j in range(n_type):
        in_bin_j = (cells_rna_type == j)
        cells_j = np.intersect1d(cells_rna[in_bin_j], cells_atac)
        atac_tensor[i, j] = np.mean(adata_pk[cells_j].X, axis=0)

In [None]:
# Write atac_tensor to disk; this overwrites any existing file at fout3.
fout3 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/DAR_mean_counts_p6to21.npy'
np.save(file=fout3, arr=atac_tensor)

# Is this 93 a coincidence?