In [None]:
import scipy.io
import numpy as np
import pandas as pd
import scanpy as sc
import h5py

import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import collections
from natsort import natsorted

from scipy import stats
from scipy import sparse
from sklearn.decomposition import PCA
from umap import UMAP

from matplotlib.colors import LinearSegmentedColormap
from statsmodels.stats.multitest import multipletests

from scroutines.config_plots import *
from scroutines import powerplots # .config_plots import *
from scroutines import pnmf
from scroutines import basicu
from scroutines.gene_modules import GeneModules  

import atac_utils

# load data

In [None]:
%%time
# Processed L2/3 multiome RNA AnnData file (per the file name: P6 to P21, NR and DR samples).
f_rna = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/L23_allmultiome_proc_P6toP21_NRDR.h5ad'
adata = sc.read(f_rna)
adata

In [None]:
# Mapping from atac_utils; used below as {integer condition code: condition label}.
condcode2cond = atac_utils.CONDCODE_TO_COND
condcode2cond

In [None]:
# Condition labels in the iteration order of `condcode2cond`; the integer
# code arrays below are used as positions into this array.
sample_conditions = np.array([*condcode2cond.values()])
n_cond = sample_conditions.shape[0]

# Condition codes of the NR and DR sample groups, with the time point
# associated with each code (same order).
nr_condcodes, nr_times = np.array([0, 1, 2, 3, 5, 7, 9]), [6, 8, 10, 12, 14, 17, 21]
dr_condcodes, dr_times = np.array([4, 6, 8, 10]), [12, 14, 17, 21]
times = nr_times  # alias of the NR time points (same list object)

nr_conditions = sample_conditions[nr_condcodes]
dr_conditions = sample_conditions[dr_condcodes]

print(sample_conditions)
print(nr_condcodes, nr_conditions)
print(dr_condcodes, dr_conditions)

In [None]:
%%time
# Read one AnnData file per condition; list position i corresponds to sample_conditions[i].
# NOTE: `adata_pk` stays bound to the last file read after the loop ends.
adatas_pk = []
for cond in sample_conditions:
    print(cond)
    f = f'/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/pmat_l23concensus_{cond}.h5ad'
    adata_pk = sc.read(f)
    adatas_pk.append(adata_pk)
    print(adata_pk)
    # break

In [None]:
# Sizes used when reshaping the tensors below.
n_cond = len(sample_conditions)   # number of conditions
n_peak = adatas_pk[0].shape[1]    # number of columns (peaks) in the first per-condition matrix
n_type = 5                        # entries per condition; labelled 'A', '<-', '-', '->', 'C' in the heatmaps below

# check results and stats

In [None]:
def mean_bin(x, y, bins=None):
    """Average paired values `x` and `y` within bins of `x`.

    Parameters
    ----------
    x, y : array-like
        Paired observations of equal length.
    bins : int or sequence of float, optional
        Forwarded to `pd.cut` (bin edges, or a number of equal-width bins).
        When None (default), `x` is split into 10 quantile bins with
        `pd.qcut`.

    Returns
    -------
    pandas.DataFrame
        One row per bin, indexed by the bin interval, with columns
        `x` and `y` (per-bin means; NaN for a bin with no points) and
        `mid` (midpoint between the bin's two edges).
    """
    df = pd.DataFrame()
    df['x'] = x
    df['y'] = y

    if bins is None:
        df['xbin'], bins = pd.qcut(df['x'], 10, retbins=True)
    else:
        # Previously the user-supplied bins were never applied, so the
        # groupby below failed with KeyError('xbin').
        df['xbin'], bins = pd.cut(df['x'], bins, retbins=True)
    bins = np.asarray(bins, dtype=float)

    # observed=False keeps empty bins so that rows stay aligned with the
    # len(bins)-1 midpoints assigned below.
    df_mean = df.groupby('xbin', observed=False).mean()
    df_mean['mid'] = (bins[:-1] + bins[1:]) / 2
    return df_mean

In [None]:
%%time
# Precomputed differential-accessibility results; per the file names these hold
# log2 fold changes (fin1) and q-values (fin2).
fin1 = '/u/home/f/f7xiesnm/v1_multiome/DAR_l23abc_l2fc.npy'
fin2 = '/u/home/f/f7xiesnm/v1_multiome/DAR_l23abc_qs.npy'

In [None]:
# Both arrays are indexed below as [condition, comparison, peak].
l2fc_tensor = np.load(fin1)
qs_tensor = np.load(fin2)

In [None]:
# DAR calling thresholds: |log2 fold change| above log2(2) and q-value below 0.05.
l2fc_th, alpha_th = np.log2(2), 0.05

In [None]:
# Comparison 0 of the DAR tensors. Following the labels printed below,
# a negative log2 fold change is reported as "A > C" and a positive one as "C > A".
qs, l2fc = qs_tensor[:, 0, :], l2fc_tensor[:, 0, :]

# Boolean masks (condition x peak): significant and beyond the fold-change threshold.
cond_sig_a = (qs < alpha_th) & (l2fc < -l2fc_th)
cond_sig_c = (qs < alpha_th) & (l2fc > l2fc_th)

# Per-condition counts, followed by the number of peaks called in any condition.
print('num A > C DARs per cond:\t', cond_sig_a.sum(axis=1), cond_sig_a.any(axis=0).sum())
print('num C > A DARs per cond:\t', cond_sig_c.sum(axis=1), cond_sig_c.any(axis=0).sum())

# Histogram of "in how many conditions is a peak called"; the first entry
# (dropped when printing) is presumably the zero-conditions count.
instances, counts_a = np.unique(cond_sig_a.sum(axis=0), return_counts=True)
instances, counts_c = np.unique(cond_sig_c.sum(axis=0), return_counts=True)

print('num A > C DARs in num conds:\t', counts_a[1:])
print('num C > A DARs in num conds:\t', counts_c[1:])

In [None]:
# Comparisons 1 and 2 of the DAR tensors (named B-vs-A and B-vs-C here).
qs_ba, qs_bc = qs_tensor[:, 1, :], qs_tensor[:, 2, :]
l2fc_ba, l2fc_bc = l2fc_tensor[:, 1, :], l2fc_tensor[:, 2, :]

# Combined statistics: the smaller of the two q-values and the average fold change.
qs_b   = np.minimum(qs_ba, qs_bc)
l2fc_b = np.mean([l2fc_ba, l2fc_bc], axis=0)

# B-specific peaks: both comparisons agree in sign, the average fold change
# passes the threshold and the combined q-value is significant.
cond_sig_bp = (l2fc_ba > 0) & (l2fc_bc > 0) & (l2fc_b > l2fc_th) & (qs_b < alpha_th)
cond_sig_bn = (l2fc_ba < 0) & (l2fc_bc < 0) & (l2fc_b < -l2fc_th) & (qs_b < alpha_th)

print('num B > A,C per cond:\t', cond_sig_bp.sum(axis=1))
print('num B < A,C per cond:\t', cond_sig_bn.sum(axis=1))

# Histogram of the number of conditions in which each peak is called.
instances, counts_bp = np.unique(cond_sig_bp.sum(axis=0), return_counts=True)
instances, counts_bn = np.unique(cond_sig_bn.sum(axis=0), return_counts=True)

print('num B > A,C DARs in num conds:\t', counts_bp[1:])
print('num B < A,C DARs in num conds:\t', counts_bn[1:])

In [None]:
%%time

# Per-condition results, appended in the iteration order of `condcode2cond`:
#   all_x    - first column of the RNA embedding `pca_p17on` for each shared cell
#   all_y_*  - per-cell mean log2(CPM+1) over the A / C / B(+) DAR peaks of that condition
#   all_nums - [number of A, B(+), C peaks] for that condition
all_x = []
all_y_a = []
all_y_c = []
all_y_b = []
all_nums = []

offset = 1  # not used anywhere in this cell

for cond_order, cond in condcode2cond.items():
    # get sub
    # RNA cells belonging to this condition
    adatasub = adata[adata.obs['cond']==cond]
    cells_rna = adatasub.obs.index.values
    
    # get ATAC
    # the condition code is used as a list position into adatas_pk
    adata_pk = adatas_pk[cond_order]
    cells_atac = adata_pk.obs.index.values
    
    # cells present in both modalities
    cells_mult = np.intersect1d(cells_rna, cells_atac)
    
    # no normalization here
    x = adatasub[cells_mult].obsm['pca_p17on'][...,0]
    
    # per-cell total counts over all peaks; presumably shape (n_cells, 1) so it broadcasts row-wise below
    # NOTE(review): a cell with zero total counts would give a division by zero
    cellcov = np.array(adata_pk[cells_mult].X.sum(axis=1)) #.reshape(-1,)
    
    # TODO: peak size (500kb) fix
    # dense cell x peak sub-matrices restricted to this condition's A and C DARs
    mat_a = np.array(adata_pk[cells_mult, cond_sig_a[cond_order]].X.todense()) 
    mat_c = np.array(adata_pk[cells_mult, cond_sig_c[cond_order]].X.todense()) 
    
    # size norm (CPM) between cells
    mat_a = np.log2(mat_a/cellcov*1e6 + 1)
    mat_c = np.log2(mat_c/cellcov*1e6 + 1)
    
    # average over peaks -> one value per cell
    y_a = np.mean(mat_a, axis=1)
    y_c = np.mean(mat_c, axis=1)
    
    # B(+) peaks can be absent for a condition; use NaN placeholders in that case
    if cond_sig_bp[cond_order].sum() > 0:
        mat_b = np.array(adata_pk[cells_mult, cond_sig_bp[cond_order]].X.todense()) 
        mat_b = np.log2(mat_b/cellcov*1e6 + 1)
        y_b = np.mean(mat_b, axis=1) 
    else:
        y_b = np.array([np.nan]*len(y_a))
    
    all_x.append(x)
    all_y_a.append(y_a)
    all_y_c.append(y_c)
    all_y_b.append(y_b)
    all_nums.append([cond_sig_a[cond_order].sum(),
                     cond_sig_bp[cond_order].sum(),
                     cond_sig_c[cond_order].sum(),
                    ])
    

In [None]:
# One column per NR condition; rows show A, C and B peak accessibility against
# the RNA embedding coordinate, with the binned mean curve overlaid in black.
nx = len(nr_conditions)
fig, axs = plt.subplots(3,nx, figsize=(4*nx,4*3)) 
for i, cond_code in enumerate(nr_condcodes):
    cond = atac_utils.decode_cond(cond_code)
    x   = all_x[cond_code]
    y_a = all_y_a[cond_code]
    y_c = all_y_c[cond_code]
    y_b = all_y_b[cond_code]
    num_a, num_b, num_c = all_nums[cond_code]
    
    # row 0: A peaks
    ax = axs[0][i]
    ax.scatter(x, y_a, s=1, color='C0', label='A peaks', alpha=0.5)# , facecolor='none')
    r_a, p  = stats.spearmanr(x, y_a)
    means = mean_bin(x, y_a)
    ax.plot(means['x'], means['y'], color='k', )# , facecolor='none')
    ax.set_title(f'{cond}\nNa={num_a:,}, r={r_a:.2f}')
    sns.despine(ax=ax)
    ax.grid(False)

    # row 1: C peaks
    ax = axs[1][i]
    ax.scatter(x, y_c, s=1, color='C2', label='C peaks', alpha=0.5)# , facecolor='none')
    r_c, p  = stats.spearmanr(x, y_c)
    means = mean_bin(x, y_c)
    ax.plot(means['x'], means['y'], color='k', )# , facecolor='none')
    ax.set_title(f'Nc={num_c:,}, r={r_c:.2f}')
    sns.despine(ax=ax)
    ax.grid(False)
    
    # row 2: B peaks (y_b is all-NaN when the condition has no B peaks)
    ax = axs[2][i]
    ax.scatter(x, y_b, s=1, color='C1', label='B peaks', alpha=0.5)# , facecolor='none')
    r_b, p  = stats.spearmanr(x, y_b)
    means = mean_bin(x, y_b)
    ax.plot(means['x'], means['y'], color='k', )# , facecolor='none')
    ax.set_title(f'Nb={num_b:,}, r={r_b:.2f}')
    sns.despine(ax=ax)
    ax.grid(False)
    
    
    # nothing to show in the B panel: drop its ticks
    if np.all(np.isnan(y_b)):
        ax.set_xticks([])
        ax.set_yticks([])
    
# axis labels are only placed on the top-left panel
axs[0][0].set_xlabel('RNA PC1 (P17-21)')
axs[0][0].set_ylabel('ATAC peak accessibility')
fig.tight_layout()
plt.show()

In [None]:
# Same layout as the scatter grid, but only the binned mean curves are drawn
# and all panels share one y-axis.
nx = len(nr_conditions)
fig, axs = plt.subplots(3, nx, figsize=(4*nx, 4*3), sharey=True)
for i, cond_code in enumerate(nr_condcodes):
    cond = atac_utils.decode_cond(cond_code)
    x   = all_x[cond_code]
    y_a = all_y_a[cond_code]
    y_c = all_y_c[cond_code]
    y_b = all_y_b[cond_code]
    num_a, num_b, num_c = all_nums[cond_code]

    # (axes, y values, title prefix) for the A, C and B rows
    panels = [
        (axs[0][i], y_a, f'{cond}\nNa={num_a:,}'),
        (axs[1][i], y_c, f'Nc={num_c:,}'),
        (axs[2][i], y_b, f'Nb={num_b:,}'),
    ]
    for ax, y, title_head in panels:
        # Compute the correlation for THIS panel. The titles previously
        # printed r_a / r_c / r_b left over from the last iteration of an
        # earlier cell, so every column showed the same stale value.
        r, pval = stats.spearmanr(x, y)
        means = mean_bin(x, y)
        ax.plot(means['x'], means['y'], color='k')
        ax.set_title(f'{title_head}, r={r:.2f}')
        sns.despine(ax=ax)
        ax.grid(False)

    # nothing to show in the B panel when the condition has no B peaks
    if np.all(np.isnan(y_b)):
        axs[2][i].set_xticks([])
        axs[2][i].set_yticks([])

axs[0][0].set_xlabel('RNA PC1 (P17-21)')
axs[0][0].set_ylabel('ATAC peak accessibility')
fig.tight_layout()
plt.show()

In [None]:
# 6 x len(times) grid: rows 0/2/4 hold the NR panels for A/C/B peaks and
# rows 1/3/5 the DR panels, which start at column 3.
nx = len(times)
fig, axs = plt.subplots(6, nx, figsize=(4*nx, 4*6))

# (condition codes, row offset within each row pair, first column)
groups = ((nr_condcodes, 0, 0), (dr_condcodes, 1, 3))
for condcodes, row_shift, col_shift in groups:
    for i, cond_code in enumerate(condcodes):
        cond = atac_utils.decode_cond(cond_code)
        x = all_x[cond_code]
        num_a, num_b, num_c = all_nums[cond_code]

        # (row pair, y values, colour, legend label, title prefix)
        panels = (
            (0, all_y_a[cond_code], 'C0', 'A peaks', f'{cond}\nNa={num_a:,}'),
            (1, all_y_c[cond_code], 'C2', 'C peaks', f'Nc={num_c:,}'),
            (2, all_y_b[cond_code], 'C1', 'B peaks', f'Nb={num_b:,}'),
        )
        for pair, y, color, label, title_head in panels:
            ax = axs[2*pair + row_shift][i + col_shift]
            ax.scatter(x, y, s=1, color=color, label=label, alpha=0.5)
            r, p = stats.spearmanr(x, y)
            means = mean_bin(x, y)
            ax.plot(means['x'], means['y'], color='k')
            ax.set_title(f'{title_head}, r={r:.2f}')
            sns.despine(ax=ax)
            ax.grid(False)

# the first three columns of the DR rows have no data: hide them
for dr_row in (1, 3, 5):
    for ax in axs[dr_row][:3]:
        ax.axis('off')

axs[0][0].set_xlabel('RNA PC1 (P17-21)')
axs[0][0].set_ylabel('ATAC peak accessibility')
fig.tight_layout()
plt.show()

In [None]:
# Number of A / C / B DARs per NR time point, one panel per peak class.
fig, axs = plt.subplots(1, 3, figsize=(3*4, 1*4), sharex=True)

# peaks called in at least one condition
num_uniq_a = np.any(cond_sig_a,  axis=0).sum()
num_uniq_b = np.any(cond_sig_bp, axis=0).sum()
num_uniq_c = np.any(cond_sig_c,  axis=0).sum()

# (mask, colour, legend label, y-limits) for each panel, left to right
panels = (
    (cond_sig_a,  'C0', f'A DARs (unique n = {num_uniq_a:,})', (-500, 12500)),
    (cond_sig_c,  'C2', f'C DARs (unique n = {num_uniq_c:,})', (-500, 12500)),
    (cond_sig_bp, 'C1', f'B DARs (unique n = {num_uniq_b:,})', (-50, 2000)),
)
for ax, (cond_sig_, color, label, (y_lo, y_hi)) in zip(axs, panels):
    ax.plot(times, cond_sig_.sum(axis=1)[nr_condcodes],
            '-o', color=color, fillstyle='none', label=label)
    # flat zero line as the "random" reference
    ax.plot(times, [0]*len(times), '--o', color='gray', fillstyle='none',
            label='random DARs (n=0)', zorder=0)
    ax.set_ylim(ymin=y_lo, ymax=y_hi)
    ax.set_xticks(times)
    ax.set_xticklabels(times)
    ax.grid(False)
    sns.despine(ax=ax)
axs[0].set_ylabel('number of DARs')

fig.tight_layout()
plt.show()

In [None]:
# Number of A / C / B DARs per time point: NR (solid, circles) and
# DR (dash-dot, squares) in the same panel.
fig, axs = plt.subplots(1, 3, figsize=(3*4, 1*4))

# peaks called in at least one condition
num_uniq_a = np.any(cond_sig_a,  axis=0).sum()
num_uniq_b = np.any(cond_sig_bp, axis=0).sum()
num_uniq_c = np.any(cond_sig_c,  axis=0).sum()

# (mask, colour, legend label, y-limits) for each panel, left to right
panels = (
    (cond_sig_a,  'C0', f'A DARs (unique n = {num_uniq_a:,})', (-500, 12500)),
    (cond_sig_c,  'C2', f'C DARs (unique n = {num_uniq_c:,})', (-500, 12500)),
    (cond_sig_bp, 'C1', f'B DARs (unique n = {num_uniq_b:,})', (-50, 2000)),
)
for ax, (cond_sig_, color, label, (y_lo, y_hi)) in zip(axs, panels):
    n_per_cond = cond_sig_.sum(axis=1)
    ax.plot(times, n_per_cond[nr_condcodes],
            '-o', color=color, fillstyle='none', label=label)
    ax.plot(times, [0]*len(times), '--o', color='gray', fillstyle='none',
            label='random DARs (n=0)', zorder=0)
    ax.plot(dr_times, n_per_cond[dr_condcodes],
            '-.s', color=color, fillstyle='none', label=label)
    ax.plot(dr_times, [0]*len(dr_times), '-.s', color='gray', fillstyle='none',
            label='random DARs (n=0)', zorder=0)
    ax.set_ylim(ymin=y_lo, ymax=y_hi)
    ax.set_xticks(times)
    ax.set_xticklabels(times)
    ax.grid(False)
    sns.despine(ax=ax)
axs[0].set_ylabel('number of DARs')

fig.tight_layout()
plt.show()

# get the full matrix

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Precomputed tensor (CPM per the file name); indexed below as [condition, type, peak].
fin3 = '/u/home/f/f7xiesnm/v1_multiome/DAR_l23abc_cpm_tensor.npy'
atac_tensor = np.load(fin3)

In [None]:
# Peaks called A, C or B(+) in at least one condition.
cond_sig_selected = (cond_sig_a | cond_sig_c | cond_sig_bp).any(axis=0)
n_dar = cond_sig_selected.sum()

# Flatten (condition, type) into one condition-major column axis and put
# peaks on the rows; values are log2(1 + tensor value).
atac_tensor_sel = atac_tensor[:, :, cond_sig_selected]
mat = np.log2(atac_tensor_sel.reshape(n_cond*n_type, n_dar).T + 1)

In [None]:
# (number of selected peaks, n_cond * n_type)
mat.shape

In [None]:
n_peakset_clsts = 30
# Fixed seed and n_init, matching the other KMeans calls in this notebook,
# so that cluster assignments are reproducible across runs.
method = KMeans(n_clusters=n_peakset_clsts, n_init=10, random_state=0)
peakset_clst = method.fit_predict(mat)

In [None]:
# One row per cluster: the cluster's mean column profile, z-scored across columns.
mat_mean = np.empty((n_peakset_clsts, mat.shape[1]))
for i in range(n_peakset_clsts):
    mean_val = mat[peakset_clst == i].mean(axis=0)
    z_mean = (mean_val - mean_val.mean()) / mean_val.std()
    mat_mean[i] = z_mean

In [None]:
# (number of clusters, n_cond * n_type)
mat_mean.shape

In [None]:
# Cluster-level heatmap of A, C and B peaks together; rows are hierarchically
# clustered, columns keep their (condition, type) order.
g = sns.clustermap(
    mat_mean,
    col_cluster=False,
    cmap='coolwarm',
    figsize=(8,5),
    xticklabels=False,
    yticklabels=False,
)

ax = g.ax_heatmap
# label the type columns of the first condition block only
ax.set_xticks(np.arange(n_type) + 0.5)
ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=10)

# mark where each condition block starts and name it
for i in range(n_cond):
    x_left = i*n_type
    ax.axvline(x_left, color='k', linestyle='--', linewidth=1)
    ax.text(x_left, 0, f'{sample_conditions[i]}')

# ABC peaks

In [None]:
# One cluster-level heatmap per peak class, over all conditions.
n_peakset_clsts = 10
region_sets = (('A regions', cond_sig_a), ('C regions', cond_sig_c), ('B regions', cond_sig_bp))
for title, cond_sig_ in region_sets:

    # peaks of this class called in at least one condition
    cond_sig_selected = cond_sig_.any(axis=0)
    n_dar = cond_sig_selected.sum()
    atac_tensor_sel = atac_tensor[:, :, cond_sig_selected]
    # rows: peaks; columns: (condition, type) flattened condition-major
    mat = np.log2(atac_tensor_sel.reshape(n_cond*n_type, n_dar).T + 1)

    method = KMeans(n_clusters=n_peakset_clsts, n_init=10, random_state=0)
    peakset_clst = method.fit_predict(mat)

    # per-cluster mean profile, z-scored across columns
    mat_mean = np.zeros((n_peakset_clsts, mat.shape[1]))
    for i in range(n_peakset_clsts):
        mean_val = mat[peakset_clst == i].mean(axis=0)
        mat_mean[i] = (mean_val - mean_val.mean()) / mean_val.std()

    g = sns.clustermap(
        mat_mean,
        col_cluster=False,
        cmap='coolwarm',
        figsize=(8,3),
        xticklabels=False,
        yticklabels=False,
    )

    ax = g.ax_heatmap
    ax.set_xticks(np.arange(n_type) + 0.5)
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=10)
    ax.set_title(title, pad=30)

    # mark where each condition block starts and name it
    for i in range(n_cond):
        x_left = i*n_type
        ax.axvline(x_left, color='k', linestyle='--', linewidth=1)
        ax.text(x_left, -0.5, f'{sample_conditions[i]}', fontsize=10, va='bottom')

# ABC peaks (NR only)

In [None]:
# Number of NR conditions (used to reshape the NR-only tensor below).
n_cond_nr = len(nr_condcodes)
nr_condcodes

In [None]:
# One cluster-level heatmap per peak class, restricted to the NR conditions.
n_peakset_clsts = 10
region_sets = (('A regions', cond_sig_a), ('C regions', cond_sig_c), ('B regions', cond_sig_bp))
for title, cond_sig_ in region_sets:

    # peaks of this class called in at least one condition (NR or DR)
    cond_sig_selected = cond_sig_.any(axis=0)
    n_dar = cond_sig_selected.sum()
    # keep the selected peaks, then only the NR conditions along the first axis
    atac_tensor_sel = atac_tensor[:, :, cond_sig_selected][nr_condcodes]
    mat = np.log2(atac_tensor_sel.reshape(n_cond_nr*n_type, n_dar).T + 1)

    method = KMeans(n_clusters=n_peakset_clsts, n_init=10, random_state=0)
    peakset_clst = method.fit_predict(mat)

    # per-cluster mean profile, z-scored across columns
    mat_mean = np.zeros((n_peakset_clsts, mat.shape[1]))
    for i in range(n_peakset_clsts):
        mean_val = mat[peakset_clst == i].mean(axis=0)
        mat_mean[i] = (mean_val - mean_val.mean()) / mean_val.std()

    g = sns.clustermap(
        mat_mean,
        col_cluster=False,
        cmap='coolwarm',
        figsize=(8,3),
        xticklabels=False,
        yticklabels=False,
    )

    ax = g.ax_heatmap
    ax.set_xticks(np.arange(n_type) + 0.5)
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=10)
    ax.set_title(title, pad=30)

    # mark where each NR condition block starts and name it
    for i in range(n_cond_nr):
        x_left = i*n_type
        ax.axvline(x_left, color='k', linestyle='--', linewidth=1)
        ax.text(x_left, -0.5, f'{sample_conditions[nr_condcodes][i]}', fontsize=10, va='bottom')

# ABC peaks (all - reordered)

In [None]:
# Same definition as in the NR-only section above, repeated here.
n_cond_nr = len(nr_condcodes)
nr_condcodes

In [None]:
# Condition codes reordered so that all NR conditions come first, followed by all DR conditions.
nrdr_condcodes = np.hstack([nr_condcodes, dr_condcodes])
nrdr_condcodes

In [None]:
# One cluster-level heatmap per peak class, with conditions reordered as
# all NR followed by all DR.
n_peakset_clsts = 10
region_sets = (('A regions', cond_sig_a), ('C regions', cond_sig_c), ('B regions', cond_sig_bp))
for title, cond_sig_ in region_sets:

    # peaks of this class called in at least one condition
    cond_sig_selected = cond_sig_.any(axis=0)
    n_dar = cond_sig_selected.sum()
    # keep the selected peaks, then reorder the condition axis
    atac_tensor_sel = atac_tensor[:, :, cond_sig_selected][nrdr_condcodes]
    mat = np.log2(atac_tensor_sel.reshape(n_cond*n_type, n_dar).T + 1)

    method = KMeans(n_clusters=n_peakset_clsts, n_init=10, random_state=0)
    peakset_clst = method.fit_predict(mat)

    # per-cluster mean profile, z-scored across columns
    mat_mean = np.zeros((n_peakset_clsts, mat.shape[1]))
    for i in range(n_peakset_clsts):
        mean_val = mat[peakset_clst == i].mean(axis=0)
        mat_mean[i] = (mean_val - mean_val.mean()) / mean_val.std()

    g = sns.clustermap(
        mat_mean,
        col_cluster=False,
        cmap='coolwarm',
        figsize=(8,3),
        xticklabels=False,
        yticklabels=False,
    )

    ax = g.ax_heatmap
    ax.set_xticks(np.arange(n_type) + 0.5)
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=10)
    ax.set_title(title, pad=30)

    # mark where each condition block starts and name it
    for i in range(n_cond):
        x_left = i*n_type
        ax.axvline(x_left, color='k', linestyle='--', linewidth=1)
        ax.text(x_left, -0.5, f'{sample_conditions[nrdr_condcodes][i]}', fontsize=10, va='bottom')

    # NRDR boundary
    ax.axvline(3*n_type, ymin=1, ymax=1.05, color='k', linestyle='-', linewidth=1.5, clip_on=False)
    ax.axvline(7*n_type, ymin=0, ymax=1.05, color='k', linestyle='-', linewidth=1.5, clip_on=False)

# Save a peak list
- A
- Bp
- C 

In [None]:
# Split each peak name on ':' or '-' into separate columns (one row per peak).
# The peak names are taken from the first per-condition matrix explicitly,
# instead of from whichever `adata_pk` an earlier loop happened to leave
# behind; all per-condition matrices are assumed to share the same peak
# columns, as the condition x peak masks above are applied to each of them.
peakset = pd.DataFrame(list(adatas_pk[0].var.index.str.split(pat=':|-'))) #
peakset

In [None]:
# Rows of `peakset` called A, B(+) or C in at least one condition.
peakset_a, peakset_b, peakset_c = (
    peakset[mask.any(axis=0)] for mask in (cond_sig_a, cond_sig_bp, cond_sig_c)
)
print(peakset_a.shape, peakset_b.shape, peakset_c.shape)
peakset_a

In [None]:
# Output files, one per peak class (the B file holds the B(+) peaks, i.e. cond_sig_bp).
f_a = "/u/home/f/f7xiesnm/v1_multiome/atac_fragments/frag_bed/processed_peaks/consensus_peaks_v2_DAR_A.bed" 
f_b = "/u/home/f/f7xiesnm/v1_multiome/atac_fragments/frag_bed/processed_peaks/consensus_peaks_v2_DAR_B.bed" 
f_c = "/u/home/f/f7xiesnm/v1_multiome/atac_fragments/frag_bed/processed_peaks/consensus_peaks_v2_DAR_C.bed" 

# Tab-separated, no header and no index column; existing files are overwritten.
peakset_a.to_csv(f_a, sep='\t', header=False, index=False)
peakset_b.to_csv(f_b, sep='\t', header=False, index=False)
peakset_c.to_csv(f_c, sep='\t', header=False, index=False)

# Cluster individual peaks and show cluster sizes

In [None]:
# Reminder of the condition code -> condition label mapping.
condcode2cond

In [None]:
# Collects one result dict per peak class; the next three cells append A, B and C in that order.
# Re-running this cell resets the list.
res = []

In [None]:
# cond_sig_selected = np.any(np.vstack([cond_sig_a, cond_sig_c, cond_sig_bp]), axis=0)

# Peaks called "A" in at least one condition.
cond_sig_selected = np.any(cond_sig_a, axis=0)
n_dar = cond_sig_selected.sum()
atac_tensor_sel = atac_tensor[:,:,cond_sig_selected]
# rows: peaks; columns: (condition, type) flattened condition-major
mat = np.log2(1+atac_tensor_sel.reshape(n_cond*n_type, n_dar).T)
fmat = mat - np.mean(mat, axis=1).reshape(-1,1)   # row-centred log2 signal
zmat = fmat / np.std(mat, axis=1).reshape(-1,1)    # row-wise z-score

n_peakset_clsts = 5
method = KMeans(n_clusters=n_peakset_clsts, n_init=10, random_state=0)
peakset_clst = method.fit_predict(zmat)

# Cluster centroids unflattened to (cluster, condition, type), NR conditions only.
# The columns are condition-major, so the condition axis must come before the
# type axis; the previous reshape(-1,5,11) swapped the two.
zmat_ctrds = np.array([np.mean(zmat[peakset_clst == i], axis=0)
                       for i in range(n_peakset_clsts)])
zmat_ctrds = zmat_ctrds.reshape(-1, n_cond, n_type)[:, nr_condcodes, :]

# Hand-picked display order of the KMeans labels (tied to random_state=0 above).
clst_order = [2,1,3,4,0]
# clst_order = np.argsort(np.argmax(np.mean(zmat_ctrds, axis=2), axis=1)) # .shape
assert sorted(clst_order) == list(range(n_peakset_clsts)), 'clst_order must be a permutation of the cluster labels'
# old KMeans label -> rank in clst_order, applied to every peak
peakset_clst_renamed = pd.Series({clst: i for i, clst in enumerate(clst_order)}).reindex(peakset_clst).values
peakset_order = np.argsort(peakset_clst_renamed)

res.append({
    'title': 'A regions',
    'zmat': zmat,
    'fmat': fmat,
    'nclst': n_peakset_clsts,
    'clst': peakset_clst_renamed,
    'order': peakset_order,
})

In [None]:
# Peaks called "B(+)" in at least one condition.
cond_sig_selected = np.any(cond_sig_bp, axis=0)
n_dar = cond_sig_selected.sum()
atac_tensor_sel = atac_tensor[:,:,cond_sig_selected]
# rows: peaks; columns: (condition, type) flattened condition-major
mat = np.log2(1+atac_tensor_sel.reshape(n_cond*n_type, n_dar).T)
fmat = mat - np.mean(mat, axis=1).reshape(-1,1)   # row-centred log2 signal
zmat = fmat / np.std(mat, axis=1).reshape(-1,1)    # row-wise z-score

n_peakset_clsts = 3
method = KMeans(n_clusters=n_peakset_clsts, n_init=10, random_state=0)
peakset_clst = method.fit_predict(zmat)

# Cluster centroids unflattened to (cluster, condition, type), NR conditions only.
# The columns are condition-major, so the condition axis must come before the
# type axis; the previous reshape(-1,5,11) swapped the two.
zmat_ctrds = np.array([np.mean(zmat[peakset_clst == i], axis=0)
                       for i in range(n_peakset_clsts)])
zmat_ctrds = zmat_ctrds.reshape(-1, n_cond, n_type)[:, nr_condcodes, :]

# Hand-picked display order of the KMeans labels (tied to random_state=0 above).
clst_order = [0,2,1]
# clst_order = np.argsort(np.argmax(np.mean(zmat_ctrds, axis=2), axis=1)) # .shape
assert sorted(clst_order) == list(range(n_peakset_clsts)), 'clst_order must be a permutation of the cluster labels'
# old KMeans label -> rank in clst_order, applied to every peak
peakset_clst_renamed = pd.Series({clst: i for i, clst in enumerate(clst_order)}).reindex(peakset_clst).values
peakset_order = np.argsort(peakset_clst_renamed)

res.append({
    'title': 'B regions',
    'zmat': zmat,
    'fmat': fmat,
    'nclst': n_peakset_clsts,
    'clst': peakset_clst_renamed,
    'order': peakset_order,
})

In [None]:
# Peaks called "C" in at least one condition.
cond_sig_selected = np.any(cond_sig_c, axis=0)
n_dar = cond_sig_selected.sum()
atac_tensor_sel = atac_tensor[:,:,cond_sig_selected]
# rows: peaks; columns: (condition, type) flattened condition-major
mat = np.log2(1+atac_tensor_sel.reshape(n_cond*n_type, n_dar).T)
fmat = mat - np.mean(mat, axis=1).reshape(-1,1)   # row-centred log2 signal
zmat = fmat / np.std(mat, axis=1).reshape(-1,1)    # row-wise z-score

n_peakset_clsts = 5
method = KMeans(n_clusters=n_peakset_clsts, n_init=10, random_state=0)
peakset_clst = method.fit_predict(zmat)

# Cluster centroids unflattened to (cluster, condition, type), NR conditions only.
# The columns are condition-major, so the condition axis must come before the
# type axis; the previous reshape(-1,5,11) swapped the two.
zmat_ctrds = np.array([np.mean(zmat[peakset_clst == i], axis=0)
                       for i in range(n_peakset_clsts)])
zmat_ctrds = zmat_ctrds.reshape(-1, n_cond, n_type)[:, nr_condcodes, :]

# Hand-picked display order of the KMeans labels (tied to random_state=0 above).
clst_order = [1,2,0,4,3]
# clst_order = np.argsort(np.argmax(np.mean(zmat_ctrds, axis=2), axis=1)) # .shape
assert sorted(clst_order) == list(range(n_peakset_clsts)), 'clst_order must be a permutation of the cluster labels'
# old KMeans label -> rank in clst_order, applied to every peak
peakset_clst_renamed = pd.Series({clst: i for i, clst in enumerate(clst_order)}).reindex(peakset_clst).values
peakset_order = np.argsort(peakset_clst_renamed)

res.append({
    'title': 'C regions',
    'zmat': zmat,
    'fmat': fmat,
    'nclst': n_peakset_clsts,
    'clst': peakset_clst_renamed,
    'order': peakset_order,
})

In [None]:
# Per-peak z-score heatmap for each peak class, rows sorted by cluster.
for res_this in res:
    title, zmat, order = (res_this[key] for key in ('title', 'zmat', 'order'))

    fig, ax = plt.subplots(figsize=(15,8))
    sns.heatmap(
        zmat[order],
        ax=ax,
        cmap='coolwarm',
        vmin=-3,
        vmax=3,
        yticklabels=2000,
        cbar_kws={'shrink': 0.5},
    )
    # label the type columns of the first condition block only
    ax.set_xticks(np.arange(n_type) + 0.5)
    ax.set_xticklabels(['A', '<-', '-', '->', 'C'], fontsize=10, rotation=0)
    ax.set_title(title, pad=30)

    # mark where each condition block starts and name it
    for condcode, cond in condcode2cond.items():
        x_left = condcode*n_type
        ax.axvline(x_left, color='k', linestyle='--', linewidth=1)
        ax.text(x_left, -0.5, f'{cond}', fontsize=10, va='bottom')

    plt.show()


In [None]:
# Number of peaks in each (renamed) cluster, per peak class.
for res_this in res:
    title, zmat, order, clsts = (
        res_this[key] for key in ('title', 'zmat', 'order', 'clst')
    )
    # np.unique returns (values, counts); only the counts are printed
    print(np.unique(clsts, return_counts=True)[1])

# Profile these modules

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop gradients from black to the colour used for A, B and C peaks.
colors_a = [(0.0, 'black'), (1.0, 'C0')]
colors_b = [(0.0, 'black'), (1.0, 'C1')]
colors_c = [(0.0, 'black'), (1.0, 'C2')]

cmap_a = LinearSegmentedColormap.from_list('cmap_a', colors_a)
cmap_b = LinearSegmentedColormap.from_list('cmap_b', colors_b)
cmap_c = LinearSegmentedColormap.from_list('cmap_c', colors_c)

# RGBA at the bright end of each colormap
rgba_a, rgba_b, rgba_c = (np.array(cmap(1.0)) for cmap in (cmap_a, cmap_b, cmap_c))

# Five colours for the five type columns: pure A, A/B blend, pure B,
# B/C blend, pure C.
colors_l23 = [
    rgba_a,
    0.7*rgba_a + 0.3*rgba_b,
    rgba_b,
    0.7*rgba_b + 0.3*rgba_c,
    rgba_c,
]

In [None]:
# Time points as arrays (these rebind the list versions defined earlier).
times, dr_times = np.array([6, 8, 10, 12, 14, 17, 21]), np.array([12, 14, 17, 21])

In [None]:
# res_this = res[0]

# Mean row-centred log2 profile of every peak module over the NR time course,
# one line per type column. All figures use the same number of panels (the
# largest module count) so they line up; panels beyond a class's own module
# count are hidden instead of averaging an empty selection (which produced
# NaN curves, RuntimeWarnings and bogus "0.0%" titles).
n_panels = max((r['nclst'] for r in res), default=1)

for res_this in res:
    title = res_this['title']
    fmat  = res_this['fmat']
    order = res_this['order']
    clsts = res_this['clst']
    nclst = res_this['nclst']

    fig, axs = plt.subplots(1, n_panels, figsize=(n_panels*3, 1*3),
                            sharex=True, sharey=True, squeeze=False)
    axs = axs[0]
    fig.suptitle(f'{title} n={len(fmat):,}', y=1.08, fontsize=18)
    axs[0].set_ylabel('fold change')
    for i, ax in enumerate(axs):
        if i >= nclst:
            # this peak class has fewer modules than panels
            ax.axis('off')
            continue

        prop = (clsts == i).sum() / len(fmat)
        # columns are condition-major: unflatten to (condition, type), keep NR rows
        y = np.mean(fmat[clsts==i], axis=0).reshape(n_cond, n_type)[nr_condcodes]
        for ycol, color in zip(y.T, colors_l23):
            ax.plot(times, ycol, '-', color=color)
        ax.set_xticks(times)

        sns.despine(ax=ax)
        ax.grid(False, axis='x')
        ax.set_title(f'M{i+1}: {prop*100: .1f}%')

# add DR visuals

In [None]:
# Same module profiles as above, with the DR time course drawn to the right
# of the NR one (DR time points are shifted by +10 on the x-axis).
# Panels beyond a class's own module count are hidden instead of averaging an
# empty selection, and dr_times is converted explicitly so the shift also
# works when it is still a plain list.
n_panels = max((r['nclst'] for r in res), default=1)
dr_times_shifted = np.asarray(dr_times) + 10

for res_this in res:
    title = res_this['title']
    fmat  = res_this['fmat']
    order = res_this['order']
    clsts = res_this['clst']
    nclst = res_this['nclst']

    fig, axs = plt.subplots(1, n_panels, figsize=((n_panels+1)*3, 1*3),
                            sharex=True, sharey=True, squeeze=False)
    axs = axs[0]
    fig.suptitle(f'{title} n={len(fmat):,}', y=1.08, fontsize=18)
    axs[0].set_ylabel('fold change')
    for i, ax in enumerate(axs):
        if i >= nclst:
            # this peak class has fewer modules than panels
            ax.axis('off')
            continue

        prop = (clsts == i).sum() / len(fmat)
        # columns are condition-major: unflatten to (condition, type)
        profile = np.mean(fmat[clsts==i], axis=0).reshape(n_cond, n_type)

        ax.set_xticks([6,10,14,17,21])
        for ycol, color in zip(profile[nr_condcodes].T, colors_l23):
            ax.plot(nr_times, ycol, '-', color=color)

        for ycol, color in zip(profile[dr_condcodes].T, colors_l23):
            ax.plot(dr_times_shifted, ycol, '-', color=color)

        # dashed markers at x=12 and at x=22 (= 12 after the +10 shift)
        ax.axvline(12, color='k', linestyle='--', linewidth=1.5)
        ax.axvline(22, color='k', linestyle='--', linewidth=1.5)
        sns.despine(ax=ax)
        ax.grid(False, axis='x')
        ax.set_title(f'M{i+1}: {prop*100: .1f}%')

In [None]:
    
# for i in range(5):
#     fout = f'~/v1_multiome/L23A_regions_M{i+1}.bed'
#     peakset[np.any(cond_sig_a, axis=0)][res[0]['clst']==i].to_csv(fout, sep='\t', header=False, index=False)
    
# for i in range(3):
#     fout = f'~/v1_multiome/L23B_regions_M{i+1}.bed'
#     peakset[np.any(cond_sig_bp, axis=0)][res[1]['clst']==i].to_csv(fout, sep='\t', header=False, index=False)
    
# for i in range(5):
#     fout = f'~/v1_multiome/L23C_regions_M{i+1}.bed'
#     peakset[np.any(cond_sig_c, axis=0)][res[2]['clst']==i].to_csv(fout, sep='\t', header=False, index=False)