In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import collections
from natsort import natsorted

from scipy import stats
from scipy import sparse
from sklearn.decomposition import PCA
from umap import UMAP

from matplotlib.colors import LinearSegmentedColormap

from scroutines.config_plots import *
from scroutines import powerplots # .config_plots import *
from scroutines import pnmf
from scroutines import basicu
from scroutines.gene_modules import GeneModules  

from atac_utils import merge_peaks

In [None]:
# Output directory for figures.
# NOTE(review): not referenced by any of the cells shown below — confirm it is still needed.
outdir_fig = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/figures"

In [None]:
# Time points in the order they are listed; each maps to its position (0-based).
cond_order_dict = {
    cond: rank
    for rank, cond in enumerate(['P6', 'P8', 'P10', 'P12', 'P14', 'P17', 'P21'])
}
# Array of the time-point labels, in dictionary (insertion) order.
unq_conds = np.array(list(cond_order_dict))
unq_conds

# gene annotation

In [None]:
# TSS annotation table: tab-separated, no header row (columns are integer-labelled).
# Column 3 becomes the index so that rows can be selected by its values
# (gene names are used for the lookup further down).
f = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/gencode.vM25.TSS.bed'
dfg = pd.read_csv(f, sep='\t', header=None).set_index(3)
dfg

# DEGs

In [None]:
# %%time

def rename_genes(g):
    """Translate a gene symbol to the name used downstream.

    Only ``"March1"`` is remapped (to ``"Marchf1"``); any other value is
    returned unchanged.
    """
    return "Marchf1" if g == "March1" else g

# `anndata.read` is a deprecated alias of `anndata.read_h5ad`; call the supported reader directly.
# backed='r' opens the .h5ad file in read-only backed mode.
adata_rna = anndata.read_h5ad("../../data/v1_multiome/L23_allmultiome_proc_P6toP21.h5ad", backed='r')
# Gene names from the RNA object, with legacy symbols remapped by `rename_genes`.
genes = adata_rna.var.index.values
genes = np.array([rename_genes(g) for g in genes])
adata_rna

In [None]:
# %%time
# Plain-text matrices loaded with np.loadtxt. They are indexed below as [time point][gene]
# (q-values and log2 fold changes) — presumably one row per time point; TODO confirm.
f1 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/rna_qs_avc_p6to21.txt'
f2 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/rna_l2fc_avc_p6to21.txt'

rna_qs_avc, rna_l2fc_avc = (np.loadtxt(path) for path in (f1, f2))
rna_qs_avc.shape, rna_l2fc_avc.shape

In [None]:
# Collect, per time point, the annotation records of the A-vs-C differentially
# expressed genes (q < 0.05 and |log2FC| > 1), split by the sign of the fold change.
def _deg_bed_records(is_deg, label):
    """Return the `dfg` rows of the genes selected by `is_deg`, tagged with `label`.

    Genes are sorted by name before the lookup; the result keeps columns
    0, 1, 2, 3 (the former index of `dfg`) and 4 (overwritten with `label`).
    """
    selected = np.sort(genes[is_deg])
    records = dfg.loc[selected].reset_index()
    records[4] = label
    return records[[0, 1, 2, 3, 4]]


degs_a = []
degs_c = []
for i, t in enumerate(unq_conds):
    significant = rna_qs_avc[i] < 0.05
    l2fc = rna_l2fc_avc[i]

    # negative fold change -> "A" list, positive fold change -> "C" list
    degs_a.append(_deg_bed_records(significant & (l2fc < -1), f"{t}_A"))
    degs_c.append(_deg_bed_records(significant & (l2fc > 1), f"{t}_C"))
    

In [None]:
df_agenes = pd.concat(degs_a).groupby(3).agg({4: ', '.join, 
                                  0: 'first',
                                  1: 'first',
                                  2: 'first',
                                 }).reset_index()
df_agenes['chrom_order'] = pd.Categorical(df_agenes[0], categories=natsorted(df_agenes[0].unique()), ordered=True)
df_agenes = df_agenes.sort_values(['chrom_order',1,2])[[0,1,2,3,4]]
df_agenes

In [None]:
df_agenes[0].unique()

In [None]:
df_cgenes = pd.concat(degs_c).groupby(3).agg({4: ', '.join, 
                                  0: 'first',
                                  1: 'first',
                                  2: 'first',
                                 }).reset_index()
df_cgenes['chrom_order'] = pd.Categorical(df_cgenes[0], categories=natsorted(df_cgenes[0].unique()), ordered=True)
df_cgenes = df_cgenes.sort_values(['chrom_order',1,2])[[0,1,2,3,4]]

In [None]:
df_cgenes[0].unique()

In [None]:
df_ac_genes = pd.concat(degs_a+degs_c).groupby(3).agg({4: ', '.join, 
                                  0: 'first',
                                  1: 'first',
                                  2: 'first',
                                 }).reset_index()
df_ac_genes['chrom_order'] = pd.Categorical(df_ac_genes[0], categories=natsorted(df_ac_genes[0].unique()), ordered=True)
df_ac_genes = df_ac_genes.sort_values(['chrom_order',1,2])[[0,1,2,3,4]]

In [None]:
df_ac_genes[0].unique()

In [None]:
# Row counts of the combined, A and C gene tables (one row per gene in each).
len(df_ac_genes), len(df_agenes), len(df_cgenes)

In [None]:
# Write the three gene tables as tab-separated files without header or index.
bed_tables = {
    "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/all_a_genes_unique.bed": df_agenes,
    "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/all_c_genes_unique.bed": df_cgenes,
    "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/all_ac_genes_unique.bed": df_ac_genes,
}
for f, table in bed_tables.items():
    table.to_csv(f, sep='\t', header=False, index=False)

# associate peaks with genes by genomic distance

In [None]:
import subprocess

wkdir = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/" 
f_g = "all_ac_genes_unique.bed"
f_p = "all_AvsC_peaks_unique.bed"
f_out = wkdir+"all_ac_peaks_to_ac_genes.bed"

# For every record of the peak file (-a), report the closest record of the gene
# file (-b); -d appends the distance between the two as an extra last column.
# Input paths are relative and resolved against `wkdir` (passed as cwd).
# NOTE(review): `bedtools closest` expects both inputs sorted by chromosome and
# start position, with the same chromosome order in both files — confirm.
cmd = ['bedtools', 'closest', '-d', '-a', f_p, '-b', f_g]
with open(f_out, 'w') as fh:
    # check=True: raise CalledProcessError on a non-zero exit status instead of
    # silently leaving an empty or partial output file for the next cell to read.
    subprocess.run(cmd, cwd=wkdir, stdout=fh, check=True)

# check resulting distance

In [None]:
# Load the bedtools output written in the previous step.
# The file has no header row, so columns are integer-labelled.
df_res = pd.read_csv(f_out, sep='\t', header=None)
df_res

In [None]:
# Histogram of column 9 on a log10(1 + x) scale.
# Column 9 is presumably the distance appended by `bedtools closest -d` — TODO confirm column layout.
sns.histplot(np.log10(1+df_res[9]))

In [None]:
# Keep the rows whose value in column 9 (distance to the closest gene) is below 1e6.
# bedtools reports a distance of -1 when it finds no feature for a record (e.g. no
# gene on that chromosome); `< 1e6` alone would keep those rows, so they are
# excluded explicitly with the `>= 0` guard.
# NOTE: the mask used later to subset the peak tensor must select the same rows.
df_res2 = df_res[(df_res[9] >= 0) & (df_res[9] < 1e6)].copy()
df_res2

In [None]:
# The 20 most frequent values of column 7, with their row counts.
# Column 7 is the closest-gene name (it is matched against the gene list further down),
# so this lists the genes with the most assigned peaks.
df_res2.groupby(7).size().sort_values(ascending=False).head(20)

# present it
- peaks and genes - over time and type

show these matrices side-by-side
- gene by (type & time) (zscore)
- peak (average over the same gene) by (type & time) (zscore)

In [None]:
# Load the precomputed peak tensor (4-D; its last axis is subset with a per-row mask of
# `df_res` in the next cell, so it is presumably ordered like the bedtools output — TODO confirm).
f = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/all_AvsC_peak_tensor.npy'
tensor_peak = np.load(f)
tensor_peak.shape

In [None]:
# Subset the last axis of the peak tensor to the peaks retained in `df_res2`.
# The mask is derived from `df_res2` itself (rather than re-applying the distance
# threshold) so that the tensor columns and the rows of `df_res2` cannot drift apart;
# the peak-by-gene matrix built below relies on that one-to-one alignment.
assert tensor_peak.shape[-1] == len(df_res), (
    "peak tensor and bedtools output must describe the same number of peaks"
)
cond_peak = df_res.index.isin(df_res2.index)
tensor_peak_g = tensor_peak[:,:,:,cond_peak]
tensor_peak_g.shape

In [None]:
# Load the precomputed RNA tensor (4-D: it is indexed as [:, :, 0, :] and [:, :, 1, :] below).
f = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/all_ac_genes_tensor.npy'
tensor_rna = np.load(f)
tensor_rna.shape

In [None]:
# Re-read the combined gene table written earlier; column 3 (gene names) gives the gene order.
# NOTE(review): presumably the same order as the last axis of `tensor_rna` — confirm.
f = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_atac/all_ac_genes_unique.bed'
df_genes_ordered = pd.read_csv(f, sep='\t', header=None)
genes_order = df_genes_ordered[3].values
df_genes_ordered

In [None]:
# Average entries 0 and 1 of axis 2, flatten the two leading axes into one,
# z-score every column (last-axis entry) and transpose so those entries become rows.
A = 0.5 * (tensor_rna[:, :, 0, :] + tensor_rna[:, :, 1, :])
A = A.reshape(-1, A.shape[-1])
A = ((A - A.mean(axis=0)) / A.std(axis=0)).T
A.shape

In [None]:
# Hierarchically cluster the rows of A only; columns keep their original order.
sns.clustermap(A, col_cluster=False, cmap='coolwarm')

In [None]:
# Build a (peaks x genes) matrix that averages peaks per gene.
n_peaks = len(df_res2)
n_genes = len(genes_order)
closest_gene_to_peaks = df_res2[7].values

# row index: position of the peak in `df_res2`;
# column index: position of its closest gene in `genes_order`
# (assumes basicu.get_index_from_array returns those positions — TODO confirm)
peak_idx = np.arange(n_peaks)
gene_idx = basicu.get_index_from_array(genes_order, closest_gene_to_peaks)
print(peak_idx, peak_idx.shape)
print(gene_idx, gene_idx.shape)

# indicator matrix: entry (p, g) is 1 when g is the gene assigned to peak p
membership = sparse.coo_matrix(
    ([1] * n_peaks, (peak_idx, gene_idx)),
    shape=(n_peaks, n_genes),
)
I_pg = membership.toarray()
# divide every gene column by its peak count, so that X.dot(I_pg) is a per-gene mean;
# the 1e-10 keeps columns of genes without peaks at zero instead of dividing by zero
I_pg = I_pg / (I_pg.sum(axis=0) + 1e-10)
I_pg.shape

In [None]:
# Same processing as for the RNA matrix: average entries 0 and 1 of axis 2,
# flatten the leading axes and z-score every peak column ...
B = 0.5 * (tensor_peak_g[:, :, 0, :] + tensor_peak_g[:, :, 1, :])
B = B.reshape(-1, B.shape[-1])
B = (B - B.mean(axis=0)) / B.std(axis=0)
# ... then average the peaks of each gene with I_pg and put genes on the rows.
# NOTE(review): a peak with zero variance would give NaN above, and the dot product
# would propagate it to every gene — confirm no such peak exists.
B = B.dot(I_pg).T
B.shape

In [None]:
# Hierarchically cluster the rows of B only; columns keep their original order.
# The color scale is clipped to [-2, 2].
sns.clustermap(B, col_cluster=False, cmap='coolwarm', vmin=-2, vmax=2)

# same order

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram

In [None]:
# Side-by-side heatmaps of the RNA matrix (A) and the per-gene peak matrix (B).
# Each panel is ordered by its OWN Ward clustering of rows, so row i of the left
# panel is generally not the same gene as row i of the right panel.
fig, axs = plt.subplots(1,2,figsize=(2*8,1*6))
panels = [
    # (matrix, symmetric color limit, panel title)
    (A, 3, "RNA (z-score) - rows in own cluster order"),
    (B, 2, "Peaks averaged per gene (z-score) - rows in own cluster order"),
]
for ax, (mat, vlim, title) in zip(axs, panels):
    Z = linkage(mat, method='ward')#, metric='correlation')
    leaves = dendrogram(Z, no_plot=True)['leaves']
    sns.heatmap(mat[leaves], cmap='coolwarm', vmax=vlim, vmin=-vlim, ax=ax)
    # titles make the two panels (and this figure vs. the shared-order one) distinguishable
    ax.set_title(title)
plt.show()

In [None]:
# Side-by-side heatmaps with a SHARED row order: rows are clustered on the RNA
# matrix (A) only, and the same leaf order is applied to the peak matrix (B),
# so each row shows the same gene in both panels.
Z = linkage(A, method='ward')#, metric='correlation')
leaves = dendrogram(Z, no_plot=True)['leaves']

fig, axs = plt.subplots(1,2,figsize=(2*8,1*6))
ax = axs[0]
sns.heatmap(A[leaves], cmap='coolwarm', vmax=3, vmin=-3, ax=ax)
ax.set_title("RNA (z-score) - rows in RNA cluster order")

ax = axs[1]
sns.heatmap(B[leaves], cmap='coolwarm', vmax=2, vmin=-2, ax=ax)
ax.set_title("Peaks averaged per gene (z-score) - rows in RNA cluster order")
plt.show()