In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import anndata as ad
import scanpy as sc
from scipy import stats
import os

from scipy import spatial
from scipy import sparse
from scipy.interpolate import CubicSpline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import networkx as nx
from umap import UMAP
from scipy.stats import ttest_ind, mannwhitneyu
from scipy.stats import pearsonr, spearmanr, zscore
from statsmodels.stats.multitest import multipletests

import json

In [None]:
import importlib
import scroutines
importlib.reload(scroutines)
from scroutines import powerplots
from scroutines.miscu import is_in_polygon

import utils_merfish
importlib.reload(utils_merfish)
from utils_merfish import rot2d, st_scatter, st_scatter_ax, plot_cluster, binning
from utils_merfish import RefLineSegs

import merfish_datasets
import merfish_genesets
importlib.reload(merfish_datasets)
importlib.reload(merfish_genesets)
from merfish_datasets import merfish_datasets
from merfish_datasets import merfish_datasets_params

from scroutines import basicu

In [None]:
def get_qc_metrics(df):
    """Gather per-column QC distributions in a histogram-ready form.

    For each of the columns ``volume``, ``gncov`` and ``gnnum`` of ``df``
    this collects the raw values, their median, and 50 evenly spaced bin
    edges covering ``[0, 10 * median]``.

    Returns
    -------
    dict
        ``{column: (display name, values, median, bin edges)}``, in the
        order volume, gncov, gnnum.
    """
    display_names = {
        'volume': 'cell volume',
        'gncov': 'num transcripts',
        'gnnum': 'num genes',
    }

    metrics = {}
    for col, name in display_names.items():
        values = df[col].values
        median = np.median(values)
        # bin range is tied to the median so all metrics share a comparable scale
        edges = np.linspace(0, 10*median, 50)
        metrics[col] = (name, values, median, edges)
    return metrics

def get_norm_counts(adata, scaling=500):
    """Volume-normalize counts and store them in ``adata.layers['norm']``.

    Each row of ``adata.X`` is divided by that cell's ``adata.obs['volume']``
    and multiplied by ``scaling`` (default 500).

    Returns
    -------
    The normalized count matrix (the same object stored in the layer).
    """
    # column vector so the division broadcasts across genes
    per_cell_volume = adata.obs['volume'].values.reshape(-1,1)
    normcnts = adata.X/per_cell_volume*scaling
    adata.layers['norm'] = normcnts

    return normcnts

In [None]:
def preprocessing(adata):
    """Filter rarely detected genes and add normalized layers.

    Steps
    -----
    1. Keep genes with a count > 0 in more than 10 cells.
    2. ``layers['norm']``    : counts per 10k, using ``obs['n_counts']`` as
       the per-cell total.
    3. ``layers['lognorm']`` : ``log10(CP10k + 1)`` applied to the stored
       (non-zero) entries only, so ``X`` is expected to be a scipy sparse
       matrix.

    Returns
    -------
    A new object holding the gene-filtered data and the two layers; the
    input ``adata`` is left untouched.
    """
    # filter genes: expressed in more than 10 cells
    n_cells_detected = np.ravel((adata.X>0).sum(axis=0))
    # Take an explicit copy: layers are written below, and writing into a
    # view would otherwise rely on an implicit copy (with a warning).
    adata_sub = adata[:,n_cells_detected > 10].copy()

    # counts and per-cell coverage
    x = adata_sub.X
    cov = adata_sub.obs['n_counts'].values

    # CP10k: scale every row by 1/coverage via a diagonal matrix (keeps sparsity)
    xn = (sparse.diags(1/cov).dot(x))*1e4

    # log10(CP10k+1) on stored entries only; implicit zeros stay zero
    xln = xn.copy()
    xln.data = np.log10(xln.data+1)

    adata_sub.layers['norm'] = xn
    adata_sub.layers['lognorm'] = xln

    return adata_sub

In [None]:
def get_hvgs(adata, layer, nbin=20, qth=0.3):
    """Pick highly variable genes within mean-expression quantile bins.

    Genes are split into ``nbin`` quantile bins of their mean expression in
    ``adata.layers[layer]``. Within every bin the genes with the largest
    variance/mean ratio are kept, ``int(qth * n_genes / nbin)`` per bin.

    The layer must expose ``.data`` and ``.copy()`` (as a scipy sparse
    matrix does); the variance is computed as ``E[x^2] - E[x]^2``.

    Returns
    -------
    numpy array of the selected gene names (from ``adata.var.index``), in
    their original order.
    """
    mat = adata.layers[layer]

    # per-gene mean
    gene_mean = np.ravel(mat.mean(axis=0))

    # per-gene variance; squaring only the stored entries keeps the matrix sparse
    squared = mat.copy()
    squared.data = np.power(squared.data, 2)
    gene_var = np.ravel(squared.mean(axis=0)) - gene_mean**2

    # one row per gene, labelled by its mean-expression quantile bin
    per_gene = pd.DataFrame({
        'lbl': pd.qcut(gene_mean, nbin, labels=np.arange(nbin)),
        'mean': gene_mean,
        'var': gene_var,
        'ratio': gene_var/gene_mean,
    })

    # top genes by dispersion within every bin; level 1 of the resulting
    # index is the original (positional) gene index
    n_per_bin = int(qth*(len(gene_mean)/nbin))
    top_per_bin = per_gene.groupby('lbl')['ratio'].nlargest(n_per_bin)
    gsel_idx = np.sort(top_per_bin.index.get_level_values(1).values)

    assert np.all(gsel_idx != -1)

    return adata.var.index.values[gsel_idx]

In [None]:
def binning_pipe2(adata, col_to_bin, layer, bins=None, n=20):
    """Summarize a layer per bin of an ``obs`` column.

    Cells are binned by ``adata.obs[col_to_bin]``: with the given ``bins``
    edges (via ``pd.cut``) when provided, otherwise through
    ``utils_merfish.binning(values, n)``. Every gene of
    ``adata.layers[layer]`` is then aggregated per bin.

    Returns
    -------
    (mean, sem, std, n_cells, binned, bins)
        per-bin mean / sem / std DataFrames (bins x genes), the number of
        cells per bin, the per-cell bin assignment, and the bin edges used.
    """
    values = adata.obs[col_to_bin].values
    if bins is not None:
        binned = pd.cut(values, bins=bins)
    else:
        bins, binned = utils_merfish.binning(values, n)

    norm_ = pd.DataFrame(adata.layers[layer], columns=adata.var.index)
    norm_['thebin'] = binned

    # group once and reuse for all three summaries
    by_bin = norm_.groupby('thebin')
    norm_mean = by_bin.mean(numeric_only=True)
    norm_sem  = by_bin.sem(numeric_only=True)
    norm_std  = by_bin.std(numeric_only=True)
    norm_n    = norm_['thebin'].value_counts(sort=False)

    return norm_mean, norm_sem, norm_std, norm_n, binned, bins 

In [None]:
def neighbor_label_transfer(k, ref_emb, qry_emb, ref_lbl, p_cutoff=0.5, dist_cutoff=None):
    """Transfer labels from reference to query cells by k-NN voting.

    Parameters
    ----------
    k : int
        number of reference neighbours consulted per query cell
    ref_emb, qry_emb : array-like
        embeddings of the reference / query cells (same feature space)
    ref_lbl : array-like
        one label per reference cell, e.g. 'L2/3_A', 'L2/3_B', 'L2/3_C'
    p_cutoff : float
        a prediction is kept only if the winning label's share of the k
        neighbours is strictly greater than this
    dist_cutoff : float or None
        if given, a prediction is kept only if the distance to the farthest
        of the k neighbours is strictly smaller than this

    Returns
    -------
    max_pred : majority-vote label per query cell
    gated_pred : same, with predictions failing a cutoff replaced by 'NA'
    max_dist : distance to the farthest of the k neighbours per query cell
    """
    # allow lists / pandas Series: fancy indexing below needs an ndarray
    ref_lbl = np.asarray(ref_lbl)
    unq_lbls = np.unique(ref_lbl).astype(str)
    qry_n = len(qry_emb)

    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(ref_emb)
    dists, idx = neigh.kneighbors(qry_emb, k, return_distance=True)

    # labels of the k reference neighbours of every query cell
    raw_pred = ref_lbl[idx]

    # share of neighbours voting for each label
    pabc = np.empty((qry_n, len(unq_lbls)))
    for i, lbl in enumerate(unq_lbls):
        pabc[:,i] = np.sum(raw_pred==lbl, axis=1)/k

    max_pred = unq_lbls[np.argmax(pabc, axis=1)]
    max_dist = np.max(dists, axis=1)

    keep = np.max(pabc, axis=1) > p_cutoff
    if dist_cutoff is not None:
        keep &= max_dist < dist_cutoff

    # np.where widens the string dtype as needed. Assigning 'NA' into a copy
    # of max_pred would silently truncate it to 'N' when every label is a
    # single character (fixed-width unicode array).
    gated_pred = np.where(keep, max_pred, 'NA')

    return max_pred, gated_pred, max_dist


def neighbor_self_nonself(k, ref_emb, qry_emb):
    """Fraction of each query cell's neighbours that are query cells.

    Reference and query embeddings are stacked, and for every query cell the
    k nearest neighbours in the combined set are looked up.

    Returns
    -------
    p : array of length ``len(qry_emb)`` with the share of the k neighbours
        that come from the query set (0 = all reference, 1 = all query).

    NOTE(review): the query cells are part of the fitted set, so each query
    cell presumably appears among its own neighbours - confirm this is the
    intended definition.
    """
    ref_n = len(ref_emb)
    qry_n = len(qry_emb)
    # origin of every row in the stacked embedding: 0 = reference, 1 = query
    lbls = np.array([0]*ref_n+[1]*qry_n)

    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(np.vstack([ref_emb, qry_emb]))
    idx = neigh.kneighbors(qry_emb, k, return_distance=False)

    isself = lbls[idx]

    p = np.sum(isself, axis=1)/k

    return p

In [None]:
from py_pcha import PCHA
def get_aa(X):
    """Fit three archetypes to ``X`` with PCHA and return them in a fixed order.

    The global NumPy seed is reset to 0 before the fit. The returned array
    holds the archetypes as columns, sorted by increasing value of their
    first coordinate (first row).
    """
    np.random.seed(0)
    XC, _S, _C, _SSE, _varexpl = PCHA(X, noc=3, delta=0)
    archetypes = np.array(XC)
    # reorder columns so the output does not depend on PCHA's archetype order
    column_order = np.argsort(archetypes[0])
    return archetypes[:,column_order].copy()

In [None]:
def add_triangle(XC, ax, zorder=0, vertices=False, label='', linecolor='gray', linewidth=1, **kwargs):
    """Draw the archetype triangle described by ``XC`` on ``ax``.

    ``XC`` is indexed as ``XC[coordinate, vertex]``: row 0 holds x, row 1
    holds y. The outline is drawn as a dashed closed path. With
    ``vertices=True`` the first three vertices are also scattered in colours
    'C0', 'C1', 'C2'; extra ``kwargs`` are forwarded to ``ax.scatter``.
    """
    # repeat the first vertex at the end to close the outline
    outline_x = XC[0].tolist()+[XC[0,0]]
    outline_y = XC[1].tolist()+[XC[1,0]]
    ax.plot(outline_x, outline_y, '--', 
            color=linecolor, label=label, zorder=zorder, linewidth=linewidth, markersize=3)

    if not vertices:
        return

    for i in range(3):
        ax.scatter(XC[0,i], XC[1,i], color=f'C{i}', zorder=zorder, **kwargs)

In [None]:
def p_mark(p):
    """Translate a p-value into a significance mark.

    Returns
    -------
    '***' if ``p < 0.001``, '*' if ``0.001 <= p < 0.05``, otherwise 'ns'.

    The boundaries are covered explicitly: ``p == 0.05`` is 'ns' and
    ``p == 0.001`` is '*'. A NaN p-value is also reported as 'ns'. The
    previous strict ``>`` / ``<`` chain matched none of its branches for
    these inputs and raised UnboundLocalError.
    """
    if p < 0.001:
        return '***'
    if p < 0.05:
        return '*'
    return 'ns'

In [None]:
def wrap_label_transfer(ref, qry, key_emb, key_lbl, k=30):
    """Transfer ``ref.obs[key_lbl]`` onto ``qry`` and store it in ``qry.obs[key_lbl]``.

    Neighbours are searched in the first two dimensions of
    ``obsm[key_emb]``. The ungated majority-vote prediction of
    ``neighbor_label_transfer`` is written; nothing is returned.
    """
    majority_lbl, _gated, _dist = neighbor_label_transfer(
        k,
        ref.obsm[key_emb][:,:2],
        qry.obsm[key_emb][:,:2],
        ref.obs[key_lbl].values.astype(str),
        p_cutoff=0.5,
        dist_cutoff=None,
    )
    qry.obs[key_lbl] = majority_lbl

    return 

In [None]:
def get_abc_scores(adata, agenes, bgenes, cgenes):
    """Score every cell for the A / B / C gene sets; store in ``adata.obsm['size_freq_abc']``.

    Per gene set: z-score each gene across cells (``layers['ljnorm']``),
    average over the genes, then rescale so that the 40th percentile maps to
    0 and the 95th to 1, clipping to [0, 1].

    The stored array has one row per cell and four columns: the three scores
    divided by their sum (plus 1e-5), followed by the sum itself (0 to 3).
    Nothing is returned.
    """
    vmin_p, vmax_p = 40, 95

    def _scaled_score(genes):
        """Mean z-score over ``genes``, rescaled to [0, 1] between the two percentiles."""
        raw = zscore(adata[:,genes].layers['ljnorm'], axis=0).mean(axis=1)
        lo = np.percentile(raw, vmin_p)
        hi = np.percentile(raw, vmax_p)
        return np.clip((raw-lo)/(hi-lo), 0, 1)

    g0_a = _scaled_score(agenes)
    g0_b = _scaled_score(bgenes)
    g0_c = _scaled_score(cgenes)

    # split into overall magnitude (sum) and relative composition (fractions);
    # the small constant avoids 0/0 for cells scoring zero on all three sets
    g0_sum = (g0_a+g0_b+g0_c)
    fractions = [score/(g0_sum+1e-5) for score in (g0_a, g0_b, g0_c)]

    adata.obsm['size_freq_abc'] = np.vstack(fractions+[g0_sum]).T

    return

def get_abc_stats(adata, samples):
    """Per-sample percentage of cells assigned to A / B / C by their best score.

    Every cell is assigned to the gene set with the largest value among the
    first three columns of ``adata.obsm['size_freq_abc']``. Cells whose
    three values sum to 0 are counted as 'NA'.

    Parameters
    ----------
    adata : object with ``obs['sample']`` and ``obsm['size_freq_abc']``
    samples : iterable of sample names; each must contain 'ant' or 'pos',
        which becomes the sample's ``cond``

    Returns
    -------
    DataFrame indexed by sample with columns
    ``cond, L2/3_A, L2/3_B, L2/3_C, NA`` (percentages of the sample's cells).

    Raises
    ------
    ValueError
        if a sample name contains neither 'ant' nor 'pos'. Previously this
        either failed with UnboundLocalError or silently reused the
        condition of the preceding sample.
    """
    res = []
    for sample in samples:
        if 'ant' in sample:
            cond = 'ant'
        elif 'pos' in sample:
            cond = 'pos'
        else:
            raise ValueError(f"cannot infer condition ('ant' or 'pos') from sample name {sample!r}")

        adatasub = adata[adata.obs['sample']==sample]
        n = len(adatasub)
        if n == 0:
            # no cells: percentages are undefined (and would divide by zero)
            res.append([sample, cond, np.nan, np.nan, np.nan, np.nan])
            continue

        freq_a = adatasub.obsm['size_freq_abc'][:,0]
        freq_b = adatasub.obsm['size_freq_abc'][:,1]
        freq_c = adatasub.obsm['size_freq_abc'][:,2]

        # cells with all-zero scores cannot be assigned
        cond_na = (freq_a+freq_b+freq_c)==0
        tn = np.sum(cond_na)

        # index (0=A, 1=B, 2=C) of the largest score per assignable cell
        rank = np.argsort(np.vstack([freq_a,freq_b,freq_c]).T[~cond_na], axis=1)[:,-1]
        ta = np.sum(rank==0)
        tb = np.sum(rank==1)
        tc = np.sum(rank==2)

        # every cell is either assigned to one group or NA
        assert np.abs(n-(ta+tb+tc)-tn) < 1
        res.append([sample, cond, ta/n*100, tb/n*100, tc/n*100, tn/n*100])

    res = pd.DataFrame(res, columns=['sample', 'cond', 'L2/3_A', 'L2/3_B', 'L2/3_C', 'NA']).set_index('sample')
    return res

def get_abc_stats_typeready(adata, samples, sample_col, type_col):
    """Per-sample percentage of cells of each type, from existing type labels.

    Cells are counted per (``sample_col``, ``type_col``) pair in
    ``adata.obs``, rows are ordered as ``samples``, and every row is
    converted to percentages of that sample's total. A ``cond`` column is
    appended: 'pos' when the sample name contains 'pos', otherwise 'ant'.
    """
    cell_counts = adata.obs.groupby([sample_col, type_col]).size()
    num_types = cell_counts.unstack().reindex(samples)

    per_sample_total = num_types.sum(axis=1)
    frq_types = num_types.divide(per_sample_total, axis=0)*100

    is_posterior = frq_types.index.str.contains('pos')
    frq_types['cond'] = np.where(is_posterior, 'pos', 'ant')

    return frq_types

# load data

In [None]:
# Directory that receives every figure saved through `fig_manager`.
outfigdir = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/figures/250414"
# Pure-Python equivalent of `mkdir -p`: creates missing parents and does not
# fail if the directory already exists; no shell needed.
os.makedirs(outfigdir, exist_ok=True)
fig_manager = powerplots.FigManager(outfigdir)

In [None]:
# Seed NumPy's global random number generator before the analysis starts.
np.random.seed(0)

### MERFISH genes

In [None]:
# Gene names read from the MERFISH gene list file.
# NOTE(review): `genes` is overwritten a few lines below by
# genesets['allmerfish'], so this loaded array is not used afterwards -
# confirm whether the file read is still needed.
f = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/merfish/merfish_genes.txt" 
genes = np.loadtxt(f, dtype='str')

# All curated gene sets; print the size of each one.
genesets, typegenes_df = merfish_genesets.get_all_genesets()
for key, item in genesets.items():
    print(key, len(item))
    
# The gene list used downstream (index positions are taken relative to it).
genes = genesets['allmerfish']

### new ABC genes

In [None]:
f = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/DEG_l23abc_gene_list_250409.csv"
df_genes_newabc = pd.read_csv(f)

# all genes listed for any archetype at any age
abcgenes = df_genes_newabc['gene'].unique()

# one table per age
df_genes_newabc_p8  = df_genes_newabc[df_genes_newabc['cond']=='P8NR']
df_genes_newabc_p14 = df_genes_newabc[df_genes_newabc['cond']=='P14NR']
df_genes_newabc_p21 = df_genes_newabc[df_genes_newabc['cond']=='P21NR']


def _archetype_genes(df_cond, archetype, measured_genes):
    """Genes of one archetype that are also in `measured_genes`, plus their positions there."""
    listed = df_cond.loc[df_cond['archetype']==archetype, 'gene'].unique()
    # overlap with MERFISH
    shared = np.intersect1d(measured_genes, listed)
    return shared, basicu.get_index_from_array(measured_genes, shared)


p8_agenes, p8_agenes_idx = _archetype_genes(df_genes_newabc_p8, 'A', genes)
p8_bgenes, p8_bgenes_idx = _archetype_genes(df_genes_newabc_p8, 'B', genes)
p8_cgenes, p8_cgenes_idx = _archetype_genes(df_genes_newabc_p8, 'C', genes)

p14_agenes, p14_agenes_idx = _archetype_genes(df_genes_newabc_p14, 'A', genes)
p14_bgenes, p14_bgenes_idx = _archetype_genes(df_genes_newabc_p14, 'B', genes)
p14_cgenes, p14_cgenes_idx = _archetype_genes(df_genes_newabc_p14, 'C', genes)

p21_agenes, p21_agenes_idx = _archetype_genes(df_genes_newabc_p21, 'A', genes)
p21_bgenes, p21_bgenes_idx = _archetype_genes(df_genes_newabc_p21, 'B', genes)
p21_cgenes, p21_cgenes_idx = _archetype_genes(df_genes_newabc_p21, 'C', genes)

### MERFISH cells (integrated L2/3)

In [None]:
ddir = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/merfish/organized" 

# Get MERFISH cells from the integrated (RNA + MERFISH) V1 L2/3 Glut object.
# Only `obs` is needed here, so the file is opened in backed (read-only) mode.
# `ad.read_h5ad` replaces the deprecated alias `ad.read`.
fin = os.path.join(ddir, 'P8P14P21NR_v1l23glut_rna_merfish_250411.h5ad')
adata_ = ad.read_h5ad(fin, backed='r')
adata_ = adata_[adata_.obs['modality']=='merfish'] 
print(adata_.obs['gated_pred_subclass'].unique()) # should be L2/3 only

# cell ids used below to pick the same cells out of the raw MERFISH data
l23_cells = adata_.obs.index.values
l23_cells.shape

### MERFISH raw - all genes

In [None]:
# %%time
# Samples in plotting order: 8 x P8, 4 x P14, 5 x P21.
names = [
    'P8NRa_ant2', 
    'P8NRb_ant2',
    'P8NRc_ant2', 
    'P8NRd_ant2',
    
    'P8NRa_pos2', 
    'P8NRb_pos2',
    'P8NRc_pos2', 
    'P8NRd_pos2',
    
    'P14NRa_ant', 
    'P14NRa_pos',
    'P14NRb_ant', 
    'P14NRb_pos',
    
    'P21NRb_ant',
    'P21NRb_pos',
    
    'P21NRc_ant',
    'P21NRc_ant2',
    'P21NRc_pos2',
]


# every sample is rescaled so that its mean total (volume-normalized) count
# per cell equals this target
mean_total_rna_target = 500
adata_parts = []

for i, name in enumerate(names):
    # position of the sample in a 4-per-column layout grid; the loop index
    # `i` itself is left untouched
    j, row = divmod(i, 4)
    
    # `ad.read_h5ad` replaces the deprecated alias `ad.read`
    adatasub = ad.read_h5ad(os.path.join(ddir, f'{name}_l2_v1_250410.h5ad')) 
    print(name, len(adatasub))
    
    # make cell ids unique across samples: sample name + original id, no
    # separator. `l23_cells` is used to index the merged object below, so this
    # format must be kept.
    adatasub.obs.index = np.char.add(f'{name}', adatasub.obs.index.values)
    adatasub.obs['sample'] = name
    
    norm_cnts = adatasub.layers['norm']
    mean_per_batch = np.mean(norm_cnts.sum(axis=1))
    
    # jnorm: batch-equalized counts; ljnorm: log2(1 + jnorm)
    adatasub.layers['jnorm']  = norm_cnts*(mean_total_rna_target/mean_per_batch)
    adatasub.layers['ljnorm'] = np.log2(1+adatasub.layers['jnorm'])
    
    adatasub.obs['norm_transcript_count']  = adatasub.layers['norm'].sum(axis=1)
    adatasub.obs['jnorm_transcript_count'] = adatasub.layers['jnorm'].sum(axis=1)
    
    # display coordinates: samples are tiled on the grid, depth pointing down
    adatasub.obs['depth_show'] = -adatasub.obs['depth'].values - row*400
    adatasub.obs['width_show'] =  adatasub.obs['width'].values - np.min(adatasub.obs['width'].values) + j*2500
    
    adata_parts.append(adatasub)
    
adata_merged = ad.concat(adata_parts)

### MERFISH raw - get high-quality L2/3 cells
- filter out cells using anatomical features (depth)
- filter out cells using transcript counts
- check cell density - insignificant

In [None]:
# keep only the cells labelled L2/3 in the integrated object
adata_l23 = adata_merged[l23_cells].copy()

# then keep cells shallower than 400 (depth) with more than 50 transcripts
conds = (adata_l23.obs['depth'] < 400) & (adata_l23.obs['transcript_count'] > 50)
adata_mer = adata_l23[conds].copy()
len(adata_l23), len(adata_mer)

In [None]:
# per-sample extent along the width axis, in the order of `names`
width_by_sample = adata_mer.obs.groupby('sample')['width']
width_min = width_by_sample.min().reindex(names)
width_max = width_by_sample.max().reindex(names)
width_rng = width_max - width_min 
# left offset of every sample when laid side by side: summed ranges of the
# preceding samples plus a gap of 100 between neighbours
width_cum = pd.Series(np.cumsum(np.hstack([0, width_rng[:-1]+100])), index=names)

sample_of_cell = adata_mer.obs['sample']
# width measured from the sample's own minimum
adata_mer.obs['width_n0']    =  adata_mer.obs['width']    - width_min.reindex(sample_of_cell).values
# display coordinates: samples side by side, depth pointing down
adata_mer.obs['width_show2'] =  adata_mer.obs['width_n0'] + width_cum.reindex(sample_of_cell).values
adata_mer.obs['depth_show2'] = -adata_mer.obs['depth']

In [None]:
# one colour per block of four samples in `names`
colors = ['C1', 'C1', 'k', 'C2', 'C2']
for i, name in enumerate(names):
    j = i // 4
    color = colors[j]

    # depth distribution of all L2/3-labelled cells (before depth/count filtering)
    in_sample = adata_l23.obs['sample']==name
    adatasub = adata_l23[in_sample]
    sns.histplot(adatasub.obs['depth'].values, color=color, element='step', fill=False)

### summarize things needed

In [None]:
# add cols
# every cell kept in adata_mer comes from MERFISH
adata_mer.obs['modality'] = 'merfish'


# per-gene z-score (across cells) of the log2 batch-normalized expression
lognorm_mer = adata_mer.layers['ljnorm']  
adata_mer.layers['zscore'] = zscore(lognorm_mer, axis=0)

# display the object summary
adata_mer

# Downstream calc

# Viz set up

In [None]:
# Alias (not a copy) of the filtered MERFISH object, for plotting.
adata_plot1 = adata_mer 

# Colour per L2/3 type label; 'NA' marks unassigned cells.
clsts_palette2 = {
    'L2/3_A': 'C0',    
    'L2/3_B': 'C1',    
    'L2/3_C': 'C2',    
    'NA': 'gray',
}

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# ABC maps: black -> the colour of each gene group
colors_a = [(0.0, 'black'), (1.0, 'C0')]
colors_b = [(0.0, 'black'), (1.0, 'C1')]
colors_c = [(0.0, 'black'), (1.0, 'C2')]
cmap_a, cmap_b, cmap_c = (
    LinearSegmentedColormap.from_list(cmap_name, stops)
    for cmap_name, stops in [('cmap_a', colors_a), ('cmap_b', colors_b), ('cmap_c', colors_c)]
)

# NR / DR maps: white -> condition colour, plus a diverging NR-white-DR map
colors_nr = [(0.0, 'white'), (1.0, 'C1'),]
colors_dr = [(0.0, 'white'), (1.0, 'black'),]
colors_nrdr = [(0.0, 'C1'), (0.5, 'white'), (1.0, 'black')]
cmap_nr, cmap_dr, cmap_nrdr = (
    LinearSegmentedColormap.from_list(cmap_name, stops)
    for cmap_name, stops in [('cmap_nr', colors_nr), ('cmap_dr', colors_dr), ('cmap_nrdr', colors_nrdr)]
)

# FISH stats

In [None]:
# one colour per block of four samples in `names`
colors = ['C1', 'C1', 'k', 'C2', 'C2']
for i, name in enumerate(names):
    j = i // 4
    color = colors[j]
    adatasub = adata_mer[adata_mer.obs['sample']==name]
    depths = adatasub.obs['depth'].values
    # the 1st / 99th percentiles serve as robust top / bottom of L2/3
    depth_min, depth_max = np.percentile(depths, [1, 99])
    sns.histplot(depths, color=color, element='step', fill=False, stat='density')
    for depth_edge in (depth_min, depth_max):
        plt.axvline(depth_edge, linestyle='--', color=color)

In [None]:
# same histograms after rescaling depth so that each sample's 1st percentile
# maps to 0 and its 99th percentile to 1 (`colors` comes from the cell above)
for i, name in enumerate(names):
    j = i // 4
    color = colors[j]
    adatasub = adata_mer[adata_mer.obs['sample']==name]
    depths = adatasub.obs['depth'].values
    depth_min, depth_max = np.percentile(depths, [1, 99])
    depth_norm = (depths-depth_min)/(depth_max-depth_min)
    sns.histplot(depth_norm, color=color, element='step', fill=False, stat='density')


In [None]:
# Per-sample, per-depth-bin expression summaries.
# NOTE(review): the name `stats` shadows the `scipy.stats` module imported at
# the top of the notebook; it is kept because later cells read `stats[name]`.
stats = {}
# 12 equal bins over the normalized depth [0, 1], and their centres
bins = np.linspace(0, 1, 4*3+1)
midpoints = (bins[:-1] + bins[1:])/2

for name in names:
    adatasub = adata_mer[adata_mer.obs['sample']==name].copy()
    depths = adatasub.obs['depth'].values
    # rescale depth so the sample's 1st / 99th percentiles map to 0 / 1
    depth_min, depth_max = np.percentile(depths, [1, 99])
    adatasub.obs['depth_norm'] = (depths-depth_min)/(depth_max-depth_min)
    # (mean, sem, std, n, binned, bins) of log2 batch-normalized expression
    stats[name] = binning_pipe2(adatasub, 'depth_norm', 'ljnorm', bins=bins)

In [None]:
# Use the P8-derived A / B / C gene sets (column positions in `genes`)
# for the cells that follow.
agenes_idx = p8_agenes_idx
bgenes_idx = p8_bgenes_idx
cgenes_idx = p8_cgenes_idx

In [None]:
# Baseline: mean expression of every gene across V1 L2/3 in the P8 samples
# (averaged over depth bins, then over samples).
base_a0, base_b0, base_c0 = [], [], []
for name in [
    'P8NRa_ant2', 'P8NRb_ant2', 'P8NRc_ant2', 'P8NRd_ant2',
    'P8NRa_pos2', 'P8NRb_pos2', 'P8NRc_pos2', 'P8NRd_pos2',
    ]:
    lnorm_mean = stats[name][0]
    # across depth bins for each gene
    base_a0.append(np.mean(lnorm_mean.iloc[:,agenes_idx], axis=0))
    base_b0.append(np.mean(lnorm_mean.iloc[:,bgenes_idx], axis=0))
    base_c0.append(np.mean(lnorm_mean.iloc[:,cgenes_idx], axis=0))

base_a0 = np.mean(base_a0, axis=0)
base_b0 = np.mean(base_b0, axis=0)
base_c0 = np.mean(base_c0, axis=0)

# Per sample and gene group: depth profile of baseline-subtracted expression
# (mean over genes -> one value per depth bin) and the mean per-gene sem.
gene_groups = [
    (agenes_idx, base_a0),
    (bgenes_idx, base_b0),
    (cgenes_idx, base_c0),
]

means = {}
sems = {}
for name in names:
    lnorm_mean, lnorm_sem = stats[name][:2]
    means[name] = [np.mean(lnorm_mean.iloc[:,idx]-base, axis=1) for idx, base in gene_groups]
    sems[name] = [np.mean(lnorm_sem.iloc[:,idx], axis=1) for idx, _ in gene_groups]
    


In [None]:
samp_gene_dpth_mat = np.array([np.array(means[name]) for name in names]) 
print(samp_gene_dpth_mat.shape) # sample, gene group, depth

# Mean and standard error across the samples of each age group.
# The SEM divides by sqrt(number of samples in the group); it used to be
# sqrt(4) for every group, although the groups hold 8 (P8), 4 (P14) and
# 5 (P21) samples.
# NOTE(review): np.std uses ddof=0 here (unchanged); switch to ddof=1 if the
# sample standard deviation is wanted.
p8_mat = samp_gene_dpth_mat[:8]
p8_mean = np.mean(p8_mat, axis=0) # gene group, depth
p8_sem  = np.std(p8_mat, axis=0)/np.sqrt(len(p8_mat)) # gene group, depth

p14_mat = samp_gene_dpth_mat[8:8+4]
p14_mean = np.mean(p14_mat, axis=0) # gene group, depth
p14_sem  = np.std(p14_mat, axis=0)/np.sqrt(len(p14_mat)) # gene group, depth

p21_mat = samp_gene_dpth_mat[8+4:]
p21_mean = np.mean(p21_mat, axis=0) # gene group, depth
p21_sem  = np.std(p21_mat, axis=0)/np.sqrt(len(p21_mat)) # gene group, depth

p8_mean.shape, p14_mean.shape, p21_mean.shape

In [None]:
# # t-test between NR and DR for each gene group and each location
# ts, ps = ttest_ind(nr_mat, dr_mat)
# rejs, qs, _, _ = multipletests(np.nan_to_num(ps, nan=1).reshape(-1,), alpha=0.05, method='fdr_bh')
# qs = qs.reshape(ps.shape)
# nrdr_mean = np.stack([nr_mean, dr_mean], axis=2).mean(axis=2)

In [None]:
gnames = ['A genes (n=64)', 'B genes (n=35)', 'C genes (n=71)']

# one panel per sample: depth profile of each gene group with a +/- sem band
fig, axs = plt.subplots(5, 4, figsize=(5*4,4*5), sharex=True, sharey=True)

linestyle = '-'
for ax, name in zip(axs.flat, names):
    x = midpoints
    group_styles = zip(means[name], sems[name], ['A genes', 'B genes', 'C genes'], ['C0', 'C1', 'C2'])
    for grp_mean, grp_sem, grp_label, grp_color in group_styles:
        ax.plot(x, grp_mean, label=grp_label, color=grp_color, linestyle=linestyle)
        ax.fill_between(x, grp_mean-grp_sem, grp_mean+grp_sem, color=grp_color, alpha=0.1, edgecolor='none')
    # zero line = baseline level
    ax.axhline(color='lightgray', linestyle='dotted', zorder=1)

    sns.despine(ax=ax)
    ax.grid(False)
    ax.set_title(name)
axs.flat[0].set_ylabel('mean (expr. +/- sem)')

fig.subplots_adjust(wspace=0.1)
fig_manager.savefig(fig)

In [None]:
titles = ['P8', 'P14', 'P21']
data_mean = [p8_mean, p14_mean, p21_mean]
data_sem = [p8_sem, p14_sem, p21_sem]
gnames = ['A genes', 'B genes', 'C genes']
colors = ['C0', 'C1', 'C2']

# one panel per age; one curve (+/- sem band) per gene group
fig, axs = plt.subplots(1, 3, figsize=(4*3,4), sharex=True, sharey=True)
for ax, cond_mean, cond_sem, title in zip(axs, data_mean, data_sem, titles):
    linestyle = '-'
    ax.axhline(color='lightgray', linestyle='dotted', zorder=1)
    for i, (gname, color) in enumerate(zip(gnames, colors)):
        center, spread = cond_mean[i], cond_sem[i]
        ax.plot(midpoints, center, label=gname, color=color, linestyle=linestyle)
        ax.fill_between(midpoints, center-spread, center+spread, color=color, alpha=0.1, edgecolor='none')

    sns.despine(ax=ax)
    ax.set_ylim([-0.4, 0.2])
    ax.grid(False)
    ax.set(title=title, xlabel='top->bottom L2/3')

axs[0].set_ylabel('log2(Fold change)')
fig.tight_layout()
fig_manager.savefig(fig)
plt.show()

In [None]:
linestyles = ['-', '--', '-.']
data_mean = [p8_mean, p14_mean, p21_mean]
data_sem = [p8_sem, p14_sem, p21_sem]
labels = ['P8', 'P14', 'P21']
gnames = ['A genes', 'B genes', 'C genes']
titles = gnames
colors = ['C0', 'C1', 'C2']

# one panel per gene group; one curve (+/- sem band) per age, told apart by
# line style
fig, axs = plt.subplots(1, 3, figsize=(4*3,4), sharex=True, sharey=True)
for i, (ax, gname, color) in enumerate(zip(axs, gnames, colors)):
    ax.axhline(color='lightgray', linestyle='dotted', zorder=1)
    # Each curve is labelled with its age. Previously every curve in a panel
    # carried the gene-group name and `labels` was unused.
    for cond_mean, cond_sem, label, linestyle in zip(data_mean, data_sem, labels, linestyles):
        ax.plot(midpoints, cond_mean[i], label=label, color=color, linestyle=linestyle, marker='o', markersize=5)
        ax.fill_between(midpoints, cond_mean[i]-cond_sem[i], cond_mean[i]+cond_sem[i], color=color, alpha=0.1, edgecolor='none')

    sns.despine(ax=ax)
    ax.grid(False)
    ax.set_title(gname)
    ax.set_xlabel('upper->lower L2/3')
    
axs[0].set_ylabel('log2(Fold change)')
# the ages differ only by line style, so a legend is needed to read the figure
axs[0].legend(frameon=False)
fig.subplots_adjust(wspace=0.1)
fig_manager.savefig(fig)
plt.show()

# internal (P21 gene sets; each age normalized to its own baseline)

In [None]:
# Switch to the P21-derived A / B / C gene sets (column positions in `genes`)
# for the cells that follow.
agenes_idx = p21_agenes_idx
bgenes_idx = p21_bgenes_idx
cgenes_idx = p21_cgenes_idx

In [None]:
def _per_gene_baseline(sample_names, gene_idx):
    """Mean expression of each selected gene, averaged over depth bins and then over samples."""
    per_sample = [
        np.mean(stats[name][0].iloc[:,gene_idx], axis=0) # across depth bins for each gene
        for name in sample_names
    ]
    return np.mean(per_sample, axis=0)


# mean expression level across V1 L2/3 in NR, separately for every age
p8_samples = [
    'P8NRa_ant2', 'P8NRb_ant2', 'P8NRc_ant2', 'P8NRd_ant2',
    'P8NRa_pos2', 'P8NRb_pos2', 'P8NRc_pos2', 'P8NRd_pos2',
]
p14_samples = [
    'P14NRa_ant', 'P14NRa_pos', 'P14NRb_ant', 'P14NRb_pos',
]
p21_samples = [
    'P21NRb_ant', 'P21NRb_pos', 'P21NRc_ant', 'P21NRc_ant2', 'P21NRc_pos2',
]

base_a0 = _per_gene_baseline(p8_samples, agenes_idx)
base_b0 = _per_gene_baseline(p8_samples, bgenes_idx)
base_c0 = _per_gene_baseline(p8_samples, cgenes_idx)

base_a1 = _per_gene_baseline(p14_samples, agenes_idx)
base_b1 = _per_gene_baseline(p14_samples, bgenes_idx)
base_c1 = _per_gene_baseline(p14_samples, cgenes_idx)

base_a2 = _per_gene_baseline(p21_samples, agenes_idx)
base_b2 = _per_gene_baseline(p21_samples, bgenes_idx)
base_c2 = _per_gene_baseline(p21_samples, cgenes_idx)

# Per sample and gene group: depth profile of expression relative to the
# baseline of the sample's own age, and the mean per-gene sem.
gene_idx_abc = (agenes_idx, bgenes_idx, cgenes_idx)
baselines_by_age = [
    (base_a0, base_b0, base_c0),  # samples 0-7   (P8)
    (base_a1, base_b1, base_c1),  # samples 8-11  (P14)
    (base_a2, base_b2, base_c2),  # samples 12+   (P21)
]

means = {}
sems = {}
for i, name in enumerate(names):
    lnorm_mean, lnorm_sem = stats[name][:2]

    if i < 8:
        age_baselines = baselines_by_age[0]
    elif i < 12:
        age_baselines = baselines_by_age[1]
    else:
        age_baselines = baselines_by_age[2]

    # one value per depth bin
    means[name] = [
        np.mean(lnorm_mean.iloc[:,idx]-base, axis=1)
        for idx, base in zip(gene_idx_abc, age_baselines)
    ]
    sems[name] = [np.mean(lnorm_sem.iloc[:,idx], axis=1) for idx in gene_idx_abc]

In [None]:
samp_gene_dpth_mat = np.array([np.array(means[name]) for name in names]) 
print(samp_gene_dpth_mat.shape) # sample, gene group, depth

# Mean and standard error across the samples of each age group.
# The SEM divides by sqrt(number of samples in the group); it used to be
# sqrt(4) for every group, although the groups hold 8 (P8), 4 (P14) and
# 5 (P21) samples.
# NOTE(review): np.std uses ddof=0 here (unchanged); switch to ddof=1 if the
# sample standard deviation is wanted.
p8_mat = samp_gene_dpth_mat[:8]
p8_mean = np.mean(p8_mat, axis=0) # gene group, depth
p8_sem  = np.std(p8_mat, axis=0)/np.sqrt(len(p8_mat)) # gene group, depth

p14_mat = samp_gene_dpth_mat[8:8+4]
p14_mean = np.mean(p14_mat, axis=0) # gene group, depth
p14_sem  = np.std(p14_mat, axis=0)/np.sqrt(len(p14_mat)) # gene group, depth

p21_mat = samp_gene_dpth_mat[8+4:]
p21_mean = np.mean(p21_mat, axis=0) # gene group, depth
p21_sem  = np.std(p21_mat, axis=0)/np.sqrt(len(p21_mat)) # gene group, depth

p8_mean.shape, p14_mean.shape, p21_mean.shape

In [None]:
linestyles = ['-', '--', '-.']
data_mean = [p8_mean, p14_mean, p21_mean]
data_sem = [p8_sem, p14_sem, p21_sem]
titles = ['P8', 'P14', 'P21']

gnames = ['A genes', 'B genes', 'C genes']
colors = ['C0', 'C1', 'C2']

# one panel per age; one curve (+/- sem band) per gene group
fig, axs = plt.subplots(1, 3, figsize=(4*3,4), sharex=True, sharey=True)
for ax, cond_mean, cond_sem, title in zip(axs, data_mean, data_sem, titles):
    linestyle = '-'
    ax.axhline(color='lightgray', linestyle='dotted', zorder=1)
    for i, (gname, color) in enumerate(zip(gnames, colors)):
        center, spread = cond_mean[i], cond_sem[i]
        ax.plot(midpoints, center, label=gname, color=color, linestyle=linestyle)
        ax.fill_between(midpoints, center-spread, center+spread, color=color, alpha=0.1, edgecolor='none')

    sns.despine(ax=ax)
    ax.grid(False)
    ax.set(title=title, xlabel='upper->lower L2/3')

axs[0].set_ylabel('log2(Fold change)')
fig.tight_layout()
fig_manager.savefig(fig)
plt.show()

In [None]:

linestyles = ['-', '--', '-.']
data_mean = [p8_mean, p14_mean, p21_mean]
data_sem = [p8_sem, p14_sem, p21_sem]
labels = ['P8', 'P14', 'P21']

gnames = ['A genes', 'B genes', 'C genes']
titles = gnames
colors = ['C0', 'C1', 'C2']

# one panel per gene group; one curve (+/- sem band) per age, told apart by
# line style
fig, axs = plt.subplots(1, 3, figsize=(4*3,4), sharex=True, sharey=True)
for i, (ax, gname, color) in enumerate(zip(axs, gnames, colors)):
    ax.axhline(color='lightgray', linestyle='dotted', zorder=1)
    # Each curve is labelled with its age. Previously every curve in a panel
    # carried the gene-group name and `labels` was unused.
    for cond_mean, cond_sem, label, linestyle in zip(data_mean, data_sem, labels, linestyles):
        ax.plot(midpoints, cond_mean[i], label=label, color=color, linestyle=linestyle, marker='o', markersize=5)
        ax.fill_between(midpoints, cond_mean[i]-cond_sem[i], cond_mean[i]+cond_sem[i], color=color, alpha=0.1, edgecolor='none')

    sns.despine(ax=ax)
    ax.grid(False)
    ax.set_title(gname)
    ax.set_xlabel('upper->lower L2/3')
    
axs[0].set_ylabel('log2(Fold change)')
# the ages differ only by line style, so a legend is needed to read the figure
axs[0].legend(frameon=False)
fig.subplots_adjust(wspace=0.1)
fig_manager.savefig(fig)
plt.show()