# analysis V4 - new basis code - Dec 6, 2023
- cleaned up the data QC and organization a bit
- uses two AnnData objects and their spin-offs - both FISH and proj data
- flip the y-axis when plotting only - the data itself is not flipped
- focus on L2/3 cells

TODO: 
- separate and organize plotting functions 
- organize the plots and generate more insights

In [None]:
import numpy as np
import pandas as pd
import os 
import matplotlib.pyplot as plt
import umap
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib

import anndata 
import scanpy as sc
from scipy.stats import spearmanr

from scroutines import config_plots
from scroutines import basicu
from scroutines import miscu

In [None]:
import sys
# Raise the interpreter's recursion limit to 10000.
# NOTE(review): presumably needed for recursive dendrogram routines in
# scipy.cluster.hierarchy on large inputs -- confirm.
sys.setrecursionlimit(10000)
from scipy.cluster import hierarchy as sch

In [None]:
def norm_data(adata):
    """Normalize raw counts and store every stage as a layer of ``adata``.

    Each feature (column of ``adata.X``) is processed independently:

    1. ``nrm`` -- counts divided by the per-cell size factor, i.e. the cell's
       ``adata.obs['area']`` relative to the median ``area``.
    2. ``log`` -- ``log2(1 + nrm)``.
    3. ``zsc`` -- per-feature z-score of ``log`` across cells.  A feature with
       zero variance (e.g. an all-zero channel) gets a z-score of 0 for every
       cell instead of the NaN produced by 0/0.

    A min / median / max summary of ``area`` is printed as a side effect.
    NOTE(review): the summary labels ``area`` as a volume in um^3 and reports
    its cube root -- presumably ``area`` holds the segmented cell volume;
    confirm against the ROI table.

    Parameters
    ----------
    adata
        Object exposing ``X`` (cells x features, convertible with
        ``np.array``), ``obs`` with an ``area`` column, and a dict-like
        ``layers`` attribute.  It is modified in place.

    Returns
    -------
    None
        Results are written to ``adata.layers['nrm' | 'log' | 'zsc']``.
    """
    # size
    area = adata.obs['area']
    med_size = area.median()
    max_size = area.max()
    min_size = area.min()

    print(f"Min cell size {min_size:.1f} um^3\t  {np.power(min_size,1/3):.1f} um")
    print(f"Med cell size {med_size:.1f} um^3\t  {np.power(med_size,1/3):.1f} um")
    print(f"Max cell size {max_size:.1f} um^3\t  {np.power(max_size,1/3):.1f} um")

    # one factor per cell; the median-sized cell has factor 1
    size_factor = (area/med_size).values

    # norm by size; by log2+1; by zscore
    mat_raw = np.array(adata.X)
    mat_nrm = mat_raw/size_factor.reshape(-1,1)
    mat_log = np.log2(1+mat_nrm)

    centered = mat_log - np.mean(mat_log, axis=0)
    std = np.std(mat_log, axis=0)
    # Divide only where the feature actually varies; constant features keep
    # the pre-filled 0.  `std != 0` (rather than `std > 0`) lets NaN inputs
    # keep propagating as NaN.
    mat_zsc = np.divide(centered, std, out=np.zeros_like(centered), where=std != 0)

    adata.layers['nrm'] = mat_nrm
    adata.layers['log'] = mat_log
    adata.layers['zsc'] = mat_zsc

In [None]:
# Directory with the input CSV files (the path suggests processed EASI-FISH
# output for tile0).
pth_dat = '/u/home/f/f7xiesnm/project-zipursky/easifish/273LU/proc/tile0_v1/'
# IPython shell escapes: list the directory, then count lines in each CSV.
!ls $pth_dat
!wc -l $pth_dat*.csv

In [None]:
# features: feature id -> display name (channel / gene)
var_names = {
    'tile0_c0': 'mCherry',
    'tile0_c1': 'Slc1a3',
    'tile0_c2': 'empty',
    'tile0_c4': 'GFP',
}
# Feature ids to flag as 'proj' (none here).  Kept as an empty *string* array:
# it is used for label-based lookups and membership tests against string
# feature ids, and the default float64 dtype of np.array([]) would make those
# comparisons mix floats with strings.
proj_idx = np.array([], dtype=str)

# cells: per-cell metadata table
f_meta = os.path.join(pth_dat, 'roi.csv')

# spots: per-cell spot-count table
f_spot = os.path.join(pth_dat, 'spotcount.csv')

In [None]:
# # f = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/cheng21_cell_scrna/res/L23-ABC-genes-n288-n286unq-annot_v2.csv'
# f = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results/gene_ptime_P28_L23_Mar27.tsv'
# df_annot = pd.read_csv(f).sort_values('gene_ptime')

In [None]:
# Feature table: one row per feature id, holding its display name and a
# boolean 'proj' flag that is switched on for the ids listed in proj_idx.
var = pd.DataFrame({'name': pd.Series(var_names)})
var['proj'] = False
var.loc[proj_idx, 'proj'] = True

# var['ptime'] = df_annot.set_index('gene').reindex(var['name'])['gene_ptime'].values
# var['ptime_order'] = var['ptime'].rank()  

# Lookups: all feature ids as strings, id -> name, and name -> id.
var_idx = var.index.to_numpy().astype(str)
var_i2n = var.loc[:, 'name']
var_n2i = var.reset_index().set_index('name').loc[:, 'index']

# Feature ids that are not flagged as 'proj'.
gene_idx = np.array([fid for fid in var_idx if not (fid in proj_idx)])

# ftrs_order  = gene_names.index.values 
# genes_order = gene_names.values
# var_oidx = np.argsort(var['ptime'].values)
# var_order = var_i2n[var_idx[var_oidx]]

In [None]:
# Features to plot, in display order, with their display names.
# 'tile0_c2' (named 'empty') is deliberately left out.
var_order_manual = pd.Series(
    ['mCherry', 'Slc1a3', 'GFP'],
    index=['tile0_c0', 'tile0_c1', 'tile0_c4'],
)

# Same ids as a plain list.
var_order = var_order_manual.index.tolist()


print(var_idx, var_order, var_order_manual, sep='\n')

In [None]:
# cols
raw_var_idx = np.char.add(var_idx, '_raw')
nrm_var_idx = np.char.add(var_idx, '_nrm')
log_var_idx = np.char.add(var_idx, '_log')
zsc_var_idx = np.char.add(var_idx, '_zsc')

raw_gene_idx = np.char.add(gene_idx, '_raw')
nrm_gene_idx = np.char.add(gene_idx, '_nrm')
log_gene_idx = np.char.add(gene_idx, '_log')
zsc_gene_idx = np.char.add(gene_idx, '_zsc')

In [None]:
# Load the per-cell metadata and the per-cell spot counts; both tables must
# list the same cells in the same order.
meta = pd.read_csv(f_meta, index_col=0)
spot = pd.read_csv(f_spot, index_col=0)
assert np.all(meta.index.values == spot.index.values)
print(meta.shape, spot.shape) # 

# Bounding box of the cell coordinates.
max_x, max_y, max_z = meta[['x', 'y', 'z']].describe().loc['max']
min_x, min_y, min_z = meta[['x', 'y', 'z']].describe().loc['min']
print(f'x: {min_x:.1f}\t{max_x:.1f}')
print(f'y: {min_y:.1f}\t{max_y:.1f}')
print(f'z: {min_z:.1f}\t{max_z:.1f}')

# Distance from each cell to the nearest face of the bounding box.
# np.minimum is a binary ufunc: a third positional argument is interpreted as
# the `out` buffer, not as a third operand, which would silently drop the
# z-axis distance.  The three per-axis distances are therefore combined by
# nesting two-argument calls.
meta['to_edge'] = np.minimum(
    np.minimum(
        np.minimum(meta['x']-min_x, max_x-meta['x']),
        np.minimum(meta['y']-min_y, max_y-meta['y']),
    ),
    np.minimum(meta['z']-min_z, max_z-meta['z']),
)
# Total spot count per cell, summed over all features.
meta['cov'] = spot.sum(axis=1)

# bin data: 8 and 4 equal-width bins spanning 0-400 along each axis
bins_8p = np.linspace(0,400,8+1).astype(int)
bins_4p = np.linspace(0,400,4+1).astype(int)
print(bins_8p, bins_4p)

meta['xb_8p'] = pd.cut(meta['x'], bins=bins_8p)
meta['yb_8p'] = pd.cut(meta['y'], bins=bins_8p)
meta['zb_8p'] = pd.cut(meta['z'], bins=bins_8p)

meta['xb_4p'] = pd.cut(meta['x'], bins=bins_4p)
meta['yb_4p'] = pd.cut(meta['y'], bins=bins_4p)
meta['zb_4p'] = pd.cut(meta['z'], bins=bins_4p)

In [None]:
# Bundle the count matrix (cells x features) with the per-cell metadata and
# the per-feature table into a single AnnData object.
adata = anndata.AnnData(spot.to_numpy(), obs=meta, var=var)
adata

In [None]:
# # remove outliers adata -> adata2
# df = adata.obs
# conds = [
#     df['area'] > 500,
#     df['x'] > min_x + 20,
#     df['x'] < max_x - 20,
    
#     df['y'] > min_y + 20,
#     df['y'] < max_y - 20,
    
#     df['z'] > min_z + 20,
#     df['z'] < max_z - 20,
# ]
# cond_all = np.ones(len(df)) > 0
# for cond in conds:
#     cond_all = np.logical_and(cond_all, cond)
#     print(cond_all.sum())
# print(f"Num cells before and after: {len(df)} -> {cond_all.sum()}")

# # remove outliers
# adata2 = adata[cond_all].copy()
# adata2
# Outlier filtering is currently disabled: adata2 is the same object as adata
# (an alias, not a copy), so in-place changes made through one name are
# visible through the other.
adata2 = adata

In [None]:
# Add the 'nrm', 'log' and 'zsc' layers to adata2 (in place), then build a
# flat per-cell table: a copy of the metadata plus one '<feature id>_nrm'
# column per feature.
norm_data(adata2)
df_p2 = adata2.obs.copy()
df_p2[list(nrm_var_idx)] = np.array(adata2.layers['nrm'], copy=True)

# report 

In [None]:
# One tab-separated line per feature:
# id, name, % of cells with count > 0, min, median, 99th percentile, max.
for idx in var_idx:
    val = adata[:,idx].X[:,0]
    _pct_pos = 100*np.sum(val>0)/len(val)
    _summary = (np.min(val), np.median(val), np.percentile(val, 99), np.max(val))
    print('\t'.join([
        f'{idx}',
        f'{var_i2n.loc[idx]}',
        f'{_pct_pos:.2f}%',
        *(f'{_s:.1f}' for _s in _summary),
    ]))

In [None]:
# Histograms of the cell coordinates and of 'area', one panel per column.
cols = ['x', 'y', 'z', 'area']
with sns.plotting_context('paper'):
    fig, axs = plt.subplots(nrows=len(cols), ncols=1, figsize=(6, 8))
    for col, ax in zip(cols, axs):
        sns.histplot(adata.obs[col], ax=ax)
        ax.set_xlabel(col)
    fig.subplots_adjust(hspace=0.5)
    plt.show()


In [None]:
# 'area' against total spot count ('cov') per cell, on linear axes ...
sns.scatterplot(x='area', y='cov', data=adata2.obs, s=2, edgecolor='none')
plt.show()

# ... and the same relationship after a log2 transform of both variables.
_log_obs = np.log2(adata2.obs[['area', 'cov']])
plt.scatter(_log_obs['area'], _log_obs['cov'], s=2, edgecolor='none')

In [None]:
# Box plots of the raw counts per feature.
# Top panel: full range, ticks labelled by feature id.
# Bottom panel: y-axis limited to 0-50, ticks labelled by feature name.
with sns.plotting_context('paper'):
    fig, axs = plt.subplots(2, 1, figsize=(10*1,4*2))
    _panels = zip(
        axs,
        (adata.var.index.values, adata.var['name'].values),
        (None, [0,50]),
    )
    for ax, _labels, _ylim in _panels:
        sns.boxplot(data=adata.X, ax=ax)
        ax.set_xticklabels(_labels, rotation=90)
        sns.despine(ax=ax)
        if _ylim is not None:
            ax.set_ylim(_ylim)
        ax.set_ylabel('counts')
        ax.set_xlabel('Genes')

# z-sectioning visuals
- bin into zbin
- plot for each zbin

In [None]:
# Put the parent directory first on the import path so that the local
# plotting_easifish module can be imported.
sys.path.insert(0, '../')
import plotting_easifish

# Reload the module so that edits to it are picked up without restarting
# the kernel.
import importlib
importlib.reload(plotting_easifish)

# Plotting helpers imported after the reload.
from plotting_easifish import view_z_sections
from plotting_easifish import view_z_sections_4panels
from plotting_easifish import view_z_sections_labels
from plotting_easifish import gen_discrete_colors


In [None]:
# Spatial maps (x vs y), one panel per feature in var_order_manual, coloured
# by size-normalized counts rescaled so that the 5th percentile maps to 0 and
# the 95th percentile to 1 (values outside are clipped by vmin/vmax).
sp_x, sp_y = 'x', 'y'
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 4), sharex=True, sharey=True)
cbar_ax = fig.add_axes([0.92, 0.5, 0.01, 0.2])
# flip the y-axis for display only; the panels share their y-axis
axs.flat[0].invert_yaxis()

# coordinates are the same for every panel
x = df_p2[sp_x].values
y = df_p2[sp_y].values
for i, (col, ax) in enumerate(zip(var_order_manual.index.values, axs.flat)):
    c = df_p2[col+'_nrm'].values
    vmin, vmax = np.percentile(c, [5, 95])
    c = (c-vmin)/(vmax-vmin)

    g = ax.scatter(x, y, c=c, s=5, edgecolor='none', cmap='gray_r', vmin=0, vmax=1)
    ax.set_title(var_i2n[col])
    ax.set_xlabel(sp_x)
    ax.set_ylabel(sp_y)
    ax.set_aspect('equal')
    sns.despine(ax=ax)
    ax.grid(False)
    ax.axis('off')


# a single colorbar, taken from the last panel drawn
fig.colorbar(g, cax=cbar_ax, label='Normed counts\n(5-95 perctl.)', aspect=5, shrink=0.3, ticks=[0, 1])
fig.subplots_adjust(hspace=0.1, wspace=0.02)

In [None]:
# Spatial maps (x vs z), one panel per feature in var_order_manual, coloured
# by size-normalized counts rescaled so that the 5th percentile maps to 0 and
# the 95th percentile to 1 (values outside are clipped by vmin/vmax).
sp_x, sp_y = 'x', 'z'
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 4), sharex=True, sharey=True)
cbar_ax = fig.add_axes([0.92, 0.5, 0.01, 0.2])
# flip the vertical axis for display only; the panels share their y-axis
axs.flat[0].invert_yaxis()

# coordinates are the same for every panel
x = df_p2[sp_x].values
y = df_p2[sp_y].values
for i, (col, ax) in enumerate(zip(var_order_manual.index.values, axs.flat)):
    c = df_p2[col+'_nrm'].values
    vmin, vmax = np.percentile(c, [5, 95])
    c = (c-vmin)/(vmax-vmin)

    g = ax.scatter(x, y, c=c, s=5, edgecolor='none', cmap='gray_r', vmin=0, vmax=1)
    ax.set_title(var_i2n[col])
    ax.set_xlabel(sp_x)
    ax.set_ylabel(sp_y)
    ax.set_aspect('equal')
    sns.despine(ax=ax)
    ax.grid(False)
    ax.axis('off')


# a single colorbar, taken from the last panel drawn
fig.colorbar(g, cax=cbar_ax, label='Normed counts\n(5-95 perctl.)', aspect=5, shrink=0.3, ticks=[0, 1])
fig.subplots_adjust(hspace=0.1, wspace=0.02)

In [None]:
# Slc1a3+ cells as astrocytes
# GFP - should be expressed in neurons and astrocytes - In CKO, GFP expression should decrease specifically in Slc1a3+ cells
# mCherry - ; In KOE - mCherry expression should increase specifically in Slc17a+ cells

In [None]:
# Column positions follow the feature order of var_names:
# 0 = mCherry, 1 = Slc1a3, 2 = empty, 3 = GFP.
# Raw Slc1a3 counts, then size-normalized mCherry / Slc1a3 / GFP.
expr = adata2.X[:, 1]
expr_mch, expr_nrm, expr_gfp = (adata2.layers['nrm'][:, j] for j in (0, 1, 3))

In [None]:
# Overlay two histograms on the same axes: raw counts (expr) and
# size-normalized counts (expr_nrm) of the same feature.
sns.histplot(expr)
sns.histplot(expr_nrm)

In [None]:
# Cumulative histogram of the size-normalized counts.
sns.histplot(expr_nrm, cumulative=True)

In [None]:
# Fraction of cells whose normalized count exceeds each candidate threshold.
for _thr in (200, 210, 220, 230):
    print(np.sum(expr_nrm > _thr)/len(expr_nrm))

# Cells above the chosen threshold (210) are labelled 'astro' in later plots.
cond_astro = expr_nrm > 210

In [None]:
# Normalized Slc1a3 (x) against GFP (left) and mCherry (right); cells selected
# by cond_astro are drawn on top as a second series labelled 'astro'.
fig, axs = plt.subplots(1,2,figsize=(2*5,5))
fig.suptitle('Control')
for ax, _yvals, _ylabel in zip(axs, (expr_gfp, expr_mch), ('GFP', 'mCherry')):
    ax.scatter(expr_nrm, _yvals, s=5)
    ax.scatter(expr_nrm[cond_astro], _yvals[cond_astro], s=5, label='astro')
    ax.set_xlabel('Slc1a3')
    ax.set_ylabel(_ylabel)

# the legend is shown on the right-hand panel only
axs[1].legend()
plt.show()


In [None]:
# Control
# GFP-KO in astrocytes
# GFP-KO and mCherry-OE in astrocytes