In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import collections
from natsort import natsorted

from scipy import stats
from scipy import sparse
from sklearn.decomposition import PCA
from umap import UMAP
from statsmodels.stats.multitest import multipletests

from matplotlib.colors import LinearSegmentedColormap

from scroutines.config_plots import *
from scroutines import powerplots # .config_plots import *
from scroutines import pnmf
from scroutines import basicu
from scroutines.gene_modules import GeneModules  


In [2]:
outfigdir = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/figures/250409"
!mkdir -p $outfigdir

In [3]:
fin  = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw.h5ad"
fout = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_L23.h5ad"

adata = anndata.read(fin) # , backed='r')
adata

AnnData object with n_obs × n_vars = 396318 × 16572
    obs: 'Age', 'Doublet', 'Doublet Score', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'Type', 'Subclass', 'Class', 'Sample', 'total_counts', 'pct_counts_mt', 'n_genes_by_counts', 'total_counts_mt', 'Doublet?', 'Study', 'Type_leiden'
    var: 'feature_types'

In [4]:
adata.X = adata.raw.X 

In [5]:
meta = adata.obs.copy()
meta = meta[meta['Study']=='2023 Multiome']
meta.groupby(['Subclass', 'Age']).size().unstack() # dropna()

Age,P6,P8,P10,P12,P12DR,P14,P14DR,P17,P17DR,P21,P21DR,P28,P28_dl,P28_dr,P38,P38_dr
Subclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Astro,2038,2149,1204,1428,1144,1171,953,808,991,884,1258,0,0,0,0,0
Endo,32,68,117,109,6,128,50,74,97,74,96,0,0,0,0,0
Frem1,82,116,27,45,29,22,20,19,36,42,42,0,0,0,0,0
L2/3,6612,5137,3135,3793,2782,3048,2348,2455,2760,2213,4256,0,0,0,0,0
L2/3/4,0,0,0,0,0,0,752,0,0,0,214,0,0,0,0,0
L4,5501,4891,3093,2842,2901,2439,2061,1991,2150,2173,3758,0,0,0,0,0
L5IT,953,783,465,597,365,540,464,410,515,424,787,0,0,0,0,0
L5NP,461,363,206,257,236,200,186,149,191,204,291,0,0,0,0,0
L5PT,1299,849,493,584,475,493,399,365,495,392,762,0,0,0,0,0
L6CT,3708,2954,1612,1773,1854,1529,1332,1010,1389,1414,2007,0,0,0,0,0


In [6]:
meta = adata.obs.copy()
meta = meta[meta['Study']=='2023 Multiome']
meta = meta[meta['Subclass'].isin(['L2/3'])]

In [7]:
adata = adata[meta.index]
adata

View of AnnData object with n_obs × n_vars = 38539 × 16572
    obs: 'Age', 'Doublet', 'Doublet Score', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'Type', 'Subclass', 'Class', 'Sample', 'total_counts', 'pct_counts_mt', 'n_genes_by_counts', 'total_counts_mt', 'Doublet?', 'Study', 'Type_leiden'
    var: 'feature_types'

In [8]:
sample_labels = ["-".join(cell.split(' ')[0].split('-')[2:]).replace('-2023', '') for cell in adata.obs.index]
time_labels = [s[:-1].replace('DR', '') for s in sample_labels]

adata.obs['n_counts'] = adata.obs['total_counts'] # adata.obs['nCount_RNA']
adata.obs['sample'] = sample_labels
adata.obs['time']   = time_labels

uniq_samples = natsorted(np.unique(sample_labels))
uniq_times = natsorted(np.unique(time_labels))

nr_samples = [s for s in uniq_samples if "DR" not in s]
dr_samples = [s for s in uniq_samples if "DR" in s]
print(uniq_times)
print(nr_samples)
print(dr_samples)

# adata.obs['sample'] = sample_labels

  adata.obs['n_counts'] = adata.obs['total_counts'] # adata.obs['nCount_RNA']


['P6', 'P8', 'P10', 'P12', 'P14', 'P17', 'P21']
['P6a', 'P6b', 'P6c', 'P8a', 'P8b', 'P8c', 'P10a', 'P10b', 'P12a', 'P12b', 'P12c', 'P14a', 'P14b', 'P17a', 'P17b', 'P21a', 'P21b']
['P12DRa', 'P12DRb', 'P14DRa', 'P14DRb', 'P17DRa', 'P17DRb', 'P21DRa', 'P21DRb']


In [9]:
# filter genes
cond = np.ravel((adata.X>0).sum(axis=0)) > 10 # expressed in more than 10 cells
adata = adata[:,cond]
genes = adata.var.index.values

# counts
x = adata.X
cov = adata.obs['n_counts'].values

# CP10k
# xn = x/cov.reshape(x.shape[0], -1)*1e4
xn = (sparse.diags(1/cov).dot(x))*1e4

# log2(CP10k+1)
# xln = xn.copy()
# xln.data = np.log2(xln.data+1)

In [10]:
adata.layers[    'norm'] = np.array(xn.todense())

# log_xn = np.log2(1+np.array(xn.todense()))
# adata.layers[ 'lognorm'] = log_xn 
# adata.layers['zlognorm'] = zscore(log_xn, axis=0)

In [11]:
adata.write(fout)