In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import os
from natsort import natsorted

import scanpy as sc
import seaborn as sns

from scroutines import basicu

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.tools.sm_exceptions import ValueWarning
from tqdm import tqdm


import lmm

In [None]:
# Output directory; the LMM result CSVs produced further down are written here.
outfigdir = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_nrdr_lmm'
# Created from Python instead of `!mkdir` so that re-running the cell is a
# no-op when the directory already exists, and missing parents are created.
os.makedirs(outfigdir, exist_ok=True)

In [None]:
# Curated L2/3 gene table (use those 286 genes); the columns read below are
# 'gene' (gene name) and 'P17on' (group label: 'A', 'B' or 'C').
df = pd.read_csv("/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/cheng21_cell_scrna/res/L23-ABC-genes-n288-n286unq-annot_v3_july8_2024.csv")

# All gene names, in table order.
genes_l23 = df['gene'].astype(str).values

# Gene names split by their 'P17on' group label.
genes_l23a, genes_l23b, genes_l23c = (
    df.loc[df['P17on'] == grp, 'gene'].astype(str).values
    for grp in ('A', 'B', 'C')
)
print(genes_l23a.shape, genes_l23b.shape, genes_l23c.shape)

# Group label of every gene, row-aligned with `genes_l23`.
genes_grp = df['P17on'].astype(str).values

# Sanity check: no gene name appears twice.
assert len(genes_l23) == len(np.unique(genes_l23))

genes_l23.shape

In [None]:
# Table of A/B/C scores; the first CSV column becomes the row index
# (it is later looked up with the cell index of `adata.obs`).
scores_abc = pd.read_csv(
    "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/scores_l23abc.csv",
    index_col=0,
)

# Derived contrast column: C score minus A score.
scores_abc['scores_c-a'] = scores_abc['scores_c'].sub(scores_abc['scores_a'])

scores_abc

In [None]:
# Load the AnnData object from disk (L2/3 multiome RNA, going by the file name).
adata = sc.read("/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_l23.h5ad")
# Replace the working matrix with the one stored under `.raw`
# (presumably unnormalized counts -- TODO confirm against how the file was built).
adata.X = adata.raw.X
adata

In [None]:
# Gene filtering, step 1: drop mitochondrial genes (names matching '^mt-')
# and a fixed list of sex-linked genes, using a single boolean mask.
sex_genes = ["Xist", "Uty", "Eif2s3y", "Kdm5d", "Ddx3y"]
is_mito = adata.var.index.str.contains(r'^mt-')
is_sex_gene = adata.var.index.isin(sex_genes)
adata = adata[:, ~(is_mito | is_sex_gene)]

# Gene filtering, step 2: keep genes with a non-zero value in more than 10 cells.
n_cells_expressing = np.ravel((adata.X > 0).sum(axis=0))
cond = n_cells_expressing > 10
# .copy() turns the view into an independent AnnData object.
adata = adata[:, cond].copy()
# genes = adata.var.index.values

adata

In [None]:
# Distinct 'Age' labels across all cells, in natural sort order.
np.array(natsorted(np.unique(adata.obs['Age'].values)))

In [None]:
# Attach the A/B/C scores (and the C-A contrast) to the cell metadata,
# looked up by the cell index of `adata.obs`.
for score_col in ('scores_a', 'scores_b', 'scores_c', 'scores_c-a'):
    adata.obs[score_col] = scores_abc.loc[adata.obs.index, score_col].copy()

# Remove the 'NR' substring from the condition labels.
adata.obs['cond'] = adata.obs['cond'].apply(lambda label: label.replace('NR', ""))

# Derive a time label from each sample label: drop the last character,
# then remove 'DR'.
sample_labels = adata.obs['Sample'].values
time_labels = [label[:-1].replace('DR', '') for label in sample_labels]

adata.obs['sample'] = sample_labels
adata.obs['time'] = time_labels

# Unique sample labels, split by whether the label contains 'DR'.
uniq_samples = natsorted(np.unique(sample_labels))
nr_samples = [name for name in uniq_samples if "DR" not in name]
dr_samples = [name for name in uniq_samples if "DR" in name]

uniq_conds = np.array(natsorted(np.unique(adata.obs['cond'].values)))

# Hard-coded index positions and time points for the NR and DR series
# (presumably positions into `uniq_conds` -- TODO confirm against the printout).
nr_idx = np.array([0, 1, 2, 4, 6, 8, 10])
dr_idx = np.array([3, 5, 7, 9])

nr_times = np.array([6, 8, 10, 12, 14, 17, 21])
dr_times = np.array([12, 14, 17, 21])
print(uniq_conds)

In [None]:
# ---- parameters -------------------------------------------------------------
time = 'P17'
exp_conds = [time, f'{time}DR']            # the two 'Age' labels being compared
subclass = 'L2/3'
subclass_cure = subclass.replace('/', '')  # slash-free form, used in file names
offset = 1e-2
scale = 1e4
num_archetypal_cells = 100                 # cells taken per archetype and condition

# ---- pick archetypal cells within each condition ------------------------------
picked_a, picked_b, picked_c, picked_all = [], [], [], []

for exp_cond in exp_conds:
    in_group = (adata.obs['Age'] == exp_cond) & (adata.obs['Subclass'] == subclass)
    adatasub = adata[in_group]
    n_cells = adatasub.shape[0]

    # Ranks are computed within this condition/subclass subset only.
    ranks_ac = adatasub.obs['scores_c-a'].rank()
    ranks_b = adatasub.obs['scores_b'].rank()

    # Candidates: A = lowest C-A ranks, C = highest C-A ranks, B = highest B ranks.
    precond_a = ranks_ac <= num_archetypal_cells
    precond_c = ranks_ac > n_cells - num_archetypal_cells
    precond_b = ranks_b > n_cells - num_archetypal_cells

    # A cell is kept for an archetype only if it is a candidate for that
    # archetype and for neither of the other two.
    cond_a = (precond_a & ~precond_b & ~precond_c).to_numpy()
    cond_b = (~precond_a & precond_b & ~precond_c).to_numpy()
    cond_c = (~precond_a & ~precond_b & precond_c).to_numpy()

    picked_a.append(adatasub[cond_a])
    picked_b.append(adatasub[cond_b])
    picked_c.append(adatasub[cond_c])
    picked_all.append(adatasub)

# Stack both conditions into one AnnData object per group.
adatasub_a = sc.concat(picked_a)
adatasub_b = sc.concat(picked_b)
adatasub_c = sc.concat(picked_c)
adatasub_all = sc.concat(picked_all)

In [None]:
%%time
for adatasub, tag in zip([adatasub_a, adatasub_b, adatasub_c, adatasub_all], 
                         ['A', 'B', 'C', 'ALL'],
                        ) :
    
    # ### test
    # adatasub = adatasub[:,:20]
    # ### test
    
    genes = adatasub.var.index.values 

    obs_fixed = 'Age'
    obs_random = 'Sample'
    obs = adatasub.obs[[obs_fixed, obs_random]].copy()
    obs = obs.dropna()

    adatasub = adatasub[obs.index]

    output = os.path.join(outfigdir, f'NRDR_DEGs_LMM_{time}_{subclass_cure}_{tag}_v2.csv')

    # mat
    mat = np.array(adatasub.X.todense())/adatasub.obs['total_counts'].values.reshape(-1,1)*scale

    df_res = lmm.run_lmm(mat, genes, obs, obs_fixed, obs_random, output=output, offset=offset)

In [None]:
# Result table from the most recent lmm.run_lmm call; the loop above ends on tag 'ALL'.
df_res

In [None]:
# df_res.set_index('gene').loc['Nptx2']

In [None]:
# Look up the result row for a single gene by name.
df_res.set_index('gene').loc['Igsf9b']