In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import os
from natsort import natsorted

import scanpy as sc
import seaborn as sns

from scroutines import basicu

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.tools.sm_exceptions import ValueWarning
from tqdm import tqdm


import lmm

In [None]:
# Directory that receives the per-subclass LMM result tables written further below.
outfigdir = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_nrdr_lmm'
# Create it (including any missing parents) from Python instead of shelling out to
# `mkdir`: this is a no-op when the directory already exists, so re-running the
# notebook does not emit a "File exists" error.
os.makedirs(outfigdir, exist_ok=True)

In [None]:
# Read the AnnData object (judging by the file name: P21 NR/DR multiome RNA with .raw attached).
h5ad_path = "/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_P21NRDR.h5ad"
adata = sc.read(h5ad_path)

# Work on the matrix stored in `.raw` from here on.
# NOTE(review): assumes `.raw` has the same genes (columns) as `adata` and holds
# untransformed counts -- confirm against how the file was produced.
adata.X = adata.raw.X

adata

In [None]:
# Drop mitochondrial genes, i.e. gene names that start with "mt-".
is_mito = adata.var.index.str.contains(r'^mt-')
adata = adata[:, ~is_mito]

# Drop sex-linked genes so they cannot show up as condition effects.
sex_genes = ["Xist", "Uty", "Eif2s3y", "Kdm5d", "Ddx3y"]
not_sex_gene = ~adata.var.index.isin(sex_genes)
adata = adata[:, not_sex_gene]

# Keep only genes with a non-zero value in more than 10 cells, then materialise
# the filtered view as an independent AnnData object.
n_expressing_cells = np.ravel((adata.X > 0).sum(axis=0))
cond = n_expressing_cells > 10
adata = adata[:, cond].copy()

adata

In [None]:
# Distinct 'Age' labels present in the data, shown in natural sort order.
age_labels = np.unique(adata.obs['Age'].values)
np.array(natsorted(age_labels))

In [None]:
# Number of cells per (Subclass, Age) combination: rows are subclasses, columns are ages.
#
# * `observed=False` pins the grouping behaviour for categorical columns so the
#   table does not change shape depending on the installed pandas version
#   (and avoids the FutureWarning about the changing default).
# * `fill_value=0` makes a missing (Subclass, Age) combination an explicit 0
#   instead of NaN. This matters for any later `min(axis=1)` based threshold:
#   `min` skips NaN, so a subclass with no cells at all in one age group could
#   otherwise slip through an abundance filter.
cell_abundances = (
    adata.obs
    .groupby(['Subclass', 'Age'], observed=False)
    .size()
    .unstack(fill_value=0)
)
cell_abundances

In [None]:
# Minimum number of cells a subclass must exceed in every age group to be analysed.
num_cells_th = 100

# Smallest per-age cell count of each subclass, compared against the threshold.
min_cells_per_age = cell_abundances.min(axis=1)
well_sampled = min_cells_per_age > num_cells_th

# Names (as str) of the subclasses that pass the threshold.
uniq_subclasses = cell_abundances.loc[well_sampled].index.values.astype(str)
uniq_subclasses

In [None]:
# adata.obs['cond'] = adata.obs['cond'].apply(lambda x: x.replace('NR', ""))

# sample_labels = adata.obs['Sample'].values
# time_labels = [s[:-1].replace('DR', '') for s in sample_labels]

# adata.obs['sample'] = sample_labels #
# adata.obs['time']   = time_labels

# uniq_samples = natsorted(np.unique(sample_labels))
# nr_samples = [s for s in uniq_samples if "DR" not in s]
# dr_samples = [s for s in uniq_samples if "DR" in s]

# uniq_conds = np.array(natsorted(np.unique(adata.obs['cond'].values)))

# print(uniq_conds)

In [None]:
%%time

# ---------------------------------------------------------------------------
# Per-subclass differential expression between `time` and `time + 'DR'` cells
# using lmm.run_lmm, with 'Age' passed as the fixed-effect column and 'Sample'
# as the random-effect column. One CSV path per subclass is handed to run_lmm.
# ---------------------------------------------------------------------------

# Settings that do not depend on the subclass (hoisted out of the loop).
time = 'P21'
exp_conds = [time, time+'DR']   # the two 'Age' labels being compared
offset = 1e-2                   # forwarded to lmm.run_lmm; presumably a pseudocount -- confirm in lmm
scale = 1e4                     # counts are expressed per `scale` total counts of the cell
tag = 'v1'                      # suffix of the output file name
obs_fixed = 'Age'
obs_random = 'Sample'

# NOTE(review): the original cell carried a "### test" slice that restricts the
# analysis to the first 20 genes. That behaviour is preserved here but made
# explicit; set this to None to test every gene that survived filtering.
n_test_genes = 20

for subclass in uniq_subclasses:
    # '/' (e.g. in 'L2/3') is not usable in a file name.
    subclass_cure = subclass.replace('/', '')

    # Cells of this subclass that belong to one of the two compared conditions.
    adatasub = adata[(adata.obs['Age'].isin(exp_conds)) & (adata.obs['Subclass']==subclass)]

    if n_test_genes is not None:
        adatasub = adatasub[:, :n_test_genes]

    genes = adatasub.var.index.values

    # Design table: drop cells lacking either label, then keep the expression
    # data aligned with the remaining cells.
    obs = adatasub.obs[[obs_fixed, obs_random]].copy()
    obs = obs.dropna()
    adatasub = adatasub[obs.index]

    output = os.path.join(outfigdir, f'NRDR_DEGs_LMM_{time}_{subclass_cure}_{tag}.csv')

    # Dense cells-by-genes matrix. `toarray()` yields an ndarray directly for
    # sparse X (no np.matrix detour); a dense X is passed through unchanged
    # instead of failing on a missing `.todense()`.
    counts = adatasub.X
    counts = counts.toarray() if hasattr(counts, 'toarray') else np.asarray(counts)

    # Normalise each cell by its 'total_counts' and rescale.
    # NOTE(review): 'total_counts' is read from obs as-is; presumably it was
    # computed before the gene filtering above -- confirm that is intended.
    mat = counts/adatasub.obs['total_counts'].values.reshape(-1,1)*scale

    # NOTE: `df_res` is overwritten on every iteration; only the last subclass's
    # return value remains in memory (the `output` path is passed to run_lmm,
    # which presumably writes each result to disk -- confirm in lmm).
    df_res = lmm.run_lmm(mat, genes, obs, obs_fixed, obs_random, output=output, offset=offset)