In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
from scipy import stats
import matplotlib.pyplot as plt


import scanpy as sc
import seaborn as sns

from scroutines import basicu


In [None]:
# Input files: the AnnData object (h5ad) and a per-cell table keyed by a 'cell' column
# (presumably sex assignments, going by the file name -- confirm).
f1 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_l23.h5ad'
f2 = '/u/home/f/f7xiesnm/v1_multiome/multiome_cell_sex_assignment_saumya.csv'

adata = sc.read(f1)
df_sex = pd.read_csv(f2)

# Attach the per-cell table to the cell metadata, matching obs index against 'cell'.
adata.obs = adata.obs.join(df_sex.set_index('cell'))

# Use the matrix stored under .raw as the working expression matrix.
adata.X = adata.raw.X

# Drop every gene whose name starts with 'mt', then keep the remaining gene names around.
is_mt = adata.var.index.str.contains('^mt')
adata = adata[:, ~is_mt]
genes = adata.var.index.values
adata

In [None]:
%%time

# --- parameters -------------------------------------------------------------
time = 'P21'
exp_conds = [time, time+'DR']  # log2fc below is exp_conds[1] relative to exp_conds[0]
subclass  = 'L2/3'
offset = 1e-2                  # pseudocount added to the group means before taking log2

# --- subset: cells from the two conditions, within one subclass ----------------
adatasub = adata[(adata.obs['Age'].isin(exp_conds)) & (adata.obs['Subclass']==subclass)]

# --- normalize: scale each cell by its total_counts to 1e4, then z-score log2(x+1) per gene
# Accept both sparse and dense X; .toarray() returns a plain ndarray (no np.matrix detour).
counts = adatasub.X
counts = counts.toarray() if hasattr(counts, 'toarray') else np.asarray(counts)
mat = counts/adatasub.obs['total_counts'].values.reshape(-1,1)*1e4
# NOTE(review): genes with zero variance in this subset presumably come out as NaN here -- confirm
# before feeding those columns to a model.
zmat = stats.zscore(np.log2(mat+1), axis=0)

# Formula-safe column labels g0, g1, ... in the same order as `genes`; built once and shared.
gene_cols = [f'g{j}' for j in range(len(genes))]

zdf = pd.DataFrame(zmat, columns=gene_cols, index=adatasub.obs.index)
zdf = zdf.join(adatasub.obs[['Sample', 'Age']])
print(zdf.shape)

df = pd.DataFrame(mat, columns=gene_cols, index=adatasub.obs.index)
df = df.join(adatasub.obs[['Sample', 'Age']])
print(df.shape)

# FC (fast)
# Average the gene columns only, so log2fc always has exactly len(genes) entries even if a
# metadata column such as 'Sample' happens to be numeric. observed=True skips categorical
# Age levels that have no cells in this subset.
df_mean = df.groupby('Age', observed=True)[gene_cols].mean()
log2fc  = ( np.log2(df_mean.loc[exp_conds[1]]+offset)
           -np.log2(df_mean.loc[exp_conds[0]]+offset)).values
cond_fc = (np.abs(log2fc) > np.log2(2))  # at least 2-fold change in either direction
print(exp_conds, subclass, cond_fc.sum()) # , genes[cond_fc])

# # formal test (slow)
# pvals = []
# for i in tqdm(range(len(genes))):
#     model = smf.mixedlm(f"g{i} ~ Age", df, groups="Sample")
#     with warnings.catch_warnings():
#         warnings.simplefilter("ignore") 
#         # warnings.simplefilter("ignore", ConvergenceWarning)
#         # warnings.simplefilter("ignore", RuntimeWarning)
#         result = model.fit()

#     pval = result.pvalues[f'Age[T.{exp_conds[1]}]']
#     pvals.append(pval)

# pvals = np.nan_to_num(np.array(pvals), nan=1.0)  # 2nd positional arg is `copy`, not the fill value; failed fits (NaN) must become p=1, not 0
# rej, qvals, _, _ =  multipletests(pvals, alpha=0.05, method='fdr_bh')
# cond_both = np.logical_and(rej, cond_fc)

# print(time, subclass, genes[cond_both])
# # save results: exp_cond, subclass, genes, log2fc, qvals

# df_res = pd.DataFrame()
# df_res['gene'] = genes
# df_res['log2fc'] = log2fc
# df_res['qval'] = qvals
# output = os.path.join(outfigdir, f'NRDR_DEGs_LMM_{time}.csv')
# print(output)
# df_res.to_csv(output)


In [None]:

# Gene to inspect. Swap the name for any of the other candidates noted below.
#   'Nptx2'  -> convergence issue
#   others:  'Matn2',
#            'Sema6a', 'Rgs12', 'Ebi3', 'Txnl4b', 'Anxa4', 'Coa5', 'Gm29650', 'Inpp5d', 'Tiparp',
#            'Trib1', 'Tanc1', 'Nr4a3', 'Gm35021', 'Gm17949', 'Gatd1', 'Enpp3', 'Ehd2', 'Gm26801'
target_genes = ['Nptx2']
i = basicu.get_index_from_array(genes, target_genes)[0]
gene_col = f'g{i}'

# Per-sample distribution of the z-scored expression, split by Age.
sns.boxplot(data=zdf, x='Age', hue='Sample', y=gene_col)

# Mixed model on the z-scored values: Age in the formula, Sample as the grouping factor.
formula = f"{gene_col} ~ Age"
model = smf.mixedlm(formula=formula, data=zdf, groups="Sample")
result = model.fit()
# Other optimizer settings that were tried:
# result = model.fit(method='bfgs')
# result = model.fit(method=['lbfgs', 'bfgs', 'cg', 'powell'])
# result = model.fit(method='lbfgs', maxiter=2000, tol=1e-6)


# Group means of the normalized (not z-scored) values for the same gene, then the fit summary.
print(df_mean[gene_col])
result.summary()

In [None]:
# Display the convergence flag of the mixed-model fit stored in `result`.
result.converged

In [None]:
# Display the p-values reported for the parameters of the fit stored in `result`.
result.pvalues