In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
from scipy import stats

import scanpy as sc
import seaborn as sns

from scroutines import basicu


In [None]:
# Inputs: the AnnData object (.h5ad) and a per-cell sex-assignment table (.csv).
# NOTE(review): both are absolute, user-specific paths -- adjust when running elsewhere.
f1 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_l23.h5ad'
f2 = '/u/home/f/f7xiesnm/v1_multiome/multiome_cell_sex_assignment_saumya.csv'
adata  = sc.read(f1)
df_sex = pd.read_csv(f2)

# Left-join the sex table onto the per-cell metadata, matching its `cell`
# column against the index of adata.obs; cells without a match get NaN in
# the joined columns.
adata.obs = adata.obs.join(df_sex.set_index('cell'))

# Use the matrix stored in adata.raw as X (presumably raw counts -- TODO confirm).
adata.X = adata.raw.X

# Drop every gene whose name starts with lowercase "mt"
# (presumably the mitochondrial genes -- TODO confirm naming convention).
# The pattern is a plain raw string: the original f-string had no placeholders.
adata = adata[:, ~adata.var.index.str.contains(r'^mt')]
genes = adata.var.index.values
adata

In [None]:
# Selection used for the rest of the analysis: one value of the `Age`
# annotation and one value of the `Subclass` annotation.
exp_cond = 'P14DR'
subclass = 'L2/3'

# Keep only the cells matching both selections.
adatasub = adata[
    adata.obs['Age'].eq(exp_cond)
    & adata.obs['Subclass'].eq(subclass)
]
adatasub

In [None]:
# Pseudocount added to the per-sex mean expression before taking log2 in the
# fold-change computation, so that a mean of zero does not produce -inf.
offset = 0.01

In [None]:
def _annotated_cell_frame(values, obs, gene_cols):
    """Return a cells x genes DataFrame annotated with sex and subject labels.

    Parameters
    ----------
    values : 2-D array
        One row per row of ``obs`` and one column per entry of ``gene_cols``.
    obs : pandas.DataFrame
        Per-cell metadata; must provide the ``sex`` and ``Sample`` columns.
    gene_cols : sequence of str
        Names to give the gene columns.

    Returns
    -------
    pandas.DataFrame
        ``values`` indexed like ``obs``, with every row containing a missing
        value dropped, ``sex`` reduced to its upper-cased first character and
        a ``subject`` column formed by concatenating ``Sample`` and ``sex``.
    """
    frame = pd.DataFrame(values, columns=gene_cols, index=obs.index)
    frame = frame.join(obs[['sex', 'Sample']]).dropna()
    frame['sex'] = frame['sex'].apply(lambda x: x[0].upper())
    frame['subject'] = np.char.add(frame['Sample'].values.astype(str),
                                   frame['sex'].values.astype(str))
    return frame


# CP10k scale: X divided by each cell's `total_counts`, times 1e4.
# `.toarray()` densifies in one step (no intermediate np.matrix copy).
mat = adatasub.X.toarray() / adatasub.obs['total_counts'].values.reshape(-1, 1) * 1e4

# Per-gene z-score of log2(CP10k + 1) across cells.  Genes whose z-score is
# NaN for any cell (e.g. zero variance) are removed from everything below.
# NOTE(review): the z-score is taken over all cells of `adatasub`, including
# cells that are dropped later for lacking a sex/Sample annotation.
zmat = stats.zscore(np.log2(mat + 1), axis=0)
cond_nan = np.any(np.isnan(zmat), axis=0)
print(zmat.shape)

mat = mat[:, ~cond_nan]
zmat = zmat[:, ~cond_nan]
genes = adatasub.var.index.values[~cond_nan]

print(zmat.shape)

# Gene columns are named by position (g0, g1, ...) rather than by gene
# symbol; `genes[k]` is the symbol that belongs to column `g{k}`.
gene_cols = np.char.add('g', np.arange(len(genes)).astype(str))

# CP10k scale, ctrds and log2FC (fast)
df = _annotated_cell_frame(mat, adatasub.obs, gene_cols)
print(df.shape)

# Mean expression per sex.  Gene columns are selected explicitly so that a
# numeric metadata column can never end up in the per-gene vectors.
df_mean = df.groupby(['sex'])[list(gene_cols)].mean()
log2fc  = ( np.log2(df_mean.loc['M'] + offset)
           -np.log2(df_mean.loc['F'] + offset)).values
# Keep genes with more than a two-fold difference between sexes.
cond_fc = (np.abs(log2fc) > np.log2(2))

# Mean expression per subject; columns are relabelled with the integer gene
# position (the number that follows the "g" prefix).
df_mean_subject = df.groupby(['subject'])[list(gene_cols)].mean()
df_mean_subject.columns = np.arange(len(genes))
# Keep genes whose mean CP10k exceeds 0.1 in at least one subject.
cond_expr = (df_mean_subject.max() > 0.1).values

cond_all = np.logical_and(cond_expr, cond_fc)
print(exp_cond, subclass, cond_all.sum(), genes[cond_all])

# zscore(log2(CP10k)) scale [zscore is needed for the mixedLM model to converge], test (slow)
zdf = _annotated_cell_frame(zmat, adatasub.obs, gene_cols)
print(zdf.shape)


In [None]:


# Gene to inspect.  Candidates tried in earlier runs of this cell:
# Vcpip1, Rgs12, Ebi3, Txnl4b, Anxa4, Coa5, Gm29650, Inpp5d, Tiparp, Trib1,
# Tanc1, Nr4a3, Gm35021, Gm17949, Gatd1, Enpp3, Ehd2, Gm26801
target_gene = 'Nr4a3'

# `genes` only holds the genes that survived the NaN filter, so check up
# front instead of failing later with an error about a missing "g<k>" column.
if target_gene not in genes:
    raise ValueError(f"{target_gene!r} is not in `genes` "
                     "(misspelled, or removed by the NaN filter)")

# Position of the gene in `genes`; its data live in column f'g{i}'.
i = basicu.get_index_from_array(genes, [target_gene])[0]

# Per-cell z-scored expression, split by sex and coloured by sample.
ax = sns.boxplot(data=zdf, x='sex', hue='Sample', y=f'g{i}')
# Label with the gene symbol; the default label would be the placeholder column name.
ax.set_title(target_gene)
ax.set_ylabel('z-scored log2(CP10k + 1)')

# Linear mixed model: fixed effect of sex, one group per subject.
model = smf.mixedlm(f"g{i} ~ sex", zdf, groups="subject")
result = model.fit()

# Mean CP10k expression per sex for this gene, for reference next to the fit.
print(df_mean[f'g{i}'])
result.summary()