In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt


import scanpy as sc
import seaborn as sns

from scroutines import basicu


In [None]:
# Inputs: the multiome AnnData object and a per-cell sex assignment table.
f1 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_l23.h5ad'
f2 = '/u/home/f/f7xiesnm/v1_multiome/multiome_cell_sex_assignment_saumya.csv'

adata = sc.read(f1)
df_sex = pd.read_csv(f2)

# Attach the sex table to the cell metadata, matching obs index against its 'cell' column.
sex_by_cell = df_sex.set_index('cell')
adata.obs = adata.obs.join(sex_by_cell)

# Work from the matrix stored under `.raw`.
adata.X = adata.raw.X

# Drop every gene whose name starts with 'mt'.
is_mt_gene = adata.var.index.str.contains('^mt')
adata = adata[:, ~is_mt_gene]
adata

In [None]:
# Restrict to cells annotated Age == 'P6' and Subclass == 'L2/3'.
is_p6 = adata.obs['Age'] == 'P6'
is_l23 = adata.obs['Subclass'] == 'L2/3'
adatasub = adata[is_p6 & is_l23]

# normalize: scale each cell by its total_counts to 1e4, then log2(x + 1)
depth = adatasub.obs['total_counts'].values.reshape(-1, 1)
mat = np.log2(np.array(adatasub.X.todense()) / depth * 1e4 + 1)

adatasub.shape, mat.shape

In [None]:
genes = adatasub.var.index.values

# Columns are labelled g0, g1, ... by gene position rather than by gene name
# (presumably so they can be used directly in model formulas — confirm).
gene_labels = np.char.add('g', np.arange(len(genes)).astype(str))

# One row per cell: normalized expression plus sex/Sample metadata;
# rows with any missing value are dropped.
df = (
    pd.DataFrame(mat, columns=gene_labels, index=adatasub.obs.index)
    .join(adatasub.obs[['sex', 'Sample']])
    .dropna()
)

# Collapse the sex label to its upper-cased first character.
df['sex'] = df['sex'].apply(lambda x: x[0].upper())

# subject = Sample label concatenated with the sex initial.
sample_str = df['Sample'].values.astype(str)
sex_str = df['sex'].values.astype(str)
df['subject'] = np.char.add(sample_str, sex_str)
df.shape

In [None]:
# Single-gene check on Kdm5d: plot expression by sex (coloured by Sample),
# then fit a mixed model with sex as the fixed effect and subject as the group.
i = basicu.get_index_from_array(genes, ['Kdm5d'])[0]
gene_col = f'g{i}'

sns.boxplot(x='sex', y=gene_col, hue='Sample', data=df)

model = smf.mixedlm(f"{gene_col} ~ sex", data=df, groups="subject")
result = model.fit()
result.summary()

In [None]:
# Spot-check a hand-picked set of genes: fit the same mixed model
# (expression ~ sex, grouped by subject) for each one and print the
# p-value of the sex term.
gene_list = ['Dpp10', 'Snhg11', 'Xist', 'Nlgn1', '6530403H02Rik', 'Lingo2', 'Stmn1', 'Actb',
 'Ptn', 'Cntnap2', 'Lrrtm4', 'Gabrg3', 'Dlgap2', 'Cdh13', 'Gpc6', 'Cntn5', 'Hspa8',
 'Eef1a1', 'Slit3', 'Tubb2b', 'Pde4d', 'Nrxn3', 'Robo1', 'Tubb5', 'Eif2s3y', 'Uty',
 'AC149090.1']

for gene in gene_list:
    # Map the gene name to its positional column label g<i> in `df`.
    i = basicu.get_index_from_array(genes, [gene])[0]

    model = smf.mixedlm(f"g{i} ~ sex", df, groups="subject")
    result = model.fit()
    # No result.summary() here: inside a loop its return value is never
    # displayed, so building it for every gene was wasted work.
    print(gene, result.pvalues['sex[T.M]'])

In [None]:
# Effect size per gene: male mean minus female mean of the log2-normalized
# expression, i.e. a log2 fold change (M - F).
# The gene columns are selected explicitly, in g0..g{n-1} order, so the result
# always has exactly one entry per gene. Relying on `numeric_only=True` would
# silently add an entry for any numeric metadata column and misalign `cond_fc`
# with per-gene results computed elsewhere.
gene_columns = [f'g{j}' for j in range(len(genes))]
df_mean = df.groupby('sex')[gene_columns].mean()
log2fc  = df_mean.loc['M'] - df_mean.loc['F']

# Flag genes whose mean differs by more than 1.5-fold in either direction.
cond_fc = (np.abs(log2fc) > np.log2(1.5))

In [None]:
# NOTE(review): `i` is not assigned in this cell; this looks up the log2 fold
# change for whichever gene index an earlier cell last left in `i`, so the
# value shown depends on execution order.
log2fc.loc[f'g{i}']

In [None]:
# Presumably converts a log2 fold change of 0.44 back to a linear fold ratio
# (2**log2fc) — TODO confirm which gene the 0.44 refers to.
2**0.44

In [None]:
%%time
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.tools.sm_exceptions import ValueWarning
from tqdm import tqdm

pvals = []
for i in tqdm(range(len(genes))):
    # gene = 'Meis2'
    # Fit LMM: random intercept model
    
    model = smf.mixedlm(f"g{i} ~ sex", df, groups="subject")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore") 
        # warnings.simplefilter("ignore", ConvergenceWarning)
        # warnings.simplefilter("ignore", RuntimeWarning)
        result = model.fit()
        
    pval = result.pvalues['sex[T.M]']
    pvals.append(pval)
    
pvals = np.nan_to_num(np.array(pvals), 1)
rej, qvals, _, _ =  multipletests(pvals, alpha=0.05, method='fdr_bh')
cond_both = np.logical_and(rej, cond_fc)

In [None]:
# Volcano plot: per-gene effect size (M - F difference of mean log2 expression)
# against the significance of the FDR-adjusted p-value.
fig, ax = plt.subplots()
ax.scatter(log2fc, -np.log10(qvals), s=5)
ax.set_xlabel('log2 fold change (M - F)')
ax.set_ylabel('-log10(q-value)')
ax.set_title('Sex effect per gene (mixed model, BH-adjusted)')
# Optional: uncomment to zoom in on the bulk of the distribution.
# ax.set_ylim([-0.2, 10])
plt.show()