In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import os
from scipy import stats

import scanpy as sc
import seaborn as sns

from scroutines import basicu

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.tools.sm_exceptions import ValueWarning
from tqdm import tqdm

In [None]:
# Directory where the per-condition result tables are written.
outfigdir = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_sexual_dimorphism'
# Create it (including missing parents); exist_ok=True keeps the cell re-runnable,
# whereas a bare `mkdir` errors once the directory already exists.
os.makedirs(outfigdir, exist_ok=True)

# 3 criteria
- FDR < 0.05 LMM (g ~ 1 + sex + subject)  # (does sex have an effect beyond subject noise?)
- sufficient expression (max subject) > 0.1 (CP10k)
- effect size (|log2(FC)| > 1, i.e. at least a 2-fold difference in either direction)

In [None]:
# Input files: the AnnData object (.h5ad) and the per-cell sex assignment table (.csv).
f1 = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/v1_multiome/superdupermegaRNA_hasraw_multiome_l23.h5ad'
f2 = '/u/home/f/f7xiesnm/v1_multiome/multiome_cell_sex_assignment_saumya.csv'

adata = sc.read(f1)
df_sex = pd.read_csv(f2)

# Attach the sex-assignment columns to the cell metadata; the table is keyed by
# its `cell` column and joined against the index of `adata.obs`.
sex_by_cell = df_sex.set_index('cell')
adata.obs = adata.obs.join(sex_by_cell)

# Use the matrix stored under `.raw` as the working matrix.
adata.X = adata.raw.X

# Drop every gene whose name starts with "mt".
is_mt = adata.var.index.str.startswith('mt')
adata = adata[:, ~is_mt]

genes = adata.var.index.values
adata

In [None]:
def run_lmm(mat, genes, obs, obs_fixed, obs_random, output=None, offset=1.0):
    """
    Per-gene linear mixed model test for a two-level fixed factor.

    For every gene, fits `gene ~ obs_fixed` with `obs_random` as the grouping
    (random-intercept) variable on z-scored log2(mat + 1) values, and combines
    the BH-corrected test with a fold-change and an expression filter.

    Parameters
    ----------
    mat : 2D array, cell by gene (the caller passes CP10k-normalised values).
        Rows must be in the same order as `obs.index`.
    genes : array-like of gene names, one per column of `mat`.
    obs : DataFrame of cell metadata containing the columns `obs_fixed` and
        `obs_random`.
    obs_fixed : name of the fixed-effect column; must have exactly two levels.
    obs_random : name of the grouping column used for the random effect.
    output : optional path; if given, the result table is written there as CSV.
    offset : pseudocount added to the group means before taking log2 for the
        fold change. Defaults to 1.0, matching the +1 used in the log2
        transform below.

    Returns
    -------
    DataFrame indexed by the local gene index with columns `gene`, `log2fc`
    (second level minus first level of `obs_fixed`, in sorted order), `qval`
    (BH-adjusted p-value), `coverged` (whether the model fit converged; the
    column name is kept as-is so existing result files stay compatible), and
    one column per `obs_random` group holding that group's mean of `mat`.

    Raises
    ------
    ValueError
        If `obs_fixed` does not have exactly two distinct values.
    """
    genes = np.asarray(genes)

    levels = np.unique(obs[obs_fixed])
    if len(levels) != 2:
        raise ValueError(
            f"'{obs_fixed}' must have exactly two levels, got {len(levels)}: {levels}")
    c0, c1 = levels

    obs = obs[[obs_fixed, obs_random]]
    zmat = stats.zscore(np.log2(mat+1), axis=0)
    print(zmat.shape, obs.shape)

    # remove genes that are NaN in zmat (no variation at all; or all zero)
    cond_nan = np.any(np.isnan(zmat), axis=0)
    mat = mat[:,~cond_nan]
    zmat = zmat[:,~cond_nan]
    genes = genes[~cond_nan]
    genes_idx = np.arange(len(genes)) # local index
    # formula-safe column names (g0, g1, ...) instead of raw gene symbols
    gene_cols = np.char.add('g', genes_idx.astype(str))
    print(zmat.shape, obs.shape)

    # original (un-z-scored) scale: group means and log2fc (fast)
    df = pd.DataFrame(mat, columns=gene_cols, index=obs.index)
    df = df.join(obs).dropna()
    print(df.shape)

    df_mean = df.groupby([obs_fixed]).mean(numeric_only=True)
    log2fc  = ( np.log2(df_mean.loc[c1]+offset)
               -np.log2(df_mean.loc[c0]+offset)).values
    # effect-size filter: more than 2-fold in either direction
    cond_fc = (np.abs(log2fc) > np.log2(2))

    # expression filter: the highest per-group mean must exceed 0.1
    df_mean_sample = df.groupby([obs_random]).mean(numeric_only=True)
    df_mean_sample.columns = genes_idx
    cond_expr = (df_mean_sample.max() > 0.1).values

    cond_all = np.logical_and(cond_expr, cond_fc)
    print(cond_all.sum(), genes[cond_all])

    # zscore(log2(mat+1)) scale [zscore is needed for the mixedlm model to converge], test (slow)
    zdf = pd.DataFrame(zmat, columns=gene_cols, index=obs.index)
    zdf = zdf.join(obs).dropna()

    # formal test (slow)
    pvals = []
    converges = []
    for i in tqdm(genes_idx):
        model = smf.mixedlm(f"g{i} ~ {obs_fixed}", zdf, groups=obs_random)
        with warnings.catch_warnings():
            # fits on sparse genes emit many convergence/runtime warnings;
            # convergence is recorded explicitly below instead
            warnings.simplefilter("ignore")
            result = model.fit()

        pvals.append(result.pvalues[f'{obs_fixed}[T.{c1}]'])
        converges.append(result.converged)

    converges = np.array(converges)
    # A failed test (NaN p-value) must count as non-significant, i.e. p = 1.
    # Note: the 2nd positional argument of np.nan_to_num is `copy`, not `nan`,
    # so the fill value has to be passed by keyword.
    pvals = np.nan_to_num(np.array(pvals, dtype=float), nan=1.0)
    rej, qvals, _, _ =  multipletests(pvals, alpha=0.05, method='fdr_bh')
    cond_all = np.logical_and(cond_all, rej)

    print(cond_all.sum(), genes[cond_all])

    # assemble results: gene, log2fc, qval, convergence flag, per-group means
    df_res = pd.DataFrame(index=genes_idx)
    df_res['gene'] = genes
    df_res['log2fc'] = log2fc
    df_res['qval'] = qvals
    df_res['coverged'] = converges
    df_res = df_res.join(df_mean_sample.T)
    if output is not None:
        print(output)
        df_res.to_csv(output)

    return df_res

In [None]:
# Import the external `lmm` module and reload it so that edits made to the
# module after the first import are picked up without restarting the kernel.
# Note: the driver loop below calls `lmm.run_lmm` (the module's version).
import lmm
import importlib
importlib.reload(lmm)

In [None]:
%%time

for exp_cond in ['P6', 'P8', 'P10', 'P12', 'P14', 'P17', 'P12DR', 'P14DR', 'P17DR', 'P21DR']:
    for subclass in ['L2/3']:
        subclass_cure = subclass.replace('/', '')
        output = os.path.join(outfigdir, f'{exp_cond}_{subclass_cure}.csv')

        adatasub = adata[(adata.obs['Age']==exp_cond) & (adata.obs['Subclass']==subclass)]
        
        ### test
        adatasub = adatasub[:,:20]
        genes = genes[:20]
        ### test
        
        obs = adatasub.obs[['sex', 'Sample']].copy()
        obs = obs.dropna()
    
        obs['sex'] = obs['sex'].apply(lambda x: x[0].upper())
        obs['subject'] = np.char.add(obs['Sample'].values.astype(str), obs['sex'].values.astype(str))
        adatasub = adatasub[obs.index]
        
        obs_fixed = 'sex'
        obs_random = 'subject'
        
        # mat
        mat = np.array(adatasub.X.todense())/adatasub.obs['total_counts'].values.reshape(-1,1)*1e4
        
        df_res = lmm.run_lmm(mat, genes, obs, obs_fixed, obs_random, output=output)
        
        
    break

In [None]:
# Display the result table returned by the most recent `lmm.run_lmm` call in the loop above.
df_res

In [None]:
#!head /u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results_sexual_dimorphism/P21DR_L23.csv