In [1]:
# Import packages
import os, sys, glob, re, math, pickle
import phate, scprep, magic, meld
import graphtools as gt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import time,random,datetime
import networkx as nx
import scvelo as scv
from sklearn import metrics
from sklearn import model_selection
from scipy import sparse
from scipy.stats import mannwhitneyu, tiecorrect, rankdata
from statsmodels.stats.multitest import multipletests
import scanpy as sc
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import SpectralClustering, OPTICS, cluster_optics_dbscan, AgglomerativeClustering
from bbknn import bbknn
import warnings
%matplotlib inline
%load_ext memory_profiler


In [2]:
import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

# Silence R-side messages: only records at ERROR level or above reach the
# rpy2 callback logger.
# Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Turn on the global pandas <-> R and AnnData <-> R converters, then load the
# rpy2 IPython extension.
# NOTE(review): presumably the extension is loaded for the %R/%%R magics - no
# R cell is visible in this part of the notebook.
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

In [3]:
# settings
# scanpy's figure defaults go first: sc.set_figure_params writes to
# matplotlib's rcParams itself, so calling it last could silently undo the
# explicit overrides below (NOTE(review): confirm against the installed scanpy
# version which keys it touches, e.g. axes.grid).
sc.set_figure_params(dpi=300,dpi_save=600,
                     frameon=False,
                     fontsize=8)
plt.rc('font', size = 8)
# matplotlib's generic family is spelled 'sans-serif'; 'sans serif' is looked
# up as a literal font name and falls back with a findfont warning.
plt.rc('font', family='sans-serif')
# fonttype 42 embeds TrueType fonts so text in saved PDF/PS stays editable
plt.rcParams['pdf.fonttype']=42
plt.rcParams['ps.fonttype']=42
plt.rcParams['text.usetex']=False
plt.rcParams['legend.frameon']=False
plt.rcParams['axes.grid']=False
plt.rcParams['legend.markerscale']=0.5
plt.rcParams['savefig.dpi']=600
sc.settings.verbosity=2
# Set n_jobs on the settings instance; assigning to ScanpyConfig.n_jobs would
# replace the class-level property with a plain int.
sc.settings.n_jobs=-1


In [4]:
# reproducibility
# Seeds NumPy's global (legacy) random number generator.
# NOTE(review): np.random.seed returns None, so `rs` is None after this line -
# it is not a reusable seed or RandomState. Anything that receives
# random_state=rs therefore falls back to the global RNG state seeded here.
rs = np.random.seed(42)


In [5]:
# utils
def mwu(X,Y,gene_names,correction=None,debug=False) :
    '''
    Per-gene Mann-Whitney U test of X vs Y with multiple-testing correction.

    Column i of X is compared against column i of Y and labelled with
    gene_names[i]. Genes whose pooled values are all identical (tie correction
    of 0) have no computable p-value; they are left out of the result and of
    the multiple-testing correction.

    Arguments:
      X, Y (2-D arrays, cells x genes): same number of columns, in the order
        of `gene_names`. For a single gene pass x.reshape(-1,1), y.reshape(-1,1).
      gene_names (list-like): one name per column.
      correction (str or None): `method` passed to statsmodels `multipletests`
        (e.g. 'bonferroni'). None, or any non-string, keeps the default
        Benjamini-Hochberg FDR ('fdr_bh').
      debug (bool): print the name of every gene without a computable p-value.

    Returns:
      pd.DataFrame with columns 'Gene', 'pval', 'pval_corrected'; one row per
      testable gene, indexed by the gene's column position.

    NOTE: p-values can come back as exactly 0 when the difference is so large
    that the value underflows.
    '''
    method = correction if isinstance(correction,str) else 'fdr_bh'
    if method=='fdr_bh' :
        print('Mann-Whitney U w/Benjamini/Hochberg correction\n')
    else :
        print('Mann-Whitney U w/{} correction\n'.format(method))
    start = time.time()

    # Progress checkpoints are computed once instead of three np.quantile calls
    # per gene. Inserting 75% -> 50% -> 25% lets the earliest label win when
    # checkpoints coincide, matching an if/elif chain that tests 25% first.
    n_genes = len(gene_names)
    checkpoints = {}
    if n_genes>0 :
        positions = np.arange(n_genes)
        for q,label in ((0.75,'75%'),(0.5,'50%'),(0.25,'25%')) :
            checkpoints[int(np.round(np.quantile(positions,q)))] = label

    genes = []
    pvals = []
    for i,g in enumerate(gene_names) :
        if i in checkpoints :
            print('... {} completed in {:.2f}-s'.format(checkpoints[i],time.time()-start))
        x_i = np.asarray(X[:,i])
        y_i = np.asarray(Y[:,i])
        # tiecorrect is 0 when every pooled value is tied; the U statistic then
        # has zero variance and no p-value can be computed.
        if tiecorrect(rankdata(np.concatenate((x_i,y_i))))==0 :
            if debug :
                print('P-value not calculable for {}'.format(g))
            pval = np.nan
        else :
            # continuity correction is True (scipy default).
            # NOTE(review): `alternative` is left at the scipy default, which
            # differs between scipy versions - confirm it is the intended test.
            _,pval = mannwhitneyu(x_i,y_i)
        genes.append(g)
        pvals.append(pval)
    print('... mwu computed in {:.2f}-s\n'.format(time.time() - start))

    # Build the table once; growing a DataFrame cell by cell is very slow.
    p = pd.DataFrame({'Gene':genes,'pval':np.asarray(pvals,dtype=float)})
    # ignore NaNs, since can't do a comparison on these (change numbers for correction)
    # .copy() so the new column is added to an independent frame, not a slice
    p_corrected = p.loc[p['pval'].notna(),:].copy()
    if p_corrected.empty :
        # nothing testable: keep the expected column so callers can still merge
        p_corrected['pval_corrected'] = np.array([],dtype=float)
    else :
        p_corrected['pval_corrected'] = multipletests(p_corrected['pval'],method=method)[1]
    return p_corrected

def log2aveFC(X,Y,gene_names,AnnData=None) :
    '''
    Per-gene log2 fold change of the column means: log2(mean(X)) - log2(mean(Y)).

    The sign depends on argument order: swapping X and Y negates the result.
    A gene with mean 0 in one group gives +/-inf, mean 0 in both gives NaN,
    and a negative mean gives NaN.

    Arguments:
      X, Y (2-D arrays, cells x genes): dense or scipy-sparse.
      gene_names (list): without `AnnData`, one name per column of X/Y.
        With `AnnData`, the subset of genes to report.
      AnnData (object with `.var_names`, optional): when given, X and Y are
        assumed to have one column per entry of `AnnData.var_names`; only
        columns whose name is in `gene_names` are used, in var_names order.

    Returns:
      pd.DataFrame with columns 'Gene' and 'log2FC'.
    '''
    def _log2_col_mean(M) :
        # sparse/np.matrix inputs return a 2-D (1 x n) mean; flatten to 1-D so
        # it can be used as a DataFrame column
        return np.log2(np.asarray(M.mean(axis=0)).ravel())

    if AnnData is not None :
        wanted = set(gene_names) # O(1) membership instead of scanning a list per gene
        g_idx = [i for i,g in enumerate(AnnData.var_names) if g in wanted]
        fc=pd.DataFrame({'Gene':AnnData.var_names[g_idx],
                         'log2FC':_log2_col_mean(X[:,g_idx]) - _log2_col_mean(Y[:,g_idx])})
    else :
        fc=pd.DataFrame({'Gene':gene_names,
                         'log2FC':_log2_col_mean(X) - _log2_col_mean(Y)})
    return fc

In [6]:
# fps (file paths)
# dfp: directory the .h5ad inputs are read from and the subsampled copies are written to
# pfp: directory for result tables; also used as scanpy's figure directory
# NOTE(review): both are absolute paths on one user's machine - edit before
# running elsewhere.
dfp = '/home/cl2292/project/SCA1_snRNAseq/Mouse/data/'
pfp = '/home/cl2292/project/SCA1_snRNAseq/Mouse/results/'
sc.settings.figdir = pfp

In [7]:
# Load Data
# Reads the full WT and SCA1 AnnData objects into `wt` and `mut`. %memit
# reports the memory used by each read (about 37 GB per file in the run
# recorded below).

if True :
    start = time.time()
    backed=None # passed to read_h5ad; None = no backed mode
    fname='220325_WT_imp.h5ad' # for full, can maybe get away with ~300G
    %memit wt = sc.read_h5ad(os.path.join(dfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))
    
if True :
    start = time.time()
    backed=None # passed to read_h5ad; None = no backed mode
    fname='220325_SCA1_imp.h5ad' # for full, can maybe get away with ~300G
    %memit mut = sc.read_h5ad(os.path.join(dfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))    

peak memory: 37708.37 MiB, increment: 37408.70 MiB
loaded @220330.13:44:17
took 35.88-s to load data
peak memory: 74052.29 MiB, increment: 36351.14 MiB
loaded @220330.13:44:53
took 36.86-s to load data


In [8]:
# Number of WT cells per timepoint; the smallest group sets the subsample
# size used in the next cells.
print(wt.obs['timepoint'].value_counts())
print()
# NOTE(review): the SCA1 counts are not printed, so nothing visible here
# confirms that every SCA1 timepoint has enough cells for the subsampling.
# print(mut.obs['timepoint'].value_counts())

18wk    50640
30wk    31609
24wk    31027
5wk     29355
12wk    18910
Name: timepoint, dtype: int64



In [9]:
# Balance WT across timepoints: draw 18910 cells without replacement from each
# one (18910 is the 12wk count printed above, the smallest WT group). The
# draws run in the order 5wk -> 30wk. No random_state is passed, so the
# selection depends on the global NumPy RNG state at the time this cell runs.
wt5, wt12, wt18, wt24, wt30 = [
    wt.obs.loc[wt.obs['timepoint']==tp, :].sample(n=18910, replace=False).index.to_list()
    for tp in ('5wk','12wk','18wk','24wk','30wk')
]
# a cell is kept when its obs name was drawn for any timepoint
sample_wt = wt[wt.obs.index.isin(wt5 + wt12 + wt18 + wt24 + wt30), :]

In [10]:
# Same balancing for SCA1: 18910 cells without replacement per timepoint,
# drawn in the order 5wk -> 30wk from the global NumPy RNG.
# NOTE(review): assumes every SCA1 timepoint has at least 18910 cells;
# .sample raises otherwise.
mut5, mut12, mut18, mut24, mut30 = [
    mut.obs.loc[mut.obs['timepoint']==tp, :].sample(n=18910, replace=False).index.to_list()
    for tp in ('5wk','12wk','18wk','24wk','30wk')
]
# a cell is kept when its obs name was drawn for any timepoint
sample_mut = mut[mut.obs.index.isin(mut5 + mut12 + mut18 + mut24 + mut30), :]

In [11]:
# Sanity check: every WT timepoint should now hold the same number of cells.
sample_wt.obs['timepoint'].value_counts()

30wk    18910
24wk    18910
18wk    18910
12wk    18910
5wk     18910
Name: timepoint, dtype: int64

In [12]:
# save data objects
# Writes the balanced subsamples to dfp as .h5ad files; the later "Load Data"
# cell reads these same two file names back.
sample_wt.write(os.path.join(dfp,'220328_WT_sampling.h5ad'))
sample_mut.write(os.path.join(dfp,'220328_SCA1_sampling.h5ad'))
print('saved @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))

saved @220328.12:19:28


In [7]:
# Load Data
# Reads the balanced subsamples written by the save cell. This rebinds `wt`
# and `mut`: from here on they hold the SUBSAMPLED data, not the full
# datasets loaded at the top of the notebook.

if True :
    start = time.time()
    backed=None # passed to read_h5ad; None = no backed mode
    fname='220328_WT_sampling.h5ad' # subsampled WT (about 22 GB in the run recorded below)
    %memit wt = sc.read_h5ad(os.path.join(dfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))
    
if True :
    start = time.time()
    backed=None # passed to read_h5ad; None = no backed mode
    fname='220328_SCA1_sampling.h5ad' # subsampled SCA1 (about 22 GB in the run recorded below)
    %memit mut = sc.read_h5ad(os.path.join(dfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))    

peak memory: 22278.22 MiB, increment: 21978.61 MiB
loaded @220502.10:10:47
took 22.18-s to load data
peak memory: 44394.02 MiB, increment: 22115.77 MiB
loaded @220502.10:11:10
took 22.85-s to load data


In [8]:
# Cell-type ('sub3') composition of WT at every timepoint.
# The per-timepoint subset gets its own name so it does not overwrite `wtt`,
# which other cells use for the full WT dataset.
for i in wt.obs['timepoint'].unique():
    wt_tp = wt[wt.obs['timepoint']==i,:]
    print("WT " + i)
    print(wt_tp.obs['sub3'].value_counts())
    print(" ")

WT 5wk
GC      14492
MLI1      821
OL        731
BG        685
AS        598
DCN       596
MLI2      233
END       200
PER       139
OPC        96
Z+PC       96
MG         72
GoC        65
UBC        49
Z-PC       37
Name: sub3, dtype: int64
 
WT 12wk
GC      16297
MLI1      671
BG        436
OL        421
AS        322
MLI2      213
END        96
Z+PC       90
DCN        83
PER        68
MG         51
OPC        50
GoC        48
UBC        42
Z-PC       22
Name: sub3, dtype: int64
 
WT 18wk
GC      15063
MLI1      930
OL        627
BG        606
AS        546
MLI2      275
DCN       229
END       173
Z+PC      122
PER        91
OPC        68
GoC        58
MG         56
Z-PC       35
UBC        31
Name: sub3, dtype: int64
 
WT 24wk
GC      16408
MLI1      626
OL        449
BG        390
AS        332
MLI2      173
DCN       107
END        93
OPC        64
MG         58
Z+PC       50
PER        46
GoC        42
Z-PC       38
UBC        34
Name: sub3, dtype: int64
 
WT 30wk
GC      15965

In [9]:
# Cell-type ('sub3') composition of SCA1 at every timepoint.
# The per-timepoint subset gets its own name so it does not overwrite `mutt`,
# which other cells use for the full SCA1 dataset.
for i in mut.obs['timepoint'].unique():
    mut_tp = mut[mut.obs['timepoint']==i,:]
    print("SCA1 " + i)
    print(mut_tp.obs['sub3'].value_counts())
    print(" ")

SCA1 5wk
GC      14584
MLI1     1039
BG        853
AS        658
OL        565
MLI2      237
END       235
PER       210
Z-PC      101
Z+PC       94
DCN        93
GoC        77
OPC        61
MG         52
UBC        51
Name: sub3, dtype: int64
 
SCA1 12wk
GC      16143
MLI1      769
BG        497
OL        438
AS        322
MLI2      228
END       136
PER        71
Z+PC       59
MG         51
OPC        45
UBC        45
DCN        39
GoC        37
Z-PC       30
Name: sub3, dtype: int64
 
SCA1 18wk
GC      15301
MLI1      906
AS        571
OL        568
BG        564
MLI2      253
END       218
PER        97
Z-PC       93
MG         85
DCN        59
OPC        58
GoC        51
Z+PC       47
UBC        39
Name: sub3, dtype: int64
 
SCA1 24wk
GC      16109
MLI1      607
OL        494
BG        450
AS        414
MLI2      189
DCN       172
END       120
MG         70
PER        66
OPC        56
Z+PC       52
Z-PC       39
UBC        38
GoC        34
Name: sub3, dtype: int64
 
SCA1 30wk
GC 

In [10]:
## EMD; SUB4, IMP, Sampling
# WT (X) vs SCA1 (Y) differential expression on the 'imputed' layer, for every
# 'sub4' cell type at every timepoint. Each comparison yields one table with
# EMD, Mann-Whitney p-values (raw and corrected) and log2 fold change; all
# tables are stacked and split by the sign of EMD into two CSV files.

if True :
    dge_grandtotal = time.time()
    group='sub4'
    fname = '220328_mouse_imp_samp'

    # Columns of both matrices are labelled with wt.var_names below, so the
    # two objects must list the same genes in the same order.
    if not wt.var_names.equals(mut.var_names) :
        raise ValueError('wt and mut have different var_names; genes cannot be compared column-wise.')

    # Tables are collected and concatenated once: DataFrame.append copies the
    # whole frame on every call and no longer exists in pandas >= 2.0.
    dge_parts = []
    for t in ['5wk','12wk','18wk','24wk','30wk'] :
        print('Evaluating {}'.format(t))
        start_t=time.time()

        # up down dichotomy
        print('\n--------')
        print('...')
        print('--------\n')
        for i in wt.obs[group].unique() :
            start = time.time()
            print('\n{}, WT vs SCA1'.format(i))
            print('----')
            X = wt[((wt.obs[group]==i) & (wt.obs['timepoint']==t)), :].layers['imputed']
            Y = mut[((mut.obs[group]==i) & (mut.obs['timepoint']==t)), :].layers['imputed']

            # np.asarray on a sparse matrix gives a 0-d object array, so densify first
            if sparse.issparse(X) :
                X = X.toarray()
            if sparse.issparse(Y) :
                Y = Y.toarray()
            X = np.asarray(X)
            Y = np.asarray(Y)

            print('    Ncells in X:{}'.format(X.shape[0]))
            print('    Ncells in Y:{}\n'.format(Y.shape[0]))

            # A cell type missing from either genotype at this timepoint cannot
            # be tested; skip it instead of failing part-way through a long run.
            if X.shape[0]==0 or Y.shape[0]==0 :
                warnings.warn('Skipping {} at {}: no cells in WT or SCA1.'.format(i,t))
                continue

            emd = scprep.stats.differential_expression(X,Y,
                                                       measure = 'emd',
                                                       direction='both',
                                                       gene_names=wt.var_names,
                                                       n_jobs=-1)

            # mann-whitney u, corrected p-values
            p = mwu(X,Y,wt.var_names)
            emd['Gene']=emd.index
            emd=emd.drop(columns='rank')
            # Argument order is (Y, X): log2FC = log2(mean SCA1) - log2(mean WT).
            # NOTE(review): the export below labels emd>0 as 'WTup', so log2FC
            # presumably has the opposite sign convention to emd - confirm.
            fc = log2aveFC(Y,X,wt.var_names.to_list())
            # mwu leaves out genes without a computable p-value; report how many
            n_dropped = int((~fc['Gene'].isin(p['Gene'])).sum())
            if n_dropped>0 :
                warnings.warn('Warning: {} genes dropped due to p-val NA.'.format(n_dropped))
            # left merges keep exactly the genes present in p
            dt = pd.merge(p,fc,how='left',on="Gene")
            dt = pd.merge(dt,emd,how='left',on='Gene')
            dt['Cell type']=i
            dt['timepoint']=str(t)
            # inf when the corrected p-value is exactly 0
            dt['nlog10pvalcorrected']=(-1)*np.log10(dt['pval_corrected'])
            dge_parts.append(dt)
            print('... computed in {:.2f}-s'.format(time.time()-start))
        print('\nFinished timepoint {} in {:.2f}-min'.format(t,(time.time()-start_t)/60))
    dge = pd.concat(dge_parts, ignore_index=True)
    # rows with emd == 0 or NaN end up in neither file
    dgeup = dge.loc[dge['emd']>0,:] # take only 'up' (switch for down)
    dgedown = dge.loc[dge['emd']<0,:] # take only 'down'
    dgeup.to_csv(os.path.join(pfp,'dge_'+fname+'_WTup.csv'),index=False)
    dgedown.to_csv(os.path.join(pfp,'dge_'+fname+'_WTdown.csv'),index=False)


    print('DGE finished in {:.2f}-min'.format((time.time()-dge_grandtotal)/60))


Evaluating 5wk

--------
...
--------


BG, WT vs SCA1
----
    Ncells in X:685
    Ncells in Y:853

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 18.29-s
... 50% completed in 39.51-s
... 75% completed in 64.94-s
... mwu computed in 94.23-s

... computed in 105.32-s

GC, WT vs SCA1
----
    Ncells in X:14492
    Ncells in Y:14584

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 102.50-s
... 50% completed in 210.74-s
... 75% completed in 325.92-s
... mwu computed in 446.98-s

... computed in 519.94-s

OL, WT vs SCA1
----
    Ncells in X:731
    Ncells in Y:565

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 17.99-s
... 50% completed in 39.03-s
... 75% completed in 64.12-s
... mwu computed in 93.49-s

... computed in 101.73-s

AS, WT vs SCA1
----
    Ncells in X:598
    Ncells in Y:658

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 17.56-s
... 50% completed in 38.32-s
... 75% completed in 63.39-s
...

... computed in 95.15-s

DCN, WT vs SCA1
----
    Ncells in X:229
    Ncells in Y:59

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 15.78-s
... 50% completed in 34.61-s
... 75% completed in 57.31-s
... mwu computed in 83.39-s

... computed in 87.11-s

MG, WT vs SCA1
----
    Ncells in X:56
    Ncells in Y:85

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 15.37-s
... 50% completed in 33.80-s
... 75% completed in 56.17-s
... mwu computed in 82.82-s

... computed in 86.59-s

MLI2, WT vs SCA1
----
    Ncells in X:275
    Ncells in Y:253

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 15.78-s
... 50% completed in 34.56-s
... 75% completed in 57.35-s
... mwu computed in 83.58-s

... computed in 87.74-s

MLI1, WT vs SCA1
----
    Ncells in X:930
    Ncells in Y:906

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 18.73-s
... 50% completed in 40.62-s
... 75% completed in 66.56-s
... mwu computed in 96.40-s

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 15.06-s
... 50% completed in 32.96-s
... 75% completed in 54.88-s
... mwu computed in 80.56-s

... computed in 84.47-s

PER, WT vs SCA1
----
    Ncells in X:56
    Ncells in Y:58

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 14.88-s
... 50% completed in 32.86-s
... 75% completed in 54.75-s
... mwu computed in 80.29-s

... computed in 83.98-s

END, WT vs SCA1
----
    Ncells in X:193
    Ncells in Y:202

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 15.70-s
... 50% completed in 34.60-s
... 75% completed in 57.40-s
... mwu computed in 84.20-s

... computed in 88.48-s

OPC, WT vs SCA1
----
    Ncells in X:39
    Ncells in Y:49

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 14.93-s
... 50% completed in 32.82-s
... 75% completed in 55.00-s
... mwu computed in 81.07-s

... computed in 84.68-s

GoC, WT vs SCA1
----
    Ncells in X:47
    Ncells in Y:55

Ma

In [None]:
## EMD; SUB4, No IMP, Sampling
# WT (X) vs SCA1 (Y) differential expression on `.X` (no imputed layer), for
# every 'sub4' cell type at every timepoint. Each comparison yields one table
# with EMD, Mann-Whitney p-values (raw and corrected) and log2 fold change;
# all tables are stacked and split by the sign of EMD into two CSV files.

if True :
    dge_grandtotal = time.time()
    group='sub4'
    fname = '220415_mouse_no_imp_samp'

    # Columns of both matrices are labelled with wt.var_names below, so the
    # two objects must list the same genes in the same order.
    if not wt.var_names.equals(mut.var_names) :
        raise ValueError('wt and mut have different var_names; genes cannot be compared column-wise.')

    # Tables are collected and concatenated once: DataFrame.append copies the
    # whole frame on every call and no longer exists in pandas >= 2.0.
    dge_parts = []
    for t in ['5wk','12wk','18wk','24wk','30wk'] :
        print('Evaluating {}'.format(t))
        start_t=time.time()

        # up down dichotomy
        print('\n--------')
        print('...')
        print('--------\n')
        for i in wt.obs[group].unique() :
            start = time.time()
            print('\n{}, WT vs SCA1'.format(i))
            print('----')
            X = wt.X[((wt.obs[group]==i) & (wt.obs['timepoint']==t)), :]
            Y = mut.X[((mut.obs[group]==i) & (mut.obs['timepoint']==t)), :]

            # the tests below index dense columns, so densify sparse input
            if sparse.issparse(X):
                X = X.toarray()
            if sparse.issparse(Y):
                Y = Y.toarray()

            X = np.asarray(X)
            Y = np.asarray(Y)

            print('    Ncells in X:{}'.format(X.shape[0]))
            print('    Ncells in Y:{}\n'.format(Y.shape[0]))

            # A cell type missing from either genotype at this timepoint cannot
            # be tested; skip it instead of failing part-way through a long run.
            if X.shape[0]==0 or Y.shape[0]==0 :
                warnings.warn('Skipping {} at {}: no cells in WT or SCA1.'.format(i,t))
                continue

            emd = scprep.stats.differential_expression(X,Y,
                                                       measure = 'emd',
                                                       direction='both',
                                                       gene_names=wt.var_names,
                                                       n_jobs=-1)

            # mann-whitney u, corrected p-values
            p = mwu(X,Y,wt.var_names)
            emd['Gene']=emd.index
            emd=emd.drop(columns='rank')
            # Argument order is (Y, X): log2FC = log2(mean SCA1) - log2(mean WT).
            # NOTE(review): the export below labels emd>0 as 'WTup', so log2FC
            # presumably has the opposite sign convention to emd - confirm.
            fc = log2aveFC(Y,X,wt.var_names.to_list())
            # mwu leaves out genes without a computable p-value; report how many
            n_dropped = int((~fc['Gene'].isin(p['Gene'])).sum())
            if n_dropped>0 :
                warnings.warn('Warning: {} genes dropped due to p-val NA.'.format(n_dropped))
            # left merges keep exactly the genes present in p
            dt = pd.merge(p,fc,how='left',on="Gene")
            dt = pd.merge(dt,emd,how='left',on='Gene')
            dt['Cell type']=i
            dt['timepoint']=str(t)
            # inf when the corrected p-value is exactly 0
            dt['nlog10pvalcorrected']=(-1)*np.log10(dt['pval_corrected'])
            dge_parts.append(dt)
            print('... computed in {:.2f}-s'.format(time.time()-start))
        print('\nFinished timepoint {} in {:.2f}-min'.format(t,(time.time()-start_t)/60))
    dge = pd.concat(dge_parts, ignore_index=True)
    # rows with emd == 0 or NaN end up in neither file
    dgeup = dge.loc[dge['emd']>0,:] # take only 'up' (switch for down)
    dgedown = dge.loc[dge['emd']<0,:] # take only 'down'
    dgeup.to_csv(os.path.join(pfp,'dge_'+fname+'_WTup.csv'),index=False)
    dgedown.to_csv(os.path.join(pfp,'dge_'+fname+'_WTdown.csv'),index=False)


    print('DGE finished in {:.2f}-min'.format((time.time()-dge_grandtotal)/60))


Evaluating 5wk

--------
...
--------


BG, WT vs SCA1
----
    Ncells in X:685
    Ncells in Y:853

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 16.13-s
... 50% completed in 35.09-s
... 75% completed in 57.91-s
... mwu computed in 84.28-s

... computed in 92.36-s

GC, WT vs SCA1
----
    Ncells in X:14492
    Ncells in Y:14584

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 38.50-s
... 50% completed in 79.24-s
... 75% completed in 124.24-s
... mwu computed in 172.90-s

... computed in 206.36-s

OL, WT vs SCA1
----
    Ncells in X:731
    Ncells in Y:565

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 16.04-s
... 50% completed in 34.92-s
... 75% completed in 57.63-s
... mwu computed in 83.96-s

... computed in 88.27-s

AS, WT vs SCA1
----
    Ncells in X:598
    Ncells in Y:658

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 15.99-s
... 50% completed in 34.82-s
... 75% completed in 57.46-s
... mwu

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 15.09-s
... 50% completed in 33.04-s
... 75% completed in 54.78-s
... mwu computed in 80.05-s

... computed in 83.37-s

MG, WT vs SCA1
----
    Ncells in X:56
    Ncells in Y:85

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 14.63-s
... 50% completed in 32.14-s
... 75% completed in 53.54-s
... mwu computed in 78.49-s

... computed in 81.79-s

MLI2, WT vs SCA1
----
    Ncells in X:275
    Ncells in Y:253

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 15.25-s
... 50% completed in 33.37-s
... 75% completed in 55.28-s
... mwu computed in 80.66-s

... computed in 84.15-s

MLI1, WT vs SCA1
----
    Ncells in X:930
    Ncells in Y:906

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 16.62-s
... 50% completed in 36.09-s
... 75% completed in 59.38-s
... mwu computed in 86.28-s

... computed in 91.10-s

UBC, WT vs SCA1
----
    Ncells in X:31
    Ncells in Y:39


In [None]:


# Load Data
# Reads the FULL (not subsampled) datasets into `wtt` and `mutt`, leaving the
# subsampled `wt` and `mut` untouched.
# NOTE(review): both pairs are then in memory at once - the %memit output
# earlier in the notebook shows about 37 GB per full file on top of about
# 22 GB per subsample.

if True :
    start = time.time()
    backed=None # passed to read_h5ad; None = no backed mode
    fname='220325_WT_imp.h5ad' # for full, can maybe get away with ~300G
    %memit wtt = sc.read_h5ad(os.path.join(dfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))
    
if True :
    start = time.time()
    backed=None # passed to read_h5ad; None = no backed mode
    fname='220325_SCA1_imp.h5ad' # for full, can maybe get away with ~300G
    %memit mutt = sc.read_h5ad(os.path.join(dfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))    

In [9]:
## EMD, Z+PC and Z-PC, no_sampling, imp
# WT (X) vs SCA1 (Y) differential expression on the 'imputed' layer for the
# 'Z+PC' and 'Z-PC' groups of 'sub3', at every timepoint, on the full
# (unsampled) data.
#
# `wtt` and `mutt` must be the full datasets read by the "Load Data" cell
# directly above. They are deliberately NOT re-aliased to `wt`/`mut` here: in
# a top-to-bottom run those names hold the subsampled data, which would make
# the 'no_samp' output files wrong.


if True :
    dge_grandtotal = time.time()
    group='sub3'
    fname = '220330_mouse_imp_no_samp_ZPC'

    # Columns of both matrices are labelled with wtt.var_names below, so the
    # two objects must list the same genes in the same order.
    if not wtt.var_names.equals(mutt.var_names) :
        raise ValueError('wtt and mutt have different var_names; genes cannot be compared column-wise.')

    # Tables are collected and concatenated once: DataFrame.append copies the
    # whole frame on every call and no longer exists in pandas >= 2.0.
    dge_parts = []
    for t in ['5wk','12wk','18wk','24wk','30wk'] :
        print('Evaluating {}'.format(t))
        start_t=time.time()

        # up down dichotomy
        print('\n--------')
        print('...')
        print('--------\n')

        for i in ['Z+PC','Z-PC'] :
            start = time.time()
            print('\n{}, WT vs SCA1'.format(i))
            print('----')
            X = wtt[((wtt.obs[group]==i) & (wtt.obs['timepoint']==t)), :].layers['imputed']
            Y = mutt[((mutt.obs[group]==i) & (mutt.obs['timepoint']==t)), :].layers['imputed']

            # np.asarray on a sparse matrix gives a 0-d object array, so densify first
            if sparse.issparse(X) :
                X = X.toarray()
            if sparse.issparse(Y) :
                Y = Y.toarray()
            X = np.asarray(X)
            Y = np.asarray(Y)

            print('    Ncells in X:{}'.format(X.shape[0]))
            print('    Ncells in Y:{}\n'.format(Y.shape[0]))

            # A group missing from either genotype at this timepoint cannot be
            # tested; skip it instead of failing part-way through a long run.
            if X.shape[0]==0 or Y.shape[0]==0 :
                warnings.warn('Skipping {} at {}: no cells in WT or SCA1.'.format(i,t))
                continue

            emd = scprep.stats.differential_expression(X,Y,
                                                       measure = 'emd',
                                                       direction='both',
                                                       gene_names=wtt.var_names,
                                                       n_jobs=-1)

            # mann-whitney u, corrected p-values
            p = mwu(X,Y,wtt.var_names)
            emd['Gene']=emd.index
            emd=emd.drop(columns='rank')
            # Argument order is (Y, X): log2FC = log2(mean SCA1) - log2(mean WT).
            # NOTE(review): the export below labels emd>0 as 'WTup', so log2FC
            # presumably has the opposite sign convention to emd - confirm.
            fc = log2aveFC(Y,X,wtt.var_names.to_list())
            # mwu leaves out genes without a computable p-value; report how many
            n_dropped = int((~fc['Gene'].isin(p['Gene'])).sum())
            if n_dropped>0 :
                warnings.warn('Warning: {} genes dropped due to p-val NA.'.format(n_dropped))
            # left merges keep exactly the genes present in p
            dt = pd.merge(p,fc,how='left',on="Gene")
            dt = pd.merge(dt,emd,how='left',on='Gene')
            dt['Cell type']=i
            dt['timepoint']=str(t)
            # inf when the corrected p-value is exactly 0
            dt['nlog10pvalcorrected']=(-1)*np.log10(dt['pval_corrected'])
            dge_parts.append(dt)

            print('... computed in {:.2f}-s'.format(time.time()-start))
        print('\nFinished timepoint {} in {:.2f}-min'.format(t,(time.time()-start_t)/60))
    dge = pd.concat(dge_parts, ignore_index=True)
    # rows with emd == 0 or NaN end up in neither file
    dgeup = dge.loc[dge['emd']>0,:] # take only 'up' (switch for down)
    dgedown = dge.loc[dge['emd']<0,:] # take only 'down'
    dgeup.to_csv(os.path.join(pfp,'dge_'+fname+'_WTup.csv'),index=False)
    dgedown.to_csv(os.path.join(pfp,'dge_'+fname+'_WTdown.csv'),index=False)


    print('DGE finished in {:.2f}-min'.format((time.time()-dge_grandtotal)/60))


Evaluating 5wk

--------
...
--------


Z+PC, WT vs SCA1
----
    Ncells in X:154
    Ncells in Y:98

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 14.44-s
... 50% completed in 31.45-s
... 75% completed in 51.97-s
... mwu computed in 75.67-s

... computed in 86.48-s

Z-PC, WT vs SCA1
----
    Ncells in X:62
    Ncells in Y:104

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 14.21-s
... 50% completed in 31.14-s
... 75% completed in 51.62-s
... mwu computed in 75.43-s

... computed in 78.74-s

Finished timepoint 5wk in 2.75-min
Evaluating 12wk

--------
...
--------


Z+PC, WT vs SCA1
----
    Ncells in X:90
    Ncells in Y:59

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 14.20-s
... 50% completed in 31.11-s
... 75% completed in 51.53-s
... mwu computed in 75.23-s

... computed in 78.58-s

Z-PC, WT vs SCA1
----
    Ncells in X:22
    Ncells in Y:32

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 14

In [None]:
## EMD; SUB4, IMP, Sampling, PC only
# WT (X) vs SCA1 (Y) differential expression on the 'imputed' layer for the
# 'PC' group of 'sub4' at every timepoint. Unlike the other DGE cells, the
# stacked table is written to a single CSV, not split by the sign of EMD.

if True :
    dge_grandtotal = time.time()
    group='sub4'
    fname = '220401_mouse_imp_samp_PC'

    # Columns of both matrices are labelled with wt.var_names below, so the
    # two objects must list the same genes in the same order.
    if not wt.var_names.equals(mut.var_names) :
        raise ValueError('wt and mut have different var_names; genes cannot be compared column-wise.')

    # Tables are collected and concatenated once: DataFrame.append copies the
    # whole frame on every call and no longer exists in pandas >= 2.0.
    dge_parts = []
    for t in ['5wk','12wk','18wk','24wk','30wk'] :
        print('Evaluating {}'.format(t))
        start_t=time.time()

        # up down dichotomy
        print('\n--------')
        print('...')
        print('--------\n')
        for i in ['PC'] :
            start = time.time()
            print('\n{}, WT vs SCA1'.format(i))
            print('----')
            X = wt[((wt.obs[group]==i) & (wt.obs['timepoint']==t)), :].layers['imputed']
            Y = mut[((mut.obs[group]==i) & (mut.obs['timepoint']==t)), :].layers['imputed']

            # np.asarray on a sparse matrix gives a 0-d object array, so densify first
            if sparse.issparse(X) :
                X = X.toarray()
            if sparse.issparse(Y) :
                Y = Y.toarray()
            X = np.asarray(X)
            Y = np.asarray(Y)

            print('    Ncells in X:{}'.format(X.shape[0]))
            print('    Ncells in Y:{}\n'.format(Y.shape[0]))

            # A group missing from either genotype at this timepoint cannot be
            # tested; skip it instead of failing part-way through a long run.
            if X.shape[0]==0 or Y.shape[0]==0 :
                warnings.warn('Skipping {} at {}: no cells in WT or SCA1.'.format(i,t))
                continue

            emd = scprep.stats.differential_expression(X,Y,
                                                       measure = 'emd',
                                                       direction='both',
                                                       gene_names=wt.var_names,
                                                       n_jobs=-1)

            # mann-whitney u, corrected p-values
            p = mwu(X,Y,wt.var_names)
            emd['Gene']=emd.index
            emd=emd.drop(columns='rank')
            # Argument order is (Y, X): log2FC = log2(mean SCA1) - log2(mean WT).
            # NOTE(review): presumably the opposite sign convention to emd,
            # which is computed as (X, Y) - confirm before interpreting both.
            fc = log2aveFC(Y,X,wt.var_names.to_list())
            # mwu leaves out genes without a computable p-value; report how many
            n_dropped = int((~fc['Gene'].isin(p['Gene'])).sum())
            if n_dropped>0 :
                warnings.warn('Warning: {} genes dropped due to p-val NA.'.format(n_dropped))
            # left merges keep exactly the genes present in p
            dt = pd.merge(p,fc,how='left',on="Gene")
            dt = pd.merge(dt,emd,how='left',on='Gene')
            dt['Cell type']=i
            dt['timepoint']=str(t)
            # inf when the corrected p-value is exactly 0
            dt['nlog10pvalcorrected']=(-1)*np.log10(dt['pval_corrected'])
            dge_parts.append(dt)
            print('... computed in {:.2f}-s'.format(time.time()-start))
        print('\nFinished timepoint {} in {:.2f}-min'.format(t,(time.time()-start_t)/60))
    dge = pd.concat(dge_parts, ignore_index=True)
    # All rows go to one file; filter on the sign of 'emd' downstream if an
    # up/down split is needed.
    dge.to_csv(os.path.join(pfp,'dge_'+fname+'.csv'),index=False)



    print('DGE finished in {:.2f}-min'.format((time.time()-dge_grandtotal)/60))


Evaluating 5wk

--------
...
--------


PC, WT vs SCA1
----
    Ncells in X:133
    Ncells in Y:195

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 14.51-s
... 50% completed in 31.61-s
... 75% completed in 52.27-s
... mwu computed in 76.40-s

... computed in 109.77-s

Finished timepoint 5wk in 1.83-min
Evaluating 12wk

--------
...
--------


PC, WT vs SCA1
----
    Ncells in X:112
    Ncells in Y:89

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 14.23-s
... 50% completed in 31.15-s
... 75% completed in 51.63-s
... mwu computed in 75.44-s

... computed in 78.61-s

Finished timepoint 12wk in 1.31-min
Evaluating 18wk

--------
...
--------


PC, WT vs SCA1
----
    Ncells in X:157
    Ncells in Y:140

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 14.46-s
... 50% completed in 31.59-s
