In [1]:
# Import packages
import os, sys, glob, re, math, pickle
import phate, scprep, magic, meld
import graphtools as gt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import time,random,datetime
import networkx as nx
import scvelo as scv
from sklearn import metrics
from sklearn import model_selection
from scipy import sparse
from scipy.stats import mannwhitneyu, tiecorrect, rankdata
from statsmodels.stats.multitest import multipletests
import scanpy as sc
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import SpectralClustering, OPTICS, cluster_optics_dbscan, AgglomerativeClustering
from bbknn import bbknn
import warnings
%matplotlib inline
%load_ext memory_profiler


In [2]:
import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

# Silence R-side chatter: only messages at ERROR level or above are emitted by
# rpy2's callback logger.
# Note: comment this line out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
# presumably enables the matching AnnData <-> R conversion -- TODO confirm against anndata2ri docs
anndata2ri.activate()
# Load the rpy2 IPython extension.
# NOTE(review): none of the visible cells call into R -- confirm this setup is still needed.
%load_ext rpy2.ipython

In [3]:
# settings
# Matplotlib defaults for publication-style figures (editable text in PDF/PS output).
plt.rc('font', size = 8)
plt.rc('font', family='sans serif')
plt.rcParams['pdf.fonttype']=42
plt.rcParams['ps.fonttype']=42
plt.rcParams['text.usetex']=False
plt.rcParams['legend.frameon']=False
plt.rcParams['axes.grid']=False
plt.rcParams['legend.markerscale']=0.5
sc.set_figure_params(dpi=300,dpi_save=600,
                     frameon=False,
                     fontsize=8)
plt.rcParams['savefig.dpi']=600
sc.settings.verbosity=2
# Use the public settings object; assigning on the private ScanpyConfig class
# replaces the `n_jobs` property itself instead of calling its setter.
sc.settings.n_jobs=-1

# reproducibility
# np.random.seed() returns None, so keep the seed value itself in `rs`
# (usable as a `random_state` argument) and seed the global NumPy RNG with it.
rs = 42
np.random.seed(rs)

# utils
def mwu(X,Y,gene_names,correction=None,debug=False) :
    '''
    Per-gene Mann-Whitney U test between two groups, with multiple-testing correction.

    Arguments
    ---------
    X, Y : 2-D arrays indexed as [:, i]; column i of each is compared for gene_names[i].
        For a single gene expression array, pass x.reshape(-1,1), y.reshape(-1,1).
    gene_names : sequence of gene labels, one per column.
    correction : str or None
        `method` forwarded to statsmodels' `multipletests`. None (default) keeps the
        original Benjamini-Hochberg behaviour ('fdr_bh'); e.g. 'bonferroni' also works.
    debug : bool
        If True, print the genes for which no p-value could be computed.

    Returns
    -------
    pd.DataFrame with columns 'Gene', 'pval', 'pval_corrected'. Genes without a
    computable p-value are dropped; the index is the gene's position in `gene_names`.

    NOTE: p-values can come out as exactly 0 when the difference is very large.
    '''
    method = 'fdr_bh' if correction is None else correction
    if correction is None :
        print('Mann-Whitney U w/Benjamini/Hochberg correction\n')
    else :
        print('Mann-Whitney U w/{} correction\n'.format(method))
    start = time.time()

    # Progress checkpoints are computed once up front (the original recomputed the
    # quantiles for every gene). setdefault keeps the earliest label when two
    # checkpoints land on the same index, matching the original if/elif order.
    n_genes = len(gene_names)
    checkpoints = {}
    if n_genes > 0 :
        positions = np.arange(n_genes)
        for frac,label in ((0.25,'25%'),(0.5,'50%'),(0.75,'75%')) :
            checkpoints.setdefault(int(np.round(np.quantile(positions,frac))),label)

    # Collect results in plain lists; growing a DataFrame row by row is quadratic.
    genes,pvals = [],[]
    for i,g in enumerate(gene_names) :
        if i in checkpoints :
            print('... {} completed in {:.2f}-s'.format(checkpoints[i],time.time()-start))
        x = np.asarray(X[:,i]).ravel()
        y = np.asarray(Y[:,i]).ravel()
        # A tie-correction factor of 0 means every pooled value is identical,
        # so the test statistic is undefined for this gene.
        if tiecorrect(rankdata(np.concatenate((x,y))))==0 :
            if debug :
                print('P-value not calculable for {}'.format(g))
            pval = np.nan
        else :
            _,pval = mannwhitneyu(x,y) # continuity correction is True
        genes.append(g)
        pvals.append(pval)
    print('... mwu computed in {:.2f}-s\n'.format(time.time() - start))

    p = pd.DataFrame({'Gene':genes,'pval':pvals})
    # ignore NaNs, since can't do a comparison on these (changes the number of tests corrected for)
    # .copy() so the new column is written to an independent frame, not a view of `p`
    p_corrected = p.loc[p['pval'].notna(),:].copy()
    if p_corrected.empty :
        # nothing to correct; still return the documented columns
        p_corrected['pval_corrected'] = pd.Series(dtype=float)
        return p_corrected
    new_pvals = multipletests(p_corrected['pval'],method=method)
    p_corrected['pval_corrected'] = new_pvals[1]
    return p_corrected

def log2aveFC(X,Y,gene_names,AnnData=None) :
    '''log2 fold change of per-gene mean expression: log2(mean(X)) - log2(mean(Y)).

    The sign follows the argument order: positive when the mean in X exceeds the mean in Y.

    Arguments
    ---------
    X, Y : 2-D arrays (rows averaged, one column per gene). Pass the full arrays;
        subsetting to `gene_names` is performed here when `AnnData` is given.
    gene_names (list): genes to report. Without `AnnData` it must label every
        column of X/Y in order.
    AnnData (sc.AnnData, optional): object whose `var_names` label the columns of
        X and Y; used to pick the columns listed in `gene_names`.
        NOTE: assumes X,Y columns are ordered like AnnData.var_names.

    Returns
    -------
    pd.DataFrame with columns 'Gene' and 'log2FC'. With `AnnData`, rows follow
    var_names order, not the order of `gene_names`. A negative mean yields NaN
    and a zero mean yields +/-inf.
    '''
    def _col_means(M,cols=None) :
        # flatten so np.matrix / sparse inputs (whose mean is 2-D) give a 1-D column
        sub = M if cols is None else M[:,cols]
        return np.asarray(sub.mean(axis=0)).ravel()

    if AnnData is not None :
        wanted = set(gene_names) # O(1) membership instead of scanning the list per gene
        g_idx = [i for i,g in enumerate(AnnData.var_names) if g in wanted]
        genes = AnnData.var_names[g_idx]
        mean_x,mean_y = _col_means(X,g_idx),_col_means(Y,g_idx)
    else :
        genes = gene_names
        mean_x,mean_y = _col_means(X),_col_means(Y)
    fc=pd.DataFrame({'Gene':genes,
                     'log2FC':np.log2(mean_x) - np.log2(mean_y)})
    return fc

In [4]:
# fps
# The project root defaults to the original cluster location and can be overridden
# with the SCA1_HUMAN_ROOT environment variable when running elsewhere.
# The trailing '' keeps the trailing slash the original path strings had.
_project_root = os.environ.get('SCA1_HUMAN_ROOT', '/home/cl2292/project/SCA1_snRNAseq/Human')
dfp = os.path.join(_project_root, 'data', '')                # raw data
pfp = os.path.join(_project_root, 'results_20230402', '')    # results / figures
pdfp = os.path.join(_project_root, 'data', 'processed', '')  # processed .h5ad files
# Create the results directory up front so the CSV writes at the end of the
# long DGE cells cannot fail on a missing folder.
os.makedirs(pfp, exist_ok=True)
sc.settings.figdir = pfp

In [16]:
# Load Data

if True :
    start = time.time()
    backed=None # None if not
    fname='230502_Ctrl_sampling_UBC_PC_MLI1_MLI2_GoC.h5ad' # for full, can maybe get away with ~300G
    %memit wt = sc.read_h5ad(os.path.join(pdfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))
    
# Load Data

if True :
    start = time.time()
    backed=None # None if not
    fname='230502_SCA1_sampling_UBC_PC_MLI1_MLI2_GoC.h5ad' # for full, can maybe get away with ~300G
    %memit mut = sc.read_h5ad(os.path.join(pdfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))

peak memory: 8163.16 MiB, increment: 932.71 MiB
loaded @230503.10:16:40
took 6.69-s to load data
peak memory: 9450.94 MiB, increment: 1287.77 MiB
loaded @230503.10:16:48
took 8.24-s to load data


In [11]:
# Number of cells per `sub4` label: control first, then SCA1.
print(wt.obs['sub4'].value_counts())
print(mut.obs['sub4'].value_counts())

MLI1    1331
MLI2    1088
PC       370
GoC       44
UBC       43
Name: sub4, dtype: int64
MLI2    1960
MLI1    1684
PC       224
GoC      105
UBC       78
Name: sub4, dtype: int64


In [12]:
## EMD; SUB4, IMP, Sampling, 'UBC','PC','MLI1','MLI2','GoC'
# Per-cell-type differential expression, control (wt, X) vs SCA1 (mut, Y), on the
# 'imputed' layer: signed EMD (scprep), Mann-Whitney U p-values (mwu) and
# log2 fold change (log2aveFC). Results are written as two CSVs split by EMD sign.
# NOTE(review): rows with emd > 0 go to the *_WTup file, while log2FC is computed
# as log2aveFC(Y, X), i.e. SCA1 relative to control -- confirm both sign conventions.

dge_grandtotal = time.time()
group='sub4'
fname = 'hum_imp_samp_UBC_PC_MLI1_MLI2_GoC'
cell_types = ['UBC','PC','MLI1','MLI2','GoC']

# Columns of X and Y are compared by position and labelled with wt.var_names,
# so both objects must list the same genes in the same order.
if not wt.var_names.equals(mut.var_names) :
    raise ValueError('wt and mut have different var_names; genes cannot be compared column-wise.')

dge_parts = [] # one result frame per cell type, concatenated once at the end
for t in ['all'] :
    print('Evaluating {}'.format(t))
    start_t=time.time()

    # up down dichotomy
    print('\n--------')
    print('...')
    print('--------\n')
    for i in cell_types :
        start = time.time()
        print('\n{}, WT vs SCA1'.format(i))
        print('----')
        X = wt[(wt.obs[group]==i), :].layers['imputed']
        Y = mut[(mut.obs[group]==i), :].layers['imputed']

        # densify if the layer happens to be stored sparse (same handling as the non-imputed cells)
        if sparse.issparse(X):
            X = X.toarray()
        if sparse.issparse(Y):
            Y = Y.toarray()
        X = np.asarray(X)
        Y = np.asarray(Y)

        print('    Ncells in X:{}'.format(X.shape[0]))
        print('    Ncells in Y:{}\n'.format(Y.shape[0]))

        emd = scprep.stats.differential_expression(X,Y,
                                                   measure = 'emd',
                                                   direction='both',
                                                   gene_names=wt.var_names,
                                                   n_jobs=-1)
        emd['Gene']=emd.index
        emd=emd.drop(columns='rank')

        # mann-whitney u, corrected p-values
        p = mwu(X,Y,wt.var_names)
        fc = log2aveFC(Y,X,wt.var_names.to_list())

        # mwu drops genes without a computable p-value; warn only when that
        # actually happened (the original condition fired whenever any gene matched).
        n_dropped = int((~fc['Gene'].isin(p['Gene'])).sum())
        if n_dropped > 0 :
            warnings.warn('Warning: {} genes dropped due to p-val NA.'.format(n_dropped))

        # Left merges keyed on `p` keep exactly the genes that have a p-value.
        dt = pd.merge(p,fc,how='left',on="Gene")
        dt = pd.merge(dt,emd,how='left',on='Gene')
        dt['Cell type']=i
        #dt['timepoint']=[str(t)]*dt.shape[0]
        dt['nlog10pvalcorrected']=(-1)*np.log10(dt['pval_corrected'])
        dge_parts.append(dt)
        print('... computed in {:.2f}-s'.format(time.time()-start))
    print('\nFinished timepoint {} in {:.2f}-min'.format(t,(time.time()-start_t)/60))

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
dge = pd.concat(dge_parts, ignore_index=True)
dgeup = dge.loc[dge['emd']>0,:] # take only 'up' (switch for down)
dgedown = dge.loc[dge['emd']<0,:] # take only 'down'
dgeup.to_csv(os.path.join(pfp,'dge_'+fname+'_WTup.csv'),index=False)
dgedown.to_csv(os.path.join(pfp,'dge_'+fname+'_WTdown.csv'),index=False)

print('DGE finished in {:.2f}-min'.format((time.time()-dge_grandtotal)/60))


Evaluating all

--------
...
--------


UBC, WT vs SCA1
----
    Ncells in X:43
    Ncells in Y:78

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 19.67-s
... 50% completed in 43.80-s
... 75% completed in 73.23-s
... mwu computed in 109.19-s

... computed in 118.25-s

PC, WT vs SCA1
----
    Ncells in X:370
    Ncells in Y:224

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 20.38-s
... 50% completed in 45.36-s
... 75% completed in 75.87-s
... mwu computed in 112.75-s

... computed in 119.25-s

MLI1, WT vs SCA1
----
    Ncells in X:1331
    Ncells in Y:1684

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 25.97-s
... 50% completed in 56.41-s
... 75% completed in 92.31-s
... mwu computed in 134.62-s

... computed in 144.77-s

MLI2, WT vs SCA1
----
    Ncells in X:1088
    Ncells in Y:1960

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 25.77-s
... 50% completed in 56.06-s
... 75% completed in 91.82-s
.

In [17]:
## EMD; SUB4, Non_IMP, Sampling, 'UBC','PC','MLI1','MLI2','GoC'
# Per-cell-type differential expression, control (wt, X) vs SCA1 (mut, Y), on `.X`:
# signed EMD (scprep), Mann-Whitney U p-values (mwu) and log2 fold change
# (log2aveFC). Results are written as two CSVs split by EMD sign.
# NOTE(review): rows with emd > 0 go to the *_WTup file, while log2FC is computed
# as log2aveFC(Y, X), i.e. SCA1 relative to control -- confirm both sign conventions.

dge_grandtotal = time.time()
group='sub4'
fname = 'hum_no_imp_samp_UBC_PC_MLI1_MLI2_GoC'
cell_types = ['UBC','PC','MLI1','MLI2','GoC']

# Columns of X and Y are compared by position and labelled with wt.var_names,
# so both objects must list the same genes in the same order.
if not wt.var_names.equals(mut.var_names) :
    raise ValueError('wt and mut have different var_names; genes cannot be compared column-wise.')

dge_parts = [] # one result frame per cell type, concatenated once at the end
for t in ['all'] :
    print('Evaluating {}'.format(t))
    start_t=time.time()

    # up down dichotomy
    print('\n--------')
    print('...')
    print('--------\n')
    for i in cell_types :
        start = time.time()
        print('\n{}, WT vs SCA1'.format(i))
        print('----')
        # plain boolean arrays index dense and sparse matrices alike
        X = wt.X[(wt.obs[group]==i).to_numpy(), :]
        Y = mut.X[(mut.obs[group]==i).to_numpy(), :]

        if sparse.issparse(X):
            X = X.toarray()
        if sparse.issparse(Y):
            Y = Y.toarray()
        X = np.asarray(X)
        Y = np.asarray(Y)

        print('    Ncells in X:{}'.format(X.shape[0]))
        print('    Ncells in Y:{}\n'.format(Y.shape[0]))

        emd = scprep.stats.differential_expression(X,Y,
                                                   measure = 'emd',
                                                   direction='both',
                                                   gene_names=wt.var_names,
                                                   n_jobs=-1)
        emd['Gene']=emd.index
        emd=emd.drop(columns='rank')

        # mann-whitney u, corrected p-values
        p = mwu(X,Y,wt.var_names)
        fc = log2aveFC(Y,X,wt.var_names.to_list())

        # mwu drops genes without a computable p-value; warn only when that
        # actually happened (the original condition fired whenever any gene matched).
        n_dropped = int((~fc['Gene'].isin(p['Gene'])).sum())
        if n_dropped > 0 :
            warnings.warn('Warning: {} genes dropped due to p-val NA.'.format(n_dropped))

        # Left merges keyed on `p` keep exactly the genes that have a p-value.
        dt = pd.merge(p,fc,how='left',on="Gene")
        dt = pd.merge(dt,emd,how='left',on='Gene')
        dt['Cell type']=i
        #dt['timepoint']=[str(t)]*dt.shape[0]
        dt['nlog10pvalcorrected']=(-1)*np.log10(dt['pval_corrected'])
        dge_parts.append(dt)
        print('... computed in {:.2f}-s'.format(time.time()-start))
    print('\nFinished timepoint {} in {:.2f}-min'.format(t,(time.time()-start_t)/60))

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
dge = pd.concat(dge_parts, ignore_index=True)
dgeup = dge.loc[dge['emd']>0,:] # take only 'up' (switch for down)
dgedown = dge.loc[dge['emd']<0,:] # take only 'down'
dgeup.to_csv(os.path.join(pfp,'dge_'+fname+'_WTup.csv'),index=False)
dgedown.to_csv(os.path.join(pfp,'dge_'+fname+'_WTdown.csv'),index=False)

print('DGE finished in {:.2f}-min'.format((time.time()-dge_grandtotal)/60))


Evaluating all

--------
...
--------


UBC, WT vs SCA1
----
    Ncells in X:43
    Ncells in Y:78

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 18.97-s
... 50% completed in 42.45-s
... 75% completed in 71.59-s
... mwu computed in 107.10-s

... computed in 114.23-s

PC, WT vs SCA1
----
    Ncells in X:370
    Ncells in Y:224

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 19.09-s
... 50% completed in 42.83-s
... 75% completed in 72.19-s
... mwu computed in 108.07-s

... computed in 112.62-s

MLI1, WT vs SCA1
----
    Ncells in X:1331
    Ncells in Y:1684

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 22.16-s
... 50% completed in 48.94-s
... 75% completed in 81.21-s
... mwu computed in 119.98-s

... computed in 127.53-s

MLI2, WT vs SCA1
----
    Ncells in X:1088
    Ncells in Y:1960

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 21.91-s
... 50% completed in 48.35-s
... 75% completed in 80.23-s
.

In [5]:
# Load Data

if True :
    start = time.time()
    backed=None # None if not
    fname='230502_Ctrl_sampling_AS BG OPC OL MG PER END.h5ad' # for full, can maybe get away with ~300G
    %memit wt = sc.read_h5ad(os.path.join(pdfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))
    
# Load Data

if True :
    start = time.time()
    backed=None # None if not
    fname='230502_SCA1_sampling_AS BG OPC OL MG PER END.h5ad' # for full, can maybe get away with ~300G
    %memit mut = sc.read_h5ad(os.path.join(pdfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))

peak memory: 2939.37 MiB, increment: 2635.01 MiB
loaded @230503.10:49:02
took 3.49-s to load data
peak memory: 7085.73 MiB, increment: 4146.36 MiB
loaded @230503.10:49:07
took 4.70-s to load data


In [6]:
# Number of cells per `sub4` label: control first, then SCA1.
print(wt.obs['sub4'].value_counts())
print(mut.obs['sub4'].value_counts())

BG     2445
OL     2367
AS     1648
OPC     713
MG      638
END     365
PER      73
Name: sub4, dtype: int64
BG     4384
AS     3367
OL     1946
MG     1315
END     967
OPC     879
PER     254
Name: sub4, dtype: int64


In [20]:
## EMD; SUB4, IMP, Sampling, 'AS','BG','OPC','OL','MG','PER','END'
# Per-cell-type differential expression, control (wt, X) vs SCA1 (mut, Y), on the
# 'imputed' layer: signed EMD (scprep), Mann-Whitney U p-values (mwu) and
# log2 fold change (log2aveFC). Results are written as two CSVs split by EMD sign.
# NOTE(review): rows with emd > 0 go to the *_WTup file, while log2FC is computed
# as log2aveFC(Y, X), i.e. SCA1 relative to control -- confirm both sign conventions.

dge_grandtotal = time.time()
group='sub4'
fname = 'hum_imp_samp_AS_BG_OPC_OL_MG_PER_END'
cell_types = ['AS','BG','OPC','OL','MG','PER','END']

# Columns of X and Y are compared by position and labelled with wt.var_names,
# so both objects must list the same genes in the same order.
if not wt.var_names.equals(mut.var_names) :
    raise ValueError('wt and mut have different var_names; genes cannot be compared column-wise.')

dge_parts = [] # one result frame per cell type, concatenated once at the end
for t in ['all'] :
    print('Evaluating {}'.format(t))
    start_t=time.time()

    # up down dichotomy
    print('\n--------')
    print('...')
    print('--------\n')
    for i in cell_types :
        start = time.time()
        print('\n{}, WT vs SCA1'.format(i))
        print('----')
        X = wt[(wt.obs[group]==i), :].layers['imputed']
        Y = mut[(mut.obs[group]==i), :].layers['imputed']

        # densify if the layer happens to be stored sparse (same handling as the non-imputed cells)
        if sparse.issparse(X):
            X = X.toarray()
        if sparse.issparse(Y):
            Y = Y.toarray()
        X = np.asarray(X)
        Y = np.asarray(Y)

        print('    Ncells in X:{}'.format(X.shape[0]))
        print('    Ncells in Y:{}\n'.format(Y.shape[0]))

        emd = scprep.stats.differential_expression(X,Y,
                                                   measure = 'emd',
                                                   direction='both',
                                                   gene_names=wt.var_names,
                                                   n_jobs=-1)
        emd['Gene']=emd.index
        emd=emd.drop(columns='rank')

        # mann-whitney u, corrected p-values
        p = mwu(X,Y,wt.var_names)
        fc = log2aveFC(Y,X,wt.var_names.to_list())

        # mwu drops genes without a computable p-value; warn only when that
        # actually happened (the original condition fired whenever any gene matched).
        n_dropped = int((~fc['Gene'].isin(p['Gene'])).sum())
        if n_dropped > 0 :
            warnings.warn('Warning: {} genes dropped due to p-val NA.'.format(n_dropped))

        # Left merges keyed on `p` keep exactly the genes that have a p-value.
        dt = pd.merge(p,fc,how='left',on="Gene")
        dt = pd.merge(dt,emd,how='left',on='Gene')
        dt['Cell type']=i
        #dt['timepoint']=[str(t)]*dt.shape[0]
        dt['nlog10pvalcorrected']=(-1)*np.log10(dt['pval_corrected'])
        dge_parts.append(dt)
        print('... computed in {:.2f}-s'.format(time.time()-start))
    print('\nFinished timepoint {} in {:.2f}-min'.format(t,(time.time()-start_t)/60))

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
dge = pd.concat(dge_parts, ignore_index=True)
dgeup = dge.loc[dge['emd']>0,:] # take only 'up' (switch for down)
dgedown = dge.loc[dge['emd']<0,:] # take only 'down'
dgeup.to_csv(os.path.join(pfp,'dge_'+fname+'_WTup.csv'),index=False)
dgedown.to_csv(os.path.join(pfp,'dge_'+fname+'_WTdown.csv'),index=False)

print('DGE finished in {:.2f}-min'.format((time.time()-dge_grandtotal)/60))


Evaluating all

--------
...
--------


AS, WT vs SCA1
----
    Ncells in X:1648
    Ncells in Y:3367

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 30.69-s
... 50% completed in 65.95-s
... 75% completed in 106.82-s
... mwu computed in 153.97-s

... computed in 168.50-s

BG, WT vs SCA1
----
    Ncells in X:2445
    Ncells in Y:4384

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 34.81-s
... 50% completed in 73.92-s
... 75% completed in 118.71-s
... mwu computed in 169.78-s

... computed in 188.82-s

OPC, WT vs SCA1
----
    Ncells in X:713
    Ncells in Y:879

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 22.76-s
... 50% completed in 50.14-s
... 75% completed in 83.14-s
... mwu computed in 122.60-s

... computed in 129.52-s

OL, WT vs SCA1
----
    Ncells in X:2367
    Ncells in Y:1946

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 28.43-s
... 50% completed in 61.42-s
... 75% completed in 100.03-

In [7]:
## EMD; SUB4, Non_IMP, Sampling, 'AS','BG','OPC','OL','MG','PER','END'
# Per-cell-type differential expression, control (wt, X) vs SCA1 (mut, Y), on `.X`:
# signed EMD (scprep), Mann-Whitney U p-values (mwu) and log2 fold change
# (log2aveFC). Results are written as two CSVs split by EMD sign.
# NOTE(review): rows with emd > 0 go to the *_WTup file, while log2FC is computed
# as log2aveFC(Y, X), i.e. SCA1 relative to control -- confirm both sign conventions.

dge_grandtotal = time.time()
group='sub4'
fname = 'hum_no_imp_samp_AS_BG_OPC_OL_MG_PER_END'
cell_types = ['AS','BG','OPC','OL','MG','PER','END']

# Columns of X and Y are compared by position and labelled with wt.var_names,
# so both objects must list the same genes in the same order.
if not wt.var_names.equals(mut.var_names) :
    raise ValueError('wt and mut have different var_names; genes cannot be compared column-wise.')

dge_parts = [] # one result frame per cell type, concatenated once at the end
for t in ['all'] :
    print('Evaluating {}'.format(t))
    start_t=time.time()

    # up down dichotomy
    print('\n--------')
    print('...')
    print('--------\n')
    for i in cell_types :
        start = time.time()
        print('\n{}, WT vs SCA1'.format(i))
        print('----')
        # plain boolean arrays index dense and sparse matrices alike
        X = wt.X[(wt.obs[group]==i).to_numpy(), :]
        Y = mut.X[(mut.obs[group]==i).to_numpy(), :]

        if sparse.issparse(X):
            X = X.toarray()
        if sparse.issparse(Y):
            Y = Y.toarray()
        X = np.asarray(X)
        Y = np.asarray(Y)

        print('    Ncells in X:{}'.format(X.shape[0]))
        print('    Ncells in Y:{}\n'.format(Y.shape[0]))

        emd = scprep.stats.differential_expression(X,Y,
                                                   measure = 'emd',
                                                   direction='both',
                                                   gene_names=wt.var_names,
                                                   n_jobs=-1)
        emd['Gene']=emd.index
        emd=emd.drop(columns='rank')

        # mann-whitney u, corrected p-values
        p = mwu(X,Y,wt.var_names)
        fc = log2aveFC(Y,X,wt.var_names.to_list())

        # mwu drops genes without a computable p-value; warn only when that
        # actually happened (the original condition fired whenever any gene matched).
        n_dropped = int((~fc['Gene'].isin(p['Gene'])).sum())
        if n_dropped > 0 :
            warnings.warn('Warning: {} genes dropped due to p-val NA.'.format(n_dropped))

        # Left merges keyed on `p` keep exactly the genes that have a p-value.
        dt = pd.merge(p,fc,how='left',on="Gene")
        dt = pd.merge(dt,emd,how='left',on='Gene')
        dt['Cell type']=i
        #dt['timepoint']=[str(t)]*dt.shape[0]
        dt['nlog10pvalcorrected']=(-1)*np.log10(dt['pval_corrected'])
        dge_parts.append(dt)
        print('... computed in {:.2f}-s'.format(time.time()-start))
    print('\nFinished timepoint {} in {:.2f}-min'.format(t,(time.time()-start_t)/60))

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
dge = pd.concat(dge_parts, ignore_index=True)
dgeup = dge.loc[dge['emd']>0,:] # take only 'up' (switch for down)
dgedown = dge.loc[dge['emd']<0,:] # take only 'down'
dgeup.to_csv(os.path.join(pfp,'dge_'+fname+'_WTup.csv'),index=False)
dgedown.to_csv(os.path.join(pfp,'dge_'+fname+'_WTdown.csv'),index=False)

print('DGE finished in {:.2f}-min'.format((time.time()-dge_grandtotal)/60))


Evaluating all

--------
...
--------


AS, WT vs SCA1
----
    Ncells in X:1648
    Ncells in Y:3367

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 23.75-s
... 50% completed in 51.99-s
... 75% completed in 85.62-s
... mwu computed in 125.44-s

... computed in 137.72-s

BG, WT vs SCA1
----
    Ncells in X:2445
    Ncells in Y:4384

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 25.43-s
... 50% completed in 55.53-s
... 75% completed in 91.01-s
... mwu computed in 132.40-s

... computed in 144.75-s

OPC, WT vs SCA1
----
    Ncells in X:713
    Ncells in Y:879

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 19.99-s
... 50% completed in 44.52-s
... 75% completed in 74.61-s
... mwu computed in 110.92-s

... computed in 116.50-s

OL, WT vs SCA1
----
    Ncells in X:2367
    Ncells in Y:1946

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 22.63-s
... 50% completed in 49.83-s
... 75% completed in 82.64-s
.

In [5]:
# Load Data

if True :
    start = time.time()
    backed=None # None if not
    fname='230503_Ctrl_sampling_GC50000.h5ad' # for full, can maybe get away with ~300G
    %memit wt = sc.read_h5ad(os.path.join(pdfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))
    
# Load Data

if True :
    start = time.time()
    backed=None # None if not
    fname='230503_SCA1_sampling_GC50000.h5ad' # for full, can maybe get away with ~300G
    %memit mut = sc.read_h5ad(os.path.join(pdfp,fname),backed=backed)
    print('loaded @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('took {:.2f}-s to load data'.format(time.time()-start))

peak memory: 15633.19 MiB, increment: 15331.92 MiB
loaded @230503.12:28:23
took 15.11-s to load data
peak memory: 31084.44 MiB, increment: 15451.24 MiB
loaded @230503.12:28:39
took 15.53-s to load data


In [6]:
# Number of cells per `sub4` label: control first, then SCA1.
print(wt.obs['sub4'].value_counts())
print(mut.obs['sub4'].value_counts())

GC    50000
Name: sub4, dtype: int64
GC    50000
Name: sub4, dtype: int64


In [8]:
## EMD; SUB4, IMP, Sampling, 'GC'
# Differential expression for GC, control (wt, X) vs SCA1 (mut, Y), on the
# 'imputed' layer: signed EMD (scprep), Mann-Whitney U p-values (mwu) and
# log2 fold change (log2aveFC). Results are written as two CSVs split by EMD sign.
# NOTE(review): rows with emd > 0 go to the *_WTup file, while log2FC is computed
# as log2aveFC(Y, X), i.e. SCA1 relative to control -- confirm both sign conventions.

dge_grandtotal = time.time()
group='sub4'
fname = 'hum_imp_samp_GC'
cell_types = ['GC']

# Columns of X and Y are compared by position and labelled with wt.var_names,
# so both objects must list the same genes in the same order.
if not wt.var_names.equals(mut.var_names) :
    raise ValueError('wt and mut have different var_names; genes cannot be compared column-wise.')

dge_parts = [] # one result frame per cell type, concatenated once at the end
for t in ['all'] :
    print('Evaluating {}'.format(t))
    start_t=time.time()

    # up down dichotomy
    print('\n--------')
    print('...')
    print('--------\n')
    for i in cell_types :
        start = time.time()
        print('\n{}, WT vs SCA1'.format(i))
        print('----')
        X = wt[(wt.obs[group]==i), :].layers['imputed']
        Y = mut[(mut.obs[group]==i), :].layers['imputed']

        # densify if the layer happens to be stored sparse (same handling as the non-imputed cells)
        if sparse.issparse(X):
            X = X.toarray()
        if sparse.issparse(Y):
            Y = Y.toarray()
        X = np.asarray(X)
        Y = np.asarray(Y)

        print('    Ncells in X:{}'.format(X.shape[0]))
        print('    Ncells in Y:{}\n'.format(Y.shape[0]))

        emd = scprep.stats.differential_expression(X,Y,
                                                   measure = 'emd',
                                                   direction='both',
                                                   gene_names=wt.var_names,
                                                   n_jobs=-1)
        emd['Gene']=emd.index
        emd=emd.drop(columns='rank')

        # mann-whitney u, corrected p-values
        p = mwu(X,Y,wt.var_names)
        fc = log2aveFC(Y,X,wt.var_names.to_list())

        # mwu drops genes without a computable p-value; warn only when that
        # actually happened (the original condition fired whenever any gene matched).
        n_dropped = int((~fc['Gene'].isin(p['Gene'])).sum())
        if n_dropped > 0 :
            warnings.warn('Warning: {} genes dropped due to p-val NA.'.format(n_dropped))

        # Left merges keyed on `p` keep exactly the genes that have a p-value.
        dt = pd.merge(p,fc,how='left',on="Gene")
        dt = pd.merge(dt,emd,how='left',on='Gene')
        dt['Cell type']=i
        #dt['timepoint']=[str(t)]*dt.shape[0]
        dt['nlog10pvalcorrected']=(-1)*np.log10(dt['pval_corrected'])
        dge_parts.append(dt)
        print('... computed in {:.2f}-s'.format(time.time()-start))
    print('\nFinished timepoint {} in {:.2f}-min'.format(t,(time.time()-start_t)/60))

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
dge = pd.concat(dge_parts, ignore_index=True)
dgeup = dge.loc[dge['emd']>0,:] # take only 'up' (switch for down)
dgedown = dge.loc[dge['emd']<0,:] # take only 'down'
dgeup.to_csv(os.path.join(pfp,'dge_'+fname+'_WTup.csv'),index=False)
dgedown.to_csv(os.path.join(pfp,'dge_'+fname+'_WTdown.csv'),index=False)

print('DGE finished in {:.2f}-min'.format((time.time()-dge_grandtotal)/60))


Evaluating all

--------
...
--------


GC, WT vs SCA1
----
    Ncells in X:50000
    Ncells in Y:50000

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 372.54-s
... 50% completed in 750.65-s
... 75% completed in 1141.34-s
... mwu computed in 1553.64-s

... computed in 1947.20-s

Finished timepoint all in 32.45-min
DGE finished in 32.46-min


In [7]:
## EMD; SUB4, Non_IMP, Sampling, 'GC'
# Differential expression for GC, control (wt, X) vs SCA1 (mut, Y), on `.X`:
# signed EMD (scprep), Mann-Whitney U p-values (mwu) and log2 fold change
# (log2aveFC). Results are written as two CSVs split by EMD sign.
# NOTE(review): rows with emd > 0 go to the *_WTup file, while log2FC is computed
# as log2aveFC(Y, X), i.e. SCA1 relative to control -- confirm both sign conventions.

dge_grandtotal = time.time()
group='sub4'
fname = 'hum_no_imp_samp_GC'
cell_types = ['GC']

# Columns of X and Y are compared by position and labelled with wt.var_names,
# so both objects must list the same genes in the same order.
if not wt.var_names.equals(mut.var_names) :
    raise ValueError('wt and mut have different var_names; genes cannot be compared column-wise.')

dge_parts = [] # one result frame per cell type, concatenated once at the end
for t in ['all'] :
    print('Evaluating {}'.format(t))
    start_t=time.time()

    # up down dichotomy
    print('\n--------')
    print('...')
    print('--------\n')
    for i in cell_types :
        start = time.time()
        print('\n{}, WT vs SCA1'.format(i))
        print('----')
        # plain boolean arrays index dense and sparse matrices alike
        X = wt.X[(wt.obs[group]==i).to_numpy(), :]
        Y = mut.X[(mut.obs[group]==i).to_numpy(), :]

        if sparse.issparse(X):
            X = X.toarray()
        if sparse.issparse(Y):
            Y = Y.toarray()
        X = np.asarray(X)
        Y = np.asarray(Y)

        print('    Ncells in X:{}'.format(X.shape[0]))
        print('    Ncells in Y:{}\n'.format(Y.shape[0]))

        emd = scprep.stats.differential_expression(X,Y,
                                                   measure = 'emd',
                                                   direction='both',
                                                   gene_names=wt.var_names,
                                                   n_jobs=-1)
        emd['Gene']=emd.index
        emd=emd.drop(columns='rank')

        # mann-whitney u, corrected p-values
        p = mwu(X,Y,wt.var_names)
        fc = log2aveFC(Y,X,wt.var_names.to_list())

        # mwu drops genes without a computable p-value; warn only when that
        # actually happened (the original condition fired whenever any gene matched).
        n_dropped = int((~fc['Gene'].isin(p['Gene'])).sum())
        if n_dropped > 0 :
            warnings.warn('Warning: {} genes dropped due to p-val NA.'.format(n_dropped))

        # Left merges keyed on `p` keep exactly the genes that have a p-value.
        dt = pd.merge(p,fc,how='left',on="Gene")
        dt = pd.merge(dt,emd,how='left',on='Gene')
        dt['Cell type']=i
        #dt['timepoint']=[str(t)]*dt.shape[0]
        dt['nlog10pvalcorrected']=(-1)*np.log10(dt['pval_corrected'])
        dge_parts.append(dt)
        print('... computed in {:.2f}-s'.format(time.time()-start))
    print('\nFinished timepoint {} in {:.2f}-min'.format(t,(time.time()-start_t)/60))

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
dge = pd.concat(dge_parts, ignore_index=True)
dgeup = dge.loc[dge['emd']>0,:] # take only 'up' (switch for down)
dgedown = dge.loc[dge['emd']<0,:] # take only 'down'
dgeup.to_csv(os.path.join(pfp,'dge_'+fname+'_WTup.csv'),index=False)
dgedown.to_csv(os.path.join(pfp,'dge_'+fname+'_WTdown.csv'),index=False)

print('DGE finished in {:.2f}-min'.format((time.time()-dge_grandtotal)/60))


Evaluating all

--------
...
--------


GC, WT vs SCA1
----
    Ncells in X:50000
    Ncells in Y:50000

Mann-Whitney U w/Benjamini/Hochberg correction

... 25% completed in 122.30-s
... 50% completed in 249.43-s
... 75% completed in 381.25-s
... mwu computed in 517.81-s

... computed in 674.33-s

Finished timepoint all in 11.24-min
DGE finished in 11.24-min
