# Set-up

In [1]:
import os
import sys
import yaml
import logging
import mudata
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from typing import List, Dict, Tuple, Union, Optional, Literal

from scipy import stats
from statsmodels.stats.multitest import multipletests

# Change path to wherever you have repo locally
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation')

from src.evaluation import (
    compute_categorical_association,
    compute_geneset_enrichment,
    compute_trait_enrichment,
    compute_perturbation_association,
    compute_explained_variance_ratio,
    compute_motif_enrichment
)
from src.evaluation.enrichment_trait import process_enrichment_data

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [130]:
import os
import argparse

import mudata
import numpy as np
import pandas as pd

from scipy import stats, sparse
from scikit_posthocs import posthoc_dscf, posthoc_conover, posthoc_dunn

from joblib import Parallel, delayed
from tqdm.auto import tqdm

import logging
logging.basicConfig(level = logging.INFO)

In [9]:
# I/O paths
path_config = "/cellar/users/aklie/opt/gene_program_evaluation/app/tests/evaluation_pipeline.yml"
config = yaml.safe_load(open(path_config))

## I/O

In [179]:
io_config = config['io']
io_config

{'path_mdata': '/cellar/users/aklie/opt/gene_program_evaluation/examples/inference/iPSC_EC/cNMF/cNMF_30_0.2_gene_names.h5mu',
 'path_out': '/cellar/users/aklie/opt/gene_program_evaluation/app/examples/evaluation/iPSC_EC/cNMF_30',
 'data_key': 'rna',
 'prog_key': 'cNMF'}

In [180]:
# Load mdata
path_mdata = io_config['path_mdata']
mdata = mudata.read(path_mdata)
mdata

  utils.warn_names_duplicates("var")


In [181]:
prog_key = "cNMF"
data_key = "rna"

In [182]:
# choose the first 3 programs in prog_key
prog_names = list(mdata.mod[prog_key].var_names)[:3]
mdata.mod[prog_key] = mdata.mod[prog_key][:, prog_names]
mdata

# Categorical association testing

In [183]:
categorical_association_config = config['categorical_association']
categorical_association_config

{'categorical_key': 'sample',
 'pseudobulk_key': None,
 'test': 'correlation',
 'n_jobs': -1,
 'inplace': False}

In [201]:
categorical_key = categorical_association_config['categorical_key']
pseudobulk_key = categorical_association_config['pseudobulk_key']


## Update `perform_correlation`

In [236]:
from statsmodels.stats.multitest import fdrcorrection
from typing import Literal


def perform_correlation(
    prog_df: pd.DataFrame,
    group_col: str,
    val_col: str,
    low_threshold: float=0.0,
    correlation: Literal['pearsonr', 'spearmanr', 'kendalltau']='pearsonr',
    mode: Literal['one_vs_all', 'one_vs_one']='one_vs_all',
    df = [],
):
    """Perform post-hoc analysis with correlation tests

    Parameters
    ----------
    prog_df : pd.DataFrame
        DataFrame containing program scores and categorical information.
    group_col : str
        Column name of the categorical information.
    val_col : str
        Column name of the program scores.
    low_threshold : float
        Threshold to remove cells with low program scores.
    correlation : str
        Type of correlation test to perform.
    mode : str
        Type of correlation test to perform.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the results of the correlation tests:
        - program_name
        - {group_col}_{group_col_category_1}_{correlation}_stat
        - {group_col}_{group_col_category_1}_{correlation}_pval
        - {group_col}_{group_col_category_1}_{correlation}_adj_pval
        - {group_col}_{group_col_category_1}_log2FC
    """

    # Get unique categories
    categories = prog_df[group_col].unique().tolist()

    # Remove cell memberships that are below the threshold
    prog_df = prog_df.loc[prog_df[val_col] > low_threshold]
    
    # Perform correlation tests
    if mode=='one_vs_all':
        stats_df = pd.DataFrame(index=categories, columns=['stat', 'pval', 'adj_pval', "log2FC"], dtype=float)
        for idx in stats_df.index.values:
            if correlation=='pearsonr':
                prog_df['binarized'] = prog_df[group_col].apply(lambda x: 1 if x==idx else 0)
                stat, pval = stats.pearsonr(prog_df[val_col], prog_df['binarized'])
                _, adj_pval = fdrcorrection([pval])
                log2FC = np.log2(prog_df.loc[prog_df[group_col]==idx, val_col].mean() / prog_df.loc[prog_df[group_col]!=idx, val_col].mean())
                stats_df.loc[idx, 'stat'] = stat
                stats_df.loc[idx, 'pval'] = pval
                stats_df.loc[idx, 'adj_pval'] = adj_pval
                stats_df.loc[idx, 'log2FC'] = log2FC
            elif correlation=='spearmanr':
                stat, pval = stats.spearmanr(prog_df[val_col], prog_df[group_col].astype('category').cat.codes)
                _, adj_pval = fdrcorrection([pval])
                log2FC = np.log2(prog_df.loc[prog_df[group_col]==idx, val_col].mean() / prog_df.loc[prog_df[group_col]!=idx, val_col].mean())
                stats_df.loc[idx, 'stat'] = stat
                stats_df.loc[idx, 'pval'] = pval
                stats_df.loc[idx, 'adj_pval'] = adj_pval
                stats_df.loc[idx, 'log2FC'] = log2FC
            elif correlation=='kendalltau':
                stat, pval = stats.kendalltau(prog_df[val_col], prog_df[group_col].astype('category').cat.codes)
                _, adj_pval = fdrcorrection([pval])
                log2FC = np.log2(prog_df.loc[prog_df[group_col]==idx, val_col].mean() / prog_df.loc[prog_df[group_col]!=idx, val_col].mean())
                stats_df.loc[idx, 'stat'] = stat
                stats_df.loc[idx, 'pval'] = pval
                stats_df.loc[idx, 'adj_pval'] = adj_pval
                stats_df.loc[idx, 'log2FC'] = log2FC
            
        # Format
        wide_df = pd.DataFrame()
        columns = ['stat', 'pval', 'adj_pval', 'log2FC']
        for idx, row in stats_df.iterrows():
            for col in columns:
                wide_df[f'{group_col}_{idx}_{correlation}_{col}'] = [row[col]]
        wide_df.index = [val_col]
        wide_df.index.name = 'program_name'
        df.append(wide_df)
    
    elif mode=='one_vs_one':
        pvals = pd.DataFrame(index=categories, columns=categories)

        for idx in pvals.index.values:
            for col in pvals.columns.values:
                if idx==col:
                    continue
                else:
                    test_df = prog_df.loc[prog_df[group_col].isin([idx, col])]
                    if correlation=='pearsonr':
                        stat, pval = stats.pearsonr(test_df[val_col], 
                                                    test_df[group_col].astype('category').cat.codes)
                        pvals.loc[idx, col] = pval
                    elif correlation=='kendalltau':
                        stat, pval = stats.kendalltau(test_df[val_col], 
                                                      test_df[group_col].astype('category').cat.codes)
                        pvals.loc[idx, col] = pval

        return pvals

In [241]:
df = []
for prog_name in prog_names:
    
    # Get the program scores for the selected program
    prog_data_ = mdata[prog_key][:, prog_name].X
    if sparse.issparse(prog_data_):
        prog_data_ = prog_data_.toarray()

    # Turn into a dataframe
    prog_df = pd.DataFrame(
        prog_data_, 
        columns=[prog_name], 
        index=mdata[prog_key].obs.index
    )
    prog_df[categorical_key] = mdata[prog_key].obs[categorical_key].astype(str).values

    # Perform correlation test
    perform_correlation(prog_df, val_col=prog_name, group_col=categorical_key, correlation='pearsonr', mode='one_vs_all', low_threshold=0.0, df=df)
df = pd.concat(df, axis=0)
df

Unnamed: 0_level_0,sample_D0_pearsonr_stat,sample_D0_pearsonr_pval,sample_D0_pearsonr_adj_pval,sample_D0_pearsonr_log2FC,sample_sample_D1_pearsonr_stat,sample_sample_D1_pearsonr_pval,sample_sample_D1_pearsonr_adj_pval,sample_sample_D1_pearsonr_log2FC,sample_sample_D2_pearsonr_stat,sample_sample_D2_pearsonr_pval,sample_sample_D2_pearsonr_adj_pval,sample_sample_D2_pearsonr_log2FC,sample_sample_D3_pearsonr_stat,sample_sample_D3_pearsonr_pval,sample_sample_D3_pearsonr_adj_pval,sample_sample_D3_pearsonr_log2FC
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0.870169,0.0,0.0,3.6224,-0.412455,0.0,0.0,-2.795765,-0.3458,0.0,0.0,-3.154612,-0.512979,0.0,0.0,-3.659611
1,-0.458007,0.0,0.0,-3.753409,-0.270776,0.0,0.0,-3.648265,-0.255153,0.0,0.0,-2.632564,0.732122,0.0,0.0,3.985067
2,0.32526,0.0,0.0,0.782145,0.072408,9.370198e-82,9.370198e-82,0.225597,-0.073127,2.36316e-83,2.36316e-83,-0.264392,-0.354213,0.0,0.0,-1.170326


In [242]:
perform_correlation(prog_df, val_col="2", group_col=categorical_key, correlation='pearsonr', mode='one_vs_one', low_threshold=0.0)

Unnamed: 0,D0,sample_D1,sample_D2,sample_D3
D0,,0.0,0.0,0.0
sample_D1,0.0,,0.0,0.0
sample_D2,0.0,0.0,,0.0
sample_D3,0.0,0.0,0.0,


## Perform posthoc

In [243]:
# Perfom posthoc test and compute categorical-program score
def perform_posthoc(mdata, prog_key='prog', prog_name=None,
                    categorical_key='batch', pseudobulk_key='sample',
                    test='dunn', mode='one_vs_one', df=[]):

    """
    Performs post-hoc tests for Kruskall-Wallis test and 
    pairwise p-vals b/w categorical levels are reduced via mean and min.
    If pseudobulk key is provided then data is averaged over those levels.
    Mudata object is updated in place with KW test results. 

    ARGS
        mdata : MuData
            mudata object containing anndata of program scores and cell-level metadata.
        prog_key: 
            index for the anndata object (mdata[prog_key]) in the mudata object.
        prog_name: str
            index of the feature (mdata[prog_key][:, prog_name]) which is the response variable.
        categorical_key: str
            index of the categorical levels (mdata[prog_key].obs[categorical_key]) being tested.
        pseudobulk_key: str (optional)
            index of the feature summarisation (mean) levels (mdata[prog_key].obs[pseudobulk_key]).
        test: {'conover','dunn', 'dscf', 'pearsonr', 'kendalltau'}
            posthoc test to use to test categorical levels.
    
    UPDATES
        if pseudobulk_key is None:
            store_key = categorical_key
        else:
            store_key = '_'.join(categorical_key, pseudobulk_key)
        mdata[prog_key].varm['{}_association_{}_min_pval'.format(store_key, test)]
        mdata[prog_key].varm['{}_association_{}_mean_pval'.format(store_key, test)] 
        mdata[prog_key].uns['{}_association_{}_pvals'.format(store_key, test)]

    """
    
    store_key = categorical_key

    prog_data_ = mdata[prog_key][:, prog_name].X
    if sparse.issparse(prog_data_):
        prog_data_ = prog_data_.toarray()

    prog_df = pd.DataFrame(prog_data_, 
                           columns=[prog_name], 
                           index=mdata[prog_key].obs.index)
    prog_df[categorical_key] = mdata[prog_key].obs[categorical_key].astype(str).values
    if pseudobulk_key is not None:
        store_key = '_'.join((categorical_key, pseudobulk_key))
        prog_df[pseudobulk_key] = pseudobulk_key
        prog_df = prog_df.groupby([pseudobulk_key, categorical_key]).mean().reset_index()

    # Peform post hoc tests
    if test=='dunn':
        p_vals = posthoc_dunn(prog_df, val_col=prog_name, group_col=categorical_key)
    elif test=='conover':
        p_vals = posthoc_conover(prog_df, val_col=prog_name, group_col=categorical_key)
    elif test=='dscf':
        p_vals = posthoc_dscf(prog_df, val_col=prog_name, group_col=categorical_key)
    elif test=='pearsonr':
        p_vals = perform_correlation(prog_df, val_col=prog_name, group_col=categorical_key,
                                     correlation='pearsonr', mode=mode, df=df)
        if mode == 'one_vs_all':
            return
    elif test=='kendalltau':
        p_vals = perform_correlation(prog_df, val_col=prog_name, group_col=categorical_key,
                                     correlation='kendalltau')

    # Create combined statistic with p-vals for each categorical level
    for i, category in enumerate(mdata[prog_key].obs[categorical_key].astype(str).unique()):
        min_pval = np.min(p_vals.loc[p_vals.index!=category, category])
        mean_pval = np.mean(p_vals.loc[p_vals.index!=category, category])

        prog_idx = mdata[prog_key].var.index.get_loc(prog_name)

        # Store results
        mdata[prog_key].varm['{}_association_{}_min_pval'.format(store_key, test)][prog_idx,i] = min_pval
        mdata[prog_key].varm['{}_association_{}_mean_pval'.format(store_key, test)][prog_idx,i] = mean_pval
         
    mdata[prog_key].uns['{}_association_{}_pvals'.format(store_key, test)][prog_name] = p_vals

In [244]:
df = []
for prog_name in prog_names:
    perform_posthoc(
        mdata,
        prog_key=prog_key,
        prog_name=prog_name,
        categorical_key=categorical_key,
        pseudobulk_key=pseudobulk_key,
        test="pearsonr",
        mode='one_vs_all',
        df=df
    )
df = pd.concat(df, axis=0)
df

Unnamed: 0_level_0,sample_D0_pearsonr_stat,sample_D0_pearsonr_pval,sample_D0_pearsonr_adj_pval,sample_D0_pearsonr_log2FC,sample_sample_D1_pearsonr_stat,sample_sample_D1_pearsonr_pval,sample_sample_D1_pearsonr_adj_pval,sample_sample_D1_pearsonr_log2FC,sample_sample_D2_pearsonr_stat,sample_sample_D2_pearsonr_pval,sample_sample_D2_pearsonr_adj_pval,sample_sample_D2_pearsonr_log2FC,sample_sample_D3_pearsonr_stat,sample_sample_D3_pearsonr_pval,sample_sample_D3_pearsonr_adj_pval,sample_sample_D3_pearsonr_log2FC
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0.870169,0.0,0.0,3.6224,-0.412455,0.0,0.0,-2.795765,-0.3458,0.0,0.0,-3.154612,-0.512979,0.0,0.0,-3.659611
1,-0.458007,0.0,0.0,-3.753409,-0.270776,0.0,0.0,-3.648265,-0.255153,0.0,0.0,-2.632564,0.732122,0.0,0.0,3.985067
2,0.32526,0.0,0.0,0.782145,0.072408,9.370198e-82,9.370198e-82,0.225597,-0.073127,2.36316e-83,2.36316e-83,-0.264392,-0.354213,0.0,0.0,-1.170326


## Wrapper

In [252]:
# TODO: Add one way ANOVA, MANOVA, multi-variate KW, logistic-regression
def compute_categorical_association(
    mdata, 
    prog_key='prog', 
    categorical_key='batch', 
    pseudobulk_key=None, 
    test='dunn', 
    mode='one_vs_one',
    n_jobs=1, 
    inplace=True, 
    **kwargs
):

    """
    Compute association of continous features with categorical levels.
    Currently only the Kruskall-Wallis test is implemented to determine if
    any categorical level has a significant association and posthoc tests
    determine which categorical levels have significant associations.

    ARGS
        mdata : MuData
            mudata object containing anndata of program scores and cell-level metadata.
        prog_key: 
            index for the anndata object (mdata[prog_key]) in the mudata object.
        categorical_key: str
            index of the categorical levels (mdata[prog_key].obs[categorical_key]) being tested.
        pseudobulk_key: str (optional)
            index of the feature summarisation (mean) levels (mdata[prog_key].obs[pseudobulk_key]).
        test: {'conover','dunn', 'dscf', 'pearsonr', 'kendalltau'}
            posthoc test to use to test categorical levels.
        n_jobs: int (default: 1)
            number of threads to run processes on.
        inplace: Bool (default: True)
            update the mudata object inplace or return a copy
       
       ---
        if inplace:
            UPDATES
            mdata[prog_key].var.loc[prog_name, '{}_kruskall_wallis_stat'.format(store_key)]  
            mdata[prog_key].var.loc[prog_name, '{}_kruskall_wallis_pval'.format(store_key)] 
            mdata[prog_key].varm['{}_association_{}_min_pval'.format(store_key, test)]
            mdata[prog_key].varm['{}_association_{}_mean_pval'.format(store_key, test)] 
            mdata[prog_key].uns['{}_association_{}_pvals'.format(store_key, test)]
            mdata[prog_key].uns['{}_association_categories'.format(categorical_key)]    
        else:
            RETURNS
            results_df, posthoc_df       

    """
    # Read in mudata if it is provided as a path
    frompath=False
    if isinstance(mdata, str):
        if os.path.exists(mdata):
            mdata = mudata.read(mdata)
            if inplace:
                logging.warning('Changed to inplace=False since path was provided')
                inplace=False
            frompath=True
        else: raise ValueError('Incorrect mudata specification.')

    if not inplace and not frompath:
        mdata = mudata.MuData({prog_key: mdata[prog_key].copy()})
    
    if pseudobulk_key is None:
        logging.info('Performing tests at single-cell level. Significance will likely be inflated')
        store_key = categorical_key
    else:
        logging.info('Perform testing by averaging over {}'.format(pseudobulk_key))
        store_key = '_'.join((categorical_key, pseudobulk_key))

    mdata[prog_key].var['{}_kruskall_wallis_stat'.format(store_key)] = None
    mdata[prog_key].var['{}_kruskall_wallis_pval'.format(store_key)] = None
    
    # Run KW test
    Parallel(n_jobs=n_jobs, 
             backend='threading')(delayed(perform_kruskall_wallis)(mdata, 
                                                                   prog_key=prog_key,
                                                                   prog_name=prog_name, 
                                                                   categorical_key=categorical_key,
                                                                   pseudobulk_key=pseudobulk_key) \
                                                            for prog_name in tqdm(mdata[prog_key].var_names,
                                                            desc='Testing {} association'.format(categorical_key), 
                                                            unit='programs'))

    # Convert to float to prevent error when saving mudata
    mdata[prog_key].var['{}_kruskall_wallis_stat'.format(store_key)] = \
    mdata[prog_key].var['{}_kruskall_wallis_stat'.format(store_key)].astype(float)
    mdata[prog_key].var['{}_kruskall_wallis_pval'.format(store_key)] = \
    mdata[prog_key].var['{}_kruskall_wallis_pval'.format(store_key)].astype(float)


    # If test = 'pearsonr' with mode = 'one_vs_all' we are going to do something special until we can more properly integrate this
    if test=='pearsonr' and mode=='one_vs_all':
        logging.info('Running jamboree specific version of posthoc with pearsonr, this is not yet integrated into the main pipeline')
        res = []        
        # Run posthoc-tests and append results
        Parallel(n_jobs=n_jobs,
                    backend='threading')(delayed(perform_posthoc)(mdata,
                                                                prog_key=prog_key,
                                                                prog_name=prog_name, 
                                                                categorical_key=categorical_key,
                                                                pseudobulk_key=pseudobulk_key,
                                                                test=test,
                                                                mode=mode,
                                                                df=res) \
                                                                for prog_name in tqdm(mdata[prog_key].var_names,
                                                                desc='Identifying differential {}'.format(categorical_key), 
                                                                unit='programs'))
        
        posthoc_df = pd.concat(res, axis=0)
        results_df = mdata[prog_key].var.loc[:, ['{}_kruskall_wallis_stat'.format(store_key), 
                                                 '{}_kruskall_wallis_pval'.format(store_key)]]
        
        return (results_df, posthoc_df)

    else:

        mdata[prog_key].varm['{}_association_{}_min_pval'.format(store_key, test)] = \
        np.zeros((mdata[prog_key].shape[1], mdata[prog_key].obs[categorical_key].unique().shape[0]))
        mdata[prog_key].varm['{}_association_{}_mean_pval'.format(store_key, test)] = \
        np.ones((mdata[prog_key].shape[1], mdata[prog_key].obs[categorical_key].unique().shape[0]))
        mdata[prog_key].uns['{}_association_{}_pvals'.format(store_key, test)] = {}
        
        # Run posthoc-tests
        Parallel(n_jobs=n_jobs, 
                backend='threading')(delayed(perform_posthoc)(mdata, 
                                                            prog_key=prog_key,
                                                            prog_name=prog_name, 
                                                            categorical_key=categorical_key,
                                                            pseudobulk_key=pseudobulk_key,
                                                            test=test) \
                                                            for prog_name in tqdm(mdata[prog_key].var_names,
                                                            desc='Identifying differential {}'.format(categorical_key), 
                                                            unit='programs'))
        
        # Returning test results only
        if not inplace: 

            results_df = mdata[prog_key].var.loc[:, ['{}_kruskall_wallis_stat'.format(store_key), 
                                                    '{}_kruskall_wallis_pval'.format(store_key)]]
            min_pval_df = pd.DataFrame(mdata[prog_key].varm['{}_association_{}_min_pval'.format(store_key, test)],
                                    index=mdata[prog_key].var.index,
                                    columns=['{}_{}_association_{}_min_pval'.format(col, store_key, test) \
                                                for col in mdata[prog_key].uns['{}_association_categories'.format(categorical_key)]]
                                    )
            results_df = results_df.merge(min_pval_df, left_index=True, right_index=True)
            
            mean_pval_df = pd.DataFrame(mdata[prog_key].varm['{}_association_{}_mean_pval'.format(store_key, test)],
                                    index=mdata[prog_key].var.index,
                                    columns=['{}_{}_association_{}_mean_pval'.format(col, store_key, test) \
                                                for col in mdata[prog_key].uns['{}_association_categories'.format(categorical_key)]]
                                    )
            results_df = results_df.merge(mean_pval_df, left_index=True, right_index=True)

            posthoc_df = []
            dict_ = mdata[prog_key].uns['{}_association_{}_pvals'.format(store_key, test)]
            for key, values in dict_.items():

                tmp_df = values.where(np.triu(np.ones(values.shape), k=1).astype(bool))
                tmp_df = tmp_df.stack().reset_index()
                tmp_df.columns = ['{}_a'.format(store_key),
                                '{}_b'.format(store_key),
                                'p_value']
                tmp_df['program_name'] = key
                posthoc_df.append(tmp_df)

            posthoc_df = pd.concat(posthoc_df, ignore_index=True)

            return (results_df, posthoc_df)

In [253]:
results_old_df, posthoc_old_df = compute_categorical_association(
    mdata=mdata,
    prog_key=prog_key,
    categorical_key=categorical_association_config['categorical_key'],
    pseudobulk_key=categorical_association_config['pseudobulk_key'],
    test="dunn",
    n_jobs=categorical_association_config['n_jobs'],
    inplace=categorical_association_config['inplace']
)

INFO:root:Performing tests at single-cell level. Significance will likely be inflated


Testing sample association:   0%|          | 0/3 [00:00<?, ?programs/s]

Identifying differential sample:   0%|          | 0/3 [00:00<?, ?programs/s]

In [254]:
results_old_df

Unnamed: 0,sample_kruskall_wallis_stat,sample_kruskall_wallis_pval,D0_sample_association_dunn_min_pval,sample_D1_sample_association_dunn_min_pval,sample_D2_sample_association_dunn_min_pval,sample_D3_sample_association_dunn_min_pval,D0_sample_association_dunn_mean_pval,sample_D1_sample_association_dunn_mean_pval,sample_D2_sample_association_dunn_mean_pval,sample_D3_sample_association_dunn_mean_pval
0,68816.470381,0.0,0.0,0.0,0.0,0.0,0.0,1.254248e-59,0.0003198463,0.0003198463
1,34189.544565,0.0,0.0,0.0,0.0,0.0,0.290705,4.835328999999999e-19,0.2907049,0.0
2,17886.375794,0.0,0.0,0.0,0.0,0.0,0.0,4.038649e-124,4.038649e-124,5.663327e-207


In [255]:
posthoc_old_df

Unnamed: 0,sample_a,sample_b,p_value,program_name
0,D0,sample_D1,5.061536000000001e-25,1
1,D0,sample_D2,0.8721146,1
2,D0,sample_D3,0.0,1
3,sample_D1,sample_D2,1.450598e-18,1
4,sample_D1,sample_D3,0.0,1
5,sample_D2,sample_D3,0.0,1
6,D0,sample_D1,0.0,0
7,D0,sample_D2,0.0,0
8,D0,sample_D3,0.0,0
9,sample_D1,sample_D2,3.762743e-59,0


In [256]:
results_new_df, posthoc_new_df = compute_categorical_association(
    mdata=mdata,
    prog_key=prog_key,
    categorical_key=categorical_association_config['categorical_key'],
    pseudobulk_key=categorical_association_config['pseudobulk_key'],
    test="pearsonr",
    mode="one_vs_all",
    n_jobs=categorical_association_config['n_jobs'],
    inplace=categorical_association_config['inplace']
)

INFO:root:Performing tests at single-cell level. Significance will likely be inflated


Testing sample association:   0%|          | 0/3 [00:00<?, ?programs/s]

INFO:root:Running jamboree specific version of posthoc with pearsonr, this is not yet integrated into the main pipeline


Identifying differential sample:   0%|          | 0/3 [00:00<?, ?programs/s]

In [257]:
results_new_df

Unnamed: 0,sample_kruskall_wallis_stat,sample_kruskall_wallis_pval
0,68816.470381,0.0
1,34189.544565,0.0
2,17886.375794,0.0


In [258]:
posthoc_new_df

Unnamed: 0_level_0,sample_D0_pearsonr_stat,sample_D0_pearsonr_pval,sample_D0_pearsonr_adj_pval,sample_D0_pearsonr_log2FC,sample_sample_D1_pearsonr_stat,sample_sample_D1_pearsonr_pval,sample_sample_D1_pearsonr_adj_pval,sample_sample_D1_pearsonr_log2FC,sample_sample_D2_pearsonr_stat,sample_sample_D2_pearsonr_pval,sample_sample_D2_pearsonr_adj_pval,sample_sample_D2_pearsonr_log2FC,sample_sample_D3_pearsonr_stat,sample_sample_D3_pearsonr_pval,sample_sample_D3_pearsonr_adj_pval,sample_sample_D3_pearsonr_log2FC
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0.870169,0.0,0.0,3.6224,-0.412455,0.0,0.0,-2.795765,-0.3458,0.0,0.0,-3.154612,-0.512979,0.0,0.0,-3.659611
1,-0.458007,0.0,0.0,-3.753409,-0.270776,0.0,0.0,-3.648265,-0.255153,0.0,0.0,-2.632564,0.732122,0.0,0.0,3.985067
2,0.32526,0.0,0.0,0.782145,0.072408,9.370198e-82,9.370198e-82,0.225597,-0.073127,2.36316e-83,2.36316e-83,-0.264392,-0.354213,0.0,0.0,-1.170326


# DONE!

---