In [4]:
import os
import glob
import sys
import numpy as np
import pandas as pd
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation')
sys.path.append('/cellar/users/aklie/opt/gene_program_evaluation/dashapp/')
import mudata
import scanpy as sc

from utils import count, count_unique

In [5]:
# Pipeline outputs
path_pipeline_outs = "/cellar/users/aklie/opt/gene_program_evaluation/dashapp/example_data/iPSC_EC_evaluations"

`path_pipeline_outs` should be structured something like as follows:
    
```bash
path_pipeline_outs/
├── cNMF_60
├── cNMF_59
├── cNMF_58
...
├── cNMF_1
└── output.mdata
```

where `output.mdata` contains the MuData object with keys `input` that contains the original input matrix and `cNMF_60`, `cNMF_59`, ..., `cNMF_1` containing the cell and gene loadings for each of the inference runs.

The inference runs can be 1 of 4 types:
1. A single run of a method with a fixed number of components.
2. Cross k-analysis: where the number of components is varied across the same method.
3. Cross method analysis: where the number of components is fixed and the method is varied.
4. A combination of 2) and 3).

The keys of the MuData and the output subdirectories in `path_pipeline_outs` should be named accordingly to the desired type of analysis:
1. If one key is present, it can be named anything. In this scenario, the cross run analysis will be omitted.
2. A cross k-analysis should share the same base name and be suffixed by an integer that indicates the number of components used in inference. e.g. `cNMF_60`, `cNMF_59`, ..., `cNMF_1`.
3. A cross method analysis should have separate names for each method. e.g. `cNMF`, `Topyfic`, etc.
4. A combination of 2) and 3) should have the same base name for each method and be suffixed by an integer that indicates the number of components used in inference. e.g. `cNMF_60`, `cNMF_59`, ..., `cNMF_1`, `Topyfic_60`, `Topyfic_59`, ..., `Topyfic_1`.

# Load MuData

In [6]:
try:
    path_mdata = os.path.join(path_pipeline_outs, "cNMF_60_0.2_gene_names.h5mu")
    mdata = mudata.read_h5mu(path_mdata)
except:
    print("Could not load mdata")
    sys.exit(1)

  utils.warn_names_duplicates("var")


In [7]:
with mudata.set_options(display_style="html", display_html_expand=0b000):
    display(mdata)

0,1,2
rna:sample,category,"D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0..."
rna:species,category,"hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,..."
rna:gene_count,int64,"1903,2664,2147,2011,1441,1607,1385,3690,1348,1247,..."
rna:tscp_count,int64,"3010,4710,3492,3286,2027,2616,2256,7268,2093,1934,..."
rna:mread_count,int64,"4079,6401,4666,4432,2773,3499,3044,9975,2837,2533,..."
rna:leiden,category,"0,2,1,0,0,0,4,5,4,4,4,0,1,3,2,0,2,2,1,1,1,3,0,0,0,..."
rna:n_counts,float32,"653.00,997.00,618.00,742.00,430.00,585.00,570.00,1..."
cNMF:sample,category,"D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0..."
cNMF:species,category,"hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,..."
cNMF:gene_count,int64,"1903,2664,2147,2011,1441,1607,1385,3690,1348,1247,..."

0,1,2,3
cNMF,bool,numpy.ndarray,
rna,bool,numpy.ndarray,

0,1,2,3
norm10k,float32,scipy.sparse._csr.csr_matrix,

0,1,2
sample,category,"D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0..."
species,category,"hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,..."
gene_count,int64,"1903,2664,2147,2011,1441,1607,1385,3690,1348,1247,..."
tscp_count,int64,"3010,4710,3492,3286,2027,2616,2256,7268,2093,1934,..."
mread_count,int64,"4079,6401,4666,4432,2773,3499,3044,9975,2837,2533,..."
leiden,category,"0,2,1,0,0,0,4,5,4,4,4,0,1,3,2,0,2,2,1,1,1,3,0,0,0,..."
n_counts,float32,"653.00,997.00,618.00,742.00,430.00,585.00,570.00,1..."

0,1,2,3
X_pca,float64,numpy.ndarray,50 columns
X_umap,float32,numpy.ndarray,2 columns

0,1,2
sample,category,"D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0,D0..."
species,category,"hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,hg38,..."
gene_count,int64,"1903,2664,2147,2011,1441,1607,1385,3690,1348,1247,..."
tscp_count,int64,"3010,4710,3492,3286,2027,2616,2256,7268,2093,1934,..."
mread_count,int64,"4079,6401,4666,4432,2773,3499,3044,9975,2837,2533,..."
leiden,category,"0,2,1,0,0,0,4,5,4,4,4,0,1,3,2,0,2,2,1,1,1,3,0,0,0,..."
n_counts,float32,"653.00,997.00,618.00,742.00,430.00,585.00,570.00,1..."

0,1,2,3
guide_assignment,int64,numpy.ndarray,1926 columns

0,1,2,3
guide_names,numpy.ndarray,1926 elements,"ACAA1_-_38178575.23-P1P2,ACAA1_+_38178488.23-P1P2,..."
guide_targets,numpy.ndarray,1926 elements,"ACAA1,ACAA1,ACAA1,ACAA1,ACAA1,ACAA1,ACTL7B,ACTL7B,..."
var_names,numpy.ndarray,5451 elements,"SEMA3F,CFTR,CYP51A1,HECW1,KLHL13,CASP10,CFLAR,TFPI..."


# Get subdirectories

In [8]:
# Get all subdirectories of the pipeline outputs
subdirs = [x[0] for x in os.walk(path_pipeline_outs)][1:]
subdirs

['/cellar/users/aklie/opt/gene_program_evaluation/dashapp/example_data/iPSC_EC_evaluations/cNMF']

# Parse inputs

## `method` and `n_components`

In [9]:
# Check the mdata.mod keys to extract the method names and the number of components used in inference
# The keys should have the same base name for each method and be suffixed by an integer that indicates the number of components used in inference.
# This should take into account the case where multiple "_" are present in the method name and take all the characters before the last "_"
# This should also take into account the case where the method has no "_"
# The n_components is the second dimension of the shape of the mod.X attribute
data_key = "rna"
methods = {}
n_components = {}
for key in mdata.mod.keys():
    method_split = key.split("_")
    if len(method_split) > 1:
        method = "_".join(method_split[:-1])
    else:
        method = method_split[0]
    if method != data_key:
        methods[key] = method
        n_components[method] = mdata.mod[key].X.shape[1]

def parse_methods(mdata, data_key="rna"):
    methods = {}
    n_components = {}
    for key in mdata.mod.keys():
        method_split = key.split("_")
        if len(method_split) > 1:
            method = "_".join(method_split[:-1])
        else:
            method = method_split[0]
        if method != data_key:
            methods[key] = method
            n_components[method] = mdata.mod[key].X.shape[1]
    return methods, n_components

In [10]:
# Double check that the subdirectories are named after the method names (keys of methods)
# If they are not, print a warning
subdir_names = [os.path.basename(subdir) for subdir in subdirs]
if set(methods.values()).issubset(subdir_names):
    print("Subdirectories are named after the method names")
else:
    print("Subdirectories are not named after the method names")

Subdirectories are named after the method names


In [11]:
parse_methods(mdata)

({'cNMF': 'cNMF'}, {'cNMF': 60})

## `explained_variance` and `cumulative_explained_variance`

In [12]:
# For each subdirectory, find the file explained_variance_ratio.txt
# Read it in and
# Store this in a dictionary with the subdirectory dir_name as the key
explained_variance_ratios = {}
cumulative_explained_variance = {}
for subdir in subdirs:
    try:
        run_name = os.path.basename(subdir)
        df = pd.read_csv(os.path.join(subdir, "explained_variance_ratio.txt"), sep="\t")
        df.columns = ["program_name", "explained_variance_ratio"]
        explained_variance_ratios[run_name] = df
        cumulative_explained_variance[run_name] = df["explained_variance_ratio"].sum()
    except FileNotFoundError:
        print(f"File not found: {subdir}")

def parse_explained_variance(subdirs):
    explained_variance_ratios = {}
    cumulative_explained_variance = {}
    for subdir in subdirs:
        try:
            run_name = os.path.basename(subdir)
            df = pd.read_csv(os.path.join(subdir, "explained_variance_ratio.txt"), sep="\t")
            df.columns = ["program_name", "explained_variance_ratio"]
            explained_variance_ratios[run_name] = df
            cumulative_explained_variance[run_name] = df["explained_variance_ratio"].sum()
        except FileNotFoundError:
            print(f"File not found: {subdir}")
    return explained_variance_ratios, cumulative_explained_variance

In [13]:
parse_explained_variance(subdirs)

({'cNMF':     program_name  explained_variance_ratio
  0              0                 -0.061864
  1              1                 -0.061864
  2              2                 -0.061864
  3              3                 -0.061864
  4              4                 -0.061864
  5              5                 -0.061864
  6              6                 -0.061864
  7              7                 -0.061864
  8              8                 -0.061864
  9              9                 -0.061864
  10            10                 -0.061864
  11            11                 -0.061864
  12            12                 -0.061864
  13            13                 -0.061864
  14            14                 -0.061864
  15            15                 -0.061864
  16            16                 -0.061864
  17            17                 -0.061864
  18            18                 -0.061864
  19            19                 -0.061864
  20            20                 -0.061864
  

## `loadings`

In [14]:
data_key = "rna"
loadings = {}
for key in mdata.mod.keys():
    if key != data_key:
        loadings[key] = pd.DataFrame(
            data=mdata.mod[key].varm["loadings"],
            index=mdata.mod[key].var_names,
            columns=mdata.mod[key].uns["var_names"]
        )
        loadings[key].index.name = "program_name"
        loadings[key].columns.name = "gene_name"

def parse_loadings(mdata, data_key="rna"):
    loadings = {}
    for key in mdata.mod.keys():
        if key != data_key:
            loadings[key] = pd.DataFrame(
                data=mdata.mod[key].varm["loadings"],
                index=mdata.mod[key].var_names,
                columns=mdata.mod[key].uns["var_names"]
            )
            loadings[key].index.name = "program_name"
            loadings[key].columns.name = "gene_name"
    return loadings

In [15]:
parse_loadings(mdata)["cNMF"]

gene_name,SEMA3F,CFTR,CYP51A1,HECW1,KLHL13,CASP10,CFLAR,TFPI,MTMR7,SLC7A2,...,AC026316.5,AL591485.1,AL162417.1,AL390957.1,LINC02478,AL033530.1,AL162718.1,AC111006.1,AL136419.1,AC007846.2
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,7.192721e-07,0.0,0.00092,0.0,0.0,0.0,0.000951,0.000804,0.0,0.0,...,0.0,0.0,0.000123,3e-05,0.000295,0.0,0.000149,3.178526e-05,7.463991e-05,0.0
1,0.0005883004,0.0,0.0,0.001214,0.000113,0.0,1.3e-05,0.0,0.000463,0.0004803976,...,0.0,2.6e-05,0.000107,0.0,0.0,0.0,0.0,0.0001112985,0.000107361,4.3e-05
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0008722201,...,0.0,0.0,0.0,0.0,9e-06,0.000249,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000938,0.0,0.0,0.0,0.001015,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0003055367,0.0,0.0
4,0.0,0.0,0.001579,0.0,8.6e-05,0.0,0.0,0.0,0.0,0.0003712572,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.974758e-05,0.0
5,0.000456613,0.0,0.0,0.0,0.0,0.000131,0.001787,0.0,0.0,0.0,...,0.0,3e-05,0.0,2.9e-05,0.000272,0.0,0.000579,0.0,1.412097e-05,0.0
6,5.917814e-05,0.0,0.0,0.000569,0.000967,0.000181,0.0,0.0,0.001034,0.001184264,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.001828,0.0,0.000882,0.0,3.2e-05,0.0,0.0,0.0,...,0.0,0.0,0.000249,0.0,0.0,0.0,0.0,0.0002198484,5.304723e-05,0.0
8,3.6128e-05,0.0,0.001923,0.0,0.0,0.000417,0.00166,0.003698,0.0,0.0,...,0.0,8e-05,0.0,0.000112,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.000375,0.0,0.0,0.000337,0.002497,0.002511,0.0,0.0,...,0.0,0.0,0.000268,0.000109,0.0,0.0,0.0,6.068167e-07,0.0,0.0


## `obs`

In [29]:
data_key = "rna"
obs = {}

for key in mdata.mod.keys():
    if key != data_key:
        obs[key] = mdata.mod[key].obs

def parse_obs(mdata, data_key="rna"):
    obs = {}
    for key in mdata.mod.keys():
        if key != data_key:
            obs[key] = mdata.mod[key].obs
    return obs

In [30]:
parse_obs(mdata)["cNMF"]

Unnamed: 0_level_0,sample,species,gene_count,tscp_count,mread_count,leiden,n_counts
obs_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01_01_06__s1,D0,hg38,1903,3010,4079,0,653.0
01_01_17__s1,D0,hg38,2664,4710,6401,2,997.0
01_01_29__s1,D0,hg38,2147,3492,4666,1,618.0
01_01_34__s1,D0,hg38,2011,3286,4432,0,742.0
01_01_55__s1,D0,hg38,1441,2027,2773,0,430.0
...,...,...,...,...,...,...,...
48_95_94__s3,sample_D3,hg38,703,1070,1310,4,295.0
48_96_21__s3,sample_D3,hg38,706,950,1114,0,273.0
48_96_35__s3,sample_D3,hg38,995,1532,1858,0,404.0
48_96_42__s3,sample_D3,hg38,792,1129,1385,0,300.0


## `obs_membership`

In [31]:
data_key = "rna"
obs_memberships = {}

for key in mdata.mod.keys():
    if key != data_key:
        obs_memberships[key] = pd.DataFrame(
            data=mdata.mod[key].X,
            index=mdata.mod[key].obs_names,
            columns=mdata.mod[key].var_names
        )
        obs_memberships[key].index.name = "obs_name"
        obs_memberships[key].columns.name = "program_name"

def parse_obs_memberships(mdata, data_key="rna"):
    obs_memberships = {}
    for key in mdata.mod.keys():
        if key != data_key:
            obs_memberships[key] = pd.DataFrame(
                data=mdata.mod[key].X,
                index=mdata.mod[key].obs_names,
                columns=mdata.mod[key].var_names
            )
            obs_memberships[key].index.name = "obs_name"
            obs_memberships[key].columns.name = "program_name"
    return obs_memberships

In [23]:
parse_obs_memberships(mdata)["cNMF"]

program_name,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
obs_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01_01_06__s1,0.000000,0.176405,0.045799,0.017567,0.100730,0.000000,0.100166,0.005854,0.013151,0.000000,...,0.000000,0.000000,0.002727,0.000554,0.000000,0.000000,0.000000,0.000000,0.001956,0.000000
01_01_17__s1,0.016260,0.063795,0.024056,0.142945,0.075851,0.022543,0.120094,0.000000,0.001130,0.000000,...,0.015391,0.003875,0.000000,0.000000,0.000000,0.005808,0.000000,0.000000,0.000000,0.000000
01_01_29__s1,0.000000,0.000000,0.051363,0.005693,0.088982,0.000000,0.300334,0.022360,0.000000,0.000000,...,0.000000,0.015199,0.001340,0.004701,0.000000,0.000000,0.000802,0.041076,0.001664,0.002514
01_01_34__s1,0.000000,0.048640,0.032359,0.066726,0.055561,0.000000,0.021790,0.000000,0.000000,0.000000,...,0.000000,0.006191,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
01_01_55__s1,0.000000,0.148667,0.013570,0.000000,0.092070,0.000000,0.087339,0.000000,0.003308,0.000000,...,0.008801,0.000000,0.004659,0.014892,0.002530,0.000000,0.004538,0.000000,0.077684,0.005749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48_95_94__s3,0.257637,0.000000,0.013554,0.027701,0.000000,0.125097,0.000000,0.000000,0.199654,0.040209,...,0.000000,0.005911,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
48_96_21__s3,0.143127,0.000000,0.031115,0.010456,0.016909,0.082018,0.000000,0.024610,0.264499,0.104008,...,0.000000,0.028692,0.000130,0.026587,0.003097,0.007481,0.000000,0.000000,0.000295,0.006226
48_96_35__s3,0.255615,0.003386,0.055825,0.000000,0.037573,0.145554,0.000000,0.002054,0.174920,0.121603,...,0.000000,0.008321,0.000000,0.000000,0.000000,0.008878,0.000000,0.000000,0.000000,0.000000
48_96_42__s3,0.282907,0.000000,0.000000,0.000000,0.006133,0.142713,0.000000,0.000000,0.095451,0.062531,...,0.012237,0.000582,0.000000,0.000000,0.002562,0.051012,0.000000,0.000000,0.000000,0.000000


## `geneset_enrichments`

In [None]:
# For each subdirectory, find the file geneset_enrichment.txt
# Store this dataframe in a dictionary with the key being the subdirectory name
geneset_enrichments = {}
for subdir in subdirs:
    try:
        run_name = os.path.basename(subdir)
        gene_set_enrichment_file = os.path.join(subdir, "geneset_enrichment.txt")
        gene_set_enrichment_df = pd.read_csv(gene_set_enrichment_file, sep="\t")
        geneset_enrichments[run_name] = gene_set_enrichment_df
    except FileNotFoundError:
        print(f"File not found: {gene_set_enrichment_file}")
        continue

def parse_geneset_enrichments(subdirs):
    geneset_enrichments = {}
    for subdir in subdirs:
        try:
            run_name = os.path.basename(subdir)
            gene_set_enrichment_file = os.path.join(subdir, "geneset_enrichment.txt")
            gene_set_enrichment_df = pd.read_csv(gene_set_enrichment_file, sep="\t")
            geneset_enrichments[run_name] = gene_set_enrichment_df
        except FileNotFoundError:
            print(f"File not found: {gene_set_enrichment_file}")
            continue
    return geneset_enrichments

In [None]:
parse_geneset_enrichments(subdirs)["cNMF"]

Unnamed: 0,program_name,Term,ES,NES,NOM p-val,FDR q-val,FWER p-val,Gene %,Lead_genes,tag_before,tag_after
0,0,Downstream Signaling Events Of B Cell Receptor...,0.982097,1.326689,0.000000,0.006000,0.006,0.75,RASGRP3;CALM1,2,16
1,0,PI3K Cascade R-HSA-109704,0.937846,1.259827,0.004000,0.051546,0.314,2.86,KL;GAB2;FGF13,3,15
2,0,IRS-mediated Signaling R-HSA-112399,0.937841,1.261852,0.004000,0.052001,0.293,2.86,KL;GAB2;FGF13,3,16
3,0,Insulin Receptor Signaling Cascade R-HSA-74751,0.952342,1.287509,0.001000,0.052667,0.109,2.86,GRB10;KL;GAB2;INSR;FGF13,5,19
4,0,Signaling By Insulin Receptor R-HSA-74752,0.952342,1.287509,0.001000,0.052667,0.109,2.86,GRB10;KL;GAB2;INSR;FGF13,5,19
...,...,...,...,...,...,...,...,...,...,...,...
21535,59,Signaling By FGFR2 R-HSA-5654738,0.863109,1.108131,0.105000,1.000000,1.000,13.76,UBC;SPRY2;UBB;SHC1;RPS27A;PPP2R1A,6,16
21536,59,"Plasma Lipoprotein Assembly, Remodeling, And C...",0.855843,1.104612,0.119000,1.000000,1.000,9.17,UBC;APOE;BMP1;UBB;VLDLR;PRKCA;PCSK5;RPS27A,8,19
21537,59,Binding And Uptake Of Ligands By Scavenger Rec...,0.859521,1.104251,0.112000,1.000000,1.000,5.96,SPARC;FTL;FTH1;COL1A1;APOE;COL4A2;COL1A2;COLEC11,8,20
21538,59,Cyclin D Associated Events In G1 R-HSA-69231,0.863870,1.104019,0.144144,1.000000,1.000,13.76,CDKN2B;UBC;CCND2;UBB;CDKN1A;CCND3;RPS27A;CCND1...,10,15


In [None]:
categorical_var = "program_name"
count_var = "Term"
sig_var = "FDR q-val"
sig_threshold = 0.25

In [None]:
# Now for each df, calculate the number of unique gene sets that are enriched
# Include a FDR q-val cutoff
gene_sets_counts = {}
unique_gene_sets_counts = {}
for run_name, gene_set_enrichment_df in geneset_enrichments.items():
    # Filter the dataframe based on the FDR q-value
    filtered_data = gene_set_enrichment_df[gene_set_enrichment_df[sig_var] < sig_threshold]

    # Get the count of all gene sets passing the FDR q-value cutoff for each program
    count_df = count(categorical_var=categorical_var, count_var=count_var, dataframe=filtered_data)
    gene_sets_counts[run_name] = count_df

    # Get the count of unique gene sets passing the FDR q-value cutoff for each program
    unique_data = filtered_data.sort_values(by=sig_var)
    unique_data = unique_data.drop_duplicates(subset=count_var)
    unique_df = count_unique(categorical_var=categorical_var, count_var=count_var, dataframe=unique_data)
    unique_gene_sets_counts[run_name] = unique_df

In [None]:
unique_gene_sets_counts["cNMF"]

Unnamed: 0,program_name,Term
0,4,93
1,6,58
2,0,49
3,20,31
4,7,12
5,47,6
6,10,6
7,12,5
8,24,4
9,14,3


## `motif_enrichments`

In [None]:
# For each subdirectory, find the file motif_enrichment.txt
# Store this dataframe in a dictionary with the key being the subdirectory name
motif_enrichments = {}
for subdir in subdirs:
    try:
        run_name = os.path.basename(subdir)
        motif_enrichment_file = os.path.join(subdir, "motif_enrichment.txt")
        motif_enrichment_df = pd.read_csv(motif_enrichment_file, sep="\t")
        motif_enrichments[run_name] = motif_enrichment_df
    except FileNotFoundError:
        print(f"File not found: {motif_enrichment_file}")
        continue

def parse_motif_enrichments(subdirs):
    motif_enrichments = {}
    for subdir in subdirs:
        try:
            run_name = os.path.basename(subdir)
            motif_enrichment_file = os.path.join(subdir, "motif_enrichment.txt")
            motif_enrichment_df = pd.read_csv(motif_enrichment_file, sep="\t")
            motif_enrichments[run_name] = motif_enrichment_df
        except FileNotFoundError:
            print(f"File not found: {motif_enrichment_file}")
            continue
    return motif_enrichments

In [None]:
parse_motif_enrichments(subdirs)["cNMF"].sort_values(by="pval")

Unnamed: 0,motif,stat,pval,program_name
159,FOXF1.H12CORE.0.P.C,0.024264,0.073245,39
99,AHRR.H12CORE.0.P.C,0.024264,0.073245,39
279,TEAD4.H12CORE.1.P.B,0.024264,0.073245,39
39,AHR.H12CORE.0.P.B,0.024264,0.073245,39
219,MAF.H12CORE.0.PM.A,0.024264,0.073245,39
...,...,...,...,...
185,MAF.H12CORE.0.PM.A,0.000622,0.963406,5
5,AHR.H12CORE.0.P.B,0.000622,0.963406,5
245,TEAD4.H12CORE.1.P.B,0.000622,0.963406,5
125,FOXF1.H12CORE.0.P.C,0.000622,0.963406,5


In [None]:
categorical_var = "program_name"
count_var = "motif"
sig_var = "pval"
sig_threshold = 0.999999

In [None]:
# Now for each df, calculate the number of unique motifs that are enriched
# Include a FDR q-val cutoff
motif_counts = {}
unique_motif_counts = {}
for run_name, motif_enrichment_df in motif_enrichments.items():
    # Filter the dataframe based on the FDR q-value
    filtered_data = motif_enrichment_df[motif_enrichment_df[sig_var] < sig_threshold]

    # Get the count of all motifs passing the FDR q-value cutoff for each program
    count_df = count(categorical_var=categorical_var, count_var=count_var, dataframe=filtered_data)
    motif_counts[run_name] = count_df

    # Get the count of unique motifs passing the FDR q-value cutoff for each program
    unique_data = filtered_data.sort_values(by=sig_var)
    unique_data = unique_data.drop_duplicates(subset=count_var)
    unique_df = count_unique(categorical_var=categorical_var, count_var=count_var, dataframe=unique_data)
    unique_motif_counts[run_name] = unique_df

In [None]:
filtered_data

Unnamed: 0,motif,stat,pval,program_name
0,AHR.H12CORE.0.P.B,-0.004856,0.719995,0
1,AHR.H12CORE.0.P.B,-0.004579,0.735346,1
2,AHR.H12CORE.0.P.B,-0.003362,0.804014,2
3,AHR.H12CORE.0.P.B,-0.003923,0.772147,3
4,AHR.H12CORE.0.P.B,-0.004086,0.762977,4
...,...,...,...,...
295,TEAD4.H12CORE.1.P.B,-0.004107,0.761789,55
296,TEAD4.H12CORE.1.P.B,-0.004005,0.767531,56
297,TEAD4.H12CORE.1.P.B,-0.003249,0.810462,57
298,TEAD4.H12CORE.1.P.B,-0.003016,0.823809,58


In [None]:
unique_motif_counts

{'cNMF':    program_name  motif
 0            39      5}

## `trait_enrichments`

In [None]:
# For each subdirectory, find the file trait_enrichment.txt
# Store this dataframe in a dictionary with the key being the subdirectory name
trait_enrichments = {}
for subdir in subdirs:
    try:
        run_name = os.path.basename(subdir)
        trait_enrichment_file = os.path.join(subdir, "trait_enrichment.txt")
        trait_enrichment_df = pd.read_csv(trait_enrichment_file, sep="\t")
        trait_enrichments[run_name] = trait_enrichment_df
    except FileNotFoundError:
        print(f"File not found: {trait_enrichment_file}")
        continue

def parse_trait_enrichments(subdirs):
    trait_enrichments = {}
    for subdir in subdirs:
        try:
            run_name = os.path.basename(subdir)
            trait_enrichment_file = os.path.join(subdir, "trait_enrichment.txt")
            trait_enrichment_df = pd.read_csv(trait_enrichment_file, sep="\t")
            trait_enrichments[run_name] = trait_enrichment_df
        except FileNotFoundError:
            print(f"File not found: {trait_enrichment_file}")
            continue
    return trait_enrichments

In [None]:
parse_trait_enrichments(subdirs)["cNMF"]

Unnamed: 0,program_name,Term,ES,NES,NOM p-val,FDR q-val,FWER p-val,Gene %,Lead_genes,tag_before,tag_after
0,0,EFO_0010934,0.947881,1.291304,0.001000,0.049000,0.048,2.62,RP1;MECOM;ARHGAP29;RGL1;AUTS2;ATP8B1,6,23
1,0,EFO_0005128,0.935521,1.265218,0.005000,0.070000,0.124,4.84,RASGRP3;BMP2K;ETS1;BCL2;C9;CXCR4,6,16
2,0,EFO_0001645,0.913717,1.251630,0.001000,0.081334,0.201,8.77,ARHGAP26;NCALD;PHACTR1;FARP1;COL4A1;CDH13;PLPP...,10,27
3,0,"EFO_0003761, EFO_0004247",0.887342,1.201017,0.018000,0.100000,0.658,7.85,MEF2C;DCC;PLCL2;STAG1;SOX6,5,20
4,0,"EFO_0004337, EFO_0004784",0.881527,1.202918,0.012000,0.101334,0.635,2.95,MEF2C;PLCL1;DCC;NPAS2;FOXP1;PLCL2,6,24
...,...,...,...,...,...,...,...,...,...,...,...
13075,59,EFO_0007877,0.671076,0.874504,0.989000,1.000000,1.000,32.34,GRIK1;ASCC3;LRP8;NAALADL2;PPP1R3B;PCDH7;TMEM17,7,26
13076,59,"EFO_0007878, EFO_0009458",0.678138,0.869112,0.970971,1.000000,1.000,32.10,SYT14,1,17
13077,59,EFO_0009101,0.677645,0.868946,0.961962,1.000000,1.000,32.25,JADE2;CDH6;SDK1;BNC2;SORCS3;HIVEP2,6,16
13078,59,EFO_0006525,0.676773,0.865720,0.974000,1.000000,1.000,27.52,JADE2;AGBL4;FBXL17;PDE1C,4,17


## `perturbation_association_results`

In [None]:
# For each subdirectory, find the file pertubation_association_results.txt
# Store this dataframe in a dictionary with the key being the subdirectory name
perturbation_associations = {}
for subdir in subdirs:
    try:
        run_name = os.path.basename(subdir)
        perturbation_association_file = os.path.join(subdir, "perturbation_association_results.txt")
        perturbation_association_df = pd.read_csv(perturbation_association_file, sep="\t")
        perturbation_associations[run_name] = perturbation_association_df
    except FileNotFoundError:
        print(f"File not found: {perturbation_association_file}")
        continue

def parse_perturbation_associations(subdirs):
    perturbation_associations = {}
    for subdir in subdirs:
        try:
            run_name = os.path.basename(subdir)
            perturbation_association_file = os.path.join(subdir, "perturbation_association_results.txt")
            perturbation_association_df = pd.read_csv(perturbation_association_file, sep="\t")
            perturbation_associations[run_name] = perturbation_association_df
        except FileNotFoundError:
            print(f"File not found: {perturbation_association_file}")
            continue
    return perturbation_associations

In [None]:
parse_perturbation_associations(subdirs)["cNMF"]

Unnamed: 0,guide_name,program,stat,pval
0,ACAA1_-_38178575.23-P1P2,2,2462307.0,0.501388
1,ACAA1_-_38178575.23-P1P2,6,2484221.0,0.297235
2,ACAA1_-_38178575.23-P1P2,1,2414257.0,0.983857
3,ACAA1_-_38178575.23-P1P2,13,2407659.5,0.938228
4,ACAA1_-_38178575.23-P1P2,12,2555038.5,0.050428
...,...,...,...,...
111955,SAFE_TARGETING_2188,54,688937.0,0.617514
111956,SAFE_TARGETING_2188,56,649406.0,0.115564
111957,SAFE_TARGETING_2188,49,715566.5,0.819404
111958,SAFE_TARGETING_2188,55,725310.0,0.599544


## `categorical_associations`

In [66]:
# For each subdirectory, find the file categorical_association_results.txt
# Store this dataframe in a dictionary with the key being the subdirectory name
categorical_associations = {}
for subdir in subdirs:
    try:
        run_name = os.path.basename(subdir)
        categorical_association_file = os.path.join(subdir, "categorical_association_results.txt")
        categorical_association_df = pd.read_csv(categorical_association_file, sep="\t")
        categorical_associations[run_name] = categorical_association_df
    except FileNotFoundError:
        print(f"File not found: {categorical_association_file}")
        continue

def parse_categorical_associations(subdirs):
    categorical_associations = {}
    for subdir in subdirs:
        try:
            run_name = os.path.basename(subdir)
            categorical_association_file = os.path.join(subdir, "categorical_association_results.txt")
            categorical_association_df = pd.read_csv(categorical_association_file, sep="\t")
            categorical_associations[run_name] = categorical_association_df
        except FileNotFoundError:
            print(f"File not found: {categorical_association_file}")
            continue
    return categorical_associations

In [67]:
parse_categorical_associations(subdirs)["cNMF"]

Unnamed: 0,sample_kruskall_wallis_stat,sample_kruskall_wallis_pval,D0,sample_D1,sample_D2,sample_D3,D0_sample_association_dunn_mean_pval,sample_D1_sample_association_dunn_mean_pval,sample_D2_sample_association_dunn_mean_pval,sample_D3_sample_association_dunn_mean_pval
0,26789.076815,0.0,0.0,0.0,0.0,0.0,8.345006e-06,6.793647e-09,8.338213e-06,0.0
1,41639.459128,0.0,0.0,0.0,0.0,0.0,0.0,1.050058e-37,7.967925e-35,7.957425e-35
2,6187.142359,0.0,0.0,0.0,0.0,0.0,1.501578e-58,7.718537e-13,7.718537e-13,0.0
3,47489.701493,0.0,0.0,0.0,0.0,0.0,0.0,9.222973e-19,9.222973e-19,4.8443290000000003e-29
4,14508.366017,0.0,0.0,0.0,0.0,0.0,2.8960010000000002e-105,2.8960010000000002e-105,1.521976e-66,1.521976e-66
5,25539.217866,0.0,0.0,0.0,0.0,0.0,0.1638646,6.316053000000001e-23,0.1638646,0.0
6,47446.832406,0.0,0.0,0.0,0.0,0.0,0.0,4.047577e-226,1.926296e-05,1.926296e-05
7,13124.508712,0.0,0.0,0.0,0.0,0.0,5.601728e-75,5.601728e-75,1.982854e-71,1.982854e-71
8,26763.926155,0.0,0.0,0.0,0.0,0.0,7.661394e-06,7.661394e-06,9.676532e-39,0.0
9,25979.5714,0.0,0.0,0.0,0.0,0.0,0.07238739,6.079799e-07,0.07238799,0.0


# Infer dashboard type

In [100]:
from typing import List, Dict
import re

def infer_dashboard_type(keys: List[str]) -> str:
    if len(keys) == 1:
        return "single_run"
    
    base_names = {}
    for key in keys:
        match = re.match(r'([a-zA-Z]+)_?(\d+)?', key)
        if match:
            base_name, num = match.groups()
            if base_name not in base_names:
                base_names[base_name] = []
            if num:
                base_names[base_name].append(int(num))
    
    if len(base_names) == 1:
        return "cross_k"
    elif all(len(nums) == 0 for nums in base_names.values()):
        return "cross_method"
    else:
        return "mixed"

# Example usage
keys = ["cNMF_60", "cNMF_59", "cNMF_58", "Topyfic_60", "Topyfic_59", "Topyfic_1"]
dashboard_type = infer_dashboard_type(keys)
print(dashboard_type)  # Output should be "mixed"

mixed


In [101]:
def test_infer_dashboard_type():
    test_cases = [
        # Single Run
        (["cNMF_60"], "single_run"),
        (["methodA"], "single_run"),

        # Cross k Analysis
        (["cNMF_60", "cNMF_59", "cNMF_58"], "cross_k"),
        (["method_1", "method_12", "method_123"], "cross_k"),

        # Cross Method Analysis
        (["cNMF", "Topyfic", "AnotherMethod"], "cross_method"),
        (["methodA", "methodB", "methodC"], "cross_method"),

        # Mixed Analysis
        (["cNMF_60", "cNMF_59", "Topyfic_60", "Topyfic_59"], "mixed"),
        (["methodA_1", "methodA_2", "methodB_1", "methodB_3"], "mixed"),
        (["methodA", "methodB_1", "methodB_2", "methodC_3"], "mixed"),
        (["method_1", "method_2", "methodX", "anotherMethod_3", "anotherMethod_4"], "mixed"),
    ]

    for keys, expected in test_cases:
        result = infer_dashboard_type(keys)
        assert result == expected, f"Failed for keys: {keys}, expected: {expected}, got: {result}"
    print("All test cases passed.")

test_infer_dashboard_type()

All test cases passed.


In [102]:
infer_dashboard_type(list(methods.keys()))

'single_run'

# Overview page
Capture the output of this display call

In [14]:
sorted(mdata.mod.keys())

['cNMF', 'rna']

In [15]:
input_adata = mdata.mod["rna"]
input_adata

AnnData object with n_obs × n_vars = 92284 × 5451
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'leiden', 'n_counts'
    obsm: 'X_pca', 'X_umap'
    layers: 'norm10k'

# Single run analysis
Here we will go through each subdirectory and corresponding key in the MuData object and extract per program metrics for plotting and evaluation.

## Section 2: Covariate association

In [27]:
selected_run = "cNMF"

In [32]:
curr_obs = obs[selected_run]
curr_obs_membership = obs_memberships[selected_run]

### Topic trait heatmap

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from statsmodels.stats.multitest import fdrcorrection

In [None]:
def convertDatTraits(data):
    """
    get data trait module base on samples information

    :return: a dataframe contains information in suitable format for plotting module trait relationship heatmap
    :rtype: pandas dataframe
    """
    datTraits = pd.DataFrame(index=data.index)
    for i in range(data.shape[1]):
        data.iloc[:, i] = data.iloc[:, i].astype(str)
        if len(np.unique(data.iloc[:, i])) == 2:
            datTraits[data.columns[i]] = data.iloc[:, i]
            org = np.unique(data.iloc[:, i]).tolist()
            rep = list(range(len(org)))
            datTraits.replace(to_replace=org, value=rep,
                              inplace=True)
        elif len(np.unique(data.iloc[:, i])) > 2:
            for name in np.unique(data.iloc[:, i]):
                datTraits[name] = data.iloc[:, i]
                org = np.unique(data.iloc[:, i])
                rep = np.repeat(0, len(org))
                rep[np.where(org == name)] = 1
                org = org.tolist()
                rep = rep.tolist()
                datTraits.replace(to_replace=org, value=rep, inplace=True)

    return datTraits

In [36]:
min_cell_participation = curr_obs_membership.min().min()
datTraits = convertDatTraits(curr_obs[["sample", "leiden", "n_counts"]])
topicsTraitCor = pd.DataFrame(
    index=curr_obs_membership.columns,
    columns=datTraits.columns,
    dtype="float"
)
topicsTraitPvalue = pd.DataFrame(
    index=curr_obs_membership.columns,
    columns=datTraits.columns,
    dtype="float"
)

for i in curr_obs_membership.columns:
    for j in datTraits.columns:
        tmp = curr_obs_membership[~np.isclose(curr_obs_membership[i], min_cell_participation, atol=min_cell_participation)]
        tmp = stats.spearmanr(tmp[i], datTraits.loc[tmp.index, j], alternative='greater')
        topicsTraitCor.loc[i, j] = tmp[0]
        topicsTraitPvalue.loc[i, j] = tmp[1]

topicsTraitCor.fillna(0.0, inplace=True)
topicsTraitPvalue.fillna(1.0, inplace=True)

for i in range(topicsTraitPvalue.shape[0]):
        rejected, tmp = fdrcorrection(topicsTraitPvalue.iloc[i, :])
        if not rejected.all():
            topicsTraitPvalue.iloc[i, :] = tmp

xlabels = curr_obs_membership.columns
ylabels = datTraits.columns

fig, ax = plt.subplots(figsize=(topicsTraitPvalue.shape[0] * 1.5,
                                topicsTraitPvalue.shape[1] * 1.5), facecolor='white')

# Loop over data dimensions and create text annotations.
tmp_cor = topicsTraitCor.T.round(decimals=3)
tmp_pvalue = topicsTraitPvalue.T.round(decimals=3)
labels = (np.asarray(["{0}\n({1})".format(cor, pvalue)
                        for cor, pvalue in zip(tmp_cor.values.flatten(),
                                                tmp_pvalue.values.flatten())])) \
    .reshape(topicsTraitCor.T.shape)

sns.set(font_scale=1.5)
res = sns.heatmap(topicsTraitCor.T, annot=labels, fmt="", cmap='RdBu_r',
                    vmin=-1, vmax=1, ax=ax, annot_kws={'size': 20, "weight": "bold"},
                    xticklabels=xlabels, yticklabels=ylabels)

0.0

# Cross run analysis

# Program

In [73]:
selected_run = "cNMF"
selected_program = "0"

### Loadings lollipop

In [91]:
data_to_plot = loadings[selected_run].loc[selected_program].to_frame(name="loadings").reset_index()

In [92]:
import plotly.graph_objects as go

def lollipop_plot(
    data: pd.DataFrame,
    x_column: str,
    y_column: str,
    title: str,
    x_axis_title: str = None,
    y_axis_title: str = None,
    show_xaxis_labels: bool = True,
    marker_colors: list = None,
    line_colors: list = None
):
    """Create a lollipop plot with colored markers and lines based on the sign of the y-axis values."""

    fig = go.Figure()

    # Add the vertical lines for the lollipops
    print(line_colors)
    for i, row in data.iterrows():
        print(i, line_colors[i])
        fig.add_shape(
            type="line",
            x0=row[x_column],
            y0=0,
            x1=row[x_column],
            y1=row[y_column],
            line=dict(color=line_colors[i])
        )

    # Add the markers for the lollipops
    fig.add_trace(go.Scattergl(
        x=data[x_column],
        y=data[y_column],
        mode='markers',
        marker=dict(color=marker_colors, size=8),
        text=data[x_column]
    ))

    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title=x_axis_title if x_axis_title else x_column,
        yaxis_title=y_axis_title if y_axis_title else y_column,
        xaxis=dict(showticklabels=show_xaxis_labels),
        plot_bgcolor='rgba(0,0,0,0)',  # Transparent background
    )

    return fig

In [93]:
n=25

# Select top n genes by loading
data_to_plot = data_to_plot.sort_values(by="loadings", ascending=False).head(n).reset_index(drop=True)

# Define colors based on the sign of the loadings
marker_colors = ['blue' if val > 0 else 'red' for val in data_to_plot['loadings']]
line_colors = ['blue' if val > 0 else 'red' for val in data_to_plot['loadings']]

# Plot the data
fig = lollipop_plot(
    data=data_to_plot,
    x_column='gene_name',
    y_column='loadings',
    title='Gene Loadings for Selected Program',
    x_axis_title='Gene',
    y_axis_title='Loadings',
    marker_colors=marker_colors,
    line_colors=line_colors,
)


['blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue']
0 blue
1 blue
2 blue
3 blue
4 blue
5 blue
6 blue
7 blue
8 blue
9 blue
10 blue
11 blue
12 blue
13 blue
14 blue
15 blue
16 blue
17 blue
18 blue
19 blue
20 blue
21 blue
22 blue
23 blue
24 blue


### Covariate boxplot

In [76]:
concat_df = pd.concat([curr_obs_membership[selected_program], curr_obs["sample"]], axis=1)
concat_df

Unnamed: 0_level_0,0,sample
obs_name,Unnamed: 1_level_1,Unnamed: 2_level_1
01_01_06__s1,0.000000,D0
01_01_17__s1,0.016260,D0
01_01_29__s1,0.000000,D0
01_01_34__s1,0.000000,D0
01_01_55__s1,0.000000,D0
...,...,...
48_95_94__s3,0.257637,sample_D3
48_96_21__s3,0.143127,sample_D3
48_96_35__s3,0.255615,sample_D3
48_96_42__s3,0.282907,sample_D3


In [79]:
import plotly.express as px
def barplot(
    data: pd.DataFrame,
    x_column: str,
    y_column: str,
    title: str,
    x_axis_title: str = None,
    y_axis_title: str = None,
    show_xaxis_labels: bool = True
):
    """Create a layout for a filtered bar plot.

    Parameters
    ----------
    app : Dash
        The Dash app instance.
    data : pd.DataFrame
        DataFrame containing the data for the plot.
    x_column : str
        The column to use for the x-axis.
    y_column : str
        The column to use for the y-axis.
    filter_column : str
        The column to apply the filter on.
    starting_filter_value : float
        Initial filter value.
    title : str
        Title of the plot.
    x_axis_title : str, optional
        Title for the x-axis.
    y_axis_title : str, optional
        Title for the y-axis.
    id_suffix : str, optional
        Suffix to add to the id of the input and output components to ensure uniqueness.
    show_xaxis_labels : bool, optional
        Whether to show x-axis labels.

    Returns
    -------
    html.Div
        A Div containing the filter controls and bar plot.
    """
    fig = px.bar(
        data,
        x=x_column,
        y=y_column,
        template='plotly_white'
    ).update_layout(
        title=title,
        xaxis_title=x_axis_title if x_axis_title else x_column,
        yaxis_title=y_axis_title if y_axis_title else y_column,
        xaxis=dict(showticklabels=show_xaxis_labels)
    )
    return fig

def boxplot(
    data: pd.DataFrame,
    x_column: str,
    y_column: str,
    title: str,
    x_axis_title: str = None,
    y_axis_title: str = None,
    show_xaxis_labels: bool = True
):
    """Create a layout for a box plot.

    Parameters
    ----------
    app : Dash
        The Dash app instance.
    data : pd.DataFrame
        DataFrame containing the data for the plot.
    x_column : str
        The column to use for the x-axis.
    y_column : str
        The column to use for the y-axis.
    filter_column : str
        The column to apply the filter on.
    starting_filter_value : float
        Initial filter value.
    title : str
        Title of the plot.
    x_axis_title : str, optional
        Title for the x-axis.
    y_axis_title : str, optional
        Title for the y-axis.
    id_suffix : str, optional
        Suffix to add to the id of the input and output components to ensure uniqueness.
    show_xaxis_labels : bool, optional
        Whether to show x-axis labels.

    Returns
    -------
    html.Div
        A Div containing the filter controls and box plot.
    """
    fig = px.box(
        data,
        x=x_column,
        y=y_column,
        template='plotly_white'
    ).update_layout(
        title=title,
        xaxis_title=x_axis_title if x_axis_title else x_column,
        yaxis_title=y_axis_title if y_axis_title else y_column,
        xaxis=dict(showticklabels=show_xaxis_labels)
    )
    return fig

In [None]:
concat_df

In [82]:
boxplot(concat_df, x_column=selected_program, y_column="sample", title="")

# DONE!

---

In [None]:
def count_unique(categorical_var, count_var, dataframe, cummul=False, unique=False):
    counts_df = count(categorical_var, count_var, dataframe)
    new_df = []
    terms = []
    # AK: the order of programs is based on the number of unique terms, not on the most significant terms
    for prog in counts_df[categorical_var].unique():
        terms_ = dataframe.loc[dataframe[categorical_var] == prog, count_var].unique()
        unique_terms = [term for term in terms_ if term not in terms]
        terms.extend(unique_terms)
        new_df.append([prog, len(unique_terms)])
    new_df = pd.DataFrame(new_df, columns=[categorical_var, count_var])
    if cummul:
        new_df[count_var] = new_df[count_var].cumsum()
    if unique:
        return new_df
    else:
        return counts_df

In [None]:
# A term that is shared across multiple programs will be counted once for each program it is associated with
counts_df = dataframe.value_counts([categorical_var, count_var])
counts_df = counts_df.groupby(categorical_var).sum()
counts_df = counts_df.sort_values(ascending=False)
counts_df = pd.DataFrame(counts_df.reset_index().values,
                             columns=[categorical_var, count_var])
counts_df        

Unnamed: 0,program_name,Term
0,4,182
1,6,132
2,20,119
3,0,97
4,10,45
5,7,34
6,8,14
7,12,14
8,16,12
9,2,9


In [169]:
count_unique(categorical_var, count_var, filtered_data, cummul=False, unique=False)["Term"].sum()

291

In [None]:
# A program that has multiple terms that are shared across multiple programs will only be counted in its most significant program

# Start by sorting the dataframe by the FDR q-value and removing duplicates in the Term column


# Now count the number of unique terms for each program
counts_df = count(categorical_var, count_var, dataframe)
counts_df

Unnamed: 0,program_name,Term
0,4,93
1,6,58
2,0,49
3,20,31
4,7,12
5,47,6
6,10,6
7,12,5
8,24,4
9,14,3


In [None]:
new_df = []
terms = []
for prog in counts_df[categorical_var].unique():
    terms_ = dataframe.loc[dataframe[categorical_var] == prog, count_var].unique()
    unique_terms = [term for term in terms_ if term not in terms]
    terms.extend(unique_terms)
    new_df.append([prog, len(unique_terms)])
new_df = pd.DataFrame(new_df, columns=[categorical_var, count_var]).sort_values(count_var, ascending=False)
new_df

Unnamed: 0,program_name,Term
0,4,182
3,0,45
1,6,28
2,20,8
10,47,5
6,8,4
4,10,3
17,14,3
5,7,2
12,35,2
