# Enrichment test
What is the probability of randomly selecting at least k "changed" reactions when drawing N out of M reactions, of which n are "changed"?  
* k: number of differentially expressed reactions in a subsystem,
* n: number of differentially expressed reactions in the model,
* N: number of reactions in a subsystem,
* M: number of reactions in the model.

$P(X \geq k) = 1 - \mathrm{hypergeom.cdf}(k-1, M, n, N)$

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from itertools import permutations, product, combinations
from scipy.stats import pearsonr, spearmanr, mannwhitneyu, hypergeom

from itertools import permutations
from itertools import combinations

In [2]:
#https://www.scribbr.com/statistics/two-way-anova/
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multitest as multi

In [3]:
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning, ValueWarning
# Silence the statsmodels warnings imported above, plus any RuntimeWarning,
# so they do not clutter the notebook output.
for _ignored in (ConvergenceWarning, HessianInversionWarning, ValueWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_ignored)

## Settings

In [4]:
# Name of the analysed result set (presumably the model-extraction algorithm;
# confirm against the data folder). Keep exactly one assignment active: the
# value is used to build the input and output file names.
#analysis = "Fastcore"
analysis = "iMAT"

#analysis = "gimme"
#analysis = "init"
#analysis = "tinit"

In [5]:
# Kind of flux analysis the results come from (presumably FVA vs. pFBA);
# used as the leading part of every input and output file name.
analysis_type = "FVA"
#analysis_type = "pFBA"

In [6]:
# fdr: when True, the `_q` ANOVA result file is loaded (`_p` files are the alternative).
# randomization: when True, the `_randomization_*` ANOVA results are loaded
# instead of the `_basic_*` ones.
fdr = True
randomization = False

## Read the data

In [7]:
# Reaction identifiers: the first column of the semicolon-separated input table.
reactions_file = "data\\" + analysis_type + "_" + analysis + ".csv"
reactions = pd.read_csv(reactions_file, sep=";").iloc[:, 0]

In [8]:
# Load the ANOVA result table selected by the two settings flags:
#   randomization -> "randomization" vs. "basic" results,
#   fdr           -> "q" file vs. "p" file.
# The non-randomized, non-FDR case previously loaded the "_basic_q.csv" file
# as well, which made the `fdr` flag ineffective; it now loads "_basic_p.csv",
# mirroring the randomization branch.
anova_variant = "randomization" if randomization else "basic"
anova_suffix = "q" if fdr else "p"
df = pd.read_csv(
    "results_ANOVA\\" + analysis_type + "_" + analysis
    + "_" + anova_variant + "_" + anova_suffix + ".csv"
)

# Every column after the first one holds the values of one ANOVA term.
tests = list(df.columns[1:])

### Fill the analysis data with all the reactions
As a basis I take the union of the reactions included in the selected group of models.

In [9]:
# One-column frame listing every reaction read from the data file.
df_reactions = pd.DataFrame({"rxn": reactions})

# The outer merge adds reactions that are absent from the ANOVA table; their
# missing test values are set to 1.
df = df.merge(df_reactions, how="outer").fillna(1)

### Get the subsystems data

In [10]:
# Reaction -> subsystem annotation table.
df_subsystems = pd.read_csv("models\\iMM865_subsystems.txt", sep=";")

# Duplicate the annotation under the "_f" and "_b" suffixed reaction names so
# that suffixed reaction identifiers can be matched as well.
df_subsystems_f = df_subsystems.assign(rxn=df_subsystems["rxn"] + "_f")
df_subsystems_b = df_subsystems.assign(rxn=df_subsystems["rxn"] + "_b")

# Stack the plain, "_b" and "_f" tables under a fresh 0..n-1 index.
stacked_subsystems = pd.concat(
    [df_subsystems, df_subsystems_b, df_subsystems_f],
    ignore_index=True,
)
df_subsystems = stacked_subsystems.reindex()

df_subsystems.head()

Unnamed: 0,rxn,subsystem
0,10FTHF5GLUtl,"Transport, lysosomal"
1,10FTHF5GLUtm,"Transport, mitochondrial"
2,10FTHF6GLUtl,"Transport, lysosomal"
3,10FTHF6GLUtm,"Transport, mitochondrial"
4,10FTHF7GLUtl,"Transport, lysosomal"


Keep only the reactions that are present in the observed models

In [11]:
# Drop annotation rows whose reaction is not among the loaded reactions.
is_observed = df_subsystems["rxn"].isin(reactions)
df_subsystems = df_subsystems[is_observed]

In [12]:
# Distinct subsystem names that remain after filtering (missing values excluded).
subsystems = df_subsystems["subsystem"].dropna().unique()

In [13]:
#df_subsystems[df_subsystems['rxn'].str.endswith("_f")]

### Merge

In [14]:
# Attach the subsystem annotation; reactions without one keep a missing value.
df = df.merge(df_subsystems, how="left")

In [15]:
# Keep the reaction id, its subsystem and the test columns, in that order.
ordered_columns = ['rxn', 'subsystem', *tests]
df = df[ordered_columns]
df.head()

Unnamed: 0,rxn,subsystem,q(gender),q(genotype),q(diet),"q(gender,genotype)","q(gender,diet)","q(genotype,diet)","q(gender,genotype,diet)"
0,13_CIS_RETNte_b,,0.873724,0.824109,0.859399,0.835817,0.870207,0.573116,1.0
1,13_CIS_RETNte_f,,0.873724,0.824109,0.859399,0.835817,0.870207,0.573116,1.0
2,1331TACRhr,Drug metabolism,0.873724,0.616996,0.859399,0.620733,0.870207,0.573116,1.0
3,1331TACRtev,Drug metabolism,0.873724,0.616996,0.859399,0.620733,0.870207,0.573116,1.0
4,13DMThr,Drug metabolism,0.873724,0.616996,0.859399,0.620733,0.870207,0.573116,1.0


## Analysis

In [16]:
# Hypergeometric over-representation test.
# For every test column and every subsystem, compute P(X >= k): the
# probability of seeing at least k significant reactions in a subsystem of
# N reactions, given n significant reactions among all M reactions.
# Results go into df_enrich: one row per subsystem, one column per test.
df_enrich = pd.DataFrame(columns = ['subsystem'] + tests)
df_enrich['subsystem'] = subsystems

# NOTE(review): M is taken from `reactions`, while n, N and k are counted on
# the rows of `df`; confirm that the merges above cannot add extra rows to `df`.
n_all = len(reactions)

for test in tests:
    df_test = df[[test, 'subsystem']]
    n_signif_all = (df_test[test] < 0.05).sum()
    for subsystem in subsystems:
        df_sub = df_test[df_test.subsystem == subsystem]
        n_sub = len(df_sub)
        n_signif_sub = (df_sub[test] < 0.05).sum()

        M = n_all  # all reactions in a model
        n = n_signif_all  # all significant
        N = n_sub  # reactions in a subsystem
        k = n_signif_sub  # significant in a subsystem

        if n:
            # sf(k - 1) is P(X > k - 1) = P(X >= k). It is the same quantity
            # as 1 - cdf(k - 1), but avoids the cancellation error of the
            # subtraction when the tail probability is very small.
            p = hypergeom.sf(k - 1, M, n, N)
        else:
            # Nothing is significant for this test, so no enrichment is possible.
            p = 1.0

        df_enrich.loc[(df_enrich['subsystem'] == subsystem), test] = p

In [17]:
# Sanity check of the formula: P(X >= 11) for M=4000, n=30, N=100.
1-hypergeom.cdf(10, 4000, 30, 100)

5.880740339136992e-11

## Save the results

In [18]:
# Benjamini-Hochberg correction, applied separately to each test column
# (i.e. across subsystems); the first column holds the subsystem names.
df_enrich_q = df_enrich.copy()

for column in df_enrich_q.columns[1:]:
    corrected = multi.multipletests(df_enrich_q[column], method='fdr_bh')
    # The second element of the result holds the adjusted p-values.
    df_enrich_q[column] = corrected[1]

In [19]:
# Label the uncorrected table's columns "p(...)" instead of "q(...)".
df_enrich.columns = [name.replace("q(", "p(") for name in df_enrich.columns]

In [20]:
# Write the enrichment p-values and their BH-adjusted counterparts to CSV.
enrich_out = "results_enrich\\" + analysis_type + "_" + analysis + "_ANOVA_enrich"
df_enrich.to_csv(enrich_out + ".csv", index=False)
df_enrich_q.to_csv(enrich_out + "_q.csv", index=False)


In [21]:
# Subsystems whose enrichment value is below 0.05 in at least one test column.
any_below_threshold = df_enrich.iloc[:, 1:].lt(0.05).any(axis=1)
df_enrich[any_below_threshold]

Unnamed: 0,subsystem,p(gender),p(genotype),p(diet),"p(gender,genotype)","p(gender,diet)","p(genotype,diet)","p(gender,genotype,diet)"


In [22]:
# Subsystems whose BH-adjusted value is below 0.05 in at least one test column.
any_q_below_threshold = df_enrich_q.iloc[:, 1:].lt(0.05).any(axis=1)
df_enrich_q[any_q_below_threshold]

Unnamed: 0,subsystem,q(gender),q(genotype),q(diet),"q(gender,genotype)","q(gender,diet)","q(genotype,diet)","q(gender,genotype,diet)"
