# Enrichment test
What is the probability of randomly selecting at least k "changed" reactions (out of the n "changed" reactions in the model) when drawing N out of M reactions?   
* k: number of differentially expressed reactions in a subsystem,
* n: number of differentially expressed reactions in the model,
* N: number of reactions in a subsystem,
* M: number of reactions in the model.

$P(x \geq k) = 1 - hypergeom.cdf(k-1, M, n, N)$


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from itertools import permutations, product, combinations
from scipy.stats import pearsonr, spearmanr, mannwhitneyu, hypergeom

from itertools import permutations
from itertools import combinations

In [None]:
#https://www.scribbr.com/statistics/two-way-anova/
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multitest as multi

In [None]:
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning, ValueWarning
# Silence the statsmodels fitting warnings and generic RuntimeWarnings for
# the rest of the session; the filters are registered in the order listed.
for ignored_category in (
    ConvergenceWarning,
    HessianInversionWarning,
    ValueWarning,
    RuntimeWarning,
):
    warnings.filterwarnings("ignore", category=ignored_category)

## Settings

In [None]:
# Name of the analysed result set (presumably the model-extraction method);
# the value is embedded in every input and output file name below.
# Exactly one of the alternatives should be active.
#analysis = "Fastcore"
analysis = "iMAT"

#analysis = "gimme"
#analysis = "init"
#analysis = "tinit"

In [None]:
# Kind of flux analysis whose results are read; like `analysis`, the value is
# embedded in every input and output file name below.
# Exactly one of the alternatives should be active.
#analysis_type = "FVA"
#analysis_type = "pFBA"
analysis_type = "sampling"

In [None]:
# True: read the "_compare_q.csv" pairwise-comparison files;
# False: read the "_compare.csv" files instead.
fdr = True

## Read the data

In [None]:
# The first column of the per-analysis table lists the reactions that the
# enrichment is computed over.
# A forward slash is used as separator because it is accepted on Windows,
# Linux and macOS alike (a hard-coded backslash only works on Windows).
reactions = pd.read_csv(f"data/{analysis_type}_{analysis}.csv", sep=";").iloc[:, 0]

In [None]:
# Pairwise-comparison results per factor. `fdr` selects the "_q" files
# (presumably the FDR-adjusted values) over the plain ones.
# Forward slashes keep the paths valid on Windows, Linux and macOS alike.
pairs_prefix = f"results_pairs/{analysis_type}_{analysis}_"
pairs_suffix = "_compare_q.csv" if fdr else "_compare.csv"

df_genotype = pd.read_csv(f"{pairs_prefix}genotype{pairs_suffix}")
df_gender = pd.read_csv(f"{pairs_prefix}gender{pairs_suffix}")
df_diet = pd.read_csv(f"{pairs_prefix}diet{pairs_suffix}")

In [None]:
# Every column after the first one holds the values of one pairwise test;
# remember the test names per factor.
tests_genotype = df_genotype.columns[1:].tolist()
tests_gender = df_gender.columns[1:].tolist()
tests_diet = df_diet.columns[1:].tolist()

### Fill the analysis data with all the reactions
As a basis I take the union of the reactions included in the selected group of models.

In [None]:
# Single-column frame holding every reaction, used as the merge partner.
df_reactions = pd.DataFrame({"rxn": reactions})

# The outer merge adds the reactions that are missing from a result table;
# their (missing) test values are set to 1, i.e. "not significant".
df_genotype = df_genotype.merge(df_reactions, how="outer").fillna(1)
df_gender = df_gender.merge(df_reactions, how="outer").fillna(1)
df_diet = df_diet.merge(df_reactions, how="outer").fillna(1)

### Get the subsystems data

In [None]:
# Reaction -> subsystem annotation of the model.
# A forward slash is used as separator because it is accepted on Windows,
# Linux and macOS alike (a hard-coded backslash only works on Windows).
df_subsystems = pd.read_csv("models/iMM865_subsystems.txt", sep=";")

# Copies of the annotation with "_f" / "_b" appended to the reaction id
# (presumably the forward / backward halves of split reactions), so that
# suffixed ids receive the subsystem of their parent reaction.
df_subsystems_f = df_subsystems.assign(rxn=df_subsystems['rxn'] + '_f')
df_subsystems_b = df_subsystems.assign(rxn=df_subsystems['rxn'] + '_b')

# ignore_index=True already yields a fresh 0..n-1 index, so no further
# re-indexing is needed.
df_subsystems = pd.concat((df_subsystems, df_subsystems_b, df_subsystems_f), ignore_index=True)

df_subsystems.head()

Keep only the reactions that are present in the observed models

In [None]:
# Drop annotation rows whose reaction id does not occur in `reactions`.
df_subsystems = df_subsystems.loc[df_subsystems["rxn"].isin(reactions)]

In [None]:
# Distinct, non-missing subsystem names in order of first appearance.
subsystems = df_subsystems["subsystem"].dropna().unique()

### Merge

In [None]:
# Attach the subsystem annotation to every result row; the left merge keeps
# rows without an annotation (their subsystem becomes NaN).
df_genotype = df_genotype.merge(df_subsystems, how="left")
df_gender = df_gender.merge(df_subsystems, how="left")
df_diet = df_diet.merge(df_subsystems, how="left")

In [None]:
# Keep only the reaction id, its subsystem and the test columns, in that order.
df_genotype = df_genotype[["rxn", "subsystem", *tests_genotype]]
df_gender = df_gender[["rxn", "subsystem", *tests_gender]]
df_diet = df_diet[["rxn", "subsystem", *tests_diet]]

## Analysis

In [None]:
# Preview the merged genotype table that the enrichment below works on.
# (`df_test` is only assigned inside the enrichment loops further down, so
# displaying it here fails with a NameError when the notebook is run
# top to bottom.)
df_genotype.head()

In [None]:
"""
GENOTYPE
"""
# One-sided hypergeometric enrichment test of every subsystem for every
# genotype comparison. Result: one row per subsystem, one p-value column per
# test.
df_genotype_enrich = pd.DataFrame(columns = ['subsystem'] + tests_genotype)
df_genotype_enrich['subsystem'] = subsystems

n_all = len(reactions)
# NOTE(review): M is taken from `reactions`, while n and N are counted over the
# rows of df_genotype; both describe the same population only if df_genotype
# holds exactly these reactions, one row each - confirm.

for test in tests_genotype:
    df_test = df_genotype[[test, 'subsystem']]
    n_signif_all = (df_test[test] < 0.05).sum()

    p_values = []  # one entry per subsystem, in the order of `subsystems`
    for subsystem in subsystems:
        df_sub = df_test[df_test.subsystem == subsystem]
        n_sub = len(df_sub)
        n_signif_sub = (df_sub[test] < 0.05).sum()

        M = n_all # all reactions in a model
        n = n_signif_all # all significant
        N = n_sub # reactions in a subsystem
        k = n_signif_sub # significant in a subsystem

        if n:
            # sf(k - 1) = P(X >= k). It equals 1 - cdf(k - 1) mathematically
            # but does not lose precision for very small tail probabilities.
            p = hypergeom.sf(k - 1, M, n, N)
        else:
            # nothing is significant in this test, so nothing can be enriched
            p = 1.0

        p_values.append(p)

    # Rows of df_genotype_enrich follow `subsystems`, so a positional column
    # assignment is aligned; it also gives the column a float dtype.
    df_genotype_enrich[test] = p_values

In [None]:
"""
GENDER
"""
# One-sided hypergeometric enrichment test of every subsystem for every
# gender comparison. Result: one row per subsystem, one p-value column per
# test.
df_gender_enrich = pd.DataFrame(columns = ['subsystem'] + tests_gender)
df_gender_enrich['subsystem'] = subsystems

n_all = len(reactions)
# NOTE(review): M is taken from `reactions`, while n and N are counted over the
# rows of df_gender; both describe the same population only if df_gender
# holds exactly these reactions, one row each - confirm.

for test in tests_gender:
    df_test = df_gender[[test, 'subsystem']]
    n_signif_all = (df_test[test] < 0.05).sum()

    p_values = []  # one entry per subsystem, in the order of `subsystems`
    for subsystem in subsystems:
        df_sub = df_test[df_test.subsystem == subsystem]
        n_sub = len(df_sub)
        n_signif_sub = (df_sub[test] < 0.05).sum()

        M = n_all # all reactions in a model
        n = n_signif_all # all significant
        N = n_sub # reactions in a subsystem
        k = n_signif_sub # significant in a subsystem

        if n:
            # sf(k - 1) = P(X >= k). It equals 1 - cdf(k - 1) mathematically
            # but does not lose precision for very small tail probabilities.
            p = hypergeom.sf(k - 1, M, n, N)
        else:
            # nothing is significant in this test, so nothing can be enriched
            p = 1.0

        p_values.append(p)

    # Rows of df_gender_enrich follow `subsystems`, so a positional column
    # assignment is aligned; it also gives the column a float dtype.
    df_gender_enrich[test] = p_values

In [None]:
"""
DIET
"""
# One-sided hypergeometric enrichment test of every subsystem for every
# diet comparison. Result: one row per subsystem, one p-value column per test.
df_diet_enrich = pd.DataFrame(columns = ['subsystem'] + tests_diet)
df_diet_enrich['subsystem'] = subsystems

n_all = len(reactions)
# NOTE(review): M is taken from `reactions`, while n and N are counted over the
# rows of df_diet; both describe the same population only if df_diet holds
# exactly these reactions, one row each - confirm.

for test in tests_diet:
    df_test = df_diet[[test, 'subsystem']]
    n_signif_all = (df_test[test] < 0.05).sum()

    p_values = []  # one entry per subsystem, in the order of `subsystems`
    for subsystem in subsystems:
        df_sub = df_test[df_test.subsystem == subsystem]
        n_sub = len(df_sub)
        n_signif_sub = (df_sub[test] < 0.05).sum()

        k = n_signif_sub # significant in a subsystem
        M = n_all # all reactions in a model
        n = n_signif_all # all significant
        N = n_sub # reactions in a subsystem

        if n:
            # sf(k - 1) = P(X >= k). It equals 1 - cdf(k - 1) mathematically
            # but does not lose precision for very small tail probabilities.
            p = hypergeom.sf(k - 1, M, n, N)
        else:
            # nothing is significant in this test, so nothing can be enriched
            p = 1.0

        p_values.append(p)

        # Spot-check printout of the inputs and result for subsystems starting
        # with "Chole" in the tests starting with "(HFnC".
        if subsystem.startswith("Chole") and test.startswith("(HFnC"):
            print(test)
            print("k, M, n, N:", k, M, n, N)   
            print("p",p)

    # Rows of df_diet_enrich follow `subsystems`, so a positional column
    # assignment is aligned; it also gives the column a float dtype.
    df_diet_enrich[test] = p_values
        

In [None]:
# Sanity check: "at least 0 reactions are changed" is a certain event, so the
# result must be 1. sf(k - 1) is P(X >= k); for k = 0 that is sf(-1).
# Arguments: M = 6781 reactions, n = 110 changed, N = 10 drawn.
p_at_least_0 = hypergeom.sf(-1, 6781, 110, 10)
p_at_least_0

In [None]:
# Worked example: probability of at least 3 changed reactions among N = 10
# drawn, with M = 6781 reactions of which n = 110 are changed.
# sf(2) = P(X >= 3); it equals 1 - cdf(2) but is computed without the
# cancellation error of the subtraction.
p_at_least_3 = hypergeom.sf(2, 6781, 110, 10)
p_at_least_3

## Save the results

In [None]:
# Write the unadjusted enrichment p-values, one file per factor.
# Forward slashes keep the paths valid on Windows, Linux and macOS alike
# (a hard-coded backslash only works on Windows).
enrich_prefix = f"results_enrich/{analysis_type}_{analysis}_pairs_"

df_genotype_enrich.to_csv(f"{enrich_prefix}genotype_enrich.csv", index=False)
df_gender_enrich.to_csv(f"{enrich_prefix}gender_enrich.csv", index=False)
df_diet_enrich.to_csv(f"{enrich_prefix}diet_enrich.csv", index=False)

In [None]:
# Work on copies so that the unadjusted p-value tables stay untouched.
df_genotype_enrich_q = df_genotype_enrich.copy()
df_gender_enrich_q = df_gender_enrich.copy()
df_diet_enrich_q = df_diet_enrich.copy()

# Benjamini-Hochberg adjustment, applied separately to every test column
# (i.e. across the subsystems of one test); element [1] of the result holds
# the adjusted p-values.
for enrich_q in (df_genotype_enrich_q, df_gender_enrich_q, df_diet_enrich_q):
    for test_column in enrich_q.columns[1:]:
        adjusted = multi.multipletests(enrich_q[test_column], method='fdr_bh')[1]
        enrich_q[test_column] = adjusted

In [None]:
# Write the FDR-adjusted enrichment values, one file per factor.
# Forward slashes keep the paths valid on Windows, Linux and macOS alike
# (a hard-coded backslash only works on Windows).
enrich_q_prefix = f"results_enrich/{analysis_type}_{analysis}_pairs_"

df_genotype_enrich_q.to_csv(f"{enrich_q_prefix}genotype_enrich_q.csv", index=False)
df_gender_enrich_q.to_csv(f"{enrich_q_prefix}gender_enrich_q.csv", index=False)
df_diet_enrich_q.to_csv(f"{enrich_q_prefix}diet_enrich_q.csv", index=False)

In [None]:
# Adjusted diet enrichment of the subsystems whose name starts with "Sq",
# limited to the three M_WT HFnC-vs-HFC test columns.
df_diet_enrich_q.loc[
    df_diet_enrich_q["subsystem"].str.startswith("Sq"),
    ["subsystem", "M_WT_(HFnC>HFC)", "M_WT_(HFnC<HFC)", "M_WT_(HFnC:HFC)"],
]


In [None]:
# Subsystems with an adjusted value below 0.05 in at least one genotype test.
df_genotype_enrich_q.loc[df_genotype_enrich_q.iloc[:, 1:].lt(0.05).any(axis=1)]

In [None]:
# Subsystems with an adjusted value below 0.05 in at least one gender test.
df_gender_enrich_q.loc[df_gender_enrich_q.iloc[:, 1:].lt(0.05).any(axis=1)]

In [None]:
# Subsystems with an adjusted value below 0.05 in at least one diet test.
df_diet_enrich_q.loc[df_diet_enrich_q.iloc[:, 1:].lt(0.05).any(axis=1)]