# Assess the enrichment of metabolic reactions
Compares pairs of models (healthy, infected) at the reaction level, based on flux samples. For each reaction, calculates a p-value indicating the significance of the difference between its activity in the healthy and in the infected model, using the two-sample Kolmogorov–Smirnov test. Also calculates the fold change of each reaction between the healthy model ($R_h$) and the infected model ($R_i$), where $\overline{R}$ denotes the mean flux over all samples:
$$   FC = \frac{\overline{R_i} - \overline{R_h}}{\left|\overline{R_i} + \overline{R_h}\right|} $$

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from scipy.stats import ks_2samp
import statsmodels.stats.multitest as multi

## Set folders

In [10]:
# Folder names used by the notebook: the flux samples are read from
# `sampling_folder`; the two result folders receive the CSV files written by
# the enrichment and active-reaction cells.
(
    sampling_folder,
    results_folder,
    results_active_reactions_folder,
) = (
    "flux_samples",
    "results_enrichment_reactions",
    "results_active_reactions_pairs",
)

In [11]:
# Display labels, keyed by the upper-cased token found in the sample file names.
MEM_labels = dict(INIT="INIT", TINIT="tINIT", GIMME="GIMME", IMAT="iMAT")
dataset_labels = dict([
    ('NHBE', 'HBE'),
    ('LUNG', 'Lung'),
    ('293T', '293T'),
    ('CALU', 'Calu-3'),
    ('A549', 'A549'),
])

# Fixed ordering of the categories (expressed with the display labels).
order = {}
order['MEM'] = ['iMAT', 'GIMME', 'INIT', 'tINIT']
order['infection'] = ['healthy', 'infected']
order['dataset'] = ['HBE', 'Lung', '293T', 'Calu-3', 'A549']

## Preprocessing

In [12]:
# Collect the flux-sample CSV files. Sorting makes the processing order
# independent of the order in which the operating system lists the directory.
file_names = sorted(
    file_name for file_name in os.listdir(sampling_folder) if file_name.endswith('.csv')
)

# Group the files by model-extraction method (MEM), dataset and infection
# state, and build a display label for every file.
MEMs = {}
datasets = {}
infections = {}
labels = {}
labels_no_infection = {}

for file in file_names:
    experiment = os.path.splitext(file)[0]

    # The name is split on "_": token 0 is the MEM, token 1 the dataset and
    # token 3 the infection flag ('H' means healthy, anything else infected).
    tokens = experiment.split("_")
    MEM = MEM_labels[tokens[0].upper()]
    dataset = dataset_labels[tokens[1].upper()]
    infection = 'healthy' if tokens[3] == 'H' else 'infected'

    # Register the file exactly once in each grouping.
    MEMs.setdefault(MEM, []).append(file)
    datasets.setdefault(dataset, []).append(file)
    infections.setdefault(infection, []).append(file)

    labels[file] = f'{MEM}_{dataset} {infection}'
    labels_no_infection[file] = f'{MEM}_{dataset}'
  

## Compare pairs

Generate pairs of models (healthy, infected)

In [13]:
# Build one [healthy_file, infected_file] pair per (MEM, dataset) combination.
healthy_files = set(infections.get('healthy', []))
infected_files = set(infections.get('infected', []))

pairs = []
for MEM, files1 in MEMs.items():
    for dataset, files2 in datasets.items():
        MEM_dataset = set(files1) & set(files2)
        # sorted() makes the choice deterministic if several files match.
        healthy = sorted(healthy_files & MEM_dataset)
        infected = sorted(infected_files & MEM_dataset)
        if not healthy or not infected:
            # Not every MEM/dataset combination necessarily has both samples;
            # skip it instead of failing with an IndexError.
            print(f"Skipping {MEM}_{dataset}: no complete (healthy, infected) pair")
            continue
        pairs.append([healthy[0], infected[0]])

Go through the predefined pairs and, for each reaction, assess whether the difference between the healthy and the infected model is significant.

In [14]:
# Cut-offs for calling a reaction enriched (+1) or depleted (-1): the absolute
# fold change must reach FC_threshold and the BH-adjusted p-value (q) must be
# below q_threshold.
FC_threshold = 0.82
q_threshold = 0.05

# Make sure the output folder exists before any result file is written.
os.makedirs(results_folder, exist_ok=True)

for pair in pairs:
    print(pair, "-->", labels_no_infection[pair[0]])
    # os.path.join keeps the paths portable; a literal backslash separator
    # only works on Windows.
    df_healthy = pd.read_csv(os.path.join(sampling_folder, pair[0]), sep=";")
    df_infected = pd.read_csv(os.path.join(sampling_folder, pair[1]), sep=";")

    reactions = sorted(set(df_healthy.columns) | set(df_infected.columns))

    # A reaction missing from one model is treated as zero flux in every
    # sample of that model, so the padding uses that model's own sample count.
    zeros_healthy = np.zeros(df_healthy.shape[0])
    zeros_infected = np.zeros(df_infected.shape[0])

    # Collect the results in lists and build the frame once, instead of
    # locating each reaction's row with a boolean mask (quadratic in the
    # number of reactions).
    fold_changes = []
    p_values = []
    for reaction in reactions:
        if reaction in df_healthy.columns:
            healthy = df_healthy[reaction].values
        else:
            healthy = zeros_healthy

        if reaction in df_infected.columns:
            infected = df_infected[reaction].values
        else:
            infected = zeros_infected

        mean_healthy = np.mean(healthy)
        mean_infected = np.mean(infected)

        if mean_healthy != 0 or mean_infected != 0:
            # FC = (mean_i - mean_h) / |mean_i + mean_h|
            # NOTE(review): if the two means cancel exactly the denominator is
            # zero; with NumPy floats this presumably yields +/-inf (plus a
            # RuntimeWarning), which still passes the threshold test below.
            FC = (mean_infected - mean_healthy) / abs(mean_infected + mean_healthy)
            p = ks_2samp(healthy, infected).pvalue
        else:
            # Inactive in both models: no change, not significant.
            FC = 0.0
            p = 1.0

        fold_changes.append(FC)
        p_values.append(p)

    df = pd.DataFrame({'reaction': reactions, 'FC': fold_changes, 'p': p_values})

    # Benjamini-Hochberg correction over all reactions of this pair.
    df['q'] = multi.multipletests(df['p'], method='fdr_bh')[1]

    df['enrichment'] = 0
    significant = df['q'] < q_threshold
    df.loc[(df['FC'] >= FC_threshold) & significant, 'enrichment'] = 1
    df.loc[(df['FC'] <= -FC_threshold) & significant, 'enrichment'] = -1
    df = df.fillna(0)

    df.to_csv(os.path.join(results_folder, labels_no_infection[pair[0]] + ".csv"), index=False)

['Gimme_293T_sample_H.csv', 'Gimme_293T_sample_I.csv'] --> GIMME_293T
['Gimme_A549_sample_H.csv', 'Gimme_A549_sample_I.csv'] --> GIMME_A549
['Gimme_CALU_sample_H.csv', 'Gimme_CALU_sample_I.csv'] --> GIMME_Calu-3
['Gimme_Lung_sample_H.csv', 'Gimme_Lung_sample_I.csv'] --> GIMME_Lung
['Gimme_NHBE_sample_H.csv', 'Gimme_NHBE_sample_I.csv'] --> GIMME_HBE
['iMAT_293T_sample_H.csv', 'iMAT_293T_sample_I.csv'] --> iMAT_293T
['iMAT_A549_sample_H.csv', 'iMAT_A549_sample_I.csv'] --> iMAT_A549
['iMAT_CALU_sample_H.csv', 'iMAT_CALU_sample_I.csv'] --> iMAT_Calu-3
['iMAT_Lung_sample_H.csv', 'iMAT_Lung_sample_I.csv'] --> iMAT_Lung
['iMAT_NHBE_sample_H.csv', 'iMAT_NHBE_sample_I.csv'] --> iMAT_HBE
['init_293T_sample_H.csv', 'init_293T_sample_I.csv'] --> INIT_293T
['init_A549_sample_H.csv', 'init_A549_sample_I.csv'] --> INIT_A549
['init_CALU_sample_H.csv', 'init_CALU_sample_I.csv'] --> INIT_Calu-3
['init_Lung_sample_H.csv', 'init_Lung_sample_I.csv'] --> INIT_Lung
['init_NHBE_sample_H.csv', 'init_NHBE_sampl

## Extract active reactions

Extract and save (to CSV) the reactions that are active (i.e. carry a non-zero flux in at least one sample) in the healthy or the infected model, for each MEM and dataset.

In [15]:
# Make sure the output folder exists before any result file is written.
os.makedirs(results_active_reactions_folder, exist_ok=True)

for pair in pairs:
    print(pair, "-->", labels_no_infection[pair[0]])
    # os.path.join keeps the paths portable; a literal backslash separator
    # only works on Windows.
    df_healthy = pd.read_csv(os.path.join(sampling_folder, pair[0]), sep=";")
    df_infected = pd.read_csv(os.path.join(sampling_folder, pair[1]), sep=";")

    # A reaction is active if any of its sampled fluxes is non-zero in the
    # healthy or the infected model. The list keeps first-seen order (healthy
    # columns first); the set gives constant-time duplicate checks.
    active_reactions = []
    seen = set()
    for df_model in (df_healthy, df_infected):
        for reaction in df_model.columns:
            if reaction not in seen and np.any(df_model[reaction].values):
                seen.add(reaction)
                active_reactions.append(reaction)

    print(len(active_reactions), "reactions")

    # Write the reaction identifiers as a single ';'-separated line; the
    # context manager closes the file even if the write fails.
    out_path = os.path.join(results_active_reactions_folder, labels_no_infection[pair[0]] + ".csv")
    with open(out_path, 'w') as f:
        f.write(";".join(active_reactions))


['Gimme_293T_sample_H.csv', 'Gimme_293T_sample_I.csv'] --> GIMME_293T
3155 reactions
['Gimme_A549_sample_H.csv', 'Gimme_A549_sample_I.csv'] --> GIMME_A549
3117 reactions
['Gimme_CALU_sample_H.csv', 'Gimme_CALU_sample_I.csv'] --> GIMME_Calu-3
3147 reactions
['Gimme_Lung_sample_H.csv', 'Gimme_Lung_sample_I.csv'] --> GIMME_Lung
3262 reactions
['Gimme_NHBE_sample_H.csv', 'Gimme_NHBE_sample_I.csv'] --> GIMME_HBE
3040 reactions
['iMAT_293T_sample_H.csv', 'iMAT_293T_sample_I.csv'] --> iMAT_293T
3563 reactions
['iMAT_A549_sample_H.csv', 'iMAT_A549_sample_I.csv'] --> iMAT_A549
3111 reactions
['iMAT_CALU_sample_H.csv', 'iMAT_CALU_sample_I.csv'] --> iMAT_Calu-3
3600 reactions
['iMAT_Lung_sample_H.csv', 'iMAT_Lung_sample_I.csv'] --> iMAT_Lung
4102 reactions
['iMAT_NHBE_sample_H.csv', 'iMAT_NHBE_sample_I.csv'] --> iMAT_HBE
3176 reactions
['init_293T_sample_H.csv', 'init_293T_sample_I.csv'] --> INIT_293T
1453 reactions
['init_A549_sample_H.csv', 'init_A549_sample_I.csv'] --> INIT_A549
1557 reactions