In [1]:
import pandas as pd
import os
from os.path import join
import matplotlib.pyplot as plt
import numpy as np
import json
import pickle5 as pickle
import warnings
warnings.filterwarnings("ignore")

## 1. Loading and preprocessing fluxes

In [2]:
# Directory holding the pickled flux predictions, relative to this notebook.
flux_dir = join(*([".."] * 3), "data", "BiGG_data", "fluxes")

# BiGG model identifiers for which flux predictions are loaded.
model_IDs = [
    "iSbBS512_1146",
    "iJN1463",
    "iIT341",
    "iHN637",
    "iEK1008",
    "iECO111_1330",
]

In [3]:
def read_fluxes(file, model_ID):
    """Load pickled flux samples and return one normalised mean flux per reaction.

    Parameters
    ----------
    file : str
        Path to a pickle containing a pandas object whose index labels are
        reaction IDs and whose row values are the flux samples.
    model_ID : str
        Identifier stored in the ``model_ID`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``reaction``, ``flux_pFBA`` and ``model_ID``. ``flux_pFBA`` is
        the mean absolute flux over all samples larger than 1e-6 (NaN when no
        sample exceeds the threshold), rescaled so that the mean over all
        non-NaN reactions equals 1.
    """
    # NOTE(security): unpickling executes arbitrary code - only load trusted files.
    with open(file, "rb") as fh:
        df = pickle.load(fh)

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and row-wise appending is quadratic anyway.
    records = []
    for ind in df.index:
        fluxes = np.abs(np.array(df.loc[ind]))
        fluxes = fluxes[fluxes > 1e-6]
        mean_flux = np.mean(fluxes) if len(fluxes) > 0 else np.nan
        records.append({"reaction": ind, "flux_pFBA": mean_flux, "model_ID": model_ID})

    df_flux = pd.DataFrame(records, columns=["reaction", "flux_pFBA", "model_ID"])
    df_flux["flux_pFBA"] = df_flux["flux_pFBA"].astype(float)

    # Normalize fluxes such that the mean is 1. Series.mean() skips NaN and the
    # division leaves NaN entries untouched, so no masked (chained) write is needed.
    df_flux["flux_pFBA"] = df_flux["flux_pFBA"] / df_flux["flux_pFBA"].mean()

    return df_flux


def adding_FVA_fluxes(df_flux, file):
    """Add a normalised ``flux_FVA`` column to ``df_flux`` from pickled FVA fluxes.

    Parameters
    ----------
    df_flux : pandas.DataFrame
        Table with a ``reaction`` column. It is modified in place (the
        ``flux_FVA`` column is added or overwritten) and also returned.
    file : str
        Path to a pickle containing a pandas object whose index labels are
        reaction IDs and whose row values are the flux samples.

    Returns
    -------
    pandas.DataFrame
        The same ``df_flux`` object. ``flux_FVA`` holds the mean absolute flux
        over all samples larger than 1e-6 for reactions found in the pickle
        (NaN otherwise), rescaled so that the mean over all non-NaN rows is 1.
    """
    # NOTE(security): unpickling executes arbitrary code - only load trusted files.
    with open(file, "rb") as fh:
        df = pickle.load(fh)

    # Mean flux per reaction; reactions without any sample above the threshold
    # are left out so that they map to NaN below.
    mean_by_reaction = {}
    for ind in df.index:
        fluxes = np.abs(np.array(df.loc[ind]))
        fluxes = fluxes[fluxes > 1e-6]
        if len(fluxes) > 0:
            mean_by_reaction[ind] = np.mean(fluxes)

    # One vectorised lookup replaces the per-reaction chained assignment
    # (df["col"].loc[mask] = value), which is a silent no-op under Copy-on-Write.
    df_flux["flux_FVA"] = df_flux["reaction"].map(mean_by_reaction).astype(float)

    # Normalize fluxes such that the mean is 1 (NaN entries stay NaN).
    df_flux["flux_FVA"] = df_flux["flux_FVA"] / df_flux["flux_FVA"].mean()

    return df_flux

#### Loading predicted fluxes. The fluxes were predicted using code from the following GitHub repository: https://github.com/Nina181/kcat_flux_relationship

In [4]:
# Load the pFBA and FVA fluxes of every model and stack them into one table.
flux_columns = ["reaction", "flux_pFBA", "flux_FVA", "model_ID"]
flux_tables = []

for model_id in model_IDs:
    # `df_flux` keeps its name because later cells read it; after the loop it
    # holds only the table of the last model in `model_IDs`.
    df_flux = read_fluxes(file = join(flux_dir, "pfba", "10000_" + model_id + ".pkl"), model_ID = model_id)
    df_flux = adding_FVA_fluxes(df_flux, file = join(flux_dir, "fva", "10000_a_" + model_id + ".pkl"))
    flux_tables.append(df_flux)

# Concatenate once instead of growing the frame inside the loop, and do not
# concatenate onto an empty frame (quadratic, deprecated dtype handling).
if flux_tables:
    df = pd.concat(flux_tables, ignore_index = True)[flux_columns]
else:
    df = pd.DataFrame(columns = flux_columns)

In [5]:
# Display the combined flux table of all models (pandas truncates long frames).
df

Unnamed: 0,reaction,flux_pFBA,flux_FVA,model_ID
0,EX_doxrbcn_e,,,iSbBS512_1146
1,EX_dtmp_e,1.298625,1.397942,iSbBS512_1146
2,EX_dump_e,1.389395,1.503175,iSbBS512_1146
3,EX_duri_e,1.527524,,iSbBS512_1146
4,EX_eca4colipa_e,,,iSbBS512_1146
...,...,...,...,...
10838,DMALRED,0.178703,,iECO111_1330
10839,AMMQT8_2,,,iECO111_1330
10840,CELLBpts_1,,,iECO111_1330
10841,FFSD,1.116601,,iECO111_1330


## 2. Mapping fluxes to kcat values

In [6]:
# Load the kcat data set that already carries BiGG reaction IDs.
kcat_pickle_path = join("..", "..", "..", "data", "kcat_data", "df_kcat_with_BiGG_IDs.pkl")
df_kcat = pd.read_pickle(kcat_pickle_path)

#### (a) Getting organism names:

Creating a text file with all UniProt IDs:

In [7]:
# Unique UniProt IDs of all kcat data points (order is arbitrary).
IDs = list(set(df_kcat["Uniprot ID"]))

# Write one ID per line; the context manager closes the file even if a write fails.
with open(join("..", "..", "..", "data", "enzyme_data", "UNIPROT_IDs_flux.txt"), "w") as f:
    f.writelines(str(ID) + "\n" for ID in IDs)

Using the UniProt mapping service to get the organism name for all data points:

In [8]:
# Read the UniProt mapping result and drop the "Entry" column before merging.
UNIPROT_df = pd.read_csv(join("..", "..", "..", "data", "enzyme_data",  "UNIPROT_results_flux.tsv"), sep = "\t")
UNIPROT_df = UNIPROT_df.drop(columns = ["Entry"])

# Attach the UniProt columns to every kcat data point and keep only rows that
# have a UniProt ID. The explicit copy makes the filtered frame independent, so
# the column assignments in later cells act on a real frame and not on a slice.
df_kcat = df_kcat.merge(UNIPROT_df, how = "left", on = "Uniprot ID")
df_kcat = df_kcat.loc[df_kcat["Uniprot ID"].notna()].copy()
df_kcat.head()

Unnamed: 0,Reaction ID,Sequence ID,kcat_values,Uniprot IDs,from_BRENDA,from_Sabio,from_Uniprot,checked,Sequence,substrates,...,ESM1b,ESM1b_ts,log10_kcat,frac_of_max_UID,frac_of_max_RID,frac_of_max_EC,Uniprot ID,BiGG acc,BiGG ID,Organism
0,Reaction_127,Sequence_1959,[1.7],[Q7Z4W1],[0],[1],[0],[False],MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,...,"[-0.026505822, 0.16142353, 0.12178893, -0.1417...","[0.47018716, 0.1352054, 0.22768608, 1.1729294,...",0.230449,0.447368,0.077273,0.014286,Q7Z4W1,0.982542,ALR2,Homo sapiens (Human)
1,Reaction_796,Sequence_2315,[21.9],[Q8U4F6],[1],[0],[0],[True],MNYRYPPRYGPEWGSGGIYGLRFHNGTLYFTLAFEGEAHFITEDSH...,"{InChI=1S/H2O/h1H2, InChI=1S/C12H15NO8/c14-5-8...",...,"[0.035813812, 0.1608091, 0.010744683, 0.140818...","[0.20405163, 0.7924612, 0.029229933, 0.6447696...",1.340444,1.0,0.722772,1.0,Q8U4F6,0.860177,SALCNH,Pyrococcus furiosus (strain ATCC 43587 / DSM 3...
2,Reaction_565,Sequence_473,[2.85],[Q92871],[1],[0],[0],[True],MAVTAQAARRKERVLCLFDVDGTLTPARQKIDPEVAAFLQKLRSRV...,{InChI=1S/C6H13O9P/c7-1-2-3(8)4(9)5(10)6(14-2)...,...,"[-0.059231035, 0.20886274, -0.04375118, -0.059...","[-0.45956117, -0.6061388, -0.36703074, 0.82122...",0.454845,1.0,1.0,0.021127,Q92871,0.90099,S7PI_r,Homo sapiens (Human)
3,Reaction_781,Sequence_2711,[666.0],[Q9RF52],[1],[0],[0],[True],MTEAMKITLSTQPADARWGDKATYSINNDGITLHLNGKDDLGLIQR...,"{InChI=1S/H2O/h1H2, InChI=1S/C10H18N2O5/c1-5(2...",...,"[-0.014622692, 0.18103217, -0.005008551, 0.146...","[1.0933731, 0.75559866, -0.15526822, 0.3362466...",2.823474,0.72549,1.0,0.701053,Q9RF52,0.89426,LEULEULAPc,Salmonella typhimurium (strain LT2 / SGSC1412 ...
4,Reaction_3860,Sequence_2247,[0.025166666666666667],[B2HMK0],[0],[0],[1],[False],MAYHNPFIVNGKIRFPENTNLVRHVEKWARVRGDKLAYRFLDFSTE...,"{InChI=1S/p+1, InChI=1S/C10H16N5O13P3/c11-8-5-...",...,"[0.011039141, 0.24913643, 0.12042855, 0.022845...","[-0.2883399, -0.17133243, -0.3643182, 1.376142...",-1.599174,1.0,0.134821,1.572917,B2HMK0,0.966038,2AGPEAT160,Mycobacterium marinum (strain ATCC BAA-535 / M)


#### (b) Find correct BiGG model

In [9]:
def find_BiGG_model(organism):
    """Return the BiGG model ID whose species name occurs in ``organism``.

    The species names are tested in the order listed below and the first match
    wins; ``None`` is returned when no species name is contained in ``organism``.
    """
    species_to_model = (
        ("Escherichia coli", "iECO111_1330"),
        ("Mycobacterium tuberculosis", "iEK1008"),
        ("Clostridium ljungdahlii", "iHN637"),
        ("Shigella boydii", "iSbBS512_1146"),
        ("Pseudomonas putida", "iJN1463"),
        ("Helicobacter pylori", "iIT341"),
    )
    for species, bigg_model in species_to_model:
        if species in organism:
            return bigg_model
    return None

In [10]:
# Assign the matching BiGG model to every data point; rows without an organism
# or without a matching model get NaN.
model_ids = []
for org in df_kcat["Organism"]:
    model_ID = None if pd.isnull(org) else find_BiGG_model(organism = org)
    model_ids.append(np.nan if model_ID is None else model_ID)

# Assign the whole column at once: element-wise chained assignment
# (df["col"][ind] = value) does not reliably write back to the frame, and an
# object column avoids writing strings into a float NaN column.
df_kcat["model_ID"] = pd.Series(model_ids, index = df_kcat.index, dtype = object)

#### (c) Mapping fluxes to df_kcat

In [11]:
# NOTE: `df_flux` is the variable left over from the flux-loading loop, so it only
# holds the last model processed (iECO111_1330); the table with all models is `df`.
df_flux.head()

Unnamed: 0,reaction,flux_pFBA,model_ID,flux_FVA
0,EX_cit_e,2.117947,iECO111_1330,
1,EX_cl_e,0.000429,iECO111_1330,
2,EX_cm_e,,iECO111_1330,
3,EX_cmp_e,1.471923,iECO111_1330,0.861807
4,EX_co2_e,2.356075,iECO111_1330,


In [12]:
def find_flux_with_model_ID(model_ID, bigg_ID, flux_table = None):
    """Return the mean flux of reaction ``bigg_ID`` in the model ``model_ID``.

    The pFBA flux is preferred; the FVA flux is used only when no non-NaN pFBA
    flux exists for the reaction. NaN is returned when neither is available.

    Parameters
    ----------
    model_ID : str
        BiGG model identifier to restrict the lookup to.
    bigg_ID : str
        BiGG reaction identifier.
    flux_table : pandas.DataFrame, optional
        Table with the columns ``reaction``, ``flux_pFBA``, ``flux_FVA`` and
        ``model_ID``. Defaults to the notebook-level table ``df`` that combines
        all models. (The previous implementation read ``df_flux``, which after
        the loading loop only contains the last model, so every other model
        returned NaN.)
    """
    table = df if flux_table is None else flux_table
    rows = table.loc[(table["model_ID"] == model_ID) & (table["reaction"] == bigg_ID)]

    for column in ("flux_pFBA", "flux_FVA"):
        # Drop NaN before testing for emptiness; otherwise a NaN pFBA entry
        # counts as "found" and the FVA fallback is never reached.
        fluxes = pd.to_numeric(rows[column], errors = "coerce").dropna()
        if len(fluxes) > 0:
            return float(fluxes.mean())
    return np.nan
        
def find_flux(bigg_ID, flux_table = None):
    """Return the mean flux of reaction ``bigg_ID`` averaged over all models.

    The pFBA flux is preferred; the FVA flux is used only when no non-NaN pFBA
    flux exists for the reaction. NaN is returned when neither is available.

    Parameters
    ----------
    bigg_ID : str
        BiGG reaction identifier.
    flux_table : pandas.DataFrame, optional
        Table with the columns ``reaction``, ``flux_pFBA`` and ``flux_FVA``.
        Defaults to the notebook-level table ``df`` that combines all models.
        (The previous implementation read ``df_flux``, which after the loading
        loop only contains the last model.)
    """
    table = df if flux_table is None else flux_table
    rows = table.loc[table["reaction"] == bigg_ID]

    for column in ("flux_pFBA", "flux_FVA"):
        # Drop NaN before testing for emptiness; otherwise a NaN pFBA entry
        # counts as "found" and the FVA fallback is never reached.
        fluxes = pd.to_numeric(rows[column], errors = "coerce").dropna()
        if len(fluxes) > 0:
            return float(fluxes.mean())
    return np.nan

In [13]:
# Look up a flux for every kcat data point: model-specific when a BiGG model
# was assigned to the organism, otherwise across all available fluxes.
mapped_fluxes = []
for model_ID, bigg_ID in zip(df_kcat["model_ID"], df_kcat["BiGG ID"]):
    # A trailing "_r" is stripped before the lookup; missing (non-string) IDs
    # are passed on unchanged and simply find no flux.
    if isinstance(bigg_ID, str) and bigg_ID.endswith("_r"):
        bigg_ID = bigg_ID[:-2]
    if not pd.isnull(model_ID):
        flux = find_flux_with_model_ID(model_ID = model_ID, bigg_ID = bigg_ID)
    else:
        flux = find_flux(bigg_ID = bigg_ID)
    mapped_fluxes.append(flux)

# Assign the whole column at once: element-wise chained assignment
# (df["col"][ind] = value) does not reliably write back to the frame.
df_kcat["flux"] = pd.Series(mapped_fluxes, index = df_kcat.index, dtype = float)

In [14]:
# Replace missing fluxes by the mean of all available fluxes. fillna on the
# column replaces the chained assignment (df["col"].loc[mask] = value), which
# does not reliably write back to the frame.
df_kcat["flux"] = df_kcat["flux"].fillna(df_kcat["flux"].mean())

In [15]:
# Store the kcat data set together with the mapped fluxes.
kcat_flux_path = join("..", "..", "..", "data", "kcat_data", "df_kcat_with_fluxes.pkl")
df_kcat.to_pickle(kcat_flux_path)