In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../../../../pycore/')
from utils import metabolites_dict_from_reaction_equation_RBA

In [2]:
# Load path
path_gen = '../../../../build_model/'

prot_path = path_gen + 'input/PROTEIN_stoich_curation.xlsx'
model_xlsx_path = path_gen + 'model/RBA_stoichiometry.xlsx'
ribonuc_path = path_gen + 'input/RIBOSOME_nucleus.xlsx'
ribomito_path = path_gen + 'input/RIBOSOME_mitochondria.xlsx'

In [3]:
df_raw = pd.read_excel('../raw_data_files/Bjorkeroth_Nielsen_2020_data_raw.xlsx',
                         sheet_name='Proteomics', skiprows=[0])
df_raw.index = df_raw.Gene.to_list()

# Load protein
df_prot = pd.read_excel(prot_path)
df_prot.index = df_prot.id.to_list()
# Strip compartment
df_prot.index = [i.split('_')[0] if '_' in i else i for i in df_prot.index]
df_prot['id'] = df_prot.index.to_list()
df_prot = df_prot.drop_duplicates(subset=['id'])
# Protein copy selector
df_select = pd.read_csv('./input/protein_copies_selector.txt', sep='\t')
df_select.index = df_select.gene_src.to_list()

# Ribosome (nucleus and mitochondrial)
df_ribonuc = pd.read_excel(ribonuc_path)
df_ribomito = pd.read_excel(ribomito_path)

In [4]:
#### HANDLE MISSING MEASUREMENTS FOR SUBUNIT COMPONENT OF HETEROMERIC ENZYMES
# E.g., missing subunit measurements for ATP synthase complex
# Stoichiometry
df_eqn = pd.read_excel(model_xlsx_path)
df_eqn.index = df_eqn.id.to_list()

Process data

In [5]:
cols = ['id', 'name', 'uniprot', 'MW (g/mmol)', 'type', 'conc (g/gDW)', 'vtrans (mmol/gDW/h)']
idx = [i for i in df_prot.index if i in df_raw.index]

df_data = pd.DataFrame(columns=cols, index=idx)
cols = ['id', 'name', 'uniprot', 'MW (g/mmol)']
df_data.loc[idx, cols] = df_prot.loc[idx, cols]

mu = 0.4953
ptot = (36.94 + 34.22*mu) / 100

cols_data = ['Rich_aerobic_1', 'Rich_aerobic_2']
for i in df_data.index:
    data = df_raw.loc[i, cols_data]
    data = [c for c in data if pd.isnull(c) == False]
    #if data == []:
    #    df_data.loc[i, 'conc (g/gDW)'] = 0
    #    df_data.loc[i, 'vtrans (mmol/gDW/h)'] = 0
    if data != []:
        c_avg = np.mean(data)
        mw = df_prot.loc[i, 'MW (g/mmol)']
        df_data.loc[i, 'conc (g/gDW)'] = c_avg * ptot
        df_data.loc[i, 'vtrans (mmol/gDW/h)'] = mu * c_avg * ptot / mw
        df_data.loc[i, 'type'] = 'truedata_enz'
        
        if i in df_ribonuc.id.to_list():
            df_data.loc[i, 'type'] = 'truedata_ribonuc'
        elif i in df_ribomito.id.to_list():
            df_data.loc[i, 'type'] = 'truedata_ribomito'

In [6]:
# Reindex - incorporate info from protein copy selector
idx = [df_select.selected_compartmental_copy[i] if i in df_select.index \
       else i for i in df_data.index]
df_data.index = idx
df_data['id'] = df_data.index.to_list()

In [7]:
# Clean out NaN rows
df_data = df_data[df_data['conc (g/gDW)'].isnull() == False]

Gap-fill data

In [8]:
# Load protein
df_prot = pd.read_excel(prot_path)
df_prot.index = df_prot.id.to_list()

In [9]:
idx_enzsyn = df_eqn[df_eqn.id.str.contains('ENZSYN-')].index
cols = ['id', 'name', 'uniprot', 'MW (g/mmol)']

for i in idx_enzsyn:
    x = metabolites_dict_from_reaction_equation_RBA(df_eqn.reaction[i])
    met_dict = dict()
    for k,v in x.items():
        if k == '':
            continue
        if v.is_integer():
            met_dict[k] = int(v)
        else:
            met_dict[k] = v

In [10]:
idx_enzsyn = df_eqn[df_eqn.id.str.contains('ENZSYN-')].index
cols = ['id', 'name', 'uniprot', 'MW (g/mmol)']

for i in idx_enzsyn:
    x = metabolites_dict_from_reaction_equation_RBA(df_eqn.reaction[i])
    met_dict = dict()
    for k,v in x.items():
        if k == '':
            continue
        if v.is_integer():
            met_dict[k] = int(v)
        else:
            met_dict[k] = v
            
    met_dict = {k.split('-', maxsplit=1)[1]:v for k,v in met_dict.items() if v < -1e-6}
    in_data = set(met_dict) & set(df_data.index)
    if len(in_data) > 0.5 and len(in_data) < len(met_dict):
        #print(i, len(in_data), len(met_dict), ','.join(in_data))
        vmin = min([df_data.loc[k, 'vtrans (mmol/gDW/h)'] / met_dict[k] for k in met_dict.keys() if k in in_data])
        cmin = min([df_data.loc[k, 'conc (g/gDW)'] / met_dict[k] for k in met_dict.keys() if k in in_data])
        for k in met_dict.keys():
            if k not in in_data:
                df_data.loc[k, cols] = df_prot.loc[k, cols]
                df_data.loc[k, 'conc (g/gDW)'] = cmin * met_dict[k]
                df_data.loc[k, 'vtrans (mmol/gDW/h)'] = vmin * met_dict[k]
                df_data.loc[k, 'type'] = 'gapfill_subunit'

Calculate fraction of non-enzymatic and non-ribosomal proteins

In [12]:
cols = ['id', 'pfrac (g protein/gDW)']
idx = [i for i in df_raw.index if i not in df_prot.index]

df_nomodel = pd.DataFrame(columns=cols, index=idx)
cols = ['id', 'pfrac (g protein/gDW)']
df_nomodel['id'] = idx

cols_data = ['Rich_aerobic_1', 'Rich_aerobic_2']
for i in df_nomodel.index:
    data = df_raw.loc[i, cols_data]
    data = [c for c in data if pd.isnull(c) == False]
    
    if data == []:
        df_nomodel.loc[i, 'pfrac (g protein/gDW)'] = 0
    elif data != []:
        c_avg = np.mean(data)
        df_nomodel.loc[i, 'pfrac (g protein/gDW)'] = c_avg

In [13]:
df_nomodel['pfrac (g protein/gDW)'].sum()

0.4118734793770407