In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../../../../pycore/')
from utils import metabolites_dict_from_reaction_equation_RBA

In [2]:
# Condition-specific settings for this notebook.
# `mu` scales both the total protein content and the translation fluxes
# computed further down; presumably the growth rate in 1/h, given that the
# derived flux column is in mmol/gDW/h -- TODO confirm and cite the source.
mu = 0.1734
# Column(s) of the raw proteomics sheet that are averaged into one abundance
# value per protein.
cols_data = ['Gal']
# Not used in the visible cells; presumably the workbook the processed table
# is written to later -- verify against the rest of the notebook.
fout_name = './Elsemman2022_batchGal.xlsx'

In [3]:
# Input locations, all resolved against the model build directory.
# `path_gen` must keep its trailing slash: the file paths are formed by plain
# string concatenation.
path_gen = '../../../../build_model/'

prot_path = f'{path_gen}input/PROTEIN_stoich_curation.xlsx'
model_xlsx_path = f'{path_gen}model/RBA_stoichiometry.xlsx'
ribonuc_path = f'{path_gen}input/RIBOSOME_nucleus.xlsx'
ribomito_path = f'{path_gen}input/RIBOSOME_mitochondria.xlsx'

In [4]:
# Raw proteomics averages, indexed by the 'Protein' column. The two genes
# listed in `verified_dubious_genes` are removed before any processing.
verified_dubious_genes = ['YJL182C', 'YPL044C']
df_raw = pd.read_excel(
    '../raw_data_files/Elsemman_Nielsen_Teusink_2022_proteomics_sugar_batch.xlsx',
    sheet_name='avgs',
)
df_raw.index = df_raw['Protein'].to_list()
idx = [gene for gene in df_raw.index if gene not in verified_dubious_genes]
df_raw = df_raw.loc[idx, :]

# Protein table of the model, re-keyed by the part of each id before the
# first underscore (presumably the compartment suffix -- TODO confirm the id
# format). Ids that collapse to the same key keep only their first row.
df_prot = pd.read_excel(prot_path)
df_prot.index = [prot_id.split('_')[0] for prot_id in df_prot['id']]
df_prot['id'] = df_prot.index.to_list()
df_prot = df_prot.drop_duplicates(subset=['id'])

# Protein copy selector: maps a source gene (index) to the compartmental copy
# that should represent it.
df_select = pd.read_csv('./input/protein_copies_selector.txt', sep='\t')
df_select.index = df_select['gene_src'].to_list()

# Ribosomal protein lists (nuclear-encoded and mitochondrial ribosome files)
df_ribonuc = pd.read_excel(ribonuc_path)
df_ribomito = pd.read_excel(ribomito_path)

In [5]:
#### Handle missing measurements for subunits of heteromeric enzymes
# (e.g., subunits of the ATP synthase complex that were not measured).
# The model stoichiometry table is loaded here and indexed by reaction id so
# that reaction equations can be looked up by id later on.
df_eqn = pd.read_excel(model_xlsx_path)
df_eqn.index = df_eqn['id'].to_list()

In [6]:
#### Molecular weights
# Tab-separated table (despite the .csv extension), indexed by gene id.
df_mw = pd.read_csv('../scProteins_MW.csv', sep='\t')
df_mw.index = df_mw['gene_id'].to_list()

Process data

In [7]:
# One abundance value per protein: mean over the selected data column(s),
# with unmeasured proteins (NaN) counted as zero.
pdata_raw = df_raw[cols_data].mean(axis=1).fillna(0)
# Molecular weight for every measured protein; .loc raises KeyError if a
# protein has no entry in the MW table.
mw = df_mw.loc[pdata_raw.index.to_list(), 'MW (g/mmol)']

# Abundance x MW gives a per-protein mass (presumably abundances are molar --
# TODO confirm the unit of the raw sheet).
prot_mass = pdata_raw * mw
weight_tot = sum(prot_mass)
# pdata: each protein's share of the total measured proteome mass
# (g protein / g total protein). It only becomes g/gDW once it is multiplied
# by the total protein content later in the notebook.
pdata = prot_mass / weight_tot
pdata = pdata[pdata > 0]

In [8]:
# Build the per-protein data table for every model protein that has a
# positive measured mass fraction.
cols = ['id', 'name', 'uniprot', 'MW (g/mmol)', 'type', 'conc (g/gDW)', 'vtrans (mmol/gDW/h)']
idx = [i for i in df_prot.index if i in pdata.index]

df_data = pd.DataFrame(columns=cols, index=idx)
# Descriptive columns are copied straight from the protein table.
cols = ['id', 'name', 'uniprot', 'MW (g/mmol)']
df_data.loc[idx, cols] = df_prot.loc[idx, cols]

# Total protein content as a linear function of mu, divided by 100
# (presumably a percentage of dry weight converted to g protein / gDW; the
# origin of the coefficients is not shown here -- TODO cite the source).
ptot = (36.94 + 34.22*mu) / 100

# Ribosomal protein ids, built once as sets so the per-row membership tests
# below do not rebuild and scan a list on every iteration.
ribonuc_ids = set(df_ribonuc.id.to_list())
ribomito_ids = set(df_ribomito.id.to_list())

for i in df_data.index:
    pval = pdata[i]
    mw = df_data.loc[i, 'MW (g/mmol)']
    # Mass fraction of proteome x total protein content -> g / gDW
    df_data.loc[i, 'conc (g/gDW)'] = pval * ptot
    # mu x concentration / MW -> mmol / gDW / h
    df_data.loc[i, 'vtrans (mmol/gDW/h)'] = mu * pval * ptot / mw

    # Nuclear-ribosome membership takes precedence over mitochondrial;
    # everything else is labelled as an enzyme measurement.
    if i in ribonuc_ids:
        df_data.loc[i, 'type'] = 'truedata_ribonuc'
    elif i in ribomito_ids:
        df_data.loc[i, 'type'] = 'truedata_ribomito'
    else:
        df_data.loc[i, 'type'] = 'truedata_enz'

In [9]:
# Remember the ids as they appear in the raw data, before they are replaced
# by compartment-specific ids below.
# NOTE: this cell is not idempotent -- re-running it after the re-indexing
# would store the already-renamed ids.
idx_truedata_old = df_data.index.to_list()

# Re-index using the protein copy selector: a gene listed in the selector is
# replaced by its selected compartmental copy; all other ids stay unchanged.
selected_copy = df_select.selected_compartmental_copy.to_dict()
idx = [selected_copy.get(gene, gene) for gene in df_data.index]
df_data.index = idx
df_data['id'] = df_data.index.to_list()

In [10]:
# Keep only rows that have a concentration value
has_conc = df_data['conc (g/gDW)'].notna()
df_data = df_data[has_conc]

Gap-fill data

In [11]:
# Reload the protein table with its original ids as index. The copy loaded
# earlier had its ids truncated at the first underscore and de-duplicated,
# so it cannot be looked up by the full ids used from here on.
df_prot = pd.read_excel(prot_path)
df_prot.index = df_prot['id'].to_list()

In [12]:
# Reactions whose id contains 'ENZSYN-', and the descriptive columns that are
# copied from the protein table when a subunit is gap-filled.
# The per-reaction parsing is done in the gap-filling loop that follows; a
# copy of that loop's first half used to run here and discard its result, so
# it has been removed.
idx_enzsyn = df_eqn[df_eqn.id.str.contains('ENZSYN-')].index
cols = ['id', 'name', 'uniprot', 'MW (g/mmol)']

In [13]:
# Gap-fill subunits that are part of an enzyme-synthesis reaction but have no
# row in df_data, using the subunits of the same reaction that do.
idx_enzsyn = df_eqn[df_eqn.id.str.contains('ENZSYN-')].index
cols = ['id', 'name', 'uniprot', 'MW (g/mmol)']

for i in idx_enzsyn:
    x = metabolites_dict_from_reaction_equation_RBA(df_eqn.reaction[i])

    # Keep only species with a clearly negative coefficient (consumed by the
    # reaction) and drop everything up to the first '-' of the species name
    # (presumably a type prefix -- TODO confirm the naming scheme).
    # Coefficients stay negative in met_dict.
    met_dict = {k.split('-', maxsplit=1)[1]: v for k, v in x.items()
                if k != '' and v < -1e-6}

    # df_data grows inside this loop, so subunits gap-filled for an earlier
    # reaction count as "in data" for later ones.
    in_data = set(met_dict) & set(df_data.index)

    # Act only on partially covered complexes: at least one subunit known,
    # at least one missing.
    if 0 < len(in_data) < len(met_dict):
        #print(i, len(in_data), len(met_dict), ','.join(in_data))
        # NOTE(review): the coefficients are negative, so every ratio below is
        # negative and min() selects the ratio with the LARGEST magnitude,
        # i.e. the most abundant known subunit per stoichiometric unit. The
        # names vmin/cmin suggest the least abundant one may have been
        # intended -- confirm before relying on these values.
        vmin = min([df_data.loc[k, 'vtrans (mmol/gDW/h)'] / met_dict[k] for k in met_dict if k in in_data])
        cmin = min([df_data.loc[k, 'conc (g/gDW)'] / met_dict[k] for k in met_dict if k in in_data])
        for k in met_dict:
            if k in in_data:
                continue
            idx_truedata_old.append(k)
            # Raises KeyError if the subunit is absent from the protein table.
            df_data.loc[k, cols] = df_prot.loc[k, cols]
            # negative ratio x negative coefficient -> positive value.
            # NOTE(review): the mass concentration is scaled by stoichiometry
            # only, not by this subunit's own MW, so conc and vtrans of a
            # gap-filled row are not tied together by its MW the way the
            # measured rows are -- confirm this is intended.
            df_data.loc[k, 'conc (g/gDW)'] = cmin * met_dict[k]
            df_data.loc[k, 'vtrans (mmol/gDW/h)'] = vmin * met_dict[k]
            df_data.loc[k, 'type'] = 'gapfill_subunit'

Calculate fraction of non-enzymatic and non-ribosomal proteins

In [15]:
# Share of the measured proteome mass carried by proteins that were not
# matched to a model protein (i.e. are not in idx_truedata_old).
# A set makes each membership test O(1) instead of scanning the list.
truedata_ids = set(idx_truedata_old)
idx = [i for i in pdata.index if i not in truedata_ids]
pdata[idx].sum()

0.29190918752160655