In [1]:
import cobra
import pandas as pd
import numpy as np

import sys
sys.path.append('/home/hvdinh16/Workspace/workpy3/common/')
from custom_functions import *

In [2]:
model = cobra.io.load_json_model('./input/model/y834_hvd_v3_rba.json')

#### Assign subunit stoichiometry of isozyme to an empty assignment

Assume isozymes having the same subunit stoichiometry

In [3]:
df_uni = pd.read_excel('./enz_info_uniprot_step2.xlsx')
df_uni.index = df_uni.id.to_list()

for rxn in model.reactions:
    genes = [g.id for g in rxn.genes]
    gpr = rxn.gene_reaction_rule
    
    if 'and' in gpr:
        continue
        
    if len(genes) > 1.5:
        #### Subunit assignments
        su_asgn = {g:df_uni.subunit[g] for g in genes}
        su_asgn_yes = [v for v in su_asgn.values() if pd.isnull(v) == False]
        genes_su_asgn_null = [g for g in su_asgn.keys() if pd.isnull(su_asgn[g])]

        # Checking for discrepancy in assignment of subunit for isozyme
        note_su = ''
        if len(su_asgn_yes) > 0.5:
            if len(su_asgn) > len(su_asgn_yes):
                if len(su_asgn_yes) > 1.5:
                    if len(set(su_asgn_yes)) > 1.5:
                        print(rxn.id, 'there might be discrepancy in subunit assignment for isozyme, check')
                    else:
                        for k in genes_su_asgn_null:
                            df_uni.loc[k, 'subunit'] = su_asgn_yes[0]
                            notes = df_uni.notes[k]
                            if pd.isnull(notes):
                                df_uni.loc[k, 'notes'] = 'suAsgnByIsozyme'
                            else:
                                df_uni.loc[k, 'notes'] = notes + ' | suAsgnByIsozyme'
                else:
                    for k,v in su_asgn.items():
                        if pd.isnull(v):
                            df_uni.loc[k, 'subunit'] = su_asgn_yes[0]
                            notes = df_uni.notes[k]
                            if pd.isnull(notes):
                                df_uni.loc[k, 'notes'] = 'suAsgnByIsozyme'
                            else:
                                df_uni.loc[k, 'notes'] = notes + ' | suAsgnByIsozyme'
                            #print('Assign subunit for ' + k + ' from isozyme')


        #### Cofactor assignments
        cof_asgn = {g:df_uni.cofactor[g] for g in genes}
        cof_asgn_yes = [v for v in cof_asgn.values() if pd.isnull(v) == False]
        genes_cof_asgn_null = [g for g in cof_asgn.keys() if pd.isnull(cof_asgn[g])]

        # Checking for discrepancy in assignment of cofactor for isozyme
        note_cof = ''
        if len(cof_asgn_yes) > 0.5:
            if len(cof_asgn) > len(cof_asgn_yes):
                if len(cof_asgn_yes) > 1.5:
                    if len(set(cof_asgn_yes)) > 1.5:
                        print(rxn.id, 'there might be discrepancy in cofactor assignment for isozyme, check')
                    else:
                        for k in genes_cof_asgn_null:
                            df_uni.loc[k, 'cofactor'] = cof_asgn_yes[0]
                            notes = df_uni.notes[k]
                            if pd.isnull(notes):
                                df_uni.loc[k, 'notes'] = 'cofAsgnByIsozyme'
                            else:
                                df_uni.loc[k, 'notes'] = notes + ' | cofAsgnByIsozyme'
                else:
                    for k,v in cof_asgn.items():
                        if pd.isnull(v):
                            df_uni.loc[k, 'cofactor'] = cof_asgn_yes[0]
                            notes = df_uni.notes[k]
                            if pd.isnull(notes):
                                df_uni.loc[k, 'notes'] = 'cofAsgnByIsozyme'
                            else:
                                df_uni.loc[k, 'notes'] = notes + ' | cofAsgnByIsozyme'

ALCD22yi_c there might be discrepancy in subunit assignment for isozyme, check
NADK_m there might be discrepancy in subunit assignment for isozyme, check
NADN_n there might be discrepancy in subunit assignment for isozyme, check
NADHK1_m there might be discrepancy in subunit assignment for isozyme, check
SPMDtpa_e there might be discrepancy in subunit assignment for isozyme, check
PI3PP_c there might be discrepancy in subunit assignment for isozyme, check


In [4]:
#### Resolving conflict in assigning by isozyme
df_uni.loc['YPL188W', 'subunit'] = 'Homohexamer.'
df_uni.loc['YDR191W', 'subunit'] = 'Monomer.'
df_uni.loc['YBR132C', 'subunit'] = 'Monomer.'
df_uni.loc['YOR273C', 'subunit'] = 'Monomer.'
df_uni.loc['YPL274W', 'subunit'] = 'Monomer.'
df_uni.loc['YLL028W', 'subunit'] = 'Monomer.'
df_uni.loc['YHL016C', 'subunit'] = 'Monomer.'
df_uni.loc['YJR110W', 'subunit'] = 'Monomer.'
for g in ['YPL188W', 'YDR191W', 'YBR132C', 'YOR273C', 'YPL274W',
          'YLL028W', 'YHL016C', 'YJR110W']:
    notes = df_uni.notes[g]
    if pd.isnull(notes):
        df_uni.loc[g, 'notes'] = 'suAsgnManual'
    else:
        df_uni.loc[g, 'notes'] = notes + ' | suAsgnManual'

#### Write protein stoichiometry

In [13]:
df_cof = pd.read_csv('./input/cofactor_stats.txt', sep='\t')
df_cof.index = df_cof.uniprot_id.to_list()

In [27]:
df_genes = pd.read_excel('./input/gene_info.xlsx')
df_genes.index = df_genes.gene_id.to_list()

In [55]:
cols = ['id', 'gene_src', 'name', 'uniprot', 'subloc_assigned', 'cofactor_stoich',
        'cofactor_comments', 'sequence', 'status']
df_prot = pd.DataFrame(columns=cols)

for i in df_uni.index:
    seq = df_genes.sequence[i]
    name = df_uni.name[i]
    uni = df_uni.uniprot[i]
    
    # Extract cofactors info
    cofs_uni = df_uni.cofactor[i]
    if pd.isnull(cofs_uni) == False:
        cofs_uni = cofs_uni.split(' | ')
        cofs_ok = []
        cofs_comment = []
        for cof in cofs_uni:
            if cof in df_cof.index:
                if df_cof.status[cof] == 'OK':
                    cofs_ok.append(df_cof.model_id[cof])
            else:
                cofs_comment.append(cof)
                
    else:
        cofs_ok = []
        cofs_comment = []
        
    # Extract and record subloc and cofactor
    sublocs = df_uni.subloc_assigned[i].split(',')
    
    # Create copies of protein due to presence in multiple compartments
    if len(sublocs) > 1.5:
        for subloc in sublocs:
            df_prot.loc[i+'_'+subloc, 'id'] = i+'_'+subloc
            df_prot.loc[i+'_'+subloc, 'subloc_assigned'] = subloc
            df_prot.loc[i+'_'+subloc, ['gene_src', 'name', 'uniprot', 'sequence']] = \
                (i, name, uni, seq)
            
            x = [cof+':' for cof in cofs_ok]
            df_prot.loc[i+'_'+subloc, 'cofactor_comments'] = ' | '.join(cofs_comment)
            if len(x) == 1 and cofs_comment == []:
                df_prot.loc[i+'_'+subloc, 'status'] = 'cofAsgnAuto'
                df_prot.loc[i+'_'+subloc, 'cofactor_stoich'] = x[0] + '1'
            else:
                df_prot.loc[i+'_'+subloc, 'cofactor_stoich'] = ','.join(x)
    
    # Proceed for protein presence in only a single compartment
    else:
        df_prot.loc[i, 'id'] = i
        df_prot.loc[i, 'subloc_assigned'] = sublocs[0]
        df_prot.loc[i, ['gene_src', 'name', 'uniprot', 'sequence']] = \
            (i, name, uni, seq)
        
        x = [cof+':' for cof in cofs_ok]
        df_prot.loc[i, 'cofactor_comments'] = ' | '.join(cofs_comment)
        if len(x) == 1 and cofs_comment == []:
            df_prot.loc[i, 'status'] = 'cofAsgnAuto'
            df_prot.loc[i, 'cofactor_stoich'] = x[0] + '1'
        else:
            if i == 'Q0080': print(i,x)
            df_prot.loc[i, 'cofactor_stoich'] = ','.join(x)

Q0080 []


In [56]:
df_prot.to_excel('./output/prot_stoich_raw.xlsx', index=False)

#### Write enzyme subunit stoichiometry

In [115]:
df_rxns = pd.read_excel('./output/rxns_enz.xlsx')
df_rxns.index = df_rxns.id.to_list()

In [116]:
stoich_asgn = {'Homodimer': 2, 'Homohexamer': 6, 'Homooctamer': 8,
               'Homopentamer': 5, 'Homotetramer': 4, 'Homotrimer': 3,
               'Monomer': 1}

idx = df_rxns.id.to_list()
cols = df_rxns.columns.to_list() + ['protein_stoich', 'subunit_comments', 'status']
df_enz = pd.DataFrame(index=idx, columns=cols)

df_enz['id'] = df_rxns.id
df_enz['rxn_src'] = df_rxns.rxn_src
df_enz['enz'] = df_rxns.enz
df_enz['gpr'] = df_rxns.gpr

for i in df_enz.index:
    
    # Remove compartment marker to get gene source ("prot")
    enz = df_enz.enz[i]
    if '_' in enz:
        comp = enz.split('_')[-1]
        prot = enz[:-len(comp)-1]
    else:
        prot = enz
        
    # Assign subunit for DUMMYENZ and SPONT
    if prot == 'DUMMYENZ':
        df_enz.loc[i, 'protein_stoich'] = 'DUMMYENZ:1'
        df_enz.loc[i, 'status'] = 'protStoichAsgnAuto'
        continue
    elif prot == 'SPONT':
        df_enz.loc[i, 'protein_stoich'] = 'zeroCost'
        df_enz.loc[i, 'status'] = 'protStoichAsgnAuto'
        continue
        
    # Parsing, and if possible, assign subunit for enzymes with subunit markers
    gpr = df_enz.gpr[i]
    if 'and' in gpr:
        gpr = gpr.replace('(', '')
        gpr = gpr.replace(')', '')
        genes = gpr.split(' and ')
        comments = []
        for g in genes:
            if '_' in g:
                comp = g.split('_')[-1]
                g_src = g[:-len(comp)-1]
            else:
                g_src = g
            comments.append(df_uni.subunit[g_src])
        
        if set(comments) == set([np.nan]):
            genes_stoich = [g+':1' for g in genes]
            df_enz.loc[i, 'protein_stoich'] = ','.join(genes_stoich)
            df_enz.loc[i, 'status'] = 'protStoichAsgnAuto'
            continue
        else:
            genes_stoich = [g+':' for g in genes]
            df_enz.loc[i, 'protein_stoich'] = ','.join(genes_stoich)
            comments_ok = [c for c in comments if pd.isnull(c) == False]
            df_enz.loc[i, 'subunit_comments'] = ' | '.join(comments_ok)
            continue
    
    # If gene source ("prot") is not found from extracted info from Uniprot, ignore
    if prot not in df_uni.index:
        continue
    
    # Parse enzyme with single protein associated, might contain some comments for subunits 
    # requiring manual curation
    su = df_uni.subunit[prot]
    if pd.isnull(su):
        df_enz.loc[i, 'protein_stoich'] = prot + ':1'
        df_enz.loc[i, 'status'] = 'protStoichAsgnAuto'
    else:
        su_i0 = su.split('.')[0]
        if ' ' not in su_i0:
            if su_i0 in stoich_asgn.keys():
                df_enz.loc[i, 'protein_stoich'] = prot + ':' + str(stoich_asgn[su_i0])
                df_enz.loc[i, 'status'] = 'protStoichAsgnAuto'
            else:
                df_enz.loc[i, 'protein_stoich'] = prot + ':'
                df_enz.loc[i, 'subunit_comments'] = df_uni.subunit[prot]
                
        else:
            df_enz.loc[i, 'protein_stoich'] = prot + ':'
            df_enz.loc[i, 'subunit_comments'] = df_uni.subunit[prot]

In [117]:
df_enz.to_excel('./output/enz_stoich_raw.xlsx', index=False)

#### Test codes

In [290]:
# Test sink
model = cobra.io.load_json_model('./input/model/y834_hvd_v3_rba.json')
model.reactions.EX_glc__D_e.lower_bound = -10

status, fba = test_metabolite_sink(model, 'sdh5req_c')
print(status, fba.objective_value)
#make_escher_csv(fba.fluxes, './test.csv')

True 1000.0


In [299]:
model.reactions.COX23REQ_c

0,1
Reaction identifier,COX23REQ_c
Name,COX23 requirement for cytochrome c oxidase assembly (for enzyme requirement purpose in RBA)
Memory address,0x07fe4d36854d0
Stoichiometry,--> cox23req_c  --> COX23 requirement for cytochrome c oxidase assembly (for enzyme requirement purpose in RBA)
GPR,YHR116W
Lower bound,0.0
Upper bound,1000.0


In [26]:
y8 = cobra.io.load_json_model('../../SCProjects/SCModels/yeast8/model/y834_hvd_v3.json')

In [24]:
model.metabolites.ni2

0,1
Metabolite identifier,fe2_c
Name,iron(2+) [cytoplasm]
Memory address,0x07fc5c188c890
Formula,Fe
Compartment,c
In 4 reaction(s),"FE2t_c_m, BIOMASS_SC_hvd, SHCHF_c, FE2t_c_e"


In [7]:
cofactors = []
for i in df_uni.index:
    entries = df_uni.cofactor[i]
    if pd.isnull(entries) == False:
        entries = entries.split(' | ')
        for entry in entries:
            if '/' in entry:
                cofactors.append(entry)

In [9]:
for i in set(cofactors):
    print(i)

Mg(2+)/CHEBI:18420
[4Fe-4S] cluster/CHEBI:49883
siroheme/CHEBI:60052
thiamine diphosphate/CHEBI:58937
biotin/CHEBI:57586
a divalent metal cation/CHEBI:60240
[3Fe-4S] cluster/CHEBI:21137
Zn(2+)/CHEBI:29105
heme/CHEBI:30413
[2Fe-2S] cluster/CHEBI:49601
a monovalent cation/CHEBI:60242
Ca(2+)/CHEBI:29108
dipyrromethane/CHEBI:60342
Cu cation/CHEBI:23378
Fe(2+)/CHEBI:29033
pantetheine 4'-phosphate/CHEBI:47942
(R)-lipoate/CHEBI:83088
Fe cation/CHEBI:24875
FMNH2/CHEBI:57618
Ni(2+)/CHEBI:49786
FMN/CHEBI:58210
prenyl-FMN/CHEBI:87746
pyridoxal 5'-phosphate/CHEBI:597326
pyruvate/CHEBI:15361
FAD/CHEBI:57692
Co(2+)/CHEBI:48828
K(+)/CHEBI:29103
AMP/CHEBI:456215
NAD(+)/CHEBI:57540
heme b/CHEBI:60344
Mn(2+)/CHEBI:29035
Binds 2 divalent metal cations per subunit. Site 1 may preferentially bind zinc ions, while site 2 has a preference for magnesium and/or manganese ions.
