In [1]:
import cobra
import pandas as pd

Load the metabolic model and the UniProt localization tables

In [2]:
# Read the COBRA model from its JSON file.
model_json_path = './input/model/y834_hvd_v3_rba.json'
model = cobra.io.load_json_model(model_json_path)

In [3]:
# Display the loaded model's summary as a sanity check on the load step.
model

0,1
Name,yeastGEM_hvd
Memory address,0x07fd6a1e66f10
Number of metabolites,1597
Number of reactions,1893
Number of groups,0
Objective expression,1.0*BIOMASS_SC_hvd - 1.0*BIOMASS_SC_hvd_reverse_d556b
Compartments,"cell envelope, cytoplasm, extracellular, mitochondrion, nucleus, peroxisome, endoplasmic reticulum, Golgi, lipid particle, vacuole, endoplasmic reticulum membrane, vacuolar membrane, Golgi membrane, mitochondrial membrane,"


In [3]:
# Primary UniProt annotation table, re-indexed by the values of its `id` column.
df_uniprot = pd.read_excel('./enz_info_uniprot.xlsx')
df_uniprot.index = df_uniprot['id'].to_list()

# Supplementary annotation table, indexed the same way.
# NOTE(review): the file name suggests these are manually extracted entries - confirm.
df_add = pd.read_excel('./enz_info_uniprot_manExtract.xlsx')
df_add.index = df_add['id'].to_list()

# Back-fill missing `subloc` values from the supplementary table; ids that the
# supplementary table does not contain are left as null.
idx_null = df_uniprot.loc[df_uniprot['subloc'].isnull()].index
for gene_id in idx_null:
    if gene_id not in df_add.index:
        continue
    df_uniprot.loc[gene_id, 'subloc'] = df_add.loc[gene_id, 'subloc']

In [4]:
# Output table: one row per entry of df_uniprot, all columns initially empty.
cols = [
    'id',
    'name',
    'uniprot',
    'uniprot_name',
    'uniprot_subloc',
    'gem_subloc',
    'uniprot_membrane',
    'compiled_subloc',
]
idx = df_uniprot.index.to_list()
df_subloc = pd.DataFrame(index=idx, columns=cols)

# `id` mirrors the index; the descriptive columns are copied from df_uniprot
# (assignment aligns on the shared index).
df_subloc['id'] = df_subloc.index
for copied_col in ('name', 'uniprot', 'uniprot_name'):
    df_subloc[copied_col] = df_uniprot[copied_col]

Grouping subloc entries

In [5]:
# Collect every distinct UniProt location label used in df_uniprot.
# A single entry may list several labels separated by ' | '; entries with no
# annotation are skipped via dropna() instead of a per-row null comparison.
sublocs = set()
for subloc_entry in df_uniprot['subloc'].dropna():
    sublocs.update(subloc_entry.split(' | '))

In [6]:
# Compartment code -> list of UniProt location labels that are collapsed into it.
# NOTE(review): the codes presumably correspond to the model's compartment
# suffixes - confirm against the model.
subloc_groups = {
    'c': [
        'Bud membrane', 'Bud neck', 'Cytoplasm',
        'Cytoplasmic granule membrane', 'Cytoplasmic vesicle',
        'Cytoplasmic vesicle membrane', 'Endosome', 'Endosome membrane',
        'Endomembrane system', 'Late endosome membrane',
        'Prevacuolar compartment membrane', 'Prospore', 'Prospore membrane',
        'NotAvail',
    ],
    'en': ['Cell membrane', 'Plasma membrane'],
    'rm': ['Endoplasmic reticulum membrane', 'Microsome membrane'],
    'r': ['Endoplasmic reticulum', 'Microsome'],
    'e': ['Extracellular', 'Secreted'],
    'g': ['Golgi apparatus'],
    'gm': ['Golgi apparatus membrane'],
    'l': ['Lipid droplet'],
    'm': ['Mitochondrion', 'Mitochondrion matrix'],
    'mm': [
        'Mitochondrion inner membrane',
        'Mitochondrion intermembrane space',
        'Mitochondrion membrane',
        'Mitochondrion outer membrane',
    ],
    'n': [
        'Nucleus', 'Nucleus envelope', 'Nucleus inner membrane',
        'Nucleus membrane', 'Chromosome',
    ],
    'x': ['Peroxisome', 'Peroxisome matrix', 'Peroxisome membrane'],
    'v': ['Vacuole'],
    'vm': ['Vacuole membrane'],
    'unknown_membrane': ['Membrane', 'Periplasm'],
}

# Inverted lookup: UniProt location label -> compartment code.
subloc_conv = {
    label: code
    for code, labels in subloc_groups.items()
    for label in labels
}

In [7]:
# For every gene, record two comma-joined compartment lists:
#   gem_subloc     - compartments taken from the gene's reactions in the model
#   uniprot_subloc - compartments mapped from its UniProt location labels
for gene_id in df_subloc.index:
    gene = model.genes.get_by_id(gene_id)
    # The compartment code is read as the text after the last '_' of each reaction id.
    gem_comps = {rxn.id.split('_')[-1] for rxn in gene.reactions}
    # Sort before joining: iteration order of a set of strings changes between
    # interpreter runs, which would make the saved table non-reproducible.
    df_subloc.loc[gene_id, 'gem_subloc'] = ','.join(sorted(gem_comps))

    unisubloc = df_uniprot.loc[gene_id, 'subloc']
    if pd.isnull(unisubloc):
        # No UniProt annotation at all: fall back to 'c'.
        df_subloc.loc[gene_id, 'uniprot_subloc'] = 'c'
        continue

    labels = unisubloc.split(' | ')
    unmapped = [label for label in labels if label not in subloc_conv]
    if unmapped:
        # Still a KeyError, but one that says which gene and labels to fix.
        raise KeyError(
            '{}: UniProt location label(s) {} not present in subloc_groups'.format(
                gene_id, unmapped
            )
        )
    uni_comps = {subloc_conv[label] for label in labels}
    df_subloc.loc[gene_id, 'uniprot_subloc'] = ','.join(sorted(uni_comps))

Compile subloc entries

In [8]:
# Compartment codes that denote a membrane (including the unspecified one).
MEMBRANE_COMPS = {'mm', 'rm', 'en', 'gm', 'vm', 'unknown_membrane'}

# (non-membrane code, membrane code) pairs that are reconciled with each other.
MEMBRANE_PAIRS = [('m', 'mm'), ('r', 'rm'), ('g', 'gm'), ('v', 'vm'), ('e', 'en')]


def _split_comps(joined):
    """Turn a comma-joined compartment string into a set of codes."""
    return set(joined.split(','))


def _resolve_membrane(uni_comps, gem_comps):
    """Return the membrane code on which UniProt and the model agree, else None.

    Parameters
    ----------
    uni_comps : set of str
        Compartment codes derived from UniProt for one gene.
    gem_comps : set of str
        Compartment codes derived from the model for the same gene.

    Returns
    -------
    str or None
        The membrane code of the first pair in MEMBRANE_PAIRS for which the
        model codes are a non-empty subset of the pair and UniProt either
        (a) names that membrane and nothing outside the pair, or
        (b) only says 'unknown_membrane'. None when no pair qualifies.
    """
    uni_comps = set(uni_comps)
    gem_comps = set(gem_comps)

    # A generic 'unknown_membrane' adds nothing once a specific membrane is
    # also annotated, so drop it before comparing.
    if 'unknown_membrane' in uni_comps and len(uni_comps & MEMBRANE_COMPS) > 1:
        uni_comps.discard('unknown_membrane')

    uni_is_generic = uni_comps == {'unknown_membrane'}
    for plain, membrane in MEMBRANE_PAIRS:
        pair = {plain, membrane}
        gem_in_pair = bool(gem_comps) and gem_comps <= pair
        uni_in_pair = membrane in uni_comps and uni_comps <= pair
        if gem_in_pair and (uni_in_pair or uni_is_generic):
            return membrane
    return None


for gene_id in df_subloc.index:
    uni_comps = _split_comps(df_subloc.loc[gene_id, 'uniprot_subloc'])
    gem_comps = _split_comps(df_subloc.loc[gene_id, 'gem_subloc'])

    # Confirmed identical subloc assignments. Sorted so the stored string is
    # reproducible (set iteration order is not).
    if uni_comps == gem_comps:
        df_subloc.loc[gene_id, 'compiled_subloc'] = ','.join(sorted(uni_comps))

    # Confirmed membrane-type subloc assignments from uniprot.
    is_membrane = bool(uni_comps & MEMBRANE_COMPS)
    df_subloc.loc[gene_id, 'uniprot_membrane'] = is_membrane

    # Resolve membrane-type subloc assignment conflicts; this overrides the
    # identical-assignment value above when a membrane code can be settled.
    # Rows that cannot be resolved keep whatever compiled_subloc they had.
    if is_membrane:
        resolved = _resolve_membrane(uni_comps, gem_comps)
        if resolved is not None:
            df_subloc.loc[gene_id, 'compiled_subloc'] = resolved

In [9]:
# Save the compiled table; the index is not written because `id` is already a column.
compiled_xlsx_path = './enz_subloc_compiled.xlsx'
df_subloc.to_excel(compiled_xlsx_path, index=False)