In [1]:
import cobra
import pandas as pd
import io, re
from bioservices.uniprot import UniProt
import bioservices.xmltools

import sys
sys.path.append('/home/hvdinh16/Workspace/workpy3/common/')
from custom_functions import *

import lxml
from lxml import etree
from lxml.etree import ElementTree

In [2]:
model = cobra.io.load_json_model('./gsmodel/iRhtoC_r2_2.json')

Academic license - for non-commercial use only


In [3]:
idx = [g.id for g in model.genes if g not in ['UNKNOWN', 'SPONT', 'TRUE']]
cols = ['id', 'name', 'sc_id', 'uniprot', 'uniprot_name', 'subunit', 'subloc', 'cofactor']
df_info = pd.DataFrame(index=idx, columns=cols)
df_info['id'] = idx

for gid in df_info.index:
    g = model.genes.get_by_id(gid)
    if 'rt' not in g.name:
        df_info.loc[g.id, 'name'] = g.name
    
    if 'sc_id' in g.annotation.keys():
        df_info.loc[g.id, 'sc_id'] = g.annotation['sc_id']
        
uniprot = UniProt(verbose=True)
for i in df_info.index:
    subunit_list, subloc_list, cofactor_list = [], [], []
    
    if pd.isnull(df_info.sc_id[i]):
        continue
        
    if ',' not in df_info.sc_id[i]:
        sc_id = df_info.sc_id[i]
    else:
        sc_id = df_info.sc_id[i].split(',')[0]
    qstr = 'gene:' + df_info.sc_id[i] + ' AND taxonomy:4932 AND reviewed:yes'
    
    # Extract info from uniprot via REST API
    search = uniprot.search(qstr)
    if search == '':
        continue
    df_search = pd.read_table(io.StringIO(search))
    
    uniprot_id = df_search.Entry[0]
    retrieve = uniprot.retrieve(uniprot_id, frmt='xml')
    
    lxmlET = etree.fromstring(retrieve.data.encode())
    for elem in lxmlET.getiterator():
        elem.tag = etree.QName(elem.tag).localname
    etree.cleanup_namespaces(lxmlET)
    uniprot_entry = lxmlET.find('entry')
    
    for elem in uniprot_entry.iter(tag='comment'):
        # Extract cofactor info
        if elem.attrib['type'] == 'cofactor':
            for cofactor_entry in elem.getchildren():
                
                if cofactor_entry.tag == 'cofactor':
                    name = cofactor_entry.find('name')
                    if name != None:
                        name = name.text

                    dbref = cofactor_entry.find('dbReference')
                    if dbref != None:
                        dbref = dbref.attrib['id']

                    cofactor_list.append(str(name) + '/' + str(dbref))
                    
                elif cofactor_entry.tag == 'text':
                    cofactor_list.append(cofactor_entry.text)

        # Extract subcellular location
        elif elem.attrib['type'] == 'subcellular location':
            for subloc_entry in elem.getchildren():
                loc = subloc_entry.find('location')
                if loc != None:
                    subloc_list.append(loc.text)

                mol = subloc_entry.find('molecule')
                if mol != None:
                    subloc_list.append(mol)
                    
                t = subloc_entry.find('text')
                if t != None:
                    subloc_list.append(t.text)

        # Extract subunit information
        elif elem.attrib['type'] == 'subunit':
            for subunit_entry in elem.getchildren():
                su = subunit_entry.text
                if su != None:
                    subunit_list.append(su)
                
    # Record
    df_info.loc[i, 'uniprot'] = uniprot_id
    df_info.loc[i, 'uniprot_name'] = uniprot_entry.find('name').text
    df_info.loc[i, 'subunit'] = ' | '.join(subunit_list)
    df_info.loc[i, 'subloc'] = ' | '.join(subloc_list)
    df_info.loc[i, 'cofactor'] = ' | '.join(cofactor_list)

INFO    [bioservices:UniProt]:  Initialising UniProt service (REST)


In [4]:
df_info.to_excel('./enzyme_information.xlsx', sheet_name='uniprot_extract', index=False)

In [4]:
subloc_list

['Cytoplasm', 'Mitochondrion intermembrane space', None]

In [7]:
i

'rt7128'

In [8]:
gid

'rtmCOX1'

In [9]:
df_info.loc[i]

id                  rt7128
name                  ADK1
sc_id              YDR226W
uniprot             P07170
uniprot_name    KAD2_YEAST
subunit           Monomer.
subloc                 NaN
cofactor               NaN
Name: rt7128, dtype: object

In [13]:
for i in uniprot_entry.findall('comment'):
    print(i.attrib)

{'type': 'function'}
{'type': 'catalytic activity'}
{'type': 'subunit'}
{'type': 'subcellular location'}
{'type': 'domain'}
{'type': 'disruption phenotype'}
{'type': 'miscellaneous'}
{'type': 'similarity'}


In [16]:
x = uniprot_entry.findall('comment')[2]

In [21]:
y = x.getchildren()[0]

In [23]:
y.text

'Monomer.'