In [1]:
import cobra
import pandas as pd
import io, re
from bioservices.uniprot import UniProt
import bioservices.xmltools

import sys
sys.path.append('/home/hvdinh16/Workspace/workpy3/common/')
from custom_functions import *

import lxml
from lxml import etree
from lxml.etree import ElementTree

In [2]:
# Load the genome-scale model from its JSON file (path is relative to the notebook).
model_json_path = './gsmodel/iRhtoC_r2_2.json'
model = cobra.io.load_json_model(model_json_path)

Academic license - for non-commercial use only


In [3]:
# List the gene ids that do not begin with the 'rt' prefix.
[gene.id for gene in model.genes if not gene.id.startswith('rt')]

['SPONT', 'TRUE', 'UNKNOWN']

In [4]:
# Build one row per model gene, pre-filled with the gene's name and sc_id
# annotation; the uniprot_* and check columns are left empty here.

# Ids that do not follow the 'rt…' naming (see the listing in the cell above).
# The filter must compare g.id: the Gene objects themselves are not strings,
# so testing `g` against these literals would not exclude them.
placeholder_ids = ['UNKNOWN', 'SPONT', 'TRUE']
idx = [g.id for g in model.genes if g.id not in placeholder_ids]
cols = ['id', 'name', 'sc_id', 'uniprot', 'uniprot_name', 'uniprot_subunit', 'uniprot_subloc', 'check']
df_info = pd.DataFrame(index=idx, columns=cols)
df_info['id'] = idx

for gid in df_info.index:
    g = model.genes.get_by_id(gid)

    # Names containing 'rt' are not copied over.
    if 'rt' not in g.name:
        df_info.loc[gid, 'name'] = g.name

    if 'sc_id' in g.annotation:
        df_info.loc[gid, 'sc_id'] = g.annotation['sc_id']
        
# Survey which child tags appear under the UniProt <comment> elements of type
# 'cofactor', 'subcellular location' and 'subunit', across every gene in
# df_info that carries an sc_id. Needs network access to the UniProt service.
uniprot = UniProt(verbose=True)

cofactor_attrib_list = []
subloc_attrib_list = []
subunit_attrib_list = []

# Comment type -> list collecting the tags of that comment's direct children.
tags_by_comment_type = {
    'cofactor': cofactor_attrib_list,
    'subcellular location': subloc_attrib_list,
    'subunit': subunit_attrib_list,
}

for i in df_info.index:
    if pd.isnull(df_info.sc_id[i]):
        continue

    # Only the first id is queried when sc_id holds several comma-separated ids.
    sc_id = df_info.sc_id[i].split(',')[0]
    qstr = 'gene:' + sc_id + ' AND taxonomy:4932 AND reviewed:yes'

    search = uniprot.search(qstr)
    if search == '':
        continue
    df_search = pd.read_table(io.StringIO(search))
    if df_search.empty:
        # A reply with a header row but no hits leaves nothing to retrieve.
        continue

    # Take the first hit of the search.
    uniprot_id = df_search.Entry.iloc[0]
    retrieve = uniprot.retrieve(uniprot_id, frmt='xml')

    # Strip XML namespaces so elements can be looked up by their bare tag name.
    lxmlET = etree.fromstring(retrieve.data.encode())
    for elem in lxmlET.iter():
        elem.tag = etree.QName(elem.tag).localname
    etree.cleanup_namespaces(lxmlET)
    uniprot_entry = lxmlET.find('entry')
    if uniprot_entry is None:
        continue

    # The loop variable stays named `elem` because later cells inspect the
    # last comment element through that name.
    for elem in uniprot_entry.iter(tag='comment'):
        collected_tags = tags_by_comment_type.get(elem.get('type'))
        if collected_tags is not None:
            collected_tags.extend(child.tag for child in elem)

INFO    [bioservices:UniProt]:  Initialising UniProt service (REST)


In [5]:
# Collapse each collected tag list into the set of distinct tags
# (the names are rebound, so they hold sets from here on).
cofactor_attrib_list, subloc_attrib_list, subunit_attrib_list = (
    set(tags)
    for tags in (cofactor_attrib_list, subloc_attrib_list, subunit_attrib_list)
)

In [6]:
# Print each collection's name followed by its contents.
for label, tags in (
    ('cofactor_attrib_list', cofactor_attrib_list),
    ('subloc_attrib_list', subloc_attrib_list),
    ('subunit_attrib_list', subunit_attrib_list),
):
    print(label)
    print(tags)

cofactor_attrib_list
{'cofactor', 'text'}
subloc_attrib_list
{'subcellularLocation', 'text', 'molecule'}
subunit_attrib_list
{'text'}


In [15]:
# NOTE(review): `elem` is whatever the survey loop above left bound last, so
# this output depends on which entry was processed last.
# Direct children of `elem` tagged 'subcellularLocation'.
list(elem.iterfind('subcellularLocation'))

[<Element subcellularLocation at 0x7f0334640e60>,
 <Element subcellularLocation at 0x7f03341597d0>]

In [12]:
# NOTE(review): `elem` is whatever the survey loop above left bound last.
# Tag of the first child; direct indexing replaces the deprecated getchildren().
elem[0].tag

'subcellularLocation'