In [1]:
import cobra
import pandas as pd
import io, re
from bioservices.uniprot import UniProt
import bioservices.xmltools

import sys
sys.path.append('/home/hvdinh16/Workspace/workpy3/common/')
from custom_functions import *

import lxml
from lxml import etree
from lxml.etree import ElementTree

In [2]:
# Taxonomy ID of S. cerevisiae S288C (used in the UniProt 'taxonomy:' query term)
taxonId = '559292'

In [3]:
# Load the translation-elongation machinery sheet, keep only the rows whose
# 'paralog' cell is empty, discard rows whose id contains 'rrna', and finally
# index the table by id.
df_app = (
    pd.read_excel('./input/translation_elongation_machinery.xlsx')
    .loc[lambda tbl: tbl['paralog'].isnull()]
    # `== False` (rather than `~`) so a missing id is treated as "not kept"
    .loc[lambda tbl: tbl['id'].str.contains('rrna') == False]
)
df_app.index = df_app['id'].to_list()

In [12]:
# Build df_info: one row per id in df_app, filled with the UniProt accession,
# entry name, subunit, subcellular-location and cofactor annotations that are
# parsed from the UniProt XML record of the matching reviewed entry.
idx = df_app.id.to_list()
cols = ['id', 'name', 'sc_id', 'uniprot', 'uniprot_name', 'subunit', 'subloc', 'cofactor']
df_info = pd.DataFrame(index=idx, columns=cols)
df_info['id'] = idx

# The identifier used for the UniProt gene lookup is the id itself.
df_info['sc_id'] = df_info['id']

uniprot = UniProt(verbose=True)
for i in df_info.index:
    subunit_list, subloc_list, cofactor_list = [], [], []

    raw_sc_id = df_info.loc[i, 'sc_id']
    if pd.isnull(raw_sc_id):
        continue

    # If several comma-separated identifiers are given, look up the first one.
    # (Previously this value was computed but the raw, unsplit field was used
    # in both the query and the alias check.)
    sc_id = raw_sc_id.split(',')[0].strip()
    qstr = 'gene:' + sc_id + ' AND taxonomy:' + taxonId + ' AND reviewed:yes'

    # Extract info from uniprot via REST API
    search = uniprot.search(qstr)
    # NOTE(review): search() presumably may return a non-string on a failed
    # request; anything that is not non-empty text is skipped -- confirm
    # against the installed bioservices version.
    if not isinstance(search, str) or search.strip() == '':
        continue
    df_search = pd.read_table(io.StringIO(search))

    # Reset per gene so that an unmatched gene can never inherit the accession
    # found for the previous gene. If several rows match, the last one wins.
    uniprot_id = None
    for j in df_search.index:
        gene_names = df_search.loc[j, 'Gene names']
        # An empty cell is read back as NaN (a float), which has no .split
        if isinstance(gene_names, str) and sc_id in gene_names.split(' '):
            uniprot_id = df_search.Entry[j]
    if uniprot_id is None:
        continue

    # NOTE(review): assumes retrieve() returns an object exposing the XML text
    # as `.data` -- confirm for the installed bioservices version.
    retrieve = uniprot.retrieve(uniprot_id, frmt='xml')

    # Strip XML namespaces so elements can be addressed by their local name.
    lxmlET = etree.fromstring(retrieve.data.encode())
    for elem in lxmlET.iter():
        # Comments / processing instructions have a non-string tag; skip them
        if isinstance(elem.tag, str):
            elem.tag = etree.QName(elem.tag).localname
    etree.cleanup_namespaces(lxmlET)
    uniprot_entry = lxmlET.find('entry')
    if uniprot_entry is None:
        continue

    for elem in uniprot_entry.iter(tag='comment'):
        comment_type = elem.get('type')

        # Extract cofactor info
        if comment_type == 'cofactor':
            for cofactor_entry in elem:

                if cofactor_entry.tag == 'cofactor':
                    # `is not None` is required here: an lxml element without
                    # children is falsy, so a plain truth test would be wrong.
                    name = cofactor_entry.find('name')
                    if name is not None:
                        name = name.text

                    dbref = cofactor_entry.find('dbReference')
                    if dbref is not None:
                        dbref = dbref.attrib['id']

                    cofactor_list.append(str(name) + '/' + str(dbref))

                elif cofactor_entry.tag == 'text':
                    if cofactor_entry.text is not None:
                        cofactor_list.append(cofactor_entry.text)

        # Extract subcellular location
        elif comment_type == 'subcellular location':
            for subloc_entry in elem:
                # Collect the text of each child, in this fixed order. The
                # 'molecule' element itself used to be appended instead of its
                # text, which made the ' | '.join below fail.
                for child_tag in ('location', 'molecule', 'text'):
                    child = subloc_entry.find(child_tag)
                    if child is not None and child.text is not None:
                        subloc_list.append(child.text)

        # Extract subunit information
        elif comment_type == 'subunit':
            for subunit_entry in elem:
                su = subunit_entry.text
                if su is not None:
                    subunit_list.append(su)

    # Record
    df_info.loc[i, 'uniprot'] = uniprot_id
    df_info.loc[i, 'uniprot_name'] = uniprot_entry.find('name').text
    df_info.loc[i, 'subunit'] = ' | '.join(subunit_list)
    df_info.loc[i, 'subloc'] = ' | '.join(subloc_list)
    df_info.loc[i, 'cofactor'] = ' | '.join(cofactor_list)

In [16]:
# Fill the 'name' column from df_app using explicit column indexing.
df_info['name'] = df_app['name']

In [21]:
# 'sc_id' was only needed for the lookup; leave it out of the final table.
df_info = df_info.drop(columns=['sc_id'])

In [23]:
# Write the annotation table to the output workbook, without the index column.
df_info.to_excel(
    './output/enz_info_uniprot_appended.xlsx',
    sheet_name='uniprot_extract',
    index=False,
)