In [1]:
import cobra
import pandas as pd
import io, re
from bioservices.uniprot import UniProt
import bioservices.xmltools

import sys
sys.path.append('/home/hvdinh16/Workspace/workpy3/common/')
from custom_functions import *

import lxml
from lxml import etree
from lxml.etree import ElementTree

In [2]:
model = cobra.io.load_json_model('./input/model/y834_hvd_v4_rba.json')

In [3]:
taxonId = '559292' # S. cerevisiae S288C

In [4]:
idx = [g.id for g in model.genes if g not in ['UNKNOWN', 'SPONT', 'TRUE']]
cols = ['id', 'name', 'sc_id', 'uniprot', 'uniprot_name', 'subunit', 'subloc', 'cofactor']
df_info = pd.DataFrame(index=idx, columns=cols)
df_info['id'] = idx

df_info['sc_id'] = df_info['id']
        
uniprot = UniProt(verbose=True)
for i in df_info.index:
    subunit_list, subloc_list, cofactor_list = [], [], []
    
    if pd.isnull(df_info.sc_id[i]):
        continue
        
    if ',' not in df_info.sc_id[i]:
        sc_id = df_info.sc_id[i]
    else:
        sc_id = df_info.sc_id[i].split(',')[0]
    qstr = 'gene:' + df_info.sc_id[i] + ' AND taxonomy:' + taxonId + ' AND reviewed:yes'
    
    # Extract info from uniprot via REST API
    search = uniprot.search(qstr)
    if search == '':
        continue
    df_search = pd.read_table(io.StringIO(search))
    
    for j in df_search.index:
        aliases = df_search.loc[j, 'Gene names'].split(' ')
        if df_info.sc_id[i] in aliases:
            uniprot_id = df_search.Entry[j]
    
    retrieve = uniprot.retrieve(uniprot_id, frmt='xml')
    
    lxmlET = etree.fromstring(retrieve.data.encode())
    for elem in lxmlET.getiterator():
        elem.tag = etree.QName(elem.tag).localname
    etree.cleanup_namespaces(lxmlET)
    uniprot_entry = lxmlET.find('entry')
    
    for elem in uniprot_entry.iter(tag='comment'):
        # Extract cofactor info
        if elem.attrib['type'] == 'cofactor':
            for cofactor_entry in elem.getchildren():
                
                if cofactor_entry.tag == 'cofactor':
                    name = cofactor_entry.find('name')
                    if name != None:
                        name = name.text

                    dbref = cofactor_entry.find('dbReference')
                    if dbref != None:
                        dbref = dbref.attrib['id']

                    cofactor_list.append(str(name) + '/' + str(dbref))
                    
                elif cofactor_entry.tag == 'text':
                    cofactor_list.append(cofactor_entry.text)

        # Extract subcellular location
        elif elem.attrib['type'] == 'subcellular location':
            for subloc_entry in elem.getchildren():
                loc = subloc_entry.find('location')
                if loc != None:
                    subloc_list.append(loc.text)

                mol = subloc_entry.find('molecule')
                if mol != None:
                    subloc_list.append(mol)
                    
                t = subloc_entry.find('text')
                if t != None:
                    subloc_list.append(t.text)

        # Extract subunit information
        elif elem.attrib['type'] == 'subunit':
            for subunit_entry in elem.getchildren():
                su = subunit_entry.text
                if su != None:
                    subunit_list.append(su)
                
    # Record
    df_info.loc[i, 'uniprot'] = uniprot_id
    df_info.loc[i, 'uniprot_name'] = uniprot_entry.find('name').text
    df_info.loc[i, 'subunit'] = ' | '.join(subunit_list)
    df_info.loc[i, 'subloc'] = ' | '.join(subloc_list)
    df_info.loc[i, 'cofactor'] = ' | '.join(cofactor_list)

In [5]:
# Add name
for g in model.genes:
    if g.id in df_info.index:
        df_info.loc[g.id, 'name'] = g.name

In [6]:
# Complement sublocalization by reaction compartment assignment
for i in df_info.index:
    if pd.isnull(df_info.subloc[i]) or df_info.subloc[i] == '':
        g = model.genes.get_by_id(i)

        comps = []
        for rxn in g.reactions:
            comps.append(rxn.id.split('_')[-1])
        comps = set(comps)
        
        if comps == {'c'}:
            df_info.loc[i, 'subloc'] = 'Cytoplasm'
            df_info.loc[i, 'notes'] = 'subloc_addedByRxnComp'
        #print(i, comps)

In [9]:
df_info.to_excel('./enz_info_uniprot.xlsx', sheet_name='uniprot_extract', index=False)

Single entry search and check

In [23]:
i = 'YAL038W'
qstr = 'gene:' + df_info.sc_id[i] + ' AND taxonomy:' + taxonId + ' AND reviewed:yes'

# Extract info from uniprot via REST API
search = uniprot.search(qstr)
if search != '':
    df_search = pd.read_table(io.StringIO(search))

In [49]:
uniprot_id = 'P27616'
retrieve = uniprot.retrieve(uniprot_id, frmt='xml')

lxmlET = etree.fromstring(retrieve.data.encode())
for elem in lxmlET.getiterator():
    elem.tag = etree.QName(elem.tag).localname
etree.cleanup_namespaces(lxmlET)
uniprot_entry = lxmlET.find('entry')

In [50]:
for elem in uniprot_entry.iter(tag='comment'):
    # Extract cofactor info
    print(elem.attrib['type'])

catalytic activity
pathway
subunit
interaction
miscellaneous
similarity


In [64]:
model.reactions.get_by_id('4ABUTtps_e')

0,1
Reaction identifier,4ABUTtps_e
Name,4-aminobutyrate transport
Memory address,0x07f8878be4c90
Stoichiometry,4abut_e + h_e --> 4abut_c + h_c  gamma-aminobutyrate [extracellular] + H+ [extracellular] --> gamma-aminobutyrate [cytoplasm] + H+ [cytoplasm]
GPR,YDL210W or YOR348C
Lower bound,0.0
Upper bound,1000.0


In [66]:
model.genes.YBR018C

0,1
Gene identifier,YBR018C
Name,GAL7
Memory address,0x07f88793495d0
Functional,True
In 2 reaction(s),"GALT_c, UGLT_c"


In [62]:
for rxn in model.reactions:
    if 'synthase (NH3)' in rxn.name:
        print(rxn.id)

CTPS1_c


In [61]:
model.reactions.CTPS1_c

0,1
Reaction identifier,CTPS1_c
Name,CTP synthase (NH3)
Memory address,0x07f887905ae90
Stoichiometry,atp_c + nh4_c + utp_c --> adp_c + ctp_c + 2.0 h_c + pi_c  ATP [cytoplasm] + ammonium [cytoplasm] + UTP [cytoplasm] --> ADP [cytoplasm] + CTP [cytoplasm] + 2.0 H+ [cytoplasm] + phosphate [cytoplasm]
GPR,YBL039C or YJR103W
Lower bound,0.0
Upper bound,1000.0


In [60]:
model.reactions.CTPS2_c

0,1
Reaction identifier,CTPS2_c
Name,CTP synthase (glutamine)
Memory address,0x07f887905a650
Stoichiometry,atp_c + gln__L_c + h2o_c + utp_c --> adp_c + ctp_c + glu__L_c + 2.0 h_c + pi_c  ATP [cytoplasm] + L-glutamine [cytoplasm] + H2O [cytoplasm] + UTP [cytoplasm] --> ADP [cytoplasm] + CTP [cytoplasm] + L-glutamate [cytoplasm] + 2.0 H+ [cytoplasm] + phosphate [cytoplasm]
GPR,YBL039C or YJR103W
Lower bound,0.0
Upper bound,1000.0


In [51]:
elem.attrib

{'type': 'similarity'}

In [31]:
uniprot_entry.getchildren()

[<Element accession at 0x7f8877885a00>,
 <Element accession at 0x7f8877885aa0>,
 <Element accession at 0x7f8877885af0>,
 <Element name at 0x7f8877885b40>,
 <Element protein at 0x7f8877885b90>,
 <Element gene at 0x7f8877885be0>,
 <Element organism at 0x7f8877885c30>,
 <Element reference at 0x7f8877885c80>,
 <Element reference at 0x7f8877885cd0>,
 <Element reference at 0x7f8877885d20>,
 <Element reference at 0x7f8877885d70>,
 <Element reference at 0x7f8877885dc0>,
 <Element reference at 0x7f8877885e10>,
 <Element reference at 0x7f8877885e60>,
 <Element reference at 0x7f8877885eb0>,
 <Element reference at 0x7f8877885f00>,
 <Element reference at 0x7f8877885f50>,
 <Element reference at 0x7f8877885fa0>,
 <Element reference at 0x7f88778c1050>,
 <Element reference at 0x7f88778c10a0>,
 <Element reference at 0x7f88778c10f0>,
 <Element reference at 0x7f88778c1140>,
 <Element comment at 0x7f88778c1190>,
 <Element comment at 0x7f88778c11e0>,
 <Element comment at 0x7f88778c1230>,
 <Element comment a

In [25]:
search

"Entry\tEntry name\tStatus\tProtein names\tGene names\tOrganism\tLength\nP00549\tKPYK1_YEAST\treviewed\tPyruvate kinase 1 (PK 1) (EC 2.7.1.40) (cell division cycle protein 19)\tCDC19 PYK1 YAL038W\tSaccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)\t500\n"

In [24]:
df_search

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length
0,P00549,KPYK1_YEAST,reviewed,Pyruvate kinase 1 (PK 1) (EC 2.7.1.40) (cell d...,CDC19 PYK1 YAL038W,Saccharomyces cerevisiae (strain ATCC 204508 /...,500


In [4]:
subloc_list

['Cytoplasm', 'Mitochondrion intermembrane space', None]

In [7]:
i

'rt7128'

In [8]:
gid

'rtmCOX1'

In [9]:
df_info.loc[i]

id                  rt7128
name                  ADK1
sc_id              YDR226W
uniprot             P07170
uniprot_name    KAD2_YEAST
subunit           Monomer.
subloc                 NaN
cofactor               NaN
Name: rt7128, dtype: object

In [13]:
for i in uniprot_entry.findall('comment'):
    print(i.attrib)

{'type': 'function'}
{'type': 'catalytic activity'}
{'type': 'subunit'}
{'type': 'subcellular location'}
{'type': 'domain'}
{'type': 'disruption phenotype'}
{'type': 'miscellaneous'}
{'type': 'similarity'}


In [16]:
x = uniprot_entry.findall('comment')[2]

In [21]:
y = x.getchildren()[0]

In [23]:
y.text

'Monomer.'