In [13]:
import logging
import json
import copy
import pandas as pd
logger = logging.getLogger(__name__)

In [5]:
%run ../../scripts/bios_utils.py

In [7]:
CACHE_BASE_FOLDER = '/Users/fliu/workspace/jupyter/python3/annotation-server/data/'
bios_cache = read_json(CACHE_BASE_FOLDER + '/bios_cache_fungi.json')

In [29]:
model_spis = {}
for model_id in bios_cache:
    model_spis[model_id] = dict(map(
        lambda x : (x['id'] if 'id' in x else 'error', x), 
        bios_cache[model_id]['spi']))
    
model_rxns = {}
for model_id in bios_cache:
    model_rxns[model_id] = dict(map(
        lambda x : (x['id'] if 'id' in x else 'error', x), 
        bios_cache[model_id]['rxn']))

In [68]:
bios_cache['iWV1213']['spi'][0]

{'major_label': 'MetaboliteSpecie',
 'bios_references': [['C00311', 'LigandCompound'],
  ['META:THREO-DS-ISO-CITRATE', 'MetaCyc'],
  ['16087', 'ChEBI'],
  ['icit', 'BiGGMetabolite'],
  ['cpd00260', 'ModelSeed'],
  ['icit', 'BiGG']],
 'notes': '<notes>\n          <body xmlns="http://www.w3.org/1999/xhtml">\n            <p>FORMULA: </p>\n            <p>CHARGE: 0</p>\n          </body>\n        </notes>',
 'constant': 'false',
 'bios_spi_degree': 5,
 'hasOnlySubstanceUnits': 'false',
 'created_at': 1545980680786,
 'bios_id': 3784377,
 'proxy': False,
 'entry': 'species_581_7@iWV1213',
 'boundaryCondition': 'false',
 'updated_at': 1545980680786,
 'compartment': 'm',
 'name': 'isocitrate',
 'id': 'M_ICIT_m'}

In [206]:
#list(filter(lambda x : 'lys' in x['id'], bios_cache['iCT646']['spi']))

In [190]:
cpd_group_split = {}

In [193]:
def cpd_split(model_id, bios_cache, cpd_group_split):
    if model_id in bios_cache:
        print(model_id)
        cpd_group_split[model_id] = {}
        for o in bios_cache[model_id]['spi']:
            if 'id' in o:
                lst = o['id'][-2:]
                if lst[0] == '_':
                    if not o['id'][:-2] in cpd_group_split[model_id]:
                        cpd_group_split[model_id][o['id'][:-2]] = set()
                    cpd_group_split[model_id][o['id'][:-2]].add(o['id'])
                
cpd_split('iJL1454', bios_cache, cpd_group_split)
cpd_split('iCT646', bios_cache, cpd_group_split)
cpd_split('iWV1213', bios_cache, cpd_group_split)
cpd_split('iNX804', bios_cache, cpd_group_split)

iJL1454
iCT646
iWV1213
iNX804


In [160]:
df_cpd_map = pd.read_excel('../data/Fungal_Compound_Reaction_Mapping_Sam.xlsx', sheet_name='Compound Intergration')

In [161]:
'M_2NPPP' in model_spis['iWV1314']

True

In [234]:
def iJDZ836_cpd(x):
    x = str(x).replace('-', '__45__').replace('[', '__91__').replace(']', '__93__')
    x = 'M_' + x
    return x

spi_name_mod = {
    'iWV1314' : lambda x : "M_" + str(x),
    'iWV1213' : lambda x : "M_" + str(x), #cmp strip
    'iRL766' : lambda x : "M_" + str(x),
    'iNX804' : lambda x : "M_" + str(x),
    'iJDZ836' : lambda x : iJDZ836_cpd(x),
    'iNL895' : lambda x : "s_" + str(x),
    'iLC915' : lambda x : "M_" + str(x),
    'iOD907' : lambda x : "M_" + str(x),
    'iJL1454' : lambda x : "M_" + str(x), #cmp strip
    'iCT646' : lambda x : "M_" + str(x).replace('-', '_DASH_'), #cmp strip
    'iSS884' : lambda x : "M_" + str(x),
    'iAL1006' : lambda x : "M_" + str(x),
}

In [235]:
cpd_sam_mapping = {}
model_not_found = {}
model_found = {}
for row_id, d in df_cpd_map.iterrows():
    model_id = d['Model_Name']
    spi_id = d['ID']
    seed_id = d['MODELSEED']
    if not model_id in model_not_found:
        model_not_found[model_id] = set()
        model_found[model_id] = set()
        cpd_sam_mapping[model_id] = {}
    if model_id in bios_cache:
        if 'spi' in bios_cache[model_id]:
            search_id = spi_id
            if model_id in spi_name_mod:
                search_id = spi_name_mod[model_id](spi_id)
            if search_id in model_spis[model_id]:
                model_found[model_id].add(spi_id)
                cpd_sam_mapping[model_id][search_id] = [seed_id]
            elif model_id in cpd_group_split and search_id in cpd_group_split[model_id]:
                model_found[model_id].add(spi_id)
                for split_id in cpd_group_split[model_id][search_id]:
                    cpd_sam_mapping[model_id][split_id] = [seed_id]
            else:
                model_not_found[model_id].add(spi_id)
                logger.debug("[%s] [%s -> %s]", model_id, spi_id, search_id)
        else:
            logger.debug("[%s] model without spicies", model_id)
    else:
        logger.debug("[%s] model not in cache", model_id)
        
    #print(model_id, spi_id, seed_id)
    #break

In [236]:
for model_id in model_found:
    if model_id in model_spis:
        print(model_id, len(model_found[model_id]), len(model_not_found[model_id]))
        print(list(model_spis[model_id])[:5])
        print(list(model_not_found[model_id])[:5])

iWV1314 1099 2
['M_MALCOA', 'M_FEROm', 'M_PANT', 'M_AMIACE', 'M_A6RP5P2']
['DGLCe', '1368THN']
iWV1213 772 1
['M_ICIT_m', 'M_ERGOSE_c', 'M_ETH_m', 'M_AMP_m', 'M_PHPYR_c']
['UNDEF']
iRL766 1273 207
['M_m812', 'M_m621', 'M_m604', 'M_m1287', 'E_84']
['Cx_16', 517, 518, 10, 11]
iOD907 1477 0
['E_00301', 'E_00179', 'E_00535', 'M_00185', 'E_00445']
[]
iNX804 631 0
['M_DR1P_c', 'M_TYR_v', 'M_TYR_x', 'M_BTAP_c', 'M_OROA_c']
[]
iNL895 1587 260
['s_1285', 's_0631', 's_0565', 's_0901', 's_1412']
['1470', '2858', '2845', '2006', '2906']
iLC915 1359 152
['E_294', 'M_m751', 'M_m591', 'M_m1019', 'M_m1001']
['Cx_16', '706', '251', '446', 'Cx_1']
iJL1454 832 0
['M_hco3_e', 'M_phsp_c', 'M_fuacac_c', 'M_hco3_c', 'M_c12hacp_c']
[]
iJDZ836 693 44
['M_ARACHIDONIC_ACID__91__CCO__45__GLYOXYSOME__93__', 'M_CPD1F__45__129', 'M_CPD__45__9896', 'M_CPD2T__45__27', 'M_DCMP__91__CCO__45__MIT__93__']
['Deoxynucleotides', 'GLYCYLGLYCINE', 'CPD-13406', 'Sterols', 'CPD-13394']
iCT646 703 0
['M_ocdcya_c', 'M_glx_x', 'M_p

In [146]:
df_rxn_map = pd.read_excel('../data/Fungal_Compound_Reaction_Mapping_Sam.xlsx', sheet_name='Reaction Intergration')

In [225]:
def iJDZ836_rxn(x):
    x = str(x).replace('-', '__45__').replace('[', '__91__').replace(']', '__93__')
    if x.startswith('XN2T'):
        x = 'R' + x
    return x

In [226]:
rxn_name_mod = {
    'iWV1314' : lambda x : "R_" + str(x),
    'iWV1213' : lambda x : "R_" + str(x), #cmp strip
    'iRL766' : lambda x : "R_" + str(x),
    'iJDZ836' : lambda x : iJDZ836_rxn(x),
    'iLC915' : lambda x : "R_" + str(x),
    'iOD907' : lambda x : "R_" + str(x),
    'iJL1454' : lambda x : "R_" + str(x), #cmp strip
    'iCT646' : lambda x : "R_" + str(x), #cmp strip
    'iSS884' : lambda x : "R_" + str(x),
    'iAL1006' : lambda x : "R_" + str(x),
    'iNX804 ' : lambda x : "R_" + str(x),
}

In [224]:
list(filter(lambda x : 'XN2T' in x['id'], bios_cache['iJDZ836']['rxn']))

[{'major_label': 'ModelReaction',
  'notes': '<notes>\n          <html xmlns="http://www.w3.org/1999/xhtml">\n            <p>GENE_ASSOCIATION: ( NCU00168.5 or NCU07741.5 or NCU06194.5 )</p>\n            <p>SUBSYSTEM: </p>\n            <p>PROTEIN_CLASS: EC# 2.3.1.51</p>\n            <p>BIOCYC: RXN2T-287</p>\n            <p>REFERENCES: </p>\n          </html>\n        </notes>',
  'created_at': 1527566482636,
  'bios_rxn_gpr_data': {'major_label': 'ModelGPR',
   'entry': '( NCU00168.5 or NCU07741.5 or NCU06194.5 )@iJDZ836',
   'lexicographic_string': '(NCU00168.5 or NCU06194.5 or NCU07741.5)',
   'updated_at': 1529786855171,
   'created_at': 1529274701795},
  'bios_id': 1712968,
  'entry': 'reaction_59823_7@iJDZ836',
  'proxy': False,
  'bios_rxn_gpr_id': 3011677,
  'updated_at': 1527566482636,
  'bios_stoichiometry': {'r': [['M_CPD__45__8265', 1711305, ''],
    ['M_CO__45__A', 1711642, '']],
   'l': [['M_CPD2T__45__32', 1711149, ''],
    ['M_PALMITYL__45__COA', 1711614, '']]},
  'revers

In [227]:
rxn_sam_mapping = {}
model_not_found = {}
model_found = {}
for row_id, d in df_rxn_map.iterrows():
    model_id = d['Model_ID']
    rxn_id = d['ID']
    seed_id = d['MODELSEED']
    if not model_id in model_not_found:
        model_not_found[model_id] = set()
        model_found[model_id] = set()
        rxn_sam_mapping[model_id] = {}
    if model_id in bios_cache:
        if 'rxn' in bios_cache[model_id]:
            search_id = rxn_id
            if model_id in rxn_name_mod:
                search_id = rxn_name_mod[model_id](rxn_id)
            if search_id in model_rxns[model_id]:
                model_found[model_id].add(rxn_id)
                rxn_sam_mapping[model_id][search_id] = [seed_id]
            else:
                model_not_found[model_id].add(search_id)
                logger.debug("[%s] [%s -> %s]", model_id, rxn_id, search_id)
        else:
            logger.debug("[%s] model without spicies", model_id)
    else:
        logger.debug("[%s] model not in cache", model_id)

In [228]:
for model_id in model_found:
    if model_id in model_spis:
        print(model_id, len(model_found[model_id]), len(model_not_found[model_id]))
        print(list(model_rxns[model_id])[:5])
        print(list(model_not_found[model_id])[:5])

iAL1006 1660 0
['R_r1428', 'R_r1088', 'R_r0029', 'R_r0597', 'R_r1042']
[]
iCT646 1249 17
['R_CSND', 'R_FACOAL140p', 'R_3MBALDt', 'R_34HPPOR', 'R_GLUDyi']
['R_4HPRO-LTtm', 'R_ARAB-Dt', 'R_BTDD-RR', 'R_D-LACDm', 'R_NADH2-u6m']
iJDZ836 1552 192
['L__45__KYNURENINE__45__TRANS__45__RXN__45__R2L', 'RXN2T__45__287__45__L2R', 'ADENYLOSUCCINATE__45__SYNTHASE__45__RXN__45__L2R', 'TYROSINE__45____45__TRNA__45__LIGASE__45__RXN__45__L2R', 'PHEAMINOTRANS__45__RXN__45__R2L']
['XN__45__9545__45__L2R', 'XN__45__8631__45__L2R', 'XN__45__9615__45__L2R__91__CCO__45__MIT__93__', 'XN__45__6021__45__R2L', 'XN0__45__901__45__R2L']
iJL1454 1276 1
['R_r0849', 'R_r0848', 'R_r0843', 'R_r0842', 'R_r0841']
['R_Biomass']
iLC915 1448 1
['R_r217', 'R_r144', 'R_r595', 'R_r963', 'R_r1286']
['R_ID']
iNL895 1987 0
['r_0765', 'r_0223', 'r_0196', 'r_1_exchange', 'r_0296']
[]
iNX804 0 1180
['R_ZH1088', 'R_ZH1049', 'R_ZH1048', 'R_ZH1218', 'R_ZH1043']
['ZH1084', 'ZH0066', 'ZH0355', 'ZH0575', 'ZH0733']
iOD907 1867 0
['R_01400',

In [145]:
old_cpd_mapping = None
with open(CACHE_BASE_FOLDER + 'cpd_mapping_cache.json', 'r') as f:
    old_cpd_mapping = json.loads(f.read())

In [125]:
model_rxn_mapping = None
with open(CACHE_BASE_FOLDER + 'rxn_mapping_cache3.json', 'r') as f:
    model_rxn_mapping = json.loads(f.read())
model_cpd_mapping = None
with open(CACHE_BASE_FOLDER + 'cpd_mapping_cache3.json', 'r') as f:
    model_cpd_mapping = json.loads(f.read())

In [139]:
for o in bios_cache[model_id]['spi']:
    if 'id' in o:
        id = o['id']


{'major_label': 'MetaboliteSpecie',
 'speciesType': 't_1198',
 'annotation': '<annotation>\n          <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:vCard="http://www.w3.org/2001/vcard-rdf/3.0#" xmlns:bqbiol="http://biomodels.net/biology-qualifiers/" xmlns:bqmodel="http://biomodels.net/model-qualifiers/">\n            <rdf:Description rdf:about="#meta_e_0997">\n              <bqbiol:isEncodedBy>\n                <rdf:Bag>\n                  <rdf:li rdf:resource="http://identifiers.org/kegg.genes/sce:YIL009W"/>\n                  <rdf:li rdf:resource="http://identifiers.org/sgd/S000001271"/>\n                </rdf:Bag>\n              </bqbiol:isEncodedBy>\n              <bqbiol:is>\n                <rdf:Bag>\n                  <rdf:li rdf:resource="http://identifiers.org/uniprot/P39002"/>\n                </rdf:Bag>\n              </bqbiol:is>\n            </rdf:Description>\n  

In [141]:
for model_id in bios_cache:
    if not model_id in model_cpd_mapping:
        model_cpd_mapping[model_id] = {}
    if not model_id in model_rxn_mapping:
        model_rxn_mapping[model_id] = {}
    for o in bios_cache[model_id]['spi']:
        if 'id' in o:
            id = o['id']
            cmp = o['compartment']
            if not cmp in model_cpd_mapping[model_id]:
                model_cpd_mapping[model_id][cmp] = {}
            if not id in model_cpd_mapping[model_id][cmp]:
                model_cpd_mapping[model_id][cmp][id] = []
    for o in bios_cache[model_id]['rxn']:
        if 'id' in o:
            id = o['id']
            if not id in model_rxn_mapping[model_id]:
                model_rxn_mapping[model_id][id] = []

In [108]:
model_cpd_mapping.keys()

dict_keys(['iAL1006', 'iCT646', 'iLC915', 'iMA871', 'iMM904', 'iNL895', 'iRL766', 'iSS884', 'iTO977', 'yeast_6.06', 'yeast_7.6', 'iJDZ836', 'iWV1213', 'iJO1366', 'iCac802', 'iAF1260', 'iML1515', 'iOD907', 'iJL1454', 'iNX804', 'iWV1314'])

In [111]:
o

{'major_label': 'MetaboliteSpecie',
 'id': 'M_MALCOA',
 'compartment': 'Cytoplasm',
 'name': 'malonyl-CoA'}

In [112]:
bios_cache[model_id]['spi'][0]

{'major_label': 'MetaboliteSpecie',
 'id': 'M_MALCOA',
 'compartment': 'Cytoplasm',
 'name': 'malonyl-CoA'}

In [118]:
models = [
    'iNL895',
    'iCT646',
    
    'iMM904',
    'iTO977',
    'iSS884',
    'iLC915',
    
    'iWV1213',
    'iAL1006',
    
    'iRL766',
    'iMA871',
    
    'iJDZ836',
    'iWV1314',
    'iOD907',
    'iJL1454',
    'iNX804',
    'yeast_6.06',
    'yeast_7.6',
]

In [135]:
def count_mapped_r(a):
    c = 0
    for k in a:
        if len(a[k]) > 0:
            c += 1
    return c
def count_mapped_c(a):
    c = 0
    for cmp in a:
        for k in a[cmp]:
            if len(a[cmp][k]) > 0:
                c += 1
    return c
def count_any_c(a):
    c = 0
    for cmp in a:
        c += len(a[cmp])
    return c

In [213]:
for model_id in models:
    if model_id in model_rxn_mapping and model_id in model_cpd_mapping:
        print(model_id, 
              count_any_c(model_cpd_mapping[model_id]), 
              count_mapped_c(model_cpd_mapping[model_id]),
              len(model_rxn_mapping[model_id]),
              count_mapped_r(model_rxn_mapping[model_id]))
    else:
        print('!')

iNL895 1847 1587 2002 1987
iCT646 1120 1120 1422 1251
iMM904 1226 1217 1577 487
iTO977 1353 1064 1562 528
iSS884 2228 1293 1376 1376
iLC915 2301 1359 1448 1448
iWV1213 1406 1406 1326 1284
iAL1006 2429 1138 1660 1660
iRL766 2156 1324 1637 1504
iMA871 1210 399 1358 0
iJDZ836 1008 0 1845 12
iWV1314 1102 1099 1406 1329
iOD907 2338 1477 2180 1867
iJL1454 1160 1160 1416 1276
iNX804 1025 1025 1286 0
yeast_6.06 2606 1178 1888 759
yeast_7.6 3370 1171 3493 699


In [237]:
for model_id in model_cpd_mapping:
    print(model_id, model_cpd_mapping[model_id].keys())
    for cmp in model_cpd_mapping[model_id]:
        for id in model_cpd_mapping[model_id][cmp]:
            if model_id in cpd_sam_mapping and id in cpd_sam_mapping[model_id]:
                if not set(cpd_sam_mapping[model_id][id]) == set(model_cpd_mapping[model_id][cmp][id]):
                    if len((model_cpd_mapping[model_id][cmp][id])) == 0:
                        model_cpd_mapping[model_id][cmp][id] = cpd_sam_mapping[model_id][id]
                    else:
                        model_cpd_mapping[model_id][cmp][id] = cpd_sam_mapping[model_id][id]
                        #print(id, cpd_sam_mapping[model_id][id], model_cpd_mapping[model_id][cmp][id])
        #print('\t', cmp, len(model_cpd_mapping[model_id][cmp]))

iAL1006 dict_keys(['C_b', 'C_c', 'C_e', 'C_m', 'C_p'])
iCT646 dict_keys(['c', 'e', 'm', 'x'])
iLC915 dict_keys(['C_1', 'C_2', 'C_3', 'C_4', 'C_5', 'C_6', 'C_7', 'C_8'])
iMA871 dict_keys(['Cytosol', 'Extracellular_space', 'Mitochondrion', 'System_input'])
iMM904 dict_keys(['c', 'e', 'g', 'm', 'n', 'r', 'v', 'x'])
iNL895 dict_keys(['boundary', 'c_01', 'c_02', 'c_03', 'c_04', 'c_05', 'c_06', 'c_08', 'c_10', 'c_12', 'c_14', 'c_15', 'c_16'])
iRL766 dict_keys(['C_b', 'C_c', 'C_e', 'C_m', 'C_p'])
iSS884 dict_keys(['C_1', 'C_2', 'C_3', 'C_4', 'C_5'])
iTO977 dict_keys(['C_1', 'C_2', 'C_3', 'C_4', 'C_5'])
yeast_6.06 dict_keys(['c_01', 'c_02', 'c_03', 'c_04', 'c_05', 'c_06', 'c_07', 'c_08', 'c_09', 'c_10', 'c_11', 'c_13', 'c_14', 'c_15', 'c_16', 'c_17'])
yeast_7.6 dict_keys(['c_01', 'c_02', 'c_03', 'c_04', 'c_05', 'c_06', 'c_07', 'c_08', 'c_09', 'c_10', 'c_11', 'c_13', 'c_14', 'c_15', 'c_16', 'c_17'])
iJDZ836 dict_keys(['CCO__45__GLYOXYSOME', 'CCO__45__CYTOSOL', 'CCO__45__MIT', 'CCO__45__EXTRACEL

In [238]:
for model_id in model_rxn_mapping:
    print(model_id)
    for id in model_rxn_mapping[model_id]:
        if model_id in rxn_sam_mapping and id in rxn_sam_mapping[model_id]:
            if not set(rxn_sam_mapping[model_id][id]) == set(model_rxn_mapping[model_id][id]):
                if len((model_rxn_mapping[model_id][id])) == 0:
                    model_rxn_mapping[model_id][id] = rxn_sam_mapping[model_id][id]
                else:
                    model_rxn_mapping[model_id][id] = rxn_sam_mapping[model_id][id]

iAL1006
iCT646
iLC915
iMM904
iRL766
iSS884
iTO977
yeast_6.06
yeast_7.6
iJDZ836
iWV1213
iJO1366
iCac802
iAF1260
iML1515
iMA871
iOD907
iNL895
iJL1454
iNX804
iWV1314


In [239]:
with open(CACHE_BASE_FOLDER + 'rxn_mapping_cache4.json', 'w') as f:
    f.write(json.dumps(model_rxn_mapping))
with open(CACHE_BASE_FOLDER + 'cpd_mapping_cache4.json', 'w') as f:
    f.write(json.dumps(model_cpd_mapping))

In [240]:
model_cpd_mapping['iNL895']

{'boundary': {'s_0464_b': [],
  's_2808_b': [],
  's_2809_b': [],
  's_2810_b': [],
  's_2811_b': [],
  's_2812_b': [],
  's_2813_b': [],
  's_2814_b': [],
  's_2815_b': [],
  's_2816_b': [],
  's_2817_b': [],
  's_2818_b': [],
  's_2819_b': [],
  's_2820_b': [],
  's_2821_b': [],
  's_2822_b': [],
  's_2823_b': [],
  's_2824_b': [],
  's_2825_b': [],
  's_2826_b': [],
  's_2827_b': [],
  's_2828_b': [],
  's_2829_b': [],
  's_2830_b': [],
  's_2831_b': [],
  's_2832_b': [],
  's_2833_b': [],
  's_2834_b': [],
  's_2835_b': [],
  's_2836_b': [],
  's_2837_b': [],
  's_2838_b': [],
  's_2839_b': [],
  's_2840_b': [],
  's_2841_b': [],
  's_2842_b': [],
  's_2843_b': [],
  's_2844_b': [],
  's_2845_b': [],
  's_2846_b': [],
  's_2847_b': [],
  's_2848_b': [],
  's_2849_b': [],
  's_2850_b': [],
  's_2851_b': [],
  's_2852_b': [],
  's_2853_b': [],
  's_2854_b': [],
  's_2855_b': [],
  's_2856_b': [],
  's_2857_b': [],
  's_2858_b': [],
  's_2859_b': [],
  's_2860_b': [],
  's_2861_b': []