In [1]:
try:
    import CBRdb

except ImportError:
    import sys
    import os

    sys.path.append(os.path.abspath('../'))
    import CBRdb

import pandas as pd


Explicitly import supporting packages and files; identify changes

In [3]:
# Read the compound table, forcing `str` dtype on the columns named in
# CBRdb.tools_files.space_sep_str_cols_cps, then build the compound lookup tables.
cp_str_cols = dict.fromkeys(CBRdb.tools_files.space_sep_str_cols_cps, str)
compounds = pd.read_csv(
    '../CBRdb_C.csv',
    dtype=cp_str_cols,
    low_memory=False,
    index_col=0,
)
cpd_data, formula_table = CBRdb.compound_lookup_tables(compounds)

# For each entry in R_ids_bad.dat, list which of the flags below occur in its
# first data column. Entries matching neither flag do not appear in the result.
bad_flags_used = 'structure_missing|shortcut'
flag_pattern = '(' + bad_flags_used + ')'
R_ids_bad = pd.read_csv('../data/R_ids_bad.dat', index_col=0).iloc[:, 0]
flag_matches = R_ids_bad.str.extractall(flag_pattern)[0]
bad_flag_rxns = flag_matches.groupby(level=0).apply(list)

# `dbs` collects, per source, the raw tables and the derived comparisons under
# keys of the form '<source>_<what>' (e.g. 'kegg_pre', 'atlas_changed').
dbs = dict()
sources = ['kegg', 'atlas']
for s in sources:
    print(f'\n\nProcessing source: {s}', flush=True)
    # Load the reaction tables written before and after processing (local CSV files).
    pre = pd.read_csv(f'../data/{s}_data_R_dedupedCs.csv', low_memory=False, index_col=0)
    post = pd.read_csv(f'../data/{s}_data_RCs_processed.csv', low_memory=False, index_col=0)

    # Reactions present in both tables.
    kept = pre.index.intersection(post.index)
    # Reactions present before but not after, with the flags recorded for them.
    # `reindex` is used instead of `.loc` so that a dropped reaction carrying
    # neither flag yields NaN rather than raising KeyError; when every dropped
    # reaction is flagged the result is the same as with `.loc`.
    dropped = bad_flag_rxns.reindex(pre.index.difference(post.index))
    # Rows of `kept` whose 'reaction' string differs, shown side by side
    # with columns named 'post' and 'pre'.
    changed = post.loc[kept, 'reaction'].compare(pre.loc[kept, 'reaction'],
                                                 result_names=('post', 'pre'))

    print('Getting pre-balancing stats...', flush=True)
    dfs_pre = CBRdb.filter_reactions_pandas(pre, formula_table=formula_table, data_c=cpd_data)
    print('Getting post-balancing stats...', flush=True)
    dfs_post = CBRdb.filter_reactions_pandas(post, formula_table=formula_table, data_c=cpd_data)

    dbs.update({s + '_pre': pre,
                s + '_post': post,
                s + '_kept': kept,
                s + '_dropped': dropped,
                s + '_changed': changed,
                s + '_dfs_pre': dfs_pre,
                s + '_dfs_post': dfs_post, })


making "cpd_data": DataFrame of compound attributes relevant for balancing reactions
making "formula_table": matrix-like DataFrame of element counts for each compound


Processing source: kegg
Getting pre-balancing stats...
making "rns": DataFrame of reaction attributes
making "formula_sides": pd.Series listing sets of formulas for to-rebalance reactions above.
making "el_diff_groups": DataFrame assigning labels to reactions based on element-count diff (R - L).
Getting post-balancing stats...
making "rns": DataFrame of reaction attributes
making "formula_sides": pd.Series listing sets of formulas for to-rebalance reactions above.
making "el_diff_groups": DataFrame assigning labels to reactions based on element-count diff (R - L).


Processing source: atlas
Getting pre-balancing stats...
making "rns": DataFrame of reaction attributes
making "formula_sides": pd.Series listing sets of formulas for to-rebalance reactions above.
making "el_diff_groups": DataFrame assigning labels to reactio

Get stats on datasets both pre- and post-balancing
- note that starred and var-list entries can't be confirmed as balanced
- some starred/var-list/etc. entries are removed due to being shortcuts or missing data

In [4]:
# IDs whose recorded flag list contains 'shortcut'.
is_shortcut = bad_flag_rxns.map(lambda flags: 'shortcut' in flags)
shortcuts = bad_flag_rxns.loc[is_shortcut].index

for s in sources:
    rns_pre = dbs[s + '_dfs_pre']['rns']
    rns_post = dbs[s + '_dfs_post']['rns']

    # Add the same three helper columns to both tables (in place, so the
    # frames stored in `dbs` gain these columns too).
    for rns_stage in (rns_pre, rns_post):
        rns_stage['present'] = True
        rns_stage['shortcut'] = rns_stage.index.isin(shortcuts)
        rns_stage['confirmed_balanced'] = rns_stage['rebalanceable'] * rns_stage['is_balanced']

    # Column-wise totals for each category, before vs. after, plus the difference.
    categories = ['present', 'bool_missing_data', 'shortcut', 'cpd_starred', 'bool_var_list', 'confirmed_balanced']
    pre_counts = rns_pre[categories].sum().rename('pre').to_frame()
    post_counts = rns_post[categories].sum().rename('post').to_frame()
    rn_stats = pre_counts.join(post_counts).fillna(0).astype(int)
    rn_stats['change'] = rn_stats['post'] - rn_stats['pre']

    dbs[s + '_stats'] = rn_stats


In [5]:
# Display the KEGG pre/post/change table, labelling the row axis "KEGG".
dbs['kegg_stats'].rename_axis(index="KEGG")


Unnamed: 0_level_0,pre,post,change
KEGG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
present,11861,10639,-1222
bool_missing_data,928,0,-928
shortcut,369,0,-369
cpd_starred,2127,1920,-207
bool_var_list,71,68,-3
confirmed_balanced,7777,8333,556


In [6]:
# Display the ATLAS pre/post/change table, labelling the row axis "ATLAS".
dbs['atlas_stats'].rename_axis(index="ATLAS")


Unnamed: 0_level_0,pre,post,change
ATLAS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
present,149082,147829,-1253
bool_missing_data,1206,0,-1206
shortcut,47,0,-47
cpd_starred,28724,28025,-699
bool_var_list,0,0,0
confirmed_balanced,110105,117102,6997


Finally, get stats on the curated, merged database.
1. How many compounds are there? How many are starred (i.e. under-defined)?
2. How many reactions are there?

In [7]:
# TODO: did starred reactions get merged? if so, fix
# TODO: of starred reactions, what fraction is balanced if star is same on both sides?

# Categories totalled for the merged reaction table.
categories = ['present', 'bool_missing_data', 'cpd_starred', 'bool_var_list', 'confirmed_balanced']

# Load the merged reactions and derive the per-reaction attribute table.
reactions = pd.read_csv(
    '../CBRdb_R.csv',
    low_memory=False,
    index_col=0,
)
dfs_R = CBRdb.filter_reactions_pandas(reactions, formula_table=formula_table, data_c=cpd_data)
rns = dfs_R['rns']

# Helper columns, added in place: every row is present; "confirmed balanced"
# is the product of the 'rebalanceable' and 'is_balanced' columns.
rns['present'] = True
rns['confirmed_balanced'] = rns['rebalanceable'] * rns['is_balanced']

# Column-wise totals as a one-column frame; store it and display it.
rn_stats = rns[categories].sum().to_frame()
dbs.update({'CBRdb_stats': rn_stats})
rn_stats


making "rns": DataFrame of reaction attributes
making "formula_sides": pd.Series listing sets of formulas for to-rebalance reactions above.
making "el_diff_groups": DataFrame assigning labels to reactions based on element-count diff (R - L).


Unnamed: 0,0
present,153411
bool_missing_data,0
cpd_starred,29429
bool_var_list,68
confirmed_balanced,120895


What does the reaction database contain?

In [8]:
# First two summary rows of describe() for every column, transposed so that
# each reaction-table column becomes a row.
reactions.describe(include='all').head(2).T


Unnamed: 0,count,unique
reaction,153411.0,153411.0
ec,152132.0,6144.0
module,1908.0,521.0
orthology,6127.0,4127.0
pathway,7141.0,855.0
rclass,9697.0,9267.0
rhea,5920.0,5894.0
balancer_failed,153411.0,2.0
bool_missing_data,153411.0,1.0
bool_var_list,153411.0,2.0


In [9]:
# For each source, tally which IDs matching `C\d+` (presumably compound IDs --
# confirm) were added to the 'post' reaction string relative to 'pre', among the
# reactions stored under '<db>_changed'. Entries with no such ID are labelled 'Coeffs'.
change_summary = dict()
for db in ['kegg', 'atlas']:
    # Two-column frame ('post', 'pre') of reaction strings that differ.
    changed = dbs[f'{db}_changed']
    # Per cell: sorted list of every `C\d+` token found in the reaction string.
    cps_listed = changed.apply(lambda x: x.str.findall(r'C\d+')).map(sorted)
    # Same, de-duplicated into sets.
    cps_setted = cps_listed.map(set)
    # IDs in 'post' but not in 'pre'; explode() gives one row per added ID
    # (so index labels can repeat) and NaN where the set difference is empty.
    cps_added_new = (cps_setted['post'] - cps_setted['pre']).explode().rename('post')
    # if len(post) == len(pre), only numbers changed
    # NOTE(review): `chgd_coeffs_only` is not referenced again in this cell.
    chgd_coeffs_only = cps_listed.map(len).nunique(axis=1).eq(1)
    # if lists are different lengths, but sets are the same, an existing cpd was added to the other side
    cpd_both_sides = cps_listed.map(len).nunique(axis=1).eq(2) * cps_added_new.map(len, na_action='ignore').isna()
    # This means one compound will occur twice in the "post" compound list. Isolate these
    icps = cps_listed['post'].explode().loc[cpd_both_sides].reset_index()
    # Keep (id, post) pairs that occur more than once, one row per pair.
    # NOTE(review): set_index('id') assumes the reaction index is named 'id' -- confirm.
    icps = icps.loc[icps.duplicated(keep=False)].drop_duplicates().set_index('id')['post']
    # Combine the additions
    # Series.update works in place and aligns on index, filling in the repeated IDs.
    cps_added_new.update(icps)
    dbs[f'{db}_cps_injected'] = cps_added_new
    # Count occurrences of each added ID; remaining NaN entries are counted as 'Coeffs'.
    change_summary[db] = cps_added_new.fillna('Coeffs').value_counts()

# One column per source; IDs absent from a source get a count of 0.
change_summary_df = pd.DataFrame(change_summary).fillna(0).astype(int).rename_axis('Change')
change_summary_df


Unnamed: 0_level_0,kegg,atlas
Change,Unnamed: 1_level_1,Unnamed: 2_level_1
C00001,36,81
C00007,171,6
C00027,4,0
C00067,17,35
C00080,169,4214
C00087,1,0
C00132,8,113
C00218,0,14
C00237,7,0
C00282,252,328
