# Setup

In [1]:
import numpy as np

import seaborn as sns
import pandas as pd
import os
import pprint
import ast
import re
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict
import json

pp = pprint.PrettyPrinter(depth=6)

os.chdir(os.path.expanduser('~/vivarium-ecoli'))

ALLOWED_METAL_NAMES =   {'Iron': 'FE+2', 'Cobalt': 'CO+2', 'Copper': 'CU+2', 'Manganese': 'MN+2', 'Molybdenum': 'CPD-8123', 'Nickel': 'NI+2', 'Zinc': 'ZN+2',
                        'Calcium': 'CA+2', 'Magnesium': 'MG+2', 'Sodium': 'NA+', 'Potassium': 'K+',
                        'Iron-sulfur \(4Fe-4S\)': 'CPD-7', 'Iron-sulfur \(2Fe-2S\)': 'CPD-6',
                         'Iron-sulfur \(4Fe-4S-S-AdoMet\)': 'CPD-7', 'Iron-sulfur \(3Fe-4S\)': '3FE-4S', 'Iron-oxo-sulfur \(4Fe-2O-2S\)': 'CPD-7',
                        'heme': 'Heme-b', 'Molybdate': 'CPD-3', 'heme B': 'Heme-b',
                         'Selenocysteine': 'L-SELENOCYSTEINE',
                        'Divalent metal cation': 'Any+2'}


ACCEPTED_OTHER_FEATURES = {'PYRIDOXAL_PHOSPHATE', 'THIAMINE-PYROPHOSPHATE', 'FMN', 'FAD'}

def get_pathway_ith_level_parents(cur_pathway_idx, pathway_matrix, name_list, level_vector, level=2, parent_dict=None):

    if parent_dict is None:
        parent_dict = {}

    cur_pathway_level = level_vector[cur_pathway_idx]

    if cur_pathway_level == level:
        parent_dict[name_list[cur_pathway_idx]] = cur_pathway_level

    parent_slice = pathway_matrix[:, cur_pathway_idx]
    parent_idxs = np.where(parent_slice != 0)[0]


    for idx in parent_idxs:

        _ = get_pathway_ith_level_parents(idx, pathway_matrix, name_list, level_vector, level, parent_dict)

    return parent_dict

## Connect to api

In [2]:
password = input("Enter Password: ")

In [3]:
s = requests.Session() # create session
# Post login credentials to session:
s.post('https://websvc.biocyc.org/credentials/login/', data={'email':'cellulararchitect@protonmail.com', 'password': password})

<Response [401]>

## Reload data

In [4]:
parsed_complex_df = pd.read_csv('notebooks/cofactors/data/raw_complexes.csv', index_col=False)

# read stoichiometry, cofactors and enzyme_reaction as literal sets
for column in ['stoichiometry', 'cofactors', 'enzyme_reaction']:
    parsed_complex_df[column] = parsed_complex_df[column].apply(ast.literal_eval)

parsed_protein_df = pd.read_csv('notebooks/cofactors/data/raw_proteins.csv', index_col=False)

for column in ['cofactors', 'enzyme_reaction', 'metal_features', 'other_features', 'direct_annotations']:
    parsed_protein_df[column] = parsed_protein_df[column].apply(ast.literal_eval)


parsed_cofactor_df = pd.read_csv('notebooks/cofactors/data/raw_cofactors.csv', index_col=False)

for column in ['elemental_composition']:
    parsed_cofactor_df[column] = parsed_cofactor_df[column].apply(ast.literal_eval)

parsed_pathway_df = pd.read_csv('notebooks/cofactors/data/raw_pathways.csv', index_col=False)

for column in ['parents', 'children']:
    parsed_pathway_df[column] = parsed_pathway_df[column].apply(ast.literal_eval)

# Data processing into final tables
## Processing of pathway data into matrices

## Process raw EcoCyc annotations into standard EcoCyc names

In [5]:
metal_pattern = '|'.join(ALLOWED_METAL_NAMES.keys())
metal_regex = re.compile(f'(({metal_pattern})(\s\d[\.,;]|[\.,;]|\s\())')

print(metal_regex.search('Selenocysteine.'))

<re.Match object; span=(0, 15), match='Selenocysteine.'>


In [6]:

# remove all \ characters from keys in ALLOWED_METAL_NAMES
NON_REGEX_METAL = {key.replace('\\', ''): value for key, value in ALLOWED_METAL_NAMES.items()}

parsed_protein_df['metal_features_processed'] = 0
parsed_protein_df['metal_features_processed'] = parsed_protein_df['metal_features_processed'].astype(object)

metal_pattern = '|'.join(ALLOWED_METAL_NAMES.keys())
metal_regex = re.compile(f'(({metal_pattern})(\s\d[\.,;]|[\.,;]|\s\())')


for i in range(len(parsed_protein_df.index)):

    metal_binding = parsed_protein_df.loc[i, 'metal_features']

    metal_count_dict = {}
    existing_matches = set()

    for feature in metal_binding:
        matches = metal_regex.search(feature)
        if matches:
            metal = matches.group(0)[:-1]

            # eliminate duplicates
            if metal not in existing_matches:

                existing_matches.add(metal)

                if 'heme' in feature:
                    metal = metal.replace('Iron', 'heme')

                # check if last char of metal is a number, then crop
                if metal[-1].isdigit():
                    metal = metal[:-2]

                metal = metal.strip()

                # replace metal name with allowed metal name
                metal = NON_REGEX_METAL[metal]

                if metal in metal_count_dict:
                    metal_count_dict[metal] += 1
                else:
                    metal_count_dict[metal] = 1

        else:
            print(f'No match for {feature} in {parsed_protein_df.loc[i, "id"]}')




    parsed_protein_df.at[i, 'metal_features_processed'] = metal_count_dict

No match for UniProt: Magnesium or manganese. in 3-ISOPROPYLMALDEHYDROG-MONOMER
No match for conserved, Fe(III) binding motif in BASS-MONOMER
No match for predicted heme d ligand in CYDA-MONOMER
No match for UniProt: Fe(2+); catalytic. in CYTDEAM-MONOMER
No match for UniProt: Zn(2+); catalytic. in CYTDEAM-MONOMER
No match for The amino-terminus of ClpA contains a Zinc binding site. in EG10156-MONOMER
No match for The active-site magnesium ion is coordinated by three aspartate residues (401, 403, 555). Two of them form part of 
the PDXD active-site motif. in EG10238-MONOMER
No match for Divalent magnesium ions are chelated by three aspartate residues, two in the conserved DPD sequence (345, 347) 
and one in the conserved EGYMD sequence (269). in EG10239-MONOMER
No match for Based on crystal structures, Glu-265 and Asp-309 coordinate a divalent cation. in EG10239-MONOMER
No match for These residues are thought to coordinate the one or two divalent magnesium ions required for the 
gyrase 

In [7]:
parsed_protein_df = parsed_protein_df.drop(columns=['metal_features'])
parsed_protein_df

Unnamed: 0,id,common_name,enzyme_reaction,cofactors,other_features,direct_annotations,metal_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,"{ENZRXN0-7991, ENZRXN0-8629, ENZRXN0-7993, ENZ...",{},[],"{PWY0-1319, PWY-5667}",{}
1,1-PFK-MONOMER,1-phosphofructokinase,{},{},[ATP],{PWY0-1314},{}
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,{2-DEHYDROPANTOATE-REDUCT-ENZRXN},{},[NADP],{PANTO-PWY},{}
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,{ENZRXN0-6250},{},[],{LEUSYN-PWY},{}
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...","{ADOMET-DMK-METHYLTRANSFER-ENZRXN, 2-OCTAPRENY...",{},[],"{PWY-6708, MENAQUINONESYN-PWY}",{}
...,...,...,...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,{},{},[ATP],{},{}
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,{},{},[],{},{}
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,{},{},[],{},{'ZN+2': 1}
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,{},{},[],{},{}


In [8]:
parsed_protein_df['other_features_processed'] = 0
parsed_protein_df['other_features_processed'] = parsed_protein_df['other_features_processed'].astype(object)

for i in range(len(parsed_protein_df.index)):

    other_features = parsed_protein_df.loc[i, 'other_features']

    other_feature_count_dict = {}
    existing_matches = set()

    for feature in other_features:

        # eliminate duplicates
        if feature not in existing_matches:

            existing_matches.add(feature)

            if feature in ACCEPTED_OTHER_FEATURES:
                if feature in other_feature_count_dict:
                    other_feature_count_dict[feature] += 1
                else:
                    other_feature_count_dict[feature] = 1

    parsed_protein_df.at[i, 'other_features_processed'] = other_feature_count_dict

In [9]:
parsed_protein_df = parsed_protein_df.drop(columns=['other_features'])

parsed_protein_df

Unnamed: 0,id,common_name,enzyme_reaction,cofactors,direct_annotations,metal_features_processed,other_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,"{ENZRXN0-7991, ENZRXN0-8629, ENZRXN0-7993, ENZ...",{},"{PWY0-1319, PWY-5667}",{},{}
1,1-PFK-MONOMER,1-phosphofructokinase,{},{},{PWY0-1314},{},{}
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,{2-DEHYDROPANTOATE-REDUCT-ENZRXN},{},{PANTO-PWY},{},{}
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,{ENZRXN0-6250},{},{LEUSYN-PWY},{},{}
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...","{ADOMET-DMK-METHYLTRANSFER-ENZRXN, 2-OCTAPRENY...",{},"{PWY-6708, MENAQUINONESYN-PWY}",{},{}
...,...,...,...,...,...,...,...
4415,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,{},{},{},{},{}
4416,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,{},{},{},{},{}
4417,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,{},{},{},{'ZN+2': 1},{}
4418,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,{},{},{},{},{}


## Create new column for monomer component stoichiometry

In [10]:
complex_ids = parsed_complex_df['id'].tolist()
monomer_names = parsed_protein_df['id'].tolist()

In [11]:
def recursive_component_tree(current_component_name, complex_table, protein_table,
                             current_multiplier=1, component_list=None, parent=None, return_cofactors=False):
    """
    Recursively find all downstream components of a given complex.
    """

    complex_names = complex_table['id'].tolist()
    monomer_names = protein_table['id'].tolist()


    my_children = {}

    if component_list is None:
        component_list = []


    if current_component_name in complex_names:


        cplx_idx = complex_table.index[complex_table['id'] == current_component_name][0]
        stoichiometry = complex_table.at[cplx_idx, 'stoichiometry']

        direct_children = {k: abs(v) for k, v in stoichiometry.items() if v < 0}

        for component_name, coefficient in stoichiometry.items():

            if coefficient < 0 and component_name != current_component_name:

                child_multiplier = abs(coefficient * current_multiplier)

                new_child = recursive_component_tree(component_name, complex_table, protein_table,
                                                     child_multiplier, component_list, current_component_name, return_cofactors)

                my_children = my_children | new_child


            elif coefficient > 0 and component_name == current_component_name:
                continue

            else:
                raise ValueError(f"key {component_name} and value {coefficient} for complex {component_name} not processed properly.")

        component_list.append({'name': current_component_name,'parent': parent, 'children': direct_children,
                               'multiplier': int(current_multiplier), })


    elif current_component_name in monomer_names:

        # TODO check if enzrxn
        if return_cofactors:
            protein_idx = protein_table.index[protein_table['id'] == current_component_name][0]

            protein_metals = protein_table.at[protein_idx, 'metal_features_processed']
            protein_other = protein_table.at[protein_idx, 'other_features_processed']

            table_cofactors = protein_metals | protein_other

            if len(table_cofactors) > 0:
                # TODO Add apo protein to component list
                my_children = {}

                for cofactor, cofactor_coefficient in table_cofactors.items():
                    if table_cofactors[cofactor] !=  None:
                        my_children[cofactor] = cofactor_coefficient
                        component_list.append({'parent': current_component_name,
                                               'name': cofactor,
                                               'multiplier': abs(int(current_multiplier * cofactor_coefficient)),
                                               'children': None})

            component_list.append({'parent': parent, 'name': current_component_name, 'multiplier': current_multiplier, 'children': my_children})

        else:
            my_children = None
            component_list.append({'parent': parent, 'name': current_component_name, 'multiplier': current_multiplier, 'children': None})



    else:
        print(f"component {current_component_name} not found in complex or protein tables")

        return {}


    if parent is None:
        return {current_component_name: my_children}, component_list
    else:
        return {current_component_name: my_children}


In [12]:
complex_tree_structure, nodes = recursive_component_tree('CPLX0-8167', parsed_complex_df, parsed_protein_df)
pp.pprint(nodes)

[{'children': None,
  'multiplier': 4,
  'name': 'HYAA-MONOMER',
  'parent': 'FORMHYDROGI-CPLX'},
 {'children': None,
  'multiplier': 4,
  'name': 'HYAB-MONOMER',
  'parent': 'FORMHYDROGI-CPLX'},
 {'children': None,
  'multiplier': 2,
  'name': 'HYAC-MONOMER',
  'parent': 'FORMHYDROGI-CPLX'},
 {'children': {'HYAA-MONOMER': 2, 'HYAB-MONOMER': 2, 'HYAC-MONOMER': 1},
  'multiplier': 2,
  'name': 'FORMHYDROGI-CPLX',
  'parent': 'CPLX0-8167'},
 {'children': {'FORMHYDROGI-CPLX': 2},
  'multiplier': 1,
  'name': 'CPLX0-8167',
  'parent': None}]


In [13]:
parsed_complex_df['monomer_component_stoichiometry'] = 0
parsed_complex_df['monomer_component_stoichiometry'] = parsed_complex_df['monomer_component_stoichiometry'].astype(object)

for i in range(len(parsed_complex_df.index)):
    complex_name = parsed_complex_df.loc[i, 'id']
    complex_tree_structure, nodes = recursive_component_tree(complex_name, parsed_complex_df, parsed_protein_df)

    monomer_components = {node['name']: node['multiplier'] for node in nodes if node['children'] is None}

    parsed_complex_df.at[i, 'monomer_component_stoichiometry'] = monomer_components

component CPLX0-7701 not found in complex or protein tables
component CPLX0-7677 not found in complex or protein tables
component MONOMER0-1781 not found in complex or protein tables
component CPLX0-7702 not found in complex or protein tables
component CSRB-RNA not found in complex or protein tables
component RNPB-RNA not found in complex or protein tables
component RRSA-RRNA not found in complex or protein tables
component RRLA-RRNA not found in complex or protein tables
component RRFA-RRNA not found in complex or protein tables
component IS061-RNA not found in complex or protein tables
component CSRC-RNA not found in complex or protein tables
component FFS-RNA not found in complex or protein tables


In [14]:
parsed_complex_df = parsed_complex_df.loc[:, ["id", "stoichiometry", "monomer_component_stoichiometry", "cofactors"]]
parsed_complex_df

Unnamed: 0,id,stoichiometry,monomer_component_stoichiometry,cofactors
0,1-PFK,"{'1-PFK': 1, '1-PFK-MONOMER': -2}",{'1-PFK-MONOMER': 2},{MG+2}
1,2OXOGLUTARATEDEH-CPLX,"{'2OXOGLUTARATEDEH-CPLX': 1, 'E1O': -1, 'E2O':...","{'E1O-MONOMER': 12, 'E2O-MONOMER': 24, 'E3-MON...","{FAD, MG+2, THIAMINE-PYROPHOSPHATE, LIPOIC-ACID}"
2,3-ISOPROPYLMALDEHYDROG-CPLX,"{'3-ISOPROPYLMALDEHYDROG-CPLX': 1, '3-ISOPROPY...",{'3-ISOPROPYLMALDEHYDROG-MONOMER': 2},"{MN+2, MG+2}"
3,3-ISOPROPYLMALISOM-CPLX,"{'3-ISOPROPYLMALISOM-CPLX': 1, 'LEUC-MONOMER':...","{'LEUC-MONOMER': 1, 'LEUD-MONOMER': 1}",{CPD-7}
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,"{'3-METHYL-2-OXOBUT-OHCH3XFER-CPLX': 1, '3-CH3...",{'3-CH3-2-OXOBUTANOATE-OH-CH3-XFER-MONOMER': 10},{MG+2}
...,...,...,...,...
1063,CPLX0-8053,"{'CPLX0-8053': 1, 'EG10942-MONOMER': -1}",{'EG10942-MONOMER': 1},{}
1064,CPLX0-8253,"{'CPLX0-8253': 1, 'CSRC-RNA': -1, 'EG11447-MON...",{'EG11447-MONOMER': 9},{}
1065,SRP-CPLX,"{'SRP-CPLX': 1, 'EG10300-MONOMER': -1, 'FFS-RN...",{'EG10300-MONOMER': 1},{}
1066,CPLX0-7796APO,"{'CPLX0-7796APO': 1, 'PD04032': -2}",{'PD04032': 2},{}


## Create tree matrix (also for Julia)

In [15]:
# save names
complex_ids = list(parsed_complex_df['id'])
protein_ids = list(parsed_protein_df['id'])
cofactor_ids = list(parsed_cofactor_df['id'])


name_idx = complex_ids + protein_ids + cofactor_ids
tree_matrix = np.zeros([len(complex_ids) + len(protein_ids) + len(cofactor_ids), len(complex_ids) + len(protein_ids) + len(cofactor_ids)], dtype=np.int64)

for i in range(len(parsed_complex_df)):
    name = parsed_complex_df.at[i, 'id']
    tree_structure, nodes = recursive_component_tree(name, parsed_complex_df, parsed_protein_df, return_cofactors=True)

    for node in nodes:
        node_name = node['name']
        node_children = node['children']

        if node_children != None:
            for child_name, child_coefficient in node_children.items():
                if child_name in name_idx:
                        tree_matrix[name_idx.index(node_name), name_idx.index(child_name)] = child_coefficient

component CPLX0-7701 not found in complex or protein tables
component CPLX0-7677 not found in complex or protein tables
component MONOMER0-1781 not found in complex or protein tables
component CPLX0-7702 not found in complex or protein tables
component CSRB-RNA not found in complex or protein tables
component RNPB-RNA not found in complex or protein tables
component RRSA-RRNA not found in complex or protein tables
component RRLA-RRNA not found in complex or protein tables
component RRFA-RRNA not found in complex or protein tables
component IS061-RNA not found in complex or protein tables
component CSRC-RNA not found in complex or protein tables
component FFS-RNA not found in complex or protein tables


# Create matrices to get cofactor counts

In [16]:
parsed_pathway_df

Unnamed: 0,id,parents,children,level
0,Signaling-Pathways,[],"[PWY0-1559, PWY0-1495, PWY0-1518, PWY0-1490, P...",1
1,PWY0-1559,[Signaling-Pathways],[],2
2,PWY0-1495,[Signaling-Pathways],[],2
3,PWY0-1518,[Signaling-Pathways],[],2
4,PWY0-1490,[Signaling-Pathways],[],2
...,...,...,...,...
1159,GLUTATHIONESYN-PWY,[Reductants],[],4
1160,Butanediol-Biosynthesis,[Other-biosynthesis],[],3
1161,CYCLOPEPTIDES,[Other-biosynthesis],[],3
1162,6-HM-Dihydropterin-PP-Biosynthesis,[Other-biosynthesis],[PWY-6147],3


In [17]:
pathway_name_list = list(parsed_pathway_df['id'])

priority_list = ['Activation-Inactivation-Interconversion','Metabolic-Clusters', 'Macromolecule-Modification', 'Glycan-Pathways','Detoxification',  'Degradation']

# pathway matrix is necessary to traverse tree
pathway_matrix = np.zeros((len(pathway_name_list), len(pathway_name_list)), dtype=np.int64)
level_vector = np.zeros(len(pathway_name_list), dtype=np.int64)

for i in range(len(parsed_pathway_df)):

    cur_pathway = parsed_pathway_df.at[i, 'id']
    level_vector[i] = parsed_pathway_df.at[i, 'level']

    pathway_parents = parsed_pathway_df.at[i, 'parents']
    pathway_children = parsed_pathway_df.at[i, 'children']

    for parent in pathway_parents:
        j = pathway_name_list.index(parent)
        pathway_matrix[j, i] = 1

    for child in pathway_children:
        j = pathway_name_list.index(child)
        pathway_matrix[i, j] = 1

original_pathway_matrix = pathway_matrix.copy()

# get superpathway indices
super_pathway_idx = pathway_name_list.index('Super-Pathways')
super_pathway_children_idxs = np.where(pathway_matrix[super_pathway_idx, :] == 1)[0]

# zero out all superpathway children
pathway_matrix[:, super_pathway_children_idxs] = 0

# for columns (children) with multiple parents, if one parent leads to degradation or glycans, remove it.
for i in range(len(pathway_matrix[0, :])):
    cur_pathway = pathway_name_list[i]

    if pathway_matrix[:, i].sum() > 1:

        nz_idxs = np.where(pathway_matrix[:, i] == 1)[0]
        top_level_classes = [list(get_pathway_ith_level_parents(j, original_pathway_matrix, pathway_name_list, level_vector, level=1).keys())[0] for j in nz_idxs]
        # print(f"multiple parents {top_level_classes} for {cur_pathway}")


        # when there are multiple parents, remove them in the following order of priority:
        for priority in priority_list:
            while priority in top_level_classes and len(nz_idxs) > 1:
                priority_index = top_level_classes.index(priority)
                pathway_matrix[nz_idxs[priority_index], i] = 0
                nz_idxs = np.where(pathway_matrix[:, i] == 1)[0]
                top_level_classes[priority_index] = 'N/A'


        # then, if there are still multiple parents, remove all but the first one
        # TODO Change to parent with most frequently occuring 2nd parent.
        if len(nz_idxs) > 1:
            # top_two_level_classes = [list(get_pathway_ith_level_parents(j, original_pathway_matrix, pathway_name_list, level_vector, level=2).keys())[0] for j in nz_idxs]
            # print(f"multiple parents with 2nd level categories {top_two_level_classes} for {cur_pathway}")
            pathway_matrix[nz_idxs[1:], i] = 0

        nz_idxs = np.where(pathway_matrix[:, i] == 1)[0]
        top_level_classes = [list(get_pathway_ith_level_parents(j, original_pathway_matrix, pathway_name_list, level_vector, level=1).keys()) for j in nz_idxs]

        # print(f"pruned to {top_level_classes}")

In [18]:
# create protein name to pathway mapping
W = np.zeros((len(parsed_protein_df.index), len(parsed_pathway_df.index)))

for i in range(len(parsed_protein_df.index)):

    cur_pathways = parsed_protein_df.at[i, 'direct_annotations']

    for pathway in cur_pathways:
        pathway_idx = pathway_name_list.index(pathway)
        W[i, pathway_idx] = 1


for i in range(len(parsed_protein_df.index)):

    cur_protein_pathways_idxs = np.where(W[i, :] == 1)[0]
    cur_protein = parsed_protein_df.at[i, 'id']

    if len(cur_protein_pathways_idxs) < 2:
        continue

    else:

        # for pathway_idx in cur_protein_pathways_idxs:

        # get top level class of every pathway, and remove all but the first one of each class.

        cur_protein_pathway_parents = list()
        cur_protein_pathway_two_parents = list()

        for pathway_idx in cur_protein_pathways_idxs:
            top_parents = get_pathway_ith_level_parents(pathway_idx, pathway_matrix, pathway_name_list, level_vector, level=1)
            top_two_parents = get_pathway_ith_level_parents(pathway_idx, pathway_matrix, pathway_name_list, level_vector, level=2)

            if len(top_parents) > 1:
                print(f"multiple parents {top_parents} for {pathway_name_list[pathway_idx]} for {cur_protein}, should not happen.")

            if len(top_parents) == 1:
                cur_protein_pathway_parents.append(list(top_parents.keys())[0])
                cur_protein_pathway_two_parents.append(list(top_two_parents.keys())[0])
            else:
                # remove pathway with no parents
                W[i, pathway_idx] = 0
                cur_protein_pathway_parents.append('N/A')
                cur_protein_pathway_two_parents.append('N/A')


        # TODO - remove direct annotations with deprioritized parents
        for priority in priority_list:
            while priority in cur_protein_pathway_parents and len(np.where(W[i, :] == 1)[0]) > 1:
                priority_index = cur_protein_pathway_parents.index(priority)
                W[i, cur_protein_pathways_idxs[priority_index]] = 0
                cur_protein_pathway_parents[priority_index] = 'N/A'


        # remove all N/A
        # cur_protein_pathways_idxs = cur_protein_pathways_idxs[cur_protein_pathway_parents != 'N/A']

        if len(np.unique(cur_protein_pathway_parents)) < 2:
            continue


In [19]:
# some post-analysis fixes
# replace these 2nd level pathways:
replacement_pathways = {'Aminoacyl-tRNAs-Charging': 'TRNA-CHARGING-PWY'}
for key, value in replacement_pathways.items():
    pathway_idx_to_replace = pathway_name_list.index(key)
    proteins_with_pathway = np.where(W[:, pathway_idx_to_replace] == 1)[0]

    pathway_idx_to_replace_with = pathway_name_list.index(value)

    W[proteins_with_pathway, pathway_idx_to_replace] = 0
    W[proteins_with_pathway, pathway_idx_to_replace_with] = 1

# normalize all W outputs to 1
W = W / (W.sum(axis=1, keepdims=1) + 1e-10)

In [20]:
# create pathway to 2nd layer pathway mapping
W2 = np.zeros((len(parsed_pathway_df.index), len(parsed_pathway_df.index)))

for i in range(len(parsed_pathway_df.index)):

    cur_pathway = parsed_pathway_df.at[i, 'id']
    cur_pathway_idx = pathway_name_list.index(cur_pathway)

    # get 2nd level parents
    cur_pathway_parents = get_pathway_ith_level_parents(cur_pathway_idx, pathway_matrix, pathway_name_list, level_vector, level=2)

    # if len(cur_pathway_parents) > 1:
    #     print(f"cur pathway {cur_pathway} has parents {cur_pathway_parents}")

    for parent in cur_pathway_parents:
        parent_idx = pathway_name_list.index(parent)
        W2[i, parent_idx] = 1

# zero diagonal (don't return self, since some pathways return themselves as level 2 parents)
np.fill_diagonal(W2, 0)

W1 = np.zeros((len(parsed_pathway_df.index), len(parsed_pathway_df.index)))

for i in range(len(parsed_pathway_df.index)):

    if parsed_pathway_df.at[i, 'level'] <= 2:
        cur_pathway = parsed_pathway_df.at[i, 'id']
        cur_pathway_idx = pathway_name_list.index(cur_pathway)

        # get 2nd level parents
        cur_pathway_parents = get_pathway_ith_level_parents(cur_pathway_idx, pathway_matrix, pathway_name_list, level_vector, level=1)

        for parent in cur_pathway_parents:
            parent_idx = pathway_name_list.index(parent)
            W1[i, parent_idx] = 1

np.fill_diagonal(W1, 0)



In [21]:

array_name_list = np.array(pathway_name_list)
pathway = 'PWY0-1321'
pathway_idx = pathway_name_list.index(pathway)

get_pathway_ith_level_parents(pathway_idx, pathway_matrix, pathway_name_list, level_vector, level=2)

{'Electron-Transfer': 2}

In [22]:
pwy_vec = np.zeros(len(parsed_pathway_df.index))
pwy_vec[pathway_idx] = 1

pathway_name_list[np.where(pwy_vec @ W2)[0][0]]

'Electron-Transfer'

In [23]:
# C matrix transforms complexes + monomers to just monomers.

# create protein name to index mapping
protein_name_to_index = {}
for i in range(len(parsed_protein_df.index)):
    protein_name_to_index[parsed_protein_df.at[i, 'id']] = i

# C matrix: complexes x proteins
C = np.zeros((len(parsed_complex_df.index), len(parsed_protein_df.index)))

for i in range(len(parsed_complex_df.index)):

    complex_components = parsed_complex_df.loc[i, 'monomer_component_stoichiometry']

    # TODO consider cofactors
    # complex_cofactors = filter_complex_df.loc[i, 'cofactors']


    for component_name, component_count in complex_components.items():
        if component_count is not None:             # side effect of parquet
            # get index of component in filter_protein_df
            component_index = protein_name_to_index[component_name]

            if parsed_complex_df.at[i, 'id'] == 'APORNAP-CPLX':
                print(f'component_name: {component_name}, component_count: {component_count}, component_index: {component_index}')

            C[i, component_index] = component_count

# append an identity matrix to C
C = np.concatenate((C, np.identity(len(parsed_protein_df.index))), axis=0)

C_names = list(parsed_complex_df['id']) + list(parsed_protein_df['id'])

component_name: EG10893-MONOMER, component_count: 2, component_index: 697
component_name: RPOC-MONOMER, component_count: 1, component_index: 4108
component_name: RPOB-MONOMER, component_count: 1, component_index: 4107


In [24]:
# P matrix transforms proteins to their respective cofactor counts.

# create cofactor name to index mapping
cofactor_name_to_index = {}
for i in range(len(parsed_cofactor_df.index)):
    cofactor_name_to_index[parsed_cofactor_df.at[i, 'id']] = i

cofactor_ids = list(parsed_cofactor_df['id'])

# P matrix: proteins x cofactors
P = np.zeros((len(parsed_protein_df.index), len(parsed_cofactor_df.index)))

for i in range(len(parsed_protein_df.index)):
    protein_metals = parsed_protein_df.loc[i, 'metal_features_processed']
    protein_other = parsed_protein_df.loc[i, 'other_features_processed']

    for metal, count in protein_metals.items():
        if count is not None:             # side effect of parquet
            cofactor_index = cofactor_name_to_index[metal]
            P[i, cofactor_index] = count

    for other, count in protein_other.items():
        if count is not None:             # side effect of parquet
            cofactor_index = cofactor_name_to_index[other]
            P[i, cofactor_index] = count


In [25]:
P

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
# E matrix transforms cofactors to their respective elemental composition

# create list of unique elements
unique_elements = set()
for i in range(len(parsed_cofactor_df.index)):
    cofactor = parsed_cofactor_df.at[i, 'elemental_composition']
    unique_elements.update(cofactor.keys())

unique_elements = list(unique_elements)

# create E matrix: cofactors x elements
E = np.zeros((len(parsed_cofactor_df.index), len(unique_elements)))

for i in range(len(parsed_cofactor_df.index)):
    cofactor = parsed_cofactor_df.at[i, 'elemental_composition']

    for element, count in cofactor.items():
        if count is not None:             # side effect of parquet
            element_index = unique_elements.index(element)
            E[i, element_index] = count


element_ids = unique_elements

In [27]:
unique_elements

['CU',
 'FE',
 'MG',
 'NI',
 'O',
 'MO',
 'H',
 'S',
 'MN',
 'K',
 'ZN',
 'SE',
 'N',
 'NA',
 'CO',
 'C',
 'P',
 'R',
 'CA']

In [28]:
C_to_E = C @ P @ E

In [29]:
C_names[1577]

'EG10471-MONOMER'

# Now ... add the counts >:o

In [30]:
time = '50'
date = '2023-06-09'
experiment = 'fba-redux'
entry = f'{experiment}_{time}_{date}'
folder = f'out/fbagd/{entry}/'

In [31]:
output = np.load(folder + 'output.npy',allow_pickle='TRUE').item()
# output = np.load(r"out/geneRxnVerifData/output_glc.npy", allow_pickle=True, encoding='ASCII').tolist()
output = output['agents']['0']
fba = output['listeners']['fba_results']
mass = output['listeners']['mass']
bulk = pd.DataFrame(output['bulk'])

In [32]:
bulk

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40879,40880,40881,40882,40883,40884,40885,40886,40887,40888
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1934,1006,864,1590,2772,3232
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1936,1006,864,1590,2774,3232
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1938,1006,864,1590,2774,3234
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1940,1006,864,1590,2775,3234
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1941,1006,864,1591,2776,3234
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1941,1006,864,1591,2776,3235
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1942,1007,866,1592,2776,3237
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1941,1007,866,1594,2778,3237
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1942,1007,866,1595,2780,3238
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1942,1008,866,1595,2781,3239


In [33]:
f = open(folder + 'agent_steps.pkl', 'rb')
agent = dill.load(f)
f.close()

metabolism = agent['ecoli-metabolism-redux']
stoichiometry = metabolism.stoichiometry


In [34]:
initial_state = json.load(open('data/wcecoli_t0.json'))

bulk_ids = [item[0] for item in initial_state['bulk']]

bulk.columns = bulk_ids

In [35]:
ecocyc_to_wcm_map = {}

# combined complex and protein names
complex_protein_names = list(parsed_protein_df['id']) + list(parsed_complex_df['id'])

for name in complex_protein_names:

    # find complex name in bulk_ids
    found = False

    try:
        idx = bulk_ids.index(name+'[c]')
        ecocyc_to_wcm_map[name] = name+'[c]'
        found = True
        # print(f'found {complex_name} at {idx}')

    except ValueError:
        # delete key
        found = False


    if found == False:

        for id in bulk_ids:
            if name+'[' in id and id.startswith(name) and bulk.loc[:, id].sum() > 0:
                #print(f'found {name} in {id} with nonzero count')
                ecocyc_to_wcm_map[name] = id
                found = True
                break           # ensures preferring nonzero counts

            elif name+'[' in id and id.startswith(name):
                # print(f'found {name} in {id} with zero count')
                ecocyc_to_wcm_map[name] = id
                found = True

    if found == False:
        ecocyc_to_wcm_map[name] = '--TRANS-ACENAPHTHENE-12-DIOL[j]' # should be none
        print(f'could not find {name}')


could not find MONOMER0-1241
could not find MONOMER0-4223


In [36]:
complex_wcm_names = [ecocyc_to_wcm_map[name] for name in C_names]

counts = bulk.loc[0, complex_wcm_names]

# Finally ... add the counts >:o

In [37]:
factored_cofactor_elements = np.array(counts).reshape(-1,1) * C @ P @ E

In [38]:
factored_cofactor_elements

array([[   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,  344.,    0.,    0.],
       [   0.,    0., 2098., ...,    0.,    0.,    0.],
       ...,
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.]])

In [39]:
{unique_elements[i]: factored_cofactor_elements.sum(axis=0)[i] for i in range(len(unique_elements))}

{'CU': 4872.0,
 'FE': 219263.0,
 'MG': 246699.0,
 'NI': 8625.0,
 'O': 834542.0,
 'MO': 1642.0,
 'H': 702.0,
 'S': 251064.0,
 'MN': 40243.0,
 'K': 14221.0,
 'ZN': 149239.0,
 'SE': 22.0,
 'N': 345984.0,
 'NA': 1003.0,
 'CO': 1018.0,
 'C': 1554692.0,
 'P': 119428.0,
 'R': 70880.0,
 'CA': 10028.0}

In [40]:
parsed_protein_df[parsed_protein_df['id'] == 'FDOG-MONOMER']

Unnamed: 0,id,common_name,enzyme_reaction,cofactors,direct_annotations,metal_features_processed,other_features_processed
1710,FDOG-MONOMER,formate dehydrogenase O subunit &alpha;,{},{},"{PWY0-1356, PWY0-1321, PWY0-1585, PWY0-1355}","{'L-SELENOCYSTEINE': 1, 'CPD-7': 1}",{}


In [41]:
fdog_idx = C_names.index('FDOG-MONOMER')
factored_cofactor_elements[fdog_idx, :]

array([ 0., 48.,  0.,  0., 24.,  0.,  0., 48.,  0.,  0.,  0., 12., 12.,
        0.,  0., 36.,  0.,  0.,  0.])

In [42]:
factored_cofactors = np.array(counts).reshape(-1,1) * C @ P
{cofactor_ids[i]: factored_cofactors.sum(axis=0)[i] for i in range(len(cofactor_ids))}

{'FAD': 21050.0,
 'Heme-b': 7940.0,
 'FE+2': 38789.0,
 'NI+2': 8625.0,
 'CA+2': 10028.0,
 'L-SELENOCYSTEINE': 22.0,
 'CPD-7': 32055.0,
 'MN+2': 40243.0,
 '3FE-4S': 2958.0,
 'ZN+2': 149239.0,
 'CU+2': 4872.0,
 'K+': 14221.0,
 'CO+2': 1018.0,
 'NA+': 1003.0,
 'THIAMINE-PYROPHOSPHATE': 4224.0,
 'PYRIDOXAL_PHOSPHATE': 55966.0,
 'FMN': 12680.0,
 'CPD-3': 1408.0,
 'CPD-6': 17720.0,
 'CPD-8123': 234.0,
 'Any+2': 12330.0,
 'MG+2': 246699.0}

In [43]:
np.where(P[:,cofactor_ids.index('L-SELENOCYSTEINE')])[0]

array([1707, 1710, 1745])

In [44]:
cidx = C_names.index('APORNAP-CPLX')
{unique_elements[i]: factored_cofactor_elements[cidx,i] for i in range(len(unique_elements)) if factored_cofactor_elements[cidx,i] > 0}

{'MG': 2674.0, 'ZN': 5348.0}

In [45]:
C_names.index('CARBPSYN-CPLX')

138

In [46]:
parsed_protein_df[parsed_protein_df['id'] == 'CARBPSYN-LARGE']

Unnamed: 0,id,common_name,enzyme_reaction,cofactors,direct_annotations,metal_features_processed,other_features_processed
222,CARBPSYN-LARGE,carbamoyl phosphate synthetase subunit &beta;,{},{},"{PWY-5686, GLUTAMINDEG-PWY, AMMASSIM-PWY, ARGS...",{'MN+2': 4},{}


In [47]:
parsed_complex_df[parsed_complex_df['id'] == 'CARBPSYN-CPLX']

Unnamed: 0,id,stoichiometry,monomer_component_stoichiometry,cofactors
138,CARBPSYN-CPLX,"{'CARBPSYN-CPLX': 1, 'CARBPSYN-LARGE': -2, 'CA...","{'CARBPSYN-LARGE': 2, 'CARBPSYN-SMALL': 2}",{MG+2}


In [48]:
test_vec = np.zeros(len(counts))
test_vec[1] = 1

np.where((C @ W @ W2 @ W1).T @ test_vec)[0]

array([588, 658])

In [49]:
iron_idx = element_ids.index('FE')
iron_counts = (np.array(counts).reshape(-1,1) * C @ P @ E)[:, iron_idx]

In [50]:
# each complex count of 1 counts as 1 total annotation, so need to normalize
C_n = C / (C.sum(axis=1, keepdims=1) + 1e-10)


nz_idx = np.where((C_n @ W @ W2).T @ iron_counts)[0]
iron_counts_group = (C_n @ W @ W2).T @ iron_counts

In [51]:
iron_counts_group.sum()

104986.99998688056

In [52]:
iron_counts.sum()

219263.0

In [53]:
for idx in nz_idx:
    if iron_counts_group[idx] > 1000:
        print(pathway_name_list[idx], int(iron_counts_group[idx]))

REACTIVE-OXYGEN-SPECIES-DEGRADATION 1049
Fermentation 3704
TCA-VARIANTS 11897
Electron-Transfer 19565
Cofactor-Biosynthesis 32051
Nucleotide-Biosynthesis 2935
SECONDARY-METABOLITE-BIOSYNTHESIS 5890
Amino-Acid-Biosynthesis 26506


In [54]:
# iron_count_idxs = np.where(iron_counts)[0]
#
# for idx in iron_count_idxs:
#     print(f'{C_names[idx]}: {int(iron_counts[idx])}')
#     test_vec = np.zeros(len(iron_counts))
#     test_vec[idx] = iron_counts[idx]
#
#
#     nz_idx = np.where((C_n @ W @ W2).T @ test_vec)[0]
#
#     if len(nz_idx) > 0:
#         category_count = int(((C_n @ W @ W2).T @ test_vec)[nz_idx][0])
#         print(f'Transformed: {category_count}')
#     else:
#         print('Transformed: 0!')

# Tests

First, let's test that the cofactor counts are correct for SULFITE-REDUCT-CPLX and its constituent BETACOMP-MONOMER

In [55]:
print(f'Sulfite reductase complex count: {counts[1020]}, hemoprotein subunit count: {counts[1271]}')
print(f'Expected iron count: {counts[1020] * 5 * 4} for the first, {counts[1271] * 5 } for the second')

Sulfite reductase complex count: 92, hemoprotein subunit count: 373
Expected iron count: 1840 for the first, 1865 for the second


Let's check the pathway counts for the related pathway.

In [56]:
cplx_pathways = pathway_name_list[ np.where((C @ W)[1020, :])[0][0] ]
monomer_pathways = pathway_name_list[ np.where((C @ W)[1271, :])[0][0] ]


print(f'Sulfite reductase is in the following pathways: {cplx_pathways}, while the hemoprotein is in {monomer_pathways}')

Sulfite reductase is in the following pathways: SO4ASSIM-PWY, while the hemoprotein is in SO4ASSIM-PWY


That means the pathway counts for iron should be the same as the cplx counts (or larger)

In [57]:
pathway_counts = (iron_counts @ C_n @ W)[  np.where((C @ W)[1020, :])[0][0]  ]

print(f'Pathway counts for iron: {pathway_counts}')

Pathway counts for iron: 3704.9999994276664


In [58]:
print(f"""Monomer index: {C_names.index('SUPEROX-DISMUTFE-MONOMER')}, complex index: {C_names.index('SUPEROX-DISMUTFE-CPLX')}""")

Monomer index: 5232, complex index: 1021


In [59]:
print(f"""Monomer count: {counts[5232]}, complex count: {counts[1021]}""")

Monomer count: 0, complex count: 161


In [60]:
print(f"""Monomer index: {C_names.index('SUPEROX-DISMUTMN-MONOMER')}, complex index: {C_names.index('SUPEROX-DISMUTMN-CPLX')}""")
print(f"""Monomer count: {counts[5233]}, complex count: {counts[1022]}""")

Monomer index: 5233, complex index: 1022
Monomer count: 1, complex count: 9973


In [61]:
mn_idx = element_ids.index('MN')
mn_counts = (np.array(counts).reshape(-1, 1) * C @ P @ E)[:, mn_idx]
# each complex count of 1 counts as 1 total annotation, so need to normalize
C_n = C / (C.sum(axis=1, keepdims=1) + 1e-10)

nz_idx = np.where((C_n @ W @ W2).T @ mn_counts)[0]
mn_counts_group = (C_n @ W @ W2).T @ mn_counts

In [62]:
cplx_pathways = pathway_name_list[ np.where((C @ W)[5233, :])[0][0] ]
monomer_pathways = pathway_name_list[ np.where((C @ W)[1022, :])[0][0] ]


print(f'Superoxide dismutase is in the following pathways: {cplx_pathways}, while the monomer is in {monomer_pathways}')

Superoxide dismutase is in the following pathways: DETOX1-PWY, while the monomer is in DETOX1-PWY


In [63]:
pathway_counts = (mn_counts @ C_n @ W @ W2)[  np.where((C @ W @ W2)[5233, :])[0][0]  ]
pathway_counts

19946.9999970079

In [64]:
mn_counts[1022]

19946.0

# Saving outputs to files compatible with Julia

In [65]:
# save C, P and E to julia-compatible formats
np.savetxt('notebooks/cofactors/data/C_matrix.csv', C.astype(np.int64), delimiter=',', fmt='%i')
np.savetxt('notebooks/cofactors/data/P_matrix.csv', P.astype(np.int64), delimiter=',', fmt='%i')
np.savetxt('notebooks/cofactors/data/E_matrix.csv', E.astype(np.int64), delimiter=',', fmt='%i')
np.savetxt('notebooks/cofactors/data/W_matrix.csv', W.astype(np.float64), delimiter=',')
np.savetxt('notebooks/cofactors/data/W1_matrix.csv', W1.astype(np.float64), delimiter=',')
np.savetxt('notebooks/cofactors/data/W2_matrix.csv', W2.astype(np.float64), delimiter=',')


# write all names to single file with each list on a new line
with open('notebooks/cofactors/data/complex_ids.txt', 'w') as f:
    f.write('\n'.join(complex_ids))
with open('notebooks/cofactors/data/protein_ids.txt', 'w') as f:
    f.write('\n'.join(protein_ids))
with open('notebooks/cofactors/data/cofactor_ids.txt', 'w') as f:
    f.write('\n'.join(cofactor_ids))
with open('notebooks/cofactors/data/element_ids.txt', 'w') as f:
    f.write('\n'.join(element_ids))
with open('notebooks/cofactors/data/pathway_ids.txt', 'w') as f:
    f.write('\n'.join(pathway_name_list))


# save counts of proteins and complexes
np.savetxt('notebooks/cofactors/data/counts.csv', np.array(counts, dtype=np.int64), delimiter=',', fmt='%i')

# save tree_matrix
np.savetxt('notebooks/cofactors/data/tree_matrix.csv', tree_matrix, delimiter=',', fmt='%i')



In [66]:
counts

1-PFK[c]                                 31
2OXOGLUTARATEDEH-CPLX[c]                 86
3-ISOPROPYLMALDEHYDROG-CPLX[c]         1049
3-ISOPROPYLMALISOM-CPLX[c]             2617
3-METHYL-2-OXOBUT-OHCH3XFER-CPLX[c]     171
                                       ... 
YTFR-MONOMER[m]                           0
YTFT-MONOMER[i]                           0
ZNUA-MONOMER[p]                         522
ZNUB-MONOMER[i]                           0
ZNUC-MONOMER[i]                          39
Name: 0, Length: 5488, dtype: int64

In [67]:
element_ids

['CU',
 'FE',
 'MG',
 'NI',
 'O',
 'MO',
 'H',
 'S',
 'MN',
 'K',
 'ZN',
 'SE',
 'N',
 'NA',
 'CO',
 'C',
 'P',
 'R',
 'CA']

In [68]:
for name in protein_ids:
    if 'CPLX' in name:
        print(name)

AICARTRANSIMPCYCLO-CPLX


In [69]:
counts[ecocyc_to_wcm_map["NAP-CPLX"]]

10

In [70]:
ecocyc_to_wcm_map["NAP-CPLX"]

'NAP-CPLX[p]'

In [71]:
"^"+'NAP-CPLX'+"[" in 'NAP-CPLX['

False

In [72]:
protein_ids[1]

'1-PFK-MONOMER'

## go annotation for proteins