# Setup

In [1]:
import numpy as np

import seaborn as sns
import pandas as pd
import os
import pprint
import ast
import re
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict
import json

# Shared pretty-printer for inspecting nested structures; nesting deeper than 6 levels is elided.
pp = pprint.PrettyPrinter(depth=6)

# Every relative path used below (e.g. 'notebooks/cofactors/data/...') is resolved against
# the vivarium-ecoli checkout, which is expected to live in the user's home directory.
os.chdir(os.path.expanduser('~/vivarium-ecoli'))

# Metal / cofactor names as they appear in the raw feature text, mapped to the
# standardized compound name used in the processed tables.
#
# The keys are regular-expression fragments: they are joined with '|' to build the
# metal-matching pattern, so alternation order follows dict order and parentheses
# must be escaped. Keys containing escapes are raw strings so the backslashes reach
# the regex engine unchanged (a plain '\(' is an invalid string escape).
ALLOWED_METAL_NAMES = {
    'Iron': 'FE+2', 'Cobalt': 'CO+2', 'Copper': 'CU+2', 'Manganese': 'MN+2',
    'Molybdenum': 'CPD-8123', 'Nickel': 'NI+2', 'Zinc': 'ZN+2',
    'Calcium': 'CA+2', 'Magnesium': 'MG+2', 'Sodium': 'NA+', 'Potassium': 'K+',
    r'Iron-sulfur \(4Fe-4S\)': 'CPD-7', r'Iron-sulfur \(2Fe-2S\)': 'CPD-6',
    r'Iron-sulfur \(4Fe-4S-S-AdoMet\)': 'CPD-7', r'Iron-sulfur \(3Fe-4S\)': '3FE-4S',
    r'Iron-oxo-sulfur \(4Fe-2O-2S\)': 'CPD-7',
    'Iron-sulfur': 'CPD-7',  # has to be after others since it is a substring of others
    'heme': 'Heme-b', 'Molybdate': 'CPD-3', 'heme B': 'Heme-b', 'Cobalamin': 'COB-I-ALAMIN',
    'Selenocysteine': 'L-SELENOCYSTEINE',
    'Divalent metal cation': 'Any+2',
}


# Non-metal features that are kept when processing 'other_features';
# any feature not listed here is ignored.
ACCEPTED_OTHER_FEATURES = {'PYRIDOXAL_PHOSPHATE', 'THIAMINE-PYROPHOSPHATE', 'FMN', 'FAD', 'LIPOIC-ACID', 'BIOTIN'}

# One-letter sequence symbols -> three-letter codes used as keys of AMINO_ACID_RESIDUE_MASSES.
# Besides the 20 standard residues this covers 'U' ('SEL', presumably selenocysteine) and
# '*' ('TER', which carries a mass of 0.0). Any other symbol in a sequence raises KeyError.
AMINO_ACID_MAP = {'A': 'ALA', 'C': 'CYS', 'D': 'ASP', 'E': 'GLU', 'F': 'PHE', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE',
                  'K': 'LYS', 'L': 'LEU', 'M': 'MET', 'N': 'ASN', 'P': 'PRO', 'Q': 'GLN', 'R': 'ARG', 'S': 'SER',
                  'T': 'THR', 'V': 'VAL', 'W': 'TRP', 'Y': 'TYR', 'U': 'SEL', '*': 'TER'}

# Temporarily allowed Gene Ontology terms used to fill gaps in pathway annotations
# (usually non-metabolic functions). Maps GO id -> display name; several ids may share a name,
# in which case they collapse into a single pseudo-pathway.
# Dict order matters: when a protein matches several terms, only the first match in this
# dict is used by the GO fallback loop.
# Currently: Tx Reg, Transcription, translation, DNA replication, Cell division, iron-sulfur cluster assembly, proteolysis, dna repair, copper response
TEMP_GO_TERMS = {'GO:0006355': 'Regulation of transcription', 'GO:0010468': 'Regulation of transcription',
                  'GO:0006351': 'Transcription', 'GO:0006350': 'Transcription',
                  'GO:0006412': 'Translation', 'GO:0006260': 'DNA replication', 'GO:0045454': 'Redox homeostasis',
                  'GO:0051301': 'Cell division', 'GO:0015288': 'Porin',
                  'GO:0009451': 'RNA modification', 'GO:0006400': 'tRNA modification', 'GO:0008033': 'tRNA processing',
                  'GO:0018339': 'Ribosome biogenesis', 'GO:0042254': 'Ribosome biogenesis', 'GO:0006364': 'rRNA processing',
                  'GO:0016226': 'Iron-sulfur cluster assembly', 'GO:0006508': 'Proteolysis', 'GO:0006281': 'DNA repair',
                  'GO:0006879': 'Iron homeostasis', 'GO:0033214': 'Iron homeostasis', 'GO:0015685': 'Iron homeostasis',  'GO:0015687': 'Iron homeostasis',
                  # Previous per-term names, kept for reference:
                  # 'GO:0006457': 'Protein folding',
                  # 'GO:0030091': 'Protein repair',
                  # 'GO:0006605': 'Protein targeting', ' GO:0015031': 'Protein transport',
                  # 'GO:0051205': 'Protein insertion',
                  # These are now summarized under a single name:
                  'GO:0006457': 'Protein folding, localization and repair', 'GO:0030091': 'Protein folding, localization and repair', 
                  'GO:0006605': 'Protein folding, localization and repair', 'GO:0015031': 'Protein folding, localization and repair', 
                  'GO:0051205': 'Protein folding, localization and repair',
                  'GO:0046688': 'Response to copper ion', 
                  # 'GO:ZZZ' and 'GO:STRUCTURE' are notebook-internal placeholder ids that get injected
                  # into go_annotations by the specific-adjustments cell.
                  'GO:ZZZ': 'Non-porin small-molecule transport', 'GO:0071702': 'Non-porin small-molecule transport',
                  'GO:STRUCTURE': 'Structural maintenance',}

# 

# Mass of each amino acid as a residue, keyed by the three-letter codes of AMINO_ACID_MAP.
# Summed per sequence to obtain 'sequence_mass'. 'TER' (the '*' symbol) contributes nothing.
# Presumably in daltons, with the residue value being the free amino acid minus one water — TODO confirm.
# NOTE(review): the 'SEL' value looks like it may be on a different mass scale
# (average vs. monoisotopic) than the other entries — confirm against the source table.
AMINO_ACID_RESIDUE_MASSES = {
    'ALA': 71.03711, 'ARG': 156.10111, 'ASN': 114.04293, 'ASP': 115.02694,
    'CYS': 103.00919, 'GLU': 129.04259, 'GLN': 128.05858, 'GLY': 57.02146,
    'HIS': 137.05891, 'ILE': 113.08406, 'LEU': 113.08406, 'LYS': 128.09496,
    'MET': 131.04049, 'PHE': 147.06841, 'PRO': 97.05276, 'SER': 87.03203,
    'THR': 101.04768, 'TRP': 186.07931, 'TYR': 163.06333, 'VAL': 99.06841,
    'SEL': 150.0379, 'TER': 0.0
}


# Monomer ids whose raw metal features are treated as wrong; their 'metal_features'
# list is emptied before the metal annotations are processed.
erroneous_monomer_metal_interactions = ["EG11415-MONOMER", "EG12132-MONOMER",
                                        "EG12332-MONOMER", "EG11378-MONOMER", "G7748-MONOMER", 
                                        "CRR-MONOMER", "EG11663-MONOMER", "GLYOXI-MONOMER",
                                        "EG10697-MONOMER", "EG12310-MONOMER", "EG10694-MONOMER", "EG10698-MONOMER", #Mn
                                        "PPENTOMUT-MONOMER", ] #Mn 

# Monomer ids whose direct pathway annotations are treated as wrong.
erroneous_pathway_annotations = ['EG10695-MONOMER']



def get_pathway_ith_level_parents(cur_pathway_idx, pathway_matrix, name_list, level_vector, level=2, parent_dict=None):
    """
    Collect the pathways at hierarchy depth `level` found by walking upward from a pathway.

    The starting pathway itself is included when it sits at the requested level.
    The parents of pathway j are the rows i for which `pathway_matrix[i, j]` is non-zero;
    the walk continues through all ancestors regardless of whether a match was found.

    Args:
        cur_pathway_idx: index of the pathway to start from.
        pathway_matrix: 2-D array; a non-zero entry [i, j] marks i as a parent of j.
        name_list: pathway names, indexed like the matrix.
        level_vector: hierarchy level of each pathway, indexed like the matrix.
        level: the level whose members should be collected.
        parent_dict: accumulator shared across the recursive calls; created when omitted.

    Returns:
        dict mapping pathway name -> level for every matching pathway encountered.
    """
    found = {} if parent_dict is None else parent_dict

    own_level = level_vector[cur_pathway_idx]
    if own_level == level:
        found[name_list[cur_pathway_idx]] = own_level

    # recurse into every direct parent; matches accumulate in the shared dict
    incoming = pathway_matrix[:, cur_pathway_idx]
    for parent_idx in np.nonzero(incoming != 0)[0]:
        get_pathway_ith_level_parents(parent_idx, pathway_matrix, name_list, level_vector, level, found)

    return found

## Reload data

In [2]:
def _read_literal_csv(path, literal_columns):
    """Read a CSV and turn the given columns from Python-literal strings back into objects."""
    table = pd.read_csv(path, index_col=False)
    for name in literal_columns:
        table[name] = table[name].apply(ast.literal_eval)
    return table


# Container-valued columns (sets, dicts, lists) are stored in the CSVs as Python literals
# and are parsed back here.
parsed_complex_df = _read_literal_csv(
    'notebooks/cofactors/data/raw_complexes.csv',
    ['stoichiometry', 'cofactors', 'enzyme_reaction'],
)

parsed_protein_df = _read_literal_csv(
    'notebooks/cofactors/data/raw_proteins.csv',
    ['cofactors', 'enzyme_reaction', 'metal_features', 'other_features', 'direct_annotations', 'go_annotations'],
)

parsed_cofactor_df = _read_literal_csv(
    'notebooks/cofactors/data/raw_cofactors.csv',
    ['elemental_composition'],
)

parsed_pathway_df = _read_literal_csv(
    'notebooks/cofactors/data/raw_pathways.csv',
    ['parents', 'children'],
)




# add GO "pathways" to pathway df
# Several GO ids share one display name; keep each name once, in first-seen order.
already_seen = list(dict.fromkeys(TEMP_GO_TERMS.values()))

# Synthetic three-level hierarchy: 'Other functions' -> <GO name> -> '<GO name> (child)'
go_collection = [{'id': 'Other functions', 'common_name': 'Other functions', 'level': 1,
                  'parents': [], 'children': list(already_seen)}]

for go_name in already_seen:
    go_name_child = go_name + ' (child)'
    go_collection.extend([
        {'id': go_name, 'common_name': go_name, 'level': 2, 'parents': ['Other functions'], 'children': [go_name_child]},
        {'id': go_name_child, 'common_name': go_name_child, 'level': 3, 'parents': [go_name], 'children': []},
    ])

go_df = pd.DataFrame(go_collection)

parsed_pathway_df = pd.concat([parsed_pathway_df, go_df], ignore_index=True)
# parsed_pathway_df


# add membrane protein areas
membrane_monomers = pd.read_csv('notebooks/cofactors/data/monomer_vs_area.csv', index_col=False)

# the complex table names its identifier column 'complex'; align it with the monomer table
membrane_complexes = (
    pd.read_csv('notebooks/cofactors/data/complex_vs_area.csv', index_col=False)
    .rename(columns={'complex': 'id'})
)

# stack both tables and keep only the identifier and the area column
membrane_proteins = pd.concat([membrane_monomers, membrane_complexes], ignore_index=True)[['id', 'area_trans']]

# Data processing into final tables
## Specific adjustments

In [3]:
# Manual curation: undo specific troublesome annotations that have been discovered
# in the raw data and should not be in the model.

# remove all metal features from monomers with known-erroneous metal interactions
for protein in erroneous_monomer_metal_interactions:
    prot_idx = parsed_protein_df.index[parsed_protein_df['id'] == protein][0]
    parsed_protein_df.at[prot_idx, 'metal_features'] = []

# classify folE gene use as cofactor production (THF)
pathway_idx = parsed_pathway_df[parsed_pathway_df['id'] == '6-HM-Dihydropterin-PP-Biosynthesis'].index[0]
parsed_pathway_df.at[pathway_idx, 'parents'] = ['Cofactor-Biosynthesis']

protein_idx = parsed_protein_df[parsed_protein_df['id'] == 'GTP-CYCLOHYDRO-I-MONOMER'].index[0]
parsed_protein_df.at[protein_idx, 'direct_annotations'] = set(['PWY-6147'])

# remove pathway from erroneous pathway annotations: clearing the set means these
# proteins are categorized later through their GO terms (or as uncategorized)
for protein in erroneous_pathway_annotations:
    prot_idx = parsed_protein_df.index[parsed_protein_df['id'] == protein][0]
    parsed_protein_df.at[prot_idx, 'direct_annotations'] = set()

# for all proteins with "port" in common_name, add "GO:ZZZ" to go_annotations
# (missing names are not strings and are skipped)
for i in range(len(parsed_protein_df.index)):
    common_name = parsed_protein_df.at[i, 'common_name']
    if isinstance(common_name, str) and 'port' in common_name.lower():
        parsed_protein_df.at[i, 'go_annotations'].add('GO:ZZZ')

# add lpp to structural maintenance
# note: this replaces the existing GO annotations instead of extending them
for protein in ['EG10544-MONOMER', 'EG12117-MONOMER']:
    prot_idx = parsed_protein_df.index[parsed_protein_df['id'] == protein][0]
    parsed_protein_df.at[prot_idx, 'go_annotations'] = set(['GO:STRUCTURE'])



## Add membrane protein areas to parsed_protein_df

## Process raw EcoCyc annotations into standard EcoCyc names

In [4]:

# Keys of ALLOWED_METAL_NAMES are regex fragments; removing the escaping backslashes
# gives the literal text a match will contain, which is used for the final lookup.
NON_REGEX_METAL = {key.replace('\\', ''): value for key, value in ALLOWED_METAL_NAMES.items()}

# object-dtype column so that each cell can hold a dict
parsed_protein_df['metal_features_processed'] = 0
parsed_protein_df['metal_features_processed'] = parsed_protein_df['metal_features_processed'].astype(object)

# A metal name followed by " <digit><punct>", "<punct>" or " (", where punct is one of . , ;
# Raw f-string: \s, \d, \. and \( must reach the regex engine as written.
metal_pattern = '|'.join(ALLOWED_METAL_NAMES.keys())
metal_regex = re.compile(rf'(({metal_pattern})(\s\d[\.,;]|[\.,;]|\s\())')


for i in range(len(parsed_protein_df.index)):

    metal_binding = parsed_protein_df.loc[i, 'metal_features']

    # standardized metal name -> number of distinct annotations for it
    metal_count_dict = {}
    existing_matches = set()

    for feature in metal_binding:
        matches = metal_regex.search(feature)
        if matches:
            # drop the trailing delimiter character of the match
            metal = matches.group(0)[:-1]

            # eliminate duplicates (numbered sites such as "Zinc 1" / "Zinc 2" stay distinct)
            if metal not in existing_matches:

                existing_matches.add(metal)

                # iron / cobalt bound as part of heme / cobalamin counts as that cofactor
                if 'heme' in feature:
                    metal = metal.replace('Iron', 'heme')
                if 'alamin' in feature:
                    metal = metal.replace('Cobalt', 'Cobalamin')

                # check if last char of metal is a number, then crop (space + digit)
                if metal[-1].isdigit():
                    metal = metal[:-2]

                metal = metal.strip()

                # replace metal name with allowed metal name
                metal = NON_REGEX_METAL[metal]

                metal_count_dict[metal] = metal_count_dict.get(metal, 0) + 1

        else:
            print(f'No match for {feature} in {parsed_protein_df.loc[i, "id"]}')

    # EXCEPTIONS
    # if both magnesium and manganese are present with equal counts, keep only magnesium
    # TODO remove when using UniProt data. Ecocyc data is not as reliable
    if 'MG+2' in metal_count_dict and 'MN+2' in metal_count_dict and metal_count_dict['MG+2'] == metal_count_dict['MN+2']:
        del metal_count_dict['MN+2']
    # same with cobalt; any remaining cobalt is relabelled as magnesium
    # (this overwrites an existing MG+2 count when the two differ)
    if 'CO+2' in metal_count_dict and 'MG+2' in metal_count_dict and metal_count_dict['CO+2'] == metal_count_dict['MG+2']:
        del metal_count_dict['CO+2']
    elif 'CO+2' in metal_count_dict:
        metal_count_dict['MG+2'] = metal_count_dict['CO+2']
        del metal_count_dict['CO+2']

    parsed_protein_df.at[i, 'metal_features_processed'] = metal_count_dict

No match for UniProt: Magnesium or manganese. in 3-ISOPROPYLMALDEHYDROG-MONOMER
No match for conserved, Fe(III) binding motif in BASS-MONOMER
No match for predicted heme d ligand in CYDA-MONOMER
No match for UniProt: Zn(2+); catalytic. in CYTDEAM-MONOMER
No match for UniProt: Fe(2+); catalytic. in CYTDEAM-MONOMER
No match for The amino-terminus of ClpA contains a Zinc binding site. in EG10156-MONOMER
No match for The active-site magnesium ion is coordinated by three aspartate residues (401, 403, 555). Two of them form part of 
the PDXD active-site motif. in EG10238-MONOMER
No match for Divalent magnesium ions are chelated by three aspartate residues, two in the conserved DPD sequence (345, 347) 
and one in the conserved EGYMD sequence (269). in EG10239-MONOMER
No match for Based on crystal structures, Glu-265 and Asp-309 coordinate a divalent cation. in EG10239-MONOMER
No match for These residues are thought to coordinate the one or two divalent magnesium ions required for the 
gyrase 

In [5]:
# manual corrections in data

# 3-OXOACYL-ACP-REDUCT-MONOMER: remove all processed metal cofactors
correction_idx = parsed_protein_df.index[parsed_protein_df['id'] == '3-OXOACYL-ACP-REDUCT-MONOMER'][0]
parsed_protein_df.at[correction_idx, 'metal_features_processed'] = {}

# CARBPSYN-LARGE: re-assign the manganese count to magnesium
# (the entry is replaced as a whole, so any other metals on it are dropped;
# raises KeyError if no MN+2 was found for this protein)
correction_idx = parsed_protein_df.index[parsed_protein_df['id'] == 'CARBPSYN-LARGE'][0]
mn_cofactor_count = parsed_protein_df.at[correction_idx, 'metal_features_processed']['MN+2']
parsed_protein_df.at[correction_idx, 'metal_features_processed'] = {'MG+2': mn_cofactor_count}

# dps: remove all processed metal cofactors
correction_idx = parsed_protein_df.index[parsed_protein_df['id'] == 'EG11415-MONOMER'][0]
parsed_protein_df.at[correction_idx, 'metal_features_processed'] = {}

In [6]:
# The raw metal annotations are dropped now that 'metal_features_processed' has been filled in.
# Note: re-running this cell without reloading the data raises KeyError (column already gone).
parsed_protein_df = parsed_protein_df.drop(columns=['metal_features'])
parsed_protein_df

Unnamed: 0,id,common_name,seq,enzyme_reaction,cofactors,other_features,direct_annotations,go_annotations,uniprot_id,metal_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,MLYIFRLIITVIYSILVCVFGSIYCLFSPRNPKHVATFGHMFGRLA...,"{ENZRXN0-8629, ENZRXN0-7991, ENZRXN0-7992, 1-A...",{},[],"{PWY0-1319, PWY-5667}","{GO:0016020, GO:0016740, GO:0016024, GO:000665...",P26647,{}
1,1-PFK-MONOMER,1-phosphofructokinase,MSRRVATITLNPAYDLVGFCPEIERGEVNLVKTTGLHAAGKGINVA...,{},{},[ATP],{PWY0-1314},"{GO:0016773, GO:0016301, GO:0016740, GO:000552...",P0AEW9,{}
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,MKITVLGCGALGQLWLTALCKQGHEVQGWLRVPQPYCSVNLVETDG...,{2-DEHYDROPANTOATE-REDUCT-ENZRXN},{},[NADP],{PANTO-PWY},"{GO:0016616, GO:0005829, GO:0005737, GO:000867...",P0A9J4,{}
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,MSQQVIIFDTTLRDGEQALQASLSVKEKLQIALALERMGVDVMEVG...,{ENZRXN0-6250},{},[],{LEUSYN-PWY},"{GO:0003985, GO:0030145, GO:0016740, GO:000908...",P09151,{}
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",MVDKSQETTHFGFQTVAKEQKADMVAHVFHSVASKYDVMNDLMSFG...,"{ADOMET-DMK-METHYLTRANSFER-ENZRXN, 2-OCTAPRENY...",{},[],"{PWY-6708, MENAQUINONESYN-PWY}","{GO:0030580, GO:0005515, GO:0043770, GO:001674...",P0A887,{}
...,...,...,...,...,...,...,...,...,...,...
4429,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,MTTDQHQEILRTEGLSKFFPGVKALDNVDFSLRRGEIMALLGENGA...,{},{},[ATP],{},"{GO:0016020, GO:0042875, GO:0103116, GO:000552...",Q6BEX0,{}
4430,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,MMPQSLPDTTTPKRRFRWPTGMPQLVALLLVLLVDSLVAPHFWQVV...,{},{},[],{},"{GO:0016020, GO:0005515, GO:0022857, GO:ZZZ, G...",P39328,{}
4431,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,MLHKKTLLFAALSAALWGGATQAADAAVVASLKPVGFIASAIADGV...,{},{},[],{},"{GO:0016020, GO:0071578, GO:0042597, GO:ZZZ, G...",P39172,{'ZN+2': 1}
4432,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,MIELLFPGWLAGIMLACAAGPLGSFVVWRRMSYFGDTLAHASLLGV...,{},{},[],{},"{GO:0016020, GO:0071578, GO:ZZZ, GO:0055085, G...",P39832,{}


In [7]:
# object-dtype column so that each cell can hold a dict
parsed_protein_df['other_features_processed'] = 0
parsed_protein_df['other_features_processed'] = parsed_protein_df['other_features_processed'].astype(object)

# Every accepted feature ends up with a count of exactly 1: repeated entries of the
# same feature are collapsed before counting, and features outside
# ACCEPTED_OTHER_FEATURES are ignored.
for row in range(len(parsed_protein_df.index)):

    # dict.fromkeys de-duplicates while keeping first-seen order
    distinct_features = dict.fromkeys(parsed_protein_df.loc[row, 'other_features'])

    parsed_protein_df.at[row, 'other_features_processed'] = {
        name: 1 for name in distinct_features if name in ACCEPTED_OTHER_FEATURES
    }

In [8]:
# processed go_annotations. if no pathways exist for monomer, use go annotations
for row in range(len(parsed_protein_df.index)):

    # proteins that already carry a pathway annotation are left untouched
    if len(parsed_protein_df.loc[row, 'direct_annotations']) != 0:
        continue

    protein_go_terms = parsed_protein_df.loc[row, 'go_annotations']

    # TEMP_GO_TERMS order sets the priority: only the first matching term is used
    fallback = next(
        (go_name + ' (child)' for go_term, go_name in TEMP_GO_TERMS.items() if go_term in protein_go_terms),
        None,
    )

    if fallback is not None:
        print(parsed_protein_df.at[row, "id"], set([fallback]))
        parsed_protein_df.at[row, 'direct_annotations'] = set([fallback])

# parsed_protein_df[parsed_protein_df['id'] == 'PD00197']

ABC-MONOMER {'Non-porin small-molecule transport (child)'}
ABGT-MONOMER {'Non-porin small-molecule transport (child)'}
AMTB-MONOMER {'Non-porin small-molecule transport (child)'}
ARAE-MONOMER {'Non-porin small-molecule transport (child)'}
ARAF-MONOMER {'Non-porin small-molecule transport (child)'}
ARAG-MONOMER {'Non-porin small-molecule transport (child)'}
ARAH-MONOMER {'Non-porin small-molecule transport (child)'}
ARAJ-MONOMER {'Non-porin small-molecule transport (child)'}
ARCD-MONOMER {'Non-porin small-molecule transport (child)'}
ARGT-MONOMER {'Non-porin small-molecule transport (child)'}
AROP-MONOMER {'Non-porin small-molecule transport (child)'}
ARTI-MONOMER {'Non-porin small-molecule transport (child)'}
ARTJ-MONOMER {'Non-porin small-molecule transport (child)'}
ARTM-MONOMER {'Non-porin small-molecule transport (child)'}
ARTP-MONOMER {'Non-porin small-molecule transport (child)'}
ARTQ-MONOMER {'Non-porin small-molecule transport (child)'}
B0260-MONOMER {'Non-porin small-molecule 

In [9]:
# spot-check a single protein (EG10230-MONOMER) after the GO-term fallback
parsed_protein_df.loc[parsed_protein_df['id'] == 'EG10230-MONOMER']

Unnamed: 0,id,common_name,seq,enzyme_reaction,cofactors,other_features,direct_annotations,go_annotations,uniprot_id,metal_features_processed,other_features_processed
425,EG10230-MONOMER,RNA polymerase-binding transcription factor DksA,MQEGQNRKTSSLSILAIAGVEPYQEKPGEEYMNEAQLAHFRRILEA...,{},{},[],{Regulation of transcription (child)},"{GO:0006302, GO:0097216, GO:0005829, GO:004687...",P0ABS1,{'ZN+2': 1},{}


In [10]:

# Add uncategorized pathways
# A placeholder chain of three levels, so that proteins without any annotation
# still map onto a pathway at every level.
new_rows = [
    {'id': 'Uncategorized (I)', 'parents': [], 'children': ['Uncategorized (II)'], 'level': 1, 'common_name': 'Uncategorized (I)'},
    {'id': 'Uncategorized (II)', 'parents': ['Uncategorized (I)'], 'children': ['Uncategorized (III)'], 'level': 2, 'common_name': 'Uncategorized'},
    {'id': 'Uncategorized (III)', 'parents': ['Uncategorized (II)'], 'children': [], 'level': 3, 'common_name': 'Uncategorized (III)'},
]
new_rows_df = pd.DataFrame(new_rows)

# append the placeholder pathways to the end of the pathway table
parsed_pathway_df = pd.concat([parsed_pathway_df, new_rows_df], ignore_index=True)

# for all proteins with empty direct_annotations, add 'Uncategorized (III)'
unannotated_rows = [
    i for i in range(len(parsed_protein_df.index))
    if len(parsed_protein_df.at[i, 'direct_annotations']) == 0
]
for i in unannotated_rows:
    parsed_protein_df.at[i, 'direct_annotations'] = set(['Uncategorized (III)'])

parsed_protein_df

Unnamed: 0,id,common_name,seq,enzyme_reaction,cofactors,other_features,direct_annotations,go_annotations,uniprot_id,metal_features_processed,other_features_processed
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,MLYIFRLIITVIYSILVCVFGSIYCLFSPRNPKHVATFGHMFGRLA...,"{ENZRXN0-8629, ENZRXN0-7991, ENZRXN0-7992, 1-A...",{},[],"{PWY0-1319, PWY-5667}","{GO:0016020, GO:0016740, GO:0016024, GO:000665...",P26647,{},{}
1,1-PFK-MONOMER,1-phosphofructokinase,MSRRVATITLNPAYDLVGFCPEIERGEVNLVKTTGLHAAGKGINVA...,{},{},[ATP],{PWY0-1314},"{GO:0016773, GO:0016301, GO:0016740, GO:000552...",P0AEW9,{},{}
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,MKITVLGCGALGQLWLTALCKQGHEVQGWLRVPQPYCSVNLVETDG...,{2-DEHYDROPANTOATE-REDUCT-ENZRXN},{},[NADP],{PANTO-PWY},"{GO:0016616, GO:0005829, GO:0005737, GO:000867...",P0A9J4,{},{}
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,MSQQVIIFDTTLRDGEQALQASLSVKEKLQIALALERMGVDVMEVG...,{ENZRXN0-6250},{},[],{LEUSYN-PWY},"{GO:0003985, GO:0030145, GO:0016740, GO:000908...",P09151,{},{}
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...",MVDKSQETTHFGFQTVAKEQKADMVAHVFHSVASKYDVMNDLMSFG...,"{ADOMET-DMK-METHYLTRANSFER-ENZRXN, 2-OCTAPRENY...",{},[],"{PWY-6708, MENAQUINONESYN-PWY}","{GO:0030580, GO:0005515, GO:0043770, GO:001674...",P0A887,{},{}
...,...,...,...,...,...,...,...,...,...,...,...
4429,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,MTTDQHQEILRTEGLSKFFPGVKALDNVDFSLRRGEIMALLGENGA...,{},{},[ATP],{Non-porin small-molecule transport (child)},"{GO:0016020, GO:0042875, GO:0103116, GO:000552...",Q6BEX0,{},{}
4430,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,MMPQSLPDTTTPKRRFRWPTGMPQLVALLLVLLVDSLVAPHFWQVV...,{},{},[],{Non-porin small-molecule transport (child)},"{GO:0016020, GO:0005515, GO:0022857, GO:ZZZ, G...",P39328,{},{}
4431,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,MLHKKTLLFAALSAALWGGATQAADAAVVASLKPVGFIASAIADGV...,{},{},[],{Non-porin small-molecule transport (child)},"{GO:0016020, GO:0071578, GO:0042597, GO:ZZZ, G...",P39172,{'ZN+2': 1},{}
4432,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,MIELLFPGWLAGIMLACAAGPLGSFVVWRRMSYFGDTLAHASLLGV...,{},{},[],{Non-porin small-molecule transport (child)},"{GO:0016020, GO:0071578, GO:ZZZ, GO:0055085, G...",P39832,{},{}


In [11]:
# spot-check the same protein (EG10230-MONOMER) again after the uncategorized fill
parsed_protein_df.loc[parsed_protein_df['id'] == 'EG10230-MONOMER']

Unnamed: 0,id,common_name,seq,enzyme_reaction,cofactors,other_features,direct_annotations,go_annotations,uniprot_id,metal_features_processed,other_features_processed
425,EG10230-MONOMER,RNA polymerase-binding transcription factor DksA,MQEGQNRKTSSLSILAIAGVEPYQEKPGEEYMNEAQLAHFRRILEA...,{},{},[],{Regulation of transcription (child)},"{GO:0006302, GO:0097216, GO:0005829, GO:004687...",P0ABS1,{'ZN+2': 1},{}


In [12]:
# decompose sequence into dict of amino acid counts
# object-dtype column so that each cell can hold a dict
parsed_protein_df['sequence_processed'] = 0
parsed_protein_df['sequence_processed'] = parsed_protein_df['sequence_processed'].astype(object)

# Float from the start: the summed residue masses are floats, and writing a float
# into an integer column triggers pandas' incompatible-dtype warning.
parsed_protein_df['sequence_mass'] = 0.0

# every sequence symbol encountered across all proteins
unique_aa = set()

for i in range(len(parsed_protein_df.index)):

    sequence = parsed_protein_df.loc[i, 'seq']

    # rows without a usable sequence (non-string value, presumably NaN from an empty
    # CSV field) are reported and keep the placeholder 0 / 0.0 values
    if not isinstance(sequence, str):
        print(f'No sequence for {parsed_protein_df.loc[i, "id"]}')
        continue

    aa_count_dict = {}
    mass = 0.0

    for aa in sequence:
        aa_count_dict[aa] = aa_count_dict.get(aa, 0) + 1

        # a symbol missing from AMINO_ACID_MAP raises KeyError here
        mass += AMINO_ACID_RESIDUE_MASSES[AMINO_ACID_MAP[aa]]

    unique_aa.update(aa_count_dict)

    parsed_protein_df.at[i, 'sequence_processed'] = aa_count_dict
    parsed_protein_df.at[i, 'sequence_mass'] = mass

  parsed_protein_df.at[i, 'sequence_mass'] = mass


No sequence for MONOMER0-1241
No sequence for MONOMER0-4223


In [13]:
# 'other_features' and 'seq' are dropped now that their processed counterparts
# ('other_features_processed', 'sequence_processed', 'sequence_mass') exist.
# Note: re-running this cell without reloading the data raises KeyError (columns already gone).
parsed_protein_df = parsed_protein_df.drop(columns=['other_features','seq'])

parsed_protein_df

Unnamed: 0,id,common_name,enzyme_reaction,cofactors,direct_annotations,go_annotations,uniprot_id,metal_features_processed,other_features_processed,sequence_processed,sequence_mass
0,1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER,1-acylglycerol-3-phosphate <i>O</i>-acyltransf...,"{ENZRXN0-8629, ENZRXN0-7991, ENZRXN0-7992, 1-A...",{},"{PWY0-1319, PWY-5667}","{GO:0016020, GO:0016740, GO:0016024, GO:000665...",P26647,{},{},"{'M': 6, 'L': 21, 'Y': 8, 'I': 22, 'F': 12, 'R...",27417.49715
1,1-PFK-MONOMER,1-phosphofructokinase,{},{},{PWY0-1314},"{GO:0016773, GO:0016301, GO:0016740, GO:000552...",P0AEW9,{},{},"{'M': 8, 'S': 21, 'R': 18, 'V': 32, 'A': 30, '...",33716.33963
2,2-DEHYDROPANTOATE-REDUCT-MONOMER,2-dehydropantoate 2-reductase,{2-DEHYDROPANTOATE-REDUCT-ENZRXN},{},{PANTO-PWY},"{GO:0016616, GO:0005829, GO:0005737, GO:000867...",P0A9J4,{},{},"{'M': 7, 'K': 7, 'I': 24, 'T': 21, 'V': 21, 'L...",33831.33232
3,2-ISOPROPYLMALATESYN-MONOMER,2-isopropylmalate synthase,{ENZRXN0-6250},{},{LEUSYN-PWY},"{GO:0003985, GO:0030145, GO:0016740, GO:000908...",P09151,{},{},"{'M': 13, 'S': 30, 'Q': 22, 'V': 46, 'I': 39, ...",57244.09138
4,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,"bifunctional 2-octaprenyl-6-methoxy-1,4-benzoq...","{ADOMET-DMK-METHYLTRANSFER-ENZRXN, 2-OCTAPRENY...",{},"{PWY-6708, MENAQUINONESYN-PWY}","{GO:0030580, GO:0005515, GO:0043770, GO:001674...",P0A887,{},{},"{'M': 10, 'V': 22, 'D': 19, 'K': 16, 'S': 16, ...",28037.26502
...,...,...,...,...,...,...,...,...,...,...,...
4429,YTFR-MONOMER,galactofuranose ABC transporter putative ATP b...,{},{},{Non-porin small-molecule transport (child)},"{GO:0016020, GO:0042875, GO:0103116, GO:000552...",Q6BEX0,{},{},"{'M': 11, 'T': 26, 'D': 24, 'Q': 26, 'H': 7, '...",55207.40745
4430,YTFT-MONOMER,galactofuranose ABC transporter putative membr...,{},{},{Non-porin small-molecule transport (child)},"{GO:0016020, GO:0005515, GO:0022857, GO:ZZZ, G...",P39328,{},{},"{'M': 12, 'P': 15, 'Q': 11, 'S': 17, 'L': 53, ...",35617.89729
4431,ZNUA-MONOMER,Zn<sup>2+</sup> ABC transporter periplasmic bi...,{},{},{Non-porin small-molecule transport (child)},"{GO:0016020, GO:0071578, GO:0042597, GO:ZZZ, G...",P39172,{'ZN+2': 1},{},"{'M': 7, 'L': 34, 'H': 13, 'K': 20, 'T': 16, '...",33738.19213
4432,ZNUB-MONOMER,Zn<sup>2+</sup> ABC transporter membrane subunit,{},{},{Non-porin small-molecule transport (child)},"{GO:0016020, GO:0071578, GO:ZZZ, GO:0055085, G...",P39832,{},{},"{'M': 13, 'I': 18, 'E': 4, 'L': 47, 'F': 12, '...",27692.26700


## Create new column for monomer component stoichiometry

In [14]:
# plain-list snapshots of all complex ids and all monomer ids
complex_ids = parsed_complex_df['id'].tolist()
monomer_names = parsed_protein_df['id'].tolist()

In [15]:
def recursive_component_tree(current_component_name, complex_table, protein_table,
                             current_multiplier=1, component_list=None, parent=None, return_cofactors=False):
    """
    Recursively find all downstream components of a given complex.

    Args:
        current_component_name: id of the complex or monomer to expand.
        complex_table: DataFrame with an 'id' column and a 'stoichiometry' column holding
            dicts of {component id: coefficient}. Subunits carry negative coefficients and
            the complex itself carries a positive coefficient.
        protein_table: DataFrame with an 'id' column. When return_cofactors is True it must
            also have 'metal_features_processed' and 'other_features_processed' columns
            holding dicts of {cofactor id: count}.
        current_multiplier: copies of this component per copy of the top-level component.
        component_list: accumulator of node dicts shared by the whole recursion; callers
            normally leave it as None.
        parent: id of the component containing this one; None marks the top-level call.
        return_cofactors: if True, monomers are expanded into their cofactors.

    Returns:
        Top-level call (parent is None): a tuple (tree, component_list). tree is a nested
        dict {name: children}; component_list holds one dict per visited node with keys
        'name', 'parent', 'children' and 'multiplier'. Leaf nodes have children None
        (monomers expanded with return_cofactors=True have a dict, possibly empty).
        Recursive calls return only the tree dict.
        A component found in neither table is reported with print() and produces an empty
        tree; for the top-level call that is ({}, component_list), so tuple unpacking by
        the caller keeps working.

    Raises:
        ValueError: if a stoichiometry entry is neither a negative-coefficient subunit nor
            the positive-coefficient entry of the complex itself.
    """

    if component_list is None:
        component_list = []

    my_children = {}

    # Row labels whose id matches; empty when the name is not a complex.
    complex_rows = complex_table.index[complex_table['id'] == current_component_name]

    if len(complex_rows) > 0:

        stoichiometry = complex_table.at[complex_rows[0], 'stoichiometry']

        direct_children = {k: abs(v) for k, v in stoichiometry.items() if v < 0}

        for component_name, coefficient in stoichiometry.items():

            if coefficient < 0 and component_name != current_component_name:

                # Copies of the child per copy of the top-level component.
                child_multiplier = abs(coefficient * current_multiplier)

                new_child = recursive_component_tree(component_name, complex_table, protein_table,
                                                     child_multiplier, component_list, current_component_name,
                                                     return_cofactors)

                my_children = my_children | new_child

            elif coefficient > 0 and component_name == current_component_name:
                # The complex's own (product) entry carries no subunit information.
                continue

            else:
                raise ValueError(f"key {component_name} and value {coefficient} for complex "
                                 f"{current_component_name} not processed properly.")

        component_list.append({'name': current_component_name, 'parent': parent, 'children': direct_children,
                               'multiplier': int(current_multiplier)})

    else:
        protein_rows = protein_table.index[protein_table['id'] == current_component_name]

        if len(protein_rows) == 0:
            print(f"component {current_component_name} not found in complex or protein tables")

            # Keep the top-level return shape consistent so `tree, nodes = ...` does not fail.
            return ({}, component_list) if parent is None else {}

        # TODO check if enzrxn
        if return_cofactors:
            protein_idx = protein_rows[0]

            protein_metals = protein_table.at[protein_idx, 'metal_features_processed']
            protein_other = protein_table.at[protein_idx, 'other_features_processed']

            # On a shared key the "other" count wins (dict union semantics).
            table_cofactors = protein_metals | protein_other

            # TODO Add apo protein to component list
            for cofactor, cofactor_coefficient in table_cofactors.items():
                if cofactor_coefficient is not None:
                    my_children[cofactor] = cofactor_coefficient
                    component_list.append({'parent': current_component_name,
                                           'name': cofactor,
                                           'multiplier': abs(int(current_multiplier * cofactor_coefficient)),
                                           'children': None})

            component_list.append({'parent': parent, 'name': current_component_name,
                                   'multiplier': current_multiplier, 'children': my_children})

        else:
            my_children = None
            component_list.append({'parent': parent, 'name': current_component_name,
                                   'multiplier': current_multiplier, 'children': None})

    if parent is None:
        return {current_component_name: my_children}, component_list
    else:
        return {current_component_name: my_children}


In [16]:
# Example: expand complex 'CPLX0-8167' (cofactors not requested) and pretty-print its flattened node list.
complex_tree_structure, nodes = recursive_component_tree('CPLX0-8167', parsed_complex_df, parsed_protein_df)
pp.pprint(nodes)

[{'children': None,
  'multiplier': 4,
  'name': 'HYAA-MONOMER',
  'parent': 'FORMHYDROGI-CPLX'},
 {'children': None,
  'multiplier': 4,
  'name': 'HYAB-MONOMER',
  'parent': 'FORMHYDROGI-CPLX'},
 {'children': None,
  'multiplier': 2,
  'name': 'HYAC-MONOMER',
  'parent': 'FORMHYDROGI-CPLX'},
 {'children': {'HYAA-MONOMER': 2, 'HYAB-MONOMER': 2, 'HYAC-MONOMER': 1},
  'multiplier': 2,
  'name': 'FORMHYDROGI-CPLX',
  'parent': 'CPLX0-8167'},
 {'children': {'FORMHYDROGI-CPLX': 2},
  'multiplier': 1,
  'name': 'CPLX0-8167',
  'parent': None}]


In [17]:
# For every complex, flatten its component tree to {monomer id: total copies per complex}.
monomer_stoichiometries = []

for complex_name in parsed_complex_df['id']:
    complex_tree_structure, nodes = recursive_component_tree(complex_name, parsed_complex_df, parsed_protein_df)

    # Leaf nodes (children is None) are monomers. The same monomer can be reached through
    # more than one subcomplex, so its copies are summed instead of overwritten.
    monomer_components = {}
    for node in nodes:
        if node['children'] is None:
            monomer_components[node['name']] = monomer_components.get(node['name'], 0) + node['multiplier']

    monomer_stoichiometries.append(monomer_components)

# Object dtype so that each cell holds a dict.
parsed_complex_df['monomer_component_stoichiometry'] = pd.Series(
    monomer_stoichiometries, index=parsed_complex_df.index, dtype=object)

component CPLX0-7701 not found in complex or protein tables
component CPLX0-7677 not found in complex or protein tables
component MONOMER0-1781 not found in complex or protein tables
component CPLX0-7702 not found in complex or protein tables
component CSRB-RNA not found in complex or protein tables
component RNPB-RNA not found in complex or protein tables
component RRSA-RRNA not found in complex or protein tables
component RRLA-RRNA not found in complex or protein tables
component RRFA-RRNA not found in complex or protein tables
component RRSA-RRNA not found in complex or protein tables
component RRLA-RRNA not found in complex or protein tables
component RRFA-RRNA not found in complex or protein tables
component IS061-RNA not found in complex or protein tables
component CSRC-RNA not found in complex or protein tables
component FFS-RNA not found in complex or protein tables


In [18]:
# Keep only the columns used from here on, in this order, then display the table.
complex_columns = ["id", "common_name", "stoichiometry", "monomer_component_stoichiometry", "cofactors"]
parsed_complex_df = parsed_complex_df[complex_columns]
parsed_complex_df

Unnamed: 0,id,common_name,stoichiometry,monomer_component_stoichiometry,cofactors
0,1-PFK,1-phosphofructokinase,"{'1-PFK': 1, '1-PFK-MONOMER': -2}",{'1-PFK-MONOMER': 2},{MG+2}
1,2OXOGLUTARATEDEH-CPLX,2-oxoglutarate dehydrogenase complex,"{'2OXOGLUTARATEDEH-CPLX': 1, 'E1O': -1, 'E2O':...","{'E1O-MONOMER': 12, 'E2O-MONOMER': 24, 'E3-MON...","{THIAMINE-PYROPHOSPHATE, FAD, LIPOIC-ACID, MG+2}"
2,3-ISOPROPYLMALDEHYDROG-CPLX,3-isopropylmalate dehydrogenase,"{'3-ISOPROPYLMALDEHYDROG-CPLX': 1, '3-ISOPROPY...",{'3-ISOPROPYLMALDEHYDROG-MONOMER': 2},"{MG+2, MN+2}"
3,3-ISOPROPYLMALISOM-CPLX,3-isopropylmalate dehydratase,"{'3-ISOPROPYLMALISOM-CPLX': 1, 'LEUC-MONOMER':...","{'LEUC-MONOMER': 1, 'LEUD-MONOMER': 1}",{CPD-7}
4,3-METHYL-2-OXOBUT-OHCH3XFER-CPLX,3-methyl-2-oxobutanoate hydroxymethyltransferase,"{'3-METHYL-2-OXOBUT-OHCH3XFER-CPLX': 1, '3-CH3...",{'3-CH3-2-OXOBUTANOATE-OH-CH3-XFER-MONOMER': 10},{MG+2}
...,...,...,...,...,...
1088,CPLX0-3964,ribosome,"{'CPLX0-3964': 1, 'CPLX0-3953': -1, 'CPLX0-396...","{'EG10900-MONOMER': 1, 'EG10901-MONOMER': 1, '...",{}
1089,CPLX0-8028,CsrA complex with McaS RNA,"{'CPLX0-8028': 1, 'IS061-RNA': -1, 'CPLX0-7956...",{'EG11447-MONOMER': 2},{}
1090,CPLX0-8053,SelB-L-selenocysteinyl-tRNA<sup>sec</sup>,"{'CPLX0-8053': 1, 'EG10942-MONOMER': -1}",{'EG10942-MONOMER': 1},{}
1091,CPLX0-8253,CsrA complex with CsrC RNA,"{'CPLX0-8253': 1, 'CSRC-RNA': -1, 'EG11447-MON...",{'EG11447-MONOMER': 9},{}


## Create tree matrix (also for Julia)

In [19]:
# save names
complex_ids = list(parsed_complex_df['id'])
protein_ids = list(parsed_protein_df['id'])
cofactor_ids = list(parsed_cofactor_df['id'])
pathway_ids = list(parsed_pathway_df['id'])

# Row/column order of the tree matrix: complexes, then proteins, then cofactors.
name_idx = complex_ids + protein_ids + cofactor_ids

# O(1) id -> position lookup. setdefault keeps the first position of a repeated id,
# which is what list.index() would return.
name_to_pos = {}
for pos, component_id in enumerate(name_idx):
    name_to_pos.setdefault(component_id, pos)

# tree_matrix[parent, child] = copies of child directly contained in one parent.
tree_matrix = np.zeros((len(name_idx), len(name_idx)), dtype=np.int64)

for name in complex_ids:
    tree_structure, nodes = recursive_component_tree(name, parsed_complex_df, parsed_protein_df, return_cofactors=True)

    for node in nodes:
        node_children = node['children']

        # Leaf nodes carry children None.
        if node_children is None:
            continue

        for child_name, child_coefficient in node_children.items():
            # Children missing from name_idx (e.g. components not in any table) are skipped.
            child_pos = name_to_pos.get(child_name)
            if child_pos is not None:
                tree_matrix[name_to_pos[node['name']], child_pos] = child_coefficient

component CPLX0-7701 not found in complex or protein tables
component CPLX0-7677 not found in complex or protein tables
component MONOMER0-1781 not found in complex or protein tables
component CPLX0-7702 not found in complex or protein tables
component CSRB-RNA not found in complex or protein tables
component RNPB-RNA not found in complex or protein tables
component RRSA-RRNA not found in complex or protein tables
component RRLA-RRNA not found in complex or protein tables
component RRFA-RRNA not found in complex or protein tables
component RRSA-RRNA not found in complex or protein tables
component RRLA-RRNA not found in complex or protein tables
component RRFA-RRNA not found in complex or protein tables
component IS061-RNA not found in complex or protein tables
component CSRC-RNA not found in complex or protein tables
component FFS-RNA not found in complex or protein tables


# Create matrices to get cofactor counts

In [20]:
# Display the pathway-table row(s) whose id is 'Regulation of transcription'.
parsed_pathway_df[parsed_pathway_df['id'] == 'Regulation of transcription']

Unnamed: 0,id,parents,children,level,common_name
1174,Regulation of transcription,[Other functions],[Regulation of transcription (child)],2,Regulation of transcription


In [21]:
# Top-level classes that are dropped first when a pathway has several parents, in this order.
priority_list = ['Activation-Inactivation-Interconversion','Metabolic-Clusters', 'Macromolecule-Modification', 'Glycan-Pathways','Detoxification',  'Degradation']

# O(1) pathway id -> position lookup (first occurrence, matching list.index()).
# An id missing from pathway_ids now raises KeyError instead of ValueError.
pathway_pos = {}
for pos, pathway_id in enumerate(pathway_ids):
    pathway_pos.setdefault(pathway_id, pos)

# pathway matrix is necessary to traverse tree: pathway_matrix[p, c] == 1 when p is a parent of c
pathway_matrix = np.zeros((len(pathway_ids), len(pathway_ids)), dtype=np.int64)
level_vector = np.zeros(len(pathway_ids), dtype=np.int64)

for i in range(len(parsed_pathway_df)):

    cur_pathway = parsed_pathway_df.at[i, 'id']
    level_vector[i] = parsed_pathway_df.at[i, 'level']

    for parent in parsed_pathway_df.at[i, 'parents']:
        pathway_matrix[pathway_pos[parent], i] = 1

    for child in parsed_pathway_df.at[i, 'children']:
        pathway_matrix[i, pathway_pos[child]] = 1

# Unpruned copy; used below to look up top-level classes while pathway_matrix is being edited.
original_pathway_matrix = pathway_matrix.copy()

# get superpathway indices
super_pathway_idx = pathway_ids.index('Super-Pathways')
super_pathway_children_idxs = np.where(pathway_matrix[super_pathway_idx, :] == 1)[0]

# zero out all superpathway children
pathway_matrix[:, super_pathway_children_idxs] = 0

# for columns (children) with multiple parents, if one parent leads to degradation or glycans, remove it.
for i in range(pathway_matrix.shape[1]):
    cur_pathway = pathway_ids[i]

    if pathway_matrix[:, i].sum() <= 1:
        continue

    # Fixed snapshot of the parents. top_level_classes stays aligned with it position by
    # position, so removals always hit the parent whose class was matched.
    parent_idxs = np.where(pathway_matrix[:, i] == 1)[0]
    top_level_classes = [list(get_pathway_ith_level_parents(j, original_pathway_matrix, pathway_ids, level_vector, level=1).keys())[0] for j in parent_idxs]
    n_remaining = len(parent_idxs)

    # when there are multiple parents, remove them in the following order of priority,
    # always leaving at least one parent:
    for priority in priority_list:
        while priority in top_level_classes and n_remaining > 1:
            priority_index = top_level_classes.index(priority)
            pathway_matrix[parent_idxs[priority_index], i] = 0
            top_level_classes[priority_index] = 'N/A'   # mark as handled
            n_remaining -= 1

    # then, if there are still multiple parents, remove all but the first one
    # TODO Change to parent with most frequently occuring 2nd parent.
    nz_idxs = np.where(pathway_matrix[:, i] == 1)[0]
    if len(nz_idxs) > 1:
        pathway_matrix[nz_idxs[1:], i] = 0

In [22]:
# if choice can be made, don't pick these.
priority_list_second = ['SECONDARY-METABOLITE-BIOSYNTHESIS', 'Respiration',
                        'Tetrapyrrole-Biosynthesis', 'Alcohol-Degradation', 'Carbohydrates-Biosynthesis']

# O(1) pathway id -> position lookup (first occurrence, matching list.index()).
# An annotation missing from pathway_ids now raises KeyError instead of ValueError.
pathway_pos = {}
for pos, pathway_id in enumerate(pathway_ids):
    pathway_pos.setdefault(pathway_id, pos)

# create protein name to pathway mapping: W[protein, pathway] == 1 for every direct annotation
W = np.zeros((len(parsed_protein_df.index), len(parsed_pathway_df.index)))

for i in range(len(parsed_protein_df.index)):
    for pathway in parsed_protein_df.at[i, 'direct_annotations']:
        W[i, pathway_pos[pathway]] = 1


# Prune proteins that have several annotated pathways.
for i in range(len(parsed_protein_df.index)):

    # Snapshot of this protein's annotations; the two label lists below stay aligned with it.
    cur_protein_pathways_idxs = np.where(W[i, :] == 1)[0]
    cur_protein = parsed_protein_df.at[i, 'id']

    if len(cur_protein_pathways_idxs) < 2:
        continue

    cur_protein_pathway_parents = []
    cur_protein_pathway_two_parents = []

    for pathway_idx in cur_protein_pathways_idxs:
        top_parents = get_pathway_ith_level_parents(pathway_idx, pathway_matrix, pathway_ids, level_vector, level=1)
        top_two_parents = get_pathway_ith_level_parents(pathway_idx, pathway_matrix, pathway_ids, level_vector, level=2)

        print(f"{cur_protein} has 2nd parents {top_two_parents} for {pathway_ids[pathway_idx]}")

        if len(top_parents) > 1:
            print(f"multiple parents {top_parents} for {pathway_ids[pathway_idx]} for {cur_protein}, should not happen.")

        if len(top_parents) == 1:
            cur_protein_pathway_parents.append(list(top_parents.keys())[0])
            cur_protein_pathway_two_parents.append(list(top_two_parents.keys())[0])
        else:
            # remove pathway with no (or an ambiguous) top-level parent
            W[i, pathway_idx] = 0
            cur_protein_pathway_parents.append('N/A')
            cur_protein_pathway_two_parents.append('N/A')

    # TODO - remove direct annotations with deprioritized parents
    # Drop annotations under deprioritized top-level classes while more than one annotation remains.
    for priority in priority_list:
        while priority in cur_protein_pathway_parents and np.count_nonzero(W[i, :] == 1) > 1:
            priority_index = cur_protein_pathway_parents.index(priority)
            W[i, cur_protein_pathways_idxs[priority_index]] = 0
            cur_protein_pathway_parents[priority_index] = 'N/A'

    # same for 2nd level
    for priority in priority_list_second:
        while priority in cur_protein_pathway_two_parents and np.count_nonzero(W[i, :] == 1) > 1:
            priority_index = cur_protein_pathway_two_parents.index(priority)
            W[i, cur_protein_pathways_idxs[priority_index]] = 0
            cur_protein_pathway_two_parents[priority_index] = 'N/A'

    # NOTE(review): this prints the level-2 labels of the ORIGINAL annotations; entries become
    # 'N/A' only for missing parents or level-2 pruning, not for level-1 pruning above.
    print(f"pruned to {cur_protein_pathway_two_parents}")


1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER has 2nd parents {'Lipid-Biosynthesis': 2} for PWY-5667
1-ACYLGLYCEROL-3-P-ACYLTRANSFER-MONOMER has 2nd parents {'Lipid-Biosynthesis': 2} for PWY0-1319
pruned to ['Lipid-Biosynthesis', 'Lipid-Biosynthesis']
2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER has 2nd parents {'Cofactor-Biosynthesis': 2} for PWY-6708
2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER has 2nd parents {'Cofactor-Biosynthesis': 2} for MENAQUINONESYN-PWY
pruned to ['Cofactor-Biosynthesis', 'Cofactor-Biosynthesis']
3-OXOACYL-ACP-REDUCT-MONOMER has 2nd parents {'Lipid-Biosynthesis': 2} for PWY0-862
3-OXOACYL-ACP-REDUCT-MONOMER has 2nd parents {'Lipid-Biosynthesis': 2} for PWY-6282
3-OXOACYL-ACP-REDUCT-MONOMER has 2nd parents {'Lipid-Biosynthesis': 2} for PWY-5973
3-OXOACYL-ACP-REDUCT-MONOMER has 2nd parents {'Lipid-Biosynthesis': 2} for PWY-5971
3-OXOACYL-ACP-REDUCT-MONOMER has 2nd parents {'Lipid-Biosynthesis': 2} for FASYN-ELONG-PWY
3-OXOACYL-ACP-REDUCT-MONOMER has 2nd parents {'Other-biosy

In [23]:
n_pathways = len(parsed_pathway_df.index)

# W2[p, q] == 1 when q is returned as a level-2 parent of pathway p
W2 = np.zeros((n_pathways, n_pathways))

for row in range(n_pathways):
    pathway_position = pathway_ids.index(parsed_pathway_df.at[row, 'id'])

    level_two_parents = get_pathway_ith_level_parents(pathway_position, pathway_matrix, pathway_ids, level_vector, level=2)

    for parent_id in level_two_parents:
        W2[row, pathway_ids.index(parent_id)] = 1

# Some pathways return themselves as level-2 parents; clear those self references.
# The trailing len(go_collection) rows/columns keep their diagonal.
n_without_go = W2.shape[0] - len(go_collection)
np.fill_diagonal(W2[0:n_without_go, 0:n_without_go], 0)


# W1[p, q] == 1 when q is returned as a level-1 parent of pathway p; only pathways at level <= 2 are filled in
W1 = np.zeros((n_pathways, n_pathways))

for row in range(n_pathways):

    if parsed_pathway_df.at[row, 'level'] <= 2:
        pathway_position = pathway_ids.index(parsed_pathway_df.at[row, 'id'])

        # get 1st level parents
        level_one_parents = get_pathway_ith_level_parents(pathway_position, pathway_matrix, pathway_ids, level_vector, level=1)

        for parent_id in level_one_parents:
            W1[row, pathway_ids.index(parent_id)] = 1

np.fill_diagonal(W1, 0)



In [24]:
# Spot check: W entry for protein 'EG10230-MONOMER' and pathway 'Regulation of transcription'.
W[protein_ids.index('EG10230-MONOMER'), pathway_ids.index('Regulation of transcription')]


0.0

In [25]:
# C matrix transforms complexes + monomers to just monomers.

n_complexes = len(parsed_complex_df.index)
n_proteins = len(parsed_protein_df.index)

# protein id -> column of C
protein_name_to_index = {parsed_protein_df.at[row, 'id']: row for row in range(n_proteins)}

# C matrix: complexes x proteins, C[c, p] = copies of monomer p in complex c
C = np.zeros((n_complexes, n_proteins))

for row in range(n_complexes):

    # TODO consider cofactors
    # Only the RNA polymerase core row is echoed for inspection.
    echo_row = parsed_complex_df.at[row, 'id'] == 'APORNAP-CPLX'

    for component_name, component_count in parsed_complex_df.loc[row, 'monomer_component_stoichiometry'].items():
        if component_count is None:             # side effect of parquet
            continue

        component_index = protein_name_to_index[component_name]

        if echo_row:
            print(f'component_name: {component_name}, component_count: {component_count}, component_index: {component_index}')

        C[row, component_index] = component_count

# Stack an identity block under C so that free monomers map onto themselves.
C = np.vstack((C, np.identity(n_proteins)))

# Row labels of C: complexes first, then proteins.
C_names = list(parsed_complex_df['id']) + list(parsed_protein_df['id'])

component_name: EG10893-MONOMER, component_count: 2, component_index: 697
component_name: RPOC-MONOMER, component_count: 1, component_index: 4122
component_name: RPOB-MONOMER, component_count: 1, component_index: 4121


In [26]:
# P matrix transforms proteins to their respective cofactor counts.

# cofactor id -> column of P
cofactor_name_to_index = {parsed_cofactor_df.at[row, 'id']: row for row in range(len(parsed_cofactor_df.index))}

cofactor_ids = list(parsed_cofactor_df['id'])

# P matrix: proteins x cofactors
P = np.zeros((len(parsed_protein_df.index), len(parsed_cofactor_df.index)))

for row in range(len(parsed_protein_df.index)):
    # Metals are written first, then other features, so a cofactor present in both keeps the "other" count.
    for feature_column in ('metal_features_processed', 'other_features_processed'):
        for cofactor_id, count in parsed_protein_df.loc[row, feature_column].items():
            if count is None:             # side effect of parquet
                continue
            P[row, cofactor_name_to_index[cofactor_id]] = count


In [27]:
# E matrix transforms cofactors to their respective elemental composition

# Collect every element symbol that occurs in any cofactor.
element_set = set()
for i in range(len(parsed_cofactor_df.index)):
    element_set.update(parsed_cofactor_df.at[i, 'elemental_composition'].keys())

# Sorted so that the column order of E (and element_ids) is the same on every kernel restart;
# the iteration order of a set of strings is not stable across Python processes.
unique_elements = sorted(element_set)
element_to_col = {element: col for col, element in enumerate(unique_elements)}

# create E matrix: cofactors x elements
E = np.zeros((len(parsed_cofactor_df.index), len(unique_elements)))

for i in range(len(parsed_cofactor_df.index)):
    cofactor = parsed_cofactor_df.at[i, 'elemental_composition']

    for element, count in cofactor.items():
        if count is not None:             # side effect of parquet
            E[i, element_to_col[element]] = count


# Column labels of E.
element_ids = unique_elements

In [28]:
# A matrix transforms proteins to their amino acid composition

# Column order of A (single-letter codes) and the matching three-letter ids.
amino_acid_single_letter = list(unique_aa)
amino_acid_ids = [AMINO_ACID_MAP[letter] for letter in amino_acid_single_letter]

# A matrix: proteins x amino acids
A = np.zeros((len(parsed_protein_df.index), len(unique_aa)))

for row in range(len(parsed_protein_df.index)):
    composition = parsed_protein_df.at[row, 'sequence_processed']

    # Rows without a composition dict stay all-zero.
    if type(composition) is dict:
        for letter, count in composition.items():
            if count is None:             # side effect of parquet
                continue
            A[row, amino_acid_single_letter.index(letter)] = count

# Now add the simulated protein and complex counts

In [29]:
# Simulation run to load (previously: experiment 'metabolism-redux-classic-minimal').
entry = 'cofactors_minimal'
folder = f'out/cofactors/{entry}/'

In [30]:
# Load the saved simulation output for the run in `folder`.
# NOTE: allow_pickle=True unpickles arbitrary Python objects; only load files you produced yourself.
output_all = np.load(folder + '0_output.npy', allow_pickle=True).item()
output = output_all['agents']['0']
fba = output['listeners']['fba_results']
mass = output['listeners']['mass']
bulk = pd.DataFrame(output['bulk'])

# The first flux entry is skipped.
fluxes = np.array(fba['estimated_fluxes'][1:])
exchanges = fba['estimated_exchange_dmdt']

# output['listeners']['unique_molecule_counts']['active_ribosome']

In [31]:
# Load the pickled agent steps; the context manager closes the file even if loading fails.
# NOTE: dill.load executes arbitrary code from the file; only load files you produced yourself.
with open(folder + 'agent_steps.pkl', 'rb') as f:
    agent = dill.load(f)

metabolism = agent['ecoli-metabolism-redux-classic']
stoichiometry = metabolism.stoichiometry


In [32]:
# Dimensions of the bulk table (rows, columns).
bulk.shape

(2973, 16255)

In [33]:
# Read the initial state; the context manager makes sure the file handle is closed.
with open('data/vivecoli_t1.json') as state_file:
    initial_state = json.load(state_file)

# The first item of every bulk entry is the molecule id; use the ids as column labels of `bulk`.
bulk_ids = [item[0] for item in initial_state['agents']['0']['bulk']]

bulk.columns = bulk_ids

In [34]:
# in the bulk dataframe, update RNAP and 50S, 30S rib counts from unique molecules since they are sequestered as unique when in use.
# The counts are added in place, so running this cell twice adds them twice.
unique_molecule_counts = output['listeners']['unique_molecule_counts']
sequestered_pairs = (('active_ribosome', 'CPLX0-3962'),
                     ('active_ribosome', 'CPLX0-3953'),
                     ('active_RNAP', 'APORNAP-CPLX'))

for unique_key, bulk_id in sequestered_pairs:
    if unique_key not in unique_molecule_counts:
        continue
    unique_count = unique_molecule_counts[unique_key]
    bulk.loc[:, bulk_id+'[c]'] += unique_count

In [35]:
# Display the 'selC-tRNA[c]' column of the bulk table.
bulk.loc[:, 'selC-tRNA[c]']

0       1935
1       1935
2       1935
3       1936
4       1937
        ... 
2968    5085
2969    5086
2970    5089
2971    5092
2972    5093
Name: selC-tRNA[c], Length: 2973, dtype: int64

In [36]:
# Map every protein / complex id to the bulk-molecule id (id plus compartment tag) used for its counts.
ecocyc_to_wcm_map = {}

# combined complex and protein names
complex_protein_names = list(parsed_protein_df['id']) + list(parsed_complex_df['id'])

# Set for O(1) exact-match checks.
bulk_id_set = set(bulk_ids)

for name in complex_protein_names:

    # Preferred: the '[c]' entry.
    cytosol_id = name + '[c]'
    if cytosol_id in bulk_id_set:
        ecocyc_to_wcm_map[name] = cytosol_id
        continue

    # Otherwise accept any other compartment tag. Matching on name + '[' as a prefix avoids
    # picking up a different molecule whose id merely starts with `name`.
    prefix = name + '['
    matched_id = None

    for bulk_id in bulk_ids:
        if not bulk_id.startswith(prefix):
            continue

        matched_id = bulk_id
        if bulk.loc[:, bulk_id].sum() > 0:
            break           # ensures preferring nonzero counts

    if matched_id is None:
        # NOTE(review): placeholder so that later column selection does not fail; presumably a
        # bulk molecule whose counts are always zero - confirm.
        ecocyc_to_wcm_map[name] = '--TRANS-ACENAPHTHENE-12-DIOL[c]' # should be none
        print(f'could not find {name}')
    else:
        ecocyc_to_wcm_map[name] = matched_id


could not find MONOMER0-1241
could not find MONOMER0-4223
could not find CPLX0-7450
could not find CPLX0-3964


In [37]:
# Bulk ids in the row order of C (C_names: complexes first, then proteins).
complex_wcm_names = [ecocyc_to_wcm_map[component_id] for component_id in C_names]

# Counts restricted to (and ordered by) those ids.
counts = bulk[complex_wcm_names]

In [38]:
# Switch to the rich-media run.
entry = 'cofactors-rich'
folder = f'out/cofactors/{entry}/'

In [39]:
# Load the saved simulation output for the run in `folder`.
# NOTE: allow_pickle=True unpickles arbitrary Python objects; only load files you produced yourself.
output_all = np.load(folder + '0_output.npy', allow_pickle=True).item()
output = output_all['agents']['0']
fba = output['listeners']['fba_results']
mass = output['listeners']['mass']
bulk = pd.DataFrame(output['bulk'])

# The first flux entry is skipped.
fluxes = np.array(fba['estimated_fluxes'][1:])
exchanges = fba['estimated_exchange_dmdt']

# The first item of every bulk entry is the molecule id; use the ids as column labels of `bulk`.
bulk_ids = [item[0] for item in initial_state['agents']['0']['bulk']]

bulk.columns = bulk_ids
# output['listeners']['unique_molecule_counts']['active_ribosome']

In [40]:
# in the bulk dataframe, update RNAP and 50S, 30S rib counts from unique molecules since they are sequestered as unique when in use.
# The counts are added in place, so running this cell twice adds them twice.
unique_molecule_counts = output['listeners']['unique_molecule_counts']
sequestered_pairs = (('active_ribosome', 'CPLX0-3962'),
                     ('active_ribosome', 'CPLX0-3953'),
                     ('active_RNAP', 'APORNAP-CPLX'))

for unique_key, bulk_id in sequestered_pairs:
    if unique_key not in unique_molecule_counts:
        continue
    unique_count = unique_molecule_counts[unique_key]
    bulk.loc[:, bulk_id+'[c]'] += unique_count

In [41]:
# Rich-media counts, restricted to (and ordered by) complex_wcm_names.
rich_counts = bulk[complex_wcm_names]

# External data sets

# Saving outputs to files compatible with Julia

In [42]:
# Column sum of W2 for 'Regulation of transcription': how many pathways have it marked as a level-2 parent.
sum(W2[:, pathway_ids.index('Regulation of transcription')])

1.0

In [43]:
# Positions of protein 'EG10230-MONOMER' and pathway 'Regulation of transcription' in their id lists.
[protein_ids.index('EG10230-MONOMER'), pathway_ids.index('Regulation of transcription')]

[425, 1174]

In [44]:
# Spot check: W entry for protein 'EG10230-MONOMER' and pathway 'Regulation of transcription'.
W[protein_ids.index('EG10230-MONOMER'), pathway_ids.index('Regulation of transcription')]


0.0

In [45]:
complex_names = list(parsed_complex_df['common_name'])
protein_names = list(parsed_protein_df['common_name'])
cofactor_names = list(parsed_cofactor_df['common_name'])
pathway_names = list(parsed_pathway_df['common_name'])

# if cplx name is nan then use id
complex_names = [complex_names[i] if type(complex_names[i]) == str else parsed_complex_df['id'][i] for i in range(len(complex_names))]
protein_names = [protein_names[i] if type(protein_names[i]) == str else parsed_protein_df['id'][i] for i in range(len(protein_names))]
cofactor_names = [cofactor_names[i] if type(cofactor_names[i]) == str else parsed_cofactor_df['id'][i] for i in range(len(cofactor_names))]
pathway_names = [pathway_names[i] if type(pathway_names[i]) == str else parsed_pathway_df['id'][i] for i in range(len(pathway_names))]

# save C, P and E to julia-compatible formats
np.savetxt('notebooks/cofactors/data/C_matrix.csv', C.astype(np.int64), delimiter=',', fmt='%i')
np.savetxt('notebooks/cofactors/data/P_matrix.csv', P.astype(np.int64), delimiter=',', fmt='%i')
np.savetxt('notebooks/cofactors/data/E_matrix.csv', E.astype(np.int64), delimiter=',', fmt='%i')
np.savetxt('notebooks/cofactors/data/W_matrix.csv', W.astype(np.float64), delimiter=',')
np.savetxt('notebooks/cofactors/data/W1_matrix.csv', W1.astype(np.float64), delimiter=',')
np.savetxt('notebooks/cofactors/data/W2_matrix.csv', W2.astype(np.float64), delimiter=',')
np.savetxt('notebooks/cofactors/data/A_matrix.csv', A.astype(np.float64), delimiter=',')


# write all ids to single file with each list on a new line
with open('notebooks/cofactors/data/complex_ids.txt', 'w') as f:
    f.write('\n'.join(complex_ids))
with open('notebooks/cofactors/data/protein_ids.txt', 'w') as f:
    f.write('\n'.join(protein_ids))
with open('notebooks/cofactors/data/cofactor_ids.txt', 'w') as f:
    f.write('\n'.join(cofactor_ids))
with open('notebooks/cofactors/data/element_ids.txt', 'w') as f:
    f.write('\n'.join(element_ids))
with open('notebooks/cofactors/data/pathway_ids.txt', 'w') as f:
    f.write('\n'.join(pathway_ids))
with open('notebooks/cofactors/data/amino_acid_ids.txt', 'w') as f:
    f.write('\n'.join(amino_acid_ids))

# write all names to single file with each list on a new line
with open('notebooks/cofactors/data/complex_names.txt', 'w') as f:
    f.write('\n'.join(complex_names))
with open('notebooks/cofactors/data/protein_names.txt', 'w') as f:
    f.write('\n'.join(protein_names))
with open('notebooks/cofactors/data/cofactor_names.txt', 'w') as f:
    f.write('\n'.join(cofactor_names))
with open('notebooks/cofactors/data/pathway_names.txt', 'w') as f:
    f.write('\n'.join(pathway_names))


# Export the `counts` and `rich_counts` tables as headerless integer CSVs;
# each table is converted to an int64 array before writing.
for counts_path, counts_table in (
    ('notebooks/cofactors/data/counts.csv', counts),
    ('notebooks/cofactors/data/rich_counts.csv', rich_counts),
):
    np.savetxt(counts_path, np.array(counts_table, dtype=np.int64), delimiter=',', fmt='%i')

# The anaerobic counts export is currently disabled:
#np.savetxt('notebooks/cofactors/data/anaerobic_counts.csv', np.array(anaerobic_counts, dtype=np.int64), delimiter=',', fmt='%i')

# Export the 'sequence_mass' column of parsed_protein_df with fixed-point
# formatting ('%f').
np.savetxt(
    'notebooks/cofactors/data/monomer_masses.csv',
    np.array(parsed_protein_df['sequence_mass']),
    delimiter=',',
    fmt='%f',
)

# Map each membrane protein id to its 'area_trans' value, then look up every
# complex id and every monomer id in that map; ids without an entry get 0.
membrane_ids = dict(zip(membrane_proteins['id'], membrane_proteins['area_trans']))
membrane_complex_areas = [membrane_ids.get(cplx_id, 0) for cplx_id in complex_ids]
membrane_monomer_areas = [membrane_ids.get(monomer_id, 0) for monomer_id in protein_ids]

# Write the two area vectors, in the order of complex_ids / protein_ids.
for areas_path, areas in (
    ('notebooks/cofactors/data/complex_areas.csv', membrane_complex_areas),
    ('notebooks/cofactors/data/monomer_areas.csv', membrane_monomer_areas),
):
    np.savetxt(areas_path, np.array(areas), delimiter=',', fmt='%f')

# Write tree_matrix as integers.
np.savetxt('notebooks/cofactors/data/tree_matrix.csv', tree_matrix, delimiter=',', fmt='%i')



In [47]:
# Pool the rich-media simulations out/cofactors/rich-1 ... rich-8 into two
# long tables, tagging every row with a 0-based run index in the `cell` column:
#   rich_counts: timestep, one column per entry of complex_wcm_names, cell
#   vol_df:      timestep, volume (column 0), cell
# NOTE: this rebinds `rich_counts` and `vol_df`. `vol_df` is also assigned by
# the min-* loop, so export it before running that cell.

# The bulk molecule ids come from initial_state, not from the individual run,
# so they are read once and reused to label every run's bulk table.
bulk_ids = [item[0] for item in initial_state['agents']['0']['bulk']]

rich_count_frames = []
rich_vol_frames = []

for i, entry in enumerate([f"rich-{n}" for n in range(1, 9)]):
    folder = f'out/cofactors/{entry}/'

    # .item() unwraps the stored object, so the file needs allow_pickle=True.
    # Unpickling can execute arbitrary code: only load trusted simulation output.
    output_all = np.load(folder + '0_output.npy', allow_pickle=True).item()
    output = output_all['agents']['0']

    # NOTE(review): fba, fluxes and exchanges are not used in this cell; they
    # are kept because they are notebook globals that other cells may read.
    # Confirm and delete if nothing depends on them.
    fba = output['listeners']['fba_results']
    mass = output['listeners']['mass']
    fluxes = np.array(fba['estimated_fluxes'][1:])
    exchanges = fba['estimated_exchange_dmdt']

    bulk = pd.DataFrame(output['bulk'])
    bulk.columns = bulk_ids

    # In the bulk dataframe, add the unique-molecule counts to RNAP and to the
    # 50S/30S ribosome subunits, since they are sequestered as unique molecules
    # when in use.
    for unique_key, bulk_id in [('active_ribosome', 'CPLX0-3962'), ('active_ribosome', 'CPLX0-3953'),
                                ('active_RNAP', 'APORNAP-CPLX')]:
        if unique_key in output['listeners']['unique_molecule_counts']:
            unique_count = output['listeners']['unique_molecule_counts'][unique_key]
            bulk.loc[:, bulk_id + '[c]'] += unique_count

    # Collect per-run frames and concatenate once after the loop instead of
    # growing the result with pd.concat on every iteration.
    rich_count_frames.append(
        bulk.loc[:, complex_wcm_names].reset_index(names='timestep').assign(cell=i)
    )
    rich_vol_frames.append(
        pd.DataFrame(mass['volume']).reset_index(names='timestep').assign(cell=i)
    )

rich_counts = pd.concat(rich_count_frames)
vol_df = pd.concat(rich_vol_frames)

In [49]:
# Export the pooled rich-media tables as headerless CSVs. Every column of each
# frame is written, including `timestep` and `cell`.
# NOTE: `vol_df` is also assigned by the min-* loop; run this cell right after
# the rich-* loop so the rich-media volumes are the ones written.
np.savetxt(
    'notebooks/cofactors/data/rich_counts_big.csv',
    np.array(rich_counts, dtype=np.int64),
    delimiter=',',
    fmt='%i',
)
np.savetxt(
    'notebooks/cofactors/data/rich_vol_big.csv',
    np.array(vol_df, dtype=np.float64),
    delimiter=',',
)

In [58]:
# Pool the minimal-media simulations out/cofactors/min-1 ... min-3 into two
# long tables, tagging every row with a 0-based run index in the `cell` column:
#   minimal_counts: timestep, one column per entry of complex_wcm_names, cell
#   vol_df:         timestep, volume (column 0), cell
# NOTE: this rebinds `vol_df`, which the rich-* loop also assigns; export the
# rich-media volumes before running this cell.

# The bulk molecule ids come from initial_state, not from the individual run,
# so they are read once and reused to label every run's bulk table.
bulk_ids = [item[0] for item in initial_state['agents']['0']['bulk']]

minimal_count_frames = []
minimal_vol_frames = []

for i, entry in enumerate([f"min-{n}" for n in range(1, 4)]):
    folder = f'out/cofactors/{entry}/'

    # .item() unwraps the stored object, so the file needs allow_pickle=True.
    # Unpickling can execute arbitrary code: only load trusted simulation output.
    output_all = np.load(folder + '0_output.npy', allow_pickle=True).item()
    output = output_all['agents']['0']

    # NOTE(review): fba, fluxes and exchanges are not used in this cell; they
    # are kept because they are notebook globals that other cells may read.
    # Confirm and delete if nothing depends on them.
    fba = output['listeners']['fba_results']
    mass = output['listeners']['mass']
    fluxes = np.array(fba['estimated_fluxes'][1:])
    exchanges = fba['estimated_exchange_dmdt']

    bulk = pd.DataFrame(output['bulk'])
    bulk.columns = bulk_ids

    # In the bulk dataframe, add the unique-molecule counts to RNAP and to the
    # 50S/30S ribosome subunits, since they are sequestered as unique molecules
    # when in use.
    for unique_key, bulk_id in [('active_ribosome', 'CPLX0-3962'), ('active_ribosome', 'CPLX0-3953'),
                                ('active_RNAP', 'APORNAP-CPLX')]:
        if unique_key in output['listeners']['unique_molecule_counts']:
            unique_count = output['listeners']['unique_molecule_counts'][unique_key]
            bulk.loc[:, bulk_id + '[c]'] += unique_count

    # Collect per-run frames and concatenate once after the loop instead of
    # growing the result with pd.concat on every iteration.
    minimal_count_frames.append(
        bulk.loc[:, complex_wcm_names].reset_index(names='timestep').assign(cell=i)
    )
    minimal_vol_frames.append(
        pd.DataFrame(mass['volume']).reset_index(names='timestep').assign(cell=i)
    )

minimal_counts = pd.concat(minimal_count_frames)
vol_df = pd.concat(minimal_vol_frames)

In [59]:
# Preview the pooled volume table (columns: timestep, 0 = volume, cell = run
# index). Its contents depend on which loading loop assigned `vol_df` last.
vol_df

Unnamed: 0,timestep,0,cell
0,0,1.138467,0
1,1,1.138532,0
2,2,1.138643,0
3,3,1.138790,0
4,4,1.138968,0
...,...,...,...
2514,2514,2.338257,2
2515,2515,2.338922,2
2516,2516,2.339590,2
2517,2517,2.340259,2


In [60]:
# Export the pooled minimal-media tables as headerless CSVs. Every column of
# each frame is written, including `timestep` and `cell`.
# NOTE: `vol_df` is also assigned by the rich-* loop; run this cell right after
# the min-* loop so the minimal-media volumes are the ones written.
np.savetxt(
    'notebooks/cofactors/data/minimal_counts_big.csv',
    np.array(minimal_counts, dtype=np.int64),
    delimiter=',',
    fmt='%i',
)
np.savetxt(
    'notebooks/cofactors/data/minimal_vol_big.csv',
    np.array(vol_df, dtype=np.float64),
    delimiter=',',
)

In [146]:
# Preview the rich-media counts table (timestep, one column per molecule id,
# cell); pandas truncates the rendered rows and columns.
rich_counts

Unnamed: 0,timestep,1-PFK[c],2OXOGLUTARATEDEH-CPLX[c],3-ISOPROPYLMALDEHYDROG-CPLX[c],3-ISOPROPYLMALISOM-CPLX[c],3-METHYL-2-OXOBUT-OHCH3XFER-CPLX[c],3-OXOACYL-ACP-SYNTHII-CPLX[c],6PFK-1-CPX[c],6PFK-2-CPX[c],6PGLUCONDEHYDROG-CPLX[c],...,YRBE-MONOMER[i],YRBF-MONOMER[m],YRBG-MONOMER[i],YTFQ-MONOMER[p],YTFR-MONOMER[m],YTFT-MONOMER[i],ZNUA-MONOMER[p],ZNUB-MONOMER[i],ZNUC-MONOMER[i],cell
0,0,72,182,2388,4773,524,1056,1034,129,2048,...,107,191,51,0,8,5,1026,1,15,0
1,1,72,182,2388,4773,524,1056,1034,129,2048,...,107,191,51,0,8,5,1026,1,16,0
2,2,72,182,2388,4773,524,1056,1035,129,2048,...,107,191,51,0,8,5,1026,1,17,0
3,3,72,182,2388,4776,524,1056,1036,130,2048,...,107,191,51,0,8,5,1026,1,17,0
4,4,72,182,2389,4777,525,1056,1037,130,2048,...,107,191,51,0,8,5,1026,1,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1396,1396,101,279,2274,4675,946,1369,1692,111,3979,...,155,364,97,0,3,5,1272,0,30,1
1397,1397,101,279,2274,4675,947,1369,1693,111,3982,...,154,362,97,0,3,5,1272,0,30,1
1398,1398,101,279,2275,4675,947,1369,1694,111,3984,...,154,362,97,0,3,5,1272,0,30,1
1399,1399,101,279,2275,4675,948,1369,1695,111,3986,...,154,363,97,0,3,5,1272,0,30,1


In [52]:
# Export tables with a header row and without the index column:
# `counts` (the headered copy requested by Khoa) and `parsed_protein_df`.
for export_table, csv_path in (
    (counts, 'notebooks/cofactors/data/counts_header.csv'),
    (parsed_protein_df, 'notebooks/cofactors/data/protein_table.csv'),
):
    export_table.to_csv(csv_path, index=False)