In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Location of the BRENDA JSON export, relative to this notebook
fdir = '../../../data/databases/Brenda/'
fname = 'brenda.json'
# Load the export; the cells below treat every column of `df` as one EC number
df = pd.read_json(fdir+fname)

Remove columns (EC numbers) that do not contain any $k_{cat}$ values

In [None]:
# EC numbers whose 'turnover_number' entry is a plain float NaN carry no kcat data
remove_cols = [
    ec_number
    for ec_number, record in df.items()
    if type(record['turnover_number']) == float and np.isnan(record['turnover_number'])
]
len(remove_cols)

In [None]:
# Drop the EC-number columns collected in `remove_cols`
df = df.drop(columns=remove_cols)

In [None]:
# Inspect the record of a single EC number (1.1.1.1)
df['1.1.1.1']

In [None]:
# Helper function to compare values in two organism/reference dictionaries
def isequal_org_ref_map(map1, map2):
    """Return True if two organism/reference records describe the same conditions.

    Two records are considered equal when their 'pH' values match, their
    'Temperature' values match and their 'EnzymeType' sequences are
    element-wise identical. For 'pH' and 'Temperature', NaN means "missing":
    two missing values match, a missing value never matches a defined one.

    Parameters
    ----------
    map1, map2 : dict
        Records with scalar 'pH' and 'Temperature' entries (float or NaN) and a
        sized 'EnzymeType' sequence.

    Returns
    -------
    bool
    """
    for field in ('pH', 'Temperature'):
        value1, value2 = map1[field], map2[field]
        missing1, missing2 = np.isnan(value1), np.isnan(value2)
        # Exactly one of the two values is missing -> the records differ.
        # (The original pH check tested pH1 twice and never caught the case
        # of a defined pH1 with a missing pH2.)
        if missing1 != missing2:
            return False
        if not missing1 and value1 != value2:
            return False

    enz1 = map1['EnzymeType']
    enz2 = map2['EnzymeType']
    if len(enz1) != len(enz2):
        return False
    # Two empty sequences are equal; otherwise compare element-wise
    if len(enz1) > 0 and not np.array_equal(enz1, enz2):
        return False

    return True

In [None]:
# Extract relevant information about pH, Temperature, EnzymeType and cosubstrates from each segment of kcat['comment']
def parse_entry(entry, org_ref_map, org_inds, ref_inds, cosubstrate_list):
    """Parse one comma-separated segment of a kcat comment.

    Recognised pH values, temperatures and enzyme-type descriptions are
    appended to the record of every (organism, reference) pair; cosubstrate
    names are appended to ``cosubstrate_list``. Both containers are modified
    in place and nothing is returned.

    Parameters
    ----------
    entry : str
        A single segment of the comment text.
    org_ref_map : dict
        Maps ``(org, ref)`` tuples to records holding 'pH', 'Temperature' and
        'EnzymeType' lists. Every combination of ``org_inds`` and ``ref_inds``
        must already be present.
    org_inds, ref_inds : iterable
        Organism and reference indices the segment applies to.
    cosubstrate_list : list
        Receives the cosubstrate names found in the segment.
    """
    number = '[0-9]+[.]*[0-9]*'
    pairs = [(org, ref) for org in org_inds for ref in ref_inds]

    if 'pH' in entry:
        # Accept both "pH 7.5" and "at 7.5 pH"
        match = re.search('pH ?[0-9]+[.]*[0-9]*', entry)
        if match is None:
            match = re.search('at [0-9]+[.]*[0-9]* pH', entry)
        # Test for a match instead of the truthiness of the parsed number,
        # so that a value of 0.0 is recorded rather than silently dropped
        if match is not None:
            # '..' is a typo for a decimal point in some comments
            pH = float(re.search(number, match.group()).group().replace('..', '.'))
            for pair in pairs:
                org_ref_map[pair]['pH'].append(pH)

    if '°C' in entry:
        # The optional 'Â' covers a mis-encoded degree sign
        match = re.search('[0-9]+[.]*[0-9]* ?Â?°C', entry)
        if match is not None:
            temp = float(re.search(number, match.group()).group())
            for pair in pairs:
                org_ref_map[pair]['Temperature'].append(temp)

    if ('zyme' in entry) or ('mutant' in entry) or ('mutated' in entry):
        # The whole segment is stored as the enzyme-type description
        for pair in pairs:
            org_ref_map[pair]['EnzymeType'].append(entry)

    if ('co-substrate' in entry) or ('cosubstrate' in entry):
        cosubstrate = ''
        # Normalise the spelling before matching the phrase patterns
        entry = entry.replace('co-substrate', 'cosubstrate')
        entry = entry.replace('cosubstrate:', 'cosubstrate')
        entry = entry.replace('donor ', '')
        # The slices strip the leading keyword and the trailing ' as'
        if re.search('mM .+ as cosubstrate', entry):
            cosubstrate = re.search('mM .+ as', entry).group()[3:-3]
        elif re.search('with .+ as cosubstrate', entry):
            cosubstrate = re.search('with .+ as', entry).group()[5:-3]
        elif re.search('using .+ as cosubstrate', entry):
            cosubstrate = re.search('using .* as', entry).group()[6:-3]
        elif re.search('cosubstrate .+', entry):
            cosubstrate = re.search('cosubstrate .+', entry).group()[12:]

        if cosubstrate != '' and cosubstrate[-1] == ' ':
            cosubstrate = cosubstrate[:-1]
        # Of several alternatives only the first one is kept
        if ' or ' in cosubstrate:
            cosubstrate = cosubstrate.split(' or ')[0]
        if ' + ' in cosubstrate:
            cosubstrate = cosubstrate.replace(' + ', ' and ')

        if cosubstrate != '':
            cosubstrate_list.extend(cosubstrate.split(' and '))

In [None]:
def parse_comment(comment, org_ref_map, cosubstrate_list):
    """Parse one subcomment of the form ``#orgs# text <refs>``.

    The organism indices between the '#' marks and the reference indices
    between '<' and '>' are comma-separated, so one subcomment can apply to
    several organisms and references. A record is created in ``org_ref_map``
    for every (org, ref) pair that is not present yet, and each comma-separated
    segment of the text is handed to ``parse_entry``.
    """
    org_inds = re.search('#.*#', comment).group()[1:-1].split(',')
    ref_inds = re.search('<.*>', comment).group()[1:-1].split(',')

    # make sure every (org, ref) pair has its own record
    for org in org_inds:
        for ref in ref_inds:
            org_ref_map.setdefault(
                (org, ref),
                {'pH': [], 'Temperature': [], 'EnzymeType': [], 'UniProtID': np.nan, 'PubMedID': np.nan},
            )

    # text between the closing '#' and the opening '<', with the
    # inconsistent comma usage normalised before splitting
    text = re.search('#[^#]*<', comment).group()[1:-1]
    entries = text.replace(' , ', ', ').replace(',,', ',').split(', ')

    for entry in entries:
        parse_entry(entry, org_ref_map, org_inds, ref_inds, cosubstrate_list)

In [None]:
# Remove data entries that cannot be uniquely mapped to temperature, pH or reference data
def clean_org_ref_map(org_ref_map):
    """Collapse the parsed lists of every record and merge duplicate references.

    For each record, 'Temperature' and 'pH' become the single distinct value
    that was parsed, or NaN if there were none or several; 'EnzymeType' becomes
    the array of its distinct values. Afterwards, records of the same organism
    that differ only in their reference are removed and represented by one
    record stored under ``(organism, '-')``. The dictionary is modified in place.
    """
    for record in org_ref_map.values():
        # Ambiguous (or missing) temperature / pH values are ignored
        for field in ('Temperature', 'pH'):
            distinct = np.unique(record[field])
            record[field] = distinct[0] if len(distinct) == 1 else np.nan
        record['EnzymeType'] = np.unique(record['EnzymeType'])

    # Iterate over a snapshot of the keys because entries are deleted on the way
    snapshot = list(org_ref_map.keys())
    for key in snapshot:
        if key not in org_ref_map:
            continue
        organism = key[0]
        merged = False
        for other in snapshot:
            if other not in org_ref_map:
                continue
            if organism != other[0] or key[1] == other[1]:
                continue
            if not isequal_org_ref_map(org_ref_map[key], org_ref_map[other]):
                continue
            # Same organism, different reference, identical data: drop the
            # duplicate and keep one reference-free record for the organism
            merged = True
            del org_ref_map[other]
            if (organism, '-') not in org_ref_map:
                org_ref_map[(organism, '-')] = org_ref_map[key]
        if merged:
            del org_ref_map[key]

In [None]:
# Build one flat record per (organism, reference) combination of every kcat entry.
# For each EC number column of `df` this cell:
#   1. maps protein indices to the UniProt accessions stored in the database,
#   2. parses the kcat comment for pH, temperature, enzyme type and cosubstrates,
#   3. adds organism/reference keys that the comment did not provide,
#   4. attaches UniProt IDs, PubMed IDs and organism names, and
#   5. resolves the substrate list from the reaction records.
# The collected records are finally turned into a new DataFrame that replaces `df`.
rows_list = []

for ECNumber in df.columns:
    data = df[ECNumber]

    # Create a map linking proteins to their UniProtIDs that are recorded in the database
    uniprot_map = {}
    if type(data.proteins) == dict:
        for protein in data.proteins:
            # accessions is always in the first element of the protein_info list
            protein_info = data.proteins[protein][0]
            if 'accessions' in protein_info: # always associated with 'source' = 'uniprot'
                uniprot_map[protein] = protein_info['accessions'] # this can be a list longer than 1

    # Extract kcat information
    # NOTE(review): assumes turnover_number is an iterable of dicts for every remaining column — confirm
    kcats = data.turnover_number
    for kcat in kcats:
        
        # ---------------------------------------------------------------------------------- 
        # Parse the comments to extract any relevant data
        
        # Both containers are filled in place by parse_comment
        org_ref_map = {}
        cosubstrate_list = []

        if ('comment' in kcat):
            # subcomments are separated by a semicolon
            comments = kcat['comment'].split('; ')
            for comment in comments:
                parse_comment(comment, org_ref_map, cosubstrate_list)

        clean_org_ref_map(org_ref_map)
        cosubstrate_list = np.unique(cosubstrate_list)
        
        # ----------------------------------------------------------------------------------
        # Parse the organism/reference combinations and leave only the unique entries
        # Remove reference info if they cannot be uniquely identified
        # A reference of '-' marks a record that is not tied to one specific reference

        n_orgs = len(kcat['organisms'])
        n_refs = len(kcat['references'])

        if (n_orgs == 1) and (n_refs == 1):
            org = kcat['organisms'][0] 
            ref = kcat['references'][0]
            # The comment named a pair other than the only one listed for this kcat
            if bool(org_ref_map) and not((org, ref) in org_ref_map):
                print("Unexpected organism/reference assignment")
            elif not(org_ref_map):
                org_ref_map[(org, ref)] = {'pH': np.nan, 'Temperature': np.nan, 'EnzymeType': [], 'UniProtID': np.nan, 'PubMedID': np.nan}
        elif (n_orgs == 1) and (n_refs > 1):
            org = kcat['organisms'][0] 
            if not(org_ref_map):
                org_ref_map[(org, '-')] = {'pH': np.nan, 'Temperature': np.nan, 'EnzymeType': [], 'UniProtID': np.nan, 'PubMedID': np.nan}
        elif (n_orgs > 1) and (n_refs == 1):
            ref = kcat['references'][0]
            for org in kcat['organisms']:
                if not((org, ref) in org_ref_map):
                    org_ref_map[(org, ref)] = {'pH': np.nan, 'Temperature': np.nan, 'EnzymeType': [], 'UniProtID': np.nan, 'PubMedID': np.nan}
        elif (n_orgs > 1) and (n_refs > 1):
            org_keys = [key[0] for key in org_ref_map.keys()]
            for org in kcat['organisms']:
                # add an entry if the organism is not already associated with some reference 
                if not(org in org_keys):
                    org_ref_map[(org, '-')] = {'pH': np.nan, 'Temperature': np.nan, 'EnzymeType': [], 'UniProtID': np.nan, 'PubMedID': np.nan}
        else:
            # Reached when the organism or reference list is empty
            print("Unexpected number of organisms and references") 

        # ----------------------------------------------------------------------------------
        # Parse the protein/organism list and retrieve the associated UniProtIDs
        # Note that protein and organism indices seem to be equivalent

        for org in kcat['organisms']:
            if org in uniprot_map:
                for key in org_ref_map.keys():
                    if key[0] == org:
                        org_ref_map[key]['UniProtID'] = uniprot_map[org]

        # ----------------------------------------------------------------------------------
        # Parse the reference list and retrieve the associated PubmedIDs
    
        for key in org_ref_map:
            # Records with the '-' placeholder have no single reference to look up
            if (key[1] != '-') and ('pmid' in data.references[key[1]]):
                org_ref_map[key]['PubMedID'] = data.references[key[1]]['pmid']
        
        # ----------------------------------------------------------------------------------
        # Retrieve the associated organisms

        for key in org_ref_map:
            org_ref_map[key]['Organism'] = data.organisms[key[0]]['value']

        # ----------------------------------------------------------------------------------
        # Extract data associated with each organism/reference combination
 
        # Values in data_map are shared by all records of this kcat entry
        data_map = {"ECNumber": ECNumber, "EnzymeName": data['name']}

        # kcat        
        # A single value is stored as startValue; a min/max range fills startValue and endValue
        if 'num_value' in kcat:
            data_map['parameter.startValue'] = kcat['num_value']
            data_map['parameter.endValue'] = np.nan
        elif 'min_value' in kcat:
            data_map['parameter.startValue'] = kcat['min_value']
            data_map['parameter.endValue'] = kcat['max_value']
        else:
            print("Inconsistent kcat value")

        data_map['parameter.standardDeviation'] = np.nan
        data_map['parameter.unit'] = 's^(-1)'

        # Substrate
        data_map['Substrate'] = kcat['value']        
        
        # ----------------------------------------------------------------------------------
        # Parse the reaction and cosubstrate lists to identify other co-substrates
         
        # Check if the reference and organism lists associated with kcat together with the kcat substrate have a unique hit in the reaction list
        reaction_list = []
        for reaction in data.reaction:
            
            # Skip reaction records that lack any of the fields needed for matching
            if not('organisms' in reaction) or not('references' in reaction) or not('educts' in reaction):
                #print(reaction)
                continue
            
            org_check = np.all([org in reaction['organisms'] for org in kcat['organisms']])
            ref_check = np.all([ref in reaction['references'] for ref in kcat['references']])
            substrate_check = kcat['value'] in reaction['educts']
            if org_check and ref_check and substrate_check:
                # Sorting makes educt lists comparable, so duplicates are stored once
                educts = list(np.sort(reaction['educts']))
                if not(educts in reaction_list):
                    reaction_list.append(educts)
        
        if len(reaction_list) == 0:
            # No matching reaction: fall back to the substrate named in the kcat entry
            data_map['Substrate'] = [kcat['value']]
        elif len(reaction_list) == 1:
            data_map['Substrate'] = reaction_list[0]
        elif (len(reaction_list) > 1) and (len(cosubstrate_list) > 0):
            # If the reaction cannot be uniquely assigned, try using the cosubstrates parsed from the comment to narrow it down
            # NOTE: in a handful of cases, individual org/ref combinations may have different cosubstrates associated with it (would require a bit of a rewrite, ignoring it currently)
            cosub_reaction_list = []
            for reaction in reaction_list:
                cosubstrate_check = np.all([cosubstrate in reaction for cosubstrate in cosubstrate_list])
                if cosubstrate_check:
                    cosub_reaction_list.append(reaction)
            if len(cosub_reaction_list) == 1:
                data_map['Substrate'] = cosub_reaction_list[0]
            else:
                # if cannot uniquely assign
                data_map['Substrate'] = reaction_list[0]
        else:
            # Several candidate reactions and no cosubstrate information to choose between them
            data_map['Substrate'] = [kcat['value']]

        data_map['Substrate'] = np.sort(data_map['Substrate'])

        # ----------------------------------------------------------------------------------
        # Append data entries
        
        for key in org_ref_map.keys():
            org_ref_map[key].update(data_map)
            rows_list.append(org_ref_map[key])

# From here on `df` holds the extracted records instead of the raw database
df = pd.DataFrame(rows_list)

In [None]:
# Display the extracted kcat records
df

In [None]:
# NOTE(review): presumably a scratch check of how np.sort orders substrate names — confirm whether this cell is still needed
np.sort(['Ethanol', 'NAD+'])

In [None]:
# Store PubMedIDs as nullable integers
df = df.assign(PubMedID=df['PubMedID'].astype('Int64'))
# Drop duplicate rows; rows are compared through their string representation
df = df.loc[df.astype(str).drop_duplicates().index]
len(df)

#### `parameter.endValue`

Count the $k_{cat}$ values that have an associated `endValue`, i.e. that were reported as a range.

In [None]:
# Number of rows with a defined endValue
df['parameter.endValue'].notnull().sum()

In [None]:
# Show the rows that have a defined endValue
df.loc[df['parameter.endValue'].notnull()]

Relatively few entries have an `endValue`, so we ignore it: discard every entry that has both `startValue` and `endValue` defined, then remove the `parameter.endValue` column entirely.

In [None]:
# Keep only the rows without an endValue, then drop the now-empty column
mask = df['parameter.endValue'].isnull()
df = df.loc[mask].drop(columns='parameter.endValue')
len(df)

#### `pH`

In [None]:
# Number of rows with a defined pH
df['pH'].notnull().sum()

#### `Temperature`

In [None]:
# Number of rows with a defined temperature
df['Temperature'].notnull().sum()

# Save dataset

In [None]:
# Shorten the parameter column names and drop the unit column
df = (
    df
    .rename(columns={
        'parameter.startValue': 'Value',
        'parameter.standardDeviation': 'StandardDeviation',
    })
    .drop(columns='parameter.unit')
)

In [None]:
# Keep only the exported columns, in their final order
df = df.loc[:, [
    'PubMedID',
    'Organism',
    'Substrate',
    'ECNumber',
    'EnzymeName',
    'EnzymeType',
    'UniProtID',
    'pH',
    'Temperature',
    'Value',
    'StandardDeviation',
]]

In [None]:
# Remove rows that became duplicates after the column selection, then renumber the index
mask = df.astype(str).drop_duplicates().index
df = df.loc[mask].reset_index(drop=True)
len(df)

In [None]:
# Display the final table before it is written to disk
df

In [None]:
# Output directory for the exported tables (same path the database was loaded from)
fdir = '../../../data/databases/Brenda/'

In [None]:
# Write the table as CSV without the row index
df.to_csv(f'{fdir}kcats.csv', index=False)

In [None]:
# Write the table as JSON without the row index.
# `index=False` is rejected with the default orient='columns' (pandas raises a
# ValueError), so the rows are written with orient='records', which omits the
# index by construction: the file holds a list with one object per row.
df.to_json(fdir+'kcats.json', orient='records')