## Load data

In [None]:
import pandas as pd
import numpy as np
import re
import csv

In [None]:
# Show the current working directory (presumably IPython's `%pwd` automagic);
# the relative data paths used below are resolved against it.
pwd

In [None]:
# Root folder of the kinetics databases, plus one sub-folder per source.
fdir = '../../data/databases/'
fdir_Sabio = f'{fdir}Sabio-RK/'
fdir_Brenda = f'{fdir}Brenda/'

In [None]:
# Read the kcat table exported from each database (Sabio first, then Brenda).
df_Sabio, df_Brenda = (
    pd.read_json(folder + 'kcats.json') for folder in (fdir_Sabio, fdir_Brenda)
)
# Store PubMedIDs with pandas' nullable integer dtype in both tables.
for _frame in (df_Sabio, df_Brenda):
    _frame['PubMedID'] = _frame['PubMedID'].astype('Int64')

In [None]:
# Number of raw entries per database (Sabio, then Brenda).
for _frame in (df_Sabio, df_Brenda):
    print(len(_frame))

In [None]:
# a few fixes to make the `Organism` entries more consistent between the two databases
# np.unique(df_Sabio['Organism'][df_Sabio['Organism'].apply(lambda x: 'subsp' in x)])
# np.unique(df_Brenda['Organism'][df_Brenda['Organism'].apply(lambda x: 'subsp' in x)])
# e.g. "acinetobacter calcoaceticus (subsp. anitratus)" -> "acinetobacter calcoaceticus subsp. anitratus"
def fix_org(org):
    """Normalise an organism name so both databases spell it the same way.

    * ``"genus species (subsp. x)"`` becomes ``"genus species subsp. x"``.
    * The annotations ``(nom. inval.)``, ``(nom. rej.)`` and ``(nom. dub.)``
      are removed.
    * Leading/trailing whitespace is stripped and inner runs of whitespace
      (e.g. left behind by a removed annotation) are collapsed to one space.
    * ``"bacillus megaterium"`` is renamed to ``"priestia megaterium"``.

    Args:
        org: Organism name (callers in this file lowercase it first).

    Returns:
        The cleaned name; an empty or blank input yields ``""``.
    """
    # Unwrap only the parenthesised subspecies itself: `[^)]*` stops at the
    # first closing parenthesis, so a later "(nom. ...)" annotation or any
    # trailing text is left intact instead of being swallowed or cut off.
    _org = re.sub(r' \((subsp[^)]*)\)', r' \1', org, count=1)
    for annotation in ("(nom. inval.)", "(nom. rej.)", "(nom. dub.)"):
        _org = _org.replace(annotation, "")
    # split()/join is safe on an empty string (indexing _org[0] was not) and
    # removes every stray space, not just a single leading/trailing one.
    _org = " ".join(_org.split())
    # Rename after trimming so an annotated "bacillus megaterium (nom. ...)"
    # is recognised as well.
    if _org == 'bacillus megaterium':
        _org = 'priestia megaterium'
    return _org

In [None]:
# Brenda and Sabio order 'Substrate' slightly differently because of uppercase/lowercase
# inconsistencies, so every name is lowercased and each list is sorted.
df_Brenda['Substrate'] = df_Brenda['Substrate'].apply(lambda x: sorted(substrate.lower() for substrate in x))
df_Sabio['Substrate'] = df_Sabio['Substrate'].apply(lambda x: sorted(substrate.lower() for substrate in x))
# Put 'ECNumber' for Brenda in lists (consistent with Sabio)
df_Brenda['ECNumber'] = df_Brenda['ECNumber'].apply(lambda x: [x])
# Change UniProtID `None` entries to empty lists [] (identity test: `== None` is not a reliable None check)
df_Brenda['UniProtID'] = df_Brenda['UniProtID'].apply(lambda x: [] if x is None else x)
# Lowercase 'Organism' just in case, then apply the naming fixes from fix_org
df_Brenda['Organism'] = df_Brenda['Organism'].apply(lambda x: fix_org(x.lower()))
df_Sabio['Organism'] = df_Sabio['Organism'].apply(lambda x: fix_org(x.lower()))

In [None]:
# Give every Brenda `EnzymeType` its own row (plenty of mutants have kcats identical to wildtype)
rows_list = []
for _, record in df_Brenda.iterrows():
    variants = record['EnzymeType']
    if variants == []:
        # No enzyme type recorded: keep the single row with an empty label.
        record['EnzymeType'] = ""
        rows_list.append(record)
        continue
    for variant in variants:
        expanded = record.copy()
        # Both "wild-type" and "wild type" are rewritten to "wildtype".
        expanded['EnzymeType'] = variant.replace("wild-type", "wildtype").replace("wild type", "wildtype")
        rows_list.append(expanded)

df_Brenda = pd.DataFrame(rows_list).reset_index(drop=True)
# Duplicates are detected on the string form of each row (presumably because
# the list-valued columns cannot be compared by drop_duplicates directly).
mask = df_Brenda.astype(str).drop_duplicates().index
df_Brenda = df_Brenda.loc[mask].reset_index(drop=True)
df_Brenda['PubMedID'] = df_Brenda['PubMedID'].astype('Int64')
len(df_Brenda)

In [None]:
# Make the substrate names consistent

# newline='' lets the csv module do its own newline handling, as its
# documentation requires for files passed to csv.reader.
with open('../../data/databases/substrate_synonym_map.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    # First column -> second column; blank lines (parsed as empty rows) are skipped.
    substrate_map = {row[0]: row[1] for row in reader if row}

# Replace every substrate by its mapped name, lowercased, and re-sort each list.
# A substrate that is missing from the map raises KeyError on purpose, so gaps
# in the synonym table are noticed instead of silently passing through.
df_Brenda['Substrate'] = df_Brenda['Substrate'].apply(lambda x: sorted(substrate_map[substrate].lower() for substrate in x))
df_Sabio['Substrate'] = df_Sabio['Substrate'].apply(lambda x: sorted(substrate_map[substrate].lower() for substrate in x))

In [None]:
# Entry counts after clean-up: Sabio, Brenda, then both together.
n_Sabio, n_Brenda = len(df_Sabio), len(df_Brenda)
print(n_Sabio)
print(n_Brenda)
print("Total number of entries:", n_Sabio + n_Brenda)

## Merge Sabio and Brenda entries with the same PubMedID

In [None]:
def compare_UniProtID(x_Brenda, x_Sabio):
    """Reconcile the 'UniProtID' lists of a Brenda entry and a Sabio entry.

    The lists agree when every ID of the shorter list also occurs in the
    longer one; when both have the same length, Sabio's IDs must all occur
    in Brenda's.

    Returns:
        The longer list (Brenda's on a tie) if the lists agree, else ``False``.
    """
    ids_brenda = x_Brenda['UniProtID']
    ids_sabio = x_Sabio['UniProtID']
    if len(ids_brenda) >= len(ids_sabio):
        larger, smaller = ids_brenda, ids_sabio
    else:
        larger, smaller = ids_sabio, ids_brenda
    if all(uid in larger for uid in smaller):
        return larger
    return False
        
def compare_Substrate(x_Brenda, x_Sabio):
    """Reconcile the 'Substrate' lists of a Brenda entry and a Sabio entry.

    The lists agree when every substrate of the shorter list also occurs in
    the longer one; when both have the same length, Sabio's substrates must
    all occur in Brenda's.

    Returns:
        The longer list (Brenda's on a tie) if the lists agree, else ``False``.
    """
    subs_brenda = x_Brenda['Substrate']
    subs_sabio = x_Sabio['Substrate']
    if len(subs_brenda) >= len(subs_sabio):
        larger, smaller = subs_brenda, subs_sabio
    else:
        larger, smaller = subs_sabio, subs_brenda
    if all(name in larger for name in smaller):
        return larger
    return False
        
def compare_pH(x_Brenda, x_Sabio):
    """Reconcile the 'pH' values of a Brenda entry and a Sabio entry.

    Returns:
        Sabio's value when Brenda's is NaN (this is NaN if both are missing),
        Brenda's value when only Sabio's is NaN or when both are equal, and
        ``False`` when the two databases report different pH values.
    """
    ph_brenda = x_Brenda['pH']
    ph_sabio = x_Sabio['pH']
    if np.isnan(ph_brenda):
        return ph_sabio
    if np.isnan(ph_sabio):
        return ph_brenda
    return ph_brenda if ph_brenda == ph_sabio else False
        
def compare_Temperature(x_Brenda, x_Sabio):
    """Reconcile the 'Temperature' values of a Brenda entry and a Sabio entry.

    Returns:
        Sabio's value when Brenda's is NaN (this is NaN if both are missing),
        Brenda's value when only Sabio's is NaN or when both are equal, and
        ``False`` when the two databases report different temperatures.
    """
    temp_brenda = x_Brenda['Temperature']
    temp_sabio = x_Sabio['Temperature']
    if np.isnan(temp_brenda):
        return temp_sabio
    if np.isnan(temp_sabio):
        return temp_brenda
    return temp_brenda if temp_brenda == temp_sabio else False
        
def compare_EnzymeType(x_Brenda, x_Sabio):
    """Reconcile the 'EnzymeType' labels of a Brenda entry and a Sabio entry.

    Returns:
        Sabio's label when both labels mention 'mutant' or both mention
        'wildtype'; ``False`` when one says wildtype and the other mutant;
        otherwise the non-empty label when exactly one of them is empty;
        ``False`` in every remaining case.
    """
    label_sabio = x_Sabio['EnzymeType']
    label_brenda = x_Brenda['EnzymeType']
    sabio_mutant, brenda_mutant = 'mutant' in label_sabio, 'mutant' in label_brenda
    sabio_wild, brenda_wild = 'wildtype' in label_sabio, 'wildtype' in label_brenda

    # Agreement is tested before disagreement, so a label that mentions both
    # words still merges when the other side shares one of them.
    if (sabio_mutant and brenda_mutant) or (sabio_wild and brenda_wild):
        return label_sabio
    if (sabio_wild and brenda_mutant) or (sabio_mutant and brenda_wild):
        return False
    # Neither pairing applies: fall back to whichever side carries information.
    if len(label_sabio) == 0 and len(label_brenda) > 0:
        return label_brenda
    if len(label_brenda) == 0 and len(label_sabio) > 0:
        return label_sabio
    return False

def similar_organism(org_Brenda, org_Sabio):
    """Tell whether two organism names denote the same species.

    Brenda usually has less detailed information, e.g. "lactococcus lactis"
    versus "lactococcus lactis subsp. cremoris". Names are similar when they
    are equal, or when the part of one name that precedes a "subsp." / "sp."
    marker occurs inside the other name. Brenda's name is inspected for a
    marker first; Sabio's only if Brenda's has none.

    Args:
        org_Brenda: Organism name from Brenda.
        org_Sabio: Organism name from Sabio.

    Returns:
        ``True`` if the names are considered the same organism, else ``False``.
    """
    if org_Brenda == org_Sabio:
        return True
    # The marker must be a separate word ending in a literal dot. An unescaped
    # 'sp.' also matches inside ordinary words ("aspergillus", "rhodospirillum"),
    # which truncated the name to an empty or arbitrary prefix and made
    # unrelated organisms look similar.
    marker = r'\s+(?:subsp|sp)\.'
    for detailed, other in ((org_Brenda, org_Sabio), (org_Sabio, org_Brenda)):
        found = re.search(marker, detailed)
        if found:
            species = detailed[:found.start()].strip()
            # An empty prefix would be "contained" in every name; reject it.
            return bool(species) and species in other
    return False

In [None]:
# PubMedIDs cited by both databases. Missing IDs are removed first: a missing
# value is not a shared reference and cannot be converted to a plain integer.
rinds = np.intersect1d(
    df_Sabio['PubMedID'].dropna().astype(int),
    df_Brenda['PubMedID'].dropna().astype(int),
)
len(rinds)

In [None]:
# Merge Sabio and Brenda entries that cite the same PubMedID.
#
# For every shared PubMedID, each Brenda entry is compared with the Sabio
# entries of the same EC number and a similar organism whose kcat value lies
# within `rtol_val`. Pairs whose UniProtID, Substrate, pH, Temperature and
# EnzymeType can all be reconciled are merged into one row (based on the Sabio
# row); everything else is carried over unchanged. All rows handled here end up
# in `rows_list` and are removed from `df_Sabio` / `df_Brenda`.
rows_list = []
rtol_val = 1e-2  # relative tolerance used when comparing kcat values
done_Sabio = []   # index labels of Sabio rows moved into rows_list
done_Brenda = []  # index labels of Brenda rows moved into rows_list
for rind in rinds:
    entries_Sabio = df_Sabio[df_Sabio['PubMedID'] == rind]
    entries_Brenda = df_Brenda[df_Brenda['PubMedID'] == rind]
    for label_Brenda in entries_Brenda.index:
        x_Brenda = entries_Brenda.loc[label_Brenda]
        org = x_Brenda['Organism']
        ecnumber = x_Brenda['ECNumber']
        same_ec = entries_Sabio['ECNumber'].apply(lambda x: x == ecnumber)
        same_org = entries_Sabio['Organism'].apply(lambda x: similar_organism(org, x))
        xs_Sabio = entries_Sabio[same_ec & same_org]
        inds = np.where(np.isclose(xs_Sabio['Value'], x_Brenda['Value'], rtol=rtol_val))[0]
        counter = 0
        for ind in xs_Sabio.index[inds]:
            x_Sabio = xs_Sabio.loc[ind]
            ret_UniProtID = compare_UniProtID(x_Brenda, x_Sabio)
            ret_Substrate = compare_Substrate(x_Brenda, x_Sabio)
            ret_pH = compare_pH(x_Brenda, x_Sabio)
            ret_Temperature = compare_Temperature(x_Brenda, x_Sabio)
            ret_EnzymeType = compare_EnzymeType(x_Brenda, x_Sabio)
            # The compare_* helpers signal a conflict with the literal False.
            # An identity test is required: `0.0 != False` is False in Python,
            # so a genuine pH or temperature of 0 would be taken for a conflict.
            if (isinstance(ret_UniProtID, list) and isinstance(ret_Substrate, list)
                    and ret_pH is not False and ret_Temperature is not False
                    and ret_EnzymeType is not False):
                row = x_Sabio.copy()
                row['UniProtID'] = ret_UniProtID
                row['Substrate'] = ret_Substrate
                row['pH'] = ret_pH
                row['Temperature'] = ret_Temperature
                row['EnzymeType'] = ret_EnzymeType
                rows_list.append(row)
                counter += 1
                done_Sabio.append(ind)
                # A merged Sabio row must not be matched by a later Brenda entry.
                entries_Sabio = entries_Sabio.drop(ind)
        if counter == 0:
            # Nothing to merge with: keep the Brenda entry as it is.
            rows_list.append(x_Brenda.copy())
        done_Brenda.append(label_Brenda)

    # Sabio entries of this PubMedID that were not merged are kept unchanged.
    for label_Sabio in entries_Sabio.index:
        rows_list.append(entries_Sabio.loc[label_Sabio].copy())
        done_Sabio.append(label_Sabio)

# Rows of different PubMedIDs never interact, so the processed rows can be
# removed in one go instead of copying the whole table once per row.
df_Sabio = df_Sabio.drop(index=done_Sabio)
df_Brenda = df_Brenda.drop(index=done_Brenda)

In [None]:
# Progress check: rows collected so far, then rows still left in Sabio and Brenda.
for _pending in (rows_list, df_Sabio, df_Brenda):
    print(len(_pending))

## Merge Brenda entries without PubMedID

In [None]:
# Remaining Brenda entries whose PubMedID is missing. Series.isna() handles
# the nullable Int64 column directly; np.isnan applied element-wise is not
# reliable on pd.NA (it propagates NA instead of returning True).
df_null_Brenda = df_Brenda[df_Brenda['PubMedID'].isna()]

In [None]:
# Display the Brenda entries that have no PubMedID.
df_null_Brenda

In [None]:
# Brenda entries without a PubMedID are compared with every remaining Sabio
# entry of the same EC number and a similar organism whose kcat value lies
# within `rtol_val`; reconcilable pairs are merged (based on the Sabio row),
# otherwise the Brenda entry is kept unchanged.
rtol_val = 1e-2  # relative tolerance used when comparing kcat values
for i in df_null_Brenda.index:
    x_Brenda = df_null_Brenda.loc[i]
    org = x_Brenda['Organism']
    ecnumber = x_Brenda['ECNumber']
    same_ec = df_Sabio['ECNumber'].apply(lambda x: x == ecnumber)
    same_org = df_Sabio['Organism'].apply(lambda x: similar_organism(org, x))
    xs_Sabio = df_Sabio[same_ec & same_org]
    inds = np.where(np.isclose(xs_Sabio['Value'], x_Brenda['Value'], rtol=rtol_val))[0]
    counter = 0
    for ind in xs_Sabio.index[inds]:
        x_Sabio = xs_Sabio.loc[ind]
        ret_UniProtID = compare_UniProtID(x_Brenda, x_Sabio)
        ret_Substrate = compare_Substrate(x_Brenda, x_Sabio)
        ret_pH = compare_pH(x_Brenda, x_Sabio)
        ret_Temperature = compare_Temperature(x_Brenda, x_Sabio)
        ret_EnzymeType = compare_EnzymeType(x_Brenda, x_Sabio)
        # The compare_* helpers signal a conflict with the literal False.
        # An identity test is required: `0.0 != False` is False in Python,
        # so a genuine pH or temperature of 0 would be taken for a conflict.
        if (isinstance(ret_UniProtID, list) and isinstance(ret_Substrate, list)
                and ret_pH is not False and ret_Temperature is not False
                and ret_EnzymeType is not False):
            row = x_Sabio.copy()
            row['UniProtID'] = ret_UniProtID
            row['Substrate'] = ret_Substrate
            row['pH'] = ret_pH
            row['Temperature'] = ret_Temperature
            row['EnzymeType'] = ret_EnzymeType
            rows_list.append(row)
            counter += 1
            # Dropped immediately so that the next Brenda entry cannot match
            # the same Sabio row again.
            df_Sabio = df_Sabio.drop(ind)
    if counter == 0:
        rows_list.append(x_Brenda.copy())

# Every entry of df_null_Brenda is now in rows_list (merged or as-is); df_Brenda
# is not read inside the loop, so the rows are removed in a single step.
df_Brenda = df_Brenda.drop(index=df_null_Brenda.index)

In [None]:
# Progress check: rows collected so far, then rows still left in Sabio and Brenda.
for _pending in (rows_list, df_Sabio, df_Brenda):
    print(len(_pending))

## Merge Sabio entries without PubMedID

In [None]:
# Remaining Sabio entries whose PubMedID is missing. Series.isna() handles
# the nullable Int64 column directly; np.isnan applied element-wise is not
# reliable on pd.NA (it propagates NA instead of returning True).
df_null_Sabio = df_Sabio[df_Sabio['PubMedID'].isna()]

In [None]:
# Display the Sabio entries that have no PubMedID.
df_null_Sabio

In [None]:
# Sabio entries without a PubMedID are compared with every remaining Brenda
# entry of the same EC number and a similar organism whose kcat value lies
# within `rtol_val`; reconcilable pairs are merged (based on the Sabio row),
# otherwise the Sabio entry is kept unchanged.
rtol_val = 1e-2  # relative tolerance used when comparing kcat values
for i in df_null_Sabio.index:
    x_Sabio = df_null_Sabio.loc[i]
    org = x_Sabio['Organism']
    ecnumber = x_Sabio['ECNumber']
    same_ec = df_Brenda['ECNumber'].apply(lambda x: x == ecnumber)
    # similar_organism expects (Brenda name, Sabio name): here `x` is the
    # Brenda organism and `org` the Sabio one. The two were passed the other
    # way round, which changes the result when both names carry a
    # "subsp."/"sp." marker.
    same_org = df_Brenda['Organism'].apply(lambda x: similar_organism(x, org))
    xs_Brenda = df_Brenda[same_ec & same_org]
    inds = np.where(np.isclose(xs_Brenda['Value'], x_Sabio['Value'], rtol=rtol_val))[0]
    counter = 0
    for ind in xs_Brenda.index[inds]:
        x_Brenda = xs_Brenda.loc[ind]
        ret_UniProtID = compare_UniProtID(x_Brenda, x_Sabio)
        ret_Substrate = compare_Substrate(x_Brenda, x_Sabio)
        ret_pH = compare_pH(x_Brenda, x_Sabio)
        ret_Temperature = compare_Temperature(x_Brenda, x_Sabio)
        ret_EnzymeType = compare_EnzymeType(x_Brenda, x_Sabio)
        # The compare_* helpers signal a conflict with the literal False.
        # An identity test is required: `0.0 != False` is False in Python,
        # so a genuine pH or temperature of 0 would be taken for a conflict.
        if (isinstance(ret_UniProtID, list) and isinstance(ret_Substrate, list)
                and ret_pH is not False and ret_Temperature is not False
                and ret_EnzymeType is not False):
            row = x_Sabio.copy()
            row['UniProtID'] = ret_UniProtID
            row['Substrate'] = ret_Substrate
            row['pH'] = ret_pH
            row['Temperature'] = ret_Temperature
            row['EnzymeType'] = ret_EnzymeType
            rows_list.append(row)
            counter += 1
            # Dropped immediately so that the next Sabio entry cannot match
            # the same Brenda row again.
            df_Brenda = df_Brenda.drop(ind)
    if counter == 0:
        rows_list.append(x_Sabio.copy())

# Every entry of df_null_Sabio is now in rows_list (merged or as-is); df_Sabio
# is not read inside the loop, so the rows are removed in a single step.
df_Sabio = df_Sabio.drop(index=df_null_Sabio.index)

In [None]:
# Progress check: rows collected so far, then rows still left in Sabio and Brenda.
for _pending in (rows_list, df_Sabio, df_Brenda):
    print(len(_pending))

## Final merge

In [None]:
# Turn the rows gathered in rows_list by the merge loops (merged pairs as well
# as entries carried over unchanged) into one table.
df_dup = pd.DataFrame(rows_list)

In [None]:
# Display the table built from rows_list.
df_dup

In [None]:
# Stack what is left of both databases on top of the rows in df_dup.
df = pd.concat([df_Sabio, df_Brenda, df_dup]).reset_index(drop=True)
# Keep the first occurrence of each row; rows are compared on their string form.
mask = df.astype(str).drop_duplicates().index
df = df.loc[mask].reset_index(drop=True)
len(df)

In [None]:
# Sizes of the three parts that were concatenated, followed by their sum.
_sizes = [len(_part) for _part in (df_dup, df_Brenda, df_Sabio)]
for _size in _sizes:
    print(_size)
sum(_sizes)

In [None]:
# Without the PubMedID column more rows become identical; deduplicate again,
# comparing rows on their string form and keeping the first occurrence.
df = df.drop(columns=['PubMedID'])
mask = df.astype(str).drop_duplicates().index
df = df.loc[mask].reset_index(drop=True)
len(df)

In [None]:
# Display the merged table after the PubMedID column was dropped.
df

In [None]:
# Order the rows by 'ECNumber'. NOTE(review): Brenda's EC numbers were wrapped
# in lists earlier to match Sabio, so this presumably sorts list values - confirm.
df = df.sort_values(by=['ECNumber'])

In [None]:
# Display the 'Value' column in ascending order (quick look at its range).
np.sort(df['Value'])

In [None]:
# Display the merged table, now ordered by 'ECNumber'.
df

In [None]:
# Write the merged table to the databases folder as CSV, without the index column.
df.to_csv(fdir+'kcats_merged.csv', index=False)

In [None]:
# Write the merged table to the databases folder as JSON. `index=False` is not
# supported by DataFrame.to_json for its default 'columns' orient (depending on
# the pandas version it is rejected with a ValueError or has no effect), so the
# index left over from sorting is replaced by a clean 0..n-1 range instead.
df.reset_index(drop=True).to_json(fdir+'kcats_merged.json')