In [46]:
import pandas as pd
import numpy as np
from os.path import join
import os
import pickle
import pubchempy as pcp
from rdkit import Chem
from scipy.stats import gmean
from rdkit.Chem import Crippen
from rdkit.Chem import Descriptors 
from bioservices import *
from data_preprocessing import *

## Script to generate ESM1b enzyme representations, ChemBERTa2 substrate representations, rxnfp reaction fingerprints and to split the dataset in training and test sets, and the training set in 5-fold for cross-validation (CV)

### BRENDA Dataset

In [47]:
df_BRENDA = pd.read_pickle(join("df_kcat_km_cp3.pkl"))

In [48]:
df_BRENDA

Unnamed: 0,EC,substrate,Organism,Uniprot,kcat_km,KEGG ID
0,2.8.4.2,arsenate,Corynebacterium glutamicum,P0DKS7,3.8,C11215
1,2.8.4.2,arsenate,Corynebacterium glutamicum,P0DKS7,3.4,C11215
2,2.7.7.2,ctp,Methanocaldococcus jannaschii,Q58579,0.006,C00063
3,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,0.16,C00002
4,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,0.16,D08646
...,...,...,...,...,...,...
12842,2.7.1.91,atp,Homo sapiens,Q9NRA0,6.3,C00002
12843,2.7.1.91,atp,Homo sapiens,Q9NRA0,6.3,D08646
12844,2.7.1.91,atp,Homo sapiens,Q9NRA0,6.3,D02300
12845,3.7.1.13,"2-hydroxy-6-oxo-6-phenylhexa-2,4-dienoate",Janthinobacterium sp. J3,Q84II3,730.0,C01273


Remove duplicates and incomplete entries

In [49]:
df_BRENDA = df_BRENDA.drop_duplicates(subset = ["EC", "Uniprot", "kcat_km", "KEGG ID"])
df_BRENDA = df_BRENDA.reset_index(drop = True)

df_BRENDA = df_BRENDA.loc[~pd.isnull(df_BRENDA["Uniprot"])]
df_BRENDA = df_BRENDA.loc[~pd.isnull(df_BRENDA["kcat_km"])]
df_BRENDA

Unnamed: 0,EC,substrate,Organism,Uniprot,kcat_km,KEGG ID
0,2.8.4.2,arsenate,Corynebacterium glutamicum,P0DKS7,3.8,C11215
1,2.8.4.2,arsenate,Corynebacterium glutamicum,P0DKS7,3.4,C11215
2,2.7.7.2,ctp,Methanocaldococcus jannaschii,Q58579,0.006,C00063
3,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,0.16,C00002
4,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,0.16,D08646
...,...,...,...,...,...,...
9264,2.7.1.91,atp,Homo sapiens,Q9NRA0,6.3,C00002
9265,2.7.1.91,atp,Homo sapiens,Q9NRA0,6.3,D08646
9266,2.7.1.91,atp,Homo sapiens,Q9NRA0,6.3,D02300
9267,3.7.1.13,"2-hydroxy-6-oxo-6-phenylhexa-2,4-dienoate",Janthinobacterium sp. J3,Q84II3,730.0,C01273


Merge and calculate the geometric mean of kcat/km values for entries with same EC + same Uniprot ID + same KEGG ID

In [50]:
def geometric_mean(x):
    return x  

# get kcat/km values for entries with same EC + same Uniprot ID + same KEGG ID
geometric_means_values = (df_BRENDA.groupby(["EC", "Uniprot", "KEGG ID"], as_index=False).agg({"kcat_km": geometric_mean}))

# merge dataframes
result_df = pd.merge(df_BRENDA, geometric_means_values, on= ["EC", "Uniprot", "KEGG ID"], how= "left")
result_df

Unnamed: 0,EC,substrate,Organism,Uniprot,kcat_km_x,KEGG ID,kcat_km_y
0,2.8.4.2,arsenate,Corynebacterium glutamicum,P0DKS7,3.8,C11215,"[3.8, 3.4]"
1,2.8.4.2,arsenate,Corynebacterium glutamicum,P0DKS7,3.4,C11215,"[3.8, 3.4]"
2,2.7.7.2,ctp,Methanocaldococcus jannaschii,Q58579,0.006,C00063,0.006
3,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,0.16,C00002,0.16
4,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,0.16,D08646,0.16
...,...,...,...,...,...,...,...
9264,2.7.1.91,atp,Homo sapiens,Q9NRA0,6.3,C00002,6.3
9265,2.7.1.91,atp,Homo sapiens,Q9NRA0,6.3,D08646,6.3
9266,2.7.1.91,atp,Homo sapiens,Q9NRA0,6.3,D02300,6.3
9267,3.7.1.13,"2-hydroxy-6-oxo-6-phenylhexa-2,4-dienoate",Janthinobacterium sp. J3,Q84II3,730.0,C01273,730.0


In [51]:
# convert to list
result_df["kcat_km_y"] = result_df["kcat_km_y"].apply(lambda x: list(x) if isinstance(x, (list, np.ndarray)) else [x])

# keep track of how many kcat/km values are merged
result_df["kcat_km_count"] = result_df["kcat_km_y"].apply(len)

result_df.drop(columns=["kcat_km_x"], inplace=True)

result_df = result_df.rename(columns={"kcat_km_y":"kcat_km"})

# calculate geometric mean
def calculate_geometric_mean(value):
    if isinstance(value, list):
        return gmean(value)
    return value  

result_df["kcat_km_gmean"] = result_df["kcat_km"].apply(calculate_geometric_mean)

result_df = result_df.drop_duplicates(subset = ["EC", "Uniprot","KEGG ID", "kcat_km_gmean"])
result_df.reset_index(drop=True, inplace=True)
result_df

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km,kcat_km_count,kcat_km_gmean
0,2.8.4.2,arsenate,Corynebacterium glutamicum,P0DKS7,C11215,"[3.8, 3.4]",2,3.59444
1,2.7.7.2,ctp,Methanocaldococcus jannaschii,Q58579,C00063,[0.006],1,0.00600
2,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16000
3,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,D08646,[0.16],1,0.16000
4,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,D02300,[0.16],1,0.16000
...,...,...,...,...,...,...,...,...
7661,2.7.1.91,atp,Homo sapiens,Q9NRA0,C00002,[6.3],1,6.30000
7662,2.7.1.91,atp,Homo sapiens,Q9NRA0,D08646,[6.3],1,6.30000
7663,2.7.1.91,atp,Homo sapiens,Q9NRA0,D02300,[6.3],1,6.30000
7664,3.7.1.13,"2-hydroxy-6-oxo-6-phenylhexa-2,4-dienoate",Janthinobacterium sp. J3,Q84II3,C01273,[730.0],1,730.00000


In [52]:
result_df.to_pickle(join("files","kcat_km_gmean.pkl"))

In [53]:
result_df = result_df.rename(columns={"kcat_km":"kcat_km_values"})
result_df = result_df.rename(columns={"kcat_km_gmean":"kcat_km"})
df_BRENDA = result_df
df_BRENDA

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km
0,2.8.4.2,arsenate,Corynebacterium glutamicum,P0DKS7,C11215,"[3.8, 3.4]",2,3.59444
1,2.7.7.2,ctp,Methanocaldococcus jannaschii,Q58579,C00063,[0.006],1,0.00600
2,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16000
3,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,D08646,[0.16],1,0.16000
4,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,D02300,[0.16],1,0.16000
...,...,...,...,...,...,...,...,...
7661,2.7.1.91,atp,Homo sapiens,Q9NRA0,C00002,[6.3],1,6.30000
7662,2.7.1.91,atp,Homo sapiens,Q9NRA0,D08646,[6.3],1,6.30000
7663,2.7.1.91,atp,Homo sapiens,Q9NRA0,D02300,[6.3],1,6.30000
7664,3.7.1.13,"2-hydroxy-6-oxo-6-phenylhexa-2,4-dienoate",Janthinobacterium sp. J3,Q84II3,C01273,[730.0],1,730.00000


Download amino acid sequences for all data points

In [60]:
# save all unique  Uniprot IDs
IDs = list(set(df_BRENDA["Uniprot"]))
file = open(join("esm1b_UID_sequence_mapping", "UNIPROT_IDs.txt"), "w")

for ID in list(IDs):
    file.write(str(ID) + "\n")
file.close()

Map Uniprot IDs to amino acid sequence

In [61]:
# map from https://www.uniprot.org/id-mapping uniprot IDs to amino acid sequences
sequence_df = pd.read_csv(join("esm1b_UID_sequence_mapping", "uid_mapping.tsv"), sep = "\t")

sequence_df.drop(columns = ["From"], inplace = True)
sequence_df.rename(columns={"Entry": "Uniprot"}, inplace=True)

sequence_df["Sequence ID"] = "Sequence_" + (sequence_df.index).astype(str)

droplist = []

# delete sequences with less than 40 residues
for ind in sequence_df.index:
    if len(sequence_df.loc[ind , "Sequence"]) < 40:
        droplist.append(ind)

sequence_df = sequence_df.drop(droplist)
sequence_df

Unnamed: 0,Uniprot,Sequence,Sequence ID
0,A0A023DFE8,MANVIKARPKLYVMDNGRMRMDKNWMIAMHNPATIHNPNAQTEFVE...,Sequence_0
1,A0A023H437,NTLANHGFLPRNGRNISVPMIVKAGFEGYNVQSDILITAGKVGMLT...,Sequence_1
2,A0A023J5W7,ANTSYVDYNVEANPDLYPLCVETIPLSFPDCQNGPLRSHLICDESA...,Sequence_2
3,A0A023MIF8,MITMAPHQNQFLLFIGVSLVLLSSYATANNSFNRSAFPDDFIFGAS...,Sequence_3
4,A0A059WZ16,MGQNMEIDNFLKIERLAENDLPKFIQLIRLFEAVFEMKNFSIPDSE...,Sequence_4
...,...,...,...
2484,W8VZ54,MTSYDYDYVVIGGGSGGMASSKEAARLGARVALFDFVKPSTQGTKW...,Sequence_2484
2485,W8X9R6,MANPKSEYDVIIVGGGLNGLATGAYLQKAGLSVGIFERRDESGTFC...,Sequence_2485
2486,X0JT48,MLGKVALEEAFALPRHKERTRWWAGLFAIDPDKHAAEINDITEQRI...,Sequence_2486
2487,X5JA14,MTLTTSRLGITNSLFVALLGVNTYTLASVLIPHSTLTLTNQLIAPD...,Sequence_2487


Remove amino acids after the 1022nd position - *ESM1b model operates with a maximum of 1024 tokens*

In [62]:
def adjust_sequence(seq):
    if len(seq) > 1022:
        return seq[:1022]  
    else:
        return seq  

sequence_df["Seq"] = sequence_df["Sequence"].apply(adjust_sequence)
sequence_df = sequence_df.drop(columns=["Sequence"])
sequence_df

Unnamed: 0,Uniprot,Sequence ID,Seq
0,A0A023DFE8,Sequence_0,MANVIKARPKLYVMDNGRMRMDKNWMIAMHNPATIHNPNAQTEFVE...
1,A0A023H437,Sequence_1,NTLANHGFLPRNGRNISVPMIVKAGFEGYNVQSDILITAGKVGMLT...
2,A0A023J5W7,Sequence_2,ANTSYVDYNVEANPDLYPLCVETIPLSFPDCQNGPLRSHLICDESA...
3,A0A023MIF8,Sequence_3,MITMAPHQNQFLLFIGVSLVLLSSYATANNSFNRSAFPDDFIFGAS...
4,A0A059WZ16,Sequence_4,MGQNMEIDNFLKIERLAENDLPKFIQLIRLFEAVFEMKNFSIPDSE...
...,...,...,...
2484,W8VZ54,Sequence_2484,MTSYDYDYVVIGGGSGGMASSKEAARLGARVALFDFVKPSTQGTKW...
2485,W8X9R6,Sequence_2485,MANPKSEYDVIIVGGGLNGLATGAYLQKAGLSVGIFERRDESGTFC...
2486,X0JT48,Sequence_2486,MLGKVALEEAFALPRHKERTRWWAGLFAIDPDKHAAEINDITEQRI...
2487,X5JA14,Sequence_2487,MTLTTSRLGITNSLFVALLGVNTYTLASVLIPHSTLTLTNQLIAPD...


In [57]:
df_BRENDA = df_BRENDA.merge(sequence_df, how = "left", on = "Uniprot")
df_BRENDA

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq
0,2.8.4.2,arsenate,Corynebacterium glutamicum,P0DKS7,C11215,"[3.8, 3.4]",2,3.59444,Sequence_1168,MKSVLFVCVGNGGKSQMAAALAQKYASDSVEIHSAGTKPAQGLNQL...
1,2.7.7.2,ctp,Methanocaldococcus jannaschii,Q58579,C00063,[0.006],1,0.00600,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...
2,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16000,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...
3,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,D08646,[0.16],1,0.16000,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...
4,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,D02300,[0.16],1,0.16000,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...
...,...,...,...,...,...,...,...,...,...,...
7661,2.7.1.91,atp,Homo sapiens,Q9NRA0,C00002,[6.3],1,6.30000,Sequence_2304,MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...
7662,2.7.1.91,atp,Homo sapiens,Q9NRA0,D08646,[6.3],1,6.30000,Sequence_2304,MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...
7663,2.7.1.91,atp,Homo sapiens,Q9NRA0,D02300,[6.3],1,6.30000,Sequence_2304,MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...
7664,3.7.1.13,"2-hydroxy-6-oxo-6-phenylhexa-2,4-dienoate",Janthinobacterium sp. J3,Q84II3,C01273,[730.0],1,730.00000,Sequence_1971,MLNKAEQISEKSERAFVERFVNAGGVETRYLEAGKGQPVILIHGGG...


In [58]:
# 2489 active entries and 30 obsolete entries are found from Uniprot
print(f"Number of different amino acid sequences in the dataset: {len(set(df_BRENDA['Seq']))}")

Number of different amino acid sequences in the dataset: 2465


Calculating enzyme representations with ESM1b model

In [None]:
import torch
import esm

# load ESM-1b model
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
batch_converter = alphabet.get_batch_converter()
model.eval()  


sequence_representations = []

for index, row in sequence_df.iterrows():
    sequence_id = row["Sequence ID"]  
    sequence = row["Seq"]

    batch_labels, batch_strs, batch_tokens = batch_converter([(sequence_id, sequence)])
    
    # extract per-residue representations
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[33], return_contacts=True)
    token_representations = results["representations"][33]  

    # generate per-sequence representations via averaging
    sequence_len = (batch_tokens != alphabet.padding_idx).sum(1).item()  
    sequence_representation = token_representations[0, 1:sequence_len - 1].mean(0) 
    
    sequence_representations.append(sequence_representation)

    print(f"Processed: {sequence_id}")

sequence_df["ESM1b"] = sequence_representations
sequence_df

In [None]:
def tensor_to_list(tensor_sequence):
   return [tensor.item() for tensor in tensor_sequence]

sequence_df["ESM1b"] = sequence_df["ESM1b"].apply(tensor_to_list)

df_BRENDA["ESM1b"] = sequence_df["ESM1b"]
df = df_BRENDA.loc[~pd.isnull(df_BRENDA["ESM1b"])]

In [63]:
# embeddings created on HPC
df = pd.read_pickle("Sequence_dataframe_ESM1b.pkl")
df

Unnamed: 0,Uniprot,Sequence ID,Seq,ESM1b
0,A0A023DFE8,Sequence_0,MANVIKARPKLYVMDNGRMRMDKNWMIAMHNPATIHNPNAQTEFVE...,"[-0.0716533362865448, 0.19521984457969666, -0...."
1,A0A023H437,Sequence_1,NTLANHGFLPRNGRNISVPMIVKAGFEGYNVQSDILITAGKVGMLT...,"[0.21069276332855225, 0.21327118575572968, 0.0..."
2,A0A023J5W7,Sequence_2,ANTSYVDYNVEANPDLYPLCVETIPLSFPDCQNGPLRSHLICDESA...,"[0.00614246167242527, 0.26790952682495117, -0...."
3,A0A023MIF8,Sequence_3,MITMAPHQNQFLLFIGVSLVLLSSYATANNSFNRSAFPDDFIFGAS...,"[-0.09554827213287354, 0.2854669690132141, -0...."
4,A0A059WZ16,Sequence_4,MGQNMEIDNFLKIERLAENDLPKFIQLIRLFEAVFEMKNFSIPDSE...,"[0.060367342084646225, 0.01692936383187771, -0..."
...,...,...,...,...
2484,W8VZ54,Sequence_2484,MTSYDYDYVVIGGGSGGMASSKEAARLGARVALFDFVKPSTQGTKW...,"[-0.10296626389026642, 0.26446282863616943, 0...."
2485,W8X9R6,Sequence_2485,MANPKSEYDVIIVGGGLNGLATGAYLQKAGLSVGIFERRDESGTFC...,"[0.09363408386707306, 0.25524449348449707, 0.0..."
2486,X0JT48,Sequence_2486,MLGKVALEEAFALPRHKERTRWWAGLFAIDPDKHAAEINDITEQRI...,"[-0.0684085488319397, 0.15891578793525696, 0.0..."
2487,X5JA14,Sequence_2487,MTLTTSRLGITNSLFVALLGVNTYTLASVLIPHSTLTLTNQLIAPD...,"[0.0869896188378334, 0.3494809865951538, -0.04..."


In [64]:
df_BRENDA = pd.merge(df_BRENDA, df[["Seq", "ESM1b"]], on ="Seq", how = "left")
df_BRENDA = df_BRENDA.dropna()
df_BRENDA.head()

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b
0,2.8.4.2,arsenate,Corynebacterium glutamicum,P0DKS7,C11215,"[3.8, 3.4]",2,3.59444,Sequence_1168,MKSVLFVCVGNGGKSQMAAALAQKYASDSVEIHSAGTKPAQGLNQL...,"[-0.08632251620292664, 0.12142688035964966, -0..."
1,2.7.7.2,ctp,Methanocaldococcus jannaschii,Q58579,C00063,[0.006],1,0.006,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -..."
2,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -..."
3,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,D08646,[0.16],1,0.16,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -..."
4,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,D02300,[0.16],1,0.16,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -..."


### Calculating chemical reaction fingerprints

Map reactions kegg IDs

In [208]:
# create a data frame of KEGG IDs of substrates
drugs_df = pd.read_pickle(join("kegg","KEGG_drugs_df.pkl"))
compounds_df = pd.read_pickle(join("kegg", "KEGG_substrate_df.pkl"))
KEGG_substrate_df = pd.concat([compounds_df, drugs_df]).reset_index(drop=True)
KEGG_substrate_df.head()

Unnamed: 0,KEGG ID,substrate
0,C00001,H2O
1,C00001,Water
2,C00002,ATP
3,C00002,Adenosine 5'-triphosphate
4,C00003,NAD+


In [210]:
KEGG_substrate_df["substrate"] = [name.lower() for name in KEGG_substrate_df["substrate"]]
df_BRENDA["substrate"] = [name.lower() for name in df_BRENDA["substrate"]]

In [65]:
# create a data frame with KEGG IDs of reactions 
KEGG_reaction_df = pd.read_pickle(join("kegg", "KEGG_reaction_df.pkl"))
KEGG_reaction_df.head()

Unnamed: 0,EC number,KEGG reaction ID,substrates left,substrates right,KEGG IDs left,KEGG IDs right
0,1.1.1.1,R00623,Primary alcohol + NAD+,Aldehyde + NADH + H+,"[C00226, C00003]","[C00071, C00004, C00080]"
1,1.1.1.1,R00624,Secondary alcohol + NAD+,Ketone + NADH + H+,"[C01612, C00003]","[C01450, C00004, C00080]"
2,1.1.1.1,R00754,Ethanol + NAD+,Acetaldehyde + NADH + H+,"[C00469, C00003]","[C00084, C00004, C00080]"
3,1.1.1.1,R02124,Retinol + NAD+,Retinal + NADH + H+,"[C00473, C00003]","[C00376, C00004, C00080]"
4,1.1.1.1,R02878,1-Octanol + NAD+,1-Octanal + NADH + H+,"[C00756, C00003]","[C01545, C00004, C00080]"


In [66]:
def map_BRENDA_entry_to_KEGG_reaction_ID(entry, KEGG_reaction_df):

    # get ec number and KEGG ID of substrate
    ec = entry["EC"]
    KEGG_ID = entry["KEGG ID"]

    # save all reaction IDs from KEGG with direction in a list:
    reaction_ids = []

    # only search if a KEGG ID was found
    if not pd.isnull(KEGG_ID):

        # take subset of KEGG database with reactions with fitting EC number
        reaction_df = KEGG_reaction_df.loc[KEGG_reaction_df["EC number"] == ec]

        # iterate overall entries with fitting EC numbers
        for k in reaction_df.index:
            reaction_entry = reaction_df.loc[k]
            
            # get KEGG IDs of substrates on left and right side of reaction equation:
            left = reaction_entry["KEGG IDs left"]
            right = reaction_entry["KEGG IDs right"]
            if KEGG_ID in left:
                reaction_ids.append(reaction_entry["KEGG reaction ID"] + "_f")
            if KEGG_ID in right:
                reaction_ids.append(reaction_entry["KEGG reaction ID"] + "_b")
                
    if reaction_ids == []:
        return(None)
    else:
        return(reaction_ids)

In [67]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

df_BRENDA["KEGG reaction ID"] = np.nan
for ind in df_BRENDA.index:
    reaction_ids = map_BRENDA_entry_to_KEGG_reaction_ID(df_BRENDA.loc[ind], KEGG_reaction_df)
    if not reaction_ids is None:
        df_BRENDA["KEGG reaction ID"][ind] = reaction_ids

In [68]:
n_old = len(df_BRENDA)
brenda_df = df_BRENDA.loc[~pd.isnull(df_BRENDA["KEGG reaction ID"])]
print("We remove %s out of %s data points, because we couldn't find a reaction ID for these data points." 
      % (n_old - len(brenda_df), n_old))

We remove 4297 out of 7588 data points, because we couldn't find a reaction ID for these data points.


In [69]:
brenda_df = brenda_df.reset_index(drop = True)
brenda_df.to_pickle(join("files", "brenda_df.pkl"))
len(brenda_df)

3291

In [70]:
brenda_df.head()

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b,KEGG reaction ID
0,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f]
1,2.7.7.2,fmn,Methanocaldococcus jannaschii,Q58579,C00061,[0.064],1,0.064,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f]
2,2.7.7.2,fmn,Corynebacterium ammoniagenes,Q59263,C00061,[100.0],1,100.0,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f]
3,2.7.7.2,atp,Corynebacterium ammoniagenes,Q59263,C00002,[15.0],1,15.0,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f]
4,1.3.1.105,nadph,Fragaria vesca,O23939,C00005,[150.0],1,150.0,Sequence_964,MAAAPSESIPSVNKAWVXSEYGKTSDVLKFDPSVAVPEIKEDQVLI...,"[0.01972285658121109, 0.385234534740448, -0.03...",[R10593_b]


Get KEGG IDs for left and right side of each reaction

In [71]:
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# create columns where to store reactions KEGG IDs
brenda_df["KEGG IDs left"] = ""
brenda_df["KEGG IDs right"] = ""

for ind in brenda_df.index:
    
    # extract KEGG ID
    reaction_id = brenda_df["KEGG reaction ID"][ind]
    reaction_id = reaction_id[0]
    
    if not pd.isnull(reaction_id):
        # if _f is found in the KEGG ID, the reaction is proceeding forward and the KEGG ID is stored in the substrates
        if reaction_id[-2:] == "_f":
            reaction_id = reaction_id[:-2]
            reaction_side = "left"
        
        # if _b is found in the KEGG ID, the reaction is proceeding backwards and the KEGG ID is stored in the products
        elif reaction_id[-2:] == "_b":
            reaction_id = reaction_id[:-2]
            reaction_side = "right"
        else:
            reaction_side = "left"

        reaction = KEGG_reaction_df[KEGG_reaction_df["KEGG reaction ID"] == reaction_id]
        try:
            if reaction_side == "left":
                brenda_df["KEGG IDs left"][ind] = reaction.iloc[0]["KEGG IDs left"]
                brenda_df["KEGG IDs right"][ind] = reaction.iloc[0]["KEGG IDs right"]
            else:
                brenda_df["KEGG IDs left"][ind] = reaction.iloc[0]["KEGG IDs right"]
                brenda_df["KEGG IDs right"][ind] = reaction.iloc[0]["KEGG IDs left"]
        except: 
            print(ind, reaction_id)

In [72]:
brenda_df.head()

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b,KEGG reaction ID,KEGG IDs left,KEGG IDs right
0,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]"
1,2.7.7.2,fmn,Methanocaldococcus jannaschii,Q58579,C00061,[0.064],1,0.064,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]"
2,2.7.7.2,fmn,Corynebacterium ammoniagenes,Q59263,C00061,[100.0],1,100.0,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]"
3,2.7.7.2,atp,Corynebacterium ammoniagenes,Q59263,C00002,[15.0],1,15.0,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]"
4,1.3.1.105,nadph,Fragaria vesca,O23939,C00005,[150.0],1,150.0,Sequence_964,MAAAPSESIPSVNKAWVXSEYGKTSDVLKFDPSVAVPEIKEDQVLI...,"[0.01972285658121109, 0.385234534740448, -0.03...",[R10593_b],"[C20718, C00005, C00080]","[C20717, C00006]"


Get metabolites InChIs 

In [78]:
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

kegg_con = KEGG()
chebi_con = ChEBI()

# list of metabolite's KEGG IDs
met_IDs = []

for ind in brenda_df.index:
    sub_IDs, pro_IDs = brenda_df["KEGG IDs left"][ind], brenda_df["KEGG IDs right"][ind]
    if sub_IDs != "" and pro_IDs != "" :
        try:
            met_IDs = met_IDs + sub_IDs + pro_IDs
        except TypeError:
            pass
    
# map the respective InChI for each unique metabolite 
df_metabolites = pd.DataFrame(data = {"metabolite ID": list(set(met_IDs))})
df_metabolites = df_metabolites.loc[df_metabolites["metabolite ID"] != ""]
df_metabolites["InChI"] = np.nan

# map InChIs from bioservices
for ind in df_metabolites.index:
    met = df_metabolites["metabolite ID"][ind]
    if met[0:5] == "InChI":
        df_metabolites["InChI"][ind] = met
    else:
        try:
            kegg_entry = kegg_con.parse(kegg_con.get(met))
            chebi_entry = chebi_con.getCompleteEntity("CHEBI:" + kegg_entry["DBLINKS"]["ChEBI"])
            df_metabolites["InChI"][ind] = chebi_entry.inchi
        except:
            pass
        
df_metabolites.head()



Unnamed: 0,metabolite ID,InChI
0,C00133,"InChI=1S/C3H7NO2/c1-2(4)3(5)6/h2H,4H2,1H3,(H,5..."
1,C00328,InChI=1S/C10H12N2O3/c11-7-4-2-1-3-6(7)9(13)5-8...
2,C20957,
3,C00673,
4,C06010,"InChI=1S/C5H8O4/c1-3(6)5(2,9)4(7)8/h9H,1-2H3,(..."


In [80]:
df_metabolites.to_pickle(join("files", "df_metabolites_reactions.pkl"))

In [73]:
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

df_metabolites = pd.read_pickle(join("files", "df_metabolites_reactions.pkl"))

# map InChIs from mol files
for ind in df_metabolites.index:
    if pd.isnull(df_metabolites["InChI"][ind]):
        try:
            mol = Chem.MolFromMolFile(join("mol-files", df_metabolites["metabolite ID"][ind] + '.mol'))
            df_metabolites["InChI"][ind] = Chem.MolToInchi(mol)
        except:
            pass

# remove not mapped metabolites
df_metabolites = df_metabolites.loc[~pd.isnull(df_metabolites["InChI"])]

[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *


[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *

[19:38:55] ERROR: Unknown element(s): *



[19:38:56] ERROR: Unknown element(s): *

[19:38:56] ERROR: Unknown element(s): *

[19:38:56] ERROR: Unsupported in this mode element '*'

[19:38:56] ERROR: Unknown element(s): *

[19:38:56] ERROR: Unknown element(s): *


[19:38:56] ERROR: Unsupported in this mode element '*'

[19:38:56] ERROR: Unsupported in this mode element '*'

[19:38:56] ERROR: Unknown element(s): *

[19:38:56] ERROR: Unknown element(s): *

[19:38:56] ERROR: Unknown element(s): *


[19:38:

In [74]:
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# create columns where to insert mapped InChIs
brenda_df["substrate_InChI_set"] = ""
brenda_df["product_InChI_set"] = ""

for ind in brenda_df.index:
    sub_IDs, pro_IDs = brenda_df["KEGG IDs left"][ind], brenda_df["KEGG IDs right"][ind]
    
    # map KEGG IDs to InChIs
    try:
        sub_inchis = []
        pro_inchis = []
        for sub in sub_IDs:
            inchi = list(df_metabolites["InChI"].loc[df_metabolites["metabolite ID"] == sub])[0]
            sub_inchis.append(inchi)
        for pro in pro_IDs:
            inchi = list(df_metabolites["InChI"].loc[df_metabolites["metabolite ID"] == pro])[0]
            pro_inchis.append(inchi)

        brenda_df["substrate_InChI_set"][ind] = set(sub_inchis)
        brenda_df["product_InChI_set"][ind] = set(pro_inchis)
    except:
        pass

In [75]:
brenda_df

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b,KEGG reaction ID,KEGG IDs left,KEGG IDs right,substrate_InChI_set,product_InChI_set
0,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.160,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3..."
1,2.7.7.2,fmn,Methanocaldococcus jannaschii,Q58579,C00061,[0.064],1,0.064,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3..."
2,2.7.7.2,fmn,Corynebacterium ammoniagenes,Q59263,C00061,[100.0],1,100.000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3..."
3,2.7.7.2,atp,Corynebacterium ammoniagenes,Q59263,C00002,[15.0],1,15.000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3..."
4,1.3.1.105,nadph,Fragaria vesca,O23939,C00005,[150.0],1,150.000,Sequence_964,MAAAPSESIPSVNKAWVXSEYGKTSDVLKFDPSVAVPEIKEDQVLI...,"[0.01972285658121109, 0.385234534740448, -0.03...",[R10593_b],"[C20718, C00005, C00080]","[C20717, C00006]",{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,{InChI=1S/C21H28N7O17P3/c22-17-12-19(25-7-24-1...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3286,1.1.2.10,methanol,Methylorubrum extorquens,C5AXV8,C00132,[2.0],1,2.000,Sequence_576,MRMRNHFLTAVGLAALSVGLALPAIAEVTEQDILNDAKTTDDVVTY...,"[-0.0761818215250969, 0.2266412079334259, -0.1...",[R01146_f],"[C00132, C18233]","[C00067, C18234, C00080]",,
3287,1.1.2.10,formaldehyde,Methylorubrum extorquens,C5AXV8,C00067,[259.0],1,259.000,Sequence_576,MRMRNHFLTAVGLAALSVGLALPAIAEVTEQDILNDAKTTDDVVTY...,"[-0.0761818215250969, 0.2266412079334259, -0.1...",[R01146_b],"[C00067, C18234, C00080]","[C00132, C18233]",,
3288,2.7.1.91,atp,Homo sapiens,Q9NYA1,C00002,[410.0],1,410.000,Sequence_2309,MDPAGGPRGVLPRPCRVLVLLNPRGGKGKALQLFRSHVQPLLAEAE...,"[-0.07513425499200821, 0.2476138025522232, 0.1...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]",{InChI=1S/C18H37NO2/c1-2-3-4-5-6-7-8-9-10-11-1...,{InChI=1S/C18H38NO5P/c1-2-3-4-5-6-7-8-9-10-11-...
3289,2.7.1.91,atp,Homo sapiens,Q9NRA0,C00002,[6.3],1,6.300,Sequence_2304,MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...,"[-0.09094876050949097, 0.2581353187561035, 0.0...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]",{InChI=1S/C18H37NO2/c1-2-3-4-5-6-7-8-9-10-11-1...,{InChI=1S/C18H38NO5P/c1-2-3-4-5-6-7-8-9-10-11-...


In [76]:
from rdkit import Chem

# map SMILES from InChIs
def inchi_to_smiles(inchi):
    try:
        mol = Chem.MolFromInchi(inchi)
        return Chem.MolToSmiles(mol)
    except:
        return None
    
df_metabolites["SMILES"] = df_metabolites["InChI"].apply(inchi_to_smiles)

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:11] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] Cannot assign bond directions!
[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

[19:40:12] ERROR: 

In [78]:
df_metabolites

Unnamed: 0,metabolite ID,InChI,SMILES
0,C00133,"InChI=1S/C3H7NO2/c1-2(4)3(5)6/h2H,4H2,1H3,(H,5...",C[C@@H](N)C(=O)O
1,C00328,InChI=1S/C10H12N2O3/c11-7-4-2-1-3-6(7)9(13)5-8...,Nc1ccccc1C(=O)C[C@H](N)C(=O)O
2,C20957,InChI=1S/C8H14N2O5/c1-4(9)7(13)10-5(8(14)15)2-...,C[C@H](N)C(O)=N[C@H](CCC(=O)O)C(=O)O
3,C00673,"InChI=1S/C5H11O7P/c6-3-1-5(7)12-4(3)2-11-13(8,...",O=P(O)(O)OC[C@H]1OC(O)C[C@@H]1O
4,C06010,"InChI=1S/C5H8O4/c1-3(6)5(2,9)4(7)8/h9H,1-2H3,(...",CC(=O)[C@](C)(O)C(=O)O
...,...,...,...
1638,C05178,InChI=1S/C19H23NO4/c1-20-7-6-13-10-19(24-3)17(...,COc1ccc(C[C@@H]2c3cc(O)c(OC)cc3CCN2C)cc1O
1639,C00532,"InChI=1S/C5H12O5/c6-1-3(8)5(10)4(9)2-7/h3-10H,...",OC[C@H](O)C(O)[C@@H](O)CO
1640,C12640,InChI=1S/C36H36O18/c37-13-25-28(43)30(45)32(47...,O=C1C=CC(=C2Oc3cc(O)cc(O[C@@H]4O[C@H](CO)[C@@H...
1641,C18042,InChI=1S/C21H36O2/c1-13(22)17-6-7-18-16-5-4-14...,C[C@H](O)[C@H]1CC[C@H]2[C@@H]3CC[C@H]4C[C@H](O...


In [79]:
# create columns where to store substrates and products SMILES 
brenda_df["substrate_SMILES"] = ""
brenda_df["product_SMILES"] = ""

# get SMILES from InChIs
def inchi_to_smiles(inchi):
    try:
        mol = Chem.MolFromInchi(inchi)
        return Chem.MolToSmiles(mol)
    except:
        return None
    
# assign SMILES to metabolites
for ind in brenda_df.index:
    sub_IDs, pro_IDs = brenda_df["KEGG IDs left"][ind], brenda_df["KEGG IDs right"][ind]
    
    try:
        sub_smiles = []
        pro_smiles = []
        for sub in sub_IDs:
            try:
                inchi = list(df_metabolites["InChI"].loc[df_metabolites["metabolite ID"] == sub])[0]
                sub_smiles.append(inchi_to_smiles(inchi))
            except:
                 sub_smiles.append(None)

        for pro in pro_IDs:
            try:
                inchi = list(df_metabolites["InChI"].loc[df_metabolites["metabolite ID"] == pro])[0]
                pro_smiles.append(inchi_to_smiles(inchi))
            except:
                pro_smiles.append(None)

        brenda_df["substrate_SMILES"][ind] = sub_smiles
        brenda_df["product_SMILES"][ind] = pro_smiles
    except:
        pass

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:33] ERROR: 

[19:40:34] ERROR: 

[19:40:34] ERROR: 

[19:40:34] ERROR: 

[19:40:34] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:35] ERROR: 

[19:40:36] ERROR: 

[19:40:36] ERROR: 

[19:40:36] ERROR: 

[19:40:36] ERROR: 

[19:40:36] ERROR: 

[19:40:36] ERROR: 

[19:40:36] ERROR: 

[19:40:36] ERROR: 

[19:40:36] ERROR: 

[19:40:36] ERROR: 



In [80]:
# remove incomplete entries
brenda_df = brenda_df[brenda_df["substrate_SMILES"].apply(lambda x: None in x) == False]
brenda_df = brenda_df[brenda_df["product_SMILES"].apply(lambda x: None in x) == False]
brenda_df

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b,KEGG reaction ID,KEGG IDs left,KEGG IDs right,substrate_InChI_set,product_InChI_set,substrate_SMILES,product_SMILES
0,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16000,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3...",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,"[O=P(O)(O)OP(=O)(O)O, Cc1cc2nc3c(O)nc(=O)nc-3n..."
1,2.7.7.2,fmn,Methanocaldococcus jannaschii,Q58579,C00061,[0.064],1,0.06400,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3...",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,"[O=P(O)(O)OP(=O)(O)O, Cc1cc2nc3c(O)nc(=O)nc-3n..."
2,2.7.7.2,fmn,Corynebacterium ammoniagenes,Q59263,C00061,[100.0],1,100.00000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3...",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,"[O=P(O)(O)OP(=O)(O)O, Cc1cc2nc3c(O)nc(=O)nc-3n..."
3,2.7.7.2,atp,Corynebacterium ammoniagenes,Q59263,C00002,[15.0],1,15.00000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3...",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,"[O=P(O)(O)OP(=O)(O)O, Cc1cc2nc3c(O)nc(=O)nc-3n..."
4,1.3.1.105,nadph,Fragaria vesca,O23939,C00005,[150.0],1,150.00000,Sequence_964,MAAAPSESIPSVNKAWVXSEYGKTSDVLKFDPSVAVPEIKEDQVLI...,"[0.01972285658121109, 0.385234534740448, -0.03...",[R10593_b],"[C20718, C00005, C00080]","[C20717, C00006]",{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,{InChI=1S/C21H28N7O17P3/c22-17-12-19(25-7-24-1...,"[C=C1OC(C)=C(O)C1=O, N=C(O)C1=CN([C@@H]2O[C@H]...","[CC1=C(O)C(=O)C(C)O1, N=C(O)c1ccc[n+]([C@@H]2O..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3284,1.4.1.4,2-oxoglutarate,Thermococcus waiotapuensis,Q977X9,C00026,[683.0],1,683.00000,Sequence_2147,MVELDPFEMAVQQLERAAQFMDISEEALEWLKRPMRIVEVSVPVEM...,"[0.07773970067501068, 0.17748579382896423, 0.0...",[R00248_b],"[C00026, C00014, C00005, C00080]","[C00025, C00006, C00001]","{InChI=1S/C5H6O5/c6-3(5(9)10)1-2-4(7)8/h1-2H2,...",{InChI=1S/C21H28N7O17P3/c22-17-12-19(25-7-24-1...,"[O=C(O)CCC(=O)C(=O)O, N, N=C(O)C1=CN([C@@H]2O[...","[N[C@@H](CCC(=O)O)C(=O)O, N=C(O)c1ccc[n+]([C@@..."
3285,1.4.1.4,nadp+,Escherichia coli,P00370,C00006,[0.00203],1,0.00203,Sequence_1056,MDQTYSLESFLNHVQKRDPNQTEFAQAVREVMTTLWPFLEQNPKYR...,"[0.09033121913671494, 0.14317286014556885, 0.0...","[R00248_f, R00146_f]","[C00025, C00006, C00001]","[C00026, C00014, C00005, C00080]",{InChI=1S/C21H28N7O17P3/c22-17-12-19(25-7-24-1...,"{InChI=1S/C5H6O5/c6-3(5(9)10)1-2-4(7)8/h1-2H2,...","[N[C@@H](CCC(=O)O)C(=O)O, N=C(O)c1ccc[n+]([C@@...","[O=C(O)CCC(=O)C(=O)O, N, N=C(O)C1=CN([C@@H]2O[..."
3288,2.7.1.91,atp,Homo sapiens,Q9NYA1,C00002,[410.0],1,410.00000,Sequence_2309,MDPAGGPRGVLPRPCRVLVLLNPRGGKGKALQLFRSHVQPLLAEAE...,"[-0.07513425499200821, 0.2476138025522232, 0.1...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]",{InChI=1S/C18H37NO2/c1-2-3-4-5-6-7-8-9-10-11-1...,{InChI=1S/C18H38NO5P/c1-2-3-4-5-6-7-8-9-10-11-...,"[CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO, Nc1nc...",[CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)COP(=O)(O...
3289,2.7.1.91,atp,Homo sapiens,Q9NRA0,C00002,[6.3],1,6.30000,Sequence_2304,MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...,"[-0.09094876050949097, 0.2581353187561035, 0.0...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]",{InChI=1S/C18H37NO2/c1-2-3-4-5-6-7-8-9-10-11-1...,{InChI=1S/C18H38NO5P/c1-2-3-4-5-6-7-8-9-10-11-...,"[CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO, Nc1nc...",[CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)COP(=O)(O...


In [81]:
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# create column to store reaction SMILES
brenda_df["Reaction_SMILES"] = ""

for ind in brenda_df.index:
    reaction_smile_left = ""
    reaction_smile_right = ""
    for smiles_sub in brenda_df["substrate_SMILES"][ind]:
        try:
            # concatenate substrate SMILES
            reaction_smile_left =  reaction_smile_left + "." + smiles_sub
        except:
            print(ind, smiles_sub)
    reaction_smile_left = reaction_smile_left[1:] 

    for smiles_prod in brenda_df["product_SMILES"][ind]:
        # concatenate product SMILES
        reaction_smile_right =  reaction_smile_right + "." + smiles_prod
    reaction_smile_right = reaction_smile_right[1:]

    # concatenate reaction SMILES
    brenda_df["Reaction_SMILES"][ind] = reaction_smile_left + ">>" + reaction_smile_right

In [82]:
# remove unnecessary information
brenda_df_new = brenda_df.drop(columns = ["substrate_InChI_set", "product_InChI_set", "product_SMILES"])
brenda_df_new

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b,KEGG reaction ID,KEGG IDs left,KEGG IDs right,substrate_SMILES,Reaction_SMILES
0,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16000,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...
1,2.7.7.2,fmn,Methanocaldococcus jannaschii,Q58579,C00061,[0.064],1,0.06400,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...
2,2.7.7.2,fmn,Corynebacterium ammoniagenes,Q59263,C00061,[100.0],1,100.00000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...
3,2.7.7.2,atp,Corynebacterium ammoniagenes,Q59263,C00002,[15.0],1,15.00000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...
4,1.3.1.105,nadph,Fragaria vesca,O23939,C00005,[150.0],1,150.00000,Sequence_964,MAAAPSESIPSVNKAWVXSEYGKTSDVLKFDPSVAVPEIKEDQVLI...,"[0.01972285658121109, 0.385234534740448, -0.03...",[R10593_b],"[C20718, C00005, C00080]","[C20717, C00006]","[C=C1OC(C)=C(O)C1=O, N=C(O)C1=CN([C@@H]2O[C@H]...",C=C1OC(C)=C(O)C1=O.N=C(O)C1=CN([C@@H]2O[C@H](C...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3284,1.4.1.4,2-oxoglutarate,Thermococcus waiotapuensis,Q977X9,C00026,[683.0],1,683.00000,Sequence_2147,MVELDPFEMAVQQLERAAQFMDISEEALEWLKRPMRIVEVSVPVEM...,"[0.07773970067501068, 0.17748579382896423, 0.0...",[R00248_b],"[C00026, C00014, C00005, C00080]","[C00025, C00006, C00001]","[O=C(O)CCC(=O)C(=O)O, N, N=C(O)C1=CN([C@@H]2O[...",O=C(O)CCC(=O)C(=O)O.N.N=C(O)C1=CN([C@@H]2O[C@H...
3285,1.4.1.4,nadp+,Escherichia coli,P00370,C00006,[0.00203],1,0.00203,Sequence_1056,MDQTYSLESFLNHVQKRDPNQTEFAQAVREVMTTLWPFLEQNPKYR...,"[0.09033121913671494, 0.14317286014556885, 0.0...","[R00248_f, R00146_f]","[C00025, C00006, C00001]","[C00026, C00014, C00005, C00080]","[N[C@@H](CCC(=O)O)C(=O)O, N=C(O)c1ccc[n+]([C@@...",N[C@@H](CCC(=O)O)C(=O)O.N=C(O)c1ccc[n+]([C@@H]...
3288,2.7.1.91,atp,Homo sapiens,Q9NYA1,C00002,[410.0],1,410.00000,Sequence_2309,MDPAGGPRGVLPRPCRVLVLLNPRGGKGKALQLFRSHVQPLLAEAE...,"[-0.07513425499200821, 0.2476138025522232, 0.1...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]","[CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO, Nc1nc...",CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...
3289,2.7.1.91,atp,Homo sapiens,Q9NRA0,C00002,[6.3],1,6.30000,Sequence_2304,MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...,"[-0.09094876050949097, 0.2581353187561035, 0.0...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]","[CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO, Nc1nc...",CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...


In [83]:
brenda_df_new.to_pickle(join("files", "df_Brenda_rxn.pkl"))

In [84]:
from transformers import BertModel, BertConfig
from rxnfp.transformer_fingerprints import (
    RXNBERTFingerprintGenerator, get_default_model_and_tokenizer, generate_fingerprints
)
from transformers import BertModel

from rxnfp.core import (
    FingerprintGenerator
)
from rxnfp.tokenization import (
    SmilesTokenizer
)

# load model
model_dir1 = "bert_scratch_512_upsto_15_48_48_48_48ep"
config_file1 = f"{model_dir1}/config.json"
model_file1 = f"{model_dir1}/pytorch_model.bin"

config1 = BertConfig.from_json_file(config_file1)
model1 = BertModel.from_pretrained(pretrained_model_name_or_path=model_dir1, config=config1)

force_no_cuda = False

tokenizer_vocab_path = f"{model_dir1}/vocab.txt"

device = torch.device("cuda" if (torch.cuda.is_available() and not force_no_cuda) else "cpu")

tokenizer = SmilesTokenizer(
        tokenizer_vocab_path
    )

rxnfp_generator1 =  RXNBERTFingerprintGenerator(model1, tokenizer)

brenda_df_new["rxnfp"]=""

for ind in brenda_df_new.index:
    reaction_smiles = brenda_df_new["Reaction_SMILES"][ind]
    try:
        brenda_df_new["rxnfp"][ind] = rxnfp_generator1.convert(reaction_smiles)

    except IndexError:
        pass

Some weights of BertModel were not initialized from the model checkpoint at bert_scratch_512_upsto_15_48_48_48_48ep and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
brenda_df_new.head()

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b,KEGG reaction ID,KEGG IDs left,KEGG IDs right,substrate_SMILES,Reaction_SMILES,rxnfp
0,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0...."
1,2.7.7.2,fmn,Methanocaldococcus jannaschii,Q58579,C00061,[0.064],1,0.064,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0...."
2,2.7.7.2,fmn,Corynebacterium ammoniagenes,Q59263,C00061,[100.0],1,100.0,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0...."
3,2.7.7.2,atp,Corynebacterium ammoniagenes,Q59263,C00002,[15.0],1,15.0,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0...."
4,1.3.1.105,nadph,Fragaria vesca,O23939,C00005,[150.0],1,150.0,Sequence_964,MAAAPSESIPSVNKAWVXSEYGKTSDVLKFDPSVAVPEIKEDQVLI...,"[0.01972285658121109, 0.385234534740448, -0.03...",[R10593_b],"[C20718, C00005, C00080]","[C20717, C00006]","[C=C1OC(C)=C(O)C1=O, N=C(O)C1=CN([C@@H]2O[C@H]...",C=C1OC(C)=C(O)C1=O.N=C(O)C1=CN([C@@H]2O[C@H](C...,"[0.6669322848320007, -0.3460233807563782, -0.3..."


### Concatenate the ESM1b to rxnfp 

In [87]:
def concatenate_tensors(row):
    esm1b_tensor = row["ESM1b"]
    rxnfp_tensor = row["rxnfp"]
    
    esm1b_tensor = np.array(esm1b_tensor)
    rxnfp_tensor = np.array(rxnfp_tensor)
    
    concatenated_tensor = np.concatenate([esm1b_tensor, rxnfp_tensor])
    return concatenated_tensor

brenda_df_new["esm1b_rxnfp"] = brenda_df_new.apply(concatenate_tensors, axis=1)
brenda_df_new


Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b,KEGG reaction ID,KEGG IDs left,KEGG IDs right,substrate_SMILES,Reaction_SMILES,rxnfp,esm1b_rxnfp
0,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16000,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[-0.044629089534282684, 0.17446771264076233, -..."
1,2.7.7.2,fmn,Methanocaldococcus jannaschii,Q58579,C00061,[0.064],1,0.06400,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[-0.044629089534282684, 0.17446771264076233, -..."
2,2.7.7.2,fmn,Corynebacterium ammoniagenes,Q59263,C00061,[100.0],1,100.00000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[0.010040179826319218, 0.22952252626419067, -0..."
3,2.7.7.2,atp,Corynebacterium ammoniagenes,Q59263,C00002,[15.0],1,15.00000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",[Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[0.010040179826319218, 0.22952252626419067, -0..."
4,1.3.1.105,nadph,Fragaria vesca,O23939,C00005,[150.0],1,150.00000,Sequence_964,MAAAPSESIPSVNKAWVXSEYGKTSDVLKFDPSVAVPEIKEDQVLI...,"[0.01972285658121109, 0.385234534740448, -0.03...",[R10593_b],"[C20718, C00005, C00080]","[C20717, C00006]","[C=C1OC(C)=C(O)C1=O, N=C(O)C1=CN([C@@H]2O[C@H]...",C=C1OC(C)=C(O)C1=O.N=C(O)C1=CN([C@@H]2O[C@H](C...,"[0.6669322848320007, -0.3460233807563782, -0.3...","[0.01972285658121109, 0.385234534740448, -0.03..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3284,1.4.1.4,2-oxoglutarate,Thermococcus waiotapuensis,Q977X9,C00026,[683.0],1,683.00000,Sequence_2147,MVELDPFEMAVQQLERAAQFMDISEEALEWLKRPMRIVEVSVPVEM...,"[0.07773970067501068, 0.17748579382896423, 0.0...",[R00248_b],"[C00026, C00014, C00005, C00080]","[C00025, C00006, C00001]","[O=C(O)CCC(=O)C(=O)O, N, N=C(O)C1=CN([C@@H]2O[...",O=C(O)CCC(=O)C(=O)O.N.N=C(O)C1=CN([C@@H]2O[C@H...,"[1.147512674331665, -0.573023796081543, -0.255...","[0.07773970067501068, 0.17748579382896423, 0.0..."
3285,1.4.1.4,nadp+,Escherichia coli,P00370,C00006,[0.00203],1,0.00203,Sequence_1056,MDQTYSLESFLNHVQKRDPNQTEFAQAVREVMTTLWPFLEQNPKYR...,"[0.09033121913671494, 0.14317286014556885, 0.0...","[R00248_f, R00146_f]","[C00025, C00006, C00001]","[C00026, C00014, C00005, C00080]","[N[C@@H](CCC(=O)O)C(=O)O, N=C(O)c1ccc[n+]([C@@...",N[C@@H](CCC(=O)O)C(=O)O.N=C(O)c1ccc[n+]([C@@H]...,"[1.3571501970291138, -0.4052989184856415, -0.2...","[0.09033121913671494, 0.14317286014556885, 0.0..."
3288,2.7.1.91,atp,Homo sapiens,Q9NYA1,C00002,[410.0],1,410.00000,Sequence_2309,MDPAGGPRGVLPRPCRVLVLLNPRGGKGKALQLFRSHVQPLLAEAE...,"[-0.07513425499200821, 0.2476138025522232, 0.1...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]","[CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO, Nc1nc...",CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...,"[1.531532645225525, -0.08210200816392899, 0.21...","[-0.07513425499200821, 0.2476138025522232, 0.1..."
3289,2.7.1.91,atp,Homo sapiens,Q9NRA0,C00002,[6.3],1,6.30000,Sequence_2304,MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...,"[-0.09094876050949097, 0.2581353187561035, 0.0...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]","[CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO, Nc1nc...",CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...,"[1.531532645225525, -0.08210200816392899, 0.21...","[-0.09094876050949097, 0.2581353187561035, 0.0..."


In [88]:
brenda_df_new.to_pickle(join("files", "brenda_df_esm1b_rxnfp.pkl"))


### Get substrate representations from ChemBERTa2

In [89]:
# create substrate SMILES
brenda_df_new["substrate_SMILES"] = brenda_df_new["substrate_SMILES"].apply(lambda smile: ".".join(smile) if isinstance(smile, list) else str(smile))
brenda_df_new

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b,KEGG reaction ID,KEGG IDs left,KEGG IDs right,substrate_SMILES,Reaction_SMILES,rxnfp,esm1b_rxnfp
0,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16000,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[-0.044629089534282684, 0.17446771264076233, -..."
1,2.7.7.2,fmn,Methanocaldococcus jannaschii,Q58579,C00061,[0.064],1,0.06400,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[-0.044629089534282684, 0.17446771264076233, -..."
2,2.7.7.2,fmn,Corynebacterium ammoniagenes,Q59263,C00061,[100.0],1,100.00000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[0.010040179826319218, 0.22952252626419067, -0..."
3,2.7.7.2,atp,Corynebacterium ammoniagenes,Q59263,C00002,[15.0],1,15.00000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[0.010040179826319218, 0.22952252626419067, -0..."
4,1.3.1.105,nadph,Fragaria vesca,O23939,C00005,[150.0],1,150.00000,Sequence_964,MAAAPSESIPSVNKAWVXSEYGKTSDVLKFDPSVAVPEIKEDQVLI...,"[0.01972285658121109, 0.385234534740448, -0.03...",[R10593_b],"[C20718, C00005, C00080]","[C20717, C00006]",C=C1OC(C)=C(O)C1=O.N=C(O)C1=CN([C@@H]2O[C@H](C...,C=C1OC(C)=C(O)C1=O.N=C(O)C1=CN([C@@H]2O[C@H](C...,"[0.6669322848320007, -0.3460233807563782, -0.3...","[0.01972285658121109, 0.385234534740448, -0.03..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3284,1.4.1.4,2-oxoglutarate,Thermococcus waiotapuensis,Q977X9,C00026,[683.0],1,683.00000,Sequence_2147,MVELDPFEMAVQQLERAAQFMDISEEALEWLKRPMRIVEVSVPVEM...,"[0.07773970067501068, 0.17748579382896423, 0.0...",[R00248_b],"[C00026, C00014, C00005, C00080]","[C00025, C00006, C00001]",O=C(O)CCC(=O)C(=O)O.N.N=C(O)C1=CN([C@@H]2O[C@H...,O=C(O)CCC(=O)C(=O)O.N.N=C(O)C1=CN([C@@H]2O[C@H...,"[1.147512674331665, -0.573023796081543, -0.255...","[0.07773970067501068, 0.17748579382896423, 0.0..."
3285,1.4.1.4,nadp+,Escherichia coli,P00370,C00006,[0.00203],1,0.00203,Sequence_1056,MDQTYSLESFLNHVQKRDPNQTEFAQAVREVMTTLWPFLEQNPKYR...,"[0.09033121913671494, 0.14317286014556885, 0.0...","[R00248_f, R00146_f]","[C00025, C00006, C00001]","[C00026, C00014, C00005, C00080]",N[C@@H](CCC(=O)O)C(=O)O.N=C(O)c1ccc[n+]([C@@H]...,N[C@@H](CCC(=O)O)C(=O)O.N=C(O)c1ccc[n+]([C@@H]...,"[1.3571501970291138, -0.4052989184856415, -0.2...","[0.09033121913671494, 0.14317286014556885, 0.0..."
3288,2.7.1.91,atp,Homo sapiens,Q9NYA1,C00002,[410.0],1,410.00000,Sequence_2309,MDPAGGPRGVLPRPCRVLVLLNPRGGKGKALQLFRSHVQPLLAEAE...,"[-0.07513425499200821, 0.2476138025522232, 0.1...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]",CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...,CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...,"[1.531532645225525, -0.08210200816392899, 0.21...","[-0.07513425499200821, 0.2476138025522232, 0.1..."
3289,2.7.1.91,atp,Homo sapiens,Q9NRA0,C00002,[6.3],1,6.30000,Sequence_2304,MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...,"[-0.09094876050949097, 0.2581353187561035, 0.0...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]",CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...,CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...,"[1.531532645225525, -0.08210200816392899, 0.21...","[-0.09094876050949097, 0.2581353187561035, 0.0..."


In [90]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-10M-MTR")
model = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-10M-MTR")

def ChemBERTa2(smile):

    inputs = tokenizer(smile, return_tensors="pt")

    with torch.no_grad():  
        outputs = model(**inputs)[0]

    outputs = outputs[0, 1:-1, :]
    embeddings = outputs.mean(axis=0).detach().numpy()
    
    return embeddings

brenda_df_new["ChemBERTa2"] = brenda_df_new["substrate_SMILES"].apply(ChemBERTa2)
brenda_df_new.head()

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at DeepChem/ChemBERTa-10M-MTR and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b,KEGG reaction ID,KEGG IDs left,KEGG IDs right,substrate_SMILES,Reaction_SMILES,rxnfp,esm1b_rxnfp,ChemBERTa2
0,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,0.16,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[-0.044629089534282684, 0.17446771264076233, -...","[0.10461673, 0.0, 0.4171699, 0.33389837, -0.22..."
1,2.7.7.2,fmn,Methanocaldococcus jannaschii,Q58579,C00061,[0.064],1,0.064,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[-0.044629089534282684, 0.17446771264076233, -...","[0.10461673, 0.0, 0.4171699, 0.33389837, -0.22..."
2,2.7.7.2,fmn,Corynebacterium ammoniagenes,Q59263,C00061,[100.0],1,100.0,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[0.010040179826319218, 0.22952252626419067, -0...","[0.10461673, 0.0, 0.4171699, 0.33389837, -0.22..."
3,2.7.7.2,atp,Corynebacterium ammoniagenes,Q59263,C00002,[15.0],1,15.0,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[0.010040179826319218, 0.22952252626419067, -0...","[0.10461673, 0.0, 0.4171699, 0.33389837, -0.22..."
4,1.3.1.105,nadph,Fragaria vesca,O23939,C00005,[150.0],1,150.0,Sequence_964,MAAAPSESIPSVNKAWVXSEYGKTSDVLKFDPSVAVPEIKEDQVLI...,"[0.01972285658121109, 0.385234534740448, -0.03...",[R10593_b],"[C20718, C00005, C00080]","[C20717, C00006]",C=C1OC(C)=C(O)C1=O.N=C(O)C1=CN([C@@H]2O[C@H](C...,C=C1OC(C)=C(O)C1=O.N=C(O)C1=CN([C@@H]2O[C@H](C...,"[0.6669322848320007, -0.3460233807563782, -0.3...","[0.01972285658121109, 0.385234534740448, -0.03...","[-0.32911405, 0.0, 0.7023653, 0.33816174, -0.5..."


Transform the Kcat_km values in log(10)Kcat_km

In [91]:
brenda_df_new["kcat_km"] = pd.to_numeric(brenda_df_new["kcat_km"])
brenda_df_new["kcat_km"] = np.log10(brenda_df_new["kcat_km"])
brenda_df_new

Unnamed: 0,EC,substrate,Organism,Uniprot,KEGG ID,kcat_km_values,kcat_km_count,kcat_km,Sequence ID,Seq,ESM1b,KEGG reaction ID,KEGG IDs left,KEGG IDs right,substrate_SMILES,Reaction_SMILES,rxnfp,esm1b_rxnfp,ChemBERTa2
0,2.7.7.2,atp,Methanocaldococcus jannaschii,Q58579,C00002,[0.16],1,-0.795880,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[-0.044629089534282684, 0.17446771264076233, -...","[0.10461673, 0.0, 0.4171699, 0.33389837, -0.22..."
1,2.7.7.2,fmn,Methanocaldococcus jannaschii,Q58579,C00061,[0.064],1,-1.193820,Sequence_1742,MKKRVVTAGTFDILHPGHYEILKFAKSLGDELIVIVARDETVKKLK...,"[-0.044629089534282684, 0.17446771264076233, -...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[-0.044629089534282684, 0.17446771264076233, -...","[0.10461673, 0.0, 0.4171699, 0.33389837, -0.22..."
2,2.7.7.2,fmn,Corynebacterium ammoniagenes,Q59263,C00061,[100.0],1,2.000000,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[0.010040179826319218, 0.22952252626419067, -0...","[0.10461673, 0.0, 0.4171699, 0.33389837, -0.22..."
3,2.7.7.2,atp,Corynebacterium ammoniagenes,Q59263,C00002,[15.0],1,1.176091,Sequence_1748,MDIWYGTAAVPKDLDNSAVTIGVFDGVHRGHQKLINATVEKAREVG...,"[0.010040179826319218, 0.22952252626419067, -0...",[R00161_f],"[C00002, C00061]","[C00013, C00016]",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,"[0.9621078372001648, -0.09999620914459229, -0....","[0.010040179826319218, 0.22952252626419067, -0...","[0.10461673, 0.0, 0.4171699, 0.33389837, -0.22..."
4,1.3.1.105,nadph,Fragaria vesca,O23939,C00005,[150.0],1,2.176091,Sequence_964,MAAAPSESIPSVNKAWVXSEYGKTSDVLKFDPSVAVPEIKEDQVLI...,"[0.01972285658121109, 0.385234534740448, -0.03...",[R10593_b],"[C20718, C00005, C00080]","[C20717, C00006]",C=C1OC(C)=C(O)C1=O.N=C(O)C1=CN([C@@H]2O[C@H](C...,C=C1OC(C)=C(O)C1=O.N=C(O)C1=CN([C@@H]2O[C@H](C...,"[0.6669322848320007, -0.3460233807563782, -0.3...","[0.01972285658121109, 0.385234534740448, -0.03...","[-0.32911405, 0.0, 0.7023653, 0.33816174, -0.5..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3284,1.4.1.4,2-oxoglutarate,Thermococcus waiotapuensis,Q977X9,C00026,[683.0],1,2.834421,Sequence_2147,MVELDPFEMAVQQLERAAQFMDISEEALEWLKRPMRIVEVSVPVEM...,"[0.07773970067501068, 0.17748579382896423, 0.0...",[R00248_b],"[C00026, C00014, C00005, C00080]","[C00025, C00006, C00001]",O=C(O)CCC(=O)C(=O)O.N.N=C(O)C1=CN([C@@H]2O[C@H...,O=C(O)CCC(=O)C(=O)O.N.N=C(O)C1=CN([C@@H]2O[C@H...,"[1.147512674331665, -0.573023796081543, -0.255...","[0.07773970067501068, 0.17748579382896423, 0.0...","[-0.30871013, 0.0, 0.63025105, 0.3126415, -0.0..."
3285,1.4.1.4,nadp+,Escherichia coli,P00370,C00006,[0.00203],1,-2.692504,Sequence_1056,MDQTYSLESFLNHVQKRDPNQTEFAQAVREVMTTLWPFLEQNPKYR...,"[0.09033121913671494, 0.14317286014556885, 0.0...","[R00248_f, R00146_f]","[C00025, C00006, C00001]","[C00026, C00014, C00005, C00080]",N[C@@H](CCC(=O)O)C(=O)O.N=C(O)c1ccc[n+]([C@@H]...,N[C@@H](CCC(=O)O)C(=O)O.N=C(O)c1ccc[n+]([C@@H]...,"[1.3571501970291138, -0.4052989184856415, -0.2...","[0.09033121913671494, 0.14317286014556885, 0.0...","[-0.18963942, 0.0, 0.7673503, 0.32965156, -0.3..."
3288,2.7.1.91,atp,Homo sapiens,Q9NYA1,C00002,[410.0],1,2.612784,Sequence_2309,MDPAGGPRGVLPRPCRVLVLLNPRGGKGKALQLFRSHVQPLLAEAE...,"[-0.07513425499200821, 0.2476138025522232, 0.1...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]",CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...,CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...,"[1.531532645225525, -0.08210200816392899, 0.21...","[-0.07513425499200821, 0.2476138025522232, 0.1...","[-0.04711772, 0.0, 0.23433654, 0.4554978, -0.0..."
3289,2.7.1.91,atp,Homo sapiens,Q9NRA0,C00002,[6.3],1,0.799341,Sequence_2304,MNGHLEAEEQQDQRPDQELTGSWGHGPRSTLVRAKAMAPPPPPLAA...,"[-0.09094876050949097, 0.2581353187561035, 0.0...","[R01926_f, R02976_f]","[C00319, C00002]","[C06124, C00008]",CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...,CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)CO.Nc1ncnc...,"[1.531532645225525, -0.08210200816392899, 0.21...","[-0.09094876050949097, 0.2581353187561035, 0.0...","[-0.04711772, 0.0, 0.23433654, 0.4554978, -0.0..."


In [92]:
brenda_df_new.to_pickle(join("files", "final_df_brenda.pkl"))

### Split the dataset into training and test sets

In [262]:
# shuffle the data frame 
df = brenda_df_new.copy()
df = df.sample(frac = 1, random_state = 44)
df.reset_index(drop= True, inplace = True)

In [263]:
def split_dataframe_enzyme(frac, df):
    df1 = pd.DataFrame(columns = list(df.columns))
    df2 = pd.DataFrame(columns = list(df.columns))
    
    df.reset_index(inplace = True, drop = True)
    
    train_indices = []
    test_indices = []
    ind = 0
    while len(train_indices) +len(test_indices) < len(df):
        if ind not in train_indices and ind not in test_indices:
            if ind % frac != 0:
                n_old = len(train_indices)
                train_indices.append(ind)
                train_indices = list(set(train_indices))

                while n_old != len(train_indices):
                    n_old = len(train_indices)

                    training_rxn= list(set(df["Reaction_SMILES"].loc[train_indices]))

                    train_indices = train_indices + (list(df.loc[df["Reaction_SMILES"].isin(training_rxn)].index))
                    train_indices = list(set(train_indices))
                
            else:
                n_old = len(test_indices)
                test_indices.append(ind)
                test_indices = list(set(test_indices)) 

                while n_old != len(test_indices):
                    n_old = len(test_indices)

                    testing_rxn= list(set(df["Reaction_SMILES"].loc[test_indices]))

                    test_indices = test_indices + (list(df.loc[df["Reaction_SMILES"].isin(testing_rxn)].index))
                    test_indices = list(set(test_indices))
                
        ind +=1
    
    
    df1 = df.loc[train_indices]
    df2 = df.loc[test_indices]
    
    return(df1, df2)

In [264]:
train_df, test_df = split_dataframe_enzyme(frac = 5, df = df.copy())
print("Test set size: %s" % len(test_df))
print("Training set size: %s" % len(train_df))
print("Size of test set in percent: %s" % np.round(100*len(test_df)/ (len(test_df) + len(train_df))))

train_df.reset_index(inplace = True, drop = True)
test_df.reset_index(inplace = True, drop = True)

train_df.to_pickle(join("partitions", "train_df.pkl"))
test_df.to_pickle(join("partitions", "test_df.pkl"))

Test set size: 561
Training set size: 2375
Size of test set in percent: 19.0


Splitting the training set into 5 folds for 5-fold cross-validations (CVs)

In [265]:
# 5-fold cross validation is performed such that the same enzyme or reaction does not appear in different folds
data_train2 = train_df.copy()
data_train2["index"] = list(data_train2.index)

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=5)
indices_fold1 = list(df_fold["index"])
print(len(data_train2), len(indices_fold1))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=4)
indices_fold2 = list(df_fold["index"])
print(len(data_train2), len(indices_fold2))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=3)
indices_fold3 = list(df_fold["index"])
print(len(data_train2), len(indices_fold3))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=2)
indices_fold4 = list(df_fold["index"])
indices_fold5 = list(data_train2["index"])
print(len(data_train2), len(indices_fold4))


fold_indices = [indices_fold1, indices_fold2, indices_fold3, indices_fold4, indices_fold5]

train_indices = [[] for _ in range(5)]
test_indices = [[] for _ in range(5)]

for i in range(5):
    for j in range(5):
        if i != j:
            train_indices[i] = train_indices[i] + fold_indices[j]
    test_indices[i] = fold_indices[i]

1882 493
1388 494
932 456
454 478


In [266]:
import pickle

train_file = join("partitions", "CV_train_indices.pkl")
test_file = join("partitions", "CV_test_indices.pkl")

with open(train_file, "wb") as f:
    pickle.dump(train_indices, f)

with open(test_file, "wb") as f:
    pickle.dump(test_indices, f)