### In this jupyter notebook, we create the input matrices and tensors for the GNN to create task-specific reaction fingerprints:

In [1]:
import pandas as pd
import os
from os.path import join
import matplotlib.pyplot as plt
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

from data_preprocessing import *

CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

C:\Users\alexk\projects\kcat_paper\code\preprocessing


## 1. Calculating input arrays for GNNs:

Loading all reactions in our dataset:

In [2]:
df_reactions = pd.read_pickle(join("..", "..", "data", "reaction_data", "all_reactions_with_IDs_and_FPs.pkl"))

df_reactions["reaction_structure"] = ""

for ind in df_reactions.index:
    
    df_reactions["substrate_InChI_set"][ind] = drop_small_metabolites(metabolites = df_reactions["substrate_InChI_set"][ind])
    df_reactions["product_InChI_set"][ind] = drop_small_metabolites(metabolites = df_reactions["product_InChI_set"][ind])
    df_reactions["#substrates"][ind] = len(df_reactions["substrate_InChI_set"][ind])
    df_reactions["#products"][ind] = len(df_reactions["product_InChI_set"][ind])
    
    df_reactions["reaction_structure"][ind] = str(df_reactions["#substrates"][ind]) + "_" + str(df_reactions["#products"][ind])
    
df_reactions.drop(columns = ["MW_frac", "structural_fp", "difference_fp", "#substrates", "#products"], inplace = True)

df_reactions.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the ca

Unnamed: 0,substrate_InChI_set,product_InChI_set,Reaction ID,reaction_structure
0,{InChI=1S/C8H8O3/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,{InChI=1S/C17H23N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,Reaction_0,2_2
1,{InChI=1S/C17H23N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H2O2/c1-2/h1-2H, InChI=1S/C17H21N4O9...",Reaction_1,2_2
2,{InChI=1S/C19H23N7O6/c20-19-25-15-14(17(30)26-...,{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,Reaction_2,2_3
3,{InChI=1S/C16H28N2O11/c1-5(21)17-9-13(25)14(8(...,{InChI=1S/C8H15NO6/c1-3(11)9-5-7(13)6(12)4(2-1...,Reaction_3,2_1
4,"{InChI=1S/C3H7O7P/c4-1-2(3(5)6)10-11(7,8)9/h2,...","{InChI=1S/C3H5O6P/c1-2(3(4)5)9-10(6,7)8/h1H2,(...",Reaction_4,1_2


Loading training and test data and merging it with reaction data

In [3]:
train_df = pd.read_pickle(join("..", "..", "data", "kcat_data", "splits", "train_df_kcat.pkl"))
test_df = pd.read_pickle(join("..", "..", "data", "kcat_data", "splits","test_df_kcat.pkl"))
train_df.rename(columns = {"geomean_kcat" :"log10_kcat"}, inplace = True)
test_df.rename(columns = {"geomean_kcat" :"log10_kcat"}, inplace = True)

train_df = train_df.merge(df_reactions, on = "Reaction ID", how = "left")
test_df = test_df.merge(df_reactions, on = "Reaction ID", how = "left")

train_df

Unnamed: 0,Reaction ID,Sequence ID,kcat_values,Uniprot IDs,from_BRENDA,from_Sabio,from_Uniprot,checked,Sequence,substrates,...,structural_fp,difference_fp,ESM1b,log10_kcat,frac_of_max_UID,frac_of_max_RID,frac_of_max_EC,substrate_InChI_set,product_InChI_set,reaction_structure
0,Reaction_127,Sequence_1959,[1.7],[Q7Z4W1],[0],[1],[0],[False],MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,...,1100111100000001001000110110010001001111111100...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.026505822, 0.16142353, 0.12178893, -0.1417...",0.230449,0.447368,0.077273,0.014286,{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,"{InChI=1S/C6H12O2/c1-3-4-6(8)5(2)7/h6,8H,3-4H2...",2_2
1,Reaction_796,Sequence_2315,[21.9],[Q8U4F6],[1],[0],[0],[True],MNYRYPPRYGPEWGSGGIYGLRFHNGTLYFTLAFEGEAHFITEDSH...,{InChI=1S/C12H15NO8/c14-5-8-9(15)10(16)11(17)1...,...,1100000000000000000000000000000001000001001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.035813812, 0.1608091, 0.010744683, 0.140818...",1.340444,1.000000,0.722772,1.000000,{InChI=1S/C12H15NO8/c14-5-8-9(15)10(16)11(17)1...,{InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-...,2_2
2,Reaction_565,Sequence_473,[2.85],[Q92871],[1],[0],[0],[True],MAVTAQAARRKERVLCLFDVDGTLTPARQKIDPEVAAFLQKLRSRV...,{InChI=1S/C6H13O9P/c7-1-2-3(8)4(9)5(10)6(14-2)...,...,1100000100000000000000000000000001000000111000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.059231035, 0.20886274, -0.04375118, -0.059...",0.454845,1.000000,1.000000,0.021127,{InChI=1S/C6H13O9P/c7-1-2-3(8)4(9)5(10)6(14-2)...,{InChI=1S/C6H13O9P/c7-1-3(8)5(10)6(11)4(9)2-15...,1_1
3,Reaction_781,Sequence_2711,[666.0],[Q9RF52],[1],[0],[0],[True],MTEAMKITLSTQPADARWGDKATYSINNDGITLHLNGKDDLGLIQR...,{InChI=1S/C10H18N2O5/c1-5(2)3-7(10(16)17)12-9(...,...,1100000000000000000000000000000001001001001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.014622692, 0.18103217, -0.005008551, 0.146...",2.823474,0.725490,1.000000,0.701053,{InChI=1S/C10H18N2O5/c1-5(2)3-7(10(16)17)12-9(...,"{InChI=1S/C6H13NO2/c1-4(2)3-5(7)6(8)9/h4-5H,3,...",2_2
4,Reaction_3860,Sequence_2247,[0.025166666666666667],[B2HMK0],[0],[0],[1],[False],MAYHNPFIVNGKIRFPENTNLVRHVEKWARVRGDKLAYRFLDFSTE...,{InChI=1S/C12H24O2/c1-2-3-4-5-6-7-8-9-10-11-12...,...,1100110100000000000000110110010001000001111100...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.011039141, 0.24913643, 0.12042855, 0.022845...",-1.599174,1.000000,0.134821,1.572917,{InChI=1S/C12H24O2/c1-2-3-4-5-6-7-8-9-10-11-12...,{InChI=1S/C22H36N5O8P/c1-2-3-4-5-6-7-8-9-10-11...,3_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3416,Reaction_133,Sequence_3165,[11.0],[Q91X52],[0],[1],[0],[False],MDLGLAGRRALVTGAGKGIGRSTVLALKAAGAQVVAVSRTREDLDD...,"{InChI=1S/C3H4O2/c1-3(5)2-4/h2H,1H3, InChI=1S/...",...,1100111100000001001000110110010001001101111100...,"[0.0, 0.0, 0.0, 0.0, 40.0, 0.0, 0.0, 0.0, 0.0,...","[-0.004179131, 0.12924549, 0.07075395, -0.1302...",1.041393,0.200000,1.000000,0.092437,"{InChI=1S/C3H4O2/c1-3(5)2-4/h2H,1H3, InChI=1S/...",{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,2_3
3417,Reaction_1915,Sequence_784,[343.0],[Q745T6],[1],[0],[0],[True],MAVPGKGLGKKLWSAAEMAALLGPGILLLAVRYARERDRWTPRDPL...,{InChI=1S/C12H22O11/c13-1-3-5(15)7(17)9(19)11(...,...,1100000100000000000000100010000001000000001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.059636135, 0.3283546, 0.23935841, 0.0461564...",2.535294,1.000000,1.000000,0.268388,{InChI=1S/C12H22O11/c13-1-3-5(15)7(17)9(19)11(...,{InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-...,2_1
3418,Reaction_4280,Sequence_602,[0.026],[F4JUY5],[0],[0],[1],[False],MESSRSLEHVLSMQGGEDDASYVKNCYGPAARLALSKPMLTTAINS...,{InChI=1S/C15H22N6O5S/c1-27(3-2-7(16)15(24)25)...,...,1101111100011000000001110110011101100011101100...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 10.0, -10.0, 0....","[0.06577969, 0.35223398, -0.14408422, -0.05531...",-1.585027,1.000000,1.000000,1.000000,{InChI=1S/C15H22N6O5S/c1-27(3-2-7(16)15(24)25)...,{InChI=1S/C14H20N6O5S/c15-6(14(23)24)1-2-26-3-...,2_2
3419,Reaction_1743,Sequence_1987,[0.11],[A0A088T0J9],[1],[0],[0],[True],MASKLALISTLIAAIKAQQIGTLTTETHPPLTWQTCTSGGSCTTNN...,{InChI=1S/C18H24ClNO13/c19-7-3-6(20(28)29)1-2-...,...,1100010100000000000000110110000001000001011000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.12327819, 0.3065659, -0.14999121, -0.147799...",-0.958607,1.000000,1.000000,1.000000,{InChI=1S/C18H24ClNO13/c19-7-3-6(20(28)29)1-2-...,{InChI=1S/C6H4ClNO3/c7-5-3-4(8(10)11)1-2-6(5)9...,2_2


In [4]:
try:
    os.mkdir(join("..", "..", "data", "metabolite_data", "ts_fp_data"))
    os.mkdir(join("..", "..", "data", "metabolite_data", "ts_fp_data", "train_GNN"))
    os.mkdir(join("..", "..", "data", "metabolite_data", "ts_fp_data", "test_GNN"))
except FileExistsError: 
    pass

In [5]:
keep_structures = list(set(train_df["reaction_structure"]))

train_df = train_df.loc[train_df["reaction_structure"].isin(keep_structures)]
test_df = test_df.loc[test_df["reaction_structure"].isin(keep_structures)]

train_df.reset_index(drop= True, inplace = True)
test_df.reset_index(drop= True, inplace = True)

In [6]:
for structure in set(train_df["reaction_structure"]):
    print(structure, len(train_df.loc[train_df["reaction_structure"] == structure]))

2_3 425
2_1 183
1_2 334
3_4 57
5_3 1
3_5 17
2_4 20
1_1 180
4_3 43
3_1 59
4_4 12
3_2 300
1_4 2
2_2 1424
1_3 11
3_3 351
4_5 2


In [7]:
train_df = train_df.loc[train_df["reaction_structure"].isin(keep_structures)]
test_df = test_df.loc[test_df["reaction_structure"].isin(keep_structures)]

train_df.reset_index(drop= True, inplace = True)
test_df.reset_index(drop= True, inplace = True)

mol_files = []

for ind in train_df.index:
    mol_files = mol_files + list(train_df["substrate_InChI_set"][ind]) + list(train_df["product_InChI_set"][ind])
for ind in test_df.index:
    mol_files = mol_files + list(test_df["substrate_InChI_set"][ind]) + list(test_df["product_InChI_set"][ind])
mol_files = list(set(mol_files))
len(mol_files)

3150

Create dictionary for all InchiCodes:

In [8]:
Inchi_dict = dict()
count =1
for mol in mol_files:
    if mol[0:5] == "InChI":
        if mol not in Inchi_dict.keys():
            Inchi_dict[mol] = "InChI_" + str(count)
            count +=1

Calculating input tensors and matrices:

In [10]:
calculate_atom_and_bond_feature_vectors(mol_files = mol_files)

In [12]:
import pickle

droplist = []
for ind in train_df.index:
    sub_IDs = train_df["substrate_InChI_set"][ind]
    pro_IDs = train_df["product_InChI_set"][ind]
    XE_sub, X_sub, A_sub= [], [], []
    XE_pro, X_pro, A_pro= [], [], []
    
    complete = True

    for i, ID in enumerate(sub_IDs):
        if ID[:5] == "InChI":
            ID = Inchi_dict[ID]
        [XE, X, A] = create_input_data_for_GNN_for_substrates(substrate_ID = ID, print_error=True)
        if A is None:
            complete = False
        XE_sub = XE_sub + [XE]
        X_sub = X_sub + [X]
        A_sub = A_sub + [A]

    for i, ID in enumerate(pro_IDs):
        if ID[:5] == "InChI":
            ID = Inchi_dict[ID]
        [XE, X, A] = create_input_data_for_GNN_for_substrates(substrate_ID = ID, print_error=True)
        if A is None:
            complete = False
        XE_pro = XE_pro + [XE]
        X_pro = X_pro + [X]
        A_pro = A_pro + [A]
    
    if complete:
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "train_GNN", str(ind)+"_XE_sub.npy"), np.array(XE_sub))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "train_GNN", str(ind)+"_XE_pro.npy"), np.array(XE_pro))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "train_GNN", str(ind)+"_X_sub.npy"), np.array(X_sub))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "train_GNN", str(ind)+"_X_pro.npy"), np.array(X_pro))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "train_GNN", str(ind)+"_A_sub.npy"), np.array(A_sub))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "train_GNN", str(ind)+"_A_pro.npy"), np.array(A_pro))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "train_GNN", str(ind)+"_str.npy"), train_df["reaction_structure"][ind])
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "train_GNN", str(ind)+"_y.npy"), train_df["log10_kcat"][ind])
    else:
        droplist.append(ind)
        
train_df.drop(droplist, inplace = True)

More than 114 (116) atoms in molcuele InChI_362
Could not create input for substrate ID InChI_362


In [16]:
len(droplist)

670

In [13]:
droplist = []
for ind in test_df.index:
    sub_IDs = test_df["substrate_InChI_set"][ind]
    pro_IDs = test_df["product_InChI_set"][ind]
    XE_sub, X_sub, A_sub= [], [], []
    XE_pro, X_pro, A_pro= [], [], []
    
    complete = True

    for i, ID in enumerate(sub_IDs):
        if ID[:5] == "InChI":
            ID = Inchi_dict[ID]
        [XE, X, A] = create_input_data_for_GNN_for_substrates(substrate_ID = ID, print_error=True)
        if A is None:
            complete = False
        XE_sub = XE_sub + [XE]
        X_sub = X_sub + [X]
        A_sub = A_sub + [A]

    for i, ID in enumerate(pro_IDs):
        if ID[:5] == "InChI":
            ID = Inchi_dict[ID]
        [XE, X, A] = create_input_data_for_GNN_for_substrates(substrate_ID = ID, print_error=True)
        if A is None:
            complete = False
        XE_pro = XE_pro + [XE]
        X_pro = X_pro + [X]
        A_pro = A_pro + [A]
    
    if complete:
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "test_GNN", str(ind)+"_XE_sub.npy"), np.array(XE_sub))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "test_GNN", str(ind)+"_XE_pro.npy"), np.array(XE_pro))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "test_GNN", str(ind)+"_X_sub.npy"), np.array(X_sub))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "test_GNN", str(ind)+"_X_pro.npy"), np.array(X_pro))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "test_GNN", str(ind)+"_A_sub.npy"), np.array(A_sub))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "test_GNN", str(ind)+"_A_pro.npy"), np.array(A_pro))
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "test_GNN", str(ind)+"_str.npy"), test_df["reaction_structure"][ind])
        np.save(join("..", "..", "data", "metabolite_data", "ts_fp_data", "test_GNN", str(ind)+"_y.npy"), test_df["log10_kcat"][ind])
    else:
        droplist.append(ind)
test_df.drop(droplist, inplace = True)

In [14]:
len(droplist)

1

Saving the DataFrames for training of GNNs:

In [16]:
len(train_df), len(test_df)

(3418, 849)

In [18]:
train_df = train_df.reset_index()
test_df = test_df.reset_index()

In [38]:
join(CURRENT_DIR, "..", "data", "merged_datasets")

'C:\\Users\\alexk\\projects\\kcat_prediction_V5\\notebooks_and_code\\..\\data\\merged_datasets'

In [19]:
train_df.to_pickle(join("..", "..", "data", "kcat_data", "splits", "train_df_ts_fp.pkl"))
test_df.to_pickle(join("..", "..", "data", "kcat_data", "splits", "test_df_ts_fp.pkl"))

Splitting the training set into 5-folds for a 5-fold CV of the GNN:

In [21]:
train_df = pd.read_pickle(join("..", "..", "data", "kcat_data", "splits", "train_df_ts_fp.pkl"))
#train_df.reset_index(inplace = True, drop = True)

In [22]:
data_train2 = train_df.copy()
keep_structures = ["2_2","1_2", "3_4", "3_1","2_3","4_3","3_2","2_1","3_3","1_1"]
data_train2 = data_train2.loc[data_train2["reaction_structure"].isin(keep_structures)]
#data_train2.reset_index(inplace = True, drop = True)


data_train2["index"] = list(data_train2.index)

data_train2, df_fold = split_dataframe(df = data_train2, frac=5)
indices_fold1 = list(df_fold["index"])
print(len(data_train2), len(indices_fold1))#

data_train2, df_fold = split_dataframe(df = data_train2, frac=4)
indices_fold2 = list(df_fold["index"])
print(len(data_train2), len(indices_fold2))

data_train2, df_fold = split_dataframe(df = data_train2, frac=3)
indices_fold3 = list(df_fold["index"])
print(len(data_train2), len(indices_fold3))

data_train2, df_fold = split_dataframe(df = data_train2, frac=2)
indices_fold4 = list(df_fold["index"])
indices_fold5 = list(data_train2["index"])
print(len(data_train2), len(indices_fold4))


fold_indices = [indices_fold1, indices_fold2, indices_fold3, indices_fold4, indices_fold5]

train_indices = [[], [], [], [], []]
test_indices = [[], [], [], [], []]

for i in range(5):
    for j in range(5):
        if i != j:
            train_indices[i] = train_indices[i] + fold_indices[j]
            
    test_indices[i] = fold_indices[i]

2527 826
1825 702
1152 673
534 618


In [23]:
np.save(join("..", "..", "data", "kcat_data", "splits", "CV_train_indices_ts_fp.npy"), train_indices)
np.save(join("..", "..", "data", "kcat_data", "splits",  "CV_test_indices_ts_fp.npy"), test_indices)

  arr = np.asanyarray(arr)


In [24]:
train_df

Unnamed: 0,index,Reaction ID,Sequence ID,kcat_values,Uniprot IDs,from_BRENDA,from_Sabio,from_Uniprot,checked,Sequence,...,structural_fp,difference_fp,ESM1b,log10_kcat,frac_of_max_UID,frac_of_max_RID,frac_of_max_EC,substrate_InChI_set,product_InChI_set,reaction_structure
0,0,Reaction_127,Sequence_1959,[1.7],[Q7Z4W1],[0],[1],[0],[False],MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,...,1100111100000001001000110110010001001111111100...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.026505822, 0.16142353, 0.12178893, -0.1417...",0.230449,0.447368,0.077273,0.014286,{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,"{InChI=1S/C6H12O2/c1-3-4-6(8)5(2)7/h6,8H,3-4H2...",2_2
1,1,Reaction_796,Sequence_2315,[21.9],[Q8U4F6],[1],[0],[0],[True],MNYRYPPRYGPEWGSGGIYGLRFHNGTLYFTLAFEGEAHFITEDSH...,...,1100000000000000000000000000000001000001001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.035813812, 0.1608091, 0.010744683, 0.140818...",1.340444,1.000000,0.722772,1.000000,{InChI=1S/C12H15NO8/c14-5-8-9(15)10(16)11(17)1...,{InChI=1S/C6H5NO3/c8-6-3-1-5(2-4-6)7(9)10/h1-4...,2_2
2,2,Reaction_565,Sequence_473,[2.85],[Q92871],[1],[0],[0],[True],MAVTAQAARRKERVLCLFDVDGTLTPARQKIDPEVAAFLQKLRSRV...,...,1100000100000000000000000000000001000000111000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.059231035, 0.20886274, -0.04375118, -0.059...",0.454845,1.000000,1.000000,0.021127,{InChI=1S/C6H13O9P/c7-1-2-3(8)4(9)5(10)6(14-2)...,{InChI=1S/C6H13O9P/c7-1-3(8)5(10)6(11)4(9)2-15...,1_1
3,3,Reaction_781,Sequence_2711,[666.0],[Q9RF52],[1],[0],[0],[True],MTEAMKITLSTQPADARWGDKATYSINNDGITLHLNGKDDLGLIQR...,...,1100000000000000000000000000000001001001001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.014622692, 0.18103217, -0.005008551, 0.146...",2.823474,0.725490,1.000000,0.701053,{InChI=1S/C10H18N2O5/c1-5(2)3-7(10(16)17)12-9(...,"{InChI=1S/C6H13NO2/c1-4(2)3-5(7)6(8)9/h4-5H,3,...",2_2
4,4,Reaction_3860,Sequence_2247,[0.025166666666666667],[B2HMK0],[0],[0],[1],[False],MAYHNPFIVNGKIRFPENTNLVRHVEKWARVRGDKLAYRFLDFSTE...,...,1100110100000000000000110110010001000001111100...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.011039141, 0.24913643, 0.12042855, 0.022845...",-1.599174,1.000000,0.134821,1.572917,"{InChI=1S/p+1, InChI=1S/C10H16N5O13P3/c11-8-5-...",{InChI=1S/C22H36N5O8P/c1-2-3-4-5-6-7-8-9-10-11...,3_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3413,3416,Reaction_133,Sequence_3165,[11.0],[Q91X52],[0],[1],[0],[False],MDLGLAGRRALVTGAGKGIGRSTVLALKAAGAQVVAVSRTREDLDD...,...,1100111100000001001000110110010001001101111100...,"[0.0, 0.0, 0.0, 0.0, 40.0, 0.0, 0.0, 0.0, 0.0,...","[-0.004179131, 0.12924549, 0.07075395, -0.1302...",1.041393,0.200000,1.000000,0.092437,"{InChI=1S/C3H4O2/c1-3(5)2-4/h2H,1H3, InChI=1S/...",{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,2_3
3414,3417,Reaction_1915,Sequence_784,[343.0],[Q745T6],[1],[0],[0],[True],MAVPGKGLGKKLWSAAEMAALLGPGILLLAVRYARERDRWTPRDPL...,...,1100000100000000000000100010000001000000001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.059636135, 0.3283546, 0.23935841, 0.0461564...",2.535294,1.000000,1.000000,0.268388,{InChI=1S/C12H22O11/c13-1-3-5(15)7(17)9(19)11(...,{InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-...,2_1
3415,3418,Reaction_4280,Sequence_602,[0.026],[F4JUY5],[0],[0],[1],[False],MESSRSLEHVLSMQGGEDDASYVKNCYGPAARLALSKPMLTTAINS...,...,1101111100011000000001110110011101100011101100...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 10.0, -10.0, 0....","[0.06577969, 0.35223398, -0.14408422, -0.05531...",-1.585027,1.000000,1.000000,1.000000,{InChI=1S/C15H22N6O5S/c1-27(3-2-7(16)15(24)25)...,{InChI=1S/C14H20N6O5S/c15-6(14(23)24)1-2-26-3-...,2_2
3416,3419,Reaction_1743,Sequence_1987,[0.11],[A0A088T0J9],[1],[0],[0],[True],MASKLALISTLIAAIKAQQIGTLTTETHPPLTWQTCTSGGSCTTNN...,...,1100010100000000000000110110000001000001011000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.12327819, 0.3065659, -0.14999121, -0.147799...",-0.958607,1.000000,1.000000,1.000000,{InChI=1S/C18H24ClNO13/c19-7-3-6(20(28)29)1-2-...,{InChI=1S/C6H4ClNO3/c7-5-3-4(8(10)11)1-2-6(5)9...,2_2


In [31]:
bs = 16
import random

In [32]:
for i in range(5):
    df = pd.read_pickle(join("..", "..", "data", "kcat_data", "splits", "train_df_ts_fp.pkl"))
    keep_structures = ["2_2","1_2", "3_4", "3_1","2_3","4_3","3_2","2_1","3_3","1_1"]
    df = df.loc[df["reaction_structure"].isin(keep_structures)]

    train_index, test_index  = train_indices[i], test_indices[i]
    train_df = df.loc[train_index]
    test_df = df.loc[test_index]
    


    train_batches = []

    batch_size = bs
    for structure in keep_structures:
        structure_indices = list(train_df.loc[train_df["reaction_structure"] == structure].index)

        random.shuffle(structure_indices)
        n = len(structure_indices)

        n_batches = int(n/batch_size)

        for i in range(n_batches):
            batch = structure_indices[i*batch_size : (i+1)*batch_size]
            train_batches = train_batches + [batch]
            
        random.shuffle(train_batches)

In [33]:
train_batches

[[1935,
  2884,
  2734,
  2763,
  2686,
  1063,
  497,
  2558,
  1471,
  3322,
  2175,
  2758,
  1621,
  1711,
  1776,
  938],
 [2215,
  1518,
  128,
  879,
  1213,
  1569,
  1804,
  1789,
  1996,
  2263,
  2121,
  2624,
  1068,
  2319,
  273,
  603],
 [1319,
  1454,
  649,
  2200,
  2149,
  3200,
  1844,
  2087,
  2891,
  2897,
  2168,
  947,
  2048,
  563,
  1272,
  2855],
 [559,
  980,
  195,
  2608,
  2526,
  2551,
  1261,
  3058,
  1586,
  2946,
  2194,
  1986,
  460,
  1115,
  296,
  3006],
 [216,
  323,
  715,
  1886,
  1912,
  1768,
  1737,
  2948,
  3342,
  859,
  1497,
  1441,
  1875,
  783,
  615,
  176],
 [1314,
  2704,
  660,
  1850,
  3114,
  2421,
  2000,
  210,
  1901,
  668,
  498,
  375,
  1705,
  767,
  233,
  1780],
 [2724,
  3385,
  1189,
  2692,
  1797,
  1073,
  2479,
  2525,
  3209,
  1269,
  1743,
  1398,
  286,
  1067,
  1929,
  841],
 [3128,
  2336,
  3148,
  2910,
  1163,
  2352,
  2962,
  2233,
  2899,
  1701,
  2627,
  127,
  1034,
  118,
  2786,
  3403],


In [37]:
batch = train_batches[1]

In [38]:
train_df.loc[batch]

Unnamed: 0,index,Reaction ID,Sequence ID,kcat_values,Uniprot IDs,from_BRENDA,from_Sabio,from_Uniprot,checked,Sequence,...,structural_fp,difference_fp,ESM1b,log10_kcat,frac_of_max_UID,frac_of_max_RID,frac_of_max_EC,substrate_InChI_set,product_InChI_set,reaction_structure
2215,2216,Reaction_3847,Sequence_1063,[29.7],[Q8S948],[0],[0],[1],[False],MMTSCRNIDLGTMMMACGCGRRQFPSLAKTVCKFTSSNRSYGGLVG...,...,1100000100000000000000000000000001000000111000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.01158332, 0.16099967, 0.060728062, 0.040407...",1.472756,1.0,1.0,1.0,{InChI=1S/C15H28O7P2/c1-13(2)7-5-8-14(3)9-6-10...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3...",2_2
1518,1519,Reaction_1398,Sequence_2001,[0.85],[Q3S2I3],[1],[0],[0],[True],MKKATVAAVVVGTAAAVAVAALIMRHRMGKSSKWARARAILKEFEE...,...,1100110100000000000000110110010001001001111000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.074195676, 0.29617867, 0.102727994, -0.1146...",-0.070581,0.169661,1.0,1.0,{InChI=1S/C10H16N5O14P3/c11-10-13-7-4(8(18)14-...,"{InChI=1S/C6H13O9P/c7-3-2(1-14-16(11,12)13)15-...",2_2
128,128,Reaction_2653,Sequence_678,"[65, 16]","[Q73PV9, Q73PV9]","[1, 1]","[0, 0]","[0, 0]","[False, False]",MRPLVIILMGSSSDMGHAEKIASELKTFGIEYAIRIGSAHKTAEHV...,...,1100100100000000000000000010000001000001111000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.10355619, 0.12570773, -0.014607985, 0.05307...",1.508517,0.844156,1.0,0.844156,"{InChI=1S/p+1, InChI=1S/C9H14N3O9P/c10-7-4(9(1...","{InChI=1S/CO2/c2-1-3, InChI=1S/C8H14N3O7P/c9-5...",2_2
879,879,Reaction_796,Sequence_2818,[17],[P14410],[1],[0],[0],[False],MARKKFSGLEISLIVLFVIVTIIAIALIVVLATKTPAVDEISDSTS...,...,1100000000000000000000000000000001000001001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.11052354, 0.24920371, 0.032532427, 0.15047...",1.230449,0.124088,0.561056,0.010625,{InChI=1S/C12H15NO8/c14-5-8-9(15)10(16)11(17)1...,{InChI=1S/C6H5NO3/c8-6-3-1-5(2-4-6)7(9)10/h1-4...,2_2
1213,1214,Reaction_1001,Sequence_3557,"[4.0, 9.0, 6.0]","[Q8GLK7, Q8GLK7, Q8GLK7]","[1, 1, 1]","[0, 0, 0]","[0, 0, 0]","[True, True, True]",MEKFLIIAGPCAIESESLVLRVAEKIRELQDKFRDVEFVFKSSFDK...,...,1100000000000000000000000000000001000000111000...,"[0.0, 0.0, 0.0, 0.0, -10.0, 0.0, 0.0, 0.0, 0.0...","[0.14187834, 0.1993661, 0.013229275, 0.1012368...",0.778151,1.0,1.0,0.09,{InChI=1S/C5H11O8P/c6-1-3(7)5(9)4(8)2-13-14(10...,"{InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4), InCh...",2_2
1569,1570,Reaction_1701,Sequence_64,[220.0],[A0A0H3W535],[1],[0],[0],[True],MVYTAQYLAMAAAMLPNAVLAQNNQTYANYSSQSQPDLYPQTLATL...,...,1100000000000000000000000000000001000001001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.034615003, 0.28327507, -0.09423155, 0.0092...",2.342423,1.0,1.0,0.099926,{InChI=1S/C11H13NO7/c13-8-5-18-11(10(15)9(8)14...,{InChI=1S/C6H5NO3/c8-6-3-1-5(2-4-6)7(9)10/h1-4...,2_2
1804,1805,Reaction_610,Sequence_1636,[2.86],[Q56YA5],[1],[0],[0],[True],MDYMYGPGRHHLFVPGPVNIPEPVIRAMNRNNEDYRSPAIPALTKT...,...,0000000000000000000000000000000000000001001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.10319713, 0.3816219, -0.019816792, -0.10457...",0.456366,0.446875,0.063556,0.021832,"{InChI=1S/C2H2O3/c3-1-2(4)5/h1H,(H,4,5), InChI...","{InChI=1S/C2H5NO2/c3-1-2(4)5/h1,3H2,(H,4,5), I...",2_2
1789,1790,Reaction_3656,Sequence_131,[0.58],[F2XBU9],[1],[0],[0],[False],MNKPQSWEARAETYSLYGFTDMPSLHQRGTVVVTHGEGPYIVDVNG...,...,0000000000000000000000000000000001000001001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.15986124, 0.17664924, -0.021997772, 0.13143...",-0.236572,0.322222,1.0,1.0,"{InChI=1S/C6H13NO2/c7-5-3-1-2-4-6(8)9/h1-5,7H2...","{InChI=1S/C6H10O3/c7-5-3-1-2-4-6(8)9/h5H,1-4H2...",2_2
1996,1997,Reaction_2654,Sequence_678,[77],[Q73PV9],[1],[0],[0],[False],MRPLVIILMGSSSDMGHAEKIASELKTFGIEYAIRIGSAHKTAEHV...,...,1100100100000000000000000010000001000001111000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.10355619, 0.12570773, -0.014607985, 0.05307...",1.886491,1.0,1.0,1.0,"{InChI=1S/CO2/c2-1-3, InChI=1S/C8H14N3O7P/c9-5...","{InChI=1S/p+1, InChI=1S/C9H14N3O9P/c10-7-4(9(1...",2_2
2263,2264,Reaction_1177,Sequence_15,"[0.1616667, 0.186667]","[A0A0H2V630, A0A0H2V630]","[1, 1]","[0, 0]","[0, 0]","[True, True]",MRILFVGPPLYGLLYPVLSLAQAFRVNGHEVLIASGGQFAQKAAEA...,...,1100110100000101001000110110000001001101111000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.044715572, 0.17049211, 0.050847173, 0.08717...",-0.760156,1.0,1.0,1.037039,{InChI=1S/C30H27N3O15/c34-19-7-1-4-13(22(19)37...,{InChI=1S/C36H37N3O20/c40-9-23-27(47)28(48)29(...,2_2
