In [1]:
import sys
import os
from os.path import join
import numpy as np
import pickle
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

sys.path.append('.\\additional_code')
from xgboost_training import *

CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

cuda:0
C:\Users\alexk\projects\SubFinder\notebooks_and_code


## 1. Loading and preprocessing data:

In [2]:
df_train = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_train_with_ESM1b_ts.pkl"))
df_test = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_test_with_ESM1b_ts.pkl"))

df_train = df_train.loc[df_train["evidence"] == "exp"]
df_test = df_test.loc[df_test["evidence"] == "exp"]

### (a) Balancing datasets such that we have same amount of negative as positive data samples:

In [3]:
pos_UIDs = df_train["Uniprot ID"].loc[df_train["Binding"] == 1]
for UID in pos_UIDs:
    n_pos = len(df_train.loc[df_train["Binding"] == 1].loc[df_train["Uniprot ID"] == UID])
    help_df = df_train.loc[df_train["Binding"] == 0].loc[df_train["Uniprot ID"] == UID]
    df_train.drop(list(help_df.index)[n_pos:], inplace= True)
    df_train.reset_index(inplace = True, drop = True)

In [4]:
pos_UIDs = df_test["Uniprot ID"].loc[df_test["Binding"] == 1]
for UID in pos_UIDs:
    n_pos = len(df_test.loc[df_test["Binding"] == 1].loc[df_test["Uniprot ID"] == UID])
    help_df = df_test.loc[df_test["Binding"] == 0].loc[df_test["Uniprot ID"] == UID]
    df_test.drop(list(help_df.index)[n_pos:], inplace= True)
    df_test.reset_index(inplace = True, drop = True)

### (b) Create dictionary with all target values

In [5]:
mol_files = list(set(df_train["molecule ID"])) + list(set(df_test["molecule ID"]))
mol_files = list(set(mol_files))

target_variable_dict = {}
target_variable_dict = create_target_dict(df = df_train, target_variable_dict = target_variable_dict)
target_variable_dict = create_target_dict(df = df_test, target_variable_dict = target_variable_dict)

### (c) Get list with input combinations of Uniprot ID and metabolite ID

In [23]:
train_IDs = get_uid_cid_IDs(df_train)
test_IDs = get_uid_cid_IDs(df_test)

print(np.mean(df_train["Binding"]), np.mean(df_test["Binding"]))
print(len(train_IDs), len(test_IDs))

0.5003414600833163 0.5005673758865248
29286 7050


## 2. Calculating input matrices for metabolites

### (a) Creating input matrices:

In [7]:
#calculate_atom_and_bond_feature_vectors(mol_files = mol_files)

In [8]:
#for mol_ID in mol_files:
#    calculate_and_save_input_matrixes(molecule_ID = mol_ID)

###  (b) Removing all datapoints without molecule input file:

In [9]:
df_train["molecule ID"] = [ID.replace(":", "_") for ID in df_train["substrate ID"]]
df_test["molecule ID"] = [ID.replace(":", "_") for ID in df_test["substrate ID"]]

In [34]:
valid_mols = os.listdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN_input_matrices"))
valid_mols = [mol.split("_A")[0] for mol in valid_mols]

df_train = df_train.loc[df_train["molecule ID"].isin(valid_mols)]
df_test = df_test.loc[df_test["molecule ID"].isin(valid_mols)]

train_IDs = get_uid_cid_IDs(df_train)
test_IDs = get_uid_cid_IDs(df_test)
df_train

Unnamed: 0,Uniprot ID,molecule ID,evidence,Binding,type,substrate ID,ECFP,ESM1b,ESM1b_ts,GNN rep
0,Q5B2F7,CHEBI_57344,exp,1,,CHEBI:57344,0100000001000000000000000000000001000000000000...,"[0.09207666, 0.18022089, 0.1191696, -0.0068351...","[-0.52362674, 0.5027057, -0.40282017, 0.742947...","[1577.9962, 10.317345, 29.326752, 233.01369, 4..."
1,Q9SAH9,CHEBI_58349,exp,1,,CHEBI:58349,0000000001000000100000100000000000000000000000...,"[0.022810845, 0.1272514, -0.051154055, -0.0810...","[0.61918294, 0.121414125, 0.40603346, 1.126637...","[2261.6094, 0.0, 0.0, 115.09651, 179.84134, 46..."
2,Q8IPJ6,CHEBI_57776,exp,1,,CHEBI:57776,0000000000000000000000000000010001000000000000...,"[0.09814875, 0.22172487, 0.11138555, 0.0365497...","[0.29864457, 0.22536643, 0.27347004, -0.128196...","[791.13226, 7.796671, 0.0, 0.0, 4.66982, 10.69..."
3,A0A1D5PCZ1,C00002,exp,1,,C00002,0000000001000000000000000000000000000000000000...,"[-0.21187752, 0.08564956, 0.055316914, -0.0550...","[-0.86605054, -0.38922024, -0.539311, 1.373580...","[1238.0188, 0.0, 0.0, 42.365837, 74.54658, 28...."
4,O22765,CHEBI_33384,exp,1,,CHEBI:33384,0100000000000000000000000000000000000000000000...,"[0.027133903, 0.33383188, -0.0057643764, -0.00...","[1.1005167, -1.0289398, -0.061415985, 0.988528...","[72.62339, 18.489643, 0.0, 50.355515, 13.49715..."
...,...,...,...,...,...,...,...,...,...,...
29355,O54937,CHEBI_16199,exp,0,,CHEBI:16199,0000000000000000000000000000000000000000000000...,"[0.050787933, 0.20482497, -0.0821579, -0.03619...","[-0.6821743, -0.25235456, -0.06566423, 0.84851...","[10.734227, 0.0, 4.6557817, 1.7201436, 0.0, 0...."
29356,P42980,CHEBI_43474,exp,0,,CHEBI:43474,0000000000000000000000000000000000000000000000...,"[0.03450865, 0.10044937, -0.081294104, 0.03105...","[0.7604322, -0.6746883, 0.038595006, 0.1019296...","[32.170155, 0.0, 0.0, 0.0, 0.7738515, 47.04823..."
29357,P31254,CHEBI_30616,exp,0,,CHEBI:30616,0000000001000000000000000000000000000000000100...,"[-0.10911206, 0.12464452, -0.006680568, 0.1137...","[0.5049492, 0.23488945, -0.7357721, 0.21344757...","[1288.9618, 0.0, 0.0, 76.52397, 73.09448, 37.6..."
29358,C0HLL2,CHEBI_30616,exp,0,,CHEBI:30616,0000000001000000000000000000000000000000000100...,"[0.087619156, 0.30014926, 0.051759467, 0.07981...","[1.0081663, -0.47126764, 0.106960185, -0.28055...","[1288.9618, 0.0, 0.0, 76.52397, 73.09448, 37.6..."


### (c) Creating representations for the enzymes:

In [11]:
uids_list = list(set(df_train["Uniprot ID"])) + list(set(df_test["Uniprot ID"]))
uids_list = list(set(uids_list))
uid_to_emb = {}
embeddings = np.zeros((0,1280))
for uid in uids_list:
    try:
        emb = np.reshape(np.array(list(df_train["ESM1b"].loc[df_train["Uniprot ID"] == uid])[0]), (1,1280))
    except IndexError:
        try:
            emb = np.reshape(np.array(list(df_test["ESM1b"].loc[df_test["Uniprot ID"] == uid])[0]), (1,1280))
        except IndexError:
            emb = np.reshape(np.array(list(df_validation["ESM1b"].loc[df_validation["Uniprot ID"] == uid])[0]), (1,1280))
    embeddings = np.concatenate([embeddings, emb])
    uid_to_emb[uid] = emb

We perform a PCA an the enzyme representations to get 50-dimensional representations

In [12]:
from sklearn.decomposition import PCA
dim = 50

pca = PCA(n_components = dim)
pca.fit(embeddings)
emb_pca = pca.transform(embeddings)

#Calculate mean and std to normalize the PCA-transformed vectors
mean = np.mean(emb_pca, axis = 0)
std = np.std(emb_pca, axis = 0)

uid_to_pca_emb = {}

for i, uid in enumerate(uids_list):
    uid_to_pca_emb[uid] = (emb_pca[i] - mean) / std

In [13]:
uid_to_emb = uid_to_pca_emb

## 3. Training GNN:

###  (a) Defining a DataGenerator:

In [14]:
class CustomDataSet(Dataset):
    def __init__(self, split_IDs, folder):
        self.all_IDs = split_IDs
        self.folder = folder

    def __len__(self):
        return len(self.all_IDs)

    def __getitem__(self, idx):
        ID = self.all_IDs[idx]
        try:
            [uid,cid1, cid2] = ID.split("_") 
            cid = cid1 +"_"+cid2
        except ValueError:
            [uid,cid] = ID.split("_")
            
        XE = torch.tensor(np.load(join(self.folder, cid + '_XE.npy')), dtype = torch.float32)
        X = torch.tensor(np.load(join(self.folder, cid + '_X.npy')), dtype = torch.float32)
        A = torch.tensor(np.load(join(self.folder, cid + '_A.npy')), dtype = torch.float32)
        ESM1b = torch.tensor(uid_to_emb[uid], dtype = torch.float32)
        label = torch.tensor(target_variable_dict[ID], dtype= torch.float32)
        return XE,X,A,ESM1b, label

### (b) Splitting the training set in a validation and a training set:

In [35]:
n = len(train_IDs) 
random.seed(1)
random.shuffle(train_IDs)
test_IDs = train_IDs[int(0.8*n):]
train_IDs = train_IDs[:int(0.8*n)]


In [37]:
batch_size = 64

train_dataset = CustomDataSet(folder  = join(CURRENT_DIR, ".." ,"data", "substrate_data",
                                             "GNN_input_matrices"), split_IDs = train_IDs)
train_loader = DataLoader(train_dataset , batch_size=batch_size, shuffle=True, drop_last=True)

test_dataset = CustomDataSet(folder  = join(CURRENT_DIR, ".." ,"data", "substrate_data",
                                            "GNN_input_matrices"), split_IDs = test_IDs)
test_loader = DataLoader(test_dataset , batch_size=batch_size, shuffle=False, drop_last=True)

In [38]:
n_train_batches = int(len(train_dataset)/batch_size)
n_test_batches = int(len(test_dataset)/batch_size)
train_batches = list(range(n_train_batches))
test_batches = list(range(n_test_batches))

### (c) Training GNN:

In [39]:
import torch.optim as optim

model = GNN(D= 100, N = 70, F1 = 32 , F2 = 10, F = F1+F2).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay= 0.00001)

for epoch in range(10):  # loop over the dataset multiple times
    model.train()
    running_loss = 0.0
    for i, [XE, X, A,ESM1b, labels] in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()
        XE, X, A, ESM1b, labels = XE.to(device), X.to(device), A.to(device),ESM1b.to(device), labels.to(device)
        # forward + backward + optimize
        outputs = model(XE, X, A, ESM1b)
        loss = criterion(outputs, labels.view((batch_size,-1)))
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 20 == 19:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 20))
            running_loss = 0.0
            
    #After each epoch, calculate the validation loss:
    running_acc = 0.0
    running_loss = 0.0
    model.eval()
    for i, [XE, X, A,ESM1b, labels] in enumerate(test_loader):
        XE, X, A, ESM1b, labels = XE.to(device), X.to(device), A.to(device),ESM1b.to(device), labels.to(device)
        
        with torch.no_grad():
            outputs = model(XE, X, A, ESM1b)
        loss = criterion(outputs, labels.view((batch_size,-1)))
        running_loss += loss.item()

        outputs2 = np.round(outputs.view(-1).cpu().detach().numpy()) 
        labels2 = labels.cpu().detach().numpy()
        acc = np.mean([outputs2[i] == labels2[i] for i in range(len(labels))])
        running_acc += acc

    print("Epoch: %s, Val. loss: %s, Val. acc: %s" % (epoch, np.round(running_loss/(i+1),2),
                                                                  np.round(running_acc/(i+1), 2)))

print('Finished Training')

[1,    20] loss: 0.699
[1,    40] loss: 0.693
[1,    60] loss: 0.692
[1,    80] loss: 0.683
[1,   100] loss: 0.691
[1,   120] loss: 0.693
[1,   140] loss: 0.689
[1,   160] loss: 0.681
[1,   180] loss: 0.691
[1,   200] loss: 0.686
[1,   220] loss: 0.682
[1,   240] loss: 0.683
[1,   260] loss: 0.675
[1,   280] loss: 0.680
[1,   300] loss: 0.684
[1,   320] loss: 0.676
[1,   340] loss: 0.686
[1,   360] loss: 0.677
Epoch: 0, Val. loss: 0.67, Val. acc: 0.57
[2,    20] loss: 0.669
[2,    40] loss: 0.669
[2,    60] loss: 0.665
[2,    80] loss: 0.667
[2,   100] loss: 0.660
[2,   120] loss: 0.663
[2,   140] loss: 0.658
[2,   160] loss: 0.662
[2,   180] loss: 0.656
[2,   200] loss: 0.655
[2,   220] loss: 0.659
[2,   240] loss: 0.654
[2,   260] loss: 0.642
[2,   280] loss: 0.648
[2,   300] loss: 0.645
[2,   320] loss: 0.657
[2,   340] loss: 0.642
[2,   360] loss: 0.645
Epoch: 1, Val. loss: 0.65, Val. acc: 0.62
[3,    20] loss: 0.647
[3,    40] loss: 0.634
[3,    60] loss: 0.619
[3,    80] loss: 0.

In [41]:
torch.save(model.state_dict(),join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN", "Pytorch_GNN_V2"))

In [42]:
for epoch in range(3):  # loop over the dataset multiple times
    model.train()
    running_loss = 0.0
    for i, [XE, X, A,ESM1b, labels] in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()
        XE, X, A, ESM1b, labels = XE.to(device), X.to(device), A.to(device),ESM1b.to(device), labels.to(device)
        # forward + backward + optimize
        outputs = model(XE, X, A, ESM1b)
        loss = criterion(outputs, labels.view((batch_size,-1)))
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 20 == 19:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 20))
            running_loss = 0.0
            
    #After each epoch, calculate the validation loss:
    running_acc = 0.0
    running_loss = 0.0
    model.eval()
    for i, [XE, X, A,ESM1b, labels] in enumerate(test_loader):
        XE, X, A, ESM1b, labels = XE.to(device), X.to(device), A.to(device),ESM1b.to(device), labels.to(device)
        
        with torch.no_grad():
            outputs = model(XE, X, A, ESM1b)
        loss = criterion(outputs, labels.view((batch_size,-1)))
        running_loss += loss.item()

        outputs2 = np.round(outputs.view(-1).cpu().detach().numpy()) 
        labels2 = labels.cpu().detach().numpy()
        acc = np.mean([outputs2[i] == labels2[i] for i in range(len(labels))])
        running_acc += acc

    print("Epoch: %s, Val. loss: %s, Val. acc: %s" % (epoch, np.round(running_loss/(i+1),2),
                                                                  np.round(running_acc/(i+1), 2)))

print('Finished Training')

[1,    20] loss: 0.527
[1,    40] loss: 0.524
[1,    60] loss: 0.517
[1,    80] loss: 0.511
[1,   100] loss: 0.522
[1,   120] loss: 0.519
[1,   140] loss: 0.520
[1,   160] loss: 0.507
[1,   180] loss: 0.515
[1,   200] loss: 0.526
[1,   220] loss: 0.511
[1,   240] loss: 0.527
[1,   260] loss: 0.528
[1,   280] loss: 0.522
[1,   300] loss: 0.518
[1,   320] loss: 0.524
[1,   340] loss: 0.531
[1,   360] loss: 0.545
Epoch: 0, Val. loss: 0.54, Val. acc: 0.71
[2,    20] loss: 0.536
[2,    40] loss: 0.537
[2,    60] loss: 0.502
[2,    80] loss: 0.527
[2,   100] loss: 0.544
[2,   120] loss: 0.522
[2,   140] loss: 0.519
[2,   160] loss: 0.536
[2,   180] loss: 0.522
[2,   200] loss: 0.519
[2,   220] loss: 0.519
[2,   240] loss: 0.526
[2,   260] loss: 0.500
[2,   280] loss: 0.536
[2,   300] loss: 0.507
[2,   320] loss: 0.514
[2,   340] loss: 0.527
[2,   360] loss: 0.512
Epoch: 1, Val. loss: 0.53, Val. acc: 0.72
[3,    20] loss: 0.533
[3,    40] loss: 0.522
[3,    60] loss: 0.498
[3,    80] loss: 0.

GNN(
  (BN1): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (BN2): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (BN3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear1): Linear(in_features=100, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=1, bias=True)
  (drop_layer): Dropout(p=0.2, inplace=False)
)

### 4. Creating GNN representations for traing and test set:

Load trained model:

In [2]:
model = GNN(D= 50, N = 70, F1 = 32 , F2 = 10, F = F1+F2).to(device)
model.load_state_dict(torch.load(join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN","Pytorch_GNN")))
model.eval()

GNN(
  (BN1): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (BN2): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (BN3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear1): Linear(in_features=100, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=1, bias=True)
  (drop_layer): Dropout(p=0.2, inplace=False)
)

Loading data:

In [3]:
df_train = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_train_with_ESM1b_ts.pkl"))
df_test = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_test_with_ESM1b_ts.pkl"))

Creating a DataFrame with all metabolites in our dataset:

In [4]:
valid_mols = os.listdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN_input_matrices"))
valid_mols = [mol.split("_A.npy")[0] for mol in valid_mols]
valid_mols = [mol for mol in valid_mols if not "." in mol]
df_mols = pd.DataFrame(data = {"molecule ID" : valid_mols})

#To create the substrate rep, the UID does not matter. Therfore, setting it random:
df_mols["uid"] = "P9WIQ3"
df_mols

Unnamed: 0,molecule ID,uid
0,C00001,P9WIQ3
1,C00002,P9WIQ3
2,C00003,P9WIQ3
3,C00004,P9WIQ3
4,C00005,P9WIQ3
...,...,...
1347,CHEBI_85986,P9WIQ3
1348,CHEBI_86339,P9WIQ3
1349,CHEBI_87136,P9WIQ3
1350,CHEBI_87305,P9WIQ3


In [5]:
input_data_folder = join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN_input_matrices")

def get_representation_input(cid_list):
    XE = ();
    X = ();
    A = ();
    UniRep = ();
    extras = ();
    # Generate data
    for i in range(len(cid_list)):
        cid  = cid_list[i]

        X = X + (np.load(join(input_data_folder, cid + '_X.npy')), );
        XE = XE + (np.load(join(input_data_folder, cid + '_XE.npy')), );
        A = A + (np.load(join(input_data_folder, cid + '_A.npy')), );
    return(XE, X, A)


def get_substrate_representations(df):
    df["substrate_rep"] = ""
    
    i = 0
    n = len(df)

    while i*64 <= n:
        cid_all = list(df["molecule ID"])

        if (i+1)*64  <= n:
            XE, X, A= get_representation_input(cid_all[i*64:(i+1)*64])
            
            XE = torch.tensor(np.array(XE), dtype = torch.float32).to(device)
            X = torch.tensor(np.array(X), dtype = torch.float32).to(device)
            A = torch.tensor(np.array(A), dtype = torch.float32).to(device)
            representations = model.get_GNN_rep(XE, X,A).cpu().detach().numpy()
            df["substrate_rep"][i*64:(i+1)*64] = list(representations[:, :D])
        else:
            print(i)
            XE, X, A= get_representation_input(cid_all[i*64:(i+1)*64])
            XE = torch.tensor(np.array(XE), dtype = torch.float32).to(device)
            X = torch.tensor(np.array(X), dtype = torch.float32).to(device)
            A = torch.tensor(np.array(A), dtype = torch.float32).to(device)
            representations = model.get_GNN_rep(XE, X,A).cpu().detach().numpy()
            df["substrate_rep"][-len(representations):] = list(representations[:, :D])
        i += 1
    return(df)

In [7]:
df_mols = get_substrate_representations(df = df_mols)
df_mols

Unnamed: 0,molecule ID,uid,substrate_rep
0,C00001,P9WIQ3,"[0.029613253, 0.0, 0.26818, 0.0, 0.13194986, 0..."
1,C00002,P9WIQ3,"[1238.0188, 0.0, 0.0, 42.365837, 74.54658, 28...."
2,C00003,P9WIQ3,"[2111.6353, 0.0, 0.0, 139.63763, 183.79233, 33..."
3,C00004,P9WIQ3,"[1813.2203, 60.59075, 0.0, 224.62354, 335.9574..."
4,C00005,P9WIQ3,"[1859.3567, 120.4939, 0.0, 255.73376, 384.9909..."
...,...,...,...
1347,CHEBI_85986,P9WIQ3,"[673.1395, 31.783257, 0.0, 15.939333, 38.13377..."
1348,CHEBI_86339,P9WIQ3,"[6.3847322, 18.652742, 0.0, 76.061226, 61.6412..."
1349,CHEBI_87136,P9WIQ3,"[1872.2444, 96.23757, 0.0, 217.94662, 388.0197..."
1350,CHEBI_87305,P9WIQ3,"[1924.4116, 97.580894, 0.0, 217.94662, 384.225..."


In [8]:
df_train["GNN rep"] = ""
for ind in df_train.index:
    try:
        df_train["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_train["molecule ID"][ind].replace(":", "_")])[0]
    except IndexError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_train["molecule ID"][ind].replace(":", "_")])[0]


In [9]:
df_test["GNN rep"] = ""
for ind in df_test.index:
    try:
        df_test["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_test["molecule ID"][ind].replace(":", "_")])[0]
    except IndexError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_test["molecule ID"][ind].replace(":", "_")])[0]


In [10]:
'''df_engqvist["molecule ID"] = df_engqvist["substrate"]

df_engqvist["GNN rep"] = ""
for ind in df_engqvist.index:
    try:
        df_engqvist["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_engqvist["molecule ID"][ind].replace(":", "_")])[0]
    except IndexError:
        pass
    
df_engqvist''';

In [12]:
df_train.to_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_train_with_ESM1b_ts_GNN.pkl"), protocol = 4)
df_test.to_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_test_with_ESM1b_ts_GNN.pkl"), protocol = 4)

#df_engqvist.to_pickle(join(CURRENT_DIR, "alex_data", "new_test_data_Engqvist_group_with_GNN_train_test_split_with_GNN_similar.pkl"), protocol = 4)