In [1]:
import sys
import os
from os.path import join
import numpy as np
import pickle
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

sys.path.append('.\\additional_code')
from xgboost_training import *

CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

cpu
C:\Users\alexk\projects\ESP\notebooks_and_code


## 1. Loading and preprocessing data:

In [2]:
df_train = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_train_with_ESM1b_ts.pkl"))
df_test = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_test_with_ESM1b_ts.pkl"))

### (a) Balancing datasets such that we have same amount of negative as positive data samples:

In [3]:
pos_UIDs = df_train["Uniprot ID"].loc[df_train["Binding"] == 1]
for UID in pos_UIDs:
    n_pos = len(df_train.loc[df_train["Binding"] == 1].loc[df_train["Uniprot ID"] == UID])
    help_df = df_train.loc[df_train["Binding"] == 0].loc[df_train["Uniprot ID"] == UID]
    df_train.drop(list(help_df.index)[n_pos:], inplace= True)
    df_train.reset_index(inplace = True, drop = True)

In [4]:
pos_UIDs = df_test["Uniprot ID"].loc[df_test["Binding"] == 1]
for UID in pos_UIDs:
    n_pos = len(df_test.loc[df_test["Binding"] == 1].loc[df_test["Uniprot ID"] == UID])
    help_df = df_test.loc[df_test["Binding"] == 0].loc[df_test["Uniprot ID"] == UID]
    df_test.drop(list(help_df.index)[n_pos:], inplace= True)
    df_test.reset_index(inplace = True, drop = True)

### (b) Create dictionary with all target values

In [5]:
mol_files = list(set(df_train["molecule ID"])) + list(set(df_test["molecule ID"]))
mol_files = list(set(mol_files))

target_variable_dict = {}
target_variable_dict = create_target_dict(df = df_train, target_variable_dict = target_variable_dict)
target_variable_dict = create_target_dict(df = df_test, target_variable_dict = target_variable_dict)

### (c) Get list with input combinations of Uniprot ID and metabolite ID

In [6]:
train_IDs = get_uid_cid_IDs(df_train)
test_IDs = get_uid_cid_IDs(df_test)

print(np.mean(df_train["Binding"]), np.mean(df_test["Binding"]))
print(len(train_IDs), len(test_IDs))

0.5 0.5
29480 7068


## 2. Calculating input matrices for metabolites

### (a) Creating input matrices:

In [9]:
def calculate_and_save_input_matrixes(molecule_ID, save_folder = join(CURRENT_DIR, ".." ,"data", "substrate_data",
                                                                      "GNN_input_matrices")):
    molecule_ID = molecule_ID.replace(":", "_")
    molecule_ID = molecule_ID.replace("/", "Q")
    [XE, X, A] = create_input_data_for_GNN_for_substrates(substrate_ID = molecule_ID, print_error=True)
    if not A is None:
        np.save(join(save_folder, molecule_ID + '_X.npy'), X) #feature matrix of atoms/nodes
        np.save(join(save_folder, molecule_ID + '_XE.npy'), XE) #feature matrix of atoms/nodes and bonds/edges
        np.save(join(save_folder, molecule_ID + '_A.npy'), A) #adjacency matrix
        
        
def calculate_atom_and_bond_feature_vectors(mol_files):
    #check if feature vectors have already been calculated:
    try:
        os.mkdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "mol_feature_vectors"))
    except FileExistsError:
        None
    
    #existing feature vector files:
    feature_files = os.listdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "mol_feature_vectors"))
    for mol_file in mol_files:
        #check if feature vectors were already calculated:
        try:
            if not mol_file + "-atoms.txt" in  feature_files:
                #load mol_file
                is_CHEBI_ID = (mol_file[0:5] == "CHEBI")
                is_inchi = (mol_file[0:5] == "InChI")
                if is_CHEBI_ID:
                    ID = int(mol_file.split(" ")[0].split(":")[-1])
                    Inchi = list(df_chebi_to_inchi["Inchi"].loc[df_chebi_to_inchi["ChEBI"] == float(ID)])[0]

                    if not pd.isnull(Inchi):
                        mol = Chem.inchi.MolFromInchi(Inchi)
                    else:
                        print(ID, Inchi)
                elif is_inchi:
                    mol = Chem.inchi.MolFromInchi(mol_file)
                    mol_file = mol_file.replace("/", "Q")
                else:
                    mol = Chem.MolFromMolFile(mol_folder +  "/mol-files/" + mol_file + '.mol')
       
            if not mol is None:
                calculate_atom_feature_vector_for_mol_file(mol, mol_file)
                calculate_bond_feature_vector_for_mol_file(mol, mol_file)
        except OSError: pass

In [10]:
calculate_atom_and_bond_feature_vectors(mol_files = mol_files)

In [11]:
for mol_ID in mol_files:
    calculate_and_save_input_matrixes(molecule_ID = mol_ID)

More than 70 (91) atoms in molcuele CHEBI_58466
Could not create input for substrate ID CHEBI_58466
More than 70 (70) atoms in molcuele CHEBI_83905
Could not create input for substrate ID CHEBI_83905
More than 70 (90) atoms in molcuele CHEBI_58502
Could not create input for substrate ID CHEBI_58502
More than 70 (76) atoms in molcuele CHEBI_57373
Could not create input for substrate ID CHEBI_57373
More than 70 (116) atoms in molcuele CHEBI_60032
Could not create input for substrate ID CHEBI_60032
More than 70 (79) atoms in molcuele CHEBI_58677
Could not create input for substrate ID CHEBI_58677
More than 70 (166) atoms in molcuele CHEBI_61502
Could not create input for substrate ID CHEBI_61502
More than 70 (194) atoms in molcuele CHEBI_61998
Could not create input for substrate ID CHEBI_61998
More than 70 (194) atoms in molcuele C00770
Could not create input for substrate ID C00770
More than 70 (70) atoms in molcuele CHEBI_18259
Could not create input for substrate ID CHEBI_18259
More t

###  (b) Removing all datapoints without molecule input file:

In [11]:
df_train["molecule ID"] = [ID.replace(":", "_") for ID in df_train["substrate ID"]]
df_test["molecule ID"] = [ID.replace(":", "_") for ID in df_test["substrate ID"]]

In [12]:
valid_mols = os.listdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN_input_matrices"))
valid_mols = [mol.split("_A")[0] for mol in valid_mols]

df_train = df_train.loc[df_train["molecule ID"].isin(valid_mols)]
df_test = df_test.loc[df_test["molecule ID"].isin(valid_mols)]

train_IDs = get_uid_cid_IDs(df_train)
test_IDs = get_uid_cid_IDs(df_test)
df_train

Unnamed: 0,Uniprot ID,molecule ID,evidence,Binding,type,substrate ID,ECFP,ESM1b,ESM1b_ts
0,G8BBN0,CHEBI_35681,exp,1.0,,CHEBI:35681,0100000000000000000000000000000001000000000000...,"[-0.033332635, 0.35044205, -0.07861315, 0.0046...","[0.5721893, 0.56740093, 0.09789569, 0.8466092,..."
1,P78937,CHEBI_30616,exp,1.0,,CHEBI:30616,0000000001000000000000000000000000000000000100...,"[0.049317513, 0.11258735, -0.08035447, 0.04825...","[-0.56589794, -0.5028634, 0.2953197, -0.357490..."
2,F4K688,CHEBI_30616,exp,1.0,,CHEBI:30616,0000000001000000000000000000000000000000000100...,"[-0.005019231, 0.06971764, -0.022618646, -0.03...","[0.3031646, 0.69172686, -1.0995013, 0.13241063..."
3,Q9Z0J5,CHEBI_58349,exp,1.0,,CHEBI:58349,0000000001000000100000100000000000000000000000...,"[-0.15290919, 0.31520224, 0.025415594, 0.02750...","[0.118711345, 0.8216332, -0.9046953, 1.179861,..."
4,P49189,CHEBI_58264,exp,1.0,,CHEBI:58264,0000000000000000010000000000000000000000000000...,"[-0.044796597, 0.24305029, 0.10043996, -0.0269...","[0.8842707, -0.06434063, 0.5387947, 1.6151128,..."
...,...,...,...,...,...,...,...,...,...
29475,C9Y9E7,CHEBI_16810,,0.0,engqvist,CHEBI:16810,0000000000000000000000000000000010000000000000...,"[0.1486952, 0.23952422, -0.18132365, 0.0853893...","[-0.5624707, 0.49068797, -0.78957033, 1.021208..."
29476,C9Y9E7,CHEBI_17544,,0.0,engqvist,CHEBI:17544,0000000000000000000000000000000000000000000000...,"[0.1486952, 0.23952422, -0.18132365, 0.0853893...","[-0.5624707, 0.49068797, -0.78957033, 1.021208..."
29477,C9Y9E7,C00007,,0.0,engqvist,C00007,0000000000000000100000000000000000000000000000...,"[0.1486952, 0.23952422, -0.18132365, 0.0853893...","[-0.5624707, 0.49068797, -0.78957033, 1.021208..."
29478,D4MUV9,CHEBI_16810,,0.0,engqvist,CHEBI:16810,0000000000000000000000000000000010000000000000...,"[0.08790772, 0.17450011, -0.014648443, 0.06931...","[1.0554699, 0.441238, 0.19652943, 1.1101232, -..."


### (c) Creating representations for the enzymes:

In [13]:
uids_list = list(set(df_train["Uniprot ID"])) + list(set(df_test["Uniprot ID"]))
uids_list = list(set(uids_list))
uid_to_emb = {}
embeddings = np.zeros((0,1280))
for uid in uids_list:
    try:
        emb = np.reshape(np.array(list(df_train["ESM1b"].loc[df_train["Uniprot ID"] == uid])[0]), (1,1280))
    except IndexError:
        try:
            emb = np.reshape(np.array(list(df_test["ESM1b"].loc[df_test["Uniprot ID"] == uid])[0]), (1,1280))
        except IndexError:
            emb = np.reshape(np.array(list(df_validation["ESM1b"].loc[df_validation["Uniprot ID"] == uid])[0]), (1,1280))
    embeddings = np.concatenate([embeddings, emb])
    uid_to_emb[uid] = emb

We perform a PCA an the enzyme representations to get 50-dimensional representations

In [14]:
from sklearn.decomposition import PCA
dim = 50

pca = PCA(n_components = dim)
pca.fit(embeddings)
emb_pca = pca.transform(embeddings)

#Calculate mean and std to normalize the PCA-transformed vectors
mean = np.mean(emb_pca, axis = 0)
std = np.std(emb_pca, axis = 0)

uid_to_pca_emb = {}

for i, uid in enumerate(uids_list):
    uid_to_pca_emb[uid] = (emb_pca[i] - mean) / std

In [15]:
uid_to_emb = uid_to_pca_emb

## 3. Training GNN:

###  (a) Defining a DataGenerator:

In [16]:
class CustomDataSet(Dataset):
    def __init__(self, split_IDs, folder):
        self.all_IDs = split_IDs
        self.folder = folder

    def __len__(self):
        return len(self.all_IDs)

    def __getitem__(self, idx):
        ID = self.all_IDs[idx]
        try:
            [uid,cid1, cid2] = ID.split("_") 
            cid = cid1 +"_"+cid2
        except ValueError:
            [uid,cid] = ID.split("_")
            
        XE = torch.tensor(np.load(join(self.folder, cid + '_XE.npy')), dtype = torch.float32)
        X = torch.tensor(np.load(join(self.folder, cid + '_X.npy')), dtype = torch.float32)
        A = torch.tensor(np.load(join(self.folder, cid + '_A.npy')), dtype = torch.float32)
        ESM1b = torch.tensor(uid_to_emb[uid], dtype = torch.float32)
        label = torch.tensor(target_variable_dict[ID], dtype= torch.float32)
        return XE,X,A,ESM1b, label

### (b) Splitting the training set in a validation and a training set:

In [17]:
n = len(train_IDs) 
random.seed(1)
random.shuffle(train_IDs)
test_IDs = train_IDs[int(0.8*n):]
train_IDs = train_IDs[:int(0.8*n)]


In [18]:
batch_size = 64

train_dataset = CustomDataSet(folder  = join(CURRENT_DIR, ".." ,"data", "substrate_data",
                                             "GNN_input_matrices"), split_IDs = train_IDs)
train_loader = DataLoader(train_dataset , batch_size=batch_size, shuffle=True, drop_last=True)

test_dataset = CustomDataSet(folder  = join(CURRENT_DIR, ".." ,"data", "substrate_data",
                                            "GNN_input_matrices"), split_IDs = test_IDs)
test_loader = DataLoader(test_dataset , batch_size=batch_size, shuffle=False, drop_last=True)

In [19]:
n_train_batches = int(len(train_dataset)/batch_size)
n_test_batches = int(len(test_dataset)/batch_size)
train_batches = list(range(n_train_batches))
test_batches = list(range(n_test_batches))

### (c) Training GNN:

In [20]:
pre_training = True

In [21]:
import torch.optim as optim

model = GNN(D= 100, N = 70, F1 = 32 , F2 = 10, F = F1+F2).to(device)
if pre_training:
    model.load_state_dict(torch.load(join(CURRENT_DIR, ".." ,"data", "substrate_data_KM", "GNN", "Pytorch_GNN_KM")))

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay= 0.00001)


    


for epoch in range(10):  # loop over the dataset multiple times
    model.train()
    running_loss = 0.0
    for i, [XE, X, A,ESM1b, labels] in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()
        XE, X, A, ESM1b, labels = XE.to(device), X.to(device), A.to(device),ESM1b.to(device), labels.to(device)
        # forward + backward + optimize
        outputs = model(XE, X, A, ESM1b)
        loss = criterion(outputs, labels.view((batch_size,-1)))
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 20 == 19:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 20))
            running_loss = 0.0
            
    #After each epoch, calculate the validation loss:
    running_acc = 0.0
    running_loss = 0.0
    model.eval()
    for i, [XE, X, A,ESM1b, labels] in enumerate(test_loader):
        XE, X, A, ESM1b, labels = XE.to(device), X.to(device), A.to(device),ESM1b.to(device), labels.to(device)
        
        with torch.no_grad():
            outputs = model(XE, X, A, ESM1b)
        loss = criterion(outputs, labels.view((batch_size,-1)))
        running_loss += loss.item()

        outputs2 = np.round(outputs.view(-1).cpu().detach().numpy()) 
        labels2 = labels.cpu().detach().numpy()
        acc = np.mean([outputs2[i] == labels2[i] for i in range(len(labels))])
        running_acc += acc

    print("Epoch: %s, Val. loss: %s, Val. acc: %s" % (epoch, np.round(running_loss/(i+1),2),
                                                                  np.round(running_acc/(i+1), 2)))

print('Finished Training')

  allow_unreachable=True)  # allow_unreachable flag
  allow_unreachable=True)  # allow_unreachable flag


[1,    20] loss: 0.743
[1,    40] loss: 0.693
[1,    60] loss: 0.707
[1,    80] loss: 0.690
[1,   100] loss: 0.694
[1,   120] loss: 0.682
[1,   140] loss: 0.690
[1,   160] loss: 0.688
[1,   180] loss: 0.682
[1,   200] loss: 0.676
[1,   220] loss: 0.686
[1,   240] loss: 0.675
[1,   260] loss: 0.681
[1,   280] loss: 0.674
[1,   300] loss: 0.679
[1,   320] loss: 0.676
[1,   340] loss: 0.673
[1,   360] loss: 0.682
Epoch: 0, Val. loss: 0.67, Val. acc: 0.58
[2,    20] loss: 0.666
[2,    40] loss: 0.662
[2,    60] loss: 0.674
[2,    80] loss: 0.662
[2,   100] loss: 0.671
[2,   120] loss: 0.660
[2,   140] loss: 0.661
[2,   160] loss: 0.659
[2,   180] loss: 0.649
[2,   200] loss: 0.668
[2,   220] loss: 0.655
[2,   240] loss: 0.659
[2,   260] loss: 0.650
[2,   280] loss: 0.646
[2,   300] loss: 0.650
[2,   320] loss: 0.635
[2,   340] loss: 0.646
[2,   360] loss: 0.643
Epoch: 1, Val. loss: 0.64, Val. acc: 0.61
[3,    20] loss: 0.626
[3,    40] loss: 0.635
[3,    60] loss: 0.632
[3,    80] loss: 0.

In [22]:
torch.save(model.state_dict(),join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN", "Pytorch_GNN_with_pretraining"))

### 4. Creating GNN representations for traing and test set:

Load trained model:

In [12]:
model = GNN(D= 100, N = 70, F1 = 32 , F2 = 10, F = F1+F2).to(device)
model.load_state_dict(torch.load(join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN", "Pytorch_GNN_with_pretraining")))
model.eval()

GNN(
  (BN1): BatchNorm2d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (BN2): BatchNorm2d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (BN3): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear1): Linear(in_features=150, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=1, bias=True)
  (drop_layer): Dropout(p=0.2, inplace=False)
)

Loading data:

In [13]:
df_train = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_train_with_ESM1b_ts.pkl"))
df_test = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_test_with_ESM1b_ts.pkl"))

Creating a DataFrame with all metabolites in our dataset:

In [14]:
valid_mols = os.listdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN_input_matrices"))
valid_mols = [mol.split("_A.npy")[0] for mol in valid_mols]
valid_mols = [mol for mol in valid_mols if not "." in mol]
df_mols = pd.DataFrame(data = {"molecule ID" : valid_mols})

#To create the substrate rep, the UID does not matter. Therfore, setting it random:
df_mols["uid"] = "P9WIQ3"
df_mols

Unnamed: 0,molecule ID,uid
0,C00001,P9WIQ3
1,C00002,P9WIQ3
2,C00003,P9WIQ3
3,C00004,P9WIQ3
4,C00005,P9WIQ3
...,...,...
1355,CHEBI_88052,P9WIQ3
1356,InChI=1SQC18H36O3Qc1-2-3-4-5-6-7-8-9-10-11-12-...,P9WIQ3
1357,"InChI=1SQC3H6O3Qc1-2(4)3(5)6Qh2,4H,1H3,(H,5,6)...",P9WIQ3
1358,"InChI=1SQC8H16O3Qc1-2-3-4-5-6-7(9)8(10)11Qh7,9...",P9WIQ3


In [15]:
input_data_folder = join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN_input_matrices")

def get_representation_input(cid_list):
    XE = ();
    X = ();
    A = ();
    UniRep = ();
    extras = ();
    # Generate data
    for i in range(len(cid_list)):
        cid  = cid_list[i]

        X = X + (np.load(join(input_data_folder, cid + '_X.npy')), );
        XE = XE + (np.load(join(input_data_folder, cid + '_XE.npy')), );
        A = A + (np.load(join(input_data_folder, cid + '_A.npy')), );
    return(XE, X, A)


def get_substrate_representations(df):
    df["substrate_rep"] = ""
    
    i = 0
    n = len(df)

    while i*64 <= n:
        cid_all = list(df["molecule ID"])

        if (i+1)*64  <= n:
            XE, X, A= get_representation_input(cid_all[i*64:(i+1)*64])
            
            XE = torch.tensor(np.array(XE), dtype = torch.float32).to(device)
            X = torch.tensor(np.array(X), dtype = torch.float32).to(device)
            A = torch.tensor(np.array(A), dtype = torch.float32).to(device)
            representations = model.get_GNN_rep(XE, X,A).cpu().detach().numpy()
            df["substrate_rep"][i*64:(i+1)*64] = list(representations[:, :D])
        else:
            print(i)
            XE, X, A= get_representation_input(cid_all[i*64:(i+1)*64])
            XE = torch.tensor(np.array(XE), dtype = torch.float32).to(device)
            X = torch.tensor(np.array(X), dtype = torch.float32).to(device)
            A = torch.tensor(np.array(A), dtype = torch.float32).to(device)
            representations = model.get_GNN_rep(XE, X,A).cpu().detach().numpy()
            df["substrate_rep"][-len(representations):] = list(representations[:, :D])
        i += 1
    return(df)

In [16]:
df_mols = get_substrate_representations(df = df_mols)
df_mols

21


Unnamed: 0,molecule ID,uid,substrate_rep
0,C00001,P9WIQ3,"[0.0, 0.6329006, 0.0, 44.773804, 41.644196, 21..."
1,C00002,P9WIQ3,"[0.0, 46.134777, 0.0, 0.0, 1177.1073, 187.5388..."
2,C00003,P9WIQ3,"[0.0, 8.150052, 0.0, 8.550125, 40.956882, 66.0..."
3,C00004,P9WIQ3,"[0.0, 1.8680593, 0.0, 0.0, 44.121048, 557.9273..."
4,C00005,P9WIQ3,"[0.0, 0.0, 0.0, 0.0, 183.79741, 173.6141, 0.0,..."
...,...,...,...
1355,CHEBI_88052,P9WIQ3,"[0.0, 46.631508, 0.0, 0.0, 2.3734078, 1.361197..."
1356,InChI=1SQC18H36O3Qc1-2-3-4-5-6-7-8-9-10-11-12-...,P9WIQ3,"[0.0, 0.2668656, 0.0, 0.0, 0.784772, 10.25194,..."
1357,"InChI=1SQC3H6O3Qc1-2(4)3(5)6Qh2,4H,1H3,(H,5,6)...",P9WIQ3,"[0.0, 0.9508717, 0.0, 90.905556, 0.79774696, 0..."
1358,"InChI=1SQC8H16O3Qc1-2-3-4-5-6-7(9)8(10)11Qh7,9...",P9WIQ3,"[0.0, 0.2668656, 0.0, 0.0, 0.784772, 10.25194,..."


In [21]:
df_train = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_train_with_ESM1b_ts_GNN.pkl"))
df_test = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_test_with_ESM1b_ts_GNN.pkl"))

In [23]:
df_train["GNN rep (pretrained)"] = ""
for ind in df_train.index:
    try:
        df_train["GNN rep (pretrained)"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_train["molecule ID"][ind].replace(":", "_").replace("Q", "/")])[0]
    except IndexError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [24]:
df_test["GNN rep (pretrained)"] = ""
for ind in df_test.index:
    try:
        df_test["GNN rep (pretrained)"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_test["molecule ID"][ind].replace(":", "_").replace("Q", "/")])[0]
    except IndexError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [25]:
df_train.to_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_train_with_ESM1b_ts_GNN.pkl"), protocol = 4)
df_test.to_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_test_with_ESM1b_ts_GNN.pkl"), protocol = 4)

#df_engqvist.to_pickle(join(CURRENT_DIR, "alex_data", "new_test_data_Engqvist_group_with_GNN_train_test_split_with_GNN_similar.pkl"), protocol = 4)