In [1]:
import sys
import os
from os.path import join
import numpy as np
import pickle
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

sys.path.append('.\\additional_code')
from xgboost_training import *

CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

cpu
C:\Users\alexk\projects\SubFinder\notebooks_and_code


## 1. Loading and preprocessing data:

### (c) Get list with input combinations of Uniprot ID and metabolite ID

## 2. Calculating input matrices for metabolites

In [2]:
df_Mou  = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "Min_data", "Min_df.pkl"))
df_Berry  = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "Min_data", "Min_validation_Berry_df.pkl"))
df_Oat  = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "Min_data", "Min_validation_Oat_df.pkl"))

df_test_new = pd.concat([df_Mou, df_Berry, df_Oat], ignore_index = True)
df_test_new.head()

df_test_new["metabolite"] = [met.replace("?", "+") for met in df_test_new["metabolite"]]

In [8]:
len(df_Mou), len(set(df_Mou["metabolite"])), len(set(df_Mou["enzyme"]))

(2847, 59, 53)

In [9]:
len(df_Oat), len(set(df_Oat["metabolite"])), len(set(df_Oat["enzyme"]))

(266, 38, 7)

In [10]:
len(df_Berry), len(set(df_Berry["metabolite"])), len(set(df_Berry["enzyme"]))

(380, 38, 10)

In [3]:
mol_files = list(df_test_new["metabolite"])
mol_names = list(df_test_new["metabolite"])

met_dict = {'Baicalein' : "InChI=1S/C15H10O5/c16-9-6-11(8-4-2-1-3-5-8)20-12-7-10(17)14(18)15(19)13(9)12/h1-7,17-19H",
 'Umbelliferone' : "InChI=1S/C9H6O3/c10-7-3-1-6-2-4-9(11)12-8(6)5-7/h1-5,10H",
 '4-Methyl-umbelliferone' : "InChI=1S/C10H8O3/c1-6-4-10(12)13-9-5-7(11)2-3-8(6)9/h2-5,11H,1H3",
 'Sinapic acid' : "InChI=1S/C11H12O5/c1-15-8-5-7(3-4-10(12)13)6-9(16-2)11(8)14/h3-6,14H,1-2H3,(H,12,13)/b4-3+",
 '4-hydroxyl-benzoic acid' : "InChI=1S/C7H6O3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,8H,(H,9,10)",
'a-cyano-4-hydroxyl-cinamic acid' : "InChI=1S/C10H7NO3/c11-6-8(10(13)14)5-7-1-3-9(12)4-2-7/h1-5,12H,(H,13,14)/b8-5+",
 '3,4-dichloroaniline' : "InChI=1S/C6H5Cl2N/c7-5-2-1-4(9)3-6(5)8/h1-3H,9H2",
 '3,4-dihydroxylbenzoic acid' : "InChI=1S/C7H6O4/c8-5-2-1-4(7(10)11)3-6(5)9/h1-3,8-9H,(H,10,11)",
#2,5-dihydroxybenzoic acid
 ' 2,5-dihydroxylbenzoic acid' : "InChI=1S/C7H6O4/c8-4-1-2-6(9)5(3-4)7(10)11/h1-3,8-9H,(H,10,11)",
 ' D-glycerate' : "InChI=1S/C3H6O4/c4-1-2(5)3(6)7/h2,4-5H,1H2,(H,6,7)/p-1/t2-/m1/s1",
#D-GlcNAc
 ' GlcNAc' : "InChI=1S/C8H15NO6/c1-3(11)9-5-7(13)6(12)4(2-10)15-8(5)14/h4-8,10,12-14H,2H2,1H3,(H,9,11)",
 ' Indole 3-acetate' : "InChI=1S/C10H9NO2/c12-10(13)5-7-6-11-9-4-2-1-3-8(7)9/h1-4,6,11H,5H2,(H,12,13)/p-1",
 ' Gibberellin A3' : "InChI=1S/C19H22O6/c1-9-7-17-8-18(9,24)5-3-10(17)19-6-4-11(20)16(2,15(23)25-19)13(19)12(17)14(21)22/h4,6,10-13,20,24H,1,3,5,7-8H2,2H3,(H,21,22)/t10-,11+,12-,13-,16-,17+,18+,19-/m1/s1",
 ' Gibberellin A4' : "InChI=1S/C19H24O5/c1-9-7-18-8-10(9)3-4-11(18)19-6-5-12(20)17(2,16(23)24-19)14(19)13(18)15(21)22/h10-14,20H,1,3-8H2,2H3,(H,21,22)/t10-,11-,12+,13-,14-,17-,18+,19-/m1/s1",
#(+)-Jasmonic acid 
' (?)-Jasmonic acid' : "InChI=1S/C12H18O3/c1-2-3-4-5-10-9(8-12(14)15)6-7-11(10)13/h3-4,9-10H,2,5-8H2,1H3,(H,14,15)/b4-3-/t9-,10-/m0/s1",
 #(+)-cis,trans-Abscisic Acid
 ' (?)- cis, trans Abscisic acid' : "InChI=1S/C15H20O4/c1-10(7-13(17)18)5-6-15(19)11(2)8-12(16)9-14(15,3)4/h5-8,19H,9H2,1-4H3,(H,17,18)/b6-5+,10-7+/t15-/m1/s1",
 ' Kinetin' : "InChI=1S/C10H9N5O/c1-2-7(16-3-1)4-11-9-8-10(13-5-12-8)15-6-14-9/h1-3,5-6H,4H2,(H2,11,12,13,14,15)",
 'Zeatin' : "InChI=1S/C10H13N5O/c1-7(4-16)2-3-11-9-8-10(13-5-12-8)15-6-14-9/h2,5-6,16H,3-4H2,1H3,(H2,11,12,13,14,15)/b7-2+",
 'Luteolin' : "InChI=1S/C15H10O6/c16-8-4-11(19)15-12(20)6-13(21-14(15)5-8)7-1-2-9(17)10(18)3-7/h1-6,16-19H",
 'Quercetin' : "InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",
 ' Fisetin' : "InChI=1S/C15H10O6/c16-8-2-3-9-12(6-8)21-15(14(20)13(9)19)7-1-4-10(17)11(18)5-7/h1-6,16-18,20H",
 ' Kaempferol' : "InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",
 'Cinnamic acid' : "InChI=1S/C9H8O2/c10-9(11)7-6-8-4-2-1-3-5-8/h1-7H,(H,10,11)/b7-6+",
 '4-hydroxy cinnamic acid' : "InChI=1S/C9H8O3/c10-8-4-1-7(2-5-8)3-6-9(11)12/h1-6,10H,(H,11,12)/b6-3+",
 '3,4-dihyroxy cinnamic acid' : "InChI=1S/C9H8O4/c10-7-3-1-6(5-8(7)11)2-4-9(12)13/h1-5,10-11H,(H,12,13)/b4-2+",
 '4-hydroxy 3-methoxy cinnamic acid' : "InChI=1S/C10H10O4/c11-6-8-5-7(1-3-9(8)12)2-4-10(13)14/h1-5,11-12H,6H2,(H,13,14)/b4-2+",
 '2-hydroxy cinnamic acid' : "InChI=1S/C9H8O3/c10-8-4-2-1-3-7(8)5-6-9(11)12/h1-6,10H,(H,11,12)",
 '3-hydroxy cinnamic acid' : "InChI=1S/C9H8O3/c10-8-3-1-2-7(6-8)4-5-9(11)12/h1-6,10H,(H,11,12)/b5-4+",
 '7-hydroxy 6-methoxy coumarin (Scopoletin)' : "InChI=1S/C10H8O4/c1-13-9-4-6-2-3-10(12)14-8(6)5-7(9)11/h2-5,11H,1H3",
 '6,7-dihydroxy coumarin (Esculetin)' : "InChI=1S/C9H6O4/c10-6-3-5-1-2-9(12)13-8(5)4-7(6)11/h1-4,10-11H",
 'Threonine' : "InChI=1S/C4H9NO3/c1-2(6)3(5)4(7)8/h2-3,6H,5H2,1H3,(H,7,8)/t2-,3+/m1/s1",
 'Glucose' : "InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1",
 'Dihydrojasmonic acid' : "InChI=1S/C12H20O3/c1-2-3-4-5-10-9(8-12(14)15)6-7-11(10)13/h9-10H,2-8H2,1H3,(H,14,15)",
 'Ser-Phe' : "InChI=1S/C12H16N2O4/c13-9(7-15)11(16)14-10(12(17)18)6-8-4-2-1-3-5-8/h1-5,9-10,15H,6-7,13H2,(H,14,16)(H,17,18)/t9-,10-/m0/s1",
 'Ser-Leu' : "InChI=1S/C9H18N2O4/c1-5(2)3-7(9(14)15)11-8(13)6(10)4-12/h5-7,12H,3-4,10H2,1-2H3,(H,11,13)(H,14,15)/t6-,7-/m0/s1",
 'BocCysThrOMe' : "InChI=1S/C13H24N2O6S/c1-7(16)9(11(18)20-5)15-10(17)8(6-22)14-12(19)21-13(2,3)4/h7-9,16,22H,6H2,1-5H3,(H,14,19)(H,15,17)",
 '1-Thio-S-cyanomethyl-N-acetyl-D-glucosamine' : "InChI=1S/C10H20N2O5S/c1-5(14)12-7-9(16)8(15)6(4-13)17-10(7)18-3-2-11/h6-10,13,15-16H,2-4,11H2,1H3,(H,12,14)",
 'MUGlcNAc' : "InChI=1S/C17H18O8/c1-7-5-12(19)24-11-6-9(3-4-10(7)11)23-17-13(8(2)18)14(20)15(21)16(22)25-17/h3-6,13-17,20-22H,1-2H3",
 'Trans-dihydroquercetin (DHQ)' : "InChI=1S/C15H12O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,14-19,21H/t14-,15+/m0/s1",
 '7-hydroxycoumerin 3-carboxylic acid' : "InChI=1S/C10H6O5/c11-6-2-1-5-3-7(9(12)13)10(14)15-8(5)4-6/h1-4,11H,(H,12,13)",
 '7-hydroxycoumerin 4-acetic acid' : "InChI=1S/C11H8O5/c12-7-1-2-8-6(3-10(13)14)4-11(15)16-9(8)5-7/h1-2,4-5,12H,3H2,(H,13,14)",
 'Chloramphenicol' : "InChI=1S/C11H12Cl2N2O5/c12-10(13)11(18)14-8(5-16)9(17)6-1-3-7(4-2-6)15(19)20/h1-4,8-10,16-17H,5H2,(H,14,18)/t8-,9-/m1/s1",
 '?-GlcOBn' : "InChI=1S/C13H18O6/c14-6-9-10(15)11(16)12(17)13(19-9)18-7-8-4-2-1-3-5-8/h1-5,9-17H,6-7H2",
 'a-ManOBn' : "InChI=1S/C13H18O6/c14-6-9-10(15)11(16)12(17)13(19-9)18-7-8-4-2-1-3-5-8/h1-5,9-17H,6-7H2",
 'a-ManOPh' : "InChI=1S/C12H16O6/c13-6-8-9(14)10(15)11(16)12(18-8)17-7-4-2-1-3-5-7/h1-5,8-16H,6H2",
 'a-ManOCH2Bn' : "InChI=1S/C14H20O6/c15-8-10-11(16)12(17)13(18)14(20-10)19-7-6-9-4-2-1-3-5-9/h1-5,10-18H,6-8H2",
 'a-ManOPMP' : "InChI=1S/C13H18O7/c14-5-7-1-3-8(4-2-7)19-13-12(18)11(17)10(16)9(6-15)20-13/h1-4,9-18H,5-6H2",
 'a-ManOBn(pNO2)' : "InChI=1S/C13H19NO8/c15-5-9-10(16)11(17)12(18)13(22-9)21-6-7-1-3-8(4-2-7)14(19)20/h1-4,9-13,15-20H,5-6H2",
 'a-ManOPhF5' : "InChI=1S/C12H11F5O6/c13-3-4(14)6(16)11(7(17)5(3)15)23-12-10(21)9(20)8(19)2(1-18)22-12/h2,8-10,12,18-21H,1H2",
 'a-ManOBnF5' : "InChI=1S/C13H13F5O6/c14-5-3(6(15)8(17)9(18)7(5)16)2-23-13-12(22)11(21)10(20)4(1-19)24-13/h4,10-13,19-22H,1-2H2",
 'ManSTol' : "InChI=1S/C13H18O5S/c1-7-2-4-8(5-3-7)19-13-12(17)11(16)10(15)9(6-14)18-13/h2-5,9-17H,6H2,1H3",
 'Catechin' : "InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1",
 'Genistein' : "InChI=1S/C15H10O5/c16-9-3-1-8(2-4-9)11-7-20-13-6-10(17)5-12(18)14(13)15(11)19/h1-7,16-18H",
 'N6-Benzyladenine' : "InChI=1S/C12H11N5/c1-2-4-9(5-3-1)6-13-11-10-12(15-7-14-10)17-8-16-11/h1-5,7-8H,6H2,(H2,13,14,15,16,17)",
 'Trans-Zentin-Glucose' : "InChI=1S/C16H23N5O6/c1-8(2-3-17-14-10-15(19-6-18-10)21-7-20-14)5-26-16-13(25)12(24)11(23)9(4-22)27-16/h2,6-7,9,11-13,16,22-25H,3-5H2,1H3,(H2,17,18,19,20,21)/b8-2+",
 'DihydroZeatin' : "InChI=1S/C10H15N5O/c1-7(4-16)2-3-11-9-8-10(13-5-12-8)15-6-14-9/h5-7,16H,2-4H2,1H3,(H2,11,12,13,14,15)",
 'Olomoucine' : "InChI=1S/C15H18N6O/c1-21-10-18-12-13(17-9-11-5-3-2-4-6-11)19-15(16-7-8-22)20-14(12)21/h2-6,10,22H,7-9H2,1H3,(H2,16,17,19,20)",
 'N6-isopentenyladenine' : "InChI=1S/C10H13N5/c1-7(2)3-4-11-9-8-10(13-5-12-8)15-6-14-9/h3,5-6H,4H2,1-2H3,(H2,11,12,13,14,15)",
 'Spectinomycin' : "InChI=1S/C14H24N2O7/c1-5-4-6(17)14(20)13(21-5)22-12-10(19)7(15-2)9(18)8(16-3)11(12)23-14/h5,7-13,15-16,18-20H,4H2,1-3H3/t5-,7-,8+,9+,10+,11-,12-,13+,14+/m1/s1",
 "oleanodmycin" : "InChI=1S/C35H61NO12/c1-16-14-35(15-43-35)32(40)19(4)27(37)18(3)22(7)46-33(41)21(6)31(47-26-13-25(42-11)28(38)23(8)45-26)20(5)30(16)48-34-29(39)24(36(9)10)12-17(2)44-34/h16-31,34,37-39H,12-15H2,1-11H3/t16-,17+,18-,19+,20+,21+,22+,23-,24-,25-,26-,27-,28-,29+,30-,31-,34-,35+/m0/s1",
 "novobiocin" : "InChI=1S/C31H36N2O11/c1-14(2)7-8-16-13-17(9-11-19(16)34)27(37)33-21-22(35)18-10-12-20(15(3)24(18)42-28(21)38)41-29-23(36)25(43-30(32)39)26(40-6)31(4,5)44-29/h7,9-13,23,25-26,29,34-36H,8H2,1-6H3,(H2,32,39)(H,33,37)/t23-,25+,26-,29-/m1/s1",
 "spectinomycin" : "InChI=1S/C14H24N2O7/c1-5-4-6(17)14(20)13(21-5)22-12-10(19)7(15-2)9(18)8(16-3)11(12)23-14/h5,7-13,15-16,18-20H,4H2,1-3H3/t5-,7-,8+,9+,10+,11-,12-,13+,14+/m1/s1",
 "CHAPS" : "InChI=1S/C32H58N2O7S/c1-21(8-11-29(38)33-14-6-15-34(4,5)16-7-17-42(39,40)41)24-9-10-25-30-26(20-28(37)32(24,25)3)31(2)13-12-23(35)18-22(31)19-27(30)36/h21-28,30,35-37H,6-20H2,1-5H3,(H-,33,38,39,40,41)/t21-,22+,23-,24-,25+,26+,27-,28+,30+,31+,32-/m1/s1",
 "solanidine" : "InChI=1S/C27H43NO/c1-16-5-8-23-17(2)25-24(28(23)15-16)14-22-20-7-6-18-13-19(29)9-11-26(18,3)21(20)10-12-27(22,25)4/h6,16-17,19-25,29H,5,7-15H2,1-4H3/t16-,17+,19-,20+,21-,22-,23+,24-,25-,26-,27-/m0/s1",
 "solasodine" : "InChI=1S/C27H43NO2/c1-16-7-12-27(28-15-16)17(2)24-23(30-27)14-22-20-6-5-18-13-19(29)8-10-25(18,3)21(20)9-11-26(22,24)4/h5,16-17,19-24,28-29H,6-15H2,1-4H3/t16-,17+,19+,20-,21+,22+,23+,24+,25+,26+,27-/m1/s1",
 "b-sitosterol"  : "InChI=1S/C29H50O/c1-7-21(19(2)3)9-8-20(4)25-12-13-26-24-11-10-22-18-23(30)14-16-28(22,5)27(24)15-17-29(25,26)6/h10,19-21,23-27,30H,7-9,11-18H2,1-6H3/t20-,21-,23+,24+,25-,26+,27+,28+,29-/m1/s1" }

In [4]:
def calculate_atom_and_bond_feature_vectors(mol_files, filenames):
    #check if feature vectors have already been calculated:
    try:
        os.mkdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "mol_feature_vectors"))
    except FileExistsError:
        None
    
    #existing feature vector files:
    feature_files = os.listdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "mol_feature_vectors"))
    for i, mol_file in enumerate(mol_files):
        mol_name = filenames[i]
        #check if feature vectors were already calculated:
        if not mol_file + "-atoms.txt" in  feature_files:
            #load mol_file
            Inchi = met_dict[mol_file.replace("+", "?")]
            mol = Chem.inchi.MolFromInchi(Inchi)

            if not mol is None:
                calculate_atom_feature_vector_for_mol_file(mol, mol_file, mol_name)
                calculate_bond_feature_vector_for_mol_file(mol, mol_file, mol_name)
                
def calculate_atom_feature_vector_for_mol_file(mol, mol_file, mol_name):
    #get number of atoms N
    N = mol.GetNumAtoms()
    atom_list = []
    for i in range(N):
        features = []
        atom = mol.GetAtomWithIdx(i)
        features.append(atom.GetAtomicNum()), features.append(atom.GetDegree()), features.append(atom.GetFormalCharge())
        features.append(str(atom.GetHybridization())), features.append(atom.GetIsAromatic()), features.append(atom.GetMass())
        features.append(atom.GetTotalNumHs()), features.append(str(atom.GetChiralTag()))
        atom_list.append(features)
    with open(join(CURRENT_DIR, ".." ,"data", "substrate_data", "mol_feature_vectors", mol_name + "-atoms.txt"), "wb") as fp:   #Pickling
        pickle.dump(atom_list, fp)
            
def calculate_bond_feature_vector_for_mol_file(mol, mol_file, mol_name):
    N = mol.GetNumBonds()
    bond_list = []
    for i in range(N):
        features = []
        bond = mol.GetBondWithIdx(i)
        features.append(bond.GetBeginAtomIdx()), features.append(bond.GetEndAtomIdx()),
        features.append(str(bond.GetBondType())), features.append(bond.GetIsAromatic()),
        features.append(bond.IsInRing()), features.append(str(bond.GetStereo()))
        bond_list.append(features)
    with open(join(CURRENT_DIR, ".." ,"data", "substrate_data", "mol_feature_vectors", mol_name + "-bonds.txt"), "wb") as fp:   #Pickling
        pickle.dump(bond_list, fp)

In [5]:
calculate_atom_and_bond_feature_vectors(mol_files = mol_files, filenames = mol_names)

In [6]:
for mol_ID in mol_names:
    calculate_and_save_input_matrixes(molecule_ID = mol_ID)

### 4. Creating GNN representations for traing and test set:

Load trained model:

In [3]:
model = GNN(D= 50, N = 70, F1 = 32 , F2 = 10, F = F1+F2).to(device)
model.load_state_dict(torch.load(join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN","Pytorch_GNN")))
model.eval()

GNN(
  (BN1): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (BN2): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (BN3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear1): Linear(in_features=100, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=1, bias=True)
  (drop_layer): Dropout(p=0.2, inplace=False)
)

Loading data:

In [4]:
df_Mou  = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "Min_data", "Min_df.pkl"))
df_Berry  = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "Min_data", "Min_validation_Berry_df.pkl"))
df_Oat  = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "Min_data", "Min_validation_Oat_df.pkl"))


df_Mou["molecule ID"] = [met.replace("?", "+") for met in df_Mou["metabolite"]]
df_Berry["molecule ID"] = [met.replace("?", "+") for met in df_Berry["metabolite"]]
df_Oat["molecule ID"] = [met.replace("?", "+") for met in df_Oat["metabolite"]]

Creating a DataFrame with all metabolites in our dataset:

In [5]:
valid_mols = os.listdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN_input_matrices"))
valid_mols = [mol.split("_A.npy")[0] for mol in valid_mols]
valid_mols = [mol for mol in valid_mols if not "." in mol]
df_mols = pd.DataFrame(data = {"molecule ID" : valid_mols})

#To create the substrate rep, the UID does not matter. Therfore, setting it random:
df_mols["uid"] = "P9WIQ3"
df_mols

Unnamed: 0,molecule ID,uid
0,"(+)- cis, trans Abscisic acid",P9WIQ3
1,(+)-Jasmonic acid,P9WIQ3
2,"2,5-dihydroxylbenzoic acid",P9WIQ3
3,D-glycerate,P9WIQ3
4,Fisetin,P9WIQ3
...,...,...
1438,Trans-Zentin-Glucose,P9WIQ3
1439,Umbelliferone,P9WIQ3
1440,Valeronitrile,P9WIQ3
1441,Zeatin,P9WIQ3


In [6]:
input_data_folder = join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN_input_matrices")

def get_representation_input(cid_list):
    XE = ();
    X = ();
    A = ();
    UniRep = ();
    extras = ();
    # Generate data
    for i in range(len(cid_list)):
        cid  = cid_list[i]

        X = X + (np.load(join(input_data_folder, cid + '_X.npy')), );
        XE = XE + (np.load(join(input_data_folder, cid + '_XE.npy')), );
        A = A + (np.load(join(input_data_folder, cid + '_A.npy')), );
    return(XE, X, A)


def get_substrate_representations(df):
    df["substrate_rep"] = ""
    
    i = 0
    n = len(df)

    while i*64 <= n:
        cid_all = list(df["molecule ID"])

        if (i+1)*64  <= n:
            XE, X, A= get_representation_input(cid_all[i*64:(i+1)*64])
            
            XE = torch.tensor(np.array(XE), dtype = torch.float32).to(device)
            X = torch.tensor(np.array(X), dtype = torch.float32).to(device)
            A = torch.tensor(np.array(A), dtype = torch.float32).to(device)
            representations = model.get_GNN_rep(XE, X,A).cpu().detach().numpy()
            df["substrate_rep"][i*64:(i+1)*64] = list(representations[:, :D])
        else:
            print(i)
            XE, X, A= get_representation_input(cid_all[i*64:(i+1)*64])
            XE = torch.tensor(np.array(XE), dtype = torch.float32).to(device)
            X = torch.tensor(np.array(X), dtype = torch.float32).to(device)
            A = torch.tensor(np.array(A), dtype = torch.float32).to(device)
            representations = model.get_GNN_rep(XE, X,A).cpu().detach().numpy()
            df["substrate_rep"][-len(representations):] = list(representations[:, :D])
        i += 1
    return(df)

In [7]:
df_mols = get_substrate_representations(df = df_mols)
df_mols

22


Unnamed: 0,molecule ID,uid,substrate_rep
0,"(+)- cis, trans Abscisic acid",P9WIQ3,"[263.28204, 101.09926, 44.73753, 97.21757, 73...."
1,(+)-Jasmonic acid,P9WIQ3,"[275.29468, 53.96031, 32.44248, 183.91042, 390..."
2,"2,5-dihydroxylbenzoic acid",P9WIQ3,"[319.7119, 0.0, 0.0, 0.0, 0.0, 171.47157, 0.0,..."
3,D-glycerate,P9WIQ3,"[67.22981, 23.192509, 0.0, 34.899544, 15.00455..."
4,Fisetin,P9WIQ3,"[1040.2584, 0.0, 0.0, 0.0, 0.0, 365.02298, 0.0..."
...,...,...,...
1438,Trans-Zentin-Glucose,P9WIQ3,"[758.35345, 58.56499, 0.0, 33.78201, 165.84697..."
1439,Umbelliferone,P9WIQ3,"[603.0928, 0.0, 0.0, 0.0, 0.0, 385.28824, 0.0,..."
1440,Valeronitrile,P9WIQ3,"[31.178925, 19.753098, 59.99971, 51.572536, 92..."
1441,Zeatin,P9WIQ3,"[445.40707, 41.34903, 0.0, 26.694038, 66.09759..."


In [8]:
df_Mou["GNN rep"] = ""
for ind in df_Mou.index:
    try:
        df_Mou["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_Mou["molecule ID"][ind].replace(":", "_")])[0]
    except IndexError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Mou["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_Mou["molecule ID"][ind].replace(":", "_")])[0]


In [9]:
df_Berry["GNN rep"] = ""
for ind in df_Berry.index:
    try:
        df_Berry["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_Berry["molecule ID"][ind].replace(":", "_")])[0]
    except IndexError:
        pass

In [10]:
df_Oat["GNN rep"] = ""
for ind in df_Oat.index:
    try:
        df_Oat["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_Oat["molecule ID"][ind]])[0]
    except IndexError:
        pass

In [12]:
df_Mou.to_pickle(join(CURRENT_DIR, ".." ,"data", "Min_data", "Min_df_GNN.pkl"), protocol = 4)
df_Berry.to_pickle(join(CURRENT_DIR, ".." ,"data", "Min_data", "Min_validation_Berry_df_GNN.pkl"), protocol = 4)
df_Oat.to_pickle(join(CURRENT_DIR, ".." ,"data", "Min_data", "Min_validation_Oat_df_GNN.pkl"), protocol = 4)