In [1]:
import sys
import os
from os.path import join
import numpy as np
import pickle
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

sys.path.append('.\\additional_code')
from xgboost_training import *

CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

cuda:0
C:\Users\alexk\projects\SubFinder\notebooks_and_code


## 1. Loading and preprocessing data:

### (c) Get list with input combinations of Uniprot ID and metabolite ID

## 2. Calculating input matrices for metabolites

In [2]:
df_test_new = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "Mou_data", "Mou_df.pkl"))
                                  
df_test_new

Unnamed: 0,enzyme,metabolite,activity,ECFP,Sequence,ESM1b_ts
0,1a1,4-Aminobenzonitrile,0.716,0000000000000000000000000000000000100000000000...,TIVKAAAVQISPVLYSREGTVERVVKKIRELGEKGVQFATFPETVI...,"[0.9186029, -1.0484099, -0.13906932, 0.1101291..."
1,1a1,4-Nitrophenylacetonitrile,0.000,0000000000001000000000000000000000000000000000...,TIVKAAAVQISPVLYSREGTVERVVKKIRELGEKGVQFATFPETVI...,"[0.9186029, -1.0484099, -0.13906932, 0.1101291..."
2,1a1,4-(Dimethylamino)benzonitrile,4.363,0000000000000000000000000000000001100000000000...,TIVKAAAVQISPVLYSREGTVERVVKKIRELGEKGVQFATFPETVI...,"[0.9186029, -1.0484099, -0.13906932, 0.1101291..."
3,1a1,4-Chlorobenzonitrile,3.998,0000000000000000000000000000000000100000000000...,TIVKAAAVQISPVLYSREGTVERVVKKIRELGEKGVQFATFPETVI...,"[0.9186029, -1.0484099, -0.13906932, 0.1101291..."
4,1a1,"2,6-Dichlorobenzonitrile",0.000,0000000000000001000000000000000000000000000000...,TIVKAAAVQISPVLYSREGTVERVVKKIRELGEKGVQFATFPETVI...,"[0.9186029, -1.0484099, -0.13906932, 0.1101291..."
...,...,...,...,...,...,...
235,pmi28,Propionitrile,1.054,0000000000000000000000000000000001000000000000...,MKIVKAAAVQISPVLYNREATVEKVVQKILELGQQGVQFATFPETV...,"[0.85646003, -1.1750039, -0.1453441, 0.1105357..."
236,pmi28,Benzoylacetonitrile,2.902,0000000000000000000000000000000000000000000000...,MKIVKAAAVQISPVLYNREATVEKVVQKILELGQQGVQFATFPETV...,"[0.85646003, -1.1750039, -0.1453441, 0.1105357..."
237,pmi28,Mandelonitrile,0.000,0100000000000000000000000000000000000000000000...,MKIVKAAAVQISPVLYNREATVEKVVQKILELGQQGVQFATFPETV...,"[0.85646003, -1.1750039, -0.1453441, 0.1105357..."
238,pmi28,Benzonitrile,4.580,0000000000000000000000000000000000100000000000...,MKIVKAAAVQISPVLYNREATVEKVVQKILELGQQGVQFATFPETV...,"[0.85646003, -1.1750039, -0.1453441, 0.1105357..."


In [3]:
met_dict = {'2,6-Dichlorobenzonitrile' : "2_6-dichlorobenzonitrile",
           '⍺-Methylbenzyl cyanide' : "alpha-methylbenzylcyanide",
           '4-(Dimethylamino)benzonitrile' :"4-Dimethylamino-benzonitrile"}

df_test_new["substrate ID"] = np.nan
for ind in df_test_new.index:
    met = df_test_new["metabolite"][ind]
    try:
        df_test_new["substrate ID"][ind] = met_dict[met]
    except KeyError:
        df_test_new["substrate ID"][ind] = met

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_new["substrate ID"][ind] = met
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_new["substrate ID"][ind] = met_dict[met]


In [4]:
mol_files = list(df_test_new["substrate ID"])
mol_names = list(df_test_new["metabolite"])

In [5]:
def calculate_atom_and_bond_feature_vectors(mol_files, filenames):
    #check if feature vectors have already been calculated:
    try:
        os.mkdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "mol_feature_vectors"))
    except FileExistsError:
        None
    
    #existing feature vector files:
    feature_files = os.listdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "mol_feature_vectors"))
    for i, mol_file in enumerate(mol_files):
        mol_name = filenames[i]
        #check if feature vectors were already calculated:
        if not mol_file + "-atoms.txt" in  feature_files:
            #load mol_file
            is_InChI_ID = (mol_file[0:5] == "InChI")
            if is_InChI_ID:
                    mol = Chem.inchi.MolFromInchi(mol_file)
    
            else:
                mol = Chem.MolFromMolFile(join(CURRENT_DIR, ".." ,"data",
                                               "Mou_data","nitrile_coordinates", mol_file + '.mol'))
            if not mol is None:
                calculate_atom_feature_vector_for_mol_file(mol, mol_file, mol_name)
                calculate_bond_feature_vector_for_mol_file(mol, mol_file, mol_name)
                
def calculate_atom_feature_vector_for_mol_file(mol, mol_file, mol_name):
    #get number of atoms N
    N = mol.GetNumAtoms()
    atom_list = []
    for i in range(N):
        features = []
        atom = mol.GetAtomWithIdx(i)
        features.append(atom.GetAtomicNum()), features.append(atom.GetDegree()), features.append(atom.GetFormalCharge())
        features.append(str(atom.GetHybridization())), features.append(atom.GetIsAromatic()), features.append(atom.GetMass())
        features.append(atom.GetTotalNumHs()), features.append(str(atom.GetChiralTag()))
        atom_list.append(features)
    with open(join(CURRENT_DIR, ".." ,"data", "substrate_data", "mol_feature_vectors", mol_name + "-atoms.txt"), "wb") as fp:   #Pickling
        pickle.dump(atom_list, fp)
            
def calculate_bond_feature_vector_for_mol_file(mol, mol_file, mol_name):
    N = mol.GetNumBonds()
    bond_list = []
    for i in range(N):
        features = []
        bond = mol.GetBondWithIdx(i)
        features.append(bond.GetBeginAtomIdx()), features.append(bond.GetEndAtomIdx()),
        features.append(str(bond.GetBondType())), features.append(bond.GetIsAromatic()),
        features.append(bond.IsInRing()), features.append(str(bond.GetStereo()))
        bond_list.append(features)
    with open(join(CURRENT_DIR, ".." ,"data", "substrate_data", "mol_feature_vectors", mol_name + "-bonds.txt"), "wb") as fp:   #Pickling
        pickle.dump(bond_list, fp)

In [6]:
calculate_atom_and_bond_feature_vectors(mol_files = mol_files, filenames = mol_names)

In [7]:
for mol_ID in mol_names:
    calculate_and_save_input_matrixes(molecule_ID = mol_ID)

### 4. Creating GNN representations for traing and test set:

Load trained model:

In [2]:
model = GNN(D= 50, N = 70, F1 = 32 , F2 = 10, F = F1+F2).to(device)
model.load_state_dict(torch.load(join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN","Pytorch_GNN")))
model.eval()

GNN(
  (BN1): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (BN2): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (BN3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear1): Linear(in_features=100, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=1, bias=True)
  (drop_layer): Dropout(p=0.2, inplace=False)
)

Loading data:

In [9]:
#df_train = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_train_with_EC1_1_3_15_with_ESM1b_ts.pkl"))
#df_test = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_test_with_EC1_1_3_15_with_ESM1b_ts.pkl"))
df_test_new = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "Mou_data", "Mou_df.pkl"))

In [10]:
df_test_new["molecule ID"] = df_test_new["metabolite"]

Creating a DataFrame with all metabolites in our dataset:

In [11]:
valid_mols = os.listdir(join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN_input_matrices"))
valid_mols = [mol.split("_A.npy")[0] for mol in valid_mols]
valid_mols = [mol for mol in valid_mols if not "." in mol]
df_mols = pd.DataFrame(data = {"molecule ID" : valid_mols})

#To create the substrate rep, the UID does not matter. Therfore, setting it random:
df_mols["uid"] = "P9WIQ3"
df_mols

Unnamed: 0,molecule ID,uid
0,"2,6-Dichlorobenzonitrile",P9WIQ3
1,2-Aminobenzonitrile,P9WIQ3
2,2-hydroxyglutarate,P9WIQ3
3,2-hydroxyoctanoate,P9WIQ3
4,2-hydroxystearate,P9WIQ3
...,...,...
1373,mandelate,P9WIQ3
1374,Mandelonitrile,P9WIQ3
1375,Propionitrile,P9WIQ3
1376,Valeronitrile,P9WIQ3


In [12]:
input_data_folder = join(CURRENT_DIR, ".." ,"data", "substrate_data", "GNN_input_matrices")

def get_representation_input(cid_list):
    XE = ();
    X = ();
    A = ();
    UniRep = ();
    extras = ();
    # Generate data
    for i in range(len(cid_list)):
        cid  = cid_list[i]

        X = X + (np.load(join(input_data_folder, cid + '_X.npy')), );
        XE = XE + (np.load(join(input_data_folder, cid + '_XE.npy')), );
        A = A + (np.load(join(input_data_folder, cid + '_A.npy')), );
    return(XE, X, A)


def get_substrate_representations(df):
    df["substrate_rep"] = ""
    
    i = 0
    n = len(df)

    while i*64 <= n:
        cid_all = list(df["molecule ID"])

        if (i+1)*64  <= n:
            XE, X, A= get_representation_input(cid_all[i*64:(i+1)*64])
            
            XE = torch.tensor(np.array(XE), dtype = torch.float32).to(device)
            X = torch.tensor(np.array(X), dtype = torch.float32).to(device)
            A = torch.tensor(np.array(A), dtype = torch.float32).to(device)
            representations = model.get_GNN_rep(XE, X,A).cpu().detach().numpy()
            df["substrate_rep"][i*64:(i+1)*64] = list(representations[:, :D])
        else:
            print(i)
            XE, X, A= get_representation_input(cid_all[i*64:(i+1)*64])
            XE = torch.tensor(np.array(XE), dtype = torch.float32).to(device)
            X = torch.tensor(np.array(X), dtype = torch.float32).to(device)
            A = torch.tensor(np.array(A), dtype = torch.float32).to(device)
            representations = model.get_GNN_rep(XE, X,A).cpu().detach().numpy()
            df["substrate_rep"][-len(representations):] = list(representations[:, :D])
        i += 1
    return(df)

In [13]:
df_mols = get_substrate_representations(df = df_mols)
df_mols

21


Unnamed: 0,molecule ID,uid,substrate_rep
0,"2,6-Dichlorobenzonitrile",P9WIQ3,"[106.52673, 194.76971, 202.29494, 3.9334106, 1..."
1,2-Aminobenzonitrile,P9WIQ3,"[0.0, 150.40901, 97.39244, 1.6200383, 0.586062..."
2,2-hydroxyglutarate,P9WIQ3,"[227.57434, 288.43564, 0.0, 332.77005, 96.4076..."
3,2-hydroxyoctanoate,P9WIQ3,"[210.85622, 399.41058, 31.167301, 380.30545, 3..."
4,2-hydroxystearate,P9WIQ3,"[434.14682, 953.9577, 151.3173, 985.81024, 39...."
...,...,...,...
1373,mandelate,P9WIQ3,"[0.8413251, 222.21786, 42.490715, 3.3426163, 6..."
1374,Mandelonitrile,P9WIQ3,"[0.8413251, 229.04993, 42.604652, 19.163635, 6..."
1375,Propionitrile,P9WIQ3,"[2.467718, 49.35915, 10.034855, 5.220823, 1.19..."
1376,Valeronitrile,P9WIQ3,"[35.775196, 130.18954, 6.314514, 111.56009, 1...."


In [9]:
df_train["GNN rep"] = ""
for ind in df_train.index:
    try:
        df_train["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_train["molecule ID"][ind].replace(":", "_")])[0]
    except IndexError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_train["molecule ID"][ind].replace(":", "_")])[0]


In [10]:
df_test["GNN rep"] = ""
for ind in df_test.index:
    try:
        df_test["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_test["molecule ID"][ind].replace(":", "_")])[0]
    except IndexError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_test["molecule ID"][ind].replace(":", "_")])[0]


In [14]:
df_test_new["GNN rep"] = ""
for ind in df_test_new.index:
    try:
        df_test_new["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_test_new["molecule ID"][ind]])[0]
    except IndexError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_new["GNN rep"][ind] = list(df_mols["substrate_rep"].loc[df_mols["molecule ID"] == df_test_new["molecule ID"][ind]])[0]


In [16]:
#df_train.to_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_train_with_EC1_1_3_15_with_ESM1b_ts_GNN.pkl"), protocol = 4)
#df_test.to_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_test_with_EC1_1_3_15_with_ESM1b_ts_GNN.pkl"), protocol = 4)
df_test_new.to_pickle(join(CURRENT_DIR, ".." ,"data", "Mou_data", "Mou_df_GNN.pkl"), protocol = 4)