### 1. Loading and preprocessing training data from Yang et al.
### 2. Creating substrate representations
### 3. Creating enzyme representations
### 4. Repeating step 1-3 for the two test sets from Yang et al.

In [2]:
import pandas as pd
import numpy as np
import random
from os.path import join
import os
import re
import sys
import time
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from Bio import SeqIO
import warnings
import torch
warnings.filterwarnings("ignore")

sys.path.append('.\\additional_code')
from data_preprocessing import *

CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

C:\Users\alexk\projects\SubFinder\notebooks_and_code


### 1. Loading and preprocessing data from Yang et al.

In [4]:
df = pd.read_csv(join(CURRENT_DIR, ".." ,"data", "Yang_data", 'Yang_training_set.csv'), sep = ";").copy()
df = df.loc[~pd.isnull(df["ID"])]
display(df.head())

Unnamed: 0,ID,Name,Family,Log P,Accessible Area,Volume,COOH,Num OH,F3-OH,F-5OH,...,UGT76E1,UGT76E2,UGT76E12,UGT76E11,UGT76C4,UGT85A5,UGT85A4,UGT78D2,UGT78D1,VVGT1
1,2.0,Baicalein,1.0,3.11,460.41,193.278,0.0,3.0,0.0,8.08,...,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,0.0,1.0
3,3.0,Umbelliferone,2.0,1.58,325.177,115.245,0.0,1.0,,,...,0.0,0.0,2.0,1.0,2.0,0.0,0.0,2.0,0.0,1.0
5,4.0,4-Methyl-umbelliferone,2.0,1.74,351.454,132.289,0.0,1.0,,,...,0.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
7,5.0,Sinapic acid,10.0,2.8,411.385,159.218,4.17,1.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
9,6.0,4-hydroxyl-benzoic acid,5.0,3.11,302.821,102.181,4.57,1.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
enzymes = ['UGT71C1', 'UGT71C2', 'UGT71C3', 'UGT71C4', 'UGT71C5',
       'UGT71B8', 'UGT71B6', 'UGT71B5', 'UGT71B1', 'UGT88A1', 'UGT72E2',
       'UGT72E3', 'UGT72E1', 'UGT72D1', 'UGT72C1', 'UGT72B3', 'UGT72B1',
       'UGT73C3', 'UGT73C4', 'UGT73C5', 'UGT73C6', 'UGT73C1', 'UGT73C7',
       'UGT73B2', 'UGT73B3', 'UGT73B4', 'UGT73B5', 'UGT73B1', 'UGT89B1',
       'UGT89C1', 'UGT84A3', 'UGT84A4', 'UGT84A2', 'UGT84A1', 'UGT84B1',
       'UGT84B2', 'UGT75B1', 'UGT75D1', 'UGT74F1', 'UGT74F2', 'UGT74B1',
       'UGT76E5', 'UGT76E4', 'UGT76E1', 'UGT76E2', 'UGT76E12', 'UGT76E11',
       'UGT76C4', 'UGT85A5', 'UGT85A4', 'UGT78D2', 'UGT78D1', 'VVGT1']

metabolites = df["Name"]
print(len(enzymes)),print(len(metabolites))

df_activity = pd.DataFrame(columns = ["metabolite", "enzyme", "Binding"])

for metabolite in metabolites:
    help_df = df.loc[df["Name"] == metabolite]
    for enzyme in enzymes:
        activity = list(help_df[enzyme])[0]
        if activity <= 1:
            df_activity = df_activity.append({"metabolite" : metabolite, "enzyme" : enzyme, "Binding": activity}, ignore_index = True)
            
df_activity.head()

53
59


Unnamed: 0,metabolite,enzyme,Binding
0,Baicalein,UGT71C1,0.0
1,Baicalein,UGT71C2,0.0
2,Baicalein,UGT71C3,1.0
3,Baicalein,UGT71B8,1.0
4,Baicalein,UGT71B6,1.0


### 2. Creating substrate representations

In [8]:
met_dict = {'Baicalein' : "InChI=1S/C15H10O5/c16-9-6-11(8-4-2-1-3-5-8)20-12-7-10(17)14(18)15(19)13(9)12/h1-7,17-19H",
     'Umbelliferone' : "InChI=1S/C9H6O3/c10-7-3-1-6-2-4-9(11)12-8(6)5-7/h1-5,10H",
     '4-Methyl-umbelliferone' : "InChI=1S/C10H8O3/c1-6-4-10(12)13-9-5-7(11)2-3-8(6)9/h2-5,11H,1H3",
     'Sinapic acid' : "InChI=1S/C11H12O5/c1-15-8-5-7(3-4-10(12)13)6-9(16-2)11(8)14/h3-6,14H,1-2H3,(H,12,13)/b4-3+",
     '4-hydroxyl-benzoic acid' : "InChI=1S/C7H6O3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,8H,(H,9,10)",
    'a-cyano-4-hydroxyl-cinamic acid' : "InChI=1S/C10H7NO3/c11-6-8(10(13)14)5-7-1-3-9(12)4-2-7/h1-5,12H,(H,13,14)/b8-5+",
     '3,4-dichloroaniline' : "InChI=1S/C6H5Cl2N/c7-5-2-1-4(9)3-6(5)8/h1-3H,9H2",
     '3,4-dihydroxylbenzoic acid' : "InChI=1S/C7H6O4/c8-5-2-1-4(7(10)11)3-6(5)9/h1-3,8-9H,(H,10,11)",
    #2,5-dihydroxybenzoic acid
     ' 2,5-dihydroxylbenzoic acid' : "InChI=1S/C7H6O4/c8-4-1-2-6(9)5(3-4)7(10)11/h1-3,8-9H,(H,10,11)",
     ' D-glycerate' : "InChI=1S/C3H6O4/c4-1-2(5)3(6)7/h2,4-5H,1H2,(H,6,7)/p-1/t2-/m1/s1",
    #D-GlcNAc
     ' GlcNAc' : "InChI=1S/C8H15NO6/c1-3(11)9-5-7(13)6(12)4(2-10)15-8(5)14/h4-8,10,12-14H,2H2,1H3,(H,9,11)",
     ' Indole 3-acetate' : "InChI=1S/C10H9NO2/c12-10(13)5-7-6-11-9-4-2-1-3-8(7)9/h1-4,6,11H,5H2,(H,12,13)/p-1",
     ' Gibberellin A3' : "InChI=1S/C19H22O6/c1-9-7-17-8-18(9,24)5-3-10(17)19-6-4-11(20)16(2,15(23)25-19)13(19)12(17)14(21)22/h4,6,10-13,20,24H,1,3,5,7-8H2,2H3,(H,21,22)/t10-,11+,12-,13-,16-,17+,18+,19-/m1/s1",
     ' Gibberellin A4' : "InChI=1S/C19H24O5/c1-9-7-18-8-10(9)3-4-11(18)19-6-5-12(20)17(2,16(23)24-19)14(19)13(18)15(21)22/h10-14,20H,1,3-8H2,2H3,(H,21,22)/t10-,11-,12+,13-,14-,17-,18+,19-/m1/s1",
    #(+)-Jasmonic acid 
    ' (?)-Jasmonic acid' : "InChI=1S/C12H18O3/c1-2-3-4-5-10-9(8-12(14)15)6-7-11(10)13/h3-4,9-10H,2,5-8H2,1H3,(H,14,15)/b4-3-/t9-,10-/m0/s1",
     #(+)-cis,trans-Abscisic Acid
     ' (?)- cis, trans Abscisic acid' : "InChI=1S/C15H20O4/c1-10(7-13(17)18)5-6-15(19)11(2)8-12(16)9-14(15,3)4/h5-8,19H,9H2,1-4H3,(H,17,18)/b6-5+,10-7+/t15-/m1/s1",
     ' Kinetin' : "InChI=1S/C10H9N5O/c1-2-7(16-3-1)4-11-9-8-10(13-5-12-8)15-6-14-9/h1-3,5-6H,4H2,(H2,11,12,13,14,15)",
     'Zeatin' : "InChI=1S/C10H13N5O/c1-7(4-16)2-3-11-9-8-10(13-5-12-8)15-6-14-9/h2,5-6,16H,3-4H2,1H3,(H2,11,12,13,14,15)/b7-2+",
     'Luteolin' : "InChI=1S/C15H10O6/c16-8-4-11(19)15-12(20)6-13(21-14(15)5-8)7-1-2-9(17)10(18)3-7/h1-6,16-19H",
     'Quercetin' : "InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",
     ' Fisetin' : "InChI=1S/C15H10O6/c16-8-2-3-9-12(6-8)21-15(14(20)13(9)19)7-1-4-10(17)11(18)5-7/h1-6,16-18,20H",
     ' Kaempferol' : "InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",
     'Cinnamic acid' : "InChI=1S/C9H8O2/c10-9(11)7-6-8-4-2-1-3-5-8/h1-7H,(H,10,11)/b7-6+",
     '4-hydroxy cinnamic acid' : "InChI=1S/C9H8O3/c10-8-4-1-7(2-5-8)3-6-9(11)12/h1-6,10H,(H,11,12)/b6-3+",
     '3,4-dihyroxy cinnamic acid' : "InChI=1S/C9H8O4/c10-7-3-1-6(5-8(7)11)2-4-9(12)13/h1-5,10-11H,(H,12,13)/b4-2+",
     '4-hydroxy 3-methoxy cinnamic acid' : "InChI=1S/C10H10O4/c11-6-8-5-7(1-3-9(8)12)2-4-10(13)14/h1-5,11-12H,6H2,(H,13,14)/b4-2+",
     '2-hydroxy cinnamic acid' : "InChI=1S/C9H8O3/c10-8-4-2-1-3-7(8)5-6-9(11)12/h1-6,10H,(H,11,12)",
     '3-hydroxy cinnamic acid' : "InChI=1S/C9H8O3/c10-8-3-1-2-7(6-8)4-5-9(11)12/h1-6,10H,(H,11,12)/b5-4+",
     '7-hydroxy 6-methoxy coumarin (Scopoletin)' : "InChI=1S/C10H8O4/c1-13-9-4-6-2-3-10(12)14-8(6)5-7(9)11/h2-5,11H,1H3",
     '6,7-dihydroxy coumarin (Esculetin)' : "InChI=1S/C9H6O4/c10-6-3-5-1-2-9(12)13-8(5)4-7(6)11/h1-4,10-11H",
     'Threonine' : "InChI=1S/C4H9NO3/c1-2(6)3(5)4(7)8/h2-3,6H,5H2,1H3,(H,7,8)/t2-,3+/m1/s1",
     'Glucose' : "InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1",
     'Dihydrojasmonic acid' : "InChI=1S/C12H20O3/c1-2-3-4-5-10-9(8-12(14)15)6-7-11(10)13/h9-10H,2-8H2,1H3,(H,14,15)",
     'Ser-Phe' : "InChI=1S/C12H16N2O4/c13-9(7-15)11(16)14-10(12(17)18)6-8-4-2-1-3-5-8/h1-5,9-10,15H,6-7,13H2,(H,14,16)(H,17,18)/t9-,10-/m0/s1",
     'Ser-Leu' : "InChI=1S/C9H18N2O4/c1-5(2)3-7(9(14)15)11-8(13)6(10)4-12/h5-7,12H,3-4,10H2,1-2H3,(H,11,13)(H,14,15)/t6-,7-/m0/s1",
     'BocCysThrOMe' : "InChI=1S/C13H24N2O6S/c1-7(16)9(11(18)20-5)15-10(17)8(6-22)14-12(19)21-13(2,3)4/h7-9,16,22H,6H2,1-5H3,(H,14,19)(H,15,17)",
     '1-Thio-S-cyanomethyl-N-acetyl-D-glucosamine' : "InChI=1S/C10H20N2O5S/c1-5(14)12-7-9(16)8(15)6(4-13)17-10(7)18-3-2-11/h6-10,13,15-16H,2-4,11H2,1H3,(H,12,14)",
     'MUGlcNAc' : "InChI=1S/C17H18O8/c1-7-5-12(19)24-11-6-9(3-4-10(7)11)23-17-13(8(2)18)14(20)15(21)16(22)25-17/h3-6,13-17,20-22H,1-2H3",
     'Trans-dihydroquercetin (DHQ)' : "InChI=1S/C15H12O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,14-19,21H/t14-,15+/m0/s1",
     '7-hydroxycoumerin 3-carboxylic acid' : "InChI=1S/C10H6O5/c11-6-2-1-5-3-7(9(12)13)10(14)15-8(5)4-6/h1-4,11H,(H,12,13)",
     '7-hydroxycoumerin 4-acetic acid' : "InChI=1S/C11H8O5/c12-7-1-2-8-6(3-10(13)14)4-11(15)16-9(8)5-7/h1-2,4-5,12H,3H2,(H,13,14)",
     'Chloramphenicol' : "InChI=1S/C11H12Cl2N2O5/c12-10(13)11(18)14-8(5-16)9(17)6-1-3-7(4-2-6)15(19)20/h1-4,8-10,16-17H,5H2,(H,14,18)/t8-,9-/m1/s1",
     '?-GlcOBn' : "InChI=1S/C13H18O6/c14-6-9-10(15)11(16)12(17)13(19-9)18-7-8-4-2-1-3-5-8/h1-5,9-17H,6-7H2",
     'a-ManOBn' : "InChI=1S/C13H18O6/c14-6-9-10(15)11(16)12(17)13(19-9)18-7-8-4-2-1-3-5-8/h1-5,9-17H,6-7H2",
     'a-ManOPh' : "InChI=1S/C12H16O6/c13-6-8-9(14)10(15)11(16)12(18-8)17-7-4-2-1-3-5-7/h1-5,8-16H,6H2",
     'a-ManOCH2Bn' : "InChI=1S/C14H20O6/c15-8-10-11(16)12(17)13(18)14(20-10)19-7-6-9-4-2-1-3-5-9/h1-5,10-18H,6-8H2",
     'a-ManOPMP' : "InChI=1S/C13H18O7/c14-5-7-1-3-8(4-2-7)19-13-12(18)11(17)10(16)9(6-15)20-13/h1-4,9-18H,5-6H2",
     'a-ManOBn(pNO2)' : "InChI=1S/C13H19NO8/c15-5-9-10(16)11(17)12(18)13(22-9)21-6-7-1-3-8(4-2-7)14(19)20/h1-4,9-13,15-20H,5-6H2",
     'a-ManOPhF5' : "InChI=1S/C12H11F5O6/c13-3-4(14)6(16)11(7(17)5(3)15)23-12-10(21)9(20)8(19)2(1-18)22-12/h2,8-10,12,18-21H,1H2",
     'a-ManOBnF5' : "InChI=1S/C13H13F5O6/c14-5-3(6(15)8(17)9(18)7(5)16)2-23-13-12(22)11(21)10(20)4(1-19)24-13/h4,10-13,19-22H,1-2H2",
     'ManSTol' : "InChI=1S/C13H18O5S/c1-7-2-4-8(5-3-7)19-13-12(17)11(16)10(15)9(6-14)18-13/h2-5,9-17H,6H2,1H3",
     'Catechin' : "InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1",
     'Genistein' : "InChI=1S/C15H10O5/c16-9-3-1-8(2-4-9)11-7-20-13-6-10(17)5-12(18)14(13)15(11)19/h1-7,16-18H",
     'N6-Benzyladenine' : "InChI=1S/C12H11N5/c1-2-4-9(5-3-1)6-13-11-10-12(15-7-14-10)17-8-16-11/h1-5,7-8H,6H2,(H2,13,14,15,16,17)",
     'Trans-Zentin-Glucose' : "InChI=1S/C16H23N5O6/c1-8(2-3-17-14-10-15(19-6-18-10)21-7-20-14)5-26-16-13(25)12(24)11(23)9(4-22)27-16/h2,6-7,9,11-13,16,22-25H,3-5H2,1H3,(H2,17,18,19,20,21)/b8-2+",
     'DihydroZeatin' : "InChI=1S/C10H15N5O/c1-7(4-16)2-3-11-9-8-10(13-5-12-8)15-6-14-9/h5-7,16H,2-4H2,1H3,(H2,11,12,13,14,15)",
     'Olomoucine' : "InChI=1S/C15H18N6O/c1-21-10-18-12-13(17-9-11-5-3-2-4-6-11)19-15(16-7-8-22)20-14(12)21/h2-6,10,22H,7-9H2,1H3,(H2,16,17,19,20)",
     'N6-isopentenyladenine' : "InChI=1S/C10H13N5/c1-7(2)3-4-11-9-8-10(13-5-12-8)15-6-14-9/h3,5-6H,4H2,1-2H3,(H2,11,12,13,14,15)",
     'Spectinomycin' : "InChI=1S/C14H24N2O7/c1-5-4-6(17)14(20)13(21-5)22-12-10(19)7(15-2)9(18)8(16-3)11(12)23-14/h5,7-13,15-16,18-20H,4H2,1-3H3/t5-,7-,8+,9+,10+,11-,12-,13+,14+/m1/s1",
     "oleanodmycin" : "InChI=1S/C35H61NO12/c1-16-14-35(15-43-35)32(40)19(4)27(37)18(3)22(7)46-33(41)21(6)31(47-26-13-25(42-11)28(38)23(8)45-26)20(5)30(16)48-34-29(39)24(36(9)10)12-17(2)44-34/h16-31,34,37-39H,12-15H2,1-11H3/t16-,17+,18-,19+,20+,21+,22+,23-,24-,25-,26-,27-,28-,29+,30-,31-,34-,35+/m0/s1",
     "novobiocin" : "InChI=1S/C31H36N2O11/c1-14(2)7-8-16-13-17(9-11-19(16)34)27(37)33-21-22(35)18-10-12-20(15(3)24(18)42-28(21)38)41-29-23(36)25(43-30(32)39)26(40-6)31(4,5)44-29/h7,9-13,23,25-26,29,34-36H,8H2,1-6H3,(H2,32,39)(H,33,37)/t23-,25+,26-,29-/m1/s1",
     "spectinomycin" : "InChI=1S/C14H24N2O7/c1-5-4-6(17)14(20)13(21-5)22-12-10(19)7(15-2)9(18)8(16-3)11(12)23-14/h5,7-13,15-16,18-20H,4H2,1-3H3/t5-,7-,8+,9+,10+,11-,12-,13+,14+/m1/s1",
     "CHAPS" : "InChI=1S/C32H58N2O7S/c1-21(8-11-29(38)33-14-6-15-34(4,5)16-7-17-42(39,40)41)24-9-10-25-30-26(20-28(37)32(24,25)3)31(2)13-12-23(35)18-22(31)19-27(30)36/h21-28,30,35-37H,6-20H2,1-5H3,(H-,33,38,39,40,41)/t21-,22+,23-,24-,25+,26+,27-,28+,30+,31+,32-/m1/s1",
     "solanidine" : "InChI=1S/C27H43NO/c1-16-5-8-23-17(2)25-24(28(23)15-16)14-22-20-7-6-18-13-19(29)9-11-26(18,3)21(20)10-12-27(22,25)4/h6,16-17,19-25,29H,5,7-15H2,1-4H3/t16-,17+,19-,20+,21-,22-,23+,24-,25-,26-,27-/m0/s1",
     "solasodine" : "InChI=1S/C27H43NO2/c1-16-7-12-27(28-15-16)17(2)24-23(30-27)14-22-20-6-5-18-13-19(29)8-10-25(18,3)21(20)9-11-26(22,24)4/h5,16-17,19-24,28-29H,6-15H2,1-4H3/t16-,17+,19+,20-,21+,22+,23+,24+,25+,26+,27-/m1/s1",
     "b-sitosterol"  : "InChI=1S/C29H50O/c1-7-21(19(2)3)9-8-20(4)25-12-13-26-24-11-10-22-18-23(30)14-16-28(22,5)27(24)15-17-29(25,26)6/h10,19-21,23-27,30H,7-9,11-18H2,1-6H3/t20-,21-,23+,24+,25-,26+,27+,28+,29-/m1/s1" }

Calculating ECFPs

In [9]:
df_activity["ECFP"] = np.nan

for met in metabolites:
    Inchi = met_dict[met]
    mol = Chem.MolFromInchi(Inchi)
    ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024).ToBitString()    
    df_activity["ECFP"].loc[df_activity["metabolite"] == met] = ecfp

df_activity

Unnamed: 0,metabolite,enzyme,Binding,ECFP
0,Baicalein,UGT71C1,0.0,0000000000000000000000000001000000000000000000...
1,Baicalein,UGT71C2,0.0,0000000000000000000000000001000000000000000000...
2,Baicalein,UGT71C3,1.0,0000000000000000000000000001000000000000000000...
3,Baicalein,UGT71B8,1.0,0000000000000000000000000001000000000000000000...
4,Baicalein,UGT71B6,1.0,0000000000000000000000000001000000000000000000...
...,...,...,...,...
2842,Spectinomycin,UGT76E12,0.0,0000100000000000000000000000000001001000000000...
2843,Spectinomycin,UGT76E11,0.0,0000100000000000000000000000000001001000000000...
2844,Spectinomycin,UGT85A5,0.0,0000100000000000000000000000000001001000000000...
2845,Spectinomycin,UGT85A4,0.0,0000100000000000000000000000000001001000000000...


### 3. Creating enzyme representations

Creating fasta file with all sequences:

In [10]:
import torch
df_activity["ESM1b_ts"] = ""

for enzyme in enzymes:
    rep = np.array(torch.load(join(CURRENT_DIR, ".." ,"data", "Yang_data", "Yang_enzymes",
                               enzyme + ".pt"))["cls_representations"][33])   
    help_df = df_activity.loc[df_activity["enzyme"] ==enzyme]
    for ind in help_df.index:
        df_activity["ESM1b_ts"][ind] = rep

df_activity

Unnamed: 0,metabolite,enzyme,Binding,ECFP,ESM1b_ts
0,Baicalein,UGT71C1,0.0,0000000000000000000000000001000000000000000000...,"[0.08609772, -0.110735565, -0.29194435, 0.5663..."
1,Baicalein,UGT71C2,0.0,0000000000000000000000000001000000000000000000...,"[0.013256749, -0.16644819, -0.34313837, 0.5727..."
2,Baicalein,UGT71C3,1.0,0000000000000000000000000001000000000000000000...,"[0.11302839, -0.028711304, -0.28791505, 0.6181..."
3,Baicalein,UGT71B8,1.0,0000000000000000000000000001000000000000000000...,"[0.32527673, -0.020017605, -0.23925737, 0.4402..."
4,Baicalein,UGT71B6,1.0,0000000000000000000000000001000000000000000000...,"[0.32567993, -0.24399604, -0.2138156, 0.499061..."
...,...,...,...,...,...
2842,Spectinomycin,UGT76E12,0.0,0000100000000000000000000000000001001000000000...,"[0.080241196, -0.49778256, -0.100768566, 0.547..."
2843,Spectinomycin,UGT76E11,0.0,0000100000000000000000000000000001001000000000...,"[0.026391676, -0.51961315, -0.07424599, 0.4777..."
2844,Spectinomycin,UGT85A5,0.0,0000100000000000000000000000000001001000000000...,"[-0.02222197, -0.30882728, -0.1440919, 0.38996..."
2845,Spectinomycin,UGT85A4,0.0,0000100000000000000000000000000001001000000000...,"[-0.024465147, -0.33846748, -0.10585976, 0.333..."


In [11]:
df_activity.to_pickle(join(CURRENT_DIR, ".." ,"data", "Yang_data", "Yang_df.pkl"))

### 4. Repeating step 1-3 for the two test sets from Yang et al.

#### Berry:

In [14]:
df_Berry = pd.read_csv(join(CURRENT_DIR, ".." ,"data", "Yang_data", 'Berry_validation.csv'), sep = ";").copy()
display(df_Berry.head())

Unnamed: 0,Acceptor,Lb01/UGT72B10,Lb02/UGT74P1,Lb04/UGT73A10,Lb06/UGT74N1,Lb12/UGT75L2,Lb15/UGT85A20,Lb17/UGT73Q1,Lb18/UGT73A12,Lb19/UGT86A5,Lb23/UGT94E2
0,oleanodmycin,0,0,0,0,0,0,0,0,0,0
1,Baicalein,1,0,1,0,1,0,0,1,0,1
2,Umbelliferone,0,0,0,0,0,0,0,0,0,0
3,4-Methyl-umbelliferone,0,0,0,1,0,0,0,0,0,0
4,Sinapic acid,0,0,1,0,1,1,0,1,0,0


In [15]:
enzymes = ['Lb01/UGT72B10', 'Lb02/UGT74P1', 'Lb04/UGT73A10',
       'Lb06/UGT74N1', 'Lb12/UGT75L2', 'Lb15/UGT85A20', 'Lb17/UGT73Q1',
       'Lb18/UGT73A12', 'Lb19/UGT86A5', 'Lb23/UGT94E2']

new_mets = list(df_Berry["Acceptor"])

df_activity = pd.DataFrame(columns = ["metabolite", "enzyme", "Binding"])

for metabolite in new_mets:
    help_df = df_Berry.loc[df_Berry["Acceptor"] == metabolite]
    for enzyme in enzymes:
        activity = list(help_df[enzyme])[0]
        if activity <= 1:
            df_activity = df_activity.append({"metabolite" : metabolite, "enzyme" : enzyme.split("/")[1],
                                              "Binding": activity}, ignore_index = True)
df_activity.head()

Unnamed: 0,metabolite,enzyme,Binding
0,oleanodmycin,UGT72B10,0
1,oleanodmycin,UGT74P1,0
2,oleanodmycin,UGT73A10,0
3,oleanodmycin,UGT74N1,0
4,oleanodmycin,UGT75L2,0


Adding ECFPs:

In [17]:
df_activity["ECFP"] = np.nan

for met in new_mets:
    Inchi = met_dict[met]
    mol = Chem.MolFromInchi(Inchi)
    ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024).ToBitString()    
    df_activity["ECFP"].loc[df_activity["metabolite"] == met] = ecfp

df_activity

Unnamed: 0,metabolite,enzyme,Binding,ECFP
0,oleanodmycin,UGT72B10,0,0000000000000000000000000000000001001000000000...
1,oleanodmycin,UGT74P1,0,0000000000000000000000000000000001001000000000...
2,oleanodmycin,UGT73A10,0,0000000000000000000000000000000001001000000000...
3,oleanodmycin,UGT74N1,0,0000000000000000000000000000000001001000000000...
4,oleanodmycin,UGT75L2,0,0000000000000000000000000000000001001000000000...
...,...,...,...,...
375,b-sitosterol,UGT85A20,0,0100000110000000100000000000000001001000000000...
376,b-sitosterol,UGT73Q1,0,0100000110000000100000000000000001001000000000...
377,b-sitosterol,UGT73A12,0,0100000110000000100000000000000001001000000000...
378,b-sitosterol,UGT86A5,0,0100000110000000100000000000000001001000000000...


Adding enzyme representations:

In [18]:
pt_files = os.listdir(join(CURRENT_DIR, ".." ,"data", "Yang_data", "Yang_enzymes"))

df_activity["ESM1b_ts"] = ""

for enzyme in enzymes:
    enzyme = enzyme.split("/")[1]
    for file in pt_files:
        if enzyme in file:
            break
    rep = np.array(torch.load(join(CURRENT_DIR, ".." ,"data", "Yang_data", "Yang_enzymes",
                               file))["cls_representations"][33])
    help_df = df_activity.loc[df_activity["enzyme"] == enzyme]
    for ind in help_df.index:
        df_activity["ESM1b_ts"][ind] = rep

df_activity

Unnamed: 0,metabolite,enzyme,Binding,ECFP,ESM1b_ts
0,oleanodmycin,UGT72B10,0,0000000000000000000000000000000001001000000000...,"[0.190367, -0.023007678, -0.17562558, 0.215061..."
1,oleanodmycin,UGT74P1,0,0000000000000000000000000000000001001000000000...,"[-0.08973988, -0.51375425, -0.1270897, 0.48713..."
2,oleanodmycin,UGT73A10,0,0000000000000000000000000000000001001000000000...,"[-0.08658633, -0.027905624, -0.13412698, 0.285..."
3,oleanodmycin,UGT74N1,0,0000000000000000000000000000000001001000000000...,"[0.098173656, -0.5818817, -0.100889355, 0.4825..."
4,oleanodmycin,UGT75L2,0,0000000000000000000000000000000001001000000000...,"[0.04887647, -0.48787764, 0.08551325, 0.492277..."
...,...,...,...,...,...
375,b-sitosterol,UGT85A20,0,0100000110000000100000000000000001001000000000...,"[0.08819179, -0.41269886, -0.046921156, 0.6169..."
376,b-sitosterol,UGT73Q1,0,0100000110000000100000000000000001001000000000...,"[0.12415886, -0.11941293, -0.19394022, 0.27981..."
377,b-sitosterol,UGT73A12,0,0100000110000000100000000000000001001000000000...,"[-0.09134824, -0.07326901, -0.18867402, 0.3036..."
378,b-sitosterol,UGT86A5,0,0100000110000000100000000000000001001000000000...,"[0.21679285, -0.45862696, 0.023173656, 0.43411..."


In [19]:
df_activity.to_pickle(join(CURRENT_DIR, ".." ,"data", "Yang_data", "Yang_validation_Berry_df.pkl"))

#### Oat

In [21]:
df_Oat = pd.read_csv(join(CURRENT_DIR, ".." ,"data", "Yang_data", 'Oat_validation.csv'), sep = ";").copy()
display(df_Berry.head())

Unnamed: 0,Acceptor,Lb01/UGT72B10,Lb02/UGT74P1,Lb04/UGT73A10,Lb06/UGT74N1,Lb12/UGT75L2,Lb15/UGT85A20,Lb17/UGT73Q1,Lb18/UGT73A12,Lb19/UGT86A5,Lb23/UGT94E2
0,oleanodmycin,0,0,0,0,0,0,0,0,0,0
1,Baicalein,1,0,1,0,1,0,0,1,0,1
2,Umbelliferone,0,0,0,0,0,0,0,0,0,0
3,4-Methyl-umbelliferone,0,0,0,1,0,0,0,0,0,0
4,Sinapic acid,0,0,1,0,1,1,0,1,0,0


In [22]:
enzymes = ['As01/UGT84C2', 'As07/UGT85B2', 'As08/UGT74H5',
       'As09/UGT88C4', 'As12/UGT73A5', 'As14/UGT74H6', 'As17/UGT75E3']


df_activity = pd.DataFrame(columns = ["metabolite", "enzyme", "Binding"])

for metabolite in new_mets:
    help_df = df_Oat.loc[df_Oat["Acceptor"] == metabolite]
    for enzyme in enzymes:
        activity = list(help_df[enzyme])[0]
        if activity <= 1:
            df_activity = df_activity.append({"metabolite" : metabolite, "enzyme" : enzyme.split("/")[1],
                                              "Binding": activity}, ignore_index = True)
df_activity.head()

Unnamed: 0,metabolite,enzyme,Binding
0,oleanodmycin,UGT84C2,0
1,oleanodmycin,UGT85B2,0
2,oleanodmycin,UGT74H5,0
3,oleanodmycin,UGT88C4,0
4,oleanodmycin,UGT73A5,0


Adding ECFPs:

In [23]:
df_activity["ECFP"] = np.nan

for met in new_mets:
    Inchi = met_dict[met]
    mol = Chem.MolFromInchi(Inchi)
    ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024).ToBitString()    
    df_activity["ECFP"].loc[df_activity["metabolite"] == met] = ecfp

df_activity

Unnamed: 0,metabolite,enzyme,Binding,ECFP
0,oleanodmycin,UGT84C2,0,0000000000000000000000000000000001001000000000...
1,oleanodmycin,UGT85B2,0,0000000000000000000000000000000001001000000000...
2,oleanodmycin,UGT74H5,0,0000000000000000000000000000000001001000000000...
3,oleanodmycin,UGT88C4,0,0000000000000000000000000000000001001000000000...
4,oleanodmycin,UGT73A5,0,0000000000000000000000000000000001001000000000...
...,...,...,...,...
261,b-sitosterol,UGT74H5,0,0100000110000000100000000000000001001000000000...
262,b-sitosterol,UGT88C4,0,0100000110000000100000000000000001001000000000...
263,b-sitosterol,UGT73A5,0,0100000110000000100000000000000001001000000000...
264,b-sitosterol,UGT74H6,0,0100000110000000100000000000000001001000000000...


Adding enzyme representations:

In [24]:
pt_files = os.listdir(join(CURRENT_DIR, ".." ,"data", "Yang_data", "Yang_enzymes"))

df_activity["ESM1b_ts"] = ""

for enzyme in enzymes:
    enzyme = enzyme.split("/")[1]
    for file in pt_files:
        if enzyme in file:
            break
    rep = np.array(torch.load(join(CURRENT_DIR, ".." ,"data", "Yang_data", "Yang_enzymes",
                               file))["cls_representations"][33])
    help_df = df_activity.loc[df_activity["enzyme"] == enzyme]
    for ind in help_df.index:
        df_activity["ESM1b_ts"][ind] = rep

df_activity

Unnamed: 0,metabolite,enzyme,Binding,ECFP,ESM1b_ts
0,oleanodmycin,UGT84C2,0,0000000000000000000000000000000001001000000000...,"[0.1105218, -0.7652184, 0.06292175, 0.6331433,..."
1,oleanodmycin,UGT85B2,0,0000000000000000000000000000000001001000000000...,"[-0.03549692, -0.61546826, 0.078006655, 0.6455..."
2,oleanodmycin,UGT74H5,0,0000000000000000000000000000000001001000000000...,"[0.1102507, -0.5676314, -0.20757274, 0.5748560..."
3,oleanodmycin,UGT88C4,0,0000000000000000000000000000000001001000000000...,"[0.11656699, -0.30830058, -0.2670081, 0.418650..."
4,oleanodmycin,UGT73A5,0,0000000000000000000000000000000001001000000000...,"[0.18211323, -0.67609215, -0.08622708, 0.62989..."
...,...,...,...,...,...
261,b-sitosterol,UGT74H5,0,0100000110000000100000000000000001001000000000...,"[0.1102507, -0.5676314, -0.20757274, 0.5748560..."
262,b-sitosterol,UGT88C4,0,0100000110000000100000000000000001001000000000...,"[0.11656699, -0.30830058, -0.2670081, 0.418650..."
263,b-sitosterol,UGT73A5,0,0100000110000000100000000000000001001000000000...,"[0.18211323, -0.67609215, -0.08622708, 0.62989..."
264,b-sitosterol,UGT74H6,0,0100000110000000100000000000000001001000000000...,"[0.09090661, -0.31147072, -0.29651684, 0.58979..."


In [25]:
df_activity.to_pickle(join(CURRENT_DIR, ".." ,"data", "Yang_data", "Yang_validation_Oat_df.pkl"))