In [1]:
import pandas as pd
import numpy as np
import random
from os.path import join
import os
import re
import sys
import time
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from Bio import SeqIO
import warnings
import torch
warnings.filterwarnings("ignore")

# Make the project's helper module importable. The folder is resolved relative
# to the current working directory, so the notebook must be started from its
# own directory. `join` picks the right separator on every OS (on Windows it
# yields exactly '.\\additional_code').
sys.path.append(join(".", "additional_code"))
# NOTE(review): the star import hides which names this notebook takes from
# data_preprocessing; prefer explicit imports once the used names are known.
from data_preprocessing import *

# All data paths below are built relative to this directory.
CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

## Adding negative data points

In [5]:
# Enzyme table; merged into the splits on "Uniprot ID" further below.
# NOTE: read_pickle executes arbitrary code from the file - only load trusted pickles.
UNIPROT_df = pd.read_pickle(join(CURRENT_DIR, "..", "data", "enzyme_data", "UNIPROT_df.pkl"))

# Enzyme-substrate pairs, already divided into a training and a test split.
enzyme_substrate_dir = join(CURRENT_DIR, "..", "data", "enzyme_substrate_data")
df_UID_MID_train = pd.read_pickle(join(enzyme_substrate_dir, "df_UID_MID_train.pkl"))
df_UID_MID_test = pd.read_pickle(join(enzyme_substrate_dir, "df_UID_MID_test.pkl"))

In [6]:
# Pool both splits so the summary below covers the complete dataset.
df_all = pd.concat([df_UID_MID_train, df_UID_MID_test], ignore_index = True)

# Separate the pairs by the kind of evidence recorded for them.
is_exp = df_all["evidence"] == "exp"
is_phylo = df_all["evidence"] == "phylo"
df_exp = df_all.loc[is_exp]
df_phylo = df_all.loc[is_phylo]

print("We have %s entries with phylogenetic evidence and %s entries with experimental evidence" % (len(df_phylo), len(df_exp)))

# Same two-line report for each subset: distinct enzymes and distinct substrates.
for header, subset in (("\n experimental dataset:", df_exp),
                       ("\n phylogenetic dataset:", df_phylo)):
    n_enzymes = len(set(subset["Uniprot ID"]))
    n_substrates = len(set(subset["molecule ID"]))
    print(header)
    print("Number of different enzymes: %s, Number of different substrates: %s"
          % (n_enzymes, n_substrates))

We have 274030 entries with phylogenetic evidence and 18351 entries with experimental evidence

 experimental dataset:
Number of different enzymes: 12156, Number of different substrates: 1379

 phylogenetic dataset:
Number of different enzymes: 198259, Number of different substrates: 661


### (a) Creating negative data points for the training and test set:
To create the negative data points, we choose metabolites that are structurally similar to the substrates of the positive data points. Therefore, we first create a similarity matrix for all metabolites in the dataset.

In [7]:
# Lookup table from ChEBI IDs to InChI strings; get_mol reads its "ChEBI" and
# "Inchi" columns.
df_chebi_to_inchi = pd.read_csv(join(CURRENT_DIR, ".." ,"data", "substrate_data", "chebiID_to_inchi.tsv"), sep = "\t")
# Base folder of the MOL files. get_mol appends "mol-files\\<ID>.mol" by plain
# string concatenation, so the value must end with a path separator.
# The location is machine specific: set the MOL_FILES_DIR environment variable
# to override the original author's default.
mol_folder = os.environ.get("MOL_FILES_DIR", "C:\\Users\\alexk\\mol-files\\")

def get_mol(met_ID):
    """Build an RDKit molecule for a metabolite ID; return None on failure.

    The kind of ID is decided by its prefix:

    * IDs starting with "CHEBI": the number after the colon is looked up in
      ``df_chebi_to_inchi`` and the matching InChI string is parsed.
    * IDs starting with "InChI": the ID itself is parsed as an InChI string.
    * anything else: the file ``<met_ID>.mol`` below
      ``mol_folder + "mol-files\\"`` is read.

    Parameters
    ----------
    met_ID : str
        Metabolite identifier.

    Returns
    -------
    The object returned by RDKit, or None when the lookup/parsing raised an
    error (for MOL files: when the file could not be opened).
    """
    if met_ID.startswith("CHEBI"):
        try:
            # "CHEBI:12345 <anything>" -> 12345
            ID = int(met_ID.split(" ")[0].split(":")[-1])
            # [0] raises IndexError when the table has no entry for this ID.
            Inchi = list(df_chebi_to_inchi["Inchi"].loc[df_chebi_to_inchi["ChEBI"] == float(ID)])[0]
            mol = Chem.inchi.MolFromInchi(Inchi)
        except Exception:
            # Deliberately best-effort, but unlike a bare ``except`` this lets
            # KeyboardInterrupt/SystemExit through so long runs can be stopped.
            mol = None
    elif met_ID.startswith("InChI"):
        try:
            mol = Chem.inchi.MolFromInchi(met_ID)
        except Exception:
            mol = None
    else:
        try:
            mol = Chem.MolFromMolFile(mol_folder +  "mol-files\\" + met_ID + '.mol')
        except OSError:
            # Missing/unreadable MOL file.
            mol = None

    return mol

def drop_samples_without_mol_file(df):
    """Remove all rows whose "molecule ID" cannot be turned into a molecule.

    A row is removed when ``get_mol`` returns None for its "molecule ID".
    The result of ``get_mol`` is cached per ID, so every distinct metabolite
    is resolved only once even if it occurs in many rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "molecule ID" column. It is modified IN PLACE.

    Returns
    -------
    pandas.DataFrame
        The same (now filtered) frame object that was passed in.
    """
    mol_available = {}
    droplist = []
    # Iterate over (label, value) pairs instead of looking rows up by label.
    for ind, met_ID in zip(df.index, df["molecule ID"]):
        if met_ID not in mol_available:
            mol_available[met_ID] = get_mol(met_ID = met_ID) is not None
        if not mol_available[met_ID]:
            droplist.append(ind)

    df.drop(droplist, inplace = True)
    return df

def get_metabolites_and_similarities(df):
    """Collect the distinct metabolites of ``df`` and their pairwise similarities.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns "ECFP" and "molecule ID".

    Returns
    -------
    df_metabolites : pandas.DataFrame
        Distinct ("ECFP", "ID") combinations with a fresh 0..n-1 index.
    similarity_matrix : numpy.ndarray
        Square matrix; entry [i, j] is the Tanimoto similarity between the
        RDKit fingerprints of metabolite i and metabolite j (row order of
        ``df_metabolites``).

    Notes
    -----
    Every metabolite must be resolvable by ``get_mol``; a None molecule makes
    the fingerprint call fail. The callers in this notebook filter with
    ``drop_samples_without_mol_file`` first.
    """
    df_metabolites = pd.DataFrame(data = {"ECFP": df["ECFP"], "ID": df["molecule ID"]})
    df_metabolites = df_metabolites.drop_duplicates().reset_index(drop = True)

    ms = [get_mol(met_ID = met_ID) for met_ID in df_metabolites["ID"]]
    fps = [Chem.RDKFingerprint(x) for x in ms]

    similarity_matrix = np.zeros((len(fps), len(fps)))
    for i, fp in enumerate(fps):
        # One bulk call per row instead of len(fps) single calls. Tanimoto is
        # also the default metric of DataStructs.FingerprintSimilarity.
        similarity_matrix[i, :] = DataStructs.BulkTanimotoSimilarity(fp, fps)

    return(df_metabolites, similarity_matrix)



def get_valid_list(met_ID, UID, forbidden_metabolites, df_metabolites, similarity_matrix, lower_bound =0.7, upper_bound =0.9, df_pairs = None):
    """List the metabolites that may serve as a negative for one positive pair.

    A metabolite qualifies when its similarity to ``met_ID`` lies strictly
    between ``lower_bound`` and ``upper_bound``, it is not recorded as a
    metabolite of enzyme ``UID`` in ``df_pairs``, and it is not listed in
    ``forbidden_metabolites``.

    Parameters
    ----------
    met_ID : metabolite ID of the positive pair; must occur in df_metabolites["ID"]
        (otherwise an IndexError is raised).
    UID : Uniprot ID of the enzyme of the positive pair.
    forbidden_metabolites : iterable of metabolite IDs that must not be returned.
    df_metabolites : DataFrame with an "ID" column. Its index labels are used
        as row numbers of ``similarity_matrix``, so it needs a 0..n-1 index.
    similarity_matrix : square array of pairwise metabolite similarities.
    lower_bound, upper_bound : exclusive similarity limits.
    df_pairs : optional DataFrame with "Uniprot ID" and "molecule ID" columns
        holding the known enzyme-metabolite pairs. If omitted, the notebook
        global ``df_UID_MID`` is used, as before.
        NOTE(review): that global is not defined in the visible cells - make
        sure it exists or pass ``df_pairs`` explicitly.

    Returns
    -------
    list
        Qualifying metabolite IDs in the row order of ``df_metabolites``.
    """
    if df_pairs is None:
        df_pairs = df_UID_MID
    binding_met_IDs = df_pairs["molecule ID"].loc[df_pairs["Uniprot ID"] == UID]
    k = df_metabolites.loc[df_metabolites["ID"] == met_ID].index[0]

    similarities = similarity_matrix[k,:]
    selection = (similarities < upper_bound) & (similarities > lower_bound)
    candidates = df_metabolites["ID"].loc[selection]

    # A set gives constant-time membership tests for the filter below.
    excluded = set(binding_met_IDs).union(forbidden_metabolites)

    return [met for met in candidates if met not in excluded]


def create_negative_samples(df, df_metabolites, similarity_matrix, seed = None):
    """Append one sampled negative pair per row of ``df``.

    For every row, a metabolite is drawn at random from the candidates returned
    by ``get_valid_list`` (similarity window 0.7-0.95; the lower limit is
    lowered in steps of 0.2 while fewer than two candidates are available).
    The drawn metabolite is paired with the row's enzyme and gets
    "Binding" = 0. Rows without any candidate produce no negative.

    Every 100 rows the set of "forbidden" metabolites is extended by all
    metabolites whose mean "Binding" (over the input rows plus the negatives
    drawn so far) is below 1/2; forbidden metabolites are no longer drawn and
    are never released again.

    Parameters
    ----------
    df : DataFrame with "Uniprot ID", "molecule ID" and "Binding" columns; a
        "type" column is copied to the negatives when present. Not modified.
    df_metabolites, similarity_matrix : as returned by
        ``get_metabolites_and_similarities``.
    seed : optional seed. If given, sampling uses a private
        ``random.Random(seed)``; otherwise the module-level ``random`` state is
        used, as before.

    Returns
    -------
    DataFrame
        ``df`` followed by the negative rows, with a fresh 0..n-1 index.
        Columns that negatives do not have (e.g. "ECFP") are NaN there.
    """
    start = time.time()
    rng = random.Random(seed) if seed is not None else random

    # Snapshot the rows positionally so the index of ``df`` does not matter.
    UIDs = list(df["Uniprot ID"])
    met_IDs = list(df["molecule ID"])
    if "type" in df.columns:
        Types = list(df["type"])
    else:
        # Without a "type" column the negatives simply carry NaN there.
        Types = [np.nan] * len(df)

    # Running per-metabolite sum and count of "Binding". This replaces
    # re-concatenating and re-scanning the whole frame at every checkpoint.
    if "Binding" in df.columns:
        stats = df.groupby("molecule ID")["Binding"].agg(["sum", "count"])
        binding_sum = stats["sum"].to_dict()
        binding_count = stats["count"].to_dict()
    else:
        binding_sum, binding_count = {}, {}

    UID_list, MID_list, Type_list = [], [], []
    forbidden_mets = []      # list form is what get_valid_list receives
    forbidden_set = set()    # same content, for fast membership tests

    for ind, (UID, met_ID, Type) in enumerate(zip(UIDs, met_IDs, Types)):
        if ind % 100 == 0:
            print(ind)
            print("Time: %s [min]" % np.round(float((time.time()-start)/60),2))

            # Forbid every metabolite that is by now mostly used as a negative.
            for met, count in binding_count.items():
                if met in forbidden_set or count == 0:
                    continue
                if binding_sum[met] / count < 1/2:
                    forbidden_set.add(met)
                    forbidden_mets.append(met)
            print(len(forbidden_mets))

        metabolites = get_valid_list(met_ID = met_ID, UID = UID, forbidden_metabolites= forbidden_mets,
                                     df_metabolites = df_metabolites, similarity_matrix = similarity_matrix,
                                     lower_bound =0.7, upper_bound =0.95)
        lower_bound = 0.7
        # Relax the lower similarity limit until at least two candidates exist
        # or the limit has dropped below zero.
        while len(metabolites) < 2:
            lower_bound = lower_bound - 0.2
            metabolites = get_valid_list(met_ID = met_ID, UID = UID, forbidden_metabolites= forbidden_mets,
                                     df_metabolites = df_metabolites, similarity_matrix = similarity_matrix,
                                     lower_bound =lower_bound, upper_bound =0.95)
            if lower_bound <0:
                break

        # Draw one negative (none if there is no candidate at all).
        new_metabolites = rng.sample(metabolites, min(1,len(metabolites)))

        for met in new_metabolites:
            UID_list.append(UID)
            MID_list.append(met)
            Type_list.append(Type)
            # A negative contributes 0 to the sum and 1 to the count.
            binding_sum.setdefault(met, 0)
            binding_count[met] = binding_count.get(met, 0) + 1

    df_negatives = pd.DataFrame(data = {"Uniprot ID": UID_list, "molecule ID" : MID_list, "type" : Type_list})
    df_negatives["Binding"] = 0

    return pd.concat([df, df_negatives], ignore_index = True)

#### (a)(i) Creating negative data points for the training set (experimental evidence):

In [8]:
# Work on an independent copy: the frame is filtered and extended in place
# below, which must not act on a slice of df_UID_MID_train.
df_UID_MID_train_exp = df_UID_MID_train.loc[df_UID_MID_train["evidence"] == "exp"].copy()

# Remove pairs whose substrate cannot be converted into a molecule.
df_UID_MID_train_exp = drop_samples_without_mol_file(df = df_UID_MID_train_exp)
#calculating similarity matrix for all metabolites in the set:
df_metabolites_train, similarity_matrix_train = get_metabolites_and_similarities(df = df_UID_MID_train_exp)
print(len(df_metabolites_train))

# Everything present at this point is a positive pair.
df_UID_MID_train_exp["Binding"] = 1
df_UID_MID_train_exp.reset_index(inplace = True, drop = True)

df_UID_MID_train_exp = create_negative_samples(df = df_UID_MID_train_exp, df_metabolites = df_metabolites_train,
                                          similarity_matrix = similarity_matrix_train)
df_UID_MID_train_exp

1266
0
Time: 0.0 [min]
0
100
Time: 0.1 [min]
2
200
Time: 0.21 [min]
5
300
Time: 0.33 [min]
10
400
Time: 0.44 [min]
22
500
Time: 0.56 [min]
33
600
Time: 0.66 [min]
45
700
Time: 0.76 [min]
59
800
Time: 0.88 [min]
66
900
Time: 0.98 [min]
83
1000
Time: 1.07 [min]
95
1100
Time: 1.18 [min]
110
1200
Time: 1.29 [min]
126
1300
Time: 1.4 [min]
137
1400
Time: 1.51 [min]
146
1500
Time: 1.63 [min]
159
1600
Time: 1.74 [min]
172
1700
Time: 1.85 [min]
188
1800
Time: 1.95 [min]
203
1900
Time: 2.05 [min]
216
2000
Time: 2.16 [min]
228
2100
Time: 2.27 [min]
242
2200
Time: 2.37 [min]
253
2300
Time: 2.48 [min]
267
2400
Time: 2.59 [min]
282
2500
Time: 2.67 [min]
295
2600
Time: 2.77 [min]
307
2700
Time: 2.86 [min]
313
2800
Time: 2.95 [min]
325
2900
Time: 3.04 [min]
334
3000
Time: 3.12 [min]
339
3100
Time: 3.21 [min]
352
3200
Time: 3.31 [min]
365
3300
Time: 3.4 [min]
372
3400
Time: 3.5 [min]
380
3500
Time: 3.59 [min]
392
3600
Time: 3.68 [min]
400
3700
Time: 3.77 [min]
406
3800
Time: 3.87 [min]
412
3900
Time: 3

Unnamed: 0,Uniprot ID,molecule ID,evidence,ECFP,Binding,type
0,Q5B2F7,CHEBI:57344,exp,0100000001000000000000000000000001000000000000...,1,
1,Q9SAH9,CHEBI:58349,exp,0000000001000000100000100000000000000000000000...,1,
2,Q8IPJ6,CHEBI:57776,exp,0000000000000000000000000000010001000000000000...,1,
3,A0A1D5PCZ1,C00002,exp,0000000001000000000000000000000000000000000000...,1,
4,O22765,CHEBI:33384,exp,0100000000000000000000000000000000000000000000...,1,
...,...,...,...,...,...,...
29207,P04152,CHEBI:15901,,,0,
29208,Q4Q1I5,C00007,,,0,
29209,P43123,CHEBI:30616,,,0,
29210,Q8RVK9,C00002,,,0,


#### (a)(ii) Creating negative data points for the training set (phylogenetic evidence):

In [9]:
# Independent copy, because the next cell filters and extends this frame in place.
df_UID_MID_train_phylo = df_UID_MID_train.loc[df_UID_MID_train["evidence"] == "phylo"].copy()

In [10]:
# Remove pairs whose substrate cannot be converted into a molecule.
df_UID_MID_train_phylo = drop_samples_without_mol_file(df_UID_MID_train_phylo)
# Pairwise similarities between all remaining metabolites.
# NOTE: this overwrites df_metabolites_train / similarity_matrix_train from the
# experimental-evidence cell above.
(df_metabolites_train,
 similarity_matrix_train) = get_metabolites_and_similarities(df_UID_MID_train_phylo)
print(len(df_metabolites_train))

# Everything present at this point is a positive pair; renumber rows 0..n-1.
df_UID_MID_train_phylo["Binding"] = 1
df_UID_MID_train_phylo = df_UID_MID_train_phylo.reset_index(drop = True)

df_UID_MID_train_phylo = create_negative_samples(df_UID_MID_train_phylo,
                                                 df_metabolites_train,
                                                 similarity_matrix_train)
df_UID_MID_train_phylo

655
0
Time: 0.0 [min]
0
100
Time: 0.22 [min]
0
200
Time: 0.45 [min]
1
300
Time: 0.68 [min]
1
400
Time: 0.9 [min]
1
500
Time: 1.12 [min]
1
600
Time: 1.34 [min]
1
700
Time: 1.56 [min]
2
800
Time: 1.77 [min]
3
900
Time: 2.0 [min]
4
1000
Time: 2.22 [min]
5
1100
Time: 2.46 [min]
5
1200
Time: 2.69 [min]
5
1300
Time: 2.91 [min]
5
1400
Time: 3.14 [min]
6
1500
Time: 3.36 [min]
7
1600
Time: 3.58 [min]
9
1700
Time: 3.81 [min]
9
1800
Time: 4.04 [min]
9
1900
Time: 4.26 [min]
9
2000
Time: 4.48 [min]
11
2100
Time: 4.7 [min]
11
2200
Time: 4.92 [min]
11
2300
Time: 5.15 [min]
13
2400
Time: 5.37 [min]
13
2500
Time: 5.6 [min]
13
2600
Time: 5.82 [min]
13
2700
Time: 6.03 [min]
13
2800
Time: 6.25 [min]
13
2900
Time: 6.47 [min]
14
3000
Time: 6.7 [min]
16
3100
Time: 6.94 [min]
16
3200
Time: 7.17 [min]
16
3300
Time: 7.39 [min]
16
3400
Time: 7.62 [min]
16
3500
Time: 7.84 [min]
16
3600
Time: 8.08 [min]
17
3700
Time: 8.32 [min]
17
3800
Time: 8.58 [min]
18
3900
Time: 8.84 [min]
19
4000
Time: 9.1 [min]
19
4100
Time:

197
30500
Time: 65.74 [min]
197
30600
Time: 65.93 [min]
198
30700
Time: 66.12 [min]
198
30800
Time: 66.31 [min]
200
30900
Time: 66.51 [min]
201
31000
Time: 66.7 [min]
201
31100
Time: 66.9 [min]
201
31200
Time: 67.09 [min]
201
31300
Time: 67.28 [min]
202
31400
Time: 67.48 [min]
202
31500
Time: 67.67 [min]
202
31600
Time: 67.87 [min]
202
31700
Time: 68.07 [min]
204
31800
Time: 68.26 [min]
204
31900
Time: 68.45 [min]
204
32000
Time: 68.65 [min]
204
32100
Time: 68.83 [min]
205
32200
Time: 69.03 [min]
205
32300
Time: 69.22 [min]
205
32400
Time: 69.42 [min]
205
32500
Time: 69.64 [min]
205
32600
Time: 69.86 [min]
205
32700
Time: 70.08 [min]
206
32800
Time: 70.3 [min]
206
32900
Time: 70.54 [min]
206
33000
Time: 70.77 [min]
206
33100
Time: 71.01 [min]
206
33200
Time: 71.24 [min]
207
33300
Time: 71.47 [min]
207
33400
Time: 71.7 [min]
208
33500
Time: 71.92 [min]
208
33600
Time: 72.15 [min]
208
33700
Time: 72.38 [min]
208
33800
Time: 72.6 [min]
209
33900
Time: 72.83 [min]
210
34000
Time: 73.07 [mi

302
59500
Time: 121.55 [min]
302
59600
Time: 121.73 [min]
302
59700
Time: 121.92 [min]
302
59800
Time: 122.11 [min]
302
59900
Time: 122.29 [min]
302
60000
Time: 122.48 [min]
304
60100
Time: 122.67 [min]
304
60200
Time: 122.85 [min]
304
60300
Time: 123.02 [min]
305
60400
Time: 123.2 [min]
305
60500
Time: 123.38 [min]
305
60600
Time: 123.56 [min]
305
60700
Time: 123.73 [min]
305
60800
Time: 123.91 [min]
306
60900
Time: 124.09 [min]
306
61000
Time: 124.28 [min]
306
61100
Time: 124.45 [min]
306
61200
Time: 124.63 [min]
306
61300
Time: 124.81 [min]
306
61400
Time: 124.99 [min]
307
61500
Time: 125.17 [min]
307
61600
Time: 125.34 [min]
308
61700
Time: 125.52 [min]
309
61800
Time: 125.7 [min]
309
61900
Time: 125.88 [min]
309
62000
Time: 126.08 [min]
309
62100
Time: 126.26 [min]
309
62200
Time: 126.46 [min]
311
62300
Time: 126.64 [min]
311
62400
Time: 126.82 [min]
312
62500
Time: 127.01 [min]
313
62600
Time: 127.18 [min]
314
62700
Time: 127.36 [min]
314
62800
Time: 127.55 [min]
315
62900
Time: 

382
87900
Time: 173.19 [min]
382
88000
Time: 173.36 [min]
382
88100
Time: 173.53 [min]
382
88200
Time: 173.71 [min]
383
88300
Time: 173.87 [min]
384
88400
Time: 174.04 [min]
384
88500
Time: 174.21 [min]
384
88600
Time: 174.39 [min]
384
88700
Time: 174.56 [min]
385
88800
Time: 174.73 [min]
387
88900
Time: 174.89 [min]
387
89000
Time: 175.06 [min]
387
89100
Time: 175.21 [min]
388
89200
Time: 175.37 [min]
388
89300
Time: 175.53 [min]
388
89400
Time: 175.7 [min]
388
89500
Time: 175.86 [min]
388
89600
Time: 176.03 [min]
388
89700
Time: 176.19 [min]
388
89800
Time: 176.35 [min]
389
89900
Time: 176.53 [min]
389
90000
Time: 176.69 [min]
389
90100
Time: 176.85 [min]
389
90200
Time: 177.02 [min]
391
90300
Time: 177.18 [min]
392
90400
Time: 177.35 [min]
392
90500
Time: 177.5 [min]
393
90600
Time: 177.67 [min]
394
90700
Time: 177.83 [min]
395
90800
Time: 178.0 [min]
395
90900
Time: 178.16 [min]
395
91000
Time: 178.32 [min]
396
91100
Time: 178.48 [min]
396
91200
Time: 178.64 [min]
396
91300
Time: 1

474
115800
Time: 815.49 [min]
474
115900
Time: 815.64 [min]
474
116000
Time: 815.8 [min]
474
116100
Time: 815.96 [min]
474
116200
Time: 816.12 [min]
474
116300
Time: 816.29 [min]
474
116400
Time: 816.45 [min]
474
116500
Time: 816.61 [min]
474
116600
Time: 816.77 [min]
474
116700
Time: 816.93 [min]
475
116800
Time: 817.1 [min]
476
116900
Time: 817.27 [min]
476
117000
Time: 817.45 [min]
477
117100
Time: 817.62 [min]
477
117200
Time: 817.8 [min]
478
117300
Time: 817.97 [min]
478
117400
Time: 818.16 [min]
478
117500
Time: 818.33 [min]
478
117600
Time: 818.5 [min]
478
117700
Time: 818.67 [min]
478
117800
Time: 818.84 [min]
479
117900
Time: 819.01 [min]
479
118000
Time: 819.19 [min]
480
118100
Time: 819.37 [min]
480
118200
Time: 819.55 [min]
481
118300
Time: 819.72 [min]
481
118400
Time: 819.89 [min]
481
118500
Time: 820.06 [min]
481
118600
Time: 820.24 [min]
481
118700
Time: 820.41 [min]
481
118800
Time: 820.59 [min]
481
118900
Time: 820.76 [min]
482
119000
Time: 820.92 [min]
482
119100
Tim

143200
Time: 848.39 [min]
547
143300
Time: 848.5 [min]
547
143400
Time: 848.63 [min]
548
143500
Time: 848.75 [min]
548
143600
Time: 848.86 [min]
550
143700
Time: 848.98 [min]
550
143800
Time: 849.1 [min]
551
143900
Time: 849.22 [min]
553
144000
Time: 849.35 [min]
553
144100
Time: 849.47 [min]
553
144200
Time: 849.6 [min]
553
144300
Time: 849.73 [min]
553
144400
Time: 849.85 [min]
553
144500
Time: 849.97 [min]
553
144600
Time: 850.1 [min]
555
144700
Time: 850.21 [min]
555
144800
Time: 850.34 [min]
555
144900
Time: 850.46 [min]
555
145000
Time: 850.58 [min]
555
145100
Time: 850.71 [min]
557
145200
Time: 850.84 [min]
557
145300
Time: 850.96 [min]
557
145400
Time: 851.08 [min]
557
145500
Time: 851.21 [min]
558
145600
Time: 851.33 [min]
559
145700
Time: 851.45 [min]
559
145800
Time: 851.57 [min]
560
145900
Time: 851.69 [min]
560
146000
Time: 851.8 [min]
561
146100
Time: 851.91 [min]
561
146200
Time: 852.04 [min]
562
146300
Time: 852.16 [min]
562
146400
Time: 852.28 [min]
562
146500
Time: 85

620
170700
Time: 879.96 [min]
620
170800
Time: 880.08 [min]
620
170900
Time: 880.21 [min]
620
171000
Time: 880.33 [min]
620
171100
Time: 880.45 [min]
620
171200
Time: 880.58 [min]
620
171300
Time: 880.69 [min]
620
171400
Time: 880.81 [min]
620
171500
Time: 880.93 [min]
620
171600
Time: 881.05 [min]
621
171700
Time: 881.17 [min]
621
171800
Time: 881.28 [min]
621
171900
Time: 881.4 [min]
622
172000
Time: 881.52 [min]
622
172100
Time: 881.64 [min]
622
172200
Time: 881.76 [min]
622
172300
Time: 881.88 [min]
622
172400
Time: 882.01 [min]
622
172500
Time: 882.13 [min]
622
172600
Time: 882.25 [min]
622
172700
Time: 882.36 [min]
623
172800
Time: 882.48 [min]
624
172900
Time: 882.6 [min]
624
173000
Time: 882.72 [min]
624
173100
Time: 882.84 [min]
624
173200
Time: 882.95 [min]
624
173300
Time: 883.06 [min]
624
173400
Time: 883.18 [min]
624
173500
Time: 883.3 [min]
624
173600
Time: 883.4 [min]
625
173700
Time: 883.52 [min]
625
173800
Time: 883.62 [min]
626
173900
Time: 883.74 [min]
626
174000
Tim

198100
Time: 912.5 [min]
653
198200
Time: 912.62 [min]
653
198300
Time: 912.73 [min]
653
198400
Time: 912.85 [min]
653
198500
Time: 912.97 [min]
653
198600
Time: 913.1 [min]
653
198700
Time: 913.22 [min]
653
198800
Time: 913.34 [min]
653
198900
Time: 913.46 [min]
653
199000
Time: 913.6 [min]
653
199100
Time: 913.7 [min]
653
199200
Time: 913.81 [min]
653
199300
Time: 913.91 [min]
653
199400
Time: 914.02 [min]
653
199500
Time: 914.13 [min]
653
199600
Time: 914.25 [min]
653
199700
Time: 914.36 [min]
653
199800
Time: 914.46 [min]
653
199900
Time: 914.57 [min]
653
200000
Time: 914.68 [min]
653
200100
Time: 914.79 [min]
653
200200
Time: 914.9 [min]
653
200300
Time: 915.01 [min]
653
200400
Time: 915.13 [min]
653
200500
Time: 915.24 [min]
653
200600
Time: 915.34 [min]
653
200700
Time: 915.44 [min]
653
200800
Time: 915.57 [min]
653
200900
Time: 915.67 [min]
653
201000
Time: 915.78 [min]
653
201100
Time: 915.89 [min]
653
201200
Time: 916.0 [min]
653
201300
Time: 916.11 [min]
653
201400
Time: 916

Unnamed: 0,Uniprot ID,molecule ID,evidence,ECFP,Binding,type
0,A8XT89,CHEBI:58885,phylo,0000000000000100000000000000000000000000000000...,1,
1,B2GV06,CHEBI:57292,phylo,0100100001000000000000000000000011000000000000...,1,
2,A0A022RBJ3,CHEBI:33227,phylo,1000000000000000000000000000000000000000000000...,1,
3,G3S168,CHEBI:59776,phylo,0100000000000000000000000000000000000000000000...,1,
4,F6I0H0,C00002,phylo,0000000001000000000000000000000000000000000000...,1,
...,...,...,...,...,...,...
424181,A0A0A0LAF9,C00002,,,0,
424182,A9V8I9,C00002,,,0,
424183,A0A2J6JMI2,C00002,,,0,
424184,Q8Y7G6,C00002,,,0,


#### (a)(iii) Creating negative data points for the test set:

In [11]:
# Keep only experimentally supported test pairs. Independent copy, because the
# next cell filters and extends this frame in place.
df_UID_MID_test = df_UID_MID_test.loc[df_UID_MID_test["evidence"] == "exp"].copy()

In [12]:
# Remove pairs whose substrate cannot be converted into a molecule.
df_UID_MID_test = drop_samples_without_mol_file(df_UID_MID_test)
# Pairwise similarities between all metabolites of the test set.
(df_metabolites_test,
 similarity_matrix_test) = get_metabolites_and_similarities(df_UID_MID_test)
print(len(df_metabolites_test))

# Everything present at this point is a positive pair; renumber rows 0..n-1.
df_UID_MID_test["Binding"] = 1
df_UID_MID_test = df_UID_MID_test.reset_index(drop = True)

df_UID_MID_test = create_negative_samples(df_UID_MID_test,
                                          df_metabolites_test,
                                          similarity_matrix_test)
df_UID_MID_test

706
0
Time: 0.0 [min]
0
100
Time: 0.05 [min]
6
200
Time: 0.1 [min]
21
300
Time: 0.15 [min]
36
400
Time: 0.21 [min]
54
500
Time: 0.26 [min]
73
600
Time: 0.31 [min]
89
700
Time: 0.37 [min]
104
800
Time: 0.42 [min]
128
900
Time: 0.48 [min]
139
1000
Time: 0.53 [min]
151
1100
Time: 0.59 [min]
165
1200
Time: 0.65 [min]
183
1300
Time: 0.71 [min]
200
1400
Time: 0.77 [min]
216
1500
Time: 0.83 [min]
227
1600
Time: 0.88 [min]
242
1700
Time: 0.94 [min]
257
1800
Time: 1.0 [min]
268
1900
Time: 1.05 [min]
285
2000
Time: 1.12 [min]
303
2100
Time: 1.17 [min]
315
2200
Time: 1.23 [min]
332
2300
Time: 1.3 [min]
351
2400
Time: 1.36 [min]
367
2500
Time: 1.42 [min]
383
2600
Time: 1.49 [min]
396
2700
Time: 1.55 [min]
409
2800
Time: 1.63 [min]
426
2900
Time: 1.7 [min]
444
3000
Time: 1.77 [min]
459
3100
Time: 1.84 [min]
474
3200
Time: 1.92 [min]
496
3300
Time: 2.0 [min]
518
3400
Time: 2.08 [min]
537
3500
Time: 2.16 [min]
552


Unnamed: 0,Uniprot ID,molecule ID,evidence,ECFP,Binding,type
0,P71828,CHEBI:57925,exp,0100000001000000000000000000000000000000000000...,1,
1,A0A1D8PGI8,CHEBI:16897,exp,0100000000000001000000000000000000000000000100...,1,
2,Q8NEZ4,C00019,exp,0100100001000000000000000000000001000000000000...,1,
3,F4K5T2,CHEBI:35235,exp,0100000000000000000000000000000000000000000000...,1,
4,Q05762,CHEBI:57453,exp,0110000000000000001000000000000000000000000000...,1,
...,...,...,...,...,...,...
7009,P53739,CHEBI:57618,,,0,
7010,H9D1R1,CHEBI:71682,,,0,
7011,P00962,CHEBI:58048,,,0,
7012,P48163,C00002,,,0,


In [13]:
# Store the three frames (positives plus sampled negatives) so the expensive
# sampling above does not have to be repeated.
pickle_dir = join(CURRENT_DIR, "..", "data", "enzyme_substrate_data")
for frame, file_name in ((df_UID_MID_train_phylo, "df_UID_MID_train_phylo_1_1.pkl"),
                         (df_UID_MID_train_exp, "df_UID_MID_train_exp_1_1.pkl"),
                         (df_UID_MID_test, "df_UID_MID_test_exp_phylo_1_1.pkl")):
    frame.to_pickle(join(pickle_dir, file_name))

### (b) Mapping ECFPs and ESM-1b-vectors to different splits:

In [21]:
# Reload the frames written at the end of part (a).
# NOTE: read_pickle executes arbitrary code from the file - only load trusted pickles.
pickle_dir = join(CURRENT_DIR, "..", "data", "enzyme_substrate_data")
df_UID_MID_train_phylo = pd.read_pickle(join(pickle_dir, "df_UID_MID_train_phylo_1_1.pkl"))
df_UID_MID_train_exp = pd.read_pickle(join(pickle_dir, "df_UID_MID_train_exp_1_1.pkl"))
df_UID_MID_test = pd.read_pickle(join(pickle_dir, "df_UID_MID_test_exp_phylo_1_1.pkl"))

In [22]:
# The sampled negatives have no "evidence" entry; label every row of a frame
# with the evidence type of the set it belongs to.
for frame, evidence_label in ((df_UID_MID_train_exp, "exp"),
                              (df_UID_MID_train_phylo, "phylo"),
                              (df_UID_MID_test, "exp")):
    frame["evidence"] = evidence_label

# Complete training set: experimental rows first, then phylogenetic rows.
df_UID_MID_train = pd.concat([df_UID_MID_train_exp, df_UID_MID_train_phylo], ignore_index = True)

#### (b)(i) Mapping ECFPs:

In [13]:
# Table of precomputed fingerprints; the cells below merge it on its
# "substrate ID" column (presumably it also provides the "ECFP" column - confirm).
# NOTE: read_pickle executes arbitrary code from the file - only load trusted pickles.
df_ecfps = pd.read_pickle(join(CURRENT_DIR, ".." ,"data", "substrate_data", "df_ecfps.pkl"))

In [23]:
# Replace the (partly missing) ECFP column by a fresh lookup in df_ecfps.
# Columns left over from an earlier merge are dropped first, so running this
# cell again - or after another cell that already merged the same frame - does
# not fail on a missing "ECFP" column or create suffixed duplicate columns.
df_UID_MID_train_phylo = (
    df_UID_MID_train_phylo
    .drop(columns = ["ECFP", "substrate ID"], errors = "ignore")
    .merge(df_ecfps, right_on = "substrate ID", left_on = "molecule ID", how = "left")
)

df_UID_MID_test = (
    df_UID_MID_test
    .drop(columns = ["ECFP", "substrate ID"], errors = "ignore")
    .merge(df_ecfps, right_on = "substrate ID", left_on = "molecule ID", how = "left")
)

In [14]:
# Replace the (partly missing) ECFP column by a fresh lookup in df_ecfps.
# Columns left over from an earlier merge are dropped first, so running this
# cell again - or after another cell that already merged the same frame - does
# not fail on a missing "ECFP" column or create suffixed duplicate columns.
df_UID_MID_train_phylo = (
    df_UID_MID_train_phylo
    .drop(columns = ["ECFP", "substrate ID"], errors = "ignore")
    .merge(df_ecfps, right_on = "substrate ID", left_on = "molecule ID", how = "left")
)

df_UID_MID_train_exp = (
    df_UID_MID_train_exp
    .drop(columns = ["ECFP", "substrate ID"], errors = "ignore")
    .merge(df_ecfps, right_on = "substrate ID", left_on = "molecule ID", how = "left")
)


In [32]:
# Expose the binding label under the additional column name "outcome".
for frame in (df_UID_MID_train_phylo, df_UID_MID_train_exp, df_UID_MID_test):
    frame["outcome"] = frame["Binding"]

In [37]:
# Attach the enzyme information from UNIPROT_df to every pair; the left join
# keeps pairs whose enzyme is not listed there.
df_UID_MID_train_phylo = pd.merge(df_UID_MID_train_phylo, UNIPROT_df,
                                  how = "left", on = "Uniprot ID")
df_UID_MID_train_exp = pd.merge(df_UID_MID_train_exp, UNIPROT_df,
                                how = "left", on = "Uniprot ID")
df_UID_MID_test = pd.merge(df_UID_MID_test, UNIPROT_df,
                           how = "left", on = "Uniprot ID")

In [None]:
# Export the final frames as CSV files next to the pickles.
csv_dir = join(CURRENT_DIR, "..", "data", "enzyme_substrate_data")
for frame, file_name in ((df_UID_MID_train_phylo, "df_UID_MID_train_phylo_1_1.csv"),
                         (df_UID_MID_train_exp, "df_UID_MID_train_exp_1_1.csv"),
                         (df_UID_MID_test, "df_UID_MID_test_exp_phylo_1_1.csv")):
    frame.to_csv(join(csv_dir, file_name))

#### (b)(ii) Mapping ESM1b vectors:

In [141]:
# Disabled code: the whole cell is a single string literal, so none of the
# statements inside it are executed (the trailing ";" only hides the cell output).
# The text inside would merge the "ESM1b" column of UNIPROT_df into the train and
# test frames and pickle the results under data/splits.
'''df_train = df_UID_MID_train
df_test = df_UID_MID_test

Uniprot_df = pd.DataFrame(data = {"Uniprot ID" : UNIPROT_df["Uniprot ID"],
                                 "ESM1b" : UNIPROT_df["ESM1b"]})

df_train = df_train.merge(Uniprot_df, on = "Uniprot ID", how = "left")
df_test = df_test.merge(Uniprot_df, on = "Uniprot ID", how = "left")

df_train.to_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_train.pkl"))
df_test.to_pickle(join(CURRENT_DIR, ".." ,"data", "splits", "df_test.pkl"))''';