### 1. Loading and preprocessing training data from Mou et al.
### 2. Creating substrate representations
### 3. Creating enzyme representations

In [1]:
import pandas as pd
import numpy as np
import random
from os.path import join
import os
import re
import sys
import time
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from Bio import SeqIO
import warnings
import torch
warnings.filterwarnings("ignore")

# Make the helper modules in ./additional_code importable. The path is built
# with join() rather than a hard-coded backslash so it also resolves on
# non-Windows systems (on Windows the resulting string is unchanged).
sys.path.append(join(".", "additional_code"))
from data_preprocessing import *

# All data paths in this notebook are built relative to the current working directory.
CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

C:\Users\alexk\projects\SubFinder\notebooks_and_code


### 1. Loading and preprocessing training data from Mou et al.

In [2]:
# Read the "neu50" sheet of the descriptor spreadsheet and preview it.
descriptor_path = join(CURRENT_DIR, "..", "data", "Mou_data", 'nitrilase_descriptors.xlsx')
df = pd.read_excel(descriptor_path, sheet_name='neu50').copy()
display(df.head())

# Keep only the three columns used in the rest of the notebook, renamed to
# enzyme / metabolite / activity (the descriptor columns are dropped).
column_names = {"protein": "enzyme", "nitriles": "metabolite", "activity": "activity"}
df = df[list(column_names)].rename(columns=column_names)
df

Unnamed: 0,protein,nitrile class,nitriles,ZINC ID,activity,interface_delta_X,fa_atr,fa_elec,fa_rep,fa_sol,...,ISA,Xi,Ap,Phi,Psi,Pa,Pb,Pt,DGLJ,DGtor
0,1a1,Aromatic,4-Aminobenzonitrile,152697,0.716,-12.808,-17.008,-0.564,4.762,7.619,...,0.476,0.467,0.529,0.619,0.656,0.543,0.565,0.55,0.705,0.711
1,1a1,Arylaliphatic,4-Nitrophenylacetonitrile,154682,0.0,-12.572,-12.752,-0.501,1.421,6.467,...,0.475,0.47,0.528,0.614,0.664,0.539,0.567,0.549,0.695,0.703
2,1a1,Aromatic,4-(Dimethylamino)benzonitrile,155390,4.363,-11.979,-22.038,0.104,6.49,5.492,...,0.476,0.467,0.529,0.621,0.661,0.543,0.565,0.55,0.711,0.701
3,1a1,Aromatic,4-Chlorobenzonitrile,157255,3.998,-10.293,-18.752,-0.017,5.785,6.069,...,0.476,0.467,0.528,0.611,0.67,0.539,0.564,0.549,0.705,0.707
4,1a1,Aromatic,"2,6-Dichlorobenzonitrile",157318,0.0,-9.036,-11.809,-0.275,2.007,5.476,...,0.476,0.467,0.528,0.61,0.667,0.539,0.564,0.549,0.703,0.677


Unnamed: 0,enzyme,metabolite,activity
0,1a1,4-Aminobenzonitrile,0.716
1,1a1,4-Nitrophenylacetonitrile,0.000
2,1a1,4-(Dimethylamino)benzonitrile,4.363
3,1a1,4-Chlorobenzonitrile,3.998
4,1a1,"2,6-Dichlorobenzonitrile",0.000
...,...,...,...
235,pmi28,Propionitrile,1.054
236,pmi28,Benzoylacetonitrile,2.902
237,pmi28,Mandelonitrile,0.000
238,pmi28,Benzonitrile,4.580


### 2. Creating substrate representations

In [3]:
# Distinct substrate names occurring in the training data.
metabolites = list(set(df["metabolite"]))

# Substrates whose .mol file name differs from the name used in the spreadsheet.
met_dict = {'2,6-Dichlorobenzonitrile' : "2_6-dichlorobenzonitrile",
           '⍺-Methylbenzyl cyanide' : "alpha-methylbenzylcyanide",
           '4-(Dimethylamino)benzonitrile' :"4-Dimethylamino-benzonitrile"}

mol_dir = join(CURRENT_DIR, "..", "data", "Mou_data", "nitrile_coordinates")

# Compute one 1024-bit Morgan fingerprint (radius 3) per substrate, stored as a bit string.
ecfp_by_metabolite = {}
for met in metabolites:
    # Fall back to the spreadsheet name when no alternative file name is listed.
    met2 = met_dict.get(met, met)
    mol_path = join(mol_dir, met2 + ".mol")

    mol = Chem.MolFromMolFile(mol_path)
    if mol is None:
        # Fail with the offending file instead of an opaque error inside the fingerprint call.
        raise ValueError(f"RDKit could not parse a molecule from {mol_path!r} (metabolite {met!r})")
    ecfp_by_metabolite[met] = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024).ToBitString()

# Assign the whole column in one step. A chained `df["ECFP"].loc[...] = ...`
# writes through a temporary Series and is not guaranteed to modify df.
df["ECFP"] = df["metabolite"].map(ecfp_by_metabolite)

df

Unnamed: 0,enzyme,metabolite,activity,ECFP
0,1a1,4-Aminobenzonitrile,0.716,0000000000000000000000000000000000100000000000...
1,1a1,4-Nitrophenylacetonitrile,0.000,0000000000001000000000000000000000000000000000...
2,1a1,4-(Dimethylamino)benzonitrile,4.363,0000000000000000000000000000000001100000000000...
3,1a1,4-Chlorobenzonitrile,3.998,0000000000000000000000000000000000100000000000...
4,1a1,"2,6-Dichlorobenzonitrile",0.000,0000000000000001000000000000000000000000000000...
...,...,...,...,...
235,pmi28,Propionitrile,1.054,0000000000000000000000000000000001000000000000...
236,pmi28,Benzoylacetonitrile,2.902,0000000000000000000000000000000000000000000000...
237,pmi28,Mandelonitrile,0.000,0100000000000000000000000000000000000000000000...
238,pmi28,Benzonitrile,4.580,0000000000000000000000000000000000100000000000...


### 3. Creating enzyme representations

In [4]:
# Distinct enzyme identifiers occurring in the training data.
enzymes = list(set(df["enzyme"]))

# Concatenate the identifiers, each followed by a comma (so the string ends with a trailing comma).
enzy_str = "".join(enz + "," for enz in enzymes)
enzy_str

'1b16,1a8,1a17,1a2,3a2,2a6,1a27,1b15,pmi28,1a1,pmi26,3wuy,'

https://www.rcsb.org/downloads/fasta

In [5]:
from Bio import SeqIO

# Extract one amino-acid sequence per PDB file and attach it to the rows of the
# matching enzyme. The enzyme ID is the part of the file name before the first "_".
pdb_dir = join(CURRENT_DIR, "..", "data", "Mou_data", "pdb_files")
pdb_files = os.listdir(pdb_dir)

sequence_by_enzyme = {}
for pdb_file in pdb_files:
    # Only the first record produced by the "pdb-atom" parser is used.
    # Opening the file explicitly guarantees the handle is closed even though
    # the parser is not iterated to the end.
    with open(join(pdb_dir, pdb_file)) as handle:
        first_record = next(SeqIO.parse(handle, "pdb-atom"), None)
    if first_record is None:
        # No record in this file: leave the corresponding rows without a sequence.
        continue
    AA = str(first_record.seq)
    print(pdb_file, AA)
    sequence_by_enzyme[pdb_file.split("_")[0]] = AA

# Assign the whole column in one step; enzymes without a PDB file get NaN.
# A chained `df["Sequence"].loc[...] = ...` writes through a temporary Series
# and is not guaranteed to modify df.
df["Sequence"] = df["enzyme"].map(sequence_by_enzyme)

1a17_S_2535.pdb STVRVAAVQISPVLYNREATVQKVVNKILELGKQGVQFATFPETIVPYYPYFSFIQAPYAMGKEHLRLLEQSVTVPSAATDAISEAAKEANMVVSIGVNERDGGTIYNTQLLFDADGTLIQRRRKLTPTYHERMIWGQGDASGLRATDSAVGRIGQLACWEHYNPLFRYALIADGEQIHSAMYPGSFLGALHGEQTEINVRQHALESASFVVVATGWLDADQQAQIAKDTGGPIGPISGGCFTAVIGPDGQLIGEALTSGEGEVIADIDLAQIDARKRLMDAS
1a1_S_2858.pdb TIVKAAAVQISPVLYSREGTVERVVKKIRELGEKGVQFATFPETVIPYYPYFSFVQTPLQILAGPEHLKLLDQSVTVPSPATDAIGQAARQAGMVVSIGVNERDGGTLYNTQLLFDADGALIQRRRKIKPTHYERMIWGEGDGSGLRAVDSQVGRIGQLACWEHNNPLARYAMMADGEQIHSAMYPGSMFGDPFAQKTEINIRQHALESGCFVVCSTAWLDADQQAQIMQDTGCAIGPISGGCLTAIVAPDGTFLGEPLTSGEGEVIADLDFKLIDKRKQTMDSR
1a27_S_2787.pdb AIIRAAAVQISPVLYSREGTVDKVCQQIITLGKQGVQFAVFPETVVPYYPYFSFVQPAFAMGAQHLKLLDQSVTVPSAATLAIGEACKQAGMVVSIGVNERDGGTIYNAQLLFDADGTLIQHRRKITPTYHERMVWGQGDGSGLRAIDSAVGRIGSLACWEHYNPLARYALMADGEQIHAAMFPGSLVGDIFAEQIEVTIRHHALESGCFVVNATAWLDADQQGQIMQDTGCGLGPISGGCFTAIVSPEGKLLGEPLRSGEGVVIADLDTALIDKRKRMMDSV
1a2_S_2697.pdb KVVKAAAVQLSPVLYSREGTVERVVRKIHELGRQGVQFATFPETVVPYYPYFSFVQTPLQIIAGPEHLKLLDQAVTVPSPATDA

Creating fasta file with all sequences:

In [6]:
# Write one FASTA record per DataFrame row that has a sequence; the record
# header is the row index.
fasta_path = join(CURRENT_DIR, "..", "data", "Mou_data", "all_sequences_Mou.fasta")
with open(fasta_path, "w") as ofile:
    for ind, seq in df["Sequence"].items():
        if pd.isnull(seq):
            continue
        # Keep only the part before a "#" marker, if there is one. Slicing with
        # the result of str.find() would use -1 when "#" is absent and silently
        # drop the last residue; split() leaves such sequences intact.
        # NOTE(review): files derived from an earlier version of this FASTA
        # (e.g. precomputed embeddings) may need to be regenerated - confirm.
        seq = seq.split("#", 1)[0]
        ofile.write(">" + str(ind) + "\n" + seq  + "\n")

Mapping enzyme representations:

In [7]:
import torch
# NOTE: torch.load unpickles the file, so only load .pt files from a trusted source.
rep_dict = torch.load(join(CURRENT_DIR, ".." ,"data", "Mou_data", "all_sequences_Mou.pt"))

# Attach to every row the entry stored in rep_dict under "<row index>.pt".
# The column is created with an explicit object dtype so it can hold arbitrary
# Python objects; a missing key raises KeyError.
df["ESM1b_ts"] = pd.Series("", index=df.index, dtype=object)
for ind in df.index:
    # df.at writes into df itself. A chained `df["ESM1b_ts"][ind] = ...` assigns
    # through a temporary Series and is not guaranteed to modify df.
    df.at[ind, "ESM1b_ts"] = rep_dict[str(ind) +".pt"]

In [8]:
# Persist the DataFrame (including the columns added above) as a pickle file.
output_path = join(CURRENT_DIR, "..", "data", "Mou_data", "Mou_df.pkl")
df.to_pickle(output_path)