In [1]:
import pandas as pd
import numpy as np
from os.path import join
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from drfp import DrfpEncoder
CURRENT_DIR = os.getcwd()

## 1. Calculating reaction fingerprints for all reactions:

In [2]:
# Read the pickled reaction table (substrate/product InChI sets plus a
# reaction ID per row) and display it as the cell output.
df_reactions = pd.read_pickle(
    join("..", "..", "data", "reaction_data", "all_reactions_with_IDs.pkl")
)
df_reactions

Unnamed: 0,substrate_InChI_set,product_InChI_set,Reaction ID,MW_frac
0,{InChI=1S/C8H8O3/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,{InChI=1S/C17H23N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,Reaction_0,1.000000
1,{InChI=1S/C17H23N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H2O2/c1-2/h1-2H, InChI=1S/C17H21N4O9...",Reaction_1,1.000000
2,{InChI=1S/C21H28N7O17P3/c22-17-12-19(25-7-24-1...,{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,Reaction_2,1.000000
3,{InChI=1S/C16H28N2O11/c1-5(21)17-9-13(25)14(8(...,{InChI=1S/C8H15NO6/c1-3(11)9-5-7(13)6(12)4(2-1...,Reaction_3,2.000000
4,"{InChI=1S/C3H7O7P/c4-1-2(3(5)6)10-11(7,8)9/h2,...","{InChI=1S/C3H5O6P/c1-2(3(4)5)9-10(6,7)8/h1H2,(...",Reaction_4,1.000000
...,...,...,...,...
4434,{InChI=1S/C34H58N7O21P3S/c1-18(58-33-21(43)13-...,"{InChI=1S/H2O2/c1-2/h1-2H, InChI=1S/C34H56N7O2...",Reaction_4434,1.000000
4435,{InChI=1S/C11H19NO8/c1-4(10(16)17)19-9-7(12-5(...,{InChI=1S/C11H20NO11P/c1-4(10(16)17)21-9-7(12-...,Reaction_4435,1.000000
4436,{InChI=1S/C8H15NO6/c1-3(11)9-5-7(13)6(12)4(2-1...,"{InChI=1S/p+1, InChI=1S/C10H15N5O10P2/c11-8-5-...",Reaction_4436,1.000000
4437,{InChI=1S/C17H23N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,{InChI=1S/C16H12O5/c1-20-10-3-5-11(14(18)7-10)...,Reaction_4437,0.998668


In [3]:
# Folder searched for "<ID>.mol" files when a metabolite is given by an ID
# starting with "C" instead of an InChI string.
mol_folder = join(
    "..", "..", "data", "metabolite_data", "mol-files"
)
def get_reaction_site_smarts(metabolites):
    """Build a dot-separated SMARTS string for one side of a reaction.

    Every entry of ``metabolites`` is either an identifier starting with "C"
    (treated as a KEGG compound ID and read from ``<mol_folder>/<ID>.mol``)
    or an InChI string.

    Parameters
    ----------
    metabolites : iterable of str
        Identifiers of all metabolites on one side of the reaction.

    Returns
    -------
    str or float
        The SMARTS patterns of all metabolites joined by ".", an empty string
        for an empty input, or ``np.nan`` as soon as one metabolite cannot be
        converted (empty identifier, missing or unparsable mol file, or an
        InChI that RDKit cannot parse).
    """
    patterns = []
    for met in metabolites:
        if not met:
            # An empty identifier would raise IndexError on ``met[0]``;
            # treat it like any other unconvertible metabolite.
            return np.nan

        if met[0] == "C":
            try:
                mol = Chem.MolFromMolFile(join(mol_folder, met + '.mol'))
            except OSError:
                # Raised when the mol file cannot be opened.
                return np.nan
        else:
            mol = Chem.inchi.MolFromInchi(met)

        if mol is None:
            # RDKit signals a parse failure by returning None, for mol files
            # as well as for InChI strings.
            return np.nan
        patterns.append(Chem.MolToSmarts(mol))
    return ".".join(patterns)

def get_reaction_site_smiles(metabolites):
    """Build a dot-separated SMILES string for one side of a reaction.

    Every entry of ``metabolites`` is either an identifier starting with "C"
    (treated as a KEGG compound ID and read from ``<mol_folder>/<ID>.mol``)
    or an InChI string.

    Parameters
    ----------
    metabolites : iterable of str
        Identifiers of all metabolites on one side of the reaction.

    Returns
    -------
    str or float
        The SMILES strings of all metabolites joined by ".", an empty string
        for an empty input, or ``np.nan`` as soon as one metabolite cannot be
        converted (empty identifier, missing or unparsable mol file, or an
        InChI that RDKit cannot parse).
    """
    smiles_parts = []
    for met in metabolites:
        if not met:
            # An empty identifier would raise IndexError on ``met[0]``;
            # treat it like any other unconvertible metabolite.
            return np.nan

        if met[0] == "C":
            try:
                mol = Chem.MolFromMolFile(join(mol_folder, met + '.mol'))
            except OSError:
                # Raised when the mol file cannot be opened.
                return np.nan
        else:
            mol = Chem.inchi.MolFromInchi(met)

        if mol is None:
            # RDKit signals a parse failure by returning None, for mol files
            # as well as for InChI strings.
            return np.nan
        smiles_parts.append(Chem.MolToSmiles(mol))
    return ".".join(smiles_parts)

def convert_fp_to_array(difference_fp_dict, fp_size=2048):
    """Expand a sparse ``{position: value}`` fingerprint into a dense vector.

    Parameters
    ----------
    difference_fp_dict : dict
        Maps integer positions to the value stored at that position.
    fp_size : int, optional
        Length of the returned vector (default 2048).

    Returns
    -------
    numpy.ndarray
        Float array of length ``fp_size`` that is zero everywhere except at
        the positions listed in ``difference_fp_dict``.

    Raises
    ------
    IndexError
        If a position does not fit into a vector of length ``fp_size``.
    """
    fp = np.zeros(fp_size)
    for position, value in difference_fp_dict.items():
        fp[position] = value
    return fp

In [None]:
# All result columns start as empty strings; a reaction that cannot be
# fingerprinted keeps "" in every one of them.
df_reactions["structural_fp"], df_reactions["difference_fp"], df_reactions["DRFP"] = "", "", ""
df_reactions["#substrates"], df_reactions["#products"] = "", ""

# Index labels of reactions that were skipped because an IndexError occurred,
# kept so that they can be inspected instead of disappearing silently.
skipped_reactions = []

for ind in df_reactions.index:
    substrates = list(df_reactions.at[ind, "substrate_InChI_set"])
    products = list(df_reactions.at[ind, "product_InChI_set"])
    try:
        left_smarts = get_reaction_site_smarts(substrates)
        right_smarts = get_reaction_site_smarts(products)
        if pd.isnull(left_smarts) or pd.isnull(right_smarts):
            # At least one metabolite could not be converted.
            continue

        rxn_forward = AllChem.ReactionFromSmarts(left_smarts + ">>" + right_smarts)

        difference_fp = Chem.rdChemReactions.CreateDifferenceFingerprintForReaction(rxn_forward)
        difference_fp = convert_fp_to_array(difference_fp.GetNonzeroElements())
        structural_fp = Chem.rdChemReactions.CreateStructuralFingerprintForReaction(rxn_forward).ToBitString()

        left_smiles = get_reaction_site_smiles(substrates)
        right_smiles = get_reaction_site_smiles(products)
        if pd.isnull(left_smiles) or pd.isnull(right_smiles):
            # Guard against NaN before string concatenation, which would
            # otherwise raise a TypeError and abort the whole loop.
            continue
        drfp = DrfpEncoder.encode(left_smiles + ">>" + right_smiles)[0]

        # Use .at for the writes: chained indexing (df[col][ind] = value) may
        # assign to a temporary copy and leave the DataFrame unchanged.
        df_reactions.at[ind, "DRFP"] = drfp
        df_reactions.at[ind, "structural_fp"] = structural_fp
        df_reactions.at[ind, "difference_fp"] = difference_fp
        df_reactions.at[ind, "#substrates"] = len(substrates)
        df_reactions.at[ind, "#products"] = len(products)
    except IndexError:
        # Best effort: the reaction keeps its empty-string placeholders, but
        # the skip is recorded rather than swallowed.
        skipped_reactions.append(ind)

if skipped_reactions:
    print("Skipped %d reactions because of an IndexError" % len(skipped_reactions))

In [5]:
# Persist the reaction table together with the fingerprint columns.
df_reactions.to_pickle(
    join("..", "..", "data", "reaction_data", "all_reactions_with_IDs_and_FPs.pkl")
)

In [3]:
# pickle5 back-ports pickle protocol 5 to Python 3.5-3.7. Newer interpreters
# support that protocol in the standard library, so fall back to the built-in
# module when the back-port is not installed.
try:
    import pickle5 as p
except ImportError:
    import pickle as p

# NOTE(review): unpickling can execute arbitrary code - only load files that
# were produced by this pipeline or come from a trusted source.
with open(join("..", "..", "data", "reaction_data", "all_reactions_with_IDs_and_FPs.pkl"), "rb") as fh:
    data = p.load(fh)

## 2. Calculating ESM1b vectors for all sequences: 

In [2]:
# Read the pickled enzyme table; the cells below use its "Sequence" column.
df_sequences = pd.read_pickle(
    join("..", "..", "data", "enzyme_data", "all_sequences_with_IDs.pkl")
)

The ESM-1b model takes a FASTA file with the enzymes' amino acid sequences as its input. Creating a FASTA file with enzyme sequences:

In [3]:
# Write one FASTA record per enzyme with a known sequence; the record header
# is the DataFrame index label, which is later used to look up the vectors.
# The context manager closes the file even if writing a record fails.
with open(join("..", "..", "data", "enzyme_data",
               "all_sequences_kcat_esm1b.fasta"), "w") as ofile:
    for ind, seq in df_sequences["Sequence"].items():
        if not pd.isnull(seq):
            # Sequences are cut to their first 1020 residues
            # (presumably to respect ESM-1b's input length limit - confirm).
            ofile.write(">" + str(ind) + "\n" + seq[:1020]  + "\n")

To calculate the ESM-1b vectors, we used the model and code provided by the Facebook Research team: https://github.com/facebookresearch/esm. The following command line was used to calculate the representations:

python extract.py esm1b_t33_650M_UR50S \path_to_fasta_file\all_sequences_kcat_esm1b.fasta \path_to_store_representations\all_sequences_kcat_esm1b --repr_layers 33 --include mean

Loading ESM-1b-vectors and storing them in the UniProt DataFrame. All representations were merged into one dictionary:


The code and the models to create the task-specific ESM-1b vectors can be found in the following GitHub repository: https://github.com/AlexanderKroll/SubFinder

In [4]:
import torch
rep_dict = torch.load(join("..", "..", "data", "enzyme_data", "all_sequences_kcat_esm1b.pt"))

df_sequences["ESM1b"] = ""
for ind in df_sequences.index:
    if not pd.isnull(df_sequences["Sequence"][ind]):        
        df_sequences["ESM1b"][ind] = rep_dict[str(ind)+".pt"]

In [5]:
import torch

# Dictionary of task-specific representations keyed by "<index label>.pt".
# NOTE(review): torch.load unpickles the file - only load files you trust.
rep_dict = torch.load(join("..", "..", "data", "enzyme_data", "all_sequences_kcat_esm1b_ts.pt"))

# Rows without a sequence keep the empty-string placeholder.
df_sequences["ESM1b_ts"] = ""
for ind in df_sequences.index:
    if not pd.isnull(df_sequences.at[ind, "Sequence"]):
        # Use .at for the write: chained indexing (df[col][ind] = value) may
        # assign to a temporary copy and leave the DataFrame unchanged.
        df_sequences.at[ind, "ESM1b_ts"] = rep_dict[str(ind)+".pt"]

In [6]:
# Persist the enzyme table together with the added representation columns.
df_sequences.to_pickle(
    join("..", "..", "data", "enzyme_data", "all_sequences_with_IDs_and_ESM1b_ts.pkl")
)