In [1]:
import pandas as pd
import numpy as np
from os.path import join
import os
from rdkit import Chem
from rdkit.Chem import AllChem
CURRENT_DIR = os.getcwd()

## 1.Calculating reaction fingerprints for all reactions:

In [2]:
df_reactions = pd.read_pickle(join("..", "..", "data", "reaction_data", "all_reactions_with_IDs.pkl"))
df_reactions

Unnamed: 0,substrate_InChI_set,product_InChI_set,Reaction ID,MW_frac
0,{InChI=1S/C8H8O3/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,{InChI=1S/C17H23N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,Reaction_0,1.000000
1,{InChI=1S/C17H23N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,"{InChI=1S/H2O2/c1-2/h1-2H, InChI=1S/C17H21N4O9...",Reaction_1,1.000000
2,{InChI=1S/C19H23N7O6/c20-19-25-15-14(17(30)26-...,"{InChI=1S/p+1, InChI=1S/C19H21N7O6/c20-19-25-1...",Reaction_2,1.000000
3,{InChI=1S/C16H28N2O11/c1-5(21)17-9-13(25)14(8(...,{InChI=1S/C8H15NO6/c1-3(11)9-5-7(13)6(12)4(2-1...,Reaction_3,2.000000
4,"{InChI=1S/C3H7O7P/c4-1-2(3(5)6)10-11(7,8)9/h2,...","{InChI=1S/H2O/h1H2, InChI=1S/C3H5O6P/c1-2(3(4)...",Reaction_4,1.000000
...,...,...,...,...
4434,"{InChI=1S/O2/c1-2, InChI=1S/C34H58N7O21P3S/c1-...","{InChI=1S/H2O2/c1-2/h1-2H, InChI=1S/C34H56N7O2...",Reaction_4434,1.000000
4435,{InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15...,"{InChI=1S/p+1, InChI=1S/C10H15N5O10P2/c11-8-5-...",Reaction_4435,1.000000
4436,{InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15...,"{InChI=1S/p+1, InChI=1S/C8H16NO9P/c1-3(11)9-5-...",Reaction_4436,1.000000
4437,{InChI=1S/C16H12O4/c1-19-12-5-2-10(3-6-12)14-9...,"{InChI=1S/p+1, InChI=1S/C17H21N4O9P/c1-7-3-9-1...",Reaction_4437,0.998668


In [3]:
mol_folder = join("..", "..", "data", "metabolite_data", "mol-files")
def get_reaction_site_smarts(metabolites):
    reaction_site = ""
    for met in metabolites:
        is_kegg_id = False
        
        if met[0] == "C":
            is_kegg_id = True
            
        if is_kegg_id:
            try:
                Smarts = Chem.MolToSmarts(Chem.MolFromMolFile(join(mol_folder, met + '.mol')))
            except OSError:
                return(np.nan)
        else:
            mol = Chem.inchi.MolFromInchi(met)
            if mol is not None:
                Smarts = Chem.MolToSmarts(mol)
            else:
                return(np.nan)
        reaction_site = reaction_site + "." + Smarts
    return(reaction_site[1:])

def convert_fp_to_array(difference_fp_dict):
    fp = np.zeros(2048)
    for key in difference_fp_dict.keys():
        fp[key] = difference_fp_dict[key]
    return(fp)

In [4]:
df_reactions["structural_fp"], df_reactions["difference_fp"] = "", ""
df_reactions["#substrates"], df_reactions["#products"] = "", ""

for ind in df_reactions.index:
    substrates = list(df_reactions["substrate_InChI_set"][ind])
    products = list(df_reactions["product_InChI_set"][ind])
    try:
        left_site = get_reaction_site_smarts(substrates)
        right_site = get_reaction_site_smarts(products)
        if not pd.isnull(left_site) and not pd.isnull(right_site):

            rxn_forward = AllChem.ReactionFromSmarts(left_site + ">>" + right_site)

            difference_fp = Chem.rdChemReactions.CreateDifferenceFingerprintForReaction(rxn_forward)
            difference_fp =convert_fp_to_array(difference_fp.GetNonzeroElements())
            structural_fp = Chem.rdChemReactions.CreateStructuralFingerprintForReaction(rxn_forward).ToBitString()

            df_reactions["structural_fp"][ind] = structural_fp
            df_reactions["difference_fp"][ind] = difference_fp
            df_reactions["#substrates"][ind] = len(substrates)
            df_reactions["#products"][ind] = len(products)
    except IndexError:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
df_reactions.to_pickle(join("..", "..", "data", "reaction_data", "all_reactions_with_IDs_and_FPs.pkl"))

## 2. Calculating ESM1b vectors for all sequences: 

In [2]:
df_sequences = pd.read_pickle(join("..", "..", "data", "enzyme_data", "all_sequences_with_IDs.pkl"))

In [3]:
ofile = open(join("..", "..", "data", "enzyme_data",
                  "all_sequences_kcat_esm1b.fasta"), "w")
for ind in df_sequences.index:
    seq = df_sequences["Sequence"][ind]
    if not pd.isnull(seq):
        ofile.write(">" + str(ind) + "\n" + seq[:1020]  + "\n")
ofile.close()

In [4]:
import torch
rep_dict = torch.load(join("..", "..", "data", "enzyme_data", "all_sequences_kcat_esm1b.pt"))

df_sequences["ESM1b"] = ""
for ind in df_sequences.index:
    if not pd.isnull(df_sequences["Sequence"][ind]):        
        df_sequences["ESM1b"][ind] = rep_dict[str(ind)+".pt"]

In [5]:
import torch
rep_dict = torch.load(join("..", "..", "data", "enzyme_data", "all_sequences_kcat_esm1b_ts.pt"))

df_sequences["ESM1b_ts"] = ""
for ind in df_sequences.index:
    if not pd.isnull(df_sequences["Sequence"][ind]):        
        df_sequences["ESM1b_ts"][ind] = rep_dict[str(ind)+".pt"]

In [6]:
df_sequences.to_pickle(join("..", "..", "data", "enzyme_data", "all_sequences_with_IDs_and_ESM1b_ts.pkl"))