In [1]:
import pandas as pd
import numpy as np
from os.path import join
import os
import pickle
import requests
from multiprocessing.dummy import Pool
import json

import warnings
warnings.filterwarnings('ignore')
CURRENT_DIR = os.getcwd()
os.chdir(CURRENT_DIR)


## Script to download the BRENDA dataset, generate ESM-2 enzyme representations of wild type and mutants and ChemBERTa-2 substrate smiles representations using SMILES from Pubchem, BioServices, RDKit and MetaboAnalyst. 

Load BRENDA data from json files

In [2]:
directory = "brenda_ec_json_km"

keys_to_extract = ["value", "substrate", "uniprot", "commentary"]
all_data = []


for filename in os.listdir(directory):
    if filename.endswith(".json"): 
        file_path = os.path.join(directory, filename)

        if os.path.getsize(file_path) == 0:
            continue  

        ec_number = filename.split("-")[0]  

        with open(file_path, encoding="utf-8") as file:
            data = json.load(file)

        if "value" not in data or not isinstance(data["value"], dict):
            continue  

        # Extract data
        extracted_data = [{"ec": ec_number} | {key: data[key].get(str(i), None) if key in data else None for key in keys_to_extract} for i in range(len(data["value"]))]

        all_data.extend(extracted_data)

df = pd.DataFrame(all_data)
df.rename(columns={"value": "km"}, inplace=True)
df

Unnamed: 0,ec,km,substrate,uniprot,commentary
0,1.1.1.1,0.0710,NADH,D4GSN2,"#10# isoenzyme 2 <51>; #136# pH 6.0, temperatu..."
1,1.1.1.1,0.1000,NAD+,B8QU18,#10# isoenzyme ADH-3 <49>; #124# at pH 9.0 in ...
2,1.1.1.1,0.2000,NAD+,P42328,"#10# isoenzyme ADH-1, pH 7.5 <49>; #48# in 0.1..."
3,1.1.1.1,0.0560,ethanol,Q6L0S1,
4,1.1.1.1,66.0000,ethanol,P39462,"#109# mutant enzyme W95L/N249Y, in 0.1 M glyci..."
...,...,...,...,...,...
49729,7.6.2.9,0.6000,ATP,P46920,"#8# wild-type, performed in 10 mM sodium phosp..."
49730,7.6.2.9,1.7000,ATP,P46920,"#8# mutant F19W, performed in 10 mM sodium pho..."
49731,7.6.2.9,0.7000,ATP,P46920,"#8# mutant S45C, performed in 10 mM sodium pho..."
49732,7.6.2.9,2.8000,ATP,P46920,"#8# mutant G161C, performed in 10 mM sodium ph..."


Set the substrates 

In [None]:
unique_subst = df[["substrate"]].drop_duplicates().reset_index(drop=True)
unique_subst.to_pickle(join("files","unique_substrates_brenda.pkl"))
unique_subst

Unnamed: 0,substrate
0,NADH
1,NAD+
2,ethanol
3,1-Pentanol
4,Isobutyraldehyde
...,...
8617,Hoechst 33342/in
8618,doxorubicin/in
8619,"S-(2,4-dinitrophenyl)glutathione[side 1]"
8620,S-glutathione[side 1]


In [None]:
from IPython.core.display import HTML

def make_scrollable(df, max_height=300):
    return HTML(df.to_html(classes="scrollable", escape=False) + 
                f"<style>.scrollable {{ max-height: {max_height}px; overflow-y: scroll; display: block; }}</style>")

make_scrollable(unique_subst)

### a) Retrive smiles from PubChem 

In [None]:
name_to_smiles = {}
dt = unique_subst.copy()

def get_smiles(name):
    try:
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{name}/property/CanonicalSMILES/TXT"
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            smiles = response.content.decode().strip()
        else:
            smiles = None
    except:
        smiles = None
    name_to_smiles[name] = smiles

unique = dt["substrate"].dropna().unique().tolist()

thread_pool = Pool(4)
thread_pool.map(get_smiles, unique)
thread_pool.close()
thread_pool.join()

dt["smiles"] = dt["substrate"].map(name_to_smiles)
dt

Unnamed: 0,substrate,smiles
0,NADH,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...
1,NAD+,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...
2,ethanol,CCO
3,1-Pentanol,CCCCCO
4,Isobutyraldehyde,CC(C)C=O
...,...,...
8617,Hoechst 33342/in,
8618,doxorubicin/in,
8619,"S-(2,4-dinitrophenyl)glutathione[side 1]",
8620,S-glutathione[side 1],


In [None]:
dt = dt.dropna(subset=["smiles"]).reset_index(drop=True)
dt

Unnamed: 0,substrate,smiles
0,NADH,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...
1,NAD+,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...
2,ethanol,CCO
3,1-Pentanol,CCCCCO
4,Isobutyraldehyde,CC(C)C=O
...,...,...
4644,caldariellaquinol,CC(C)CCCC(C)CCCC(C)CCCC(C)CCCC(C)CCCC(C)CCC1=C...
4645,decylubiquinol,CCCCCCCCCCC1=C(C(=C(C(=C1O)OC)OC)O)C
4646,Na+,[Na+]
4647,Sb3+,[Sb+3]


In [None]:
dt.to_pickle(join("files","BRENDA_subst_smiles_PubChem.pkl"))

Add smiles to original dataframe

In [54]:
df = pd.merge(df, dt, on="substrate", how="left")
df = df.reset_index(drop=True)
df

Unnamed: 0,ec,km,substrate,uniprot,commentary,smiles
0,1.1.1.1,0.0710,NADH,D4GSN2,"#10# isoenzyme 2 <51>; #136# pH 6.0, temperatu...",C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...
1,1.1.1.1,0.1000,NAD+,B8QU18,#10# isoenzyme ADH-3 <49>; #124# at pH 9.0 in ...,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...
2,1.1.1.1,0.2000,NAD+,P42328,"#10# isoenzyme ADH-1, pH 7.5 <49>; #48# in 0.1...",C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...
3,1.1.1.1,0.0560,ethanol,Q6L0S1,,CCO
4,1.1.1.1,66.0000,ethanol,P39462,"#109# mutant enzyme W95L/N249Y, in 0.1 M glyci...",CCO
...,...,...,...,...,...,...
49729,7.6.2.9,0.6000,ATP,P46920,"#8# wild-type, performed in 10 mM sodium phosp...",
49730,7.6.2.9,1.7000,ATP,P46920,"#8# mutant F19W, performed in 10 mM sodium pho...",
49731,7.6.2.9,0.7000,ATP,P46920,"#8# mutant S45C, performed in 10 mM sodium pho...",
49732,7.6.2.9,2.8000,ATP,P46920,"#8# mutant G161C, performed in 10 mM sodium ph...",


Remove duplicates and incomplete entries

In [55]:
df = df.drop_duplicates(["ec", "km", "substrate", "uniprot"]) 
df = df.reset_index(drop=True)

df = df.loc[~pd.isnull(df["uniprot"])]
df = df.loc[~pd.isnull(df["substrate"])]
df = df.loc[~pd.isnull(df["km"])]
df = df.loc[~pd.isnull(df["smiles"])]
df

Unnamed: 0,ec,km,substrate,uniprot,commentary,smiles
0,1.1.1.1,0.07100,NADH,D4GSN2,"#10# isoenzyme 2 <51>; #136# pH 6.0, temperatu...",C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...
1,1.1.1.1,0.10000,NAD+,B8QU18,#10# isoenzyme ADH-3 <49>; #124# at pH 9.0 in ...,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...
2,1.1.1.1,0.20000,NAD+,P42328,"#10# isoenzyme ADH-1, pH 7.5 <49>; #48# in 0.1...",C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...
3,1.1.1.1,0.05600,ethanol,Q6L0S1,,CCO
4,1.1.1.1,66.00000,ethanol,P39462,"#109# mutant enzyme W95L/N249Y, in 0.1 M glyci...",CCO
...,...,...,...,...,...,...
49598,7.6.2.9,0.08340,betaine,Q0PCR9,"#7# at pH 7.0, Aphanothece halophytica cells <16>",C[N+](C)(C)CC(=O)[O-]
49599,7.6.2.9,0.09190,betaine,Q0PCR9,"#7# at pH 8.5, BetTA.halophytica expressing MK...",C[N+](C)(C)CC(=O)[O-]
49600,7.6.2.9,0.00832,betaine,Q0PCR9,#7# Aphanothece halophytica cells supplemented...,C[N+](C)(C)CC(=O)[O-]
49601,7.6.2.9,0.00941,betaine,Q0PCR9,#7# Aphanothece halophytica cells supplemented...,C[N+](C)(C)CC(=O)[O-]


Extract enzyme type and mutations

In [57]:
import re

def extract_info(commentary):
    mutations = re.findall(r"[A-Z]\d+[A-Z]", commentary)  
    if mutations:
        enzyme_type = "mutant"
        mutations = "/".join(mutations)  
    else:
        enzyme_type = None
    
    if enzyme_type is None:
        for keyword in ["native", "wild-type", "wild type", "recombinant"]:
            if keyword in commentary:
                enzyme_type = keyword
                break  

    return pd.Series([enzyme_type, mutations])

df[["enzyme_type", "mutations"]] = df["commentary"].apply(extract_info)
df["enzyme_type"] = df["enzyme_type"].replace({None: "wild type"}) #no commentary label found
df = df[df["enzyme_type"] != "recombinant"]
df["enzyme_type"] = df["enzyme_type"].replace({"wild-type": "wild type", "native": "wild type"})
df = df.drop(columns="commentary")
df

Unnamed: 0,ec,km,substrate,uniprot,smiles,enzyme_type,mutations
0,1.1.1.1,0.07100,NADH,D4GSN2,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[]
1,1.1.1.1,0.10000,NAD+,B8QU18,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,mutant,G211S
2,1.1.1.1,0.20000,NAD+,P42328,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,mutant,W87A
3,1.1.1.1,0.05600,ethanol,Q6L0S1,CCO,wild type,[]
4,1.1.1.1,66.00000,ethanol,P39462,CCO,mutant,W95L/N249Y
...,...,...,...,...,...,...,...
49598,7.6.2.9,0.08340,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,[]
49599,7.6.2.9,0.09190,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,[]
49600,7.6.2.9,0.00832,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,[]
49601,7.6.2.9,0.00941,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,[]


In [58]:
df.to_pickle(join("files","brenda.pkl"))

 Merge and calculate the geometric mean of km values for wild type entries with same EC + Uniprot ID + substrate

In [59]:
wild_type_df = df[df["enzyme_type"] == "wild type"]

def geometric_mean_values(x):
    return x

geometric_means_values = (wild_type_df.groupby(["ec", "substrate", "uniprot"], as_index=False).agg({"km": geometric_mean_values}))

result_df_wt = pd.merge(wild_type_df, geometric_means_values, on= ["ec", "substrate", "uniprot"], how= "left")
result_df_wt

Unnamed: 0,ec,km_x,substrate,uniprot,smiles,enzyme_type,mutations,km_y
0,1.1.1.1,0.07100,NADH,D4GSN2,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[],0.071
1,1.1.1.1,0.05600,ethanol,Q6L0S1,CCO,wild type,[],0.056
2,1.1.1.1,0.26000,Isobutyraldehyde,P39462,CC(C)C=O,wild type,[],0.26
3,1.1.1.1,0.41000,(R)-2-butanol,P39462,CCC(C)O,wild type,[],0.41
4,1.1.1.1,0.01200,(S)-2-butanol,P39462,CCC(C)O,wild type,[],0.012
...,...,...,...,...,...,...,...,...
20723,7.6.2.9,0.08340,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,[],"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ..."
20724,7.6.2.9,0.09190,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,[],"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ..."
20725,7.6.2.9,0.00832,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,[],"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ..."
20726,7.6.2.9,0.00941,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,[],"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ..."


In [60]:
from scipy.stats import gmean

result_df_wt["km_y"] = result_df_wt["km_y"].apply(lambda x: list(x) if isinstance(x, (list, np.ndarray)) else [x])

result_df_wt["km_count"] = result_df_wt["km_y"].apply(len)

result_df_wt.drop(columns=["km_x"], inplace=True)

result_df_wt = result_df_wt.rename(columns={"km_y":"km"})

def calculate_geometric_mean(value):
    if isinstance(value, list):
        return gmean(value)
    
    return value  

result_df_wt["km_gmean"] = result_df_wt["km"].apply(calculate_geometric_mean)

result_df_wt = result_df_wt.drop_duplicates(subset = ["ec", "substrate", "uniprot", "km_gmean"])
result_df_wt.reset_index(drop=True, inplace=True)
result_df_wt

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,mutations,km,km_count,km_gmean
0,1.1.1.1,NADH,D4GSN2,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[],[0.071],1,0.07100
1,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[],[0.056],1,0.05600
2,1.1.1.1,Isobutyraldehyde,P39462,CC(C)C=O,wild type,[],[0.26],1,0.26000
3,1.1.1.1,(R)-2-butanol,P39462,CCC(C)O,wild type,[],[0.41],1,0.41000
4,1.1.1.1,(S)-2-butanol,P39462,CCC(C)O,wild type,[],[0.012],1,0.01200
...,...,...,...,...,...,...,...,...,...
16503,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[],[0.02],1,0.02000
16504,7.3.2.7,Sb3+,P30632,[Sb+3],wild type,[],[4e-05],1,0.00004
16505,7.4.2.14,8-azido-ATP,Q03518,C1=NC(=C2C(=N1)N(C(=N2)N=[N+]=[N-])C3C(C(C(O3)...,wild type,[],[0.103],1,0.10300
16506,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,[],"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059


In [61]:
result_df_wt.to_pickle(join("files", "wild_type_gmeans.pkl"))

 Merge and calculate the geometric mean of km values for mutant entries with same EC + Uniprot ID + substrate + mutation

In [62]:
mutants_df = df[df["enzyme_type"] == "mutant"]

def geometric_mean_values(x):
    return x

geometric_means_values = (mutants_df.groupby(["ec", "substrate", "mutations", "uniprot"], as_index=False).agg({"km": geometric_mean_values}))

result_df_mut = pd.merge(mutants_df, geometric_means_values, on= ["ec", "substrate", "mutations", "uniprot"], how= "left")
result_df_mut

Unnamed: 0,ec,km_x,substrate,uniprot,smiles,enzyme_type,mutations,km_y
0,1.1.1.1,0.1000,NAD+,B8QU18,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,mutant,G211S,0.1
1,1.1.1.1,0.2000,NAD+,P42328,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,mutant,W87A,0.2
2,1.1.1.1,66.0000,ethanol,P39462,CCO,mutant,W95L/N249Y,66.0
3,1.1.1.1,0.2600,1-Pentanol,P39462,CCCCCO,mutant,W95L,0.26
4,1.1.1.1,0.0080,NADH,P39462,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,mutant,W95L,0.008
...,...,...,...,...,...,...,...,...
13344,7.2.1.1,0.0150,ubiquinone-1,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)C,mutant,G141A,0.015
13345,7.2.1.1,0.0088,ubiquinone,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)CCC=C(C)CCC=...,mutant,G141L,0.0088
13346,7.2.1.1,0.0088,ubiquinone-1,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)C,mutant,G141L,0.0088
13347,7.2.1.1,0.0234,ubiquinone,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)CCC=C(C)CCC=...,mutant,G141V,0.0234


In [63]:
from scipy.stats import gmean 

result_df_mut["km_y"] = result_df_mut["km_y"].apply(lambda x: list(x) if isinstance(x, (list, np.ndarray)) else [x])

result_df_mut ["km_count"] = result_df_mut ["km_y"].apply(len)

result_df_mut .drop(columns=["km_x"], inplace=True)

result_df_mut  = result_df_mut .rename(columns={"km_y":"km"})

def calculate_geometric_mean(value):
    if isinstance(value, list):
        return gmean(value)
    return value  

result_df_mut ["km_gmean"] = result_df_mut ["km"].apply(calculate_geometric_mean)

result_df_mut  = result_df_mut.drop_duplicates(subset = ["ec", "mutations", "substrate", "uniprot", "km_gmean"])
result_df_mut.reset_index(drop=True, inplace=True)
result_df_mut 

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,mutations,km,km_count,km_gmean
0,1.1.1.1,NAD+,B8QU18,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,mutant,G211S,[0.1],1,0.1000
1,1.1.1.1,NAD+,P42328,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,mutant,W87A,[0.2],1,0.2000
2,1.1.1.1,ethanol,P39462,CCO,mutant,W95L/N249Y,[66.0],1,66.0000
3,1.1.1.1,1-Pentanol,P39462,CCCCCO,mutant,W95L,[0.26],1,0.2600
4,1.1.1.1,NADH,P39462,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,mutant,W95L,[0.008],1,0.0080
...,...,...,...,...,...,...,...,...,...
12498,7.2.1.1,ubiquinone-1,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)C,mutant,G141A,[0.015],1,0.0150
12499,7.2.1.1,ubiquinone,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)CCC=C(C)CCC=...,mutant,G141L,[0.0088],1,0.0088
12500,7.2.1.1,ubiquinone-1,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)C,mutant,G141L,[0.0088],1,0.0088
12501,7.2.1.1,ubiquinone,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)CCC=C(C)CCC=...,mutant,G141V,[0.0234],1,0.0234


In [64]:
result_df_mut.to_pickle(join("files","mutants_gmeans.pkl"))

Merge dataframes 

In [3]:
wt_df = pd.read_pickle(join("files", "wild_type_gmeans.pkl"))
mut_df = pd.read_pickle(join("files","mutants_gmeans.pkl"))

km_df = pd.concat([wt_df, mut_df], ignore_index = True)
km_df 

Unnamed: 0,ec,substrate,uniprot,Mapped_substrate,smiles,enzyme_type,mutations,km,km_count,km_gmean
0,1.1.1.1,nadh,D4GSN2,nadh,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[],[0.071],1,0.071000
1,1.1.1.1,ethanol,Q6L0S1,ethanol,CCO,wild type,[],[0.056],1,0.056000
2,1.1.1.1,isobutyraldehyde,P39462,2-methylpropanal,CC(C)C=O,wild type,[],[0.26],1,0.260000
3,1.1.1.1,(r)-2-butanol,P39462,(r)-2-butanol,CCC(C)O,wild type,[],[0.41],1,0.410000
4,1.1.1.1,(s)-2-butanol,P39462,(s)-2-butanol,CCC(C)O,wild type,[],[0.012],1,0.012000
...,...,...,...,...,...,...,...,...,...,...
25025,7.6.2.1,atp,C7EXK4,atp,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,mutant,I364M,"[0.369, 0.148]",2,0.233692
25026,7.6.2.1,atp,C7EXK4,atp,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,mutant,V906A,[0.103],1,0.103000
25027,7.6.2.9,atp,P46920,atp,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,mutant,F19W,[1.7],1,1.700000
25028,7.6.2.9,atp,P46920,atp,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,mutant,S45C,[0.7],1,0.700000


Log transform the geometrical mean values

In [4]:
def compute_log10_km(row):
    value = row["km_gmean"]
    if isinstance(value, (int, float, np.float64)):
        return np.log10(value) 

km_df["log_km"] = km_df.apply(compute_log10_km, axis=1)
km_df.to_pickle(join("files","km_df.pkl"))
km_df 

Unnamed: 0,ec,substrate,uniprot,Mapped_substrate,smiles,enzyme_type,mutations,km,km_count,km_gmean,log_km
0,1.1.1.1,nadh,D4GSN2,nadh,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[],[0.071],1,0.071000,-1.148742
1,1.1.1.1,ethanol,Q6L0S1,ethanol,CCO,wild type,[],[0.056],1,0.056000,-1.251812
2,1.1.1.1,isobutyraldehyde,P39462,2-methylpropanal,CC(C)C=O,wild type,[],[0.26],1,0.260000,-0.585027
3,1.1.1.1,(r)-2-butanol,P39462,(r)-2-butanol,CCC(C)O,wild type,[],[0.41],1,0.410000,-0.387216
4,1.1.1.1,(s)-2-butanol,P39462,(s)-2-butanol,CCC(C)O,wild type,[],[0.012],1,0.012000,-1.920819
...,...,...,...,...,...,...,...,...,...,...,...
25025,7.6.2.1,atp,C7EXK4,atp,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,mutant,I364M,"[0.369, 0.148]",2,0.233692,-0.631356
25026,7.6.2.1,atp,C7EXK4,atp,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,mutant,V906A,[0.103],1,0.103000,-0.987163
25027,7.6.2.9,atp,P46920,atp,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,mutant,F19W,[1.7],1,1.700000,0.230449
25028,7.6.2.9,atp,P46920,atp,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,mutant,S45C,[0.7],1,0.700000,-0.154902


Download amino acid sequences for all data points

In [None]:
# Save all unique Uniprot IDs
IDs = list(set(km_df["uniprot"]))

with open("esm2/UNIPROT_IDs_brenda_initial.txt", "w") as file:
    for ID in IDs:
        file.write(str(ID) + "\n")

Map Uniprot IDs to amino acid sequence

In [None]:
# Map from https://www.uniprot.org/id-mapping uniprot IDs to amino acid sequences
sequence_df = pd.read_csv(("esm2/idmapping_brenda_initial.tsv"), sep = "\t")

sequence_df.drop(columns = ["From"], inplace = True)
sequence_df.rename(columns={"Entry": "uniprot"}, inplace=True)

sequence_df["Sequence ID"] = "sequence_" + (sequence_df.index).astype(str)
sequence_df

Unnamed: 0,uniprot,Sequence,Kinetics,Sequence ID
0,Q9UTM9,MSRTIVIVGCGVFGLSTAVELAKNHSFDNIIAIDAEPVPSSMSAAN...,BIOPHYSICOCHEMICAL PROPERTIES: Kinetic parame...,sequence_0
1,C4PG45,MAFTFQINNDVQFRHNQALLDKSASYRPILKETQVKAASIVALELD...,,sequence_1
2,E9P8D2,MDAESIEWKLTANLRNGPTFFQPLADSIEPLQFKLIGSDTVATAFP...,BIOPHYSICOCHEMICAL PROPERTIES: Kinetic parame...,sequence_2
3,D7FMJ8,MIDISLAPPFLPALEAEAITHYRRTYHNFRLGKSQGAKGETNQFHV...,,sequence_3
4,Q9M6E2,MAGSTEFVVRSLERVMVAPSQPSPKAFLQLSTLDNLPGVRENIFNT...,,sequence_4
...,...,...,...,...
8871,B9X0T8,MTKILLMLALALGAAGLRWPAAVPQRRATSGRAAGARLERAVGPVA...,,sequence_8871
8872,Q9I297,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,,sequence_8872
8873,A0A5K1QBJ4,,,sequence_8873
8874,Q9XYL9,MPAKLGYWKIRGLQQPVRLLLEYLDEEYEEHLYGRDDREKWLGDKF...,,sequence_8874


In [10]:
# Delete sequences with less than 40 residues or NaN
sequence_df = sequence_df.dropna(subset=["Sequence"])
sequence_df = sequence_df[sequence_df["Sequence"].str.len() >= 40]

sequence_df

Unnamed: 0,uniprot,Sequence,Kinetics,Sequence ID
0,Q9UTM9,MSRTIVIVGCGVFGLSTAVELAKNHSFDNIIAIDAEPVPSSMSAAN...,BIOPHYSICOCHEMICAL PROPERTIES: Kinetic parame...,sequence_0
1,C4PG45,MAFTFQINNDVQFRHNQALLDKSASYRPILKETQVKAASIVALELD...,,sequence_1
2,E9P8D2,MDAESIEWKLTANLRNGPTFFQPLADSIEPLQFKLIGSDTVATAFP...,BIOPHYSICOCHEMICAL PROPERTIES: Kinetic parame...,sequence_2
3,D7FMJ8,MIDISLAPPFLPALEAEAITHYRRTYHNFRLGKSQGAKGETNQFHV...,,sequence_3
4,Q9M6E2,MAGSTEFVVRSLERVMVAPSQPSPKAFLQLSTLDNLPGVRENIFNT...,,sequence_4
...,...,...,...,...
8870,P00962,MSEAEARPTNFIRQIIDEDLASGKHTTVHTRFPPEPNGYLHIGHAK...,,sequence_8870
8871,B9X0T8,MTKILLMLALALGAAGLRWPAAVPQRRATSGRAAGARLERAVGPVA...,,sequence_8871
8872,Q9I297,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,,sequence_8872
8874,Q9XYL9,MPAKLGYWKIRGLQQPVRLLLEYLDEEYEEHLYGRDDREKWLGDKF...,,sequence_8874


Merge dataframes to include enzyme sequences

In [11]:
km_df = pd.read_pickle(join("files","km_df.pkl"))
km_df = km_df.merge(sequence_df, how = "left", on = "uniprot")
km_df = km_df.drop(columns=["Kinetics"])
km_df.rename(columns={"Sequence ID": "sequence_id"}, inplace=True)
km_df

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,mutations,km,km_count,km_gmean,log_km,Sequence,sequence_id
0,1.1.1.1,NADH,D4GSN2,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[],[0.071],1,0.0710,-1.148742,MRAAVLREHGEPLDVTEVPDPTCDADGVVVEVEACGICRSDWHSWM...,sequence_6039
1,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[],[0.056],1,0.0560,-1.251812,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...,sequence_4093
2,1.1.1.1,Isobutyraldehyde,P39462,CC(C)C=O,wild type,[],[0.26],1,0.2600,-0.585027,MRAVRLVEIGKPLSLQEIGVPKPKGPQVLIKVEAAGVCHSDVHMRQ...,sequence_4174
3,1.1.1.1,(R)-2-butanol,P39462,CCC(C)O,wild type,[],[0.41],1,0.4100,-0.387216,MRAVRLVEIGKPLSLQEIGVPKPKGPQVLIKVEAAGVCHSDVHMRQ...,sequence_4174
4,1.1.1.1,(S)-2-butanol,P39462,CCC(C)O,wild type,[],[0.012],1,0.0120,-1.920819,MRAVRLVEIGKPLSLQEIGVPKPKGPQVLIKVEAAGVCHSDVHMRQ...,sequence_4174
...,...,...,...,...,...,...,...,...,...,...,...,...
29069,7.2.1.1,ubiquinone-1,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)C,mutant,G141A,[0.015],1,0.0150,-1.823909,MSSAKELKKSVLAPVLDNNPIALQVLGVCSALAVTTKLETAFVMTL...,sequence_8767
29070,7.2.1.1,ubiquinone,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)CCC=C(C)CCC=...,mutant,G141L,[0.0088],1,0.0088,-2.055517,MSSAKELKKSVLAPVLDNNPIALQVLGVCSALAVTTKLETAFVMTL...,sequence_8767
29071,7.2.1.1,ubiquinone-1,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)C,mutant,G141L,[0.0088],1,0.0088,-2.055517,MSSAKELKKSVLAPVLDNNPIALQVLGVCSALAVTTKLETAFVMTL...,sequence_8767
29072,7.2.1.1,ubiquinone,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)CCC=C(C)CCC=...,mutant,G141V,[0.0234],1,0.0234,-1.630784,MSSAKELKKSVLAPVLDNNPIALQVLGVCSALAVTTKLETAFVMTL...,sequence_8767


In [13]:
km_df["mutations"] = km_df["mutations"].apply(lambda x: np.nan if isinstance(x, list) and not x else x)
km_df = km_df.dropna(subset=["Sequence"]).reset_index(drop=True)
km_df

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,mutations,km,km_count,km_gmean,log_km,Sequence,sequence_id
0,1.1.1.1,NADH,D4GSN2,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,,[0.071],1,0.0710,-1.148742,MRAAVLREHGEPLDVTEVPDPTCDADGVVVEVEACGICRSDWHSWM...,sequence_6039
1,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,,[0.056],1,0.0560,-1.251812,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...,sequence_4093
2,1.1.1.1,Isobutyraldehyde,P39462,CC(C)C=O,wild type,,[0.26],1,0.2600,-0.585027,MRAVRLVEIGKPLSLQEIGVPKPKGPQVLIKVEAAGVCHSDVHMRQ...,sequence_4174
3,1.1.1.1,(R)-2-butanol,P39462,CCC(C)O,wild type,,[0.41],1,0.4100,-0.387216,MRAVRLVEIGKPLSLQEIGVPKPKGPQVLIKVEAAGVCHSDVHMRQ...,sequence_4174
4,1.1.1.1,(S)-2-butanol,P39462,CCC(C)O,wild type,,[0.012],1,0.0120,-1.920819,MRAVRLVEIGKPLSLQEIGVPKPKGPQVLIKVEAAGVCHSDVHMRQ...,sequence_4174
...,...,...,...,...,...,...,...,...,...,...,...,...
28596,7.2.1.1,ubiquinone-1,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)C,mutant,G141A,[0.015],1,0.0150,-1.823909,MSSAKELKKSVLAPVLDNNPIALQVLGVCSALAVTTKLETAFVMTL...,sequence_8767
28597,7.2.1.1,ubiquinone,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)CCC=C(C)CCC=...,mutant,G141L,[0.0088],1,0.0088,-2.055517,MSSAKELKKSVLAPVLDNNPIALQVLGVCSALAVTTKLETAFVMTL...,sequence_8767
28598,7.2.1.1,ubiquinone-1,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)C,mutant,G141L,[0.0088],1,0.0088,-2.055517,MSSAKELKKSVLAPVLDNNPIALQVLGVCSALAVTTKLETAFVMTL...,sequence_8767
28599,7.2.1.1,ubiquinone,A5F5Y6,CC1=C(C(=O)C(=C(C1=O)OC)OC)CC=C(C)CCC=C(C)CCC=...,mutant,G141V,[0.0234],1,0.0234,-1.630784,MSSAKELKKSVLAPVLDNNPIALQVLGVCSALAVTTKLETAFVMTL...,sequence_8767


In [None]:
km_df = km_df.rename(columns={"Sequence":"sequence"})
tot_mutants = (km_df["enzyme_type"] == "mutant").sum()
print(f"Before applying the mutation we have {tot_mutants} total mutants")

Before applying the mutation we have 12404 total mutants


Create mutant sequences

In [None]:
import re

skipped_entries_out_of_range = []
skipped_entries_mismatch = []

def apply_mutations(sequence, mutations, sequence_id):
    if pd.isna(mutations):  
        return sequence # same sequence if the enzyme is a wild type

    mutated_sequence = list(sequence)  
    mutation_list = mutations.split("/")

    for mutation in mutation_list:
        match = re.match(r"([A-Z])(\d+)([A-Z])", mutation)
        if match:
            original_residue, position, new_residue = match.groups()
            position = int(position) - 1  

            if position >= len(mutated_sequence):
                print(f"Mutation position {position + 1} is out of range for Sequence ID: {sequence_id} "
                      f"(Sequence length: {len(mutated_sequence)})")
                skipped_entries_out_of_range.append(sequence_id)  
                return sequence 

            if mutated_sequence[position] == original_residue:
                mutated_sequence[position] = new_residue
            else:
                print(f"Expected {original_residue} at position {position + 1}, "
                      f"found {mutated_sequence[position]} in Sequence ID: {sequence_id}")
                skipped_entries_mismatch.append(sequence_id)  
                return sequence 

    return "".join(mutated_sequence) 

km_df["mutated_sequence"] = km_df.apply(lambda row: apply_mutations(row["sequence"], row["mutations"], row["sequence_id"]), axis=1)

skipped_total_count = len(skipped_entries_out_of_range) + len(skipped_entries_mismatch)
km_df = km_df[~km_df["sequence_id"].isin(set(skipped_entries_out_of_range + skipped_entries_mismatch))].reset_index(drop=True)

Expected G at position 211, found C in Sequence ID: sequence_7878
Expected H at position 1, found M in Sequence ID: sequence_2579
Expected H at position 1, found M in Sequence ID: sequence_2579
Expected H at position 1, found M in Sequence ID: sequence_2579
Expected H at position 1, found M in Sequence ID: sequence_2579
Expected H at position 1, found M in Sequence ID: sequence_2579
Expected H at position 1, found M in Sequence ID: sequence_2579
Expected Y at position 25, found A in Sequence ID: sequence_8793
Expected G at position 211, found D in Sequence ID: sequence_6039
Expected A at position 25, found Y in Sequence ID: sequence_8793
Expected A at position 25, found G in Sequence ID: sequence_4174
Expected V at position 254, found I in Sequence ID: sequence_5880
Mutation position 254 is out of range for Sequence ID: sequence_5895 (Sequence length: 146)
Expected V at position 254, found I in Sequence ID: sequence_5880
Mutation position 254 is out of range for Sequence ID: sequence_5

In [118]:
print(f"We dropped {len(skipped_entries_out_of_range)} entries because their mutation index was out of range,\n"
      f"and {len(skipped_entries_mismatch)} entries because their registered mutation did not match the sequence residue.\n"
      f"Total discarded entries: {skipped_total_count}")

We dropped 107 entries because their mutation index was out of range,
and 3738 entries because their registered mutation did not match the sequence residue.
Total discarded entries: 3845


In [None]:
km_df.to_pickle(join("files","dataframe_mutations.pkl"))
km_df

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,mutations,km,km_count,km_gmean,log_km,sequence_id,sequence,mutated_sequence
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,,[0.056],1,0.056,-1.251812,sequence_4093,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...
1,1.1.1.1,NADH,B2ZRE3,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,,[0.01],1,0.010,-2.000000,sequence_7365,MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYH...,MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYH...
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,,[5.7],1,5.700,0.755875,sequence_6112,MKAALVYEPLGNENLRIEDVDDPKVLDGQVLIEVRKAGLNPVDYNT...,MKAALVYEPLGNENLRIEDVDDPKVLDGQVLIEVRKAGLNPVDYNT...
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,,[17.0],1,17.000,1.230449,sequence_5419,MFRLARAQTSITTTSKALGGSRRLFVRLNSSFAIPESQKGVIFYEN...,MFRLARAQTSITTTSKALGGSRRLFVRLNSSFAIPESQKGVIFYEN...
4,1.1.1.1,2-hydroxymethylpyrene,P08319,C1=CC2=C3C(=C1)C=CC4=C3C(=CC(=C4)CO)C=C2,wild type,,[0.033],1,0.033,-1.481486,sequence_750,MGTKGKVIKCKAAIAWEAGKPLCIEEVEVAPPKAHEVRIQIIATSL...,MGTKGKVIKCKAAIAWEAGKPLCIEEVEVAPPKAHEVRIQIIATSL...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18975,6.4.1.3,butyryl-CoA,Q9X4K7,CCCC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP...,mutant,D422I,[0.317],1,0.317,-0.498941,sequence_794,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...
18976,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,G423A,[0.179],1,0.179,-0.747147,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...
18977,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,F417Y/G423A,[0.315],1,0.315,-0.501689,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...
18978,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,R51A,[0.309],1,0.309,-0.510042,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...


In [120]:
def check_mutation_in_row(df, row_index):
    import re

    try:
        row = df.loc[row_index]
        wt_seq = row["sequence"]
        mut_seq = row["mutated_sequence"]
        mutation = row["mutations"]

        match = re.match(r"([A-Z])(\d+)([A-Z])", mutation)
        wt_aa, pos_str, mut_aa = match.groups()
        pos = int(pos_str) - 1 

        # Extract actual amino acids
        actual_wt = wt_seq[pos]
        actual_mut = mut_seq[pos]

        # Check 
        correct = (actual_wt == wt_aa) and (actual_mut == mut_aa)

        return {
            "row_index": row_index,
            "mutation": mutation,
            "position": pos + 1,
            "expected_wt": wt_aa,
            "expected_mut": mut_aa,
            "actual_wt": actual_wt,
            "actual_mut": actual_mut,
            "mutation_correct": correct}
    
    except Exception as e:
        return f"Error processing row {row_index}: not a mutant"

In [159]:
check_mutation_in_row(km_df, 13884) # 13884 on are mutants

{'row_index': 13884,
 'mutation': 'A25Y',
 'position': 25,
 'expected_wt': 'A',
 'expected_mut': 'Y',
 'actual_wt': 'A',
 'actual_mut': 'Y',
 'mutation_correct': True}

In [155]:
slice_df = km_df.iloc[13884:]  # mutants
shortest_idx = slice_df["sequence"].str.len().idxmin()
print("Index of the shortest sequence:", shortest_idx) #to check 

Index of the shortest sequence: 17734


In [156]:
print(km_df.loc[17734,"sequence"]) # check single mutation
print(km_df.loc[17734,"mutated_sequence"])

MEKQCSKFIVSGHVQGVGFCYHTSHQGLKLGLTGYAKNLNNGDVEVVACGTPERLEELYLWLQEGPKTASVRQVRRLSSELEHDYQGFEIL
MEKQCSKFIVSGHVQGVGFRYHTSHQGLKLGLTGYAKNLNNGDVEVVACGTPERLEELYLWLQEGPKTASVRQVRRLSSELEHDYQGFEIL


In [157]:
print(km_df.loc[18977,"sequence"]) # to check double mutations
print(km_df.loc[18977,"mutated_sequence"])

MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQARHSARGKLLVRERINRLLDPGSPFLELSALAAHEVYGEEVAAAGIVAGIGRVEGVECMIVGNDATVKGGTYYPLTVKKHLRAQAIALENRLPCIYLVDSGGANLPRQDEVFPDREHFGRIFFNQANMSARGIPQIAVVMGSCTAGGAYVPAMSDETVMVREQATIFLAGPPLVKAATGEVVSAEELGGADVHCKVSGVADHYAEDDDHALAIARRCVANLNWRKQGQLQCRAPRAPLYPAEELYGVIPADSKQPYDVREVIARLVDGSEFDEFKALFGTTLVCGFAHLHGYPIAILANNGILFAEAAQKGAHFIELACQRGIPLLFLQNITGFMVGQKYEAGGIAKHGAKLVTAVACARVPKFTVLIGGSFGAGNYGMCGRAYDPRFLWMWPNARIGVMGGEQAAGVLAQVKREQAERAGQQLGVEEEAKIKAPILEQYEHQGHPYYSSARLWDDGVIDPAQTREVLALALSAALNAPIEPTAFGVFRM
MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQARHSARGKLLVRERINRLLDPGSPFLELSALAAHEVYGEEVAAAGIVAGIGRVEGVECMIVGNDATVKGGTYYPLTVKKHLRAQAIALENRLPCIYLVDSGGANLPRQDEVFPDREHFGRIFFNQANMSARGIPQIAVVMGSCTAGGAYVPAMSDETVMVREQATIFLAGPPLVKAATGEVVSAEELGGADVHCKVSGVADHYAEDDDHALAIARRCVANLNWRKQGQLQCRAPRAPLYPAEELYGVIPADSKQPYDVREVIARLVDGSEFDEFKALFGTTLVCGFAHLHGYPIAILANNGILFAEAAQKGAHFIELACQRGIPLLFLQNITGFMVGQKYEAGGIAKHGAKLVTAVACARVPKFTVLIGGSYGAGNYAMCGRAYDPRFLWMWPNARIGVMGGEQAAGVLAQVKREQAER

Write fasta files

In [None]:
import pandas as pd 

data = km_df.copy()
set_sequences = data[["sequence_id", "sequence"]].drop_duplicates()

path_to_folder = "./" 

output_file = path_to_folder + "sequences.fasta"

with open(output_file, "w") as f:
    for i, row in set_sequences.iterrows():
        f.write(f">{row["sequence_id"]}\n")  
        f.write(f"{row["sequence"]}\n")

print(f"FASTA file saved as {output_file}")

FASTA file saved as ./sequences.fasta


In [179]:
set_sequences

Unnamed: 0,sequence_id,sequence
0,sequence_4093,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...
1,sequence_7365,MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYH...
2,sequence_6112,MKAALVYEPLGNENLRIEDVDDPKVLDGQVLIEVRKAGLNPVDYNT...
3,sequence_5419,MFRLARAQTSITTTSKALGGSRRLFVRLNSSFAIPESQKGVIFYEN...
4,sequence_750,MGTKGKVIKCKAAIAWEAGKPLCIEEVEVAPPKAHEVRIQIIATSL...
...,...,...
18732,sequence_1781,MSLQYHVLNSIPSTTFLSSTKTTISSSFLTISGSPLNVARDKSRSG...
18765,sequence_2966,MDKKPLNTLISATGLWMSRTGTIHKIKHHEVSRSKIYIEMACGDHL...
18799,sequence_8776,MSQPENESGLPFQEKVVPELLKHRLVTPEEYLRIHKKTVENYQEYW...
18895,sequence_4306,MNLRHIIKQNHLELLFQQGSFGLEKESQRVRHDGSVVTSAHPKAFG...


In [180]:
import pandas as pd 

data = km_df.copy()
mutants = data[data["enzyme_type"] == "mutant"]
set_mutants = mutants[["mutations", "mutated_sequence"]].drop_duplicates()
set_mutants["mut_sequence_id"] = ["sequence_" + str(i) for i in range(len(set_mutants))]

path_to_folder = "./"  #current dir
output_file = path_to_folder + "mutated_sequences.fasta"

with open(output_file, "w") as f:
    for i, row in set_mutants.iterrows():
        f.write(f">{row["mut_sequence_id"]}\n")  
        f.write(f"{row["mutated_sequence"]}\n")

print(f"FASTA file saved as {output_file}") 

FASTA file saved as ./mutated_sequences.fasta


In [181]:
set_mutants

Unnamed: 0,mutations,mutated_sequence,mut_sequence_id
13884,A25Y,MKAAVLHEFGQSLQIEEVDIPTPGYGEIVVKMQASGVCHTDLHAVE...,sequence_0
13885,C243S,MSTTGQIIRCKAAVAWEAGKPLVIEEVEVAPPQKHEVRIKILFTSL...,sequence_1
13886,A25Y/Y25A/W49F/W167Y,MKAAVLHEFGQSLQIEEVDIPTPGAGEIVVKMQASGVCHTDLHAVE...,sequence_2
13887,C138A,MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,sequence_3
13888,N107D,MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,sequence_4
...,...,...,...
18973,D422L,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...,sequence_3517
18976,G423A,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,sequence_3518
18977,F417Y/G423A,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,sequence_3519
18978,R51A,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,sequence_3520


### Calculate enzyme representations with ESM2 model

Wild type

In [9]:
import torch 

# embeddings created on HPC
df_embeddings = pd.read_pickle(join("esm2","esm2_embeddings.pkl") )
df_embeddings


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
sequence_6192,0.036674,-0.067815,0.002455,-0.041538,-0.017318,-0.117273,0.090007,-0.173834,-0.055330,0.033040,...,-0.039567,-0.054645,0.005367,-0.006952,-0.020008,0.020537,0.110500,-0.068274,0.010587,0.013656
sequence_8330,-0.032552,0.009637,-0.009749,0.049361,-0.075588,0.013456,0.051721,-0.151665,-0.013161,0.055164,...,-0.007382,0.044071,-0.113119,0.000762,-0.066038,0.159967,-0.043443,-0.108785,-0.014618,0.097665
sequence_5082,-0.013258,-0.046236,0.049629,0.049162,-0.000829,-0.037967,0.018167,0.017215,-0.008251,0.110621,...,0.003913,-0.003081,-0.149153,-0.010410,0.050787,-0.050004,0.015199,-0.204153,0.028395,0.006605
sequence_4211,0.006071,-0.052948,-0.082423,-0.119677,0.041080,-0.099962,0.095571,-0.021064,0.036480,0.011532,...,0.000552,-0.079671,0.009289,-0.058665,-0.016481,0.084028,0.095536,-0.131930,0.029925,-0.062902
sequence_160,0.062007,-0.026618,0.060861,0.003348,-0.029485,-0.071353,0.037760,-0.153333,-0.038834,0.051100,...,-0.069185,-0.131539,-0.034923,-0.046858,-0.044474,0.008002,0.073027,-0.137378,0.051336,0.058667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sequence_5024,0.117810,-0.022722,-0.024599,-0.022583,-0.060000,0.055131,-0.089376,-0.134770,0.000008,0.113228,...,0.061024,0.010855,-0.079373,0.114981,0.069105,-0.095559,0.146723,-0.143707,-0.022493,-0.011926
sequence_5149,-0.002663,-0.054902,-0.039496,0.099608,-0.060603,0.001114,0.075823,-0.012113,-0.056139,0.114702,...,-0.008789,0.029089,-0.116983,0.063976,-0.025435,-0.055899,0.076090,-0.170878,-0.018385,0.009253
sequence_2482,-0.064027,-0.137698,0.020514,-0.025607,-0.160139,0.060539,-0.019756,-0.148470,-0.012079,-0.051131,...,0.001284,-0.080147,-0.016309,0.020663,-0.067223,-0.006531,0.098793,-0.151104,0.061029,0.071473
sequence_3853,0.024918,-0.091246,0.000215,0.046039,-0.082471,-0.007025,0.060215,-0.152614,-0.003692,0.007881,...,-0.016311,-0.086545,-0.083098,-0.059569,-0.014365,0.006217,0.055936,-0.257766,0.017276,0.085767


In [183]:
sequence_ids = df_embeddings.index.tolist() 
embeddings = torch.tensor(df_embeddings.values)  

embeddings_list = embeddings.tolist()  

df_esm2_sequences = pd.DataFrame({"sequence_id": sequence_ids, "esm2": embeddings_list})
df_esm2_sequences

Unnamed: 0,sequence_id,esm2
0,sequence_6192,"[0.03667372465133667, -0.06781544536352158, 0...."
1,sequence_8330,"[-0.03255162015557289, 0.009637461043894291, -..."
2,sequence_5082,"[-0.013258455321192741, -0.04623601958155632, ..."
3,sequence_4211,"[0.006071204319596291, -0.05294770747423172, -..."
4,sequence_160,"[0.06200714781880379, -0.026618069037795067, 0..."
...,...,...
5841,sequence_5024,"[0.11780980974435806, -0.02272235043346882, -0..."
5842,sequence_5149,"[-0.0026628402993083, -0.054901983588933945, -..."
5843,sequence_2482,"[-0.06402728706598282, -0.13769802451133728, 0..."
5844,sequence_3853,"[0.024918004870414734, -0.09124615788459778, 0..."


In [185]:
wildtype_df = km_df[km_df["enzyme_type"] == "wild type"]
wildtype_df.drop(columns = ["mutations", "mutated_sequence"], inplace=True)
wildtype_df

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,sequence
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...
1,1.1.1.1,NADH,B2ZRE3,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[0.01],1,0.01000,-2.000000,sequence_7365,MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYH...
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,MKAALVYEPLGNENLRIEDVDDPKVLDGQVLIEVRKAGLNPVDYNT...
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,MFRLARAQTSITTTSKALGGSRRLFVRLNSSFAIPESQKGVIFYEN...
4,1.1.1.1,2-hydroxymethylpyrene,P08319,C1=CC2=C3C(=C1)C=CC4=C3C(=CC(=C4)CO)C=C2,wild type,[0.033],1,0.03300,-1.481486,sequence_750,MGTKGKVIKCKAAIAWEAGKPLCIEEVEVAPPKAHEVRIQIIATSL...
...,...,...,...,...,...,...,...,...,...,...,...
13879,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,MSSVNKDTIHVAERSLHKEHLTEGGNMAFHNHLNDFAHIEDPLERR...
13880,7.3.2.7,Sb3+,P30632,[Sb+3],wild type,[4e-05],1,0.00004,-4.397940,sequence_2898,MSDQLEASIKNILEQKTLKWIFVGGKGGVGKTTCSCSLAAQLSKVR...
13881,7.4.2.14,8-azido-ATP,Q03518,C1=NC(=C2C(=N1)N(C(=N2)N=[N+]=[N-])C3C(C(C(O3)...,wild type,[0.103],1,0.10300,-0.987163,sequence_3327,MASSRCPAPRGCRCLPGASLAWLGTVLLLLADWVLLRTALPRIFSL...
13882,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,MVKQSKRPDFEEELLEEQPERYPGDTNFQKWGFDLHPQVAPISGGL...


In [186]:
wildtype_df = wildtype_df.merge(df_esm2_sequences, on="sequence_id", how="left")
wildtype_df 

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,sequence,esm2
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...,"[0.039680514484643936, -0.008533245883882046, ..."
1,1.1.1.1,NADH,B2ZRE3,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[0.01],1,0.01000,-2.000000,sequence_7365,MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYH...,"[0.0005186749622225761, -0.0679105892777443, -..."
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,MKAALVYEPLGNENLRIEDVDDPKVLDGQVLIEVRKAGLNPVDYNT...,"[-0.009646818973124027, -0.012489932589232922,..."
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,MFRLARAQTSITTTSKALGGSRRLFVRLNSSFAIPESQKGVIFYEN...,"[0.007409711368381977, -0.049554433673620224, ..."
4,1.1.1.1,2-hydroxymethylpyrene,P08319,C1=CC2=C3C(=C1)C=CC4=C3C(=CC(=C4)CO)C=C2,wild type,[0.033],1,0.03300,-1.481486,sequence_750,MGTKGKVIKCKAAIAWEAGKPLCIEEVEVAPPKAHEVRIQIIATSL...,"[-0.014221813529729843, -0.03296571969985962, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
13879,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,MSSVNKDTIHVAERSLHKEHLTEGGNMAFHNHLNDFAHIEDPLERR...,"[-0.040915098041296005, 0.0022725595626980066,..."
13880,7.3.2.7,Sb3+,P30632,[Sb+3],wild type,[4e-05],1,0.00004,-4.397940,sequence_2898,MSDQLEASIKNILEQKTLKWIFVGGKGGVGKTTCSCSLAAQLSKVR...,"[0.03835789114236832, -0.16177919507026672, -0..."
13881,7.4.2.14,8-azido-ATP,Q03518,C1=NC(=C2C(=N1)N(C(=N2)N=[N+]=[N-])C3C(C(C(O3)...,wild type,[0.103],1,0.10300,-0.987163,sequence_3327,MASSRCPAPRGCRCLPGASLAWLGTVLLLLADWVLLRTALPRIFSL...,"[0.027131887152791023, -0.0465911403298378, -0..."
13882,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,MVKQSKRPDFEEELLEEQPERYPGDTNFQKWGFDLHPQVAPISGGL...,"[0.010143788531422615, -0.04845583438873291, -..."


In [187]:
wildtype_df.to_pickle(join("files","final_wildtype_df.pkl"))

Mutants

In [188]:
# embeddings created on HPC
df_embeddings = pd.read_pickle(join("esm2","esm2_mutated_embeddings.pkl") )

sequence_ids = df_embeddings.index.tolist() 
embeddings = torch.tensor(df_embeddings.values)  

embeddings_list = embeddings.tolist()  

df_esm2_mutated_sequences = pd.DataFrame({"sequence_id": sequence_ids, "esm2_mutations": embeddings_list})
df_esm2_mutated_sequences

Unnamed: 0,sequence_id,esm2_mutations
0,sequence_730,"[-0.020675646141171455, -0.044364944100379944,..."
1,sequence_160,"[-0.045619722455739975, -0.07153552770614624, ..."
2,sequence_1761,"[0.06175371631979942, -0.037848398089408875, -..."
3,sequence_1962,"[0.02871672809123993, 0.029217060655355453, -0..."
4,sequence_2909,"[0.015058504417538643, -0.10007670521736145, -..."
...,...,...
3517,sequence_1919,"[0.07903542369604111, -0.007479571737349033, -..."
3518,sequence_1150,"[-0.015877744182944298, 0.0044091977179050446,..."
3519,sequence_475,"[-0.007585270330309868, -0.03660120069980621, ..."
3520,sequence_2769,"[0.06510938704013824, -0.06140793114900589, 0...."


In [189]:
df_esm2_mutated_sequences = df_esm2_mutated_sequences.rename(columns={"sequence_id":"mut_sequence_id"})
df_esm2_mutated_sequences

Unnamed: 0,mut_sequence_id,esm2_mutations
0,sequence_730,"[-0.020675646141171455, -0.044364944100379944,..."
1,sequence_160,"[-0.045619722455739975, -0.07153552770614624, ..."
2,sequence_1761,"[0.06175371631979942, -0.037848398089408875, -..."
3,sequence_1962,"[0.02871672809123993, 0.029217060655355453, -0..."
4,sequence_2909,"[0.015058504417538643, -0.10007670521736145, -..."
...,...,...
3517,sequence_1919,"[0.07903542369604111, -0.007479571737349033, -..."
3518,sequence_1150,"[-0.015877744182944298, 0.0044091977179050446,..."
3519,sequence_475,"[-0.007585270330309868, -0.03660120069980621, ..."
3520,sequence_2769,"[0.06510938704013824, -0.06140793114900589, 0...."


In [190]:
mutant_df = km_df[km_df["enzyme_type"] == "mutant"]
mutant_df

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,mutations,km,km_count,km_gmean,log_km,sequence_id,sequence,mutated_sequence
13884,1.1.1.1,benzyl alcohol,Q8GIX7,C1=CC=C(C=C1)CO,mutant,A25Y,[16.0],1,16.000,1.204120,sequence_3432,MKAAVLHEFGQSLQIEEVDIPTPGAGEIVVKMQASGVCHTDLHAVE...,MKAAVLHEFGQSLQIEEVDIPTPGYGEIVVKMQASGVCHTDLHAVE...
13885,1.1.1.1,ethanol,P06525,CCO,mutant,C243S,[0.251],1,0.251,-0.600326,sequence_4444,MSTTGQIIRCKAAVAWEAGKPLVIEEVEVAPPQKHEVRIKILFTSL...,MSTTGQIIRCKAAVAWEAGKPLVIEEVEVAPPQKHEVRIKILFTSL...
13886,1.1.1.1,NAD+,Q8GIX7,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,mutant,A25Y/Y25A/W49F/W167Y,[0.5],1,0.500,-0.301030,sequence_3432,MKAAVLHEFGQSLQIEEVDIPTPGAGEIVVKMQASGVCHTDLHAVE...,MKAAVLHEFGQSLQIEEVDIPTPGAGEIVVKMQASGVCHTDLHAVE...
13887,1.1.1.10,diacetyl,Q7Z4W1,CC(=O)C(=O)C,mutant,C138A,[0.22],1,0.220,-0.657577,sequence_3160,MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...
13888,1.1.1.10,diacetyl,Q7Z4W1,CC(=O)C(=O)C,mutant,N107D,[26.0],1,26.000,1.414973,sequence_3160,MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18975,6.4.1.3,butyryl-CoA,Q9X4K7,CCCC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP...,mutant,D422I,[0.317],1,0.317,-0.498941,sequence_794,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...
18976,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,G423A,[0.179],1,0.179,-0.747147,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...
18977,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,F417Y/G423A,[0.315],1,0.315,-0.501689,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...
18978,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,R51A,[0.309],1,0.309,-0.510042,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...


In [191]:
mutant_dataframe = set_mutants.merge(df_esm2_mutated_sequences, on="mut_sequence_id", how="left")
mutant_df = mutant_df.merge(mutant_dataframe[["mutated_sequence","esm2_mutations"]], on="mutated_sequence", how="left")
mutant_df

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,mutations,km,km_count,km_gmean,log_km,sequence_id,sequence,mutated_sequence,esm2_mutations
0,1.1.1.1,benzyl alcohol,Q8GIX7,C1=CC=C(C=C1)CO,mutant,A25Y,[16.0],1,16.000,1.204120,sequence_3432,MKAAVLHEFGQSLQIEEVDIPTPGAGEIVVKMQASGVCHTDLHAVE...,MKAAVLHEFGQSLQIEEVDIPTPGYGEIVVKMQASGVCHTDLHAVE...,"[-0.014810403808951378, -0.09746553748846054, ..."
1,1.1.1.1,ethanol,P06525,CCO,mutant,C243S,[0.251],1,0.251,-0.600326,sequence_4444,MSTTGQIIRCKAAVAWEAGKPLVIEEVEVAPPQKHEVRIKILFTSL...,MSTTGQIIRCKAAVAWEAGKPLVIEEVEVAPPQKHEVRIKILFTSL...,"[-0.024963846430182457, -0.05664348974823952, ..."
2,1.1.1.1,NAD+,Q8GIX7,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,mutant,A25Y/Y25A/W49F/W167Y,[0.5],1,0.500,-0.301030,sequence_3432,MKAAVLHEFGQSLQIEEVDIPTPGAGEIVVKMQASGVCHTDLHAVE...,MKAAVLHEFGQSLQIEEVDIPTPGAGEIVVKMQASGVCHTDLHAVE...,"[-0.014488689601421356, -0.08690503239631653, ..."
3,1.1.1.10,diacetyl,Q7Z4W1,CC(=O)C(=O)C,mutant,C138A,[0.22],1,0.220,-0.657577,sequence_3160,MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,"[-0.0629732757806778, -0.07528746873140335, 0...."
4,1.1.1.10,diacetyl,Q7Z4W1,CC(=O)C(=O)C,mutant,N107D,[26.0],1,26.000,1.414973,sequence_3160,MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,MELFLAGRRVLVTGAGKGIGRGTVQALHATGARVVAVSRTQADLDS...,"[-0.0645524114370346, -0.06851803511381149, 0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5100,6.4.1.3,butyryl-CoA,Q9X4K7,CCCC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP...,mutant,D422I,[0.317],1,0.317,-0.498941,sequence_794,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...,"[-0.020847810432314873, -0.027564095333218575,..."
5101,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,G423A,[0.179],1,0.179,-0.747147,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,"[0.015681099146604538, -0.12168556451797485, -..."
5102,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,F417Y/G423A,[0.315],1,0.315,-0.501689,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,"[0.01609487272799015, -0.12138516455888748, -0..."
5103,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,R51A,[0.309],1,0.309,-0.510042,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,"[0.017652757465839386, -0.12387675046920776, -..."


In [192]:
mutant_df.to_pickle(join("files","final_mutant_df.pkl"))

Concatenate wiltype and mutant dataframes

In [None]:
final_km_df = pd.concat([wildtype_df, mutant_df])
final_km_df

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,sequence,esm2,mutations,mutated_sequence,esm2_mutations
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.056,-1.251812,sequence_4093,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...,"[0.039680514484643936, -0.008533245883882046, ...",,,
1,1.1.1.1,NADH,B2ZRE3,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[0.01],1,0.010,-2.000000,sequence_7365,MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYH...,"[0.0005186749622225761, -0.0679105892777443, -...",,,
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.700,0.755875,sequence_6112,MKAALVYEPLGNENLRIEDVDDPKVLDGQVLIEVRKAGLNPVDYNT...,"[-0.009646818973124027, -0.012489932589232922,...",,,
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.000,1.230449,sequence_5419,MFRLARAQTSITTTSKALGGSRRLFVRLNSSFAIPESQKGVIFYEN...,"[0.007409711368381977, -0.049554433673620224, ...",,,
4,1.1.1.1,2-hydroxymethylpyrene,P08319,C1=CC2=C3C(=C1)C=CC4=C3C(=CC(=C4)CO)C=C2,wild type,[0.033],1,0.033,-1.481486,sequence_750,MGTKGKVIKCKAAIAWEAGKPLCIEEVEVAPPKAHEVRIQIIATSL...,"[-0.014221813529729843, -0.03296571969985962, ...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5100,6.4.1.3,butyryl-CoA,Q9X4K7,CCCC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP...,mutant,[0.317],1,0.317,-0.498941,sequence_794,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...,,D422I,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...,"[-0.020847810432314873, -0.027564095333218575,..."
5101,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,[0.179],1,0.179,-0.747147,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,,G423A,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,"[0.015681099146604538, -0.12168556451797485, -..."
5102,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,[0.315],1,0.315,-0.501689,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,,F417Y/G423A,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,"[0.01609487272799015, -0.12138516455888748, -0..."
5103,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,[0.309],1,0.309,-0.510042,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,,R51A,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,"[0.017652757465839386, -0.12387675046920776, -..."


In [194]:
final_km_df.reset_index(inplace=True)
final_km_df.to_pickle(join("files","final_km_df.pkl"))
final_km_df

Unnamed: 0,index,ec,substrate,uniprot,smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,sequence,esm2,mutations,mutated_sequence,esm2_mutations
0,0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.056,-1.251812,sequence_4093,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...,"[0.039680514484643936, -0.008533245883882046, ...",,,
1,1,1.1.1.1,NADH,B2ZRE3,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[0.01],1,0.010,-2.000000,sequence_7365,MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYH...,"[0.0005186749622225761, -0.0679105892777443, -...",,,
2,2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.700,0.755875,sequence_6112,MKAALVYEPLGNENLRIEDVDDPKVLDGQVLIEVRKAGLNPVDYNT...,"[-0.009646818973124027, -0.012489932589232922,...",,,
3,3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.000,1.230449,sequence_5419,MFRLARAQTSITTTSKALGGSRRLFVRLNSSFAIPESQKGVIFYEN...,"[0.007409711368381977, -0.049554433673620224, ...",,,
4,4,1.1.1.1,2-hydroxymethylpyrene,P08319,C1=CC2=C3C(=C1)C=CC4=C3C(=CC(=C4)CO)C=C2,wild type,[0.033],1,0.033,-1.481486,sequence_750,MGTKGKVIKCKAAIAWEAGKPLCIEEVEVAPPKAHEVRIQIIATSL...,"[-0.014221813529729843, -0.03296571969985962, ...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18984,5100,6.4.1.3,butyryl-CoA,Q9X4K7,CCCC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP...,mutant,[0.317],1,0.317,-0.498941,sequence_794,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...,,D422I,MSEPEEQQPDIHTTAGKLADLRRRIEEATHAGSARAVEKQHAKGKL...,"[-0.020847810432314873, -0.027564095333218575,..."
18985,5101,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,[0.179],1,0.179,-0.747147,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,,G423A,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,"[0.015681099146604538, -0.12168556451797485, -..."
18986,5102,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,[0.315],1,0.315,-0.501689,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,,F417Y/G423A,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,"[0.01609487272799015, -0.12138516455888748, -0..."
18987,5103,6.4.1.4,3-methylcrotonoyl-CoA,Q9I297,CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)...,mutant,[0.309],1,0.309,-0.510042,sequence_8872,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,,R51A,MAILHTQINPRSAEFAANAATMLEQVNALRTLLGRIHEGGGSAAQA...,"[0.017652757465839386, -0.12387675046920776, -..."


In [None]:
final_km_df= pd.read_pickle(join("files","final_km_df.pkl"))

Add ChemBerta2

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-10M-MTR")
model = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-10M-MTR")

def ChemBERTa2(smiles):
    inputs = tokenizer(smiles, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs, output_hidden_states=True)["hidden_states"][-1]
    embedding = outputs[0][1:-1].mean(dim=0).detach().numpy()
    return embedding

df = pd.read_pickle(join("files","final_wildtype_df.pkl"))
df["ChemBERTa2"] = df["smiles"].apply(ChemBERTa2)
df

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at DeepChem/ChemBERTa-10M-MTR and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence,esm2,ChemBERTa2
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...,"[0.039680514484643936, -0.008533245883882046, ...","[-0.117441595, -0.47532502, -0.27592745, 0.080..."
1,1.1.1.1,NADH,B2ZRE3,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[0.01],1,0.01000,-2.000000,MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYH...,"[0.0005186749622225761, -0.0679105892777443, -...","[0.717974, -0.5367382, 0.14527474, 0.5770402, ..."
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,MKAALVYEPLGNENLRIEDVDDPKVLDGQVLIEVRKAGLNPVDYNT...,"[-0.009646818973124027, -0.012489932589232922,...","[-0.117441595, -0.47532502, -0.27592745, 0.080..."
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,MFRLARAQTSITTTSKALGGSRRLFVRLNSSFAIPESQKGVIFYEN...,"[0.007409711368381977, -0.049554433673620224, ...","[-0.117441595, -0.47532502, -0.27592745, 0.080..."
4,1.1.1.1,2-hydroxymethylpyrene,P08319,C1=CC2=C3C(=C1)C=CC4=C3C(=CC(=C4)CO)C=C2,wild type,[0.033],1,0.03300,-1.481486,MGTKGKVIKCKAAIAWEAGKPLCIEEVEVAPPKAHEVRIQIIATSL...,"[-0.014221813529729843, -0.03296571969985962, ...","[-0.086237095, 0.48899588, 0.5615026, -0.64097..."
...,...,...,...,...,...,...,...,...,...,...,...,...
13888,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,MSSVNKDTIHVAERSLHKEHLTEGGNMAFHNHLNDFAHIEDPLERR...,"[-0.040915098041296005, 0.0022725595626980066,...","[0.43881714, -0.47575158, 0.30838424, 0.868043..."
13889,7.3.2.7,Sb3+,P30632,[Sb+3],wild type,[4e-05],1,0.00004,-4.397940,MSDQLEASIKNILEQKTLKWIFVGGKGGVGKTTCSCSLAAQLSKVR...,"[0.03835789114236832, -0.16177919507026672, -0...","[-0.059656713, -0.74262524, -0.1560827, 0.3256..."
13890,7.4.2.14,8-azido-ATP,Q03518,C1=NC(=C2C(=N1)N(C(=N2)N=[N+]=[N-])C3C(C(C(O3)...,wild type,[0.103],1,0.10300,-0.987163,MASSRCPAPRGCRCLPGASLAWLGTVLLLLADWVLLRTALPRIFSL...,"[0.027131887152791023, -0.0465911403298378, -0...","[0.9551751, -0.5000047, -0.34930488, 0.9231899..."
13891,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,MVKQSKRPDFEEELLEEQPERYPGDTNFQKWGFDLHPQVAPISGGL...,"[0.010143788531422615, -0.04845583438873291, -...","[-0.044458658, -0.50271213, -0.06864436, 0.128..."


In [7]:
df["esm2_ChemBERTa2"] = df.apply(lambda row: list(row["esm2"]) + list(row["ChemBERTa2"])
                                                           
    if isinstance(row["esm2"], (list, np.ndarray)) and isinstance(row["ChemBERTa2"], (list, np.ndarray))
    else None, axis=1)

df = df[df["esm2_ChemBERTa2"].notna()]
df

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence,esm2,ChemBERTa2,esm2_ChemBERTa2
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...,"[0.039680514484643936, -0.008533245883882046, ...","[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.039680514484643936, -0.008533245883882046, ..."
1,1.1.1.1,NADH,B2ZRE3,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[0.01],1,0.01000,-2.000000,MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYH...,"[0.0005186749622225761, -0.0679105892777443, -...","[0.717974, -0.5367382, 0.14527474, 0.5770402, ...","[0.0005186749622225761, -0.0679105892777443, -..."
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,MKAALVYEPLGNENLRIEDVDDPKVLDGQVLIEVRKAGLNPVDYNT...,"[-0.009646818973124027, -0.012489932589232922,...","[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.009646818973124027, -0.012489932589232922,..."
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,MFRLARAQTSITTTSKALGGSRRLFVRLNSSFAIPESQKGVIFYEN...,"[0.007409711368381977, -0.049554433673620224, ...","[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.007409711368381977, -0.049554433673620224, ..."
4,1.1.1.1,2-hydroxymethylpyrene,P08319,C1=CC2=C3C(=C1)C=CC4=C3C(=CC(=C4)CO)C=C2,wild type,[0.033],1,0.03300,-1.481486,MGTKGKVIKCKAAIAWEAGKPLCIEEVEVAPPKAHEVRIQIIATSL...,"[-0.014221813529729843, -0.03296571969985962, ...","[-0.086237095, 0.48899588, 0.5615026, -0.64097...","[-0.014221813529729843, -0.03296571969985962, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13888,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,MSSVNKDTIHVAERSLHKEHLTEGGNMAFHNHLNDFAHIEDPLERR...,"[-0.040915098041296005, 0.0022725595626980066,...","[0.43881714, -0.47575158, 0.30838424, 0.868043...","[-0.040915098041296005, 0.0022725595626980066,..."
13889,7.3.2.7,Sb3+,P30632,[Sb+3],wild type,[4e-05],1,0.00004,-4.397940,MSDQLEASIKNILEQKTLKWIFVGGKGGVGKTTCSCSLAAQLSKVR...,"[0.03835789114236832, -0.16177919507026672, -0...","[-0.059656713, -0.74262524, -0.1560827, 0.3256...","[0.03835789114236832, -0.16177919507026672, -0..."
13890,7.4.2.14,8-azido-ATP,Q03518,C1=NC(=C2C(=N1)N(C(=N2)N=[N+]=[N-])C3C(C(C(O3)...,wild type,[0.103],1,0.10300,-0.987163,MASSRCPAPRGCRCLPGASLAWLGTVLLLLADWVLLRTALPRIFSL...,"[0.027131887152791023, -0.0465911403298378, -0...","[0.9551751, -0.5000047, -0.34930488, 0.9231899...","[0.027131887152791023, -0.0465911403298378, -0..."
13891,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,MVKQSKRPDFEEELLEEQPERYPGDTNFQKWGFDLHPQVAPISGGL...,"[0.010143788531422615, -0.04845583438873291, -...","[-0.044458658, -0.50271213, -0.06864436, 0.128...","[0.010143788531422615, -0.04845583438873291, -..."


In [8]:
df.to_pickle(join("files", "df_PubChem.pkl"))

### b) Retrive SMILES from BioServices and RDKit

In [2]:
wildtype_df = pd.read_pickle(join("files", "df_PubChem.pkl"))
wildtype_df

Unnamed: 0,ec,substrate,uniprot,smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,sequence,esm2,ChemBERTa2,esm2_ChemBERTa2
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,MRAIVLERFGIENIKIEDIDDESPGIPVKITMAGLNPVDYSTVNGN...,"[0.039680514484643936, -0.008533245883882046, ...","[0.08433352, 0.0, 0.023516523, -0.4512267, -0....","[0.039680514484643936, -0.008533245883882046, ..."
1,1.1.1.1,NADH,B2ZRE3,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[0.01],1,0.01000,-2.000000,sequence_7365,MRAVVFENKERVAVKEVNAPRLQHPLDALVRVHLAGICGSDLHLYH...,"[0.0005186749622225761, -0.0679105892777443, -...","[-0.10073212, 0.0, 0.40556788, -0.33414266, 0....","[0.0005186749622225761, -0.0679105892777443, -..."
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,MKAALVYEPLGNENLRIEDVDDPKVLDGQVLIEVRKAGLNPVDYNT...,"[-0.009646818973124027, -0.012489932589232922,...","[0.08433352, 0.0, 0.023516523, -0.4512267, -0....","[-0.009646818973124027, -0.012489932589232922,..."
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,MFRLARAQTSITTTSKALGGSRRLFVRLNSSFAIPESQKGVIFYEN...,"[0.007409711368381977, -0.049554433673620224, ...","[0.08433352, 0.0, 0.023516523, -0.4512267, -0....","[0.007409711368381977, -0.049554433673620224, ..."
4,1.1.1.1,2-hydroxymethylpyrene,P08319,C1=CC2=C3C(=C1)C=CC4=C3C(=CC(=C4)CO)C=C2,wild type,[0.033],1,0.03300,-1.481486,sequence_750,MGTKGKVIKCKAAIAWEAGKPLCIEEVEVAPPKAHEVRIQIIATSL...,"[-0.014221813529729843, -0.03296571969985962, ...","[0.08988855, 0.0, 0.32113054, 0.5458949, -0.06...","[-0.014221813529729843, -0.03296571969985962, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13879,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,MSSVNKDTIHVAERSLHKEHLTEGGNMAFHNHLNDFAHIEDPLERR...,"[-0.040915098041296005, 0.0022725595626980066,...","[-0.4070543, 0.0, 0.227839, -0.12523368, 0.347...","[-0.040915098041296005, 0.0022725595626980066,..."
13880,7.3.2.7,Sb3+,P30632,[Sb+3],wild type,[4e-05],1,0.00004,-4.397940,sequence_2898,MSDQLEASIKNILEQKTLKWIFVGGKGGVGKTTCSCSLAAQLSKVR...,"[0.03835789114236832, -0.16177919507026672, -0...","[-0.14727125, 0.0, 0.14913563, -0.6732974, -0....","[0.03835789114236832, -0.16177919507026672, -0..."
13881,7.4.2.14,8-azido-ATP,Q03518,C1=NC(=C2C(=N1)N(C(=N2)N=[N+]=[N-])C3C(C(C(O3)...,wild type,[0.103],1,0.10300,-0.987163,sequence_3327,MASSRCPAPRGCRCLPGASLAWLGTVLLLLADWVLLRTALPRIFSL...,"[0.027131887152791023, -0.0465911403298378, -0...","[-0.069181144, 0.0, 0.031216782, 0.011584956, ...","[0.027131887152791023, -0.0465911403298378, -0..."
13882,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,MVKQSKRPDFEEELLEEQPERYPGDTNFQKWGFDLHPQVAPISGGL...,"[0.010143788531422615, -0.04845583438873291, -...","[-0.16678084, 0.0, -0.30256566, 0.056213908, -...","[0.010143788531422615, -0.04845583438873291, -..."


Get substrate KEGG IDs

In [None]:
from bioservices import KEGG
import pandas as pd
import numpy as np

kegg_con = KEGG()

substrates_df = pd.DataFrame({"substrate": wildtype_df["substrate"].dropna().unique()})
substrates_df["kegg_id"] = np.nan

# Search KEGG for each substrate
for i in substrates_df.index:
    name = substrates_df.at[i, "substrate"]
    try:
        results = kegg_con.find("compound", name)
        if results:
            kegg_id = results.split("\t")[0].split(":")[1]
            substrates_df.at[i, "kegg_id"] = kegg_id
    except Exception as e:
        print(f"[KEGG ID] Failed for {name}: {e}")

In [None]:
substrates_df = substrates_df.dropna()
substrates_df.to_pickle(join("files","subst_df.pkl"))

Map KEGG IDs to InChIs

In [6]:
from rdkit import Chem

substrates_df = pd.read_pickle(join("files","subst_df.pkl"))
# Map InchIs from KEGG Ids (mol files)
substrates_df["InChI"] = pd.NA

for ind in substrates_df.index:
    if pd.isnull(substrates_df["InChI"][ind]):
        try:
            mol = Chem.MolFromMolFile(join("mol-files", substrates_df["kegg_id"][ind] + '.mol'))
            substrates_df["InChI"][ind] = Chem.MolToInchi(mol)
        except:
            pass

substrates_df = substrates_df.loc[~pd.isnull(substrates_df["InChI"])]
substrates_df























[15:57:54] ERROR: Unknown element(s): *









[15:57:57] ERROR: Unknown element(s): *
















[15:58:03] ERROR: Unknown element(s): *

[15:58:03] ERROR: Unknown element(s): *










[15:58:06] ERROR: Unknown element(s): *










[15:58:10] ERROR: Unknown element(s): *






[15:58:12] ERROR: Unknown element(s): *



[15:58:17] ERROR: Unknown element(s): *

[15:58:17] ERROR: Unknown element(s): *









[15:58:19] ERROR: Unknown element(s): *

[15:58:19] ERROR: Unsupported in this mode element '*'


[15:58:19] ERROR: Unknown element(s): *




[15:58:20] ERROR: Unknown element(s): *

[15:58:20] ERROR: Unknown element(s): *



[15:58:20] ERROR: Unknown element(s): *

[15:58:20] ERROR: Unknown element(s): *

[15:58:20] ERROR: Unknown element(s): *


[15:58:21] ERROR: Unknown element(s): *

[15:58:21] ERROR: Unknown element(s): *

[15:58:22] ERROR: Unsupported in this mode element '*'

[15:58:22] ERROR: Unknown element(s): *

[15:58:22] ERROR: Unsupp

Unnamed: 0,substrate,kegg_id,InChI
0,ethanol,C00132,"InChI=1S/CH4O/c1-2/h2H,1H3"
1,NADH,C00004,InChI=1S/C21H29N7O14P2/c22-17-12-19(25-7-24-17...
5,acetaldehyde,C00084,"InChI=1S/C2H4O/c1-2-3/h2H,1H3"
7,NAD+,C00003,InChI=1S/C21H27N7O14P2/c22-17-12-19(25-7-24-17...
8,Cyclohexanol,C00854,"InChI=1S/C6H12O/c7-6-4-2-1-3-5-6/h6-7H,1-5H2"
...,...,...,...
3888,Hydrogenobyrinic acid,C06399,InChI=1S/C45H60N4O14/c1-21-36-24(10-13-30(52)5...
3889,3-methylcrotonoyl-CoA,C03069,InChI=1S/C26H42N7O17P3S/c1-14(2)9-17(35)54-8-7...
3894,caldariellaquinol,C20623,InChI=1S/C39H68O2S2/c1-28(2)14-9-15-29(3)16-10...
3895,decylubiquinol,C15495,InChI=1S/C19H32O4/c1-5-6-7-8-9-10-11-12-13-15-...


Map InChIs to SMILES

In [7]:
from rdkit import Chem

# map SMILES from InChIs
def inchi_to_smiles(inchi):
    try:
        mol = Chem.MolFromInchi(inchi)
        return Chem.MolToSmiles(mol)
    except:
        return None
    
substrates_df["SMILES"] = substrates_df["InChI"].apply(inchi_to_smiles)
substrates_df

[15:59:07] ERROR: 

[15:59:07] ERROR: 

[15:59:08] ERROR: 

[15:59:08] ERROR: 

[15:59:08] Cannot assign bond directions!
[15:59:08] ERROR: 

[15:59:08] Cannot assign bond directions!
[15:59:08] Cannot assign bond directions!
[15:59:08] ERROR: 

[15:59:08] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] Explicit valence for atom # 0 Cl, 3, is greater than permitted
[15:59:09] ERROR: Explicit valence for atom # 0 Cl, 3, is greater than permitted

[15:59:09] Cannot assign bond directions!
[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 

[15:59:09] ERROR: 


[15:59:10] Cann

Unnamed: 0,substrate,kegg_id,InChI,SMILES
0,ethanol,C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO
1,NADH,C00004,InChI=1S/C21H29N7O14P2/c22-17-12-19(25-7-24-17...,N=C(O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...
5,acetaldehyde,C00084,"InChI=1S/C2H4O/c1-2-3/h2H,1H3",CC=O
7,NAD+,C00003,InChI=1S/C21H27N7O14P2/c22-17-12-19(25-7-24-17...,N=C(O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)...
8,Cyclohexanol,C00854,"InChI=1S/C6H12O/c7-6-4-2-1-3-5-6/h6-7H,1-5H2",OC1CCCCC1
...,...,...,...,...
3888,Hydrogenobyrinic acid,C06399,InChI=1S/C45H60N4O14/c1-21-36-24(10-13-30(52)5...,C/C1=C2/NC([C@H](CC(=O)O)[C@@]2(C)CCC(=O)O)[C@...
3889,3-methylcrotonoyl-CoA,C03069,InChI=1S/C26H42N7O17P3S/c1-14(2)9-17(35)54-8-7...,CC(C)=CC(=O)SCCN=C(O)CCN=C(O)[C@H](O)C(C)(C)CO...
3894,caldariellaquinol,C20623,InChI=1S/C39H68O2S2/c1-28(2)14-9-15-29(3)16-10...,CSc1c(CCC(C)CCCC(C)CCCC(C)CCCC(C)CCCC(C)CCCC(C...
3895,decylubiquinol,C15495,InChI=1S/C19H32O4/c1-5-6-7-8-9-10-11-12-13-15-...,CCCCCCCCCCc1c(C)c(O)c(OC)c(OC)c1O


Map SMILES from KEGG IDs 

In [None]:
from bioservices import ChEBI
from bioservices import KEGG

chebi_con = ChEBI()
kegg_con = KEGG()
substrates_df["smiles"] = np.nan

for i in substrates_df.index:
    kegg_id = substrates_df.at[i, "kegg_id"]
    if pd.notna(kegg_id):
        try:
            kegg_entry = kegg_con.parse(kegg_con.get(kegg_id))
            chebi_id = kegg_entry.get("DBLINKS", {}).get("ChEBI")
            if chebi_id:
                if isinstance(chebi_id, list):
                    chebi_id = chebi_id[0]
                chebi_entry = chebi_con.getCompleteEntity("CHEBI:" + chebi_id)
                substrates_df.at[i, "smiles"] = chebi_entry.smiles
        except Exception as e:
            print(f"[SMILES] Failed for {kegg_id}: {e}")
        

Get ChemBERTa2 numerical representations

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM


tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-10M-MTR")
model = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-10M-MTR")

def ChemBERTa2(smiles):
    inputs = tokenizer(smiles, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs, output_hidden_states=True)["hidden_states"][-1]
    embedding = outputs[0][1:-1].mean(dim=0).detach().numpy()
    return embedding

substrates_df["inchis_chemberta2"] = substrates_df["SMILES"].apply(ChemBERTa2)

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM


tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-10M-MTR")
model = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-10M-MTR")

def ChemBERTa2(smiles):
    inputs = tokenizer(smiles, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs, output_hidden_states=True)["hidden_states"][-1]
    embedding = outputs[0][1:-1].mean(dim=0).detach().numpy()
    return embedding

substrates_df["kegg_chemberta2"] = substrates_df["smiles"].apply(ChemBERTa2)

In [None]:
chemberta2_df = substrates_df.copy()
chemberta2_df = chemberta2_df.replace({None: pd.NA})
chemberta2_df = chemberta2_df.dropna()

Merge dataframe with original dataset

In [None]:
esm2_ChemBERTa2 = wildtype_df.merge(chemberta2_df, on="substrate", how="left")
esm2_ChemBERTa2 = esm2_ChemBERTa2.rename(columns={"smiles_x": "pubchem_smiles","ChemBERTa2": "pubchem_chemberta2","SMILES": "inchis_smiles", "smiles_y": "kegg_smiles", "esm2_ChemBERTa2": "pubchem_fp"})

Concatenate representations

In [None]:
esm2_ChemBERTa2["inchis_fp"] = esm2_ChemBERTa2.apply(lambda row: list(row["esm2"]) + list(row["inchis_chemberta2"])
                                                           
    if isinstance(row["esm2"], (list, np.ndarray)) and isinstance(row["inchis_chemberta2"], (list, np.ndarray))
    else None, axis=1)

esm2_ChemBERTa2["kegg_fp"] = esm2_ChemBERTa2.apply(lambda row: list(row["esm2"]) + list(row["kegg_chemberta2"])
                                                           
    if isinstance(row["esm2"], (list, np.ndarray)) and isinstance(row["kegg_chemberta2"], (list, np.ndarray))
    else None, axis=1)

esm2_ChemBERTa2 = esm2_ChemBERTa2.replace({None: pd.NA}).dropna()

In [None]:
esm2_ChemBERTa2.to_pickle(join("files", "df_biocervices.pkl"))

### c) Retrieve SMILES from MetaboAnalyst 


https://www.metaboanalyst.ca/faces/upload/ConvertView.xhtml

In [2]:
df_bio = pd.read_pickle(join("files", "df_biocervices.pkl"))
df_bio

Unnamed: 0,ec,substrate,uniprot,pubchem_smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,...,pubchem_chemberta2,pubchem_fp,kegg_id,InChI,inchis_smiles,kegg_smiles,inchis_chemberta2,kegg_chemberta2,inchis_fp,kegg_fp
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,...,"[0.08433352, 0.0, 0.023516523, -0.4512267, -0....","[0.039680514484643936, -0.008533245883882046, ...",C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.039680514484643936, -0.008533245883882046, ...","[0.039680514484643936, -0.008533245883882046, ..."
1,1.1.1.1,NADH,B2ZRE3,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,wild type,[0.01],1,0.01000,-2.000000,sequence_7365,...,"[-0.10073212, 0.0, 0.40556788, -0.33414266, 0....","[0.0005186749622225761, -0.0679105892777443, -...",C00004,InChI=1S/C21H29N7O14P2/c22-17-12-19(25-7-24-17...,N=C(O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,NC(=O)C1=CN(C=CC1)[C@@H]1O[C@H](COP(O)(=O)OP(O...,"[0.54406685, -0.34616303, -0.038030434, 0.4146...","[0.46189854, -0.30568308, 0.3288562, 0.6791551...","[0.0005186749622225761, -0.0679105892777443, -...","[0.0005186749622225761, -0.0679105892777443, -..."
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,...,"[0.08433352, 0.0, 0.023516523, -0.4512267, -0....","[-0.009646818973124027, -0.012489932589232922,...",C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.009646818973124027, -0.012489932589232922,...","[-0.009646818973124027, -0.012489932589232922,..."
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,...,"[0.08433352, 0.0, 0.023516523, -0.4512267, -0....","[0.007409711368381977, -0.049554433673620224, ...",C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.007409711368381977, -0.049554433673620224, ...","[0.007409711368381977, -0.049554433673620224, ..."
7,1.1.1.1,ethanol,P08319,CCO,wild type,[3.6],1,3.60000,0.556303,sequence_750,...,"[0.08433352, 0.0, 0.023516523, -0.4512267, -0....","[-0.014221813529729843, -0.03296571969985962, ...",C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13877,7.3.2.1,phosphate,P25360,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_442,...,"[-0.4070543, 0.0, 0.227839, -0.12523368, 0.347...","[-0.029194621369242668, -0.03175187110900879, ...",C00002,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.029194621369242668, -0.03175187110900879, ...","[-0.029194621369242668, -0.03175187110900879, ..."
13878,7.3.2.1,phosphate,A8N031,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_486,...,"[-0.4070543, 0.0, 0.227839, -0.12523368, 0.347...","[-0.03648005425930023, -0.04548962414264679, -...",C00002,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.03648005425930023, -0.04548962414264679, -...","[-0.03648005425930023, -0.04548962414264679, -..."
13879,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,...,"[-0.4070543, 0.0, 0.227839, -0.12523368, 0.347...","[-0.040915098041296005, 0.0022725595626980066,...",C00002,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.040915098041296005, 0.0022725595626980066,...","[-0.040915098041296005, 0.0022725595626980066,..."
13882,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,...,"[-0.16678084, 0.0, -0.30256566, 0.056213908, -...","[0.010143788531422615, -0.04845583438873291, -...",C00318,"InChI=1S/C7H15NO3/c1-8(2,3)5-6(9)4-7(10)11/h6,...",C[N+](C)(C)C[C@H](O)CC(=O)[O-],C[N+](C)(C)C[C@H](O)CC([O-])=O,"[-0.14854136, -0.5359723, -0.20837282, 0.09163...","[0.024418518, -0.41306245, -0.05438276, 0.3774...","[0.010143788531422615, -0.04845583438873291, -...","[0.010143788531422615, -0.04845583438873291, -..."


Get dataframe from mapped substrates from Metaboanalyst

In [3]:
directory = "metaboanalyst"

csv_files = [f for f in os.listdir(directory) if f.endswith(".csv")]

df_list = [pd.read_csv(os.path.join(directory, file)) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)

df_sub = df[df["Comment"] != 0]
df_sub

Unnamed: 0,Query,Match,HMDB,PubChem,ChEBI,KEGG,METLIN,SMILES,Comment
0,NADH,NADH,HMDB0001487,439153.0,16908,C00004,3687.0,NC(=O)C1=CN(C=CC1)[C@@H]1O[C@H](COP(O)(=O)OP(O...,1
1,NAD,NAD,HMDB0000902,5892.0,44215,C00003,5858.0,NC(=O)C1=C[N+](=CC=C1)[C@@H]1O[C@H](COP([O-])(...,1
2,ethanol,Ethanol,HMDB0000108,702.0,16236,C00469,3203.0,CCO,1
3,1-Pentanol,1-Pentanol,HMDB0013036,6276.0,44884,C16834,,CCCCCO,1
4,Isobutyraldehyde,2-Methylpropanal,HMDB0031243,6561.0,48943,C03219,,CC(C)C=O,1
...,...,...,...,...,...,...,...,...,...
8446,bleomycin A2,Bleomycin,HMDB0014435,5360373.0,22907.0,C06854,,C[C@@H](O)[C@H](NC(=O)[C@@H](C)[C@H](O)[C@@H](...,1
8521,hemoglobin,Hemoglobin,METPA0213,,5656.0,C01708,,,1
8575,nociceptin,Nociceptin,,47205354.0,80266.0,C16044,,,1
8576,neurokinin B,Neurokinin B,,55583.0,80312.0,,,CSCCC(NC(=O)C(CC(C)C)NC(=O)CNC(=O)C(NC(=O)C(CC...,1


In [4]:
df_sub.rename(columns={"Query": "substrate"}, inplace=True) 
df_sub["substrate"] = df_sub["substrate"].str.lower()
df_sub.drop(columns=["METLIN", "Comment"], inplace=True)
df_sub

Unnamed: 0,substrate,Match,HMDB,PubChem,ChEBI,KEGG,SMILES
0,nadh,NADH,HMDB0001487,439153.0,16908,C00004,NC(=O)C1=CN(C=CC1)[C@@H]1O[C@H](COP(O)(=O)OP(O...
1,nad,NAD,HMDB0000902,5892.0,44215,C00003,NC(=O)C1=C[N+](=CC=C1)[C@@H]1O[C@H](COP([O-])(...
2,ethanol,Ethanol,HMDB0000108,702.0,16236,C00469,CCO
3,1-pentanol,1-Pentanol,HMDB0013036,6276.0,44884,C16834,CCCCCO
4,isobutyraldehyde,2-Methylpropanal,HMDB0031243,6561.0,48943,C03219,CC(C)C=O
...,...,...,...,...,...,...,...
8446,bleomycin a2,Bleomycin,HMDB0014435,5360373.0,22907.0,C06854,C[C@@H](O)[C@H](NC(=O)[C@@H](C)[C@H](O)[C@@H](...
8521,hemoglobin,Hemoglobin,METPA0213,,5656.0,C01708,
8575,nociceptin,Nociceptin,,47205354.0,80266.0,C16044,
8576,neurokinin b,Neurokinin B,,55583.0,80312.0,,CSCCC(NC(=O)C(CC(C)C)NC(=O)CNC(=O)C(NC(=O)C(CC...


In [5]:
df_bio = df_bio.merge(df_sub, on = "substrate", how="left")
df_bio = df_bio.dropna(subset=["SMILES"])
df_bio = df_bio.rename(columns={"SMILES":"meta_smiles"})
df_bio

Unnamed: 0,ec,substrate,uniprot,pubchem_smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,...,inchis_chemberta2,kegg_chemberta2,inchis_fp,kegg_fp,Match,HMDB,PubChem,ChEBI,KEGG,meta_smiles
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.039680514484643936, -0.008533245883882046, ...","[0.039680514484643936, -0.008533245883882046, ...",Ethanol,HMDB0000108,702.0,16236,C00469,CCO
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.009646818973124027, -0.012489932589232922,...","[-0.009646818973124027, -0.012489932589232922,...",Ethanol,HMDB0000108,702.0,16236,C00469,CCO
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.007409711368381977, -0.049554433673620224, ...","[0.007409711368381977, -0.049554433673620224, ...",Ethanol,HMDB0000108,702.0,16236,C00469,CCO
4,1.1.1.1,ethanol,P08319,CCO,wild type,[3.6],1,3.60000,0.556303,sequence_750,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",Ethanol,HMDB0000108,702.0,16236,C00469,CCO
5,1.1.1.1,acetaldehyde,P08319,CC=O,wild type,[12.7],1,12.70000,1.103804,sequence_750,...,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[0.110451646, -0.6700488, 0.091193505, 0.32803...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",Acetaldehyde,HMDB0000990,177.0,15343,C00084,CC=O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7814,7.3.2.1,phosphate,P25360,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_442,...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.029194621369242668, -0.03175187110900879, ...","[-0.029194621369242668, -0.03175187110900879, ...",Phosphate,HMDB0001429,1004.0,26078,C00009,OP(O)(O)=O
7815,7.3.2.1,phosphate,A8N031,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_486,...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.03648005425930023, -0.04548962414264679, -...","[-0.03648005425930023, -0.04548962414264679, -...",Phosphate,HMDB0001429,1004.0,26078,C00009,OP(O)(O)=O
7816,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.040915098041296005, 0.0022725595626980066,...","[-0.040915098041296005, 0.0022725595626980066,...",Phosphate,HMDB0001429,1004.0,26078,C00009,OP(O)(O)=O
7817,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,...,"[-0.14854136, -0.5359723, -0.20837282, 0.09163...","[0.024418518, -0.41306245, -0.05438276, 0.3774...","[0.010143788531422615, -0.04845583438873291, -...","[0.010143788531422615, -0.04845583438873291, -...",Betaine,HMDB0000043,247.0,41139,C00719,C[N+](C)(C)CC(O)=O


In [6]:
df_bio = df_bio.drop(columns={"Match","HMDB","PubChem","ChEBI",	"KEGG"})
df_bio

Unnamed: 0,ec,substrate,uniprot,pubchem_smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,...,pubchem_fp,kegg_id,InChI,inchis_smiles,kegg_smiles,inchis_chemberta2,kegg_chemberta2,inchis_fp,kegg_fp,meta_smiles
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,...,"[0.039680514484643936, -0.008533245883882046, ...",C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.039680514484643936, -0.008533245883882046, ...","[0.039680514484643936, -0.008533245883882046, ...",CCO
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,...,"[-0.009646818973124027, -0.012489932589232922,...",C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.009646818973124027, -0.012489932589232922,...","[-0.009646818973124027, -0.012489932589232922,...",CCO
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,...,"[0.007409711368381977, -0.049554433673620224, ...",C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.007409711368381977, -0.049554433673620224, ...","[0.007409711368381977, -0.049554433673620224, ...",CCO
4,1.1.1.1,ethanol,P08319,CCO,wild type,[3.6],1,3.60000,0.556303,sequence_750,...,"[-0.014221813529729843, -0.03296571969985962, ...",C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CCO
5,1.1.1.1,acetaldehyde,P08319,CC=O,wild type,[12.7],1,12.70000,1.103804,sequence_750,...,"[-0.014221813529729843, -0.03296571969985962, ...",C00084,"InChI=1S/C2H4O/c1-2-3/h2H,1H3",CC=O,[H]C(C)=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[0.110451646, -0.6700488, 0.091193505, 0.32803...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CC=O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7814,7.3.2.1,phosphate,P25360,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_442,...,"[-0.029194621369242668, -0.03175187110900879, ...",C00002,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.029194621369242668, -0.03175187110900879, ...","[-0.029194621369242668, -0.03175187110900879, ...",OP(O)(O)=O
7815,7.3.2.1,phosphate,A8N031,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_486,...,"[-0.03648005425930023, -0.04548962414264679, -...",C00002,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.03648005425930023, -0.04548962414264679, -...","[-0.03648005425930023, -0.04548962414264679, -...",OP(O)(O)=O
7816,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,...,"[-0.040915098041296005, 0.0022725595626980066,...",C00002,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.040915098041296005, 0.0022725595626980066,...","[-0.040915098041296005, 0.0022725595626980066,...",OP(O)(O)=O
7817,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,...,"[0.010143788531422615, -0.04845583438873291, -...",C00318,"InChI=1S/C7H15NO3/c1-8(2,3)5-6(9)4-7(10)11/h6,...",C[N+](C)(C)C[C@H](O)CC(=O)[O-],C[N+](C)(C)C[C@H](O)CC([O-])=O,"[-0.14854136, -0.5359723, -0.20837282, 0.09163...","[0.024418518, -0.41306245, -0.05438276, 0.3774...","[0.010143788531422615, -0.04845583438873291, -...","[0.010143788531422615, -0.04845583438873291, -...",C[N+](C)(C)CC(O)=O


Generate ChemBERTa2 substrate representations

In [7]:
from transformers import AutoTokenizer, AutoModelForMaskedLM


tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-10M-MTR")
model = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-10M-MTR")

def ChemBERTa2(smiles):
    inputs = tokenizer(smiles, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs, output_hidden_states=True)["hidden_states"][-1]
    embedding = outputs[0][1:-1].mean(dim=0).detach().numpy()
    return embedding

df_bio["meta_chemberta2"] = df_bio["meta_smiles"].apply(ChemBERTa2)
df_bio

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at DeepChem/ChemBERTa-10M-MTR and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,ec,substrate,uniprot,pubchem_smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,...,kegg_id,InChI,inchis_smiles,kegg_smiles,inchis_chemberta2,kegg_chemberta2,inchis_fp,kegg_fp,meta_smiles,meta_chemberta2
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,...,C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.039680514484643936, -0.008533245883882046, ...","[0.039680514484643936, -0.008533245883882046, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080..."
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,...,C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.009646818973124027, -0.012489932589232922,...","[-0.009646818973124027, -0.012489932589232922,...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080..."
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,...,C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.007409711368381977, -0.049554433673620224, ...","[0.007409711368381977, -0.049554433673620224, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080..."
4,1.1.1.1,ethanol,P08319,CCO,wild type,[3.6],1,3.60000,0.556303,sequence_750,...,C00132,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080..."
5,1.1.1.1,acetaldehyde,P08319,CC=O,wild type,[12.7],1,12.70000,1.103804,sequence_750,...,C00084,"InChI=1S/C2H4O/c1-2-3/h2H,1H3",CC=O,[H]C(C)=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[0.110451646, -0.6700488, 0.091193505, 0.32803...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CC=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7814,7.3.2.1,phosphate,P25360,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_442,...,C00002,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.029194621369242668, -0.03175187110900879, ...","[-0.029194621369242668, -0.03175187110900879, ...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147..."
7815,7.3.2.1,phosphate,A8N031,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_486,...,C00002,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.03648005425930023, -0.04548962414264679, -...","[-0.03648005425930023, -0.04548962414264679, -...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147..."
7816,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,...,C00002,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.040915098041296005, 0.0022725595626980066,...","[-0.040915098041296005, 0.0022725595626980066,...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147..."
7817,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,...,C00318,"InChI=1S/C7H15NO3/c1-8(2,3)5-6(9)4-7(10)11/h6,...",C[N+](C)(C)C[C@H](O)CC(=O)[O-],C[N+](C)(C)C[C@H](O)CC([O-])=O,"[-0.14854136, -0.5359723, -0.20837282, 0.09163...","[0.024418518, -0.41306245, -0.05438276, 0.3774...","[0.010143788531422615, -0.04845583438873291, -...","[0.010143788531422615, -0.04845583438873291, -...",C[N+](C)(C)CC(O)=O,"[0.071484834, -0.4210424, -0.012358708, 0.5849..."


Concatenate fingerprints

In [8]:
df_bio["meta_fp"] = df_bio.apply(lambda row: list(row["esm2"]) + list(row["meta_chemberta2"])
                                                           
    if isinstance(row["esm2"], (list, np.ndarray)) and isinstance(row["meta_chemberta2"], (list, np.ndarray))
    else None, axis=1)

df_bio

Unnamed: 0,ec,substrate,uniprot,pubchem_smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,...,InChI,inchis_smiles,kegg_smiles,inchis_chemberta2,kegg_chemberta2,inchis_fp,kegg_fp,meta_smiles,meta_chemberta2,meta_fp
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,...,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.039680514484643936, -0.008533245883882046, ...","[0.039680514484643936, -0.008533245883882046, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.039680514484643936, -0.008533245883882046, ..."
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,...,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.009646818973124027, -0.012489932589232922,...","[-0.009646818973124027, -0.012489932589232922,...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.009646818973124027, -0.012489932589232922,..."
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,...,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.007409711368381977, -0.049554433673620224, ...","[0.007409711368381977, -0.049554433673620224, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.007409711368381977, -0.049554433673620224, ..."
4,1.1.1.1,ethanol,P08319,CCO,wild type,[3.6],1,3.60000,0.556303,sequence_750,...,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.014221813529729843, -0.03296571969985962, ..."
5,1.1.1.1,acetaldehyde,P08319,CC=O,wild type,[12.7],1,12.70000,1.103804,sequence_750,...,"InChI=1S/C2H4O/c1-2-3/h2H,1H3",CC=O,[H]C(C)=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[0.110451646, -0.6700488, 0.091193505, 0.32803...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CC=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[-0.014221813529729843, -0.03296571969985962, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7814,7.3.2.1,phosphate,P25360,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_442,...,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.029194621369242668, -0.03175187110900879, ...","[-0.029194621369242668, -0.03175187110900879, ...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.029194621369242668, -0.03175187110900879, ..."
7815,7.3.2.1,phosphate,A8N031,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_486,...,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.03648005425930023, -0.04548962414264679, -...","[-0.03648005425930023, -0.04548962414264679, -...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.03648005425930023, -0.04548962414264679, -..."
7816,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,...,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.040915098041296005, 0.0022725595626980066,...","[-0.040915098041296005, 0.0022725595626980066,...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.040915098041296005, 0.0022725595626980066,..."
7817,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,...,"InChI=1S/C7H15NO3/c1-8(2,3)5-6(9)4-7(10)11/h6,...",C[N+](C)(C)C[C@H](O)CC(=O)[O-],C[N+](C)(C)C[C@H](O)CC([O-])=O,"[-0.14854136, -0.5359723, -0.20837282, 0.09163...","[0.024418518, -0.41306245, -0.05438276, 0.3774...","[0.010143788531422615, -0.04845583438873291, -...","[0.010143788531422615, -0.04845583438873291, -...",C[N+](C)(C)CC(O)=O,"[0.071484834, -0.4210424, -0.012358708, 0.5849...","[0.010143788531422615, -0.04845583438873291, -..."


In [9]:
df_bio.to_pickle(join("files","df_metaboanalyst.pkl"))

### d) Retrieve mapped compound names from SABIO-RK and convert them to SMILES with PubChem

In [10]:
df = pd.read_pickle(join("files","df_metaboanalyst.pkl"))
df

Unnamed: 0,ec,substrate,uniprot,pubchem_smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,...,InChI,inchis_smiles,kegg_smiles,inchis_chemberta2,kegg_chemberta2,inchis_fp,kegg_fp,meta_smiles,meta_chemberta2,meta_fp
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,...,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.039680514484643936, -0.008533245883882046, ...","[0.039680514484643936, -0.008533245883882046, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.039680514484643936, -0.008533245883882046, ..."
2,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,...,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.009646818973124027, -0.012489932589232922,...","[-0.009646818973124027, -0.012489932589232922,...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.009646818973124027, -0.012489932589232922,..."
3,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,...,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.007409711368381977, -0.049554433673620224, ...","[0.007409711368381977, -0.049554433673620224, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.007409711368381977, -0.049554433673620224, ..."
4,1.1.1.1,ethanol,P08319,CCO,wild type,[3.6],1,3.60000,0.556303,sequence_750,...,"InChI=1S/CH4O/c1-2/h2H,1H3",CO,CO,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.014221813529729843, -0.03296571969985962, ..."
5,1.1.1.1,acetaldehyde,P08319,CC=O,wild type,[12.7],1,12.70000,1.103804,sequence_750,...,"InChI=1S/C2H4O/c1-2-3/h2H,1H3",CC=O,[H]C(C)=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[0.110451646, -0.6700488, 0.091193505, 0.32803...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CC=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[-0.014221813529729843, -0.03296571969985962, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7814,7.3.2.1,phosphate,P25360,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_442,...,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.029194621369242668, -0.03175187110900879, ...","[-0.029194621369242668, -0.03175187110900879, ...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.029194621369242668, -0.03175187110900879, ..."
7815,7.3.2.1,phosphate,A8N031,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_486,...,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.03648005425930023, -0.04548962414264679, -...","[-0.03648005425930023, -0.04548962414264679, -...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.03648005425930023, -0.04548962414264679, -..."
7816,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,...,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP(O)(=O)OP(O)(...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.040915098041296005, 0.0022725595626980066,...","[-0.040915098041296005, 0.0022725595626980066,...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.040915098041296005, 0.0022725595626980066,..."
7817,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,...,"InChI=1S/C7H15NO3/c1-8(2,3)5-6(9)4-7(10)11/h6,...",C[N+](C)(C)C[C@H](O)CC(=O)[O-],C[N+](C)(C)C[C@H](O)CC([O-])=O,"[-0.14854136, -0.5359723, -0.20837282, 0.09163...","[0.024418518, -0.41306245, -0.05438276, 0.3774...","[0.010143788531422615, -0.04845583438873291, -...","[0.010143788531422615, -0.04845583438873291, -...",C[N+](C)(C)CC(O)=O,"[0.071484834, -0.4210424, -0.012358708, 0.5849...","[0.010143788531422615, -0.04845583438873291, -..."


Set the substrates 

In [11]:
unique_subst = df[["substrate"]].drop_duplicates().reset_index(drop=True)
unique_subst

Unnamed: 0,substrate
0,ethanol
1,acetaldehyde
2,cyclohexanone
3,benzyl alcohol
4,isatin
...,...
619,5-formyltetrahydrofolate
620,biotin
621,nicotinic acid
622,citrulline


Group and rename compounds called with synonyms of the same substrate

In [None]:
import requests

QUERY_URL = 'https://sabiork.h-its.org/sabioRestWebServices/searchCompoundSynonyms'

query = {"SabioCompoundID":"*", "fields[]":["SabioCompoundID","Name","NameType"]}

request = requests.post(QUERY_URL, params = query)
request.raise_for_status()

print(request.text)

In [None]:
from io import StringIO

sabio_compound_df = pd.read_csv(StringIO(request.text), sep="\t")
sabio_compound_df["Name"] =sabio_compound_df["Name"].str.lower()
sabio_compound_df

In [None]:
sabio_compound_df.to_pickle(join("files","sabio_compounds_synonyms.pkl"))

Map synonyms to recommended name

In [None]:
sabio_compound_df = pd.read_pickle(join("files","sabio_compounds_synonyms.pkl"))
recommended_map = {}

for sid, group in sabio_compound_df.groupby("SabioCompoundID"):
    recommended = group[group["NameType"] == "Recommended"]
    if not recommended.empty:
        recommended_name = recommended["Name"].iloc[0]
        for synonym in group["Name"]:
            recommended_map[synonym] = recommended_name

unique_subst["substrate"] = unique_subst["substrate"].str.lower()
unique_subst["Mapped_substrate"] = unique_subst["substrate"].map(recommended_map)
unique_subst

Unnamed: 0,substrate,Mapped_substrate
0,ethanol,ethanol
1,acetaldehyde,acetaldehyde
2,cyclohexanone,cyclohexanone
3,benzyl alcohol,benzyl alcohol
4,isatin,isatin
...,...,...
619,5-formyltetrahydrofolate,5-formyltetrahydrofolate
620,biotin,biotin
621,nicotinic acid,nicotinate
622,citrulline,citrulline


Get smiles from Pubchem

In [None]:
import requests
import pandas as pd
from multiprocessing.dummy import Pool

name_to_smiles = {}

def get_smiles(name):
    try:
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{name}/property/CanonicalSMILES/TXT"
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            smiles = response.content.decode().strip()
        else:
            smiles = None
    except:
        smiles = None
    name_to_smiles[name] = smiles

unique_substrates_df= unique_subst.copy()
unique_names = unique_substrates_df["Mapped_substrate"].dropna().unique().tolist()

thread_pool = Pool(4)
thread_pool.map(get_smiles, unique_names)
thread_pool.close()
thread_pool.join()
unique_substrates_df["smiles"] = unique_substrates_df["Mapped_substrate"].map(name_to_smiles)
unique_substrates_df.to_pickle(join("files","unique_substrates_df_smiles.pkl"))

In [14]:
unique_substrates_df = pd.read_pickle(join("files","unique_substrates_df_smiles.pkl"))
unique_substrates_df = unique_substrates_df.applymap(lambda x: np.nan if x is None else x)
unique_substrates_df.dropna(inplace=True)
unique_substrates_df

Unnamed: 0,substrate,Mapped_substrate,smiles
0,ethanol,ethanol,CCO
1,acetaldehyde,acetaldehyde,CC=O
2,cyclohexanone,cyclohexanone,C1CCC(=O)CC1
3,benzyl alcohol,benzyl alcohol,C1=CC=C(C=C1)CO
4,isatin,isatin,C1=CC=C2C(=C1)C(=O)C(=O)N2
...,...,...,...
618,methylmalonate,methylmalonate,CC(C(=O)O)C(=O)O
619,5-formyltetrahydrofolate,5-formyltetrahydrofolate,C1C(N(C2=C(N1)N=C(NC2=O)N)C=O)CNC3=CC=C(C=C3)C...
620,biotin,biotin,C1C2C(C(S1)CCCCC(=O)O)NC(=O)N2
621,nicotinic acid,nicotinate,C1=CC(=CN=C1)C(=O)[O-]\nC1=CC(=CN=C1)C(=O)O


Generate ChemBERTa2 substrate representations

In [15]:
from transformers import AutoTokenizer, AutoModelForMaskedLM


tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-10M-MTR")
model = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-10M-MTR")

def ChemBERTa2(smiles):
    inputs = tokenizer(smiles, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs, output_hidden_states=True)["hidden_states"][-1]
    embedding = outputs[0][1:-1].mean(dim=0).detach().numpy()
    return embedding

unique_substrates_df["chemberta2"]  = unique_substrates_df["smiles"].apply(ChemBERTa2)
unique_substrates_df

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at DeepChem/ChemBERTa-10M-MTR and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,substrate,Mapped_substrate,smiles,chemberta2
0,ethanol,ethanol,CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080..."
1,acetaldehyde,acetaldehyde,CC=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814..."
2,cyclohexanone,cyclohexanone,C1CCC(=O)CC1,"[-0.10852594, -0.75660324, -0.13705939, 0.2697..."
3,benzyl alcohol,benzyl alcohol,C1=CC=C(C=C1)CO,"[0.019203858, -0.12148025, 0.25374326, 0.17691..."
4,isatin,isatin,C1=CC=C2C(=C1)C(=O)C(=O)N2,"[-0.3385417, -0.07216162, 0.064711995, 0.16519..."
...,...,...,...,...
618,methylmalonate,methylmalonate,CC(C(=O)O)C(=O)O,"[-0.34274235, -0.39714134, 0.056993164, -0.307..."
619,5-formyltetrahydrofolate,5-formyltetrahydrofolate,C1C(N(C2=C(N1)N=C(NC2=O)N)C=O)CNC3=CC=C(C=C3)C...,"[-0.66469747, -0.23889485, 0.065157115, -0.158..."
620,biotin,biotin,C1C2C(C(S1)CCCCC(=O)O)NC(=O)N2,"[-0.27748668, -0.627471, -0.42387664, 0.236065..."
621,nicotinic acid,nicotinate,C1=CC(=CN=C1)C(=O)[O-]\nC1=CC(=CN=C1)C(=O)O,"[-0.4206067, 0.05459747, 0.25168467, -0.682132..."


Merge with original dataframe

In [16]:
data = pd.merge(df, unique_substrates_df, on= "substrate", how= "left")
data = data.dropna(subset=["chemberta2"])
data

Unnamed: 0,ec,substrate,uniprot,pubchem_smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,...,inchis_chemberta2,kegg_chemberta2,inchis_fp,kegg_fp,meta_smiles,meta_chemberta2,meta_fp,Mapped_substrate,smiles,chemberta2
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.039680514484643936, -0.008533245883882046, ...","[0.039680514484643936, -0.008533245883882046, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.039680514484643936, -0.008533245883882046, ...",ethanol,CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080..."
1,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.009646818973124027, -0.012489932589232922,...","[-0.009646818973124027, -0.012489932589232922,...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.009646818973124027, -0.012489932589232922,...",ethanol,CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080..."
2,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.007409711368381977, -0.049554433673620224, ...","[0.007409711368381977, -0.049554433673620224, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.007409711368381977, -0.049554433673620224, ...",ethanol,CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080..."
3,1.1.1.1,ethanol,P08319,CCO,wild type,[3.6],1,3.60000,0.556303,sequence_750,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.014221813529729843, -0.03296571969985962, ...",ethanol,CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080..."
4,1.1.1.1,acetaldehyde,P08319,CC=O,wild type,[12.7],1,12.70000,1.103804,sequence_750,...,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[0.110451646, -0.6700488, 0.091193505, 0.32803...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CC=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[-0.014221813529729843, -0.03296571969985962, ...",acetaldehyde,CC=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2821,7.3.2.1,phosphate,P25360,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_442,...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.029194621369242668, -0.03175187110900879, ...","[-0.029194621369242668, -0.03175187110900879, ...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.029194621369242668, -0.03175187110900879, ...",phosphate,[O-]P(=O)([O-])[O-],"[0.43881714, -0.47575158, 0.30838424, 0.868043..."
2822,7.3.2.1,phosphate,A8N031,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_486,...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.03648005425930023, -0.04548962414264679, -...","[-0.03648005425930023, -0.04548962414264679, -...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.03648005425930023, -0.04548962414264679, -...",phosphate,[O-]P(=O)([O-])[O-],"[0.43881714, -0.47575158, 0.30838424, 0.868043..."
2823,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,...,"[0.65995085, -0.5171341, -0.28194746, 0.606156...","[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.040915098041296005, 0.0022725595626980066,...","[-0.040915098041296005, 0.0022725595626980066,...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.040915098041296005, 0.0022725595626980066,...",phosphate,[O-]P(=O)([O-])[O-],"[0.43881714, -0.47575158, 0.30838424, 0.868043..."
2824,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,...,"[-0.14854136, -0.5359723, -0.20837282, 0.09163...","[0.024418518, -0.41306245, -0.05438276, 0.3774...","[0.010143788531422615, -0.04845583438873291, -...","[0.010143788531422615, -0.04845583438873291, -...",C[N+](C)(C)CC(O)=O,"[0.071484834, -0.4210424, -0.012358708, 0.5849...","[0.010143788531422615, -0.04845583438873291, -...",betaine,C[N+](C)(C)CC(=O)[O-],"[-0.044458658, -0.50271213, -0.06864436, 0.128..."


Concatenate fingerprint

In [17]:
data["sabio_pubchem_fp"] = data.apply(lambda row: list(row["esm2"]) + list(row["chemberta2"])
                                                           
    if isinstance(row["esm2"], (list, np.ndarray)) and isinstance(row["chemberta2"], (list, np.ndarray))
    else None, axis=1)
data

Unnamed: 0,ec,substrate,uniprot,pubchem_smiles,enzyme_type,km,km_count,km_gmean,log_km,sequence_id,...,kegg_chemberta2,inchis_fp,kegg_fp,meta_smiles,meta_chemberta2,meta_fp,Mapped_substrate,smiles,chemberta2,sabio_pubchem_fp
0,1.1.1.1,ethanol,Q6L0S1,CCO,wild type,[0.056],1,0.05600,-1.251812,sequence_4093,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.039680514484643936, -0.008533245883882046, ...","[0.039680514484643936, -0.008533245883882046, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.039680514484643936, -0.008533245883882046, ...",ethanol,CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.039680514484643936, -0.008533245883882046, ..."
1,1.1.1.1,ethanol,Q9HIM3,CCO,wild type,[5.7],1,5.70000,0.755875,sequence_6112,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.009646818973124027, -0.012489932589232922,...","[-0.009646818973124027, -0.012489932589232922,...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.009646818973124027, -0.012489932589232922,...",ethanol,CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.009646818973124027, -0.012489932589232922,..."
2,1.1.1.1,ethanol,V9SFA1,CCO,wild type,[17.0],1,17.00000,1.230449,sequence_5419,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[0.007409711368381977, -0.049554433673620224, ...","[0.007409711368381977, -0.049554433673620224, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.007409711368381977, -0.049554433673620224, ...",ethanol,CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[0.007409711368381977, -0.049554433673620224, ..."
3,1.1.1.1,ethanol,P08319,CCO,wild type,[3.6],1,3.60000,0.556303,sequence_750,...,"[-0.0678679, -0.47239682, -0.21912168, 0.20201...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.014221813529729843, -0.03296571969985962, ...",ethanol,CCO,"[-0.117441595, -0.47532502, -0.27592745, 0.080...","[-0.014221813529729843, -0.03296571969985962, ..."
4,1.1.1.1,acetaldehyde,P08319,CC=O,wild type,[12.7],1,12.70000,1.103804,sequence_750,...,"[0.110451646, -0.6700488, 0.091193505, 0.32803...","[-0.014221813529729843, -0.03296571969985962, ...","[-0.014221813529729843, -0.03296571969985962, ...",CC=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[-0.014221813529729843, -0.03296571969985962, ...",acetaldehyde,CC=O,"[-0.08119688, -0.5920504, -0.1953104, -0.07814...","[-0.014221813529729843, -0.03296571969985962, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2821,7.3.2.1,phosphate,P25360,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_442,...,"[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.029194621369242668, -0.03175187110900879, ...","[-0.029194621369242668, -0.03175187110900879, ...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.029194621369242668, -0.03175187110900879, ...",phosphate,[O-]P(=O)([O-])[O-],"[0.43881714, -0.47575158, 0.30838424, 0.868043...","[-0.029194621369242668, -0.03175187110900879, ..."
2822,7.3.2.1,phosphate,A8N031,[O-]P(=O)([O-])[O-],wild type,[0.025],1,0.02500,-1.602060,sequence_486,...,"[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.03648005425930023, -0.04548962414264679, -...","[-0.03648005425930023, -0.04548962414264679, -...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.03648005425930023, -0.04548962414264679, -...",phosphate,[O-]P(=O)([O-])[O-],"[0.43881714, -0.47575158, 0.30838424, 0.868043...","[-0.03648005425930023, -0.04548962414264679, -..."
2823,7.3.2.1,phosphate,P25297,[O-]P(=O)([O-])[O-],wild type,[0.02],1,0.02000,-1.698970,sequence_409,...,"[0.4859301, -0.26224318, 0.46599346, 1.1074358...","[-0.040915098041296005, 0.0022725595626980066,...","[-0.040915098041296005, 0.0022725595626980066,...",OP(O)(O)=O,"[-0.2867363, -0.5887727, 0.49749884, 0.7792147...","[-0.040915098041296005, 0.0022725595626980066,...",phosphate,[O-]P(=O)([O-])[O-],"[0.43881714, -0.47575158, 0.30838424, 0.868043...","[-0.040915098041296005, 0.0022725595626980066,..."
2824,7.6.2.9,betaine,Q0PCR9,C[N+](C)(C)CC(=O)[O-],wild type,"[0.128, 0.143, 0.272, 0.0608, 0.0834, 0.0919, ...",8,0.06059,-1.217598,sequence_2300,...,"[0.024418518, -0.41306245, -0.05438276, 0.3774...","[0.010143788531422615, -0.04845583438873291, -...","[0.010143788531422615, -0.04845583438873291, -...",C[N+](C)(C)CC(O)=O,"[0.071484834, -0.4210424, -0.012358708, 0.5849...","[0.010143788531422615, -0.04845583438873291, -...",betaine,C[N+](C)(C)CC(=O)[O-],"[-0.044458658, -0.50271213, -0.06864436, 0.128...","[0.010143788531422615, -0.04845583438873291, -..."


In [22]:
len(data.loc[0,"esm2"])

1280

## Split the data in train and test sets

In [None]:
# shuffle the data frame 
df = data.copy()
df = df.sample(frac = 1, random_state = 44)
df.reset_index(drop= True, inplace = True)

In [None]:
def split_dataframe_enzyme(frac, df):
    df1 = pd.DataFrame(columns = list(df.columns))
    df2 = pd.DataFrame(columns = list(df.columns))
    
    df.reset_index(inplace = True, drop = True)
    
    train_indices = []
    test_indices = []
    ind = 0
    while len(train_indices) + len(test_indices) < len(df):
        if ind not in train_indices and ind not in test_indices:
            if ind % frac != 0:
                n_old = len(train_indices)
                train_indices.append(ind)
                train_indices = list(set(train_indices))

                while n_old != len(train_indices):
                    n_old = len(train_indices)

                    training_rxn= list(set(df["sequence"].loc[train_indices]))

                    train_indices = train_indices + (list(df.loc[df["sequence"].isin(training_rxn)].index))
                    train_indices = list(set(train_indices))
                
            else:
                n_old = len(test_indices)
                test_indices.append(ind)
                test_indices = list(set(test_indices)) 

                while n_old != len(test_indices):
                    n_old = len(test_indices)

                    testing_rxn= list(set(df["sequence"].loc[test_indices]))

                    test_indices = test_indices + (list(df.loc[df["sequence"].isin(testing_rxn)].index))
                    test_indices = list(set(test_indices))
                
        ind +=1
    
    
    df1 = df.loc[train_indices]
    df2 = df.loc[test_indices]
    
    return(df1, df2)

In [None]:
train_df, test_df = split_dataframe_enzyme(frac = 5, df = df.copy())
print("Test set size: %s" % len(test_df))
print("Training set size: %s" % len(train_df))
print("Size of test set in percent: %s" % np.round(100*len(test_df) / (len(test_df) + len(train_df))))

train_df.reset_index(inplace = True, drop = True)
test_df.reset_index(inplace = True, drop = True)

train_df.to_pickle(join("partitions_smiles", "train_df.pkl"))
test_df.to_pickle(join("partitions_smiles", "test_df.pkl"))

Split the training set into 5 folds for 5-fold CVs

In [None]:
# 5-fold cross validation is performed such that the same enzyme or reaction does not appear in different folds
data_train2 = train_df.copy()
data_train2["index"] = list(data_train2.index)

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=5)
indices_fold1 = list(df_fold["index"])
print(len(data_train2), len(indices_fold1))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=4)
indices_fold2 = list(df_fold["index"])
print(len(data_train2), len(indices_fold2))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=3)
indices_fold3 = list(df_fold["index"])
print(len(data_train2), len(indices_fold3))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=2)
indices_fold4 = list(df_fold["index"])
indices_fold5 = list(data_train2["index"])
print(len(data_train2), len(indices_fold4))


fold_indices = [indices_fold1, indices_fold2, indices_fold3, indices_fold4, indices_fold5]

train_indices = [[] for _ in range(5)]
test_indices = [[] for _ in range(5)]

for i in range(5):
    for j in range(5):
        if i != j:
            train_indices[i] = train_indices[i] + fold_indices[j]
    test_indices[i] = fold_indices[i]

In [None]:
import pickle

train_file = join("partitions_smiles", "CV_train_indices.pkl")
test_file = join("partitions_smiles", "CV_test_indices.pkl")

with open(train_file, "wb") as f:
    pickle.dump(train_indices, f)

with open(test_file, "wb") as f:
    pickle.dump(test_indices, f)