In [1]:
import pandas as pd, re, json

# Load the KM and kcat tables from tab-separated text files, supplying the
# column names explicitly.
df_KM = pd.read_csv(
    "data/min_KM.txt",
    sep='\t',
    names=["EC", "substrate", "organism", "KM", "star"],
)
df_kcat = pd.read_csv(
    "data/max_KCAT.txt",
    sep='\t',
    names=["EC", "substrate", "organism", "kcat", "star"],
)

In [2]:
# Discard the 'star' column from both tables.
df_KM = df_KM.drop(columns=['star'])
df_kcat = df_kcat.drop(columns=['star'])

In [3]:
# Strip the literal 'EC' tag from the EC field of both tables.
def trim(x):
    """Return the second piece of ``x`` after splitting it on the literal 'EC'."""
    return x.split('EC')[1]

df_KM['EC'] = df_KM['EC'].map(trim)
df_kcat['EC'] = df_kcat['EC'].map(trim)

In [4]:
# Reduce the organism field of both tables to the text before the first '//'.
def trim(x):
    """Return the part of ``x`` before the first '//' (all of ``x`` if there is none)."""
    return x.partition('//')[0]

df_KM['organism'] = df_KM['organism'].map(trim)
df_kcat['organism'] = df_kcat['organism'].map(trim)

In [5]:
# Leading stereo-descriptor prefixes to strip from substrate names:
# d-, l-, dl-, (r)-, (s)-, (+)-, (-)-.
# Written as a raw string so that regex escapes such as \( and \+ are passed to
# `re` verbatim instead of being parsed as (invalid) Python string escapes.
# Matching is case-sensitive and lower-case only.
# NOTE(review): assumes substrate names are already lower-cased -- confirm.
_DL_PREFIX = re.compile(r'^d-|^l-|^dl-|^\(r\)-|^\(s\)-|^\(\+\)-|^\(\-\)-')


def remove_dl(substrate):
    """Return ``substrate`` with one leading stereo-descriptor prefix removed.

    Only a prefix at the very start of the string is removed; occurrences
    further inside the name (e.g. '2-deoxy-d-glucose') are left alone.

    Parameters
    ----------
    substrate : str
        Substrate name.

    Returns
    -------
    str
        The name without the prefix, or the unchanged name if none matches.
    """
    return _DL_PREFIX.sub('', substrate)

# Run remove_dl over every substrate name in both tables.
for _table in (df_KM, df_kcat):
    _table['substrate'] = _table['substrate'].map(remove_dl)

In [7]:
# For each (EC, substrate, organism) key keep a single row: the one with the
# smallest KM. Rows are sorted ascending by KM within each key, so the first
# occurrence of a key is its minimum.
key_cols = ['EC', 'substrate', 'organism']
df_KM = df_KM.sort_values(by=key_cols + ['KM'])
keep_min = ~df_KM.duplicated(subset=key_cols)  # True only for the first row of each key
df_KM = df_KM.loc[keep_min].reset_index(drop=True)

In [8]:
# For each (EC, substrate, organism) key keep a single row: the one with the
# largest kcat. Keys are sorted ascending and kcat descending, so the first
# occurrence of a key is its maximum.
key_cols = ['EC', 'substrate', 'organism']
df_kcat = df_kcat.sort_values(by=key_cols + ['kcat'], ascending=[True, True, True, False])
keep_max = ~df_kcat.duplicated(subset=key_cols)  # True only for the first row of each key
df_kcat = df_kcat.loc[keep_max].reset_index(drop=True)

In [10]:
# Outer-join the kcat and KM tables on the columns they have in common, so a
# key present in only one table is kept with NaN in the other value column.
df = df_kcat.merge(df_KM, how='outer')

In [11]:
# Add column 'e' as the ratio kcat / KM (NaN where either value is missing),
# then preview the first rows of the merged table.
df['e'] = df['kcat'].div(df['KM'])
df.head(20)

Unnamed: 0,EC,substrate,organism,kcat,KM,e
0,1.1.1.1,(+)-1-indanol,thermus thermophilus,61.4,4.2,14.619048
1,1.1.1.1,(+)-alpha-tetraol,thermus thermophilus,57.0,4.2,13.571429
2,1.1.1.1,(-)-1-phenylethanol,thermus thermophilus,1.1,18.1,0.060773
3,1.1.1.1,(2e)-but-2-en-1-ol,yokenella sp.,101.0,9.1,11.098901
4,1.1.1.1,(2e)-but-2-enal,yokenella sp.,405.0,3.3,122.727273
5,1.1.1.1,1-butanol,aeropyrum pernix,0.41,0.596,0.687919
6,1.1.1.1,1-butanol,mesocricetus auratus,7.5,,
7,1.1.1.1,1-butanol,rattus norvegicus,48.8,0.17,287.058824
8,1.1.1.1,1-butanol,sulfolobus solfataricus,3.1,0.08,38.75
9,1.1.1.1,1-heptanol,sulfolobus solfataricus,1.6,0.038,42.105263


In [12]:
# Number of rows in the merged table
print(df.shape[0])

# Number of NaN values in each column
df.isna().sum().reset_index(name='NaN')

46898
71516


Unnamed: 0,index,NaN
0,EC,0
1,substrate,0
2,organism,0
3,kcat,41625
4,KM,5273
5,e,46898


In [15]:
# Load the substrate lookup from JSON; later cells index it by EC number.
with open("data/substrates_filtered.json", 'r') as f:
    data = json.load(f)

# Distinct EC numbers found in the KM table, the kcat table and the merged table
EC_KM = df_KM['EC'].unique()
EC_Kcat = df_kcat['EC'].unique()
EC = df['EC'].unique()


In [24]:
import random
# Spot check: pick one EC number at random and show its rows in the merged table ...
ec = random.choice(EC)
display(df.loc[df['EC'] == ec])

print('\n')
# ... followed by the lower-cased 'first_substrate' names listed for that EC in `data`.
substrates = [name.lower() for name in data[ec]['first_substrate']]
print(substrates)

Unnamed: 0,EC,substrate,organism,kcat,KM,e
12093,2.7.1.2,atp,homo sapiens,68.4,2.3e-07,297391300.0
12094,2.7.1.2,atp,leishmania major,317.0,0.35,905.7143
12095,2.7.1.2,atp,trypanosoma cruzi,1492.0,0.36,4144.444
12096,2.7.1.2,beta-d-glucose,homo sapiens,166.0,,
12097,2.7.1.2,glucose,escherichia coli k-12,410.0,,
12098,2.7.1.2,glucose,homo sapiens,67.6,0.15,450.6667
12099,2.7.1.2,glucose,leishmania major,317.0,,
12100,2.7.1.2,glucose,thermotoga maritima,92.6,,
12101,2.7.1.2,glucose,trypanosoma cruzi,1492.0,1.0,1492.0
52068,2.7.1.2,2-deoxy-d-glucose,homo sapiens,,1.9,




['6-n-(carboxyethyl)atp', '6-n-(carboxymethyl)atp', '6-n-(succinyl)atp', '6-n-[n-(6-aminohexhyl)carbamoyl]atp', 'adp', 'atp', 'ctp', 'd-glucose', 'gtp', 'itp', 'utp', "adenosine 5'-triphosphate-polyamidoamine dendrimer"]


In [60]:

def _drop_non_natural(df_in, ec_numbers, natural_by_ec):
    """Drop rows whose substrate is not listed for their EC number.

    For every EC number in ``ec_numbers`` the lower-cased names in
    ``natural_by_ec[ec]['first_substrate']`` are looked up, and the rows of
    ``df_in`` with that EC whose 'substrate' is not among those names are
    removed. EC numbers whose lookup fails keep all their rows and are counted.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Table with at least the columns 'EC' and 'substrate'.
    ec_numbers : iterable
        EC numbers to process.
    natural_by_ec : mapping
        Lookup indexed by EC number; each entry is indexed by 'first_substrate'.

    Returns
    -------
    (pandas.DataFrame, int)
        The filtered table (``df_in`` itself is not modified; the index is not
        reset) and the number of EC numbers that were skipped.
    """
    to_drop = []
    n_skipped = 0
    for ec in ec_numbers:
        try:
            natural = [x.lower() for x in natural_by_ec[ec]['first_substrate']]
        except (KeyError, TypeError, AttributeError):
            # No entry for this EC, or no usable 'first_substrate' list in it.
            # Only lookup/data-shape errors are tolerated here, so programming
            # errors (e.g. a misspelled variable name) are no longer hidden.
            n_skipped += 1
            continue
        rows = df_in[df_in['EC'] == ec]
        to_drop.extend(rows.index[~rows['substrate'].isin(natural)])
    return df_in.drop(index=to_drop), n_skipped


# Restrict both tables to the substrates listed in `data` for each EC number.
# counter_* hold the number of EC numbers that could not be looked up.
df_KM, counter_KM = _drop_non_natural(df_KM, EC_KM, data)
# The kcat frame is named `df_kcat` (lower-case k); filtering must use that name.
df_kcat, counter_Kcat = _drop_non_natural(df_kcat, EC_Kcat, data)