In [1]:
import pandas as pd, re, json

# Load the KM and kcat tables. Both files are tab-separated; the column names are
# supplied here through `names`, so every line of each file is read as a data row.
df_KM = pd.read_csv(
    "data/min_KM.txt",
    sep="\t",
    header=None,
    names=["EC", "substrate", "organism", "KM", "star"],
)
df_kcat = pd.read_csv(
    "data/max_KCAT.txt",
    sep="\t",
    header=None,
    names=["EC", "substrate", "organism", "kcat", "star"],
)

In [2]:
# Drop the 'star' column from both tables.
df_KM = df_KM.drop(columns='star')
df_kcat = df_kcat.drop(columns='star')

In [3]:
# Strip the 'EC' tag from the EC column of both tables.
def trim(x):
    """Return the piece of `x` that follows its first 'EC'.

    The result is `x.split('EC')[1]` whenever `x` contains 'EC'. A value with
    no 'EC' in it is returned unchanged, so re-running this cell on data that
    has already been trimmed is a no-op instead of an IndexError.
    """
    pieces = x.split('EC')
    return pieces[1] if len(pieces) > 1 else x


df_KM['EC'] = df_KM['EC'].apply(trim)
df_kcat['EC'] = df_kcat['EC'].apply(trim)

In [4]:
# Keep only the part of each organism entry that precedes the first '//'.
def trim(x):
    """Return `x` up to (not including) its first '//'; all of `x` if there is none."""
    return x.partition('//')[0]


df_KM['organism'] = df_KM['organism'].apply(trim)
df_kcat['organism'] = df_kcat['organism'].apply(trim)

In [5]:
# Remove D-, L-, DL- and similar from the beginning of substrates
def remove_dl(substrate):
    """Strip one leading stereo-descriptor prefix from a substrate name.

    Recognised prefixes, matched only at the very start of the string and only
    in lowercase: 'd-', 'l-', 'dl-', '(r)-', '(s)-', '(+)-' and '(-)-'.
    At most one prefix is removed per call, so '(s)-(+)-x' becomes '(+)-x'.

    Parameters
    ----------
    substrate : str
        Substrate name.

    Returns
    -------
    str
        The name without the matched prefix, or the name unchanged when no
        prefix matches.
    """
    # Raw string literal: the backslashes are regex escapes, not Python string
    # escapes. Written as a plain literal, '\(' '\+' '\-' are invalid escape
    # sequences and trigger a warning on current Python versions.
    regex = r'^d-|^l-|^dl-|^\(r\)-|^\(s\)-|^\(\+\)-|^\(\-\)-'
    return re.sub(regex, '', substrate)

# Run remove_dl over the substrate column of both tables.
for _table in (df_KM, df_kcat):
    _table['substrate'] = _table['substrate'].apply(remove_dl)

In [6]:
# For every (EC, substrate, organism) combination keep only the row with the lowest KM.
km_keys = ['EC', 'substrate', 'organism']
# Ascending sort: within each key the smallest KM comes first.
df_KM = df_KM.sort_values(by=km_keys + ['KM'])
# True for the first row of each key, False for the later (higher-KM) duplicates.
keep_min = ~df_KM.duplicated(subset=km_keys)
df_KM = df_KM.loc[keep_min].reset_index(drop=True)

In [7]:
# For every (EC, substrate, organism) combination keep only the row with the highest kcat.
kcat_keys = ['EC', 'substrate', 'organism']
# Keys ascending, kcat descending: within each key the largest kcat comes first.
df_kcat = df_kcat.sort_values(by=kcat_keys + ['kcat'], ascending=[True, True, True, False])
# True for the first row of each key, False for the later (lower-kcat) duplicates.
keep_max = ~df_kcat.duplicated(subset=kcat_keys)
df_kcat = df_kcat.loc[keep_max].reset_index(drop=True)

In [8]:
# Outer-join the kcat and KM tables on the (EC, substrate, organism) key, so a
# combination present in only one table is kept with NaN in the other table's
# value column. The key is spelled out rather than left to pandas' default of
# "all columns the two frames have in common", which would silently change the
# join if either frame carried an extra shared column.
df = pd.merge(df_kcat, df_KM, how='outer', on=['EC', 'substrate', 'organism'])

In [9]:
# Per-row ratio kcat / KM (presumably the catalytic efficiency; units depend on
# the source files and should be confirmed). NaN wherever kcat or KM is missing.
df['e'] = df['kcat'].div(df['KM'])
df.head(20)

Unnamed: 0,EC,substrate,organism,kcat,KM,e
0,1.1.1.1,(+)-1-indanol,thermus thermophilus,61.4,4.2,14.619048
1,1.1.1.1,(+)-alpha-tetraol,thermus thermophilus,57.0,4.2,13.571429
2,1.1.1.1,(-)-1-phenylethanol,thermus thermophilus,1.1,18.1,0.060773
3,1.1.1.1,(2e)-but-2-en-1-ol,yokenella sp.,101.0,9.1,11.098901
4,1.1.1.1,(2e)-but-2-enal,yokenella sp.,405.0,3.3,122.727273
5,1.1.1.1,1-butanol,aeropyrum pernix,0.41,0.596,0.687919
6,1.1.1.1,1-butanol,mesocricetus auratus,7.5,,
7,1.1.1.1,1-butanol,rattus norvegicus,48.8,0.17,287.058824
8,1.1.1.1,1-butanol,sulfolobus solfataricus,3.1,0.08,38.75
9,1.1.1.1,1-heptanol,sulfolobus solfataricus,1.6,0.038,42.105263


In [10]:
# Number of rows in the merged table
print(len(df))

# NaN count for each column
df.isna().sum().reset_index(name='NaN')

71516


Unnamed: 0,index,NaN
0,EC,0
1,substrate,0
2,organism,0
3,kcat,41625
4,KM,5273
5,e,46898
