# Fix downloaded dataset

### Redownload missing data

In [None]:
import os
import csv
import time
import requests

In [None]:
# Read the raw download line by line so that rows with an unexpected number
# of fields can be inspected.
fdir = '../../../data/databases/Sabio-RK/'
fname = 'dataset_download.tsv'
# The context manager closes the file even if reading raises.
with open(fdir+fname, 'r') as f:
    lines = f.readlines()

Some database entries are downloaded incompletely: their last line is mostly missing and lacks the trailing `\n`, for reasons that are unclear. These entries are fixed manually in the corresponding text files by redownloading them individually.

In [None]:
# Print every row that does not split into the expected number of
# tab-separated fields.
N_COLUMNS = 17  # field count of a complete row, as used by the original check
for line in lines:
    # Strip only the line terminator: a bare strip() would also remove
    # trailing tabs, so a row whose last columns are empty would be miscounted.
    data = line.rstrip('\r\n').split('\t')
    if len(data) != N_COLUMNS:
        print('EntryID:', data[0])
        print(line)

In [None]:
# URL that the re-download cells below pass to requests.get.
QUERY_URL = 'http://sabiork.h-its.org/sabioRestWebServices/kineticlawsExportTsv'
# Alternative ordering of the same requested fields, kept for reference:
#query = {'fields[]':['EntryID', 'Substrate', 'EnzymeType', 'Enzymename', 'PubMedID', 'Organism', 'UniprotID', 'ECNumber', 'Parameter', 'pH', 'Temperature']}
# Request parameters shared by all re-download cells; each cell adds its own 'q' entry.
query = {'fields[]':['EntryID', 'PubMedID', 'Organism', 'Substrate', 'EnzymeType', 'Enzymename', 'UniprotID', 'ECNumber', 'Parameter', 'pH', 'Temperature']}

In [None]:
# Extra code to rerun certain chunks/entries if needed
# Re-request a single entry and print the elapsed time and the response text.
start = time.perf_counter()  # monotonic clock: unaffected by system clock changes
query_string = "EntryID:57731"
query['q'] = query_string
# Without a timeout, requests waits indefinitely on an unresponsive server.
request = requests.get(QUERY_URL, params=query, timeout=120)
request.raise_for_status()
end = time.perf_counter()
print(end - start)
print(request.text)

In [None]:
# Extra code to rerun certain chunks/entries if needed
# Re-request a single entry and print the elapsed time and the response text.
start = time.perf_counter()  # monotonic clock: unaffected by system clock changes
query_string = "EntryID:58094"
query['q'] = query_string
# Without a timeout, requests waits indefinitely on an unresponsive server.
request = requests.get(QUERY_URL, params=query, timeout=120)
request.raise_for_status()
end = time.perf_counter()
print(end - start)
print(request.text)

In [None]:
# Extra code to rerun certain chunks/entries if needed
# Re-request a single entry and print the elapsed time and the response text.
start = time.perf_counter()  # monotonic clock: unaffected by system clock changes
query_string = "EntryID:58472"
query['q'] = query_string
# Without a timeout, requests waits indefinitely on an unresponsive server.
request = requests.get(QUERY_URL, params=query, timeout=120)
request.raise_for_status()
end = time.perf_counter()
print(end - start)
print(request.text)

In [None]:
# Extra code to rerun certain chunks/entries if needed
# Re-request a single entry and print the elapsed time and the response text.
start = time.perf_counter()  # monotonic clock: unaffected by system clock changes
query_string = "EntryID:60650"
query['q'] = query_string
# Without a timeout, requests waits indefinitely on an unresponsive server.
request = requests.get(QUERY_URL, params=query, timeout=120)
request.raise_for_status()
end = time.perf_counter()
print(end - start)
print(request.text)

In [None]:
# Extra code to rerun certain chunks/entries if needed
# Re-request a single entry and print the elapsed time and the response text.
start = time.perf_counter()  # monotonic clock: unaffected by system clock changes
query_string = "EntryID:61738"
query['q'] = query_string
# Without a timeout, requests waits indefinitely on an unresponsive server.
request = requests.get(QUERY_URL, params=query, timeout=120)
request.raise_for_status()
end = time.perf_counter()
print(end - start)
print(request.text)

In [None]:
# Extra code to rerun certain chunks/entries if needed
# Re-request a single entry and print the elapsed time and the response text.
start = time.perf_counter()  # monotonic clock: unaffected by system clock changes
query_string = "EntryID:65338"
query['q'] = query_string
# Without a timeout, requests waits indefinitely on an unresponsive server.
request = requests.get(QUERY_URL, params=query, timeout=120)
request.raise_for_status()
end = time.perf_counter()
print(end - start)
print(request.text)

In [None]:
# Extra code to rerun certain chunks/entries if needed
# Re-request a single entry and print the elapsed time and the response text.
start = time.perf_counter()  # monotonic clock: unaffected by system clock changes
query_string = "EntryID:68232"
query['q'] = query_string
# Without a timeout, requests waits indefinitely on an unresponsive server.
request = requests.get(QUERY_URL, params=query, timeout=120)
request.raise_for_status()
end = time.perf_counter()
print(end - start)
print(request.text)

In [None]:
# Extra code to rerun certain chunks/entries if needed
# Re-request a single entry and print the elapsed time and the response text.
start = time.perf_counter()  # monotonic clock: unaffected by system clock changes
query_string = "EntryID:71116"
query['q'] = query_string
# Without a timeout, requests waits indefinitely on an unresponsive server.
request = requests.get(QUERY_URL, params=query, timeout=120)
request.raise_for_status()
end = time.perf_counter()
print(end - start)
print(request.text)

In [None]:
# Extra code to rerun certain chunks/entries if needed
# Re-request a single entry and print the elapsed time and the response text.
start = time.perf_counter()  # monotonic clock: unaffected by system clock changes
query_string = "EntryID:75485"
query['q'] = query_string
# Without a timeout, requests waits indefinitely on an unresponsive server.
request = requests.get(QUERY_URL, params=query, timeout=120)
request.raise_for_status()
end = time.perf_counter()
print(end - start)
print(request.text)

### Resolve inconsistent formatting

In [None]:
# Load the whole raw download into a single string for text-level clean-up.
fdir = '../../../data/databases/Sabio-RK/'
fname = 'dataset_download.tsv'
raw_path = fdir + fname
with open(raw_path, mode='r') as handle:
    data = handle.read()

In [None]:
# Blank out placeholder fields (a lone dash, or only spaces) so the
# corresponding entries are treated as missing values.
import re

# The former chained str.replace calls removed the first character of ANY
# field starting with '-' or ' ' (a negative number would lose its sign).
# Matching the whole field between its delimiters leaves real values intact;
# the lookarounds do not consume the delimiters, so adjacent placeholders are
# all handled in one pass.
data = re.sub(r'(?<=\t)(?:-| +)(?=[\t\r\n]|\Z)', '', data)

In [None]:
# Store the cleaned text under a new file name in the same directory.
fname = 'Sabio-RK_dataset.tsv'
clean_path = fdir + fname
with open(clean_path, mode='w') as out_file:
    out_file.write(data)

# Extract and clean up $k_{cat}$ entries

### Load data

Load the dataset as pandas dataframe and inspect the specifics of each parameter and their associated values

In [None]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

In [None]:
# Parse the cleaned tab-separated file into a dataframe.
fdir = '../../../data/databases/Sabio-RK/'
fname = 'Sabio-RK_dataset.tsv'
dataset_path = fdir + fname
df = pd.read_csv(dataset_path, delimiter='\t')

In [None]:
len(np.unique(df['EntryID']))

In [None]:
df[df['parameter.startValue'] == 0]

### Extract $k_{cat}$ values

In [None]:
# Keep only the rows whose parameter type is 'kcat' and record how many remain.
mask = df['parameter.type'].eq('kcat')
df = df.loc[mask]
n0 = df.shape[0]
n0

Remove missing $k_{cat}$ values

In [None]:
# Report how many kcat rows lack a start value, then discard them.
mask = df['parameter.startValue'].notnull()
print("Number of entries with no assigned value:", n0 - np.sum(mask))
df = df.loc[mask]

In [None]:
print("Remaining number of entries:", len(df))

Remove unnecessary `parameter.name`, `parameter.type` and `parameter.associatedSpecies` columns

In [None]:
# These three columns are no longer needed once only kcat rows remain.
unused_columns = ['parameter.name', 'parameter.type', 'parameter.associatedSpecies']
df = df.drop(columns=unused_columns)

In [None]:
df

### Explore and reformat data entries

#### `EntryID`

In [None]:
len(np.unique(df['EntryID']))

Only 28k out of all 74k EntryIDs in the database have an associated $k_{cat}$ value.

#### `PubMedID`

These are parsed as floats but should be integers

In [None]:
np.unique(df['PubMedID'].astype(str))

In [None]:
df['PubMedID']

In [None]:
# Cast to pandas' nullable integer dtype ('Int64', capital I).
# NOTE(review): presumably chosen so that rows without a PubMedID survive the cast — confirm.
df['PubMedID'] = df['PubMedID'].astype('Int64')

#### `Substrate`

In the end we reduce the dataset to $k_{cat}$ values only. If I understand correctly, in the DLKcat paper, the authors associate the $k_{cat}$ value with the substrate that has a measured $K_M$ value. If the $K_M$ values are measured for multiple substrates in the same reaction, then this would result in separate data inputs of the form $\text{substrate}_1 \rightarrow k_{cat}$ and $\text{substrate}_2 \rightarrow k_{cat}$ (in the neural net framework this can lead to different `kcat` predictions depending on the input substrate). We want to construct an input vector that includes all the substrates involved in the reaction (ignoring whether it has a measured $K_M$ value) in the form $[\text{substrate}_1, \text{substrate}_2, \dotsc] \rightarrow k_{cat}$. 

This is mentioned in the paper's discussion: "Another challenge relates to reactions involving multiple substrates and those catalysed by heteromeric enzyme complexes. The multiple substrate SMILES and protein sequences that can be defined for such reactions can all function with DLKcat, thereby yielding multiple predicted kcat values for one reaction. We currently select the maximum kcat values in those cases, but it would be favourable to devise an approach that can predict one kcat value for each multi-substrate and/or heteromeric enzyme."

Each entry is a list of substrates separated by `;` -- split into an array of individual substrate strings.

In [None]:
# Split each ';'-separated substrate string into a sorted array of names.
# A missing value becomes an empty array instead of raising AttributeError
# (NaN is a float and has no .split), matching how the UniprotID and ECNumber
# columns are handled.
df['Substrate'] = df['Substrate'].apply(lambda x: np.empty(0) if pd.isnull(x) else np.sort(x.split(';')))

#### `EnzymeType`

In [None]:
df['EnzymeType']

In [None]:
# Collect the EnzymeType descriptions that contain the substring 'mutant'.
is_mutant = df['EnzymeType'].astype(str).str.contains('mutant', regex=False)
mutant_namelist = df['EnzymeType'][is_mutant]

In [None]:
# code from DLKcat
# Preview how each EnzymeType description would be simplified: descriptions
# containing 'wildtype' map to 'wildtype', all others to the '/'-joined
# matches of the mutation pattern. Nothing in df is modified here.
import re

# Raw string: '\d' inside a normal literal is an invalid escape sequence.
# Matches <capital letter><digits><capital letter>; compiled once, outside the loop.
MUTATION_PATTERN = re.compile(r'[A-Z]\d+[A-Z]')

for name in df['EnzymeType'].astype(str):
    if 'wildtype' in name:
        print(name + ' -> wildtype')
    else:
        # if 'mutant' in EnzymeType or 'mutated' in EnzymeType:
        mutant = MUTATION_PATTERN.findall(name)
        print(name + ' -> ' + '/'.join(mutant))

The modifications shown above, which simplify the mutant/wild-type descriptors, are performed in the DLKcat scripts. We omit them for now to keep things general.

#### `Organism`

In [None]:
np.unique(df['Organism'])

Keeping the organism list as is.

#### `UniprotID`

In [None]:
np.unique(df['UniprotID'][df['UniprotID'].apply(lambda x: type(x) != str)].astype(str))

Some entries do not have an associated UniprotID.

In [None]:
df.loc[[3]]

Some entries have multiple associated UniprotIDs (represent heteromeric protein complexes?). Detect such entries and list the UniprotIDs as an array

In [None]:
# Convert each space-separated UniprotID string into a sorted array;
# a missing value yields an empty array.
def _uniprot_ids(cell):
    if pd.isnull(cell):
        return np.empty(0)
    return np.sort(cell.split(' '))

df['UniprotID'] = df['UniprotID'].apply(_uniprot_ids)

In [None]:
# Show the rows whose UniprotID cell is exactly two characters long once
# converted to a string.
repr_length = df['UniprotID'].astype(str).str.len()
df[repr_length == 2]

#### `ECNumber`

Multiple ECNumbers may be associated with an entry -- putting them in arrays.

In [None]:
# Convert each space-separated ECNumber string into a sorted array;
# a missing value yields an empty array.
def _ec_numbers(cell):
    if pd.isnull(cell):
        return np.empty(0)
    return np.sort(cell.split(' '))

df['ECNumber'] = df['ECNumber'].apply(_ec_numbers)

In [None]:
# Flag the rows where both the ECNumber array and the UniprotID array are empty.
no_ec_number = df['ECNumber'].apply(len) == 0
no_uniprot_id = df['UniprotID'].apply(len) == 0
mask = no_ec_number & no_uniprot_id

In [None]:
np.sum(mask)

Remove entries that are missing both `ECNumber` and `UniprotID` (have no associated enzyme information) 

In [None]:
df = df.loc[~mask]

#### `parameter`

Some database entries have multiple associated $k_{cat}$ values. In DLKcat, the duplicates are removed by selecting the maximum $k_{cat}$ value as the ground truth. This is not necessarily the best approach as the multiple values may reflect experimental uncertainty or alternative specific experimental conditions. However, this is difficult to parse in an automated way, and dealing with such values more rigorously may require manual labour...

Finally, note that in some cases the $k_{cat}$ values have not only an associated `parameter.startValue` but also an extra `parameter.endValue` (indicating an interval of possible values) or an associated `parameter.standardDeviation`. It's questionable whether these could be used as extra inputs.

In [None]:
# List every EntryID that has more than one kcat row, then print the total.
# value_counts tallies all IDs in one pass; the former loop rescanned the
# whole column once per unique ID (quadratic in the number of rows).
# sort_index keeps the ascending ID order that np.unique produced.
counts = df['EntryID'].value_counts(sort=False).sort_index()
repeated = counts[counts > 1]
for entryID, n in repeated.items():
    print('EntryID %d has %d kcat values' % (entryID, n))
i = len(repeated)
print('Number of entries with more than one kcat: %d' %i)

In [None]:
df[df['PubMedID'] == 15311923]

A few are due to duplicated data entries. Remove those

In [None]:
# Rows are compared via their string form; the first occurrence of each is kept.
unique_rows = df.astype(str).drop_duplicates()
df = df.loc[unique_rows.index]

In [None]:
len(df)

In [None]:
df.loc[df.astype(str).drop_duplicates().index]

In [None]:
# List every EntryID that still has more than one kcat row, then print the total.
# value_counts tallies all IDs in one pass; the former loop rescanned the
# whole column once per unique ID (quadratic in the number of rows).
# sort_index keeps the ascending ID order that np.unique produced.
counts = df['EntryID'].value_counts(sort=False).sort_index()
repeated = counts[counts > 1]
for entryID, n in repeated.items():
    print('EntryID %d has %d kcat values' % (entryID, n))
i = len(repeated)
print('Number of entries with more than one kcat: %d' %i)

#### `parameter.endValue`

List the number of $k_{cat}$ values with an associated `endValue`

In [None]:
np.sum(~df['parameter.endValue'].isnull())

Relatively few entries exist, so ignoring `endValue` seems to be the way to go. Discard any entries that have both `startValue` and `endValue` defined, and remove the `parameter.endValue` column completely.

In [None]:
# Keep only the rows without an endValue, then drop the column itself.
mask = df['parameter.endValue'].isnull()
df = df.loc[mask].drop(columns='parameter.endValue')
len(df)

#### `parameter.standardDeviation`

List the number of $k_{cat}$ values with an associated `standardDeviation`

In [None]:
np.sum(~df['parameter.standardDeviation'].isnull())

Standard deviation is given for a relatively good chunk -- might be worth considering.

#### `parameter.unit`

Units are not given in certain cases.

In [None]:
# Show the rows that have no unit.
unit_missing = df['parameter.unit'].isnull()
df[unit_missing]

Having checked the paper for EntryID: 41969, the units are given as $s^{-1}$, so it's a human error in this case. Probably okay to assume that all `NaN` units can be converted to $s^{-1}$ (as done in DLKcat). 

Some $k_{cat}$ values have unusual units associated with them. Some of these appear to be errors, which also lead to wrong values because of an automatic unit conversion step.

In [None]:
np.unique(df['parameter.unit'].astype(str))

In [None]:
df[df['parameter.unit'] == 'J/mol']

Having checked the paper, this is a typo (the unit should be $s^{-1}$).

In [None]:
df[df['parameter.unit'] == 'M']

Clean up the units. Keep only `s^(-1)` and convert `mol*s^(-1)*mol^(-1)` and `NaN` to `s^(-1)`

In [None]:
# Missing units and 'mol*s^(-1)*mol^(-1)' are both rewritten as 's^(-1)'.
unit_column = df['parameter.unit']
mask = unit_column.isnull() | unit_column.eq('mol*s^(-1)*mol^(-1)')
df.loc[mask, 'parameter.unit'] = 's^(-1)'

In [None]:
# Keep only the rows whose unit is 's^(-1)'.
mask = df['parameter.unit'].eq('s^(-1)')
df = df.loc[mask]

#### `pH`

In [None]:
np.sum(~df['pH'].isnull())

#### `Temperature`

In [None]:
np.sum(~df['Temperature'].isnull())

Most entries seem to have an associated pH and temperature.

In [None]:
df

# Save dataset

In [None]:
# Give the kept columns their final names in one call, then drop the unit column.
final_names = {
    'parameter.startValue': 'Value',
    'parameter.standardDeviation': 'StandardDeviation',
    'UniprotID': 'UniProtID',
    'Enzymename': 'EnzymeName',
}
df = df.rename(columns=final_names)
df = df.drop(columns='parameter.unit')

In [None]:
# Select and order the columns that are written to disk.
output_columns = [
    'PubMedID', 'Organism', 'Substrate', 'ECNumber', 'EnzymeName',
    'EnzymeType', 'UniProtID', 'pH', 'Temperature', 'Value',
    'StandardDeviation',
]
df = df[output_columns]

In [None]:
# Drop rows that are identical in their string form, keeping the first
# occurrence, and renumber the index from zero.
mask = df.astype(str).drop_duplicates().index
df = df.loc[mask].reset_index(drop=True)
len(df)

In [None]:
# Discard the rows whose kcat value is exactly zero and renumber the index.
mask = df['Value'].ne(0)
df = df.loc[mask].reset_index(drop=True)
len(df)

In [None]:
fdir = '../../../data/databases/Sabio-RK/'

In [None]:
df.to_csv(fdir+'kcats.csv', index=False)

In [None]:
# Write the table as a JSON list with one object per row.
# orient='records' never emits the index, which is what the former
# index=False asked for; that flag is rejected with a ValueError under the
# default orient ('columns').
df.to_json(fdir+'kcats.json', orient='records')