## Load data

In [None]:
import pandas as pd
import numpy as np
import re
import csv

In [None]:
# Show the current working directory. The bare `pwd` relies on IPython's automagic,
# so it only works inside a notebook/IPython session. The relative `fdir` path used
# by the following cells is resolved against this directory.
pwd

In [None]:
# Directory that holds every input and output file of this notebook. File names are
# appended by plain string concatenation (fdir+'name'), so the trailing slash is required.
fdir = '../../data/databases/'

In [None]:
# Read the merged kcat records from the database directory into a DataFrame.
df = pd.read_json(f'{fdir}kcats_merged.json')

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

## Retrieve SMILES

Fix the Sabio entries and look up SMILES. The goal is a bare-bones dataset in which every entry has clearly associated UniProt ID and SMILES information.

Run external script `pubchem_SMILES.py`

In [None]:
# Extract all unique substrates
#all_substrates = np.unique(np.hstack(df['Substrate'].to_list()))
#print(all_substrates.shape)
#np.save('../../data/databases/unique_substrates.npy', all_substrates)

# Run the separate script 'pubchem_SMILES.py' to generate substrate-SMILES map

In [None]:
# Load the substrate-name -> SMILES lookup produced by the external `pubchem_SMILES.py` run.
# newline='' is what the csv module asks for, so quoted fields that contain line
# breaks are parsed correctly on every platform.
with open(fdir+'substrate_SMILES_map.csv', 'r', newline='') as f:
    # Column 0 is the lookup key, column 1 the mapped value; blank lines are skipped
    # instead of raising IndexError.
    substrate_smiles = {fields[0]: fields[1] for fields in csv.reader(f) if fields}

In [None]:
# One list of SMILES strings per row, aligned with that row's `Substrate` list.
# A substrate that is missing from the map, or mapped to an empty string, becomes
# None, so a single unmapped name no longer aborts the cell with KeyError.
smiles_list = [
    [substrate_smiles.get(sub) or None for sub in substrates]
    for substrates in df['Substrate']
]

## Retrieve UniProtIDs

### Retrieve extra UniProtID information from the existing data

In [None]:
# Build an (Organism, ECNumber) -> [UniProtID list, ...] map from the rows that already
# carry UniProt IDs, to see whether it uniquely resolves the rows that have none.
# Each distinct ID list is stored once, in first-seen order (no set round-trip, so the
# result is deterministic and nothing is rebuilt on every append). Keys seen only on
# rows without IDs map to an empty list.
org_ec_uniprot_map = {}
for _, record in df.iterrows():
    # ECNumber is stringified so it can be part of a hashable dictionary key.
    key = (record.Organism, str(record.ECNumber))
    known_id_lists = org_ec_uniprot_map.setdefault(key, [])
    uniprot_ids = list(record.UniProtID)
    if uniprot_ids and uniprot_ids not in known_id_lists:
        known_id_lists.append(uniprot_ids)

In [None]:
#pd.options.mode.chained_assignment = None
# Shape of the subset of rows whose UniProtID list is still empty.
df.loc[df['UniProtID'].map(len) == 0].shape

In [None]:
# Fill in rows without a UniProtID whenever their (Organism, ECNumber) pair maps to
# exactly one known UniProtID list; ambiguous or unknown pairs are left untouched.
rows_without_id = df[df['UniProtID'].apply(lambda ids: len(ids) == 0)]
for idx, record in rows_without_id.iterrows():
    candidates = org_ec_uniprot_map.get((record.Organism, str(record.ECNumber)))
    if candidates is not None and len(candidates) == 1:
        df.at[idx, 'UniProtID'] = candidates[0]

In [None]:
# Shape of the subset of rows that still have an empty UniProtID list after the map-based fill.
df.loc[df['UniProtID'].map(len) == 0].shape

### Retrieve extra UniProtIDs using UniProt API

Run external script `get_UniProtIDs.py`

In [None]:
# Disabled one-off export, kept inside a string literal so it does not execute.
# For every row without a UniProtID it builds four search queries (EC number + organism,
# optionally narrowed by enzyme name and/or substrates), de-duplicates them while keeping
# first-seen order, and saves them to 'all_ECNumber_organism_queries.npy' -- presumably
# the input of get_UniProtIDs.py (verify against that script).
'''
def create_UniProtID_request(row):
    ECNumber = row.ECNumber[0]
    organism = row.Organism
    # NOTE: could increase the number of hits by not using reviewed:true option, i.e. including UniProtKB unreviewed (TrEMBL) entries
    query0 = "(ec:%s) AND (organism_name:\"%s\")" % (ECNumber, organism)
    EnzymeName = row.EnzymeName
    query1 = query0 + " AND \"%s\"" % (EnzymeName)
    substrate_query = ' OR '.join(['\"'+sub+'\"' for sub in row.Substrate])
    query2 = query0 + ' AND ('+substrate_query+')'    
    query3 = query1 + ' AND ('+substrate_query+')'
    
    return [query0, query1, query2, query3]

all_queries = []
for row in df[df['UniProtID'].apply(lambda x: len(x) == 0)].iterrows():
    #print("Row "+str(row[0]))
    queries = create_UniProtID_request(row[1])
    for query in queries:
        all_queries.append(query)
    #print(queries)
    #print("--------------------------")

print(len(all_queries))
indexes = np.unique(all_queries, return_index=True)[1]
all_queries = [all_queries[index] for index in sorted(indexes)]
print(len(all_queries))

np.save(fdir+'all_ECNumber_organism_queries.npy', all_queries)
'''

# RUN get_UniProtIDs.py

In [None]:
# Load the query -> UniProtID lookup (see the `get_UniProtIDs.py` step above).
# newline='' is what the csv module asks for, so quoted fields that contain line
# breaks are parsed correctly on every platform.
with open(fdir+'UniProtKB_query_map.csv', 'r', newline='') as f:
    # Column 0 is the query string, column 1 the mapped value (an empty string is
    # treated as "no hit" by retrieve_UniProtID); blank lines are skipped.
    query_map = {fields[0]: fields[1] for fields in csv.reader(f) if fields}

In [None]:
def retrieve_UniProtID(row, query_lookup=None):
    """Look up a UniProt ID for a row by trying progressively narrower queries.

    The queries are tried in this order and the first non-empty hit wins:
      1. EC number + organism
      2. (1) + enzyme name
      3. (1) + any of the substrates
      4. (2) + any of the substrates

    Parameters
    ----------
    row : object exposing ``ECNumber`` (indexable, first element is used), ``Organism``,
        ``EnzymeName`` and ``Substrate`` (iterable of strings), e.g. a DataFrame row.
    query_lookup : mapping of query string -> UniProt ID, optional
        Defaults to the notebook-level ``query_map``.

    Returns
    -------
    list
        ``[uniprot_id]`` for the first query with a non-empty entry, otherwise ``[]``.
        A query that is absent from the lookup counts as "no hit" instead of raising
        KeyError.
    """
    if query_lookup is None:
        query_lookup = query_map

    base_query = "(ec:%s) AND (organism_name:\"%s\")" % (row.ECNumber[0], row.Organism)

    def candidate_queries():
        # Generated lazily so the later, more specific queries are only built
        # when the earlier ones produced no hit.
        yield base_query
        named_query = base_query + " AND \"%s\"" % (row.EnzymeName)
        yield named_query
        substrate_clause = ' AND (' + ' OR '.join('"' + sub + '"' for sub in row.Substrate) + ')'
        yield base_query + substrate_clause
        yield named_query + substrate_clause

    for query in candidate_queries():
        uniprotid = query_lookup.get(query)
        if uniprotid:
            return [uniprotid]

    return []

In [None]:
# Second pass: for every row that still has no UniProtID, store whatever the
# query-based lookup returns (an empty list when nothing was found).
still_unresolved = df['UniProtID'].apply(lambda ids: len(ids) == 0)
for idx, record in df[still_unresolved].iterrows():
    df.at[idx, 'UniProtID'] = retrieve_UniProtID(record)

In [None]:
# Shape of the subset of rows that remain without a UniProtID after the query-based lookup.
df.loc[df['UniProtID'].map(len) == 0].shape

## Retrieve UniProt sequences

Run external script `get_UniProtKB_sequences_mp.py`

In [None]:
# Disabled one-off export, kept inside a string literal so it does not execute:
# collects the unique UniProt IDs over all rows and saves them to 'all_UniProtIDs.npy'
# -- presumably the input of get_UniProtKB_sequences_mp.py (verify against that script).
'''
UniProtIDs = np.unique(np.concatenate(df['UniProtID']))
print(len(UniProtIDs))
np.save(fdir+'all_UniProtIDs.npy', UniProtIDs)
'''

#RUN get_UniProtKB_sequences_mp.py

In [None]:
# Load the UniProtID -> protein sequence lookup (see the `get_UniProtKB_sequences_mp.py`
# step above). newline='' is what the csv module asks for, so quoted fields that
# contain line breaks are parsed correctly on every platform.
with open(fdir+'UniProtKB_sequence_map.csv', 'r', newline='') as f:
    # Column 0 is the UniProt ID, column 1 the mapped value; blank lines are skipped
    # instead of raising IndexError.
    uniprotid_seq_map = {fields[0]: fields[1] for fields in csv.reader(f) if fields}

In [None]:
def parse_seq(enzymetype, seq):
    """Derive a mutant sequence from a reference sequence and an enzyme-type annotation.

    Point mutations written as ``<old residue><position><new residue>`` (e.g. ``A251C``)
    are applied one by one. If the first mutation does not match at its stated position
    but does match one position further on, that +1 offset is applied to every mutation
    (and passed on to ``delete_seq``). If the annotation also contains "del" (any case),
    the mutated sequence is handed to ``delete_seq`` and its result is returned.

    Parameters
    ----------
    enzymetype : str
        Free-text annotation, e.g. ``'mutant A251C/S430C'`` or ``'mutant delta 61-66'``.
    seq : str
        Reference protein sequence.

    Returns
    -------
    str or None
        The modified sequence, or None when the annotation cannot be applied:
        "bridged" variants, a residue that does not match the sequence, a position
        outside the sequence, or an annotation with neither a point mutation nor a
        deletion.
    """
    if "bridged" in enzymetype: # not sure what these mean
        # e.g. mutant A251C:S430C disulfide-bridged
        return None

    shift_index = False
    mutations = re.findall(r'\b([A-Z]\d+[A-Z])', enzymetype)
    residues = list(seq)

    def holds(position, residue):
        # Bounds are checked before indexing: a position past the end must not raise,
        # and position -1 (annotation "X0Y") must not wrap around to the last residue.
        return 0 <= position < len(residues) and residues[position] == residue

    for ind, mutation in enumerate(mutations):
        s1 = mutation[0]
        s2 = mutation[-1]
        i = int(mutation[1:-1]) - 1  # annotation positions are 1-based

        if ind == 0:
            # Only the first mutation decides whether the numbering is shifted by one.
            if holds(i, s1):
                pass
            elif holds(i + 1, s1):
                print("Shift index: i -> i+1")
                shift_index = True
            else:
                print("Peptides do no match")
                print("IGNORE")
                return None # if the peptides do not match

        i = i + 1 if shift_index else i
        print("Change %s to %s at position %s" % (s1, s2, i))
        if holds(i, s1):
            residues[i] = s2
        else:
            print("Peptides do no match")
            print("IGNORE")
            return None # if the peptides do not match

    mutated = "".join(residues)
    enzymetype = enzymetype.upper()
    if "DEL" in enzymetype:
        # The deletion result has to be returned; without the `return` every
        # deletion mutant silently ended up as None.
        return delete_seq(enzymetype, mutated, shift_index)
    if mutations:
        return mutated

    print("IGNORE")
    return None
        
# this resolves some of the entries with delta argument
def delete_seq(enzymetype, seq, shift_index):
    """Apply the deletion described by a 'delta...' annotation to a sequence.

    The annotation is upper-cased and matched against the patterns below, in this
    order; the first pattern that matches decides the outcome.

      * contains TRUNCATED or DELETED                    -> ignored
      * ``DELTA<n> N-TERMINAL HIS`` / ``C-TERMINAL HIS`` -> ignored (ambiguous)
      * ``DELTA <n>`` / ``DELTA<n>``                     -> delete residue n
      * ``DELTA <a>-<b>``, ``DELTA(<a>-<b>)``, ``(DELTA<a>-<b>)`` -> delete residues a..b
      * ``DELTAN<n>``                                    -> delete the first n residues
      * ``DELTA<X><n>`` / ``DELTA <X><n>``               -> delete residue X at position n
      * ``DELTA<X><a>-<Y><b>``                           -> delete residues a..b (X at a, Y at b)
      * ``DELTA<n>/``                                    -> delete residue n

    Positions are 1-based. When ``shift_index`` is true every position is moved by +1,
    except for ``DELTAN<n>``. For the residue-qualified forms, if the named residue is
    not at the stated position but is found one position further on (and no shift is
    already in force), the deletion is applied one position further on.

    Parameters
    ----------
    enzymetype : str
        Annotation text (any case).
    seq : str
        Sequence to modify.
    shift_index : bool
        Whether the caller already detected a +1 numbering offset.

    Returns
    -------
    str or None
        The shortened sequence, or None (after printing "IGNORE") when no pattern
        applies, a position lies outside the sequence, or a named residue does not match.
    """
    def ignore():
        print("IGNORE")
        return None

    def cut(first, last):
        # Drop the 1-based, inclusive residue span first..last.
        return seq[:first - 1] + seq[last:]

    offset = 1 if shift_index else 0
    n = len(seq)
    # Upper-case first so the keyword checks also work for mixed-case input.
    enzymetype = enzymetype.upper()

    #TODO: regex could be rewritten in a much cleaner manner
    if ('TRUNCATED' in enzymetype) or ('DELETED' in enzymetype): # or ('tag' in x) -- ignoring the His-tag
        return ignore()

    # "delta39 C/N-terminal His6-tag" -> unclear if (delete 39 from C/N-terminal, and then add histag, or delete 39 from N/N terminal, and add histag at C/N-terminal)
    # The text is upper-case at this point, so the pattern has to be upper-case as well.
    if re.search('DELTA[0-9]+ [NC]-TERMINAL HIS', enzymetype):
        return ignore()

    # 'mutant IMP-1 delta 61' (tried first), then 'mutant delta306'
    for pattern in ('\\bDELTA ([0-9]+)(\\Z|[ ])', '\\bDELTA([0-9]+)(\\Z|[ ])'):
        res = re.search(pattern, enzymetype)
        if res:
            i = int(res.group(1)) + offset
            print("Delete at:", i)
            # i >= 1 keeps a "position 0" annotation from producing a wrapped slice.
            if 1 <= i < n:
                return cut(i, i)
            return ignore()

    # 'mutant IMP-1 delta 61-66' OR 'mutant delta411-551', then
    # 'mutant IMP-1 delta(61-66)', then 'mutant (delta1-184)'
    for pattern in ('\\bDELTA[ ]*([0-9]+)-([0-9]+)(\\Z|[, ])',
                    '\\bDELTA[(]([0-9]+)-([0-9]+)[,)](\\Z|[ ])',
                    '[(]DELTA([0-9]+)-([0-9]+)[,)]'):
        res = re.search(pattern, enzymetype)
        if res:
            i1 = int(res.group(1)) + offset
            i2 = int(res.group(2)) + offset
            print("Delete from %s to %s" % (i1, i2))
            # A reversed or zero-based range would duplicate residues instead of deleting.
            if 1 <= i1 <= i2 < n:
                return cut(i1, i2)
            return ignore()

    #peptide sequences are written N-terminus to C-terminus, left to right
    # 'mutant deltaN60' -> delete the first 60 entries
    res = re.search('\\bDELTAN([0-9]+)(\\Z|[ ])', enzymetype)
    if res:
        i = int(res.group(1))
        print("Delete from 1 to %s" % (i))
        if n > i:
            return seq[i:]
        return ignore()

    # 'mutant deltaF272' (tried first), then 'mutant delta F272'
    for pattern in ('\\bDELTA([A-Z])([0-9]+)(\\Z|[ ])', '\\bDELTA ([A-Z])([0-9]+)(\\Z|[ ])'):
        res = re.search(pattern, enzymetype)
        if res:
            s = res.group(1)
            i = int(res.group(2)) + offset
            print("Delete %s at position %s" % (s, i))
            # Bounds are tested before indexing so a position past the end is
            # rejected instead of raising IndexError.
            if 1 <= i <= n and seq[i - 1] == s:
                return cut(i, i)
            if not shift_index and i + 1 <= n and seq[i] == s:
                print("Shift index: i -> i+1")
                return cut(i + 1, i + 1)
            print("Peptides do not match")
            return ignore()

    # 'mutant deltaP321-V363'
    res = re.search('\\bDELTA([A-Z])([0-9]+)-([A-Z])([0-9]+)(\\Z|[ ])', enzymetype)
    if res:
        s1, s2 = res.group(1), res.group(3)
        i1 = int(res.group(2)) + offset
        i2 = int(res.group(4)) + offset
        print("Delete from %s at position %s to %s at position %s" % (s1, i1, s2, i2))
        if 1 <= i1 <= i2 <= n and seq[i1 - 1] == s1 and seq[i2 - 1] == s2:
            return cut(i1, i2)
        if not shift_index and 0 <= i1 <= i2 < n and seq[i1] == s1 and seq[i2] == s2:
            print("Shift index: i -> i+1")
            # Both ends move by exactly one position.
            return cut(i1 + 1, i2 + 1)
        print("Peptides do not match")
        return ignore()

    # 'mutant delta310/N305L/M306I'
    res = re.search('\\bDELTA([0-9]+)/', enzymetype)
    if res:
        i = int(res.group(1)) + offset
        print("Delete at %s" % (i))
        if 1 <= i < n:
            return cut(i, i)
        return ignore()

    return ignore()

In [None]:
# Build one entry per row of `df`:
#   None        -> no UniProtID, or an annotation that cannot be interpreted
#   [seq, ...]  -> one sequence per UniProtID for wildtype-like rows (None where the
#                  ID has no entry in uniprotid_seq_map)
#   [seq]       -> the parsed mutant sequence; [None] when it cannot be derived
keywords = ['recombinant', 'isoenzyme', 'isozyme', 'carboxymethylated', 'native', 'soluble']
seq_list = []
for row_label, record in df.iterrows():
    print(row_label)
    EnzymeType = record['EnzymeType']
    UniProtIDs = record['UniProtID']
    print(EnzymeType)
    has_keyword = any(keyword in EnzymeType for keyword in keywords)
    if len(UniProtIDs) == 0:
        seq_list.append(None)
        print("No UniProtIDs -- no protein sequences")
    elif ('wildtype' in EnzymeType) and ('mutant' in EnzymeType or 'delta' in EnzymeType):
        seq_list.append(None)
        print("Unclear how to parse")
    elif ('wildtype' in EnzymeType) or (len(EnzymeType) == 0) or \
         (has_keyword and not('mutant' in EnzymeType)):
        print("Append the protein sequence(s)")
        seq_list.append([uniprotid_seq_map.get(uniprotid) for uniprotid in UniProtIDs])
    elif len(UniProtIDs) > 1: # unclear
        print("Unclear how to parse due to multiple UniProtIDs")
        seq_list.append(None)
    else:
        # Start from None on every row: without this, a row whose UniProtID has no
        # stored sequence would reuse the sequence left over from an earlier row.
        seq = None
        seq0 = uniprotid_seq_map.get(UniProtIDs[0])
        if seq0 is not None:
            seq = parse_seq(EnzymeType, seq0)
        seq_list.append([seq])
    print("-------------------------------")
    #if not('wildtype' in enzymetype):
    #    print(row[0])
    #    print(row[1]['UniProtID'])
    #    print(enzymetype)
    #    delete_seq(enzymetype)
    #    print("----------------------------------------------------------")

## Add data

In [None]:
# Attach the per-row sequence and SMILES lists (both aligned with the rows of df) as new columns.
df = df.assign(ProteinSequences=seq_list, SubstrateSMILES=smiles_list)

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Number of rows with no sequence list at all, or with at least one missing sequence in it.
df['ProteinSequences'].apply(lambda seqs: seqs is None or None in seqs).sum()

In [None]:
# Number of rows in which at least one substrate has no SMILES.
df['SubstrateSMILES'].apply(lambda smiles: None in smiles).sum()

In [None]:
# Number of rows that are incomplete in either respect: a missing sequence or a missing SMILES.
lacks_sequence = df['ProteinSequences'].apply(lambda seqs: seqs is None or None in seqs)
lacks_smiles = df['SubstrateSMILES'].apply(lambda smiles: None in smiles)
(lacks_sequence | lacks_smiles).sum()

In [None]:
# Drop the EnzymeName column, then remove rows that are duplicates once every cell is
# rendered as a string (the list-valued columns are not hashable, so they cannot be
# compared directly). The first occurrence of each duplicate group is kept.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
df = df.drop(columns=['EnzymeName'], errors='ignore')
# A boolean mask selects rows by position, so it stays correct even if the index
# contains repeated labels.
is_duplicate = df.astype(str).duplicated()
df = df.loc[~is_duplicate].reset_index(drop=True)
len(df)

In [None]:
# Write the final dataset with the default orient. `index=False` is not passed because
# the default 'columns' orient does not support it (older pandas releases raise
# ValueError for that combination -- see the DataFrame.to_json documentation). The index
# was reset to 0..n-1 in the de-duplication step, so it carries no information.
df.to_json(fdir+'kcats_merged_SMILES_UniProt.json')