In [1]:
from glob import glob
import pandas as pd
import numpy as np
from Bio import PDB
import os
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles

### 1. Load complex list

In [2]:
pdb_path = os.path.abspath("../datasets/examples/COACH420/")
info_path = os.path.abspath("../datasets/examples/")

In [3]:
complex_list = os.listdir(pdb_path)

### 2. Run chimera

In [4]:
%%bash -s $pdb_path

path=$1

for file in $path/*/*'.pdb';do
    output_path=${file%.pdb}"_chimera.pdb"
    echo -e "open $file \n write format pdb 0 $output_path \n stop" | chimera --nogui
done > chimer.log

no model ids match "0"

no model ids match "0"

no model ids match "0"

no model ids match "0"

no model ids match "0"

no model ids match "0"

no model ids match "0"

no model ids match "0"

no model ids match "0"

no model ids match "0"



### 3.Load ligand code

In [5]:
def read_file(file):
    return file.readlines()

def preprocessing_lig_code(lines):
    lig_info = dict()
    for line in lines:
        if "#" not in line:
            line_list = line.strip().split(" ")
            pdb = line_list[0].split("/")[1].split(".")[0]
            ligand = line_list[2].split(",")
            lig_info[pdb] = ligand
            
    return lig_info

In [6]:
code_path = os.path.abspath("../datasets/examples/")

In [7]:
lig_info = preprocessing_lig_code(read_file(open(f"{code_path}/COACH420_ligand_code.txt", "r")))

### 4. Load Binding sites info

In [8]:
from scipy.spatial import distance_matrix
from multiprocessing import Process, Queue, Pool

In [9]:
pdb_parser = PDB.PDBParser(QUIET=True)

In [10]:
amino_acids_short = {"ALA":"A", "ARG":"R", "ASN":"N", "ASP":"D", "CYS":"C", "GLU":"E", "GLN":"Q", "GLY":"G", "HIS":"H", "ILE":"I", "LEU":"L", "LYS":"K", "MET":"M", "PHE":"F", "PRO":"P", "SER":"S", "THR":"T", "TRP":"W", "TYR":"Y", "VAL":"V", "SEC":"U", "PYL":"O"}

In [11]:
def parallelize_dataframe(df, func, num_partitions=5):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    results = pool.map(func, df_split)
    pool.close()
    pool.join()
    return results

In [12]:
def get_binding_sites(protein_coords, ligand_coords, protein_atom_residues):
    P_L_distance_matrix = distance_matrix(protein_coords, ligand_coords)
    return sorted(list(set(protein_atom_residues[np.where(P_L_distance_matrix<=4.)[0]])))

In [13]:
def get_info(pdb):
    """ Load protein info """
    structure = pdb_parser.get_structure(pdb, f"{pdb_path}/{pdb}/{pdb}_chimera.pdb")
    
    chain_name_list, pdb_sequence_list, seq_lengths_list, protein_atom_coords, protein_atom_residue_list, reindex = list(), list(), list(), list(), list(), 0
    ligand_total_coords, ligand_chain_list, ligand_chain_number, ligand_chain_code = list(), list(), list(), list()
    
    lig_code = lig_info[pdb]

    """ Exclude complex with 20 amino acids ligand """
    for i in lig_code:
        if i in amino_acids_short.keys():
            return None
        
    """ Extract protein info """
    for chain_name in list(structure[0].child_dict.keys()):
        chain = structure[0][chain_name]

        pdb_sequence, binding_index_list = "", list()
        
        for residue in chain.get_residues():
            if residue.resname in amino_acids_short.keys():
                pdb_sequence += amino_acids_short[residue.resname]
                
                # protein info
                for atom in residue:
                    protein_atom_coords.append(atom.get_coord())
                    protein_atom_residue_list.append(reindex)
                reindex += 1 
                
            elif residue.resname in lig_code:
                ligand_coords = list()
                
                ligand_chain_list.append(chain_name)
                ligand_chain_number.append(str(residue.get_id()[1]))
                ligand_chain_code.append(residue.resname)
                
                # ligand info
                for atom in residue:
                    ligand_coords.append(atom.get_coord())  
                ligand_total_coords.append(ligand_coords)
                
        if len(pdb_sequence) != 0:
            chain_name_list.append(chain_name)
            pdb_sequence_list.append(pdb_sequence)
            seq_lengths_list.append(len(pdb_sequence))                


    """ Get binding sites info """
    for lig_coordi in ligand_total_coords:
        if len(protein_atom_coords) != 0 and len(lig_coordi) !=0:
            binding_index = get_binding_sites(protein_atom_coords, lig_coordi, np.array(protein_atom_residue_list)) 

            binding_index = list(map(str, binding_index))
            binding_index_list.append(",".join(binding_index))

        else:
            print(pdb)
            return None
            
    total_seq_lengths = np.sum(np.array(seq_lengths_list))
    seq_lengths_list = list(map(str, seq_lengths_list))
    
    return ",".join(chain_name_list), ",".join(pdb_sequence_list), total_seq_lengths, ",".join(seq_lengths_list), "|".join(binding_index_list), ",".join(ligand_chain_list), ",".join(ligand_chain_number), ",".join(ligand_chain_code)

In [14]:
def get_raw_protein_info_bulk(df):
    return df.coach_PDB.map(get_info)              

In [15]:
pdb = [coach_PDB[:-1] for coach_PDB in complex_list]

In [16]:
data_df = pd.DataFrame({"coach_PDB":complex_list, "PDB":pdb})
data_df

Unnamed: 0,coach_PDB,PDB
0,3efvA,3efv
1,1atlA,1atl
2,1e5qA,1e5q
3,1bnwA,1bnw
4,1afkA,1afk
5,2zgzA,2zgz
6,1c3jA,1c3j
7,1ex8A,1ex8
8,2royA,2roy
9,1a2kC,1a2k


In [17]:
info_results = parallelize_dataframe(data_df, get_raw_protein_info_bulk, num_partitions = 5)

In [18]:
info_results = pd.concat(info_results)

In [19]:
data_df["Chain"] = info_results.map(lambda a: a[0] if a is not None else None)

In [20]:
data_df["Sequence"] = info_results.map(lambda a: a[1] if a is not None else None)

In [21]:
data_df["Total_seq_lengths"] = info_results.map(lambda a: a[2] if a is not None else None)

In [22]:
data_df["Chain_seq_lengths"] = info_results.map(lambda a: a[3] if a is not None else None)

In [23]:
data_df["BS"] = info_results.map(lambda a: a[4] if a is not None else None)

In [24]:
data_df["Ligand_chain"] = info_results.map(lambda a: a[5] if a is not None else None)

In [25]:
data_df["Ligand_chain_number"] = info_results.map(lambda a: a[6] if a is not None else None)

In [26]:
data_df["Ligand_chain_code"] = info_results.map(lambda a: a[7] if a is not None else None)

In [27]:
data_df = data_df.loc[data_df.Sequence.isna()==False].reset_index(drop=True)
data_df = data_df.loc[data_df.Chain != " "].reset_index(drop=True)

In [28]:
data_df

Unnamed: 0,coach_PDB,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code
0,3efvA,3efv,A,MTATQALSVNPATGQTLAAMPWANAQEIEHALSLAASGFKKWKMTS...,459,459,"129,130,131,132,133,138,141,156,157,158,159,18...",A,463,NAD
1,1atlA,1atl,A,LPQRYIELVVVADHRVFMKYNSDLNTIRTRVHEIVNFINGFYRSLN...,200,200,"103,104,105,106,139,140,143,149,162,164,165,16...",A,301,0QI
2,1e5qA,1e5q,A,ATKSVLMLGSGFVTRPTLDVLTDSGIKVTVACRTLESAKKLSAGVQ...,449,449,"8,9,10,11,12,31,32,33,52,53,54,55,73,74,75,76,...","A,A",500501,"NDP,SHR"
3,1bnwA,1bnw,A,HWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSV...,256,256,889092115117126138193194195197204,A,555,TPD
4,1afkA,1afk,A,KETAAAKFERQHMDSSTSAASSSNYCNQMMKSRNLTKDRCKPVNTF...,124,124,3610114064666870108110117118119,A,125,PAP
5,2zgzA,2zgz,A,MLVFIDDGSTNIKLQWQESDGTIKQHISPNSFKREWAVSFGDKKVF...,320,320,"7,8,9,10,12,170,171,172,173,174,198,225,226,27...",A,323,GNP
6,1c3jA,1c3j,A,MKIAIINMGNNVINFKTVPSSETIYLFKVISEMGLNVDIISLKNGV...,333,333,"17,168,169,170,172,176,194,195,218,219,221,224...",A,353,UDP
7,1ex8A,1ex8,A,TVAYIAIGSNLASPLEQVNAALKALGDIPESHILTVSSFYRTPPLG...,158,158,"41,42,43,44,52,54,69,73,81,83,87,88,91,94,96,9...",A,171,A4P
8,2royA,2roy,A,ESKCPLMVKVLDAVRGSPAINVAVHVFRKAADDTWEPFASGKTSES...,121,121,81099101103110111112114,A,128,P28
9,1a2kC,1a2k,C,QVQFKLVLVGDGGTGKTTFVKRHLTGEFEKKYVATLGVEVHPLVFH...,196,196,"11,12,13,14,15,16,17,62,114,115,117,118,142,14...",C,220,GDP


### 5. Remove complex over than 1,500 protein seq length

In [29]:
lengths = data_df.Total_seq_lengths.values

In [30]:
data_df = data_df[lengths <= 1500].reset_index(drop=True)
data_df

Unnamed: 0,coach_PDB,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code
0,3efvA,3efv,A,MTATQALSVNPATGQTLAAMPWANAQEIEHALSLAASGFKKWKMTS...,459,459,"129,130,131,132,133,138,141,156,157,158,159,18...",A,463,NAD
1,1atlA,1atl,A,LPQRYIELVVVADHRVFMKYNSDLNTIRTRVHEIVNFINGFYRSLN...,200,200,"103,104,105,106,139,140,143,149,162,164,165,16...",A,301,0QI
2,1e5qA,1e5q,A,ATKSVLMLGSGFVTRPTLDVLTDSGIKVTVACRTLESAKKLSAGVQ...,449,449,"8,9,10,11,12,31,32,33,52,53,54,55,73,74,75,76,...","A,A",500501,"NDP,SHR"
3,1bnwA,1bnw,A,HWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSV...,256,256,889092115117126138193194195197204,A,555,TPD
4,1afkA,1afk,A,KETAAAKFERQHMDSSTSAASSSNYCNQMMKSRNLTKDRCKPVNTF...,124,124,3610114064666870108110117118119,A,125,PAP
5,2zgzA,2zgz,A,MLVFIDDGSTNIKLQWQESDGTIKQHISPNSFKREWAVSFGDKKVF...,320,320,"7,8,9,10,12,170,171,172,173,174,198,225,226,27...",A,323,GNP
6,1c3jA,1c3j,A,MKIAIINMGNNVINFKTVPSSETIYLFKVISEMGLNVDIISLKNGV...,333,333,"17,168,169,170,172,176,194,195,218,219,221,224...",A,353,UDP
7,1ex8A,1ex8,A,TVAYIAIGSNLASPLEQVNAALKALGDIPESHILTVSSFYRTPPLG...,158,158,"41,42,43,44,52,54,69,73,81,83,87,88,91,94,96,9...",A,171,A4P
8,2royA,2roy,A,ESKCPLMVKVLDAVRGSPAINVAVHVFRKAADDTWEPFASGKTSES...,121,121,81099101103110111112114,A,128,P28
9,1a2kC,1a2k,C,QVQFKLVLVGDGGTGKTTFVKRHLTGEFEKKYVATLGVEVHPLVFH...,196,196,"11,12,13,14,15,16,17,62,114,115,117,118,142,14...",C,220,GDP


### 6. Remove complex over than 160 SMILES length

In [31]:
def fwrite(fw, lines):
    
    for i in lines:
        fw.write("%s\n"%i)
    fw.close()

def extract_ligand_info(lines, chain, number, code):
    return [line.strip() for line in lines if (line[17:20].strip() == code) and (line[21:22].strip() == chain) and (line[22:26].strip() == number)]
    
def add_ligand(row):
    SMILES = list()
    
    coach_PDB, ligand_chain, ligand_chain_number, ligand_chain_code = row.values[0], row.values[7].split(","), row.values[8].split(","), row.values[9].split(",")
    lines = read_file(open(f"{pdb_path}/{coach_PDB}/{coach_PDB}.pdb", "r"))
    
    for chain, number, code in zip(ligand_chain, ligand_chain_number, ligand_chain_code):
        ligand_files = extract_ligand_info(lines, chain, number, code)

        fwrite(open(f"{pdb_path}/{coach_PDB}/{coach_PDB}_{chain}_{number}_{code}_ligand.pdb", "w"), ligand_files)
        
        try:
            command = f"obabel -ipdb {pdb_path}/{coach_PDB}/{coach_PDB}_{chain}_{number}_{code}_ligand.pdb -osmi -xC | obabel -ismi -osmi -xk -O tmp.smi"

            os.system(command)

            smiles = read_file(open("tmp.smi"))[0].split('\t')[0].strip()            

            smiles = MolToSmiles(MolFromSmiles(smiles),isomericSmiles = False, kekuleSmiles = True)
            SMILES.append(smiles)

        except Exception as e:
            print(coach_PDB, e)
            return None  
    
    return ",".join(SMILES)        

In [32]:
SMILES = data_df.apply(add_ligand, axis = 1)

1ex8A Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)


1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
RDKit ERROR: [22:39:44] Explicit valence for atom # 38 N, 4, is greater than permitted
[22:39:44] Explicit valence for atom # 38 N, 4, is greater than permitted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted


In [33]:
data_df["SMILES"] = SMILES

In [34]:
data_df = data_df.loc[data_df.SMILES.isna()==False].reset_index(drop=True)
data_df

Unnamed: 0,coach_PDB,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code,SMILES
0,3efvA,3efv,A,MTATQALSVNPATGQTLAAMPWANAQEIEHALSLAASGFKKWKMTS...,459,459,"129,130,131,132,133,138,141,156,157,158,159,18...",A,463,NAD,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N4C...
1,1atlA,1atl,A,LPQRYIELVVVADHRVFMKYNSDLNTIRTRVHEIVNFINGFYRSLN...,200,200,"103,104,105,106,139,140,143,149,162,164,165,16...",A,301,0QI,COC1=CC=C(CC(NC(=O)C(CS)CC(C)C)C(=O)O)C=C1
2,1e5qA,1e5q,A,ATKSVLMLGSGFVTRPTLDVLTDSGIKVTVACRTLESAKKLSAGVQ...,449,449,"8,9,10,11,12,31,32,33,52,53,54,55,73,74,75,76,...","A,A",500501,"NDP,SHR",NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N4C...
3,1bnwA,1bnw,A,HWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSV...,256,256,889092115117126138193194195197204,A,555,TPD,NS(=O)(=O)C1=CC=C(S(=O)(=O)NCC2=CC=CS2)S1
4,1afkA,1afk,A,KETAAAKFERQHMDSSTSAASSSNYCNQMMKSRNLTKDRCKPVNTF...,124,124,3610114064666870108110117118119,A,125,PAP,NC1=C2N=CN(C3OC(COP(=O)(O)OP(=O)(O)O)C(OP(=O)(...
5,2zgzA,2zgz,A,MLVFIDDGSTNIKLQWQESDGTIKQHISPNSFKREWAVSFGDKKVF...,320,320,"7,8,9,10,12,170,171,172,173,174,198,225,226,27...",A,323,GNP,NC1=NC(=O)C2=C(N1)N(C1OC(COP(=O)(O)OP(N)(=O)O)...
6,1c3jA,1c3j,A,MKIAIINMGNNVINFKTVPSSETIYLFKVISEMGLNVDIISLKNGV...,333,333,"17,168,169,170,172,176,194,195,218,219,221,224...",A,353,UDP,O=C1C=CN(C2OC(COP(=O)(O)OP(=O)(O)O)C(O)C2O)C(=...
7,2royA,2roy,A,ESKCPLMVKVLDAVRGSPAINVAVHVFRKAADDTWEPFASGKTSES...,121,121,81099101103110111112114,A,128,P28,CC(=O)NC(CC1=CC=C(OC2=CC([N+](=O)[O-])=C(O)C([...
8,1a2kC,1a2k,C,QVQFKLVLVGDGGTGKTTFVKRHLTGEFEKKYVATLGVEVHPLVFH...,196,196,"11,12,13,14,15,16,17,62,114,115,117,118,142,14...",C,220,GDP,NC1=NC(=O)C2=C(N1)N(C1OC(COP(=O)(O)OP(=O)(O)O)...


In [35]:
def get_SMILES_length(df):
    SMILES, index = df.SMILES.values, list()
    
    for smiles in SMILES:
        for smi in smiles.split(","):
            if len(smi) > 160:
                index.append(False)
                break
        index.append(True)
    
    return index

In [36]:
smiles_index = get_SMILES_length(data_df)

In [37]:
data_df = data_df.loc[smiles_index].reset_index(drop=True)

In [38]:
data_df

Unnamed: 0,coach_PDB,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code,SMILES
0,3efvA,3efv,A,MTATQALSVNPATGQTLAAMPWANAQEIEHALSLAASGFKKWKMTS...,459,459,"129,130,131,132,133,138,141,156,157,158,159,18...",A,463,NAD,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N4C...
1,1atlA,1atl,A,LPQRYIELVVVADHRVFMKYNSDLNTIRTRVHEIVNFINGFYRSLN...,200,200,"103,104,105,106,139,140,143,149,162,164,165,16...",A,301,0QI,COC1=CC=C(CC(NC(=O)C(CS)CC(C)C)C(=O)O)C=C1
2,1e5qA,1e5q,A,ATKSVLMLGSGFVTRPTLDVLTDSGIKVTVACRTLESAKKLSAGVQ...,449,449,"8,9,10,11,12,31,32,33,52,53,54,55,73,74,75,76,...","A,A",500501,"NDP,SHR",NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N4C...
3,1bnwA,1bnw,A,HWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSV...,256,256,889092115117126138193194195197204,A,555,TPD,NS(=O)(=O)C1=CC=C(S(=O)(=O)NCC2=CC=CS2)S1
4,1afkA,1afk,A,KETAAAKFERQHMDSSTSAASSSNYCNQMMKSRNLTKDRCKPVNTF...,124,124,3610114064666870108110117118119,A,125,PAP,NC1=C2N=CN(C3OC(COP(=O)(O)OP(=O)(O)O)C(OP(=O)(...
5,2zgzA,2zgz,A,MLVFIDDGSTNIKLQWQESDGTIKQHISPNSFKREWAVSFGDKKVF...,320,320,"7,8,9,10,12,170,171,172,173,174,198,225,226,27...",A,323,GNP,NC1=NC(=O)C2=C(N1)N(C1OC(COP(=O)(O)OP(N)(=O)O)...
6,1c3jA,1c3j,A,MKIAIINMGNNVINFKTVPSSETIYLFKVISEMGLNVDIISLKNGV...,333,333,"17,168,169,170,172,176,194,195,218,219,221,224...",A,353,UDP,O=C1C=CN(C2OC(COP(=O)(O)OP(=O)(O)O)C(O)C2O)C(=...
7,2royA,2roy,A,ESKCPLMVKVLDAVRGSPAINVAVHVFRKAADDTWEPFASGKTSES...,121,121,81099101103110111112114,A,128,P28,CC(=O)NC(CC1=CC=C(OC2=CC([N+](=O)[O-])=C(O)C([...
8,1a2kC,1a2k,C,QVQFKLVLVGDGGTGKTTFVKRHLTGEFEKKYVATLGVEVHPLVFH...,196,196,"11,12,13,14,15,16,17,62,114,115,117,118,142,14...",C,220,GDP,NC1=NC(=O)C2=C(N1)N(C1OC(COP(=O)(O)OP(=O)(O)O)...


### 7. Mapping Uniprot

In [39]:
def read_file(file):
    return file.readlines()

def preprocessing_PDBSWS(lines):
    data = dict()
    
    for line in lines:
        line_list = line.strip().split(" ")
        if len(line_list) == 3:
            data[f"{line_list[0]}_{line_list[1]}"] = line_list[-1]
            
    return data

def preprocessing_SIFTS(lines):
    data = dict()
    
    for line in lines[2:]:
        line_list = line.strip().split("\t")
        data[f"{line_list[0]}_{line_list[1]}"] = line_list[2]
    return data

In [40]:
SIFTS_mapping_table = preprocessing_SIFTS(read_file(open(f"{info_path}/SIFTS_chain_mapping_table.txt", "r")))
PDBSWS_mapping_table = preprocessing_PDBSWS(read_file(open(f"{info_path}/PDBSWS_chain_mapping_table.txt", "r")))

In [41]:
def get_uniprot(row):
    try:
        pdb = row.PDB
        chains = row.Chain.split(",")

        uniprot_ids = list()
        for chain in chains:
            name = f"{pdb}_{chain}"
            if name in SIFTS_mapping_table:
                uniprot_ids.append(SIFTS_mapping_table[name])
            else:
                uniprot_ids.append(PDBSWS_mapping_table[name])
        return ",".join(uniprot_ids)    
    
    except Exception as e:
        print(row.PDB, e)
        return None

In [42]:
def get_uniprot_id_bulk(df):
    return df.apply(get_uniprot, axis=1)

In [43]:
uniprot_ids = parallelize_dataframe(data_df, get_uniprot_id_bulk, num_partitions = 5)

In [44]:
data_df["Uniprot_ID"] = pd.concat(uniprot_ids)

In [45]:
data_df = data_df.loc[data_df.Uniprot_ID.isna()==False].reset_index(drop=True)
data_df

Unnamed: 0,coach_PDB,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code,SMILES,Uniprot_ID
0,3efvA,3efv,A,MTATQALSVNPATGQTLAAMPWANAQEIEHALSLAASGFKKWKMTS...,459,459,"129,130,131,132,133,138,141,156,157,158,159,18...",A,463,NAD,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N4C...,Q8ZPI3
1,1atlA,1atl,A,LPQRYIELVVVADHRVFMKYNSDLNTIRTRVHEIVNFINGFYRSLN...,200,200,"103,104,105,106,139,140,143,149,162,164,165,16...",A,301,0QI,COC1=CC=C(CC(NC(=O)C(CS)CC(C)C)C(=O)O)C=C1,P15167
2,1e5qA,1e5q,A,ATKSVLMLGSGFVTRPTLDVLTDSGIKVTVACRTLESAKKLSAGVQ...,449,449,"8,9,10,11,12,31,32,33,52,53,54,55,73,74,75,76,...","A,A",500501,"NDP,SHR",NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N4C...,Q9P4R4
3,1bnwA,1bnw,A,HWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSV...,256,256,889092115117126138193194195197204,A,555,TPD,NS(=O)(=O)C1=CC=C(S(=O)(=O)NCC2=CC=CS2)S1,P00918
4,1afkA,1afk,A,KETAAAKFERQHMDSSTSAASSSNYCNQMMKSRNLTKDRCKPVNTF...,124,124,3610114064666870108110117118119,A,125,PAP,NC1=C2N=CN(C3OC(COP(=O)(O)OP(=O)(O)O)C(OP(=O)(...,P61823
5,2zgzA,2zgz,A,MLVFIDDGSTNIKLQWQESDGTIKQHISPNSFKREWAVSFGDKKVF...,320,320,"7,8,9,10,12,170,171,172,173,174,198,225,226,27...",A,323,GNP,NC1=NC(=O)C2=C(N1)N(C1OC(COP(=O)(O)OP(N)(=O)O)...,P11904
6,1c3jA,1c3j,A,MKIAIINMGNNVINFKTVPSSETIYLFKVISEMGLNVDIISLKNGV...,333,333,"17,168,169,170,172,176,194,195,218,219,221,224...",A,353,UDP,O=C1C=CN(C2OC(COP(=O)(O)OP(=O)(O)O)C(O)C2O)C(=...,P04547
7,2royA,2roy,A,ESKCPLMVKVLDAVRGSPAINVAVHVFRKAADDTWEPFASGKTSES...,121,121,81099101103110111112114,A,128,P28,CC(=O)NC(CC1=CC=C(OC2=CC([N+](=O)[O-])=C(O)C([...,P02766
8,1a2kC,1a2k,C,QVQFKLVLVGDGGTGKTTFVKRHLTGEFEKKYVATLGVEVHPLVFH...,196,196,"11,12,13,14,15,16,17,62,114,115,117,118,142,14...",C,220,GDP,NC1=NC(=O)C2=C(N1)N(C1OC(COP(=O)(O)OP(=O)(O)O)...,P62825


### 7. Save data

In [46]:
data_df = data_df.iloc[:, [0, 3, 6]]

In [47]:
data_df.to_csv("../datasets/examples/COACH420_data.tsv", sep = "\t", index = False)