In [1]:
from glob import glob
import pandas as pd
import numpy as np
from Bio import PDB
import os
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles

### 1. Load complex list

In [2]:
# Directory with one sub-directory per complex (later cells read "<pdb_path>/<id>/<id>.pdb").
pdb_path = os.path.abspath("../datasets/examples/HOLO4K/")
# Directory holding the chain-mapping tables read in the UniProt mapping section.
info_path = os.path.abspath("../datasets/examples/")

In [3]:
# Every entry found directly inside pdb_path is treated as a complex id by later cells.
# NOTE(review): os.listdir also returns stray files and has no guaranteed order — confirm the directory only holds complex folders.
complex_list = os.listdir(pdb_path)

### 2. Run chimera

In [4]:
%%bash -s $pdb_path

# Dataset root, passed in from Python through `%%bash -s`.
path=$1

# Re-write every original <complex>/<name>.pdb through Chimera.
for file in "$path"/*/*.pdb; do
    # Skip files produced by this notebook so a re-run does not create
    # "*_chimera_chimera.pdb" or process the extracted "*_ligand.pdb" files.
    case "$file" in
        *_chimera.pdb|*_ligand.pdb) continue ;;
    esac
    # Strip the ".pdb" suffix and append "_chimera.pdb" for the output name.
    output_path=${file%.pdb}"_chimera.pdb"
    # Feed Chimera its commands on stdin (open, write as PDB, stop).
    echo -e "open $file \n write format pdb 0 $output_path \n stop" | chimera --nogui
# Collect the stdout of the whole loop in chimer.log (current directory).
done > chimer.log

### 3. Load ligand code

In [5]:
def read_file(file):
    """Read every line from an open text file, then close the handle.

    Args:
        file: An open, readable file object (callers pass ``open(path, "r")``).

    Returns:
        list[str]: All lines of the file, each keeping its line terminator.
    """
    # Callers hand over a freshly opened handle and never close it themselves,
    # so close it here instead of leaking the file descriptor.
    with file:
        return file.readlines()

def preprocessing_lig_code(lines):
    """Parse the ligand-code listing into a ``{pdb_id: [ligand codes]}`` dict.

    Every data line is split on single spaces. Field 0 is a path whose second
    "/"-separated component is ``<pdb_id>.<ext>``; field 2 is a comma-separated
    list of ligand codes. Lines containing "#" are treated as comments.

    Args:
        lines: Iterable of raw text lines.

    Returns:
        dict[str, list[str]]: Ligand codes keyed by PDB id. When an id occurs
        more than once, the last occurrence wins.
    """
    lig_info = dict()
    for line in lines:
        # Skip comments and blank lines; a blank line would otherwise raise
        # IndexError when the split fields are indexed below.
        if "#" in line or not line.strip():
            continue
        fields = line.strip().split(" ")
        pdb = fields[0].split("/")[1].split(".")[0]
        lig_info[pdb] = fields[2].split(",")

    return lig_info

In [6]:
# Directory containing HOLO4K_ligand_code.txt (resolves to the same location as info_path).
code_path = os.path.abspath("../datasets/examples/")

In [7]:
# Parse the ligand-code listing; the context manager closes the file once it has been read.
with open(f"{code_path}/HOLO4K_ligand_code.txt", "r") as ligand_code_file:
    lig_info = preprocessing_lig_code(ligand_code_file.readlines())

### 4. Load binding-site info

In [8]:
from scipy.spatial import distance_matrix
from multiprocessing import Process, Queue, Pool

In [9]:
# Single Biopython PDB parser shared by get_info.
# QUIET=True presumably silences parser warnings — confirm in the Bio.PDB docs.
pdb_parser = PDB.PDBParser(QUIET=True)

In [10]:
# Three-letter residue name -> one-letter code. Covers the 20 standard amino acids plus SEC (U) and PYL (O).
# get_info uses it both to build sequences and to reject complexes whose ligand code is one of these names.
amino_acids_short = {"ALA":"A", "ARG":"R", "ASN":"N", "ASP":"D", "CYS":"C", "GLU":"E", "GLN":"Q", "GLY":"G", "HIS":"H", "ILE":"I", "LEU":"L", "LYS":"K", "MET":"M", "PHE":"F", "PRO":"P", "SER":"S", "THR":"T", "TRP":"W", "TYR":"Y", "VAL":"V", "SEC":"U", "PYL":"O"}

In [11]:
def parallelize_dataframe(df, func, num_partitions=5):
    """Split ``df`` row-wise and run ``func`` on each piece in a worker pool.

    Args:
        df: pandas DataFrame/Series (or any object ``np.array_split`` accepts).
        func: Picklable callable applied to each partition.
        num_partitions: Number of partitions and of worker processes.

    Returns:
        list: One result per partition, in partition order.
    """
    if hasattr(df, "iloc"):
        # Split row positions rather than the frame itself: np.array_split on a
        # DataFrame goes through the deprecated DataFrame.swapaxes.
        positions = np.array_split(np.arange(len(df)), num_partitions)
        df_split = [df.iloc[chunk] for chunk in positions]
    else:
        df_split = np.array_split(df, num_partitions)

    pool = Pool(num_partitions)
    try:
        return pool.map(func, df_split)
    finally:
        # Always release the workers, even when func raises in a child.
        pool.close()
        pool.join()

In [12]:
def get_binding_sites(protein_coords, ligand_coords, protein_atom_residues, cutoff=4.):
    """Return the residues that have an atom within ``cutoff`` of the ligand.

    Args:
        protein_coords: Sequence of protein atom coordinates, one row per atom.
        ligand_coords: Sequence of ligand atom coordinates, one row per atom.
        protein_atom_residues: NumPy array giving, for each protein atom (same
            order as ``protein_coords``), the index of its residue.
        cutoff: Maximum protein-ligand atom distance (inclusive), in the
            coordinate units. Defaults to 4.

    Returns:
        list: Sorted, de-duplicated residue indices with at least one atom
        within ``cutoff`` of any ligand atom.
    """
    P_L_distance_matrix = distance_matrix(protein_coords, ligand_coords)
    # A protein atom qualifies as soon as one ligand atom is close enough.
    close_atoms = (P_L_distance_matrix <= cutoff).any(axis=1)
    return sorted(set(protein_atom_residues[close_atoms]))

In [13]:
def get_info(pdb):
    """Collect sequence, ligand and binding-site annotations for one complex.

    Parses ``{pdb_path}/{pdb}/{pdb}_chimera.pdb`` with the module-level
    ``pdb_parser`` and walks every chain of the first model. Residues named in
    ``amino_acids_short`` contribute to the chain sequence and to the protein
    atom pool; residues whose name is listed in ``lig_info[pdb]`` are recorded
    as ligands.

    Args:
        pdb: Complex identifier; used as directory name, file prefix and key
            into ``lig_info``.

    Returns:
        ``None`` when a ligand code is itself an amino-acid name, or when
        binding sites cannot be computed for a recorded ligand (no protein
        atoms or no ligand atoms; the id is printed in that case).
        Otherwise an 8-tuple:
            0. chain names that contain protein residues, joined by ","
            1. per-chain one-letter sequences, joined by ","
            2. total sequence length over all chains
            3. per-chain sequence lengths, joined by ","
            4. binding residues per ligand: indices joined by ",", ligands
               joined by "|". Indices are 0-based and count protein residues
               continuously across chains.
            5. chain of each ligand, joined by ","
            6. residue number of each ligand, joined by ","
            7. residue name of each ligand, joined by ","
    """
    structure = pdb_parser.get_structure(pdb, f"{pdb_path}/{pdb}/{pdb}_chimera.pdb")

    chain_name_list, pdb_sequence_list, seq_lengths_list = list(), list(), list()
    protein_atom_coords, protein_atom_residue_list, reindex = list(), list(), 0
    ligand_total_coords, ligand_chain_list, ligand_chain_number, ligand_chain_code = list(), list(), list(), list()
    # Filled once, after all chains are read, so it is initialised here rather
    # than being reset inside the per-chain loop.
    binding_index_list = list()

    lig_code = lig_info[pdb]

    # Exclude complexes whose ligand is itself an amino acid.
    if any(code in amino_acids_short for code in lig_code):
        return None

    # Extract protein and ligand info chain by chain.
    for chain_name in list(structure[0].child_dict.keys()):
        chain = structure[0][chain_name]

        pdb_sequence = ""

        for residue in chain.get_residues():
            if residue.resname in amino_acids_short:
                pdb_sequence += amino_acids_short[residue.resname]

                # Tag every atom with the running (cross-chain) residue index.
                for atom in residue:
                    protein_atom_coords.append(atom.get_coord())
                    protein_atom_residue_list.append(reindex)
                reindex += 1

            elif residue.resname in lig_code:
                ligand_chain_list.append(chain_name)
                ligand_chain_number.append(str(residue.get_id()[1]))
                ligand_chain_code.append(residue.resname)

                # One coordinate list per ligand residue.
                ligand_total_coords.append([atom.get_coord() for atom in residue])

        # Only chains that contributed protein residues are reported.
        if len(pdb_sequence) != 0:
            chain_name_list.append(chain_name)
            pdb_sequence_list.append(pdb_sequence)
            seq_lengths_list.append(len(pdb_sequence))

    # Compute binding sites for each recorded ligand.
    for lig_coordi in ligand_total_coords:
        if len(protein_atom_coords) != 0 and len(lig_coordi) != 0:
            binding_index = get_binding_sites(protein_atom_coords, lig_coordi, np.array(protein_atom_residue_list))

            binding_index = list(map(str, binding_index))
            binding_index_list.append(",".join(binding_index))

        else:
            # Nothing to measure against: report the complex and drop it.
            print(pdb)
            return None

    total_seq_lengths = np.sum(np.array(seq_lengths_list))
    seq_lengths_list = list(map(str, seq_lengths_list))

    return ",".join(chain_name_list), ",".join(pdb_sequence_list), total_seq_lengths, ",".join(seq_lengths_list), "|".join(binding_index_list), ",".join(ligand_chain_list), ",".join(ligand_chain_number), ",".join(ligand_chain_code)

In [14]:
def get_raw_protein_info_bulk(df):
    """Apply ``get_info`` to every identifier in the ``PDB`` column of ``df``.

    Returns a Series aligned with ``df`` whose elements are ``get_info``'s
    return values (a tuple or ``None``).
    """
    pdb_ids = df["PDB"]
    return pdb_ids.map(get_info)

In [15]:
# Start the working table with one row per complex id.
data_df = pd.DataFrame({"PDB":complex_list})
data_df

Unnamed: 0,PDB
0,8cgt
1,6gsx
2,9ldb
3,8cpa
4,6gsy
5,2qwj
6,9ldt
7,4upj
8,2qwk
9,3pgt


In [16]:
# Run get_info over the PDB column in 5 worker processes; the result is a list with one Series per partition.
info_results = parallelize_dataframe(data_df, get_raw_protein_info_bulk, num_partitions = 5)

In [17]:
# Stitch the per-partition Series back into one Series; each element is get_info's return value (8-tuple or None).
info_results = pd.concat(info_results)

In [18]:
# Element 0 of get_info's tuple: comma-joined chain names (None when the complex was rejected).
data_df["Chain"] = info_results.map(lambda a: a[0] if a is not None else None)

In [19]:
# Element 1: comma-joined per-chain one-letter sequences.
data_df["Sequence"] = info_results.map(lambda a: a[1] if a is not None else None)

In [20]:
# Element 2: total sequence length summed over all chains.
data_df["Total_seq_lengths"] = info_results.map(lambda a: a[2] if a is not None else None)

In [21]:
# Element 3: comma-joined per-chain sequence lengths.
data_df["Chain_seq_lengths"] = info_results.map(lambda a: a[3] if a is not None else None)

In [22]:
# Element 4: binding-site residue indices, "," within a ligand and "|" between ligands.
data_df["BS"] = info_results.map(lambda a: a[4] if a is not None else None)

In [23]:
# Element 5: comma-joined chain name of each ligand.
data_df["Ligand_chain"] = info_results.map(lambda a: a[5] if a is not None else None)

In [24]:
# Element 6: comma-joined residue number of each ligand.
data_df["Ligand_chain_number"] = info_results.map(lambda a: a[6] if a is not None else None)

In [25]:
# Element 7: comma-joined residue name (ligand code) of each ligand.
data_df["Ligand_chain_code"] = info_results.map(lambda a: a[7] if a is not None else None)

In [26]:
# Drop complexes get_info rejected (no sequence recorded).
has_sequence = data_df.Sequence.notna()
data_df = data_df.loc[has_sequence].reset_index(drop=True)
# Drop complexes whose chain field is a single blank character.
named_chain = data_df.Chain != " "
data_df = data_df.loc[named_chain].reset_index(drop=True)

In [27]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code
0,8cgt,A,DPDTAVTNKQSFSTDVIYQVFTDRFLDGNPSNNPTGAAYDATCSNL...,684,684,"46,88,93,97,99,100,182,193,194,195,196,232,258...",A,701,TM6
1,6gsx,"A,B",PMILGFWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWL...,434,217217,"5,6,8,10,11,41,44,48,57,58,59,70,71,106,110,11...","A,B",221218,"GPS,GPS"
2,9ldb,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,662,331331,"25,27,28,29,50,51,52,93,94,95,96,97,111,114,11...","A,B,B",401401402,"NAD,NAD,OXM"
3,8cpa,A,ARSTNTFNYATYHTLDEIYDFMDLLVAQHPELVSKLQIGRSYEGRP...,307,307,"68,70,71,126,143,144,162,195,196,197,242,246,2...",A,309,AGF
4,6gsy,"A,B",PMILGFWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWL...,434,217217,"5,6,11,41,44,48,57,58,59,70,71,321|104,223,228...","A,B",218218,"GSH,GSH"
5,2qwj,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388,388,3637697097141143165195196211289323,A,800,G28
6,9ldt,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,662,331331,"25,27,28,29,50,51,52,93,94,95,96,97,111,114,11...","A,A,B,B",401402401402,"NAD,OXM,NAD,OXM"
7,4upj,"A,B",PQFSLWKRPVVTAYIEGQPVEVLLDTGADDSIVAGIELGNNYSPKI...,198,9999,"24,27,28,29,46,47,48,49,81,83,106,123,125,126,...",A,100,U04
8,2qwk,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388,388,3637697097141143195196211213289323,A,800,G39
9,3pgt,"A,B",MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,419,210209,"8,10,13,38,44,50,51,52,53,64,65,104,108,205,30...","A,B",214213,"GBX,GBX"


### 5. Remove complexes with a total protein sequence length over 1,500

In [28]:
# Total sequence length of every remaining complex, as a NumPy array aligned with data_df rows.
lengths = data_df.Total_seq_lengths.values

In [29]:
# Keep only complexes whose total sequence length is at most 1500, then renumber the rows.
data_df = data_df[lengths <= 1500].reset_index(drop=True)
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code
0,8cgt,A,DPDTAVTNKQSFSTDVIYQVFTDRFLDGNPSNNPTGAAYDATCSNL...,684,684,"46,88,93,97,99,100,182,193,194,195,196,232,258...",A,701,TM6
1,6gsx,"A,B",PMILGFWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWL...,434,217217,"5,6,8,10,11,41,44,48,57,58,59,70,71,106,110,11...","A,B",221218,"GPS,GPS"
2,9ldb,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,662,331331,"25,27,28,29,50,51,52,93,94,95,96,97,111,114,11...","A,B,B",401401402,"NAD,NAD,OXM"
3,8cpa,A,ARSTNTFNYATYHTLDEIYDFMDLLVAQHPELVSKLQIGRSYEGRP...,307,307,"68,70,71,126,143,144,162,195,196,197,242,246,2...",A,309,AGF
4,6gsy,"A,B",PMILGFWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWL...,434,217217,"5,6,11,41,44,48,57,58,59,70,71,321|104,223,228...","A,B",218218,"GSH,GSH"
5,2qwj,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388,388,3637697097141143165195196211289323,A,800,G28
6,9ldt,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,662,331331,"25,27,28,29,50,51,52,93,94,95,96,97,111,114,11...","A,A,B,B",401402401402,"NAD,OXM,NAD,OXM"
7,4upj,"A,B",PQFSLWKRPVVTAYIEGQPVEVLLDTGADDSIVAGIELGNNYSPKI...,198,9999,"24,27,28,29,46,47,48,49,81,83,106,123,125,126,...",A,100,U04
8,2qwk,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388,388,3637697097141143195196211213289323,A,800,G39
9,3pgt,"A,B",MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,419,210209,"8,10,13,38,44,50,51,52,53,64,65,104,108,205,30...","A,B",214213,"GBX,GBX"


### 6. Remove complexes with a ligand SMILES longer than 160 characters

In [30]:
def fwrite(fw, lines):
    """Write each item of ``lines`` on its own line, then close ``fw``.

    Args:
        fw: An open, writable text file object. It is always closed on return,
            including when formatting or writing an item raises.
        lines: Iterable of items; each is formatted with ``%s`` and followed
            by a newline.
    """
    # The context manager guarantees the handle is closed even on error.
    with fw:
        fw.writelines("%s\n" % i for i in lines)

def extract_ligand_info(lines, chain, number, code):
    """Select the fixed-column records that belong to one ligand residue.

    A record is kept when, after stripping blanks, columns 17-19 equal
    ``code``, column 21 equals ``chain`` and columns 22-25 equal ``number``
    (0-based slice positions). Kept records are returned stripped of
    surrounding whitespace, in input order.
    """
    selected = []
    for record in lines:
        if record[17:20].strip() != code:
            continue
        if record[21:22].strip() != chain:
            continue
        if record[22:26].strip() != number:
            continue
        selected.append(record.strip())
    return selected
    
def add_ligand(row):
    """Extract each ligand of a complex to its own PDB file and derive its SMILES.

    For every (chain, residue number, residue code) triple in the row, the
    matching records of ``{pdb_path}/{id}/{id}.pdb`` are written to
    ``{id}_{chain}_{number}_{code}_ligand.pdb``. That file is converted to
    SMILES by two piped ``obabel`` calls writing ``tmp.smi`` in the current
    directory, and the result is re-written with RDKit as a non-isomeric,
    kekulized SMILES.

    Args:
        row: DataFrame row read positionally: index 0 is the complex id, and
            indices 6, 7, 8 are the comma-separated ligand chains, residue
            numbers and residue codes.

    Returns:
        str | None: The ligands' SMILES joined by ",", or ``None`` if any
        conversion fails (the id and the error are printed).
    """
    import shlex  # local import: only the shell command below needs quoting

    SMILES = list()

    coach_PDB = row.values[0]
    ligand_chain = row.values[6].split(",")
    ligand_chain_number = row.values[7].split(",")
    ligand_chain_code = row.values[8].split(",")

    with open(f"{pdb_path}/{coach_PDB}/{coach_PDB}.pdb", "r") as pdb_file:
        lines = pdb_file.readlines()

    for chain, number, code in zip(ligand_chain, ligand_chain_number, ligand_chain_code):
        ligand_files = extract_ligand_info(lines, chain, number, code)

        ligand_path = f"{pdb_path}/{coach_PDB}/{coach_PDB}_{chain}_{number}_{code}_ligand.pdb"
        fwrite(open(ligand_path, "w"), ligand_files)

        try:
            # Remove the previous ligand's tmp.smi first: if obabel fails, a
            # stale file would otherwise be read as this ligand's SMILES.
            if os.path.exists("tmp.smi"):
                os.remove("tmp.smi")

            # Quote the path so directories with spaces or shell
            # metacharacters do not break (or alter) the command.
            command = f"obabel -ipdb {shlex.quote(ligand_path)} -osmi -xC | obabel -ismi -osmi -xk -O tmp.smi"

            os.system(command)

            # The first tab-separated field of the first line is the SMILES.
            with open("tmp.smi") as smi_file:
                smiles = smi_file.readlines()[0].split('\t')[0].strip()

            smiles = MolToSmiles(MolFromSmiles(smiles), isomericSmiles=False, kekuleSmiles=True)
            SMILES.append(smiles)

        except Exception as e:
            # Best effort: one failed ligand rejects the whole complex.
            print(coach_PDB, e)
            return None

    return ",".join(SMILES)

In [31]:
# One comma-joined SMILES string per complex (None when any ligand could not be converted).
SMILES = data_df.apply(add_ligand, axis = 1)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted


In [32]:
# Attach the SMILES strings as a new column (aligned on the row index).
data_df["SMILES"] = SMILES

In [33]:
# Drop complexes for which no SMILES could be produced.
has_smiles = data_df.SMILES.notna()
data_df = data_df.loc[has_smiles].reset_index(drop=True)
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code,SMILES
0,8cgt,A,DPDTAVTNKQSFSTDVIYQVFTDRFLDGNPSNNPTGAAYDATCSNL...,684,684,"46,88,93,97,99,100,182,193,194,195,196,232,258...",A,701,TM6,OCC1OC(SC2C(CO)OC(OC3C(CO)OC(SC4C(CO)OC(OC5C(C...
1,6gsx,"A,B",PMILGFWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWL...,434,217217,"5,6,8,10,11,41,44,48,57,58,59,70,71,106,110,11...","A,B",221218,"GPS,GPS",NC(CCC(=O)NC(CSC1C2=C(C=CC=C2)C2=C(C=CC=C2)C1O...
2,9ldb,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,662,331331,"25,27,28,29,50,51,52,93,94,95,96,97,111,114,11...","A,B,B",401401402,"NAD,NAD,OXM",NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N4C...
3,8cpa,A,ARSTNTFNYATYHTLDEIYDFMDLLVAQHPELVSKLQIGRSYEGRP...,307,307,"68,70,71,126,143,144,162,195,196,197,242,246,2...",A,309,AGF,CC(NC(=O)OCC1=CC=CC=C1)C(=O)NCP(=O)(O)OC(CC1=C...
4,6gsy,"A,B",PMILGFWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWL...,434,217217,"5,6,11,41,44,48,57,58,59,70,71,321|104,223,228...","A,B",218218,"GSH,GSH","NC(CCC(=O)NC(CS)C(=O)NCC(=O)O)C(=O)O,NC(CCC(=O..."
5,2qwj,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388,388,3637697097141143165195196211289323,A,800,G28,CCN(CC)C(=O)C1OC(C(=O)O)CC(N)C1NC(C)=O
6,9ldt,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,662,331331,"25,27,28,29,50,51,52,93,94,95,96,97,111,114,11...","A,A,B,B",401402401402,"NAD,OXM,NAD,OXM",NC(=O)C1C=CCN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N...
7,4upj,"A,B",PQFSLWKRPVVTAYIEGQPVEVLLDTGADDSIVAGIELGNNYSPKI...,198,9999,"24,27,28,29,46,47,48,49,81,83,106,123,125,126,...",A,100,U04,CCC(C1=CC=CC(NC(=O)CNC(=O)OC(C)(C)C)=C1)C1=C(O...
8,2qwk,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388,388,3637697097141143195196211213289323,A,800,G39,CCC(CC)OC1C=C(C(=O)O)CC(N)C1NC(C)=O
9,3pgt,"A,B",MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,419,210209,"8,10,13,38,44,50,51,52,53,64,65,104,108,205,30...","A,B",214213,"GBX,GBX",NC(CCC(=O)NC(CSC1C2=C3C=CC4=CC=CC5=CC=C(C=C2C(...


In [34]:
def get_SMILES_length(df, max_length=160):
    """Build a keep/drop mask from the length of each ligand SMILES.

    Args:
        df: DataFrame with a ``SMILES`` column holding comma-joined SMILES.
        max_length: Longest SMILES (in characters) that is still accepted.
            Defaults to 160.

    Returns:
        list[bool]: Exactly one flag per row. ``True`` when every SMILES in
        the row is at most ``max_length`` characters long, ``False`` otherwise.
    """
    # One flag per row. The previous loop appended True even after recording
    # False for a row, so the mask grew longer than the frame.
    return [
        all(len(smi) <= max_length for smi in smiles.split(","))
        for smiles in df.SMILES.values
    ]

In [35]:
# Boolean keep/drop mask derived from the SMILES lengths of each complex.
smiles_index = get_SMILES_length(data_df)

In [36]:
# Apply the mask and renumber the rows.
data_df = data_df.loc[smiles_index].reset_index(drop=True)

In [37]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code,SMILES
0,8cgt,A,DPDTAVTNKQSFSTDVIYQVFTDRFLDGNPSNNPTGAAYDATCSNL...,684,684,"46,88,93,97,99,100,182,193,194,195,196,232,258...",A,701,TM6,OCC1OC(SC2C(CO)OC(OC3C(CO)OC(SC4C(CO)OC(OC5C(C...
1,6gsx,"A,B",PMILGFWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWL...,434,217217,"5,6,8,10,11,41,44,48,57,58,59,70,71,106,110,11...","A,B",221218,"GPS,GPS",NC(CCC(=O)NC(CSC1C2=C(C=CC=C2)C2=C(C=CC=C2)C1O...
2,9ldb,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,662,331331,"25,27,28,29,50,51,52,93,94,95,96,97,111,114,11...","A,B,B",401401402,"NAD,NAD,OXM",NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N4C...
3,8cpa,A,ARSTNTFNYATYHTLDEIYDFMDLLVAQHPELVSKLQIGRSYEGRP...,307,307,"68,70,71,126,143,144,162,195,196,197,242,246,2...",A,309,AGF,CC(NC(=O)OCC1=CC=CC=C1)C(=O)NCP(=O)(O)OC(CC1=C...
4,6gsy,"A,B",PMILGFWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWL...,434,217217,"5,6,11,41,44,48,57,58,59,70,71,321|104,223,228...","A,B",218218,"GSH,GSH","NC(CCC(=O)NC(CS)C(=O)NCC(=O)O)C(=O)O,NC(CCC(=O..."
5,2qwj,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388,388,3637697097141143165195196211289323,A,800,G28,CCN(CC)C(=O)C1OC(C(=O)O)CC(N)C1NC(C)=O
6,9ldt,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,662,331331,"25,27,28,29,50,51,52,93,94,95,96,97,111,114,11...","A,A,B,B",401402401402,"NAD,OXM,NAD,OXM",NC(=O)C1C=CCN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N...
7,4upj,"A,B",PQFSLWKRPVVTAYIEGQPVEVLLDTGADDSIVAGIELGNNYSPKI...,198,9999,"24,27,28,29,46,47,48,49,81,83,106,123,125,126,...",A,100,U04,CCC(C1=CC=CC(NC(=O)CNC(=O)OC(C)(C)C)=C1)C1=C(O...
8,2qwk,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388,388,3637697097141143195196211213289323,A,800,G39,CCC(CC)OC1C=C(C(=O)O)CC(N)C1NC(C)=O
9,3pgt,"A,B",MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,419,210209,"8,10,13,38,44,50,51,52,53,64,65,104,108,205,30...","A,B",214213,"GBX,GBX",NC(CCC(=O)NC(CSC1C2=C3C=CC4=CC=CC5=CC=C(C=C2C(...


### 7. Map chains to UniProt IDs

In [38]:
def read_file(file):
    """Read every line from an open text file, then close the handle.

    Args:
        file: An open, readable file object (callers pass ``open(path, "r")``).

    Returns:
        list[str]: All lines of the file, each keeping its line terminator.
    """
    # Callers hand over a freshly opened handle and never close it themselves,
    # so close it here instead of leaking the file descriptor.
    with file:
        return file.readlines()

def preprocessing_PDBSWS(lines):
    """Build a ``"<pdb>_<chain>" -> accession`` lookup from PDBSWS lines.

    Each line is stripped and split on single spaces; only lines that yield
    exactly three fields are used. Later lines overwrite earlier ones that
    share a key.
    """
    mapping = {}

    for raw in lines:
        parts = raw.strip().split(" ")
        if len(parts) != 3:
            continue
        pdb_id, chain_id, accession = parts
        mapping[f"{pdb_id}_{chain_id}"] = accession

    return mapping

def preprocessing_SIFTS(lines):
    """Build a ``"<pdb>_<chain>" -> accession`` lookup from SIFTS lines.

    The first two lines are skipped. Every remaining line is stripped and
    split on tabs; columns 0 and 1 form the key and column 2 is the value.
    Later lines overwrite earlier ones that share a key.
    """
    mapping = {}

    for raw in lines[2:]:
        columns = raw.strip().split("\t")
        key = "_".join((columns[0], columns[1]))
        mapping[key] = columns[2]
    return mapping

In [39]:
# Load both chain -> UniProt lookup tables; the context managers close each file after reading.
with open(f"{info_path}/SIFTS_chain_mapping_table.txt", "r") as sifts_file:
    SIFTS_mapping_table = preprocessing_SIFTS(sifts_file.readlines())
with open(f"{info_path}/PDBSWS_chain_mapping_table.txt", "r") as pdbsws_file:
    PDBSWS_mapping_table = preprocessing_PDBSWS(pdbsws_file.readlines())

In [40]:
def get_uniprot(row):
    """Look up the UniProt accession of every chain listed in ``row``.

    Each ``"<PDB>_<chain>"`` key is resolved in ``SIFTS_mapping_table`` when
    present there, otherwise in ``PDBSWS_mapping_table``.

    Returns:
        str | None: Accessions joined by "," in chain order, or ``None`` when
        any lookup fails (the PDB id and the error are printed).
    """
    try:
        pdb_id = row.PDB
        accessions = []
        for chain_id in row.Chain.split(","):
            key = f"{pdb_id}_{chain_id}"
            # SIFTS takes precedence; PDBSWS is only the fallback.
            table = SIFTS_mapping_table if key in SIFTS_mapping_table else PDBSWS_mapping_table
            accessions.append(table[key])
        return ",".join(accessions)

    except Exception as e:
        print(row.PDB, e)
        return None

In [41]:
def get_uniprot_id_bulk(df):
    """Run ``get_uniprot`` on every row of ``df`` and return the per-row results."""
    per_row_ids = df.apply(get_uniprot, axis=1)
    return per_row_ids

In [42]:
# Resolve UniProt ids in 5 worker processes; the result is a list with one Series per partition.
uniprot_ids = parallelize_dataframe(data_df, get_uniprot_id_bulk, num_partitions = 5)

In [43]:
# Re-assemble the partitions and attach them as a column (aligned on the row index).
data_df["Uniprot_ID"] = pd.concat(uniprot_ids)

In [44]:
# Drop complexes for which a chain could not be mapped to a UniProt id.
has_uniprot = data_df.Uniprot_ID.notna()
data_df = data_df.loc[has_uniprot].reset_index(drop=True)
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,Ligand_chain,Ligand_chain_number,Ligand_chain_code,SMILES,Uniprot_ID
0,8cgt,A,DPDTAVTNKQSFSTDVIYQVFTDRFLDGNPSNNPTGAAYDATCSNL...,684,684,"46,88,93,97,99,100,182,193,194,195,196,232,258...",A,701,TM6,OCC1OC(SC2C(CO)OC(OC3C(CO)OC(SC4C(CO)OC(OC5C(C...,P30920
1,6gsx,"A,B",PMILGFWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWL...,434,217217,"5,6,8,10,11,41,44,48,57,58,59,70,71,106,110,11...","A,B",221218,"GPS,GPS",NC(CCC(=O)NC(CSC1C2=C(C=CC=C2)C2=C(C=CC=C2)C1O...,"P04905,P04905"
2,9ldb,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,662,331331,"25,27,28,29,50,51,52,93,94,95,96,97,111,114,11...","A,B,B",401401402,"NAD,NAD,OXM",NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N4C...,"P00339,P00339"
3,8cpa,A,ARSTNTFNYATYHTLDEIYDFMDLLVAQHPELVSKLQIGRSYEGRP...,307,307,"68,70,71,126,143,144,162,195,196,197,242,246,2...",A,309,AGF,CC(NC(=O)OCC1=CC=CC=C1)C(=O)NCP(=O)(O)OC(CC1=C...,P00730
4,6gsy,"A,B",PMILGFWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWL...,434,217217,"5,6,11,41,44,48,57,58,59,70,71,321|104,223,228...","A,B",218218,"GSH,GSH","NC(CCC(=O)NC(CS)C(=O)NCC(=O)O)C(=O)O,NC(CCC(=O...","P04905,P04905"
5,2qwj,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388,388,3637697097141143165195196211289323,A,800,G28,CCN(CC)C(=O)C1OC(C(=O)O)CC(N)C1NC(C)=O,P03472
6,9ldt,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,662,331331,"25,27,28,29,50,51,52,93,94,95,96,97,111,114,11...","A,A,B,B",401402401402,"NAD,OXM,NAD,OXM",NC(=O)C1C=CCN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(N...,"P00339,P00339"
7,4upj,"A,B",PQFSLWKRPVVTAYIEGQPVEVLLDTGADDSIVAGIELGNNYSPKI...,198,9999,"24,27,28,29,46,47,48,49,81,83,106,123,125,126,...",A,100,U04,CCC(C1=CC=CC(NC(=O)CNC(=O)OC(C)(C)C)=C1)C1=C(O...,"P04584,P04584"
8,2qwk,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,388,388,3637697097141143195196211213289323,A,800,G39,CCC(CC)OC1C=C(C(=O)O)CC(N)C1NC(C)=O,P03472
9,3pgt,"A,B",MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,419,210209,"8,10,13,38,44,50,51,52,53,64,65,104,108,205,30...","A,B",214213,"GBX,GBX",NC(CCC(=O)NC(CSC1C2=C3C=CC4=CC=CC5=CC=C(C=C2C(...,"P09211,P09211"


### 8. Save data

In [45]:
# Keep only the columns that are exported. Selecting by name (rather than by
# position 0, 2, 5) stays correct if the column order changes upstream.
data_df = data_df[["PDB", "Sequence", "BS"]]

In [46]:
# Write the final table as a tab-separated file without the row index.
data_df.to_csv("../datasets/examples/HOLO4K_data.tsv", sep = "\t", index = False)