In [1]:
import numpy as np
import pandas as pd
import os
from biopandas.mol2 import PandasMol2
from multiprocessing import Process, Queue, Pool
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
from Bio import PDB
from scipy.spatial import distance_matrix
import pickle

### 1. Load data

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Step-3 binding-site dictionaries, one per dataset.
scPDB_dict = _load_pickle("./preprocessed_bs_data/step3_scPDB_data.pkl")
PDBbind_dict = _load_pickle("./preprocessed_bs_data/step3_PDBbind_data.pkl")
CASF2016_dict = _load_pickle("./preprocessed_bs_data/step3_CASF2016_data.pkl")
CASF2013_dict = _load_pickle("./preprocessed_bs_data/step3_CASF2013_data.pkl")
CSAR2014_dict = _load_pickle("./preprocessed_bs_data/step3_CSAR2014_data.pkl")
CSAR2012_dict = _load_pickle("./preprocessed_bs_data/step3_CSAR2012_data.pkl")
CSARset1_dict = _load_pickle("./preprocessed_bs_data/step3_CSARset1_data.pkl")
CSARset2_dict = _load_pickle("./preprocessed_bs_data/step3_CSARset2_data.pkl")
Astex_dict = _load_pickle("./preprocessed_bs_data/step3_Astex_data.pkl")
COACH420_dict = _load_pickle("./preprocessed_bs_data/step3_COACH420_data.pkl")
HOLO4K_dict = _load_pickle("./preprocessed_bs_data/step3_HOLO4K_data.pkl")

### 2. Add Uniprot ID and sequence

In [3]:
def preprocessing_PDBSWS(lines):
    """Build a ``"<field0>_<field1>" -> <last field>`` lookup from mapping lines.

    Each line is stripped and split on a single space; only lines that yield
    exactly three fields are kept. When a key repeats, the later line wins.
    """
    split_rows = (raw.strip().split(" ") for raw in lines)
    return {
        f"{fields[0]}_{fields[1]}": fields[-1]
        for fields in split_rows
        if len(fields) == 3
    }

In [4]:
def preprocessing_SIFTS(lines):
    """Build a ``"<col0>_<col1>" -> <col2>`` lookup from tab-separated lines.

    The first two lines are skipped (treated as header lines). Lines with
    fewer than three tab-separated fields (e.g. a trailing blank line) are
    ignored instead of raising ``IndexError``. When a key repeats, the later
    line wins.
    """
    data = dict()

    for line in lines[2:]:
        line_list = line.strip().split("\t")
        if len(line_list) < 3:
            # Blank or truncated row: nothing to map.
            continue
        data[f"{line_list[0]}_{line_list[1]}"] = line_list[2]
    return data

In [5]:
def read_file(file):
    """Return all remaining lines of an open text handle as a list (newlines kept)."""
    return list(file)

In [6]:
# Read the chain-mapping table inside a context manager so the file handle is closed.
with open("./data/supplementary/SIFTS_chain_mapping_table.txt", "r") as handle:
    SIFTS_mapping_table = preprocessing_SIFTS(read_file(handle))
len(SIFTS_mapping_table)

593192

In [7]:
# Read the chain-mapping table inside a context manager so the file handle is closed.
with open("./data/supplementary/PDBSWS_chain_mapping_table.txt", "r") as handle:
    PDBSWS_mapping_table = preprocessing_PDBSWS(read_file(handle))
len(PDBSWS_mapping_table)

354753

In [8]:
def add_uniprot_id(dict_):
    """Annotate every entry of ``dict_`` with a ``"Uniprot_IDs"`` string.

    For each comma-separated chain in ``entry["Chain"]`` the key
    ``"<pdb_id>_<chain>"`` is looked up first in ``SIFTS_mapping_table`` and
    then in ``PDBSWS_mapping_table``; chains found in neither get the
    placeholder ``'None'``. The per-chain results are joined with commas.
    The dictionary is modified in place and also returned; the number of
    unmapped chains is printed.
    """
    unmapped = 0

    for entry_key, entry in dict_.items():
        pdb_code = entry_key.split("_")[0]
        resolved = []

        for chain_id in entry["Chain"].split(","):
            lookup = f"{pdb_code}_{chain_id}"

            if lookup in SIFTS_mapping_table:
                accession = SIFTS_mapping_table[lookup]
            elif lookup in PDBSWS_mapping_table:
                accession = PDBSWS_mapping_table[lookup]
            else:
                accession = 'None'
                unmapped += 1

            resolved.append(accession)

        entry["Uniprot_IDs"] = ",".join(resolved)

    print(f"\tNo mapping {unmapped} for total chain")
    return dict_

def get_unique_uniprot_ids(dict_):
    """Print how many distinct accessions (ignoring ``"None"``) appear in ``dict_``.

    Accessions are read from the comma-separated ``"Uniprot_IDs"`` field of
    every entry. Nothing is returned.
    """
    distinct = {
        accession
        for entry in dict_.values()
        for accession in entry["Uniprot_IDs"].split(",")
        if accession != "None"
    }

    print(f"Unique Uniprot IDs: {len(distinct)}")
    print()

def get_uniprot_ids(dict_list):
    """Collect the distinct accessions found across several dataset dictionaries.

    Every entry's comma-separated ``"Uniprot_IDs"`` field is split and the
    ``"None"`` placeholder is dropped. Each distinct accession is printed
    followed by a comma (no newline), and the set of accessions is returned.
    """
    collected = [
        accession
        for dataset in dict_list
        for entry in dataset.values()
        for accession in entry["Uniprot_IDs"].split(",")
        if accession != "None"
    ]

    for accession in list(set(collected)):
        print(accession, end = ",")

    return set(collected)

#### 2.1 Add Uniprot ID

In [9]:
# Attach per-chain UniProt accessions to every scPDB entry, then print the distinct-accession count.
scPDB_dict = add_uniprot_id(scPDB_dict)
get_unique_uniprot_ids(scPDB_dict)

	No mapping 233 for total chain
Unique Uniprot IDs: 5515



In [10]:
# Attach per-chain UniProt accessions to every PDBbind entry, then print the distinct-accession count.
PDBbind_dict = add_uniprot_id(PDBbind_dict)
get_unique_uniprot_ids(PDBbind_dict)

	No mapping 436 for total chain
Unique Uniprot IDs: 3038



In [11]:
# Attach per-chain UniProt accessions to every CASF2016 entry, then print the distinct-accession count.
CASF2016_dict = add_uniprot_id(CASF2016_dict)
get_unique_uniprot_ids(CASF2016_dict)

	No mapping 2 for total chain
Unique Uniprot IDs: 77



In [12]:
# Attach per-chain UniProt accessions to every CASF2013 entry, then print the distinct-accession count.
CASF2013_dict = add_uniprot_id(CASF2013_dict)
get_unique_uniprot_ids(CASF2013_dict)

	No mapping 1 for total chain
Unique Uniprot IDs: 75



In [13]:
# Attach per-chain UniProt accessions to every CSAR2014 entry, then print the distinct-accession count.
CSAR2014_dict = add_uniprot_id(CSAR2014_dict)
get_unique_uniprot_ids(CSAR2014_dict)

	No mapping 0 for total chain
Unique Uniprot IDs: 3



In [14]:
# Attach per-chain UniProt accessions to every CSAR2012 entry, then print the distinct-accession count.
CSAR2012_dict = add_uniprot_id(CSAR2012_dict)
get_unique_uniprot_ids(CSAR2012_dict)

	No mapping 0 for total chain
Unique Uniprot IDs: 8



In [15]:
# Attach per-chain UniProt accessions to every CSARset1 entry, then print the distinct-accession count.
CSARset1_dict = add_uniprot_id(CSARset1_dict)
get_unique_uniprot_ids(CSARset1_dict)

	No mapping 3 for total chain
Unique Uniprot IDs: 116



In [16]:
# Attach per-chain UniProt accessions to every CSARset2 entry, then print the distinct-accession count.
CSARset2_dict = add_uniprot_id(CSARset2_dict)
get_unique_uniprot_ids(CSARset2_dict)

	No mapping 8 for total chain
Unique Uniprot IDs: 104



In [17]:
# Attach per-chain UniProt accessions to every Astex entry, then print the distinct-accession count.
Astex_dict = add_uniprot_id(Astex_dict)
get_unique_uniprot_ids(Astex_dict)

	No mapping 0 for total chain
Unique Uniprot IDs: 79



In [18]:
# Attach per-chain UniProt accessions to every COACH420 entry, then print the distinct-accession count.
COACH420_dict = add_uniprot_id(COACH420_dict)
get_unique_uniprot_ids(COACH420_dict)

	No mapping 0 for total chain
Unique Uniprot IDs: 245



In [19]:
# Attach per-chain UniProt accessions to every HOLO4K entry, then print the distinct-accession count.
HOLO4K_dict = add_uniprot_id(HOLO4K_dict)
get_unique_uniprot_ids(HOLO4K_dict)

	No mapping 8 for total chain
Unique Uniprot IDs: 1144



#### 2.2 Add Uniprot sequences

In [20]:
# Distinct accessions across scPDB and PDBbind; get_uniprot_ids also prints them comma-separated.
total_uniprot_ids = get_uniprot_ids([scPDB_dict, PDBbind_dict])

Q8G9Q0,P35270,P62805,Q5EEL8,Q9UBQ7,D4GTL2,P11961,A5F5Y4,Q6A8X5,Q5SJP9,Q6Q783,O52646,P18079,Q5JDA3,Q27797,Q9LCY2,Q1RBS0,E8ME30,P25891,O80842,A0A0H2UYY2,B1MB31,Q5CRJ8,Q86A17,P23478,O80944,B5ZAD9,Q5LUA7,P9WFY1,P43246,Q8VHN2,P00730,Q3M362,P97571,P00516,Q70AY4,Q44467,M1E1E4,A0A0D2ILK1,Q9WZY5,P17157,P0C1S8,Q2S3S9,P32755,P00693,P39315,P00344,Q8PJX9,B0B9A0,B1L7P8,P03366,P47228,Q5YUF0,Q9BUB5,P09945,O75367,F1NUJ7,Q08649,P12530,Q15910,E7E815,D0VWY5,A0A0H2UL03,Q67PI3,Q8X607,Q9X0C3,Q9E3M8,A0R618,P51617,P10868,Q8GR64,B0F481,Q07889,O88188,P0AEY8,Q6IQ20,Q8DLC2,Q9UQR0,Q5FWF5,A0A1L8F5J9,P40859,Q9HUT5,A3MUJ4,O87696,P15813,P39131,Q43088,O08795,P12493,Q9HCH5,P37487,C3TPN7,G7VCG0,H2QL32,Q8D151,Q9SWS1,Q5VWZ2,P11978,Q3JWH1,O32449,P35398,P36929,O57693,Q9KQG2,B7QK46,P14779,Q8KKW2,P00791,P14169,O74933,P20901,A4JC15,O05205,P00259,Q6IWJ5,Q6V1M8,P9WNG3,P16603,W8SZG1,P61431,Q980A5,Q6NZB1,Q9X1G6,Q39588,O08760,B8AVF0,Q8WS26,Q5UPJ7,P70604,Q8IEW2,O15075,P56221,P27395,Q877G8,P58154,Q92905,C4LW95,P39898,P00798,Q2FZ56,Q3YT

In [21]:
def preprocessing_fasta(lines):
    """Parse FASTA lines into an ``{accession: sequence}`` dictionary.

    A line containing ``">"`` is treated as a header and the accession is the
    second ``|``-separated field. All following non-header lines are
    concatenated (after stripping) to form the sequence.

    The first non-empty sequence seen for an accession is kept. This now also
    holds for the final record, which previously overwrote an earlier
    duplicate. Empty input returns an empty dictionary instead of raising.
    """
    data = dict()
    uniprot_id, chunks = None, []

    def _commit():
        # Store the pending record only if it has residues and is not a duplicate.
        seqs = "".join(chunks)
        if uniprot_id is not None and seqs and uniprot_id not in data:
            data[uniprot_id] = seqs

    for line in lines:
        line = line.strip()

        if ">" in line:
            _commit()
            uniprot_id, chunks = line.split("|")[1], []
        else:
            chunks.append(line)

    _commit()

    return data

In [22]:
# Read the FASTA file inside a context manager so the file handle is closed.
with open("./data/supplementary/BS_uniprot_mapping_results.fasta", "r") as handle:
    uniprot_fasta_lines = read_file(handle)

In [23]:
# Parse the FASTA lines into an accession -> sequence dictionary and report its size.
uniprot_fasta_dict = preprocessing_fasta(uniprot_fasta_lines)
print(f"Uniprot fasta: {len(uniprot_fasta_dict)}")

Uniprot fasta: 7086


In [24]:
def mapping_to_uniprot_seqs(dict_, uniprot_fasta_dict):
    """Add a ``"Uniprot_seqs"`` field to every entry of ``dict_``.

    Each accession in the entry's comma-separated ``"Uniprot_IDs"`` is
    replaced by its sequence from ``uniprot_fasta_dict``, or by the
    placeholder ``"None"`` when absent. The sequences are joined with commas
    in the same order. ``dict_`` is modified in place and returned.
    """
    for entry in dict_.values():
        accessions = entry["Uniprot_IDs"].split(",")
        entry["Uniprot_seqs"] = ",".join(
            uniprot_fasta_dict.get(accession, "None") for accession in accessions
        )

    return dict_

In [25]:
# Add the per-chain "Uniprot_seqs" field to every scPDB entry.
scPDB_dict = mapping_to_uniprot_seqs(scPDB_dict, uniprot_fasta_dict)

In [26]:
# Add the per-chain "Uniprot_seqs" field to every PDBbind entry.
PDBbind_dict = mapping_to_uniprot_seqs(PDBbind_dict, uniprot_fasta_dict)

#### 2.3 Check single protein chain

In [27]:
def check_key(dict_):
    """Return the keys of entries whose pocket does not map to exactly one sequence.

    The pocket chains of an entry are the union of the first element of every
    ``entry["BS_8A"][ligand]`` value. An entry is rejected when any pocket
    chain has the placeholder sequence ``'None'`` in ``"Uniprot_seqs"``, or
    when the pocket chains do not share exactly one distinct sequence (which
    includes the case of no pocket chains at all).
    """
    rejected = list()

    for entry_key, entry in dict_.items():
        chain_names = entry["Chain"].split(",")
        chain_seqs = entry["Uniprot_seqs"].split(",")

        pocket_chains = {
            chain_id
            for ligand_site in entry["BS_8A"].values()
            for chain_id in ligand_site[0]
        }
        pocket_seqs = [chain_seqs[chain_names.index(chain_id)] for chain_id in pocket_chains]

        if 'None' in pocket_seqs or len(set(pocket_seqs)) != 1:
            rejected.append(entry_key)

    return rejected

In [28]:
# Drop the scPDB entries rejected by check_key; prints the number removed, then the number remaining.
# Note: this deletes from scPDB_dict in place.
scPDB_remove_keys = check_key(scPDB_dict)
print(len(scPDB_remove_keys))

for remove_ in scPDB_remove_keys:
    del scPDB_dict[remove_]

print(len(scPDB_dict))

809
16346


In [29]:
# Drop the PDBbind entries rejected by check_key; prints the number removed, then the number remaining.
# Note: this deletes from PDBbind_dict in place.
PDBbind_remove_keys = check_key(PDBbind_dict)
print(len(PDBbind_remove_keys))

for remove_ in PDBbind_remove_keys:
    del PDBbind_dict[remove_]

print(len(PDBbind_dict))

686
13715


#### 2.4 Make FASTA files

In [30]:
def make_fasta(dict_, query_file_name, target_file_name):
    """Write paired query (PDB) and target (UniProt) FASTA files under ``./fasta_bs/``.

    For every chain of every entry, the query file receives a record
    ``>{pdb_id}_{chain}`` with the PDB sequence and the target file receives
    ``>{pdb_id}_{chain}_{uniprot_id}`` with the UniProt sequence, in the same
    order.

    Chains whose accession or UniProt sequence is the ``"None"`` placeholder
    are skipped in both files, so no record carries the literal text "None"
    as its sequence. Both files are opened in a ``with`` block so they are
    closed even if writing fails.
    """
    query_path = f"./fasta_bs/{query_file_name}"
    target_path = f"./fasta_bs/{target_file_name}"

    with open(query_path, "w") as fw1, open(target_path, "w") as fw2:
        for keys in dict_:
            pdb_id = keys.split("_")[0]
            entry = dict_[keys]

            pdb_chains, pdb_seqs = entry["Chain"].split(","), entry["PDB_seqs"].split(",")
            uniprot_ids, uniprot_seqs = entry["Uniprot_IDs"].split(","), entry["Uniprot_seqs"].split(",")

            for pdb_chain, pdb_seq, uniprot_id, uniprot_seq in zip(pdb_chains, pdb_seqs, uniprot_ids, uniprot_seqs):
                # "None" marks a missing accession / sequence elsewhere in this notebook.
                if uniprot_id == "None" or uniprot_seq == "None":
                    continue

                fw1.write(f">{pdb_id}_{pdb_chain}\n{pdb_seq}\n")
                fw2.write(f">{pdb_id}_{pdb_chain}_{uniprot_id}\n{uniprot_seq}\n")

In [31]:
# Write the query (PDB) / target (UniProt) FASTA pairs for scPDB and PDBbind into ./fasta_bs/.
make_fasta(scPDB_dict, "BS_scPDB_query_pdb.fasta", "BS_scPDB_target_uniprot_pdb.fasta")
make_fasta(PDBbind_dict, "BS_PDBbind_query_pdb.fasta", "BS_PDBbind_target_uniprot_pdb.fasta")

In [32]:
def get_total_uniprot_dict(df_list):
    """Merge several DataFrames into one ``{column 1 value: column 4 value}`` dict.

    Columns are addressed by position (second and fifth column of each
    frame). Later rows and later frames overwrite earlier ones when a key
    repeats.
    """
    data = dict()

    for frame in df_list:
        data.update((row[1], row[4]) for row in frame.values)

    return data

def add_uniprot_seqs(input_dict, uniprot_fasta_dict):
    """Write a comma-joined ``"Uniprot_seqs"`` field into every entry of ``input_dict``.

    The entry's ``"Uniprot_IDs"`` string is split on commas and each accession
    is translated through ``uniprot_fasta_dict``; unknown accessions become
    ``"None"``. ``input_dict`` is updated in place and returned.
    """
    def _sequence_for(accession):
        # Fall back to the "None" placeholder for accessions without a sequence.
        return uniprot_fasta_dict[accession] if accession in uniprot_fasta_dict else "None"

    for record in input_dict.values():
        record["Uniprot_seqs"] = ",".join(map(_sequence_for, record["Uniprot_IDs"].split(",")))

    return input_dict

In [33]:
# Load the step-5 tab-separated tables for the benchmark datasets.
# NOTE(review): `CASR2012_DTA_df` looks like a misspelling of "CSAR2012"; the name is kept
# because the next cell refers to it with the same spelling.
CASF2016_DTA_df = pd.read_csv("./preprocessed_data/step5_CASF2016_data.tsv", sep = "\t")
CASF2013_DTA_df = pd.read_csv("./preprocessed_data/step5_CASF2013_data.tsv", sep = "\t")
CSAR2014_DTA_df = pd.read_csv("./preprocessed_data/step5_CSAR2014_data.tsv", sep = "\t")
CASR2012_DTA_df = pd.read_csv("./preprocessed_data/step5_CSAR2012_data.tsv", sep = "\t")
CSARset1_DTA_df = pd.read_csv("./preprocessed_data/step5_CSARset1_data.tsv", sep = "\t")
CSARset2_DTA_df = pd.read_csv("./preprocessed_data/step5_CSARset2_data.tsv", sep = "\t")
Astex_DTA_df = pd.read_csv("./preprocessed_data/step5_Astex_data.tsv", sep = "\t")
COACH420_df = pd.read_csv("./preprocessed_data/step5_COACH420_data.tsv", sep = "\t")
HOLO4K_df = pd.read_csv("./preprocessed_data/step5_HOLO4K_data.tsv", sep = "\t")

In [34]:
# Merge the benchmark tables into one lookup built from their second and fifth columns
# (presumably accession -> sequence; verify against the TSV schema).
total_uniprot_dict = get_total_uniprot_dict([CASF2016_DTA_df, CASF2013_DTA_df, CSAR2014_DTA_df, CASR2012_DTA_df,
                                            CSARset1_DTA_df, CSARset2_DTA_df, Astex_DTA_df, COACH420_df, HOLO4K_df])

In [35]:
# The `*_BS_dict` names were never defined in this notebook, so the original cell
# raised NameError on a fresh kernel. Build them from the dictionaries loaded in
# section 1 (already annotated with "Uniprot_IDs"). add_uniprot_seqs updates its
# argument in place and returns it, so each `*_BS_dict` is the same object as the
# corresponding `*_dict`.
CASF2016_BS_dict = add_uniprot_seqs(CASF2016_dict, total_uniprot_dict)
CASF2013_BS_dict = add_uniprot_seqs(CASF2013_dict, total_uniprot_dict)
CSAR2014_BS_dict = add_uniprot_seqs(CSAR2014_dict, total_uniprot_dict)
CSAR2012_BS_dict = add_uniprot_seqs(CSAR2012_dict, total_uniprot_dict)
CSARset1_BS_dict = add_uniprot_seqs(CSARset1_dict, total_uniprot_dict)
CSARset2_BS_dict = add_uniprot_seqs(CSARset2_dict, total_uniprot_dict)
Astex_BS_dict = add_uniprot_seqs(Astex_dict, total_uniprot_dict)
COACH420_BS_dict = add_uniprot_seqs(COACH420_dict, total_uniprot_dict)
HOLO4K_BS_dict = add_uniprot_seqs(HOLO4K_dict, total_uniprot_dict)

### 3. Mapping binding sites to UniProt

In [36]:
def get_results_dict(path):
    """Parse a pairwise-alignment report into a per-target dictionary.

    The file is read in groups of four lines (the fourth is ignored):

    * line 0: either a ``target_name`` header, which starts a new record, or
      a target alignment row whose second tab field is appended to the
      target sequence;
    * line 1: either a ``query_name`` header (ignored) or the match row whose
      second tab field is appended to the match string;
    * line 2: either the ``optimal_alignment_score`` row, from which
      ``target_begin:`` and ``query_begin:`` are read, or a query alignment
      row whose second tab field is appended to the query sequence.

    Returns ``{target_name: (seq_target, seq_query, align, target_start,
    query_start)}``. Records with an empty target alignment are not stored,
    and an empty file yields ``{}``.
    """
    result_dict = dict()
    target_name = None
    seq_target, seq_query, align = "", "", ""
    target_start, query_start = None, None

    def _store():
        # Keep only records that actually carry an alignment.
        if target_name is not None and len(seq_target) != 0:
            result_dict[target_name] = (seq_target, seq_query, align, target_start, query_start)

    with open(path, "r") as f:
        for i, line in enumerate(f):
            slot = i % 4

            if slot == 0:
                if 'target_name' in line:
                    _store()
                    target_name = line.strip().split(' ')[-1]
                    seq_target, seq_query, align = '', '', ''
                else:
                    seq_target += line.split('\t')[1]

            elif slot == 1:
                if 'query_name' not in line:
                    align += line.strip('\n').split('\t')[1]

            elif slot == 2:
                if 'optimal_alignment_score' in line:
                    for item in line.strip().split('\t'):
                        fields = item.split(' ')
                        if fields[0] == 'target_begin:':
                            target_start = int(fields[1])
                        elif fields[0] == 'query_begin:':
                            query_start = int(fields[1])
                else:
                    seq_query += line.split('\t')[1]

    _store()

    return result_dict

In [37]:
# Parse the alignment report of every dataset and print how many records each one yields.
# scPDB / PDBbind reports live in ./fasta_bs/, the benchmark reports in ./fasta/.
scPDB_mapping_results = get_results_dict("./fasta_bs/BS_scPDB_pdb_align.txt")
print(f"[scPDB]: {len(scPDB_mapping_results)}")

PDBbind_mapping_results = get_results_dict("./fasta_bs/BS_PDBbind_pdb_align.txt")
print(f"[PDBbind]: {len(PDBbind_mapping_results)}")

CASF2016_mapping_results = get_results_dict("./fasta/CASF2016_pdb_align.txt")
print(f"[CASF2016]: {len(CASF2016_mapping_results)}")

CASF2013_mapping_results = get_results_dict("./fasta/CASF2013_pdb_align.txt")
print(f"[CASF2013]: {len(CASF2013_mapping_results)}")

CSAR2014_mapping_results = get_results_dict("./fasta/CSAR2014_pdb_align.txt")
print(f"[CSAR2014]: {len(CSAR2014_mapping_results)}")

CSAR2012_mapping_results = get_results_dict("./fasta/CSAR2012_pdb_align.txt")
print(f"[CSAR2012]: {len(CSAR2012_mapping_results)}")

CSARset1_mapping_results = get_results_dict("./fasta/CSARset1_pdb_align.txt")
print(f"[CSARset1]: {len(CSARset1_mapping_results)}")

CSARset2_mapping_results = get_results_dict("./fasta/CSARset2_pdb_align.txt")
print(f"[CSARset2]: {len(CSARset2_mapping_results)}")

Astex_mapping_results = get_results_dict("./fasta/Astex_pdb_align.txt")
print(f"[Astex]: {len(Astex_mapping_results)}")

COACH420_mapping_results = get_results_dict("./fasta/COACH420_pdb_align.txt")
print(f"[COACH420]: {len(COACH420_mapping_results)}")

HOLO4K_mapping_results = get_results_dict("./fasta/HOLO4K_pdb_align.txt")
print(f"[HOLO4K]: {len(HOLO4K_mapping_results)}")

[scPDB]: 35131
[PDBbind]: 29461
[CASF2016]: 588
[CASF2013]: 341
[CSAR2014]: 47
[CSAR2012]: 73
[CSARset1]: 301
[CSARset2]: 257
[Astex]: 142
[COACH420]: 522
[HOLO4K]: 5696


In [38]:
def seq_with_gap_to_idx(seq):
    """Number the residues of a gapped sequence.

    Returns one value per character of ``seq``: ``-1`` for a gap (``'-'``)
    and otherwise the 0-based position of that residue in the ungapped
    sequence.
    """
    positions, next_residue = [], 0

    for symbol in seq:
        if symbol != '-':
            positions.append(next_residue)
            next_residue += 1
        else:
            positions.append(-1)

    return positions

def get_target_idx(target_idx_list, query_idx_list, align, target_start, query_start):
    """Translate target residue positions into query positions along an alignment.

    The result starts with ``target_start - 1`` entries of ``-1`` for the
    target residues before the aligned region. Then, for every alignment
    column that holds a target residue, it appends the 0-based query position
    (``query index + query_start - 1``) when the column is an identity
    (``'|'``) with a query residue, and ``-1`` otherwise. Columns where the
    target has a gap contribute nothing.
    """
    mapping = [-1] * (target_start - 1)
    offset = query_start - 1

    for column, target_pos in enumerate(target_idx_list):
        if target_pos == -1:
            continue

        matched = align[column] == '|' and query_idx_list[column] != -1
        mapping.append(query_idx_list[column] + offset if matched else -1)

    return mapping

def get_pdb_to_uniprot_map(result_dict, min_identity=0.9):
    """Build ``{pdbid: {chain: target-to-query index list}}`` from alignment records.

    Parameters
    ----------
    result_dict : dict
        ``{"<pdbid>_<chain>": (seq_target, seq_query, align, target_start,
        query_start)}`` as produced by ``get_results_dict``.
    min_identity : float, optional
        Minimum fraction of aligned target residues that must be identities
        (``'|'`` in ``align``). Defaults to ``0.9``, the previously
        hard-coded threshold.

    Records below the threshold are skipped, as are records whose target
    alignment has no residues (this used to raise ``ZeroDivisionError``).
    """
    pdb_to_uniprot_map_dict = dict()

    for name in result_dict:
        pdbid, chain = name.split('_')
        seq_target, seq_query, align, target_start, query_start = result_dict[name]

        n_target_residues = len(seq_target.replace('-', ''))
        if n_target_residues == 0:
            # Nothing was aligned; no identity ratio can be computed.
            continue

        ratio = float(align.count('|')) / float(n_target_residues)
        if ratio < min_identity:
            continue

        target_idx_list = seq_with_gap_to_idx(seq_target)
        query_idx_list = seq_with_gap_to_idx(seq_query)
        pdb_to_uniprot_idx = get_target_idx(target_idx_list, query_idx_list, align, target_start, query_start)

        pdb_to_uniprot_map_dict.setdefault(pdbid, {})[chain] = pdb_to_uniprot_idx

    return pdb_to_uniprot_map_dict

In [39]:
# Turn each dataset's alignment records into per-chain index maps and print how many
# PDB IDs passed the identity filter in get_pdb_to_uniprot_map.
scPDB_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(scPDB_mapping_results)
print(f"[scPDB]: {len(scPDB_pdb_to_uniprot_map_dict)}")

PDBbind_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(PDBbind_mapping_results)
print(f"[PDBbind]: {len(PDBbind_pdb_to_uniprot_map_dict)}")

CASF2016_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(CASF2016_mapping_results)
print(f"[CASF2016]: {len(CASF2016_pdb_to_uniprot_map_dict)}")

CASF2013_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(CASF2013_mapping_results)
print(f"[CASF2013]: {len(CASF2013_pdb_to_uniprot_map_dict)}")

CSAR2014_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(CSAR2014_mapping_results)
print(f"[CSAR2014]: {len(CSAR2014_pdb_to_uniprot_map_dict)}")

CSAR2012_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(CSAR2012_mapping_results)
print(f"[CSAR2012]: {len(CSAR2012_pdb_to_uniprot_map_dict)}")

CSARset1_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(CSARset1_mapping_results)
print(f"[CSARset1]: {len(CSARset1_pdb_to_uniprot_map_dict)}")

CSARset2_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(CSARset2_mapping_results)
print(f"[CSARset2]: {len(CSARset2_pdb_to_uniprot_map_dict)}")

Astex_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(Astex_mapping_results)
print(f"[Astex]: {len(Astex_pdb_to_uniprot_map_dict)}")

COACH420_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(COACH420_mapping_results)
print(f"[COACH420]: {len(COACH420_pdb_to_uniprot_map_dict)}")

HOLO4K_pdb_to_uniprot_map_dict = get_pdb_to_uniprot_map(HOLO4K_mapping_results)
print(f"[HOLO4K]: {len(HOLO4K_pdb_to_uniprot_map_dict)}")

[scPDB]: 15363
[PDBbind]: 14487
[CASF2016]: 268
[CASF2013]: 170
[CSAR2014]: 46
[CSAR2012]: 55
[CSARset1]: 152
[CSARset2]: 132
[Astex]: 74
[COACH420]: 266
[HOLO4K]: 3167


In [40]:
def read_fasta(fasta_file):
    """Read a target FASTA file into ``{"<pdbid>_<chain>": (sequence, name)}``.

    Headers are expected to look like ``>{pdbid}_{chain}_{name}``; ``name`` is
    the last ``_``-separated field. Compared with the previous version:

    * blank lines are skipped (a trailing empty line used to replace the
      stored sequence with ``""``);
    * sequences wrapped over several lines are concatenated instead of
      keeping only the last line;
    * the file is read inside a ``with`` block so the handle is always closed.

    A header with no sequence lines produces no entry; a repeated key is
    overwritten by the later record.
    """
    uniprot_seq_dict = {}
    key, name, chunks = None, None, []

    with open(fasta_file) as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue

            if line[0] == '>':
                fields = line.split('_')
                key = f"{fields[0][1:]}_{fields[1]}"
                name = fields[-1]
                chunks = []
            elif key is not None:
                chunks.append(line)
                uniprot_seq_dict[key] = ("".join(chunks), name)

    return uniprot_seq_dict

def make_seq_dict(chains, seqs, seqs_index):
    """Pair each chain with its sequence and residue-numbering string.

    ``chains`` and ``seqs`` are comma-separated, ``seqs_index`` is
    semicolon-separated; the three are zipped position by position into
    ``{chain: (sequence, numbering)}``.
    """
    triples = zip(chains.split(","), seqs.split(","), seqs_index.split(";"))
    return {chain_id: (sequence, numbering) for chain_id, sequence, numbering in triples}

def get_interact_in_uniprot_seq(pdb_to_uniprot, seq_dict, chain_index, BS_index):
    """Translate binding-site residue labels into mapped sequence positions.

    Parameters
    ----------
    pdb_to_uniprot : dict
        ``{chain: list}`` giving, per PDB sequence position, the mapped
        position or ``-1`` when unmapped.
    seq_dict : dict
        ``{chain: (sequence, "label,label,...")}`` as built by ``make_seq_dict``.
    chain_index : iterable of str
        Chains of the binding site, paired in order with the
        semicolon-separated groups of ``BS_index``.
    BS_index : str
        Per-chain residue labels, e.g. ``"11,13;4,5"``.

    Returns
    -------
    (uniprot_BS, residue_record, chain)
        The mapped positions, the PDB residues at those positions (for a
        later consistency check), and the last chain iterated (``None`` when
        there was none) — note this may be a chain that was skipped.

    A label occurring zero or several times in the numbering is reported with
    the same message as before. A label that is absent is then skipped rather
    than raising ``ValueError``; a repeated label uses its first occurrence.
    """
    uniprot_BS, residue_record, chain = list(), "", None

    for chain, bs in zip(chain_index, BS_index.split(";")):
        if chain not in pdb_to_uniprot:
            continue

        sequence, idx_list = seq_dict[chain]
        idx_list = idx_list.split(",")
        chain_map = pdb_to_uniprot[chain]

        # Index the numbering once instead of calling count()/index() per residue.
        first_pos, occurrences = dict(), dict()
        for pos, label in enumerate(idx_list):
            first_pos.setdefault(label, pos)
            occurrences[label] = occurrences.get(label, 0) + 1

        for idx in bs.split(","):
            n_hits = occurrences.get(idx, 0)
            if n_hits != 1:
                print(idx, n_hits, 'idx_list.count(idx) != 1')
            if n_hits == 0:
                # Label not present in this chain's numbering: nothing to map.
                continue

            seq_pos = first_pos[idx]

            if seq_pos >= len(chain_map):
                continue

            interact_idx = chain_map[seq_pos]
            if interact_idx == -1:
                continue

            uniprot_BS.append(interact_idx)
            residue_record += sequence[seq_pos]  # kept for the caller's residue check

    return uniprot_BS, residue_record, chain

def mapping_interaction(dict_, pdb_to_uniprot_map_dict, fasta_file):
    """Project the 8 Å and 4 Å binding sites of every entry onto the target sequence.

    Parameters
    ----------
    dict_ : dict
        ``{"<pdbid>_<ligand>...": entry}`` where each entry provides
        ``"Chain"``, ``"PDB_seqs"``, ``"PDB_indexes"``, ``"BS_8A"`` and
        ``"BS_4A"``; the two ``BS_*`` values map a ligand key to
        ``(chains, residue-label string)``.
    pdb_to_uniprot_map_dict : dict
        Output of ``get_pdb_to_uniprot_map``.
    fasta_file : str
        Target FASTA file read with ``read_fasta``.

    Returns
    -------
    dict
        One record per entry that has at least one verified 8 Å position,
        holding ``"Uniprot_seqs"``, ``"Uniprot_IDs"``, ``"Uniprot_8A_BS"``,
        ``"Uniprot_4A_BS"`` (sorted, de-duplicated, comma-joined positions)
        plus ``"PDB_chain"``, ``"PDB_seqs"`` and ``"PDB_indexes"`` copied
        from the input.

    A ligand is accepted only if the residues read from the PDB sequence
    equal the residues found at the mapped positions of the target sequence.
    Changes from the previous version: a ligand whose chain has no FASTA
    record is skipped instead of raising ``KeyError``; mapped positions past
    the end of the target sequence count as a failed check; and the stored
    sequence / ID come from a ligand that passed the check rather than from
    whichever ligand was iterated last. Entries whose PDB ID is missing from
    the map are reported on stdout, as before.
    """
    uniprot_seq_dict, final_results = read_fasta(fasta_file), dict()
    count_not_uniprot_map = 0

    for keys in list(dict_.keys()):
        pdbid = keys.split("_")[0]

        if pdbid not in pdb_to_uniprot_map_dict:
            count_not_uniprot_map += 1
            print(f"[Error: {count_not_uniprot_map}] PDB id: {pdbid} is not in uniprot mapping dict")
            continue

        entry = dict_[keys]
        seq_dict = make_seq_dict(entry["Chain"], entry["PDB_seqs"], entry["PDB_indexes"])
        pdb_to_uniprot = pdb_to_uniprot_map_dict[pdbid]

        uniprot_bs_8A_indexes, uniprot_bs_4A_indexes = list(), list()
        verified_seq, verified_id = None, None

        for lig in list(entry["BS_8A"].keys()):
            uniprot_bs_8A, residue_check_8A, chain = get_interact_in_uniprot_seq(
                pdb_to_uniprot, seq_dict, entry["BS_8A"][lig][0], entry["BS_8A"][lig][1])
            uniprot_bs_4A, _, _ = get_interact_in_uniprot_seq(
                pdb_to_uniprot, seq_dict, entry["BS_4A"][lig][0], entry["BS_4A"][lig][1])

            fasta_key = f"{pdbid}_{chain}"
            if fasta_key not in uniprot_seq_dict:
                # No target sequence for this chain, so the site cannot be verified.
                continue
            uniprot_seq, uniprot_id = uniprot_seq_dict[fasta_key]

            if any(pos >= len(uniprot_seq) for pos in uniprot_bs_8A):
                continue
            if residue_check_8A != ''.join(uniprot_seq[pos] for pos in uniprot_bs_8A):
                continue

            uniprot_bs_8A_indexes.extend(uniprot_bs_8A)
            uniprot_bs_4A_indexes.extend(uniprot_bs_4A)
            if uniprot_bs_8A:
                verified_seq, verified_id = uniprot_seq, uniprot_id

        if len(uniprot_bs_8A_indexes) != 0:
            final_results[keys] = {
                "Uniprot_seqs": verified_seq,
                "Uniprot_IDs": verified_id,
                "Uniprot_8A_BS": ",".join(map(str, sorted(set(uniprot_bs_8A_indexes)))),
                "Uniprot_4A_BS": ",".join(map(str, sorted(set(uniprot_bs_4A_indexes)))),
                "PDB_chain": entry["Chain"],
                "PDB_seqs": entry["PDB_seqs"],
                "PDB_indexes": entry["PDB_indexes"],
            }

    return final_results

In [42]:
# The result is stored as `scPDB_results` (used by the DataFrame cell below); the
# original print referenced the undefined `scPDB_final_results` under a "[PDBbind]" label.
scPDB_results = mapping_interaction(
    scPDB_dict,
    scPDB_pdb_to_uniprot_map_dict,
    "./fasta_bs/BS_scPDB_target_uniprot_pdb.fasta",
)

print(f"[scPDB]: {len(scPDB_results)}")

[Error: 1] PDB id: 4iwq is not in uniprot mapping dict
[Error: 2] PDB id: 3sz0 is not in uniprot mapping dict
[Error: 3] PDB id: 3ucb is not in uniprot mapping dict
[Error: 4] PDB id: 5l7i is not in uniprot mapping dict
[Error: 5] PDB id: 5xr8 is not in uniprot mapping dict
[Error: 6] PDB id: 5ds3 is not in uniprot mapping dict
[Error: 7] PDB id: 5xra is not in uniprot mapping dict
[Error: 8] PDB id: 4jkv is not in uniprot mapping dict
[Error: 9] PDB id: 4daj is not in uniprot mapping dict
[Error: 10] PDB id: 4j55 is not in uniprot mapping dict
[Error: 11] PDB id: 3sx6 is not in uniprot mapping dict
[Error: 12] PDB id: 5k2d is not in uniprot mapping dict
[Error: 13] PDB id: 4f1m is not in uniprot mapping dict
[Error: 14] PDB id: 4ye3 is not in uniprot mapping dict
[Error: 15] PDB id: 4npt is not in uniprot mapping dict
[Error: 16] PDB id: 4gda is not in uniprot mapping dict
[Error: 17] PDB id: 2xw7 is not in uniprot mapping dict
[Error: 18] PDB id: 5iu4 is not in uniprot mapping dict
[

In [43]:
# Map the PDBbind binding sites onto the target sequences and report how many entries were kept.
# NOTE(review): the other datasets store this as `<name>_results`; confirm which name later cells expect.
PDBbind_final_results = mapping_interaction(PDBbind_dict,
                                               PDBbind_pdb_to_uniprot_map_dict, 
                                               "./fasta_bs/BS_PDBbind_target_uniprot_pdb.fasta")

print(f"[PDBbind]: {len(PDBbind_final_results)}")

[Error: 1] PDB id: 5wqc is not in uniprot mapping dict
[Error: 2] PDB id: 4xy2 is not in uniprot mapping dict
[Error: 3] PDB id: 4ymj is not in uniprot mapping dict
[Error: 4] PDB id: 6drz is not in uniprot mapping dict
[Error: 5] PDB id: 3sv7 is not in uniprot mapping dict
[Error: 6] PDB id: 5jga is not in uniprot mapping dict
[Error: 7] PDB id: 6nrh is not in uniprot mapping dict
[Error: 8] PDB id: 6dm8 is not in uniprot mapping dict
[Error: 9] PDB id: 3suf is not in uniprot mapping dict
[Error: 10] PDB id: 5c2e is not in uniprot mapping dict
[Error: 11] PDB id: 4v25 is not in uniprot mapping dict
[Error: 12] PDB id: 6a94 is not in uniprot mapping dict
[Error: 13] PDB id: 6kdz is not in uniprot mapping dict
[Error: 14] PDB id: 5e7r is not in uniprot mapping dict
[Error: 15] PDB id: 4up5 is not in uniprot mapping dict
[Error: 16] PDB id: 6iql is not in uniprot mapping dict
[Error: 17] PDB id: 6qxj is not in uniprot mapping dict
[Error: 18] PDB id: 6m9t is not in uniprot mapping dict
[

[Error: 154] PDB id: 5iu8 is not in uniprot mapping dict
[Error: 155] PDB id: 5tvn is not in uniprot mapping dict
[Error: 156] PDB id: 2bu5 is not in uniprot mapping dict
[Error: 157] PDB id: 4xnv is not in uniprot mapping dict
[Error: 158] PDB id: 5jgd is not in uniprot mapping dict
[Error: 159] PDB id: 6kdx is not in uniprot mapping dict
[Error: 160] PDB id: 6ke0 is not in uniprot mapping dict
[Error: 161] PDB id: 3ttp is not in uniprot mapping dict
[Error: 162] PDB id: 5vp9 is not in uniprot mapping dict
[Error: 163] PDB id: 5vex is not in uniprot mapping dict
[Error: 164] PDB id: 5cgc is not in uniprot mapping dict
[Error: 165] PDB id: 6au2 is not in uniprot mapping dict
[Error: 166] PDB id: 5yc8 is not in uniprot mapping dict
[Error: 167] PDB id: 3hlo is not in uniprot mapping dict
[Error: 168] PDB id: 4wmu is not in uniprot mapping dict
[Error: 169] PDB id: 5afj is not in uniprot mapping dict
[Error: 170] PDB id: 3ws9 is not in uniprot mapping dict
[Error: 171] PDB id: 3oe6 is no

In [44]:
# Map the CASF2016 binding sites onto the target sequences and report how many entries were kept.
CASF2016_results = mapping_interaction(CASF2016_dict,
                                               CASF2016_pdb_to_uniprot_map_dict, 
                                               "./fasta/CASF2016_target_uniprot_pdb.fasta")

print(f"[CASF2016]: {len(CASF2016_results)}")

[Error: 1] PDB id: 1q8t is not in uniprot mapping dict
[Error: 2] PDB id: 1q8u is not in uniprot mapping dict
[Error: 3] PDB id: 1ydr is not in uniprot mapping dict
[Error: 4] PDB id: 1ydt is not in uniprot mapping dict
[Error: 5] PDB id: 2c3i is not in uniprot mapping dict
[Error: 6] PDB id: 4f2w is not in uniprot mapping dict
[Error: 7] PDB id: 4f3c is not in uniprot mapping dict
[Error: 8] PDB id: 4w9h is not in uniprot mapping dict
[Error: 9] PDB id: 4w9i is not in uniprot mapping dict
[Error: 10] PDB id: 4w9l is not in uniprot mapping dict
[Error: 11] PDB id: 5c28 is not in uniprot mapping dict
[CASF2016]: 268


In [45]:
# Use the dictionary loaded in section 1; `CASF2013_BS_dict` is never defined in this notebook.
CASF2013_results = mapping_interaction(
    CASF2013_dict,
    CASF2013_pdb_to_uniprot_map_dict,
    "./fasta/CASF2013_target_uniprot_pdb.fasta",
)

print(f"[CASF2013]: {len(CASF2013_results)}")

[Error: 1] PDB id: 1kel is not in uniprot mapping dict
[Error: 2] PDB id: 1q8t is not in uniprot mapping dict
[Error: 3] PDB id: 1q8u is not in uniprot mapping dict
[Error: 4] PDB id: 2pcp is not in uniprot mapping dict
[Error: 5] PDB id: 3su2 is not in uniprot mapping dict
[Error: 6] PDB id: 3su3 is not in uniprot mapping dict
[Error: 7] PDB id: 3su5 is not in uniprot mapping dict
[CASF2013]: 170


In [46]:
# Use the dictionary loaded in section 1; `CSAR2014_BS_dict` is never defined in this notebook.
CSAR2014_results = mapping_interaction(
    CSAR2014_dict,
    CSAR2014_pdb_to_uniprot_map_dict,
    "./fasta/CSAR2014_target_uniprot_pdb.fasta",
)

# Label corrected from "[CSAR201]".
print(f"[CSAR2014]: {len(CSAR2014_results)}")

[CSAR201]: 46


In [47]:
# Use the dictionary loaded in section 1; `CSAR2012_BS_dict` is never defined in this notebook.
CSAR2012_results = mapping_interaction(
    CSAR2012_dict,
    CSAR2012_pdb_to_uniprot_map_dict,
    "./fasta/CSAR2012_target_uniprot_pdb.fasta",
)

print(f"[CSAR2012]: {len(CSAR2012_results)}")

[CSAR2012]: 55


In [48]:
# Use the dictionary loaded in section 1; `CSARset1_BS_dict` is never defined in this notebook.
CSARset1_results = mapping_interaction(
    CSARset1_dict,
    CSARset1_pdb_to_uniprot_map_dict,
    "./fasta/CSARset1_target_uniprot_pdb.fasta",
)

print(f"[CSARset1]: {len(CSARset1_results)}")

[Error: 1] PDB id: 2cli is not in uniprot mapping dict
[Error: 2] PDB id: 2isw is not in uniprot mapping dict
[Error: 3] PDB id: 2p09 is not in uniprot mapping dict
[Error: 4] PDB id: 1q72 is not in uniprot mapping dict
[CSARset1]: 152


In [49]:
# Map the CSARset2 binding sites onto the target sequences and report how many entries were kept.
CSARset2_results = mapping_interaction(CSARset2_dict,
                                               CSARset2_pdb_to_uniprot_map_dict, 
                                               "./fasta/CSARset2_target_uniprot_pdb.fasta")

print(f"[CSARset2]: {len(CSARset2_results)}")

[Error: 1] PDB id: 1i7z is not in uniprot mapping dict
[Error: 2] PDB id: 1jgl is not in uniprot mapping dict
[Error: 3] PDB id: 1lnm is not in uniprot mapping dict
[Error: 4] PDB id: 1ws4 is not in uniprot mapping dict
[Error: 5] PDB id: 1y0l is not in uniprot mapping dict
[Error: 6] PDB id: 1zhy is not in uniprot mapping dict
[Error: 7] PDB id: 2buv is not in uniprot mapping dict
[Error: 8] PDB id: 1q0y is not in uniprot mapping dict
[Error: 9] PDB id: 1zhx is not in uniprot mapping dict
[Error: 10] PDB id: 2hj4 is not in uniprot mapping dict
[CSARset2]: 132


In [50]:
# Map the Astex binding sites onto the target sequences and report how many entries were kept.
Astex_results = mapping_interaction(Astex_dict,
                                               Astex_pdb_to_uniprot_map_dict, 
                                               "./fasta/Astex_target_uniprot_pdb.fasta")

print(f"[Astex]: {len(Astex_results)}")

[Error: 1] PDB id: 1mzc is not in uniprot mapping dict
[Astex]: 74


In [51]:
# Map the COACH420 binding sites onto the target sequences and report how many entries were kept.
COACH420_results = mapping_interaction(COACH420_dict,
                                               COACH420_pdb_to_uniprot_map_dict, 
                                               "./fasta/COACH420_target_uniprot_pdb.fasta")

print(f"[COACH420]: {len(COACH420_results)}")

[Error: 1] PDB id: 3hvl is not in uniprot mapping dict
[COACH420]: 322


In [52]:
# The result is stored as `HOLO4K_results`; the original print referenced the
# undefined name `HOLO4K_final_results`.
HOLO4K_results = mapping_interaction(
    HOLO4K_dict,
    HOLO4K_pdb_to_uniprot_map_dict,
    "./fasta/HOLO4K_target_uniprot_pdb.fasta",
)

print(f"[HOLO4K]: {len(HOLO4K_results)}")

[Error: 1] PDB id: 1brw is not in uniprot mapping dict
[Error: 2] PDB id: 1jf0 is not in uniprot mapping dict
[Error: 3] PDB id: 1lke is not in uniprot mapping dict
[Error: 4] PDB id: 1lnm is not in uniprot mapping dict
[Error: 5] PDB id: 1men is not in uniprot mapping dict
[Error: 6] PDB id: 1n0s is not in uniprot mapping dict
[Error: 7] PDB id: 1n3i is not in uniprot mapping dict
[Error: 8] PDB id: 1nsa is not in uniprot mapping dict
[Error: 9] PDB id: 1swg is not in uniprot mapping dict
[Error: 10] PDB id: 4aig is not in uniprot mapping dict
[HOLO4K]: 3899


In [53]:
def convert_df(dict_):
    """Flatten a per-entry result dictionary into a pandas DataFrame.

    Every key of ``dict_`` is split on "_": the first piece is stored in the
    ``PDB_IDs`` column and the second in the ``Lig_codes`` column. The other
    columns are copied unchanged from the record stored under that key.

    Parameters
    ----------
    dict_ : dict
        Maps keys containing at least one "_" to records that provide the
        fields 'PDB_chain', 'Uniprot_IDs', 'Uniprot_seqs', 'Uniprot_8A_BS',
        'Uniprot_4A_BS', 'PDB_seqs' and 'PDB_indexes'.

    Returns
    -------
    pandas.DataFrame
        One row per key, in the dictionary's iteration order, with columns
        PDB_IDs, Lig_codes, PDB_chains, Uniprot_IDs, Uniprot_Seqs, BS_8A,
        BS_4A, PDB_seqs, PDB_indexes.

    Raises
    ------
    IndexError
        If a key contains no "_".
    KeyError
        If a record lacks one of the fields listed above.
    """
    # (output column, record field) pairs, listed in output column order.
    copied_fields = (
        ("PDB_chains", "PDB_chain"),
        ("Uniprot_IDs", "Uniprot_IDs"),
        ("Uniprot_Seqs", "Uniprot_seqs"),
        ("BS_8A", "Uniprot_8A_BS"),
        ("BS_4A", "Uniprot_4A_BS"),
        ("PDB_seqs", "PDB_seqs"),
        ("PDB_indexes", "PDB_indexes"),
    )

    # One list per column; the two id columns come first.
    table = {"PDB_IDs": [], "Lig_codes": []}
    table.update((column, []) for column, _ in copied_fields)

    for entry_key, record in list(dict_.items()):
        id_parts = entry_key.split("_")
        pdb_id, ligand_code = id_parts[0], id_parts[1]

        table["PDB_IDs"].append(pdb_id)
        table["Lig_codes"].append(ligand_code)
        for column, field in copied_fields:
            table[column].append(record[field])

    return pd.DataFrame(table)

In [54]:
# Flatten the scPDB mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
scPDB_df = convert_df(scPDB_results)
scPDB_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,4nx6,FOL,A,P0ABQ4,MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHT...,"3,4,5,6,7,13,17,18,19,21,22,23,24,25,26,27,28,...","4,5,6,19,21,26,27,28,29,30,31,45,49,51,53,54,5...",MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHT...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
1,3tqx,PLP,"A,B",Q83F40,MQEILSQLNKEIEGLKKAGLYKSERIITSPQNAEIKVGEKEVLNFC...,"47,49,74,75,76,105,106,107,108,109,110,111,112...","107,108,109,112,133,135,178,182,207,209,210,23...",QEILSQLNKEIEGLKKAGLYKSERIITSPQNAEIKVGEKEVLNFCA...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,..."
2,4bek,XK0,A,P56817,MAQALPWLLLWMGAGVLPAHGTQHGIRLPLRSGLGGAPLGLRLPRE...,"71,72,73,74,90,91,92,93,94,95,96,97,129,130,13...",90929495131168175178288290291,RGSFVEMVDNLRGKSGQGYYVEMTVGSPPQTLNILVDTGSSNFAVG...,"57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,7..."
3,4dca,ADP,A,Q93ET9,MVNLDAEIYEHLNKQIKINELRYLSSGDDSDTFLCNEQYVVKVPKR...,"22,23,24,25,26,28,29,30,31,32,33,37,38,39,40,4...",2324313941858687196198208209,LDAEIYEHLNKQIKINELRYLSSGDDSDTFLCNEQYVVKVPKRDSV...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2..."
4,4pah,LNR,A,P00439,MSTAVLENPGLGRKLSDFGQETSYIEDNCNQNGAISLIFSLKEEVG...,"246,247,248,249,250,251,253,254,262,263,265,27...",247253284289324329,TVPWFPRTIQELDRFANQILSYGAELDADHPGFKDPVYRARRKQFA...,"117,118,119,120,121,122,123,124,125,126,127,12..."
...,...,...,...,...,...,...,...,...,...
16176,3w2f,FAD,A,P83686,STPAITLENPDIKYPLRLIDKEVVNHDTRRFRFALPSPEHILGLPV...,"26,27,28,30,46,47,48,49,50,51,55,60,61,62,63,6...","48,62,63,64,65,79,80,81,83,84,87,88,91,94,95,9...",TPAITLENPDIKYPLRLIDKEVVNHDTRRFRFALPSPEHILGLPVG...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,..."
16177,3af0,GDP,A,P9WPA7,MSRLSEPSPYVEFDRRQWRALRMSTPLALTEEELVGLRGLGEQIDL...,"36,37,38,39,40,41,94,95,96,97,98,99,100,101,10...",979899100101102103178179237241245246,EPSPYVEFDRRQWRALRMSTPLALTEEELVGLRGLGEQIDLLEVEE...,"6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22..."
16178,4dym,IYZ,A,Q04771,MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNED...,"213,214,215,216,218,220,221,222,223,231,232,23...","218,221,232,234,262,282,283,284,285,288,289,33...",QITLLECVGKGRYGEVWRGSWQGENVAVKIFSSRDEKSWFRETELY...,"207,208,209,210,211,212,213,214,215,216,217,21..."
16179,2i47,KGY,"A,B,C,D",P78536,MRQSLLFLTSVVPFVLAPRPPDDPGFGPHQRLEKLDSLLSDYDILS...,"311,312,313,314,315,317,342,343,344,345,346,34...","313,314,345,346,347,348,349,397,400,401,404,40...",PMKNTCKLLVVADHRFYRYMGRGEESTTTNYLIELIDRVDDIYRNT...,"220,221,222,223,224,225,226,227,228,229,230,23..."


In [55]:
# Flatten the PDBbind mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
PDBbind_df = convert_df(PDBbind_results)
PDBbind_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,3wka,S0G,A,P34913,MTLRAAVFDLDGVLALPAVFGVLGRTEEALALPRGLLNDAFQKGGP...,"265,266,267,268,333,334,335,358,379,380,382,38...",266382418495496497523524,TLRAAVFDLDGVLALPAVFGVLGRTEEALALPRGLLNDAFQKGGPE...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,..."
1,5kgx,7SK,A,P12497,MGARASVLSGGELDKWEKIRLRPGGKKQYKLKHIVWASRELERFAV...,"1311,1312,1313,1314,1315,1316,1317,1318,1319,1...",1314131513161317131913201324,CSPGIWQLDTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,"56,57,58,59,60,61,62,63,64,66,67,68,69,70,71,7..."
2,5aen,DP8,A,P09960,MPEIVDTCSLASPASVCRTKHLHLRCSVDFTRRTLTGTAALTVQSQ...,"7,134,135,136,137,138,139,200,265,266,267,268,...","136,137,267,269,270,311,314,367,369,374,375,37...",IVDTCSLASPASVCRTKHLHLRCSVDFTRRTLTGTAALTVQSQEDN...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20..."
3,1utp,PBN,A,P00760,MKTFIFLALLGAAVAFPVDDDDKIVGGYTCGANTVPYQVSLNSGYH...,"23,24,46,47,60,62,63,103,106,140,144,145,153,1...",62193194195196199213215216226,IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGI...,"16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,3..."
4,2yc3,MW5,A,P69834,MAMLQTNLGFITSPTFLCPKLKVKLNSYLWFSYRSQVQKLDFSKRV...,"81,83,155,156,157,158,160,161,174,175,176,177,...","156,176,202,203,237,238,239,244,258,260,261,26...",MEKSVSVILLAGGQGPKQYIPLLGQPIALYSFFTFSRMPEVKEIVV...,"75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,9..."
...,...,...,...,...,...,...,...,...,...
13473,5tr6,7KG,A,P43405,MASSGMADSANHLPFFFGNITREEAEDYLVQGGMSDGLYLLRQSRN...,"375,376,377,378,379,381,382,383,384,385,386,39...","376,378,384,399,447,448,449,450,451,453,454,49...",EVYLDRKLLTLEDKELGSGNFGTVKKGYYQMKKVVKTVAVKILPAL...,"362,363,364,365,366,367,368,369,370,371,372,37..."
13474,3fci,3FI,A,P13051,MIGQKTLYSFFSPSPARKRHAPSPEPAVQGTGVAGVPEESGDAAAI...,"150,151,152,153,154,155,156,160,161,162,163,16...","151,152,153,155,165,166,177,212,254,255,256,27...",MEFFGESWKKHLSGEFGKPYFIKLMGFVAEERKHYTVYPPPHQVFT...,"82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,9..."
13475,2q9n,LK5,A,P05364,MMRKSLCCALLLGISCSALATPVSEKQLAEVVANTITPLMKAQSVP...,"81,82,83,84,85,86,137,138,139,140,169,170,171,...",83138139169171223240336337338339,PVSEKQLAEVVANTITPLMKAQSVPGMAVAVIYQGKPHYYTFGKAD...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,..."
13476,3np7,Z15,A,P00489,MSRPLSDQEKRKQISVRGLAGVENVTELKKNFNRHLHFTLVKDRNV...,"13,16,93,96,100,105,120,121,123,124,490,491,49...",1696120124494495497544545548654655,QISVRGLAGVENVTELKKNFNRHLHFTLVKDRNVATPRDYYFALAH...,"12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,2..."


In [56]:
# Flatten the CASF2016 mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
CASF2016_df = convert_df(CASF2016_results)
CASF2016_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,1bcu,PRL,"L,H,I",P00734,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...,"364,405,458,500,505,507,508,527,550,560,561,56...",561562563564567587589590592593600,"CGLRPLFEKKSLED,IVEGSDAEIGMSPWQVMLFRKPQELLCGASL...","1,2,3,4,5,6,7,8,9,10,11,12,13,14;16,17,18,19,2..."
1,1bzc,TPI,A,P18031,MEMEKEFEQIDKSGSWAAIYQDIRHEASDFPCRVAKLPKNKNRNRY...,"35,43,44,45,46,47,48,83,84,85,109,110,114,115,...","45,46,47,48,119,180,181,214,215,216,217,218,21...",EMEKEFEQIDKSGSWAAIYQDIRHEASDFPCRVAKLPKNKNRNRYR...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,..."
2,1c5z,BEN,"A,B",P00749,MRALLARLLLCVLVVSDSKGSNELHQVPSNCDCLNGGTCVSNKYFS...,"178,179,208,221,223,224,271,274,314,318,319,32...","369,370,371,372,375,393,394,395,396,398,399,40...","LKFQCGQKT,IIGGEFTTIENQPWFAAIYRRHVTYVCGGSLMSPCW...","9,10,11,12,13,14,15,16,17;16,17,18,19,20,21,22..."
3,1e66,HUX,A,P04058,MNLLVTSSLGVLLHLVVLCQADDHSELLVNTKSGKVMGTRVPVLSS...,"89,92,95,98,99,100,101,102,103,104,105,134,136...","104,138,139,141,219,220,310,350,351,354,452,45...",SELLVNTKSGKVMGTRVPVLSSHISAFLGIPFAEPPVGNMRFRRPE...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2..."
4,1eby,BEB,"A,B",P03366,MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAV...,"507,509,522,523,524,525,526,527,528,529,530,53...","507,522,524,526,527,528,529,531,546,547,548,54...",PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
...,...,...,...,...,...,...,...,...,...
263,5a7b,KMN,"A,B",P04637,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"106,107,108,109,143,144,145,146,147,148,149,15...",144146147148149150151152220221222229,SVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKLFCQLAKTC...,"96,97,98,99,100,101,102,103,104,105,106,107,10..."
264,5aba,UL7,"A,B",P04637,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"106,108,109,143,144,145,146,147,148,149,150,15...",144146147148149150220221222227229,SVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKLFCQLAKTC...,"96,97,98,99,100,101,102,103,104,105,106,107,10..."
265,5c2h,4XU,"A,B",Q9Y233,MRIEERKSQHLTGLTDEKVKAYLSLHPQVLDEFVSESVSAETVEKW...,675,,QFTLPVRLCKEIELFHFDIGPFENMWPGIFVYMVHRSCGTSCFELE...,"451,452,453,454,455,456,457,458,459,460,461,46..."
266,5dwr,5H7,A,P11309,MLLSKINSLAHLRAAPCNDLHATKLAPGKEKEPLESQYQVGPLLGS...,"41,42,43,44,45,46,47,48,49,50,51,52,53,54,62,6...","43,44,48,51,64,66,103,119,120,121,122,125,127,...",PLESQYQVGPLLGSGGFGSVYSGIRVSDNLPVAIKHVEKDRISDWG...,"33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,4..."


In [57]:
# Flatten the CASF2013 mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
CASF2013_df = convert_df(CASF2013_results)
CASF2013_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,10gs,VWW,"A,B",P09211,MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,"7,8,9,10,11,12,13,14,17,33,34,35,36,38,39,44,4...",781013384450515253646598108205,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,..."
1,1bcu,PRL,"L,H,I",P00734,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...,"364,405,458,500,505,507,508,527,550,560,561,56...",561562563564567587589590592593600,"CGLRPLFEKKSLED,IVEGSDAEIGMSPWQVMLFRKPQELLCGASL...","1,2,3,4,5,6,7,8,9,10,11,12,13,14;16,17,18,19,2..."
2,1e66,HUX,A,P04058,MNLLVTSSLGVLLHLVVLCQADDHSELLVNTKSGKVMGTRVPVLSS...,"89,92,95,98,99,100,101,102,103,104,105,134,136...","104,138,139,141,219,220,310,350,351,354,452,45...",SELLVNTKSGKVMGTRVPVLSSHISAFLGIPFAEPPVGNMRFRRPE...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2..."
3,1f8b,DAN,A,P03472,MNPNQKILCTSATALVIGTIAVLIGITNLGLNIGLHLKPSCNCSHS...,"118,119,134,136,149,150,151,152,156,179,180,18...","118,119,151,152,179,223,225,247,277,278,293,29...",RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,"82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,9..."
4,1f8c,4AM,A,P03472,MNPNQKILCTSATALVIGTIAVLIGITNLGLNIGLHLKPSCNCSHS...,"118,119,134,136,149,150,151,152,156,179,180,18...","118,119,151,152,179,223,225,247,277,278,293,29...",RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,"82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,9..."
...,...,...,...,...,...,...,...,...,...
165,4djv,0KM,"A,B",P56817,MAQALPWLLLWMGAGVLPAHGTQHGIRLPLRSGLGGAPLGLRLPRE...,"69,70,71,72,73,74,90,91,92,93,94,95,96,97,99,1...","70,71,72,73,90,92,94,95,131,136,168,170,175,17...",GSFVEMVDNLRGKSGQGYYVEMTVGSPPQTLNILVDTGSSNFAVGA...,"58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,7..."
166,4g8m,G8M,"A,B",P19491,MQKIMHISVLLSPVLWGLIFGVSSNSIQIGGLFPRGADQEYSAFRV...,"420,422,425,469,470,471,472,483,484,497,498,49...",470498499500505670673674675725728752,ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17..."
167,4gid,0GH,"A,B,C,D",P56817,MAQALPWLLLWMGAGVLPAHGTQHGIRLPLRSGLGGAPLGLRLPRE...,"68,69,70,71,72,73,74,90,91,92,93,94,95,96,97,1...","71,72,73,90,92,94,95,129,130,131,132,133,168,1...",FVEMVDNLRGKSGQGYYVEMTVGSPPQTLNILVDTGSSNFAVGAAP...,"47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,6..."
168,4gqq,0XR,A,P04746,MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERY...,"248,249,250,251,252,253,254,256,257,258,259,26...",250258259269271299300301302,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,..."


In [58]:
# Flatten the CSAR2014 mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
CSAR2014_df = convert_df(CSAR2014_results)
CSAR2014_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,4ypw,4FD,A,P43912,MWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFDKH...,"59,60,61,85,86,87,88,89,90,91,92,93,110,111,11...","85,86,87,88,112,114,115,116,131,132,133,135,13...",GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16..."
1,4ypx,4FG,A,P43912,MWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFDKH...,"85,86,87,88,89,90,91,92,93,110,111,112,113,114...",868788131132133135136137139140143,RGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFT...,"-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15..."
2,4ypy,4F9,A,P43912,MWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFDKH...,"85,86,87,88,89,90,91,92,93,110,111,112,113,114...",868788131132133135137139140143,SHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFD...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17..."
3,4ypz,4FL,A,P43912,MWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFDKH...,"85,86,87,88,89,90,91,92,93,110,111,112,113,114...",868788112130131132133137139140143,SHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFD...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17..."
4,4yq0,4FM,A,P43912,MWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFDKH...,"59,60,61,66,85,86,87,88,89,90,91,92,93,110,111...","85,86,87,88,112,113,114,116,117,130,131,132,13...",GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16..."
5,4yq1,4FN,A,P43912,MWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFDKH...,"59,60,61,84,85,86,87,88,89,90,91,92,93,110,111...","86,87,88,112,113,114,115,116,130,131,132,133,1...",GLVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNP...,"-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11..."
6,4yq2,EFY,A,P43912,MWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFDKH...,"59,60,61,84,85,86,87,88,89,90,91,92,93,110,111...","86,87,88,112,113,114,116,130,131,132,133,134,1...",RGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFT...,"-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15..."
7,4yq3,4G1,A,P43912,MWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFDKH...,"59,61,85,86,87,88,89,90,91,92,93,110,111,112,1...","85,86,87,88,112,113,114,116,130,131,132,133,13...",GLVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNP...,"-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11..."
8,4yq4,4G3,A,P43912,MWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFDKH...,"59,60,61,85,86,87,88,89,90,91,92,93,110,111,11...","85,86,87,88,112,114,115,116,131,132,133,135,13...",GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16..."
9,4yq5,4G0,A,P43912,MWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFDKH...,"59,60,61,85,86,87,88,89,90,91,92,93,110,111,11...","85,86,87,88,112,115,116,131,132,133,135,136,13...",LVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPR...,"-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12..."


In [59]:
# Flatten the CSAR2012 mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
CSAR2012_df = convert_df(CSAR2012_results)
CSAR2012_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,4fud,6UP,A,P00749,MRALLARLLLCVLVVSDSKGSNELHQVPSNCDCLNGGTCVSNKYFS...,"178,179,208,223,269,271,274,314,319,322,327,33...",369370372375393395396398399406,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
1,4fue,7UP,A,P00749,MRALLARLLLCVLVVSDSKGSNELHQVPSNCDCLNGGTCVSNKYFS...,"178,179,197,207,208,221,222,223,224,225,226,22...","223,226,227,271,369,370,371,372,375,393,394,39...",IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
2,4fu7,1UP,A,P00749,MRALLARLLLCVLVVSDSKGSNELHQVPSNCDCLNGGTCVSNKYFS...,"178,179,208,223,224,268,269,271,274,314,319,32...","369,370,371,372,375,393,394,395,396,397,398,39...",IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
3,4fu8,2UP,A,P00749,MRALLARLLLCVLVVSDSKGSNELHQVPSNCDCLNGGTCVSNKYFS...,"178,179,208,223,269,271,274,314,319,322,327,33...",369370371372375393394395396398399406,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
4,4fu9,675,A,P00749,MRALLARLLLCVLVVSDSKGSNELHQVPSNCDCLNGGTCVSNKYFS...,"178,179,207,208,221,222,223,224,225,226,227,22...","223,227,271,369,370,371,372,375,393,394,395,39...",IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
5,4fub,4UP,A,P00749,MRALLARLLLCVLVVSDSKGSNELHQVPSNCDCLNGGTCVSNKYFS...,"179,208,222,223,224,225,226,227,264,266,267,26...",223271369370372375393395396398399406,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
6,4fuc,239,A,P00749,MRALLARLLLCVLVVSDSKGSNELHQVPSNCDCLNGGTCVSNKYFS...,"178,179,207,208,221,222,223,224,225,226,227,22...","223,226,227,264,271,369,370,371,372,375,393,39...",IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
7,4fsm,HK1,A,O14757,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...,"12,13,14,15,16,21,22,23,24,33,34,35,36,37,54,5...",14223567838485868788899093136147,VPFVEDWDLVQTLGEGEVQLAVNRVTEEAVAVKIVNIKKEICINKM...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,21,22,23..."
8,4fsw,HK6,A,O14757,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...,"12,13,14,15,16,17,21,22,23,24,34,35,36,37,38,5...",142235848586878990136,VPFVEDWDLVQTLGEGGEVQLAVNRVTEEAVAVKIVNIKKEICINK...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,21,22..."
9,4ft5,H2K,A,O14757,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...,"12,13,14,20,21,22,23,24,33,34,35,36,37,54,58,6...","14,20,22,35,37,54,67,83,84,85,86,87,89,90,133,...",VPFVEDWDLVQTLGEVQLAVNRVTEEAVAVKIVDMNIKKEICINKM...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,22,23,24,25..."


In [60]:
# Flatten the CSARset1 mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
CSARset1_df = convert_df(CSARset1_results)
CSARset1_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,2are,MAN,"A,B",Q8GSD2,MLLNKAYSQDSLSFGFPTFPSDQKNLIFQGDAQIKNNAVQLTKTDS...,"51,52,88,90,91,92,93,94,109,110,111,112,113,11...",9293112113139144145227228229,QDSLSFGFPTFPSDQKNLIFQGDAQIKNNAVQLTKTDSNGNPVAST...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
1,2oag,DLI,"A,B,C,D",P27487,MKTPWKVLLGLLGAAALVTIITVPVVLLNKGTDDATADSRKTYTLT...,"124,125,200,201,202,203,204,205,206,207,208,20...","124,204,205,206,208,356,357,546,629,630,655,66...",SRKTYTLTDYLKNTYRLKLYSLRWISDHEYLYKQENNILVFNAEYG...,"39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,5..."
2,2jbj,G88,A,Q04609,MWNLLHETDSAVATARRPRWLCAGALVLAGGFFLLGFLFGWFIKSS...,"163,204,208,209,210,253,254,255,256,257,258,25...","209,256,376,386,423,424,426,427,452,517,518,55...",NMKAFLDELKAENIKKFLYNFTQIPHLAGTEQNFQLAKQIQSQWKE...,"57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,7..."
3,2pwd,NOJ,"A,B",Q2PS28,MLMKRLFAASLMLAFSSVSSVRAEEAVKPGAPWWKSAVFYQVYPRS...,"40,42,76,78,79,86,87,88,89,90,91,125,126,127,1...",8790130171190224226280352353440444,KPGAPWWKSAVFYQVYPRSFKDTNGDGIGDFKGLTEKLDYLKGLGI...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
4,2pwg,CTS,"A,B",Q2PS28,MLMKRLFAASLMLAFSSVSSVRAEEAVKPGAPWWKSAVFYQVYPRS...,"42,76,78,79,86,87,88,89,90,91,125,126,127,128,...",8790130171190194226227280282352353440,PGAPWWKSAVFYQVYPRSFKDTNGDGIGDFKGLTEKLDYLKGLGID...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,..."
...,...,...,...,...,...,...,...,...,...
147,3ekr,PY9,"A,B",P07900,MPEETQTQDQPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLR...,"43,46,47,48,49,50,51,52,53,54,55,56,57,58,77,9...",475051545792959697105183185,QPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDA...,"10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,2..."
148,3ene,NPZ,A,P48736,MELENYKQPVVLREDNCRRRRRMKPRSAAASLSSMELIPIEFVLPT...,"801,803,805,806,807,809,811,828,829,830,831,83...","811,830,832,837,840,866,878,879,880,881,952,96...",SEESQAFQRQLTALIGYDVTDVSNVHDDELEFTRRGLVTPRMAEVA...,"144,145,146,147,148,149,150,151,152,153,154,15..."
149,3eqr,T74,"A,B",Q07912,MQPEEGTGWLLELLSEVQLQQYFLRLRDDLNVTRLSHFEYVKNEDL...,"129,130,131,132,133,134,135,136,137,138,139,14...","131,133,139,155,156,157,176,180,189,202,204,20...",LTCLIGEKDLRLLEKLGDGSFGVVRRGEWDAPSGKTVSVAVKCLKP...,"117,118,119,120,121,122,123,124,125,126,127,12..."
150,3f8c,HT1,A,A2RI36,MAEIPKEMLRAQTNVILLNVLKQGDNYVYGIIKQVKEASNGEMELN...,"2,4,7,9,10,11,12,13,14,15,17,18,55,56,59,61,80...",7149195,EIPKEMLRAQTNVILLNVLKQGDNYVYGIIKQVKEASNGEMELNEA...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20..."


In [61]:
# Flatten the CSARset2 mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
CSARset2_df = convert_df(CSARset2_results)
CSARset2_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,1a8i,GLS,A,P00489,MSRPLSDQEKRKQISVRGLAGVENVTELKKNFNRHLHFTLVKDRNV...,"84,88,133,134,135,136,137,138,139,140,143,282,...","135,136,139,284,377,378,455,484,573,672,673,67...",QEKRKQISVRGLAGVENVTELKKNFNRHLHFTLVKDRNVATPRDYY...,"7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,2..."
1,1a99,PUT,"A,B,C,D",P31133,MTALNKKWLSGLVAGALMAVSVGTLAAEQKTLHIYNWSDYIAPDTV...,"35,36,37,38,39,40,61,62,63,64,82,83,84,132,133...",36373839243246275277313,QKTLHIYNWSDYIAPDTVANFEKETGIKVVYDVFDSNEVLEGKLMA...,"29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,4..."
2,1ax0,A2G,A,P16404,MATYKLCSVLALSLTLFLLILNKVNSVETISFSFSEFEPGNDNLTL...,"70,71,110,111,112,113,114,115,129,130,131,132,...",114131132156158160242243244247,VETISFSFSEFEPGNDNLTLQGAALITQSGVLQLTKINQNGMPAWD...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
3,1b6l,PI4,"A,B",P03369,MGARASVLSGGELDKWEKIRLRPGGKKKYKLKHIVWASRELERFAV...,"497,512,513,514,515,516,517,518,519,520,521,53...","497,512,514,516,517,518,519,536,537,538,539,57...",PQITLWKRPLVTIRIGGQLKEALLDTGADDTVIEEMNLPGKWKPKM...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
4,1b6m,PI6,"A,B",P03369,MGARASVLSGGELDKWEKIRLRPGGKKKYKLKHIVWASRELERFAV...,"497,512,513,514,515,516,517,518,519,520,521,53...","497,512,514,516,517,518,519,521,537,538,539,57...",PQITLWKRPLVTIRIGGQLKEALLDTGADDTVIEEMNLPGKWKPKM...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
...,...,...,...,...,...,...,...,...,...
127,2qrk,AMP,A,P38998,MAAVTLHLRAETKPLEARAALTPTTVKKLIAKGFKIYVEDSPQSTF...,"129,133,134,135,136,197,198,199,200,201,202,20...",198199201202203204226227230249251,AVTLHLRAETKPLEARAALTPTTVKKLIAKGFKIYVEDSPQSTFNI...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20..."
128,2nn1,M28,"A,B",P00915,MASPDWGYDDKNGPEQWSKLYPIANGNNQSPVDIKTSETKHDTSLK...,"5,7,29,30,60,62,64,65,67,69,91,92,93,94,95,96,...",67929496119143197198199200209,PDWGYDDKNGPEQWSKLYPIANGNNQSPVDIKTSETKHDTSLKPIS...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20..."
129,2pou,I7A,A,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,"4,6,28,29,61,63,64,65,66,90,91,92,93,94,95,105...","63,66,91,93,95,118,120,129,139,141,196,197,198...",HHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLS...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20..."
130,3cd0,6HI,"A,B,C,D",P04035,MLSRLFRMHGLFVASHPWEVIVGTVTLTICMMSMNMFTGNNKICGW...,"527,528,556,558,559,560,561,562,563,564,565,56...","558,559,560,561,563,564,567,589,656,660,682,68...",EPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETLIE...,"441,442,443,444,445,446,447,448,449,450,451,45..."


In [62]:
# Flatten the Astex mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
Astex_df = convert_df(Astex_results)
Astex_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,1gm8,SOX,"A,B",P06875,MKNRNRMIVNCVTASLMYYWSLPALAEQSSSEIKIVRDEYGMPHIY...,"166,167,170,171,172,173,174,175,289,290,308,30...",167171174289311312355356357359465,SSSEIKIVRDEYGMPHIYANDTWHLFYGYGYVVAQDRLFQMEMARR...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20..."
1,1gpk,HUP,A,P04058,MNLLVTSSLGVLLHLVVLCQADDHSELLVNTKSGKVMGTRVPVLSS...,"89,90,92,100,101,103,104,105,106,134,135,136,1...","104,136,137,138,139,141,142,143,144,147,150,21...",SELLVNTKSGKVMGTRVPVLSSHISAFLGIPFAEPPVGNMRFRRPE...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2..."
2,1hnn,SKF,"A,B",P11086,MSGADRSPNAGAAPDSAPGQAAVASAYQRFEPRAYLRNNYAPPRGD...,"26,29,34,35,37,38,39,40,42,43,44,46,51,52,53,5...",343839435256181218221257266268271,AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,"22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,3..."
3,1hp0,AD3,"A,B",Q9GPQ4,MAKNVVLDHDGNLDDFVAMVLLASNTEKVRLIGALCTDADCFVENG...,"7,8,9,10,11,12,13,14,15,39,78,79,81,82,83,85,1...","11,13,14,39,78,82,136,163,172,183,184,185,256,...",SAKNVVLDHDGNLDDFVAMVLLASNTEKVRLIGALCTDADCFVENG...,"0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
4,1hq2,PH2,A,P26281,MTVAYIAIGSNLASPLEQVNAALKALGDIPESHILTVSSFYRTPPL...,"6,7,8,9,10,40,41,42,43,44,45,46,47,48,50,51,52...",4243444553558995123,TVAYIAIGSNLASPLEQVNAALKALGDIPESHILTVSSFYRTPPLG...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
...,...,...,...,...,...,...,...,...,...
69,1ywr,LI9,A,P47811,MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKT...,"27,28,29,30,31,32,33,34,35,36,37,38,39,48,49,5...","29,34,50,52,103,104,105,106,107,108,110,111,15...",RPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGHRV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,..."
70,1z95,198,A,P10275,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,"685,687,700,701,702,703,704,705,706,707,708,70...","704,705,707,708,711,738,742,745,746,749,752,76...",IFLNVLEAIEPGVVCAGHDNNQPDSFAALLSSLNELGERQLVHVVK...,"672,673,674,675,676,677,678,679,680,681,682,68..."
71,2bm2,PM2,"A,B,C,D",P20231,MLNLLLLALPVLASRAYAAPAPGQALQRVGIVGGQEAPRSKWPWQV...,"30,31,73,76,112,113,114,115,116,117,156,160,16...","113,114,116,217,218,219,220,223,241,243,244,24...",IVGGQEAPRSKWPWQVSLRVHGWMHFCGGSLIHPQWVLTAAHCVGL...,"16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,3..."
72,2br1,PFP,A,O14757,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...,"12,13,14,15,16,17,18,19,20,21,22,23,24,34,35,3...",141516223583848586878990136146,AVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRCPE...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,..."


In [63]:
# Flatten the COACH420 mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
COACH420_df = convert_df(COACH420_results)
COACH420_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,1a7x,FKA,"A,B",P62942,MGVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKP...,"24,26,27,28,29,34,35,36,37,38,39,42,44,45,46,4...","26,36,37,42,46,53,54,55,56,59,78,79,81,82,83,8...",GVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
1,1afk,PAP,"A,B",P61823,MALKSLVLLSLLVLVLLLVRVQPSLGKETAAAKFERQHMDSSTSAA...,"27,28,29,30,31,32,33,34,35,36,37,60,64,66,68,6...",293236376690929496134136143144145,KETAAAKFERQHMDSSTSAASSSNYCNQMMKSRNLTKDRCKPVNTF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
2,1atl,0QI,"A,B",P15167,MIEVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPRKVTALPKGA...,"261,293,294,295,296,297,298,299,300,301,315,31...","296,297,298,299,332,333,336,342,355,357,358,35...",LPQRYIELVVVADHRVFMKYNSDLNTIRTRVHEIVNFINGFYRSLN...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20..."
3,1b8u,NAD,A,Q9ZF99,MAKTPMRVAVTGAAGQICYSLLFRIANGDMLGKDQPVILQLLEIPN...,"9,10,11,12,13,14,15,16,17,18,40,41,42,43,44,45...","10,11,14,15,16,41,42,43,89,90,91,110,114,131,1...",KTPMRVAVTGAAGQICYSLLFRIANGDMLGKDQPVILQLLEIPNEK...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20..."
4,1b8u,OAA,A,Q9ZF99,MAKTPMRVAVTGAAGQICYSLLFRIANGDMLGKDQPVILQLLEIPN...,"133,157,160,161,164,188,189,190,191,192,193,22...",160164189190191226227228229240,KTPMRVAVTGAAGQICYSLLFRIANGDMLGKDQPVILQLLEIPNEK...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20..."
...,...,...,...,...,...,...,...,...,...
317,7dfr,FOL,A,P0ABQ4,MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHT...,"3,4,5,6,7,13,19,21,23,24,25,26,27,28,29,30,31,...",456192627293031454953569399112,MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLDKPVIMGRHT...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
318,7dfr,NAP,A,P0ABQ4,MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHT...,"4,5,6,7,8,11,12,13,14,15,16,17,18,19,20,21,26,...","5,6,13,14,15,16,17,18,19,21,42,43,44,45,48,61,...",MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLDKPVIMGRHT...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
319,7est,0Z2,E,P00772,MLRLLVVASLVLYGHSTQDFPETNARVVGGTEAQRNSWPSQISLQY...,"26,54,55,56,68,70,71,73,108,110,111,112,113,11...",70113192209210212213231232233234235,VVGGTEAQRNSWPSQISLQYRSWAHTCGGTLIRQNWVMTAAHCVDR...,"16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,3..."
320,830c,RS1,"A,B",P45452,MHPGVLAAFLFLSWTHCRALPLPSGGDEDDLSEEDLQFAERYLRSY...,"106,175,176,177,178,181,182,183,184,185,186,18...","182,183,184,185,186,187,217,218,221,222,225,23...",YNVFPRTLKWSKMNLTYRIVNYTPDMTHSEVEKAFKKAFKVWSDVT...,"104,105,106,107,108,109,110,111,112,113,114,11..."


In [64]:
# Flatten the HOLO4K mapping results into a DataFrame via convert_df; the bare
# name on the last line makes the notebook render the table as the cell output.
HOLO4K_df = convert_df(HOLO4K_results)
HOLO4K_df

Unnamed: 0,PDB_IDs,Lig_codes,PDB_chains,Uniprot_IDs,Uniprot_Seqs,BS_8A,BS_4A,PDB_seqs,PDB_indexes
0,121p,GCP,A,P01112,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,"7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,27,2...","10,11,12,13,14,15,16,17,27,28,29,33,34,57,59,1...",MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
1,12as,AMP,"A,B",P00963,MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...,"45,47,48,51,70,71,73,95,97,99,102,103,104,105,...","99,102,108,109,110,113,115,247,248,249,250,293...",AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLS...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2..."
2,13pk,3PG,"A,B,C,D",P07378,MTLNEKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPT...,"21,22,23,24,25,26,27,38,60,61,62,63,64,65,134,...",232538616364134167168171218397,EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,..."
3,13pk,ADP,"A,B,C,D",P07378,MTLNEKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPT...,"25,26,27,28,38,64,214,215,216,217,218,219,222,...","216,217,218,222,240,241,244,293,313,314,337,33...",EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,..."
4,16pk,BIS,A,P07378,MTLNEKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPT...,"23,25,38,61,134,166,167,168,170,214,215,216,21...","216,217,222,240,241,244,258,293,313,314,338,33...",EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,..."
...,...,...,...,...,...,...,...,...,...
3894,9gss,GTX,"A,B",P09211,MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...,"7,8,9,10,11,12,13,14,17,33,34,35,36,38,39,44,4...",781335384450515253646598108205,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,..."
3895,9ldb,NAD,"A,B",P00339,MATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELAD...,"24,25,26,27,28,29,30,31,32,33,50,51,52,53,54,5...","26,28,29,30,51,52,53,94,95,96,97,98,112,115,11...",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,2..."
3896,9ldb,OXM,"A,B",P00339,MATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELAD...,"30,63,66,98,99,103,105,108,135,136,137,138,139...",99105137168192237247,ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,2..."
3897,9ldt,NAD,"A,B",P00339,MATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELAD...,"24,25,26,27,28,29,30,31,32,33,50,51,52,53,54,5...","26,28,29,30,51,52,53,94,95,96,97,98,112,115,11...",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,2..."


### 4. Save

In [65]:
# Each step-4 table paired with the TSV file it is written to, in write order.
step4_outputs = [
    (scPDB_df, "./preprocessed_bs_data/step4_scPDB_data.tsv"),
    (PDBbind_df, "./preprocessed_bs_data/step4_PDBbind_data.tsv"),
    (CASF2016_df, "./preprocessed_bs_data/step4_CASF2016_data.tsv"),
    (CASF2013_df, "./preprocessed_bs_data/step4_CASF2013_data.tsv"),
    (CSAR2014_df, "./preprocessed_bs_data/step4_CSAR2014_data.tsv"),
    (CSAR2012_df, "./preprocessed_bs_data/step4_CSAR2012_data.tsv"),
    (CSARset1_df, "./preprocessed_bs_data/step4_CSARset1_data.tsv"),
    (CSARset2_df, "./preprocessed_bs_data/step4_CSARset2_data.tsv"),
    (Astex_df, "./preprocessed_bs_data/step4_Astex_data.tsv"),
    (COACH420_df, "./preprocessed_bs_data/step4_COACH420_data.tsv"),
    (HOLO4K_df, "./preprocessed_bs_data/step4_HOLO4K_data.tsv"),
]

# Write every table tab-separated and without the row index column.
for frame, tsv_path in step4_outputs:
    frame.to_csv(tsv_path, sep="\t", index=False)