# KIBA dataset

##  1 Read raw data from the DeepDTA

> The raw data can be downloaded from [DeepDTA Github Repository](https://github.com/hkmztrk/DeepDTA/tree/master/data).

In [None]:
import pandas as pd  # pandas for table-like data

In [None]:
kiba_data = pd.read_table("../../../data/dta-datasets/KIBA/DTA_Raw_Data/KIBA/kiba_binding_affinity_v2.txt", sep = "\t", header=None)

In [None]:
kiba_data.shape  # all 2111 drugs and 229 proteins make the matrix

In [None]:
kiba_data

In [None]:
# Column labels 0..228 of the raw table; the trailing column (all NaN) is left out.
select_col = list(range(229))

In [None]:
kiba_data[select_col].shape

In [None]:
kiba_data[select_col].head()

In [None]:
kiba_data[select_col].describe()  # a lookup for all the data in the dataset

In [None]:
# Write the selected affinity columns as a bare matrix: no header row and no index
# column (the file is read back later in this notebook with header=None).
# Boolean flags are used here to match the index=False form used for kiba_datasets.csv.
kiba_data[select_col].to_csv(
    '../../../data/dta-datasets/KIBA/kiba_affinities.csv',
    header=False,
    index=False,
)

## 2 Add protein ID and ligand ID

Use the DeepDTA methods to load the data.

### 2.1 Ligand IDs and SMILES

Load the raw **2111** ligands from DeepDTA.

In [None]:
import json
# Parse the ligand file as JSON. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open.
with open("../../../data/dta-datasets/KIBA/DTA_Raw_Data/KIBA/ligands_can.txt") as ligand_file:
    ligands = json.load(ligand_file)

In [None]:
ligands

In [None]:
ligands['CHEMBL1087421']

In [None]:
ligands.keys()  # CHEMBL ID as the keys in the dict

In [None]:
import pandas as pd
ligands_df = pd.DataFrame(ligands, index=[0])

In [None]:
ligands_df.T.head()

Then store the raw ligands dataframe into a csv file.

In [None]:
ligands_df.T.to_csv('../../../data/dta-datasets/KIBA/kiba_ligands.csv', index_label=['CHEMBLID'], header=['SMILES'])

### 2.2 Protein ID and AA(Amino Acid) Sequences

Load the raw **229** proteins from DeepDTA.

In [None]:
import json
# Parse the protein file as JSON. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open.
with open("../../../data/dta-datasets/KIBA/DTA_Raw_Data/KIBA/proteins.txt") as protein_file:
    proteins = json.load(protein_file)

In [None]:
import pandas as pd
proteins_df = pd.DataFrame(proteins, index=[0])

In [None]:
proteins_df.T.head()

Then store the raw proteins dataframe into a csv file.

In [None]:
proteins_df.T.to_csv('../../../data/dta-datasets/KIBA/kiba_proteins.csv', index_label=['Gene'], header=['Sequence'])

## 3 Construct the DTA pair data

Then we need to build a DataFrame with above files.   
*Drug_ID    Drug    Target_ID   Target  Y* # 5 col 

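Now read the three files above to get the drugs, the proteins and the affinity values. The row and column positions of the affinity matrix must stay aligned with the drug and protein tables, so the pairs are generated by walking the affinity matrix row by row (one drug per row) and column by column (one target per column) to build the new DataFrame.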

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Ligand table written earlier in this notebook (index label CHEMBLID, value column SMILES).
drugs = pd.read_csv("../../../data/dta-datasets/KIBA/kiba_ligands.csv", sep=",")

In [None]:
drugs

In [None]:
drugs.loc[0]["CHEMBLID"]

In [None]:
drugs.loc[0]["SMILES"]

In [None]:
# Protein table written earlier in this notebook (index label Gene, value column Sequence).
targets = pd.read_csv("../../../data/dta-datasets/KIBA/kiba_proteins.csv", sep=",")

In [None]:
targets

In [None]:
targets.loc[0]["Gene"]

In [None]:
# Affinity matrix written earlier in this notebook without header or index,
# so the columns get integer labels on load.
affinities = pd.read_csv(
    "../../../data/dta-datasets/KIBA/kiba_affinities.csv",
    sep=",",
    header=None,
)

In [None]:
affinities

In [None]:
affinities.shape

In [None]:
affinities.loc[0][0] # use the index to get the item affinity

### Loop for new DataFrame

In [None]:
kiba_dataset = pd.DataFrame(columns=('Drug_ID', 'Drug', 'Target_ID', 'Target', 'Y'))  # create a new dataframe for latter append each row into

In [None]:
kiba_dataset

In [None]:
# Flatten the affinity matrix into one row per (drug, target) pair.
#
# Row order is the same as a nested "for drug row / for target column" walk:
# every drug is repeated once per target, the target list is cycled once per
# drug, and the matrix is read out row by row.
#
# This replaces one DataFrame.append call per matrix cell: that method no
# longer exists in pandas >= 2.0, and each call copied the whole table built so far.
n_drugs, n_targets = affinities.shape
if len(drugs) < n_drugs or len(targets) < n_targets:
    raise ValueError(
        f"affinity matrix is {n_drugs}x{n_targets} but only "
        f"{len(drugs)} drugs and {len(targets)} targets were loaded"
    )

# Positional slices: row i of `drugs` / column j of `targets` belongs to
# affinities[i, j] (all three tables are freshly read, so labels equal positions).
drug_ids = drugs["CHEMBLID"].to_numpy()[:n_drugs]
drug_smiles = drugs["SMILES"].to_numpy()[:n_drugs]
target_ids = targets["Gene"].to_numpy()[:n_targets]
target_seqs = targets["Sequence"].to_numpy()[:n_targets]

kiba_dataset = pd.DataFrame({
    'Drug_ID': np.repeat(drug_ids, n_targets),
    'Drug': np.repeat(drug_smiles, n_targets),
    'Target_ID': np.tile(target_ids, n_drugs),
    'Target': np.tile(target_seqs, n_drugs),
    'Y': affinities.to_numpy().ravel(),  # default C order = row-major read-out
})

In [None]:
kiba_dataset

In [None]:
kiba_dataset.shape

In [None]:
kiba_dataset.dropna(how='any', inplace=True) 

In [None]:
kiba_dataset

In [None]:
kiba_dataset.to_csv('../../../data/dta-datasets/KIBA/kiba_datasets.csv', index=False)

## 4 Visualize the KIBA dataset distribution

Define the length-plot function for repeated use.

In [None]:
import pandas as pd
kiba_df = pd.read_csv('../../../data/dta-datasets/KIBA/kiba_datasets.csv')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def str2length(sequences):
    """
    Return the lengths of the distinct items in ``sequences``.

    Duplicates are dropped, keeping first-occurrence order so the result is
    the same on every run. The minimum and maximum length are printed when
    there is at least one item.

    sequences: iterable of hashable, sized items (e.g. list/pd.Series of strings)
    returns: list of int, one length per distinct item; [] for empty input
    """
    # dict.fromkeys de-duplicates like set() but preserves insertion order.
    length_list = [len(sequence) for sequence in dict.fromkeys(sequences)]
    if length_list:  # min()/max() would raise on an empty list
        print("Min Len:", min(length_list))
        print("Max Len:", max(length_list))
    return length_list

def data_length_plot(
    length_list, dataset, plot_name, binwidth, xlabel, ylabel, x_start, x_end, y_start, y_end, color,
    output_dir="../../../result/fig_output/datasets/KIBA/",
):
    """
    Draw a histogram of ``length_list`` and save it as a PNG.

    length_list: list/pd.Series of the values to be binned
    dataset: text used as the plot title
    plot_name: prefix of the output file name ("<plot_name>Length.png")
    binwidth: bin width passed to sns.displot
    xlabel, ylabel: axis labels
    x_start, x_end: x-axis limits
    y_start, y_end: y-axis limits
    color: color passed to sns.displot
    output_dir: directory the PNG is written to; created if it does not exist
    """
    import os  # local import: this cell may run before any cell that imports os

    sns.displot(length_list, binwidth=binwidth, color=color)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xlim(x_start, x_end)
    plt.ylim(y_start, y_end)
    plt.title(dataset)
    # Show the top and right spines as well so the plot has a full box frame.
    plt.gca().spines['top'].set_visible(True)
    plt.gca().spines['right'].set_visible(True)
    plt.gcf().set_size_inches(4, 5)
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, plot_name + "Length.png"), bbox_inches='tight')

In [None]:
data_length_plot(
    length_list=str2length(kiba_df["Target"]), dataset="KIBA", plot_name="KIBATarget", binwidth=100, 
    xlabel="Length of target sequences", ylabel="Number of targets", 
    x_start=0, x_end=3000, y_start=0, y_end=50, color="limegreen"
    )

In [None]:
data_length_plot(
    length_list=str2length(kiba_df["Drug"]), dataset="KIBA", plot_name="KIBADrug", binwidth=5, 
    xlabel="Length of SMILES", ylabel="Number of drugs", 
    x_start=0, x_end=600, y_start=0, y_end=350, color="slateblue")

In [None]:
print(min(kiba_df["Y"]))
print(max(kiba_df["Y"]))

In [None]:
data_length_plot(
    length_list=kiba_df["Y"],  dataset="KIBA", plot_name="KIBAAffinity", binwidth=0.2, 
    xlabel="KIBA scores", ylabel="Number of DT pairs", 
    x_start=0, x_end=18, y_start=0, y_end=25000, color="khaki")

## 5 Download the 229 protein pdb files.

In [None]:
import pandas as pd
# Protein table written earlier in this notebook (index label Gene, value column Sequence).
proteins = pd.read_csv("../../../data/dta-datasets/KIBA/kiba_proteins.csv", sep=",")

In [None]:
proteins

In [None]:
len(proteins["Sequence"].unique())

In [None]:
pdb_list = proteins["Gene"].to_list()
pdb_list

In [None]:
len(pdb_list)

You need the UniProt ID of each protein to download the corresponding PDB file from the AlphaFold Database. For example, the gene AAK1 corresponds to UniProt ID Q2M2I8.

> PDB files in AlphaFold format (v4) look like: https://alphafold.ebi.ac.uk/files/AF-Q2M2I8-F1-model_v4.pdb

In [None]:
proteins.head()

In [None]:
len(set(pdb_list))

In [None]:
prefix = 'https://alphafold.ebi.ac.uk/files/AF-'
suffix = '-F1-model_v4.pdb'

Start download!

In [None]:
import os
from torchdrug import utils
# Download one AlphaFold PDB file per protein ID, skipping files that are
# already on disk and IDs that were already requested in this run.
path = '../../../data/dta-datasets/KIBA/pdb/'
# utils.download is not shown here; creating the target directory up front is
# harmless if it already does so, and avoids every download failing if it does not.
os.makedirs(path, exist_ok=True)

urls = []          # URLs requested in this run, in request order
seen_urls = set()  # same content as `urls`, for O(1) duplicate checks
for protein_id in pdb_list:
    url = prefix + protein_id + suffix
    pdb_name = "AF-" + protein_id + suffix
    pdb_file = os.path.join(path, pdb_name)
    if os.path.exists(pdb_file):
        print("==================== Using the pdb file: ", pdb_file, "====================")
        continue
    if url in seen_urls:
        continue
    seen_urls.add(url)
    urls.append(url)
    try:
        utils.download(url, path)
    except Exception as err:
        # Best effort: report the failure and carry on with the next protein.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print(f'This url: {url} can not be downloaded ({err})')

- P78527 (length 4128): no AlphaFold2 model is available, so PDB entry 7OTY is used instead.

## 6 Build the new KIBA dataset

### Merge the DTA and pdb table

[TODO] This step still needs to be done.

Load the DTA table to DataFrame

In [None]:
import pandas as pd
path = '../../../data/dta-datasets/KIBA/'

In [None]:
kiba_path = path + 'kiba_datasets.csv'
kiba_df = pd.read_csv(kiba_path)

In [None]:
kiba_df

In [None]:
len(kiba_df)

Load the protein (PDB) table into a DataFrame.

In [None]:
pdb_path = path + 'kiba_proteins.csv'
pdb_df = pd.read_csv(pdb_path)

In [None]:
pdb_df

Load the drug table into a DataFrame.

In [None]:
drug_path = path + 'kiba_ligands.csv'
drug_df = pd.read_csv(drug_path)

In [None]:
drug_df

In [None]:
pdb_df[pdb_df["Gene"] == kiba_df.loc[0]["Target_ID"]].index.to_list()[0]

In [None]:
drug_df[drug_df["CHEMBLID"] == "CHEMBL1087421"].index.to_list()[0]

Build a dict first to add the PDB file names into the DataFrame.

In [None]:
pdb_df.loc[0]["Gene"]

In [None]:
'AF-' + pdb_df.loc[0]["Gene"] + '-F1-model_v4.pdb'

Traverse the 118254 rows of data and add to each row the corresponding PDB file name, the protein index and the drug index, so that they can be referenced directly later on.

In [None]:
kiba_df.loc[0]["Target_ID"]

In [None]:
len(kiba_df)

In [None]:
# For every row of kiba_df, derive the AlphaFold PDB file name and the row
# index of its protein in pdb_df and of its drug in drug_df.
#
# The ID -> index tables are built once instead of scanning pdb_df / drug_df
# with a boolean mask for every row of kiba_df. setdefault keeps the FIRST
# index for a repeated ID, matching the previous `.index.to_list()[0]` lookup.
protein_index_by_id = {}
for index, gene in zip(pdb_df.index, pdb_df["Gene"]):
    protein_index_by_id.setdefault(gene, index)

drug_index_by_id = {}
for index, chembl_id in zip(drug_df.index, drug_df["CHEMBLID"]):
    drug_index_by_id.setdefault(chembl_id, index)

# One entry per row of kiba_df, in row order.
# An ID missing from the lookup tables raises KeyError naming that ID.
pdb_list = ['AF-' + protein_id + '-F1-model_v4.pdb' for protein_id in kiba_df["Target_ID"]]
protein_index_list = [protein_index_by_id[protein_id] for protein_id in kiba_df["Target_ID"]]
drug_index_list = [drug_index_by_id[drug_id] for drug_id in kiba_df["Drug_ID"]]

In [None]:
pdb_list

In [None]:
len(pdb_list)

In [None]:
protein_index_list

In [None]:
len(protein_index_list)

In [None]:
drug_index_list

In [None]:
len(drug_index_list)

In [None]:
pdb_col = pd.DataFrame(pdb_list, columns=["PDB_File"])

In [None]:
drug_index_col = pd.DataFrame(drug_index_list, columns=["Drug_Index"])

In [None]:
protein_index_col = pd.DataFrame(protein_index_list, columns=["protein_Index"])

In [None]:
kiba_df

In [None]:
pdb_col

In [None]:
drug_index_col

In [None]:
protein_index_col

In [None]:
temp_df1 = pd.merge(pdb_col, drug_index_col, left_index=True, right_index=True)

In [None]:
temp_df1

In [None]:
temp_df2 = pd.merge(temp_df1, protein_index_col, left_index=True, right_index=True)

In [None]:
temp_df2

In [None]:
kiba_dataset_all = pd.merge(kiba_df, temp_df2, left_index=True, right_index=True)

In [None]:
kiba_dataset_all

In [None]:
# Save the augmented table without the index column (boolean flag, matching the
# index=False used when kiba_datasets.csv was first written).
# NOTE(review): this overwrites the same kiba_datasets.csv that was loaded into
# kiba_df for this section; re-running the section on the augmented file would
# merge in a second copy of the PDB_File / index columns. Confirm this is intended.
kiba_dataset_all.to_csv('../../../data/dta-datasets/KIBA/kiba_datasets.csv', index=False)