In [1]:
import os
import rdkit
import pickle
import numpy as np
import pandas as pd
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles

### 01. Load data

In [2]:
# PubChem CID -> SMILES table: tab-separated, no header row (column 0 = CID, column 1 = SMILES,
# as consumed by add_smiles). The location can be overridden with the CID_SMILES_PATH
# environment variable; the default keeps the original absolute path.
SMILES_df = pd.read_csv(
    os.environ.get("CID_SMILES_PATH", "/home/ssm/work/Pseq2Affinity/datasets/BindingDB/2023/CID-SMILES"),
    sep="\t",
    header=None,
)

In [3]:
# Step-1 IC50 table (tab-separated, header row present).
IC50_df = pd.read_csv(
    "./preprocessed_data/step1_IC50_data.tsv",
    sep="\t",
)

In [4]:
# Step-1 Ki table (tab-separated, header row present).
Ki_df = pd.read_csv(
    "./preprocessed_data/step1_Ki_data.tsv",
    sep="\t",
)

### 02. Convert SMILES

In [5]:
def add_smiles(smiles_df, total_CID_ids):
    """Build a CID -> [non-isomeric SMILES, isomeric SMILES] mapping with RDKit.

    Parameters
    ----------
    smiles_df : pandas.DataFrame
        Table whose first column holds CIDs and whose second column holds the
        corresponding SMILES strings (columns are read by position).
    total_CID_ids : iterable
        CIDs to look up.

    Returns
    -------
    dict
        ``{cid: [smiles_without_stereo, smiles_with_stereo]}`` in the order the
        CIDs were given. CIDs that are absent from ``smiles_df``, whose SMILES
        is not a string, or that RDKit cannot parse/serialise are omitted.
    """
    wanted_ids = list(total_CID_ids)

    # Build the lookup in one vectorised pass instead of scanning the whole
    # table once per CID. Only rows for requested CIDs are kept to bound memory.
    cid_column = smiles_df.iloc[:, 0]
    wanted_rows = smiles_df[cid_column.isin(wanted_ids)]
    cid_to_smiles = dict()
    for row_cid, row_smi in zip(wanted_rows.iloc[:, 0].tolist(), wanted_rows.iloc[:, 1].tolist()):
        # setdefault keeps the first occurrence of a duplicated CID.
        cid_to_smiles.setdefault(row_cid, row_smi)

    results_dict = dict()

    for cid in wanted_ids:
        pubchem_smi = cid_to_smiles.get(cid)
        if not isinstance(pubchem_smi, str):
            # Missing CID or a non-string cell (e.g. NaN).
            continue

        try:
            # Parse once and reuse the molecule for both output flavours.
            mol = MolFromSmiles(pubchem_smi)
            if mol is None:
                # MolFromSmiles signals an unparsable SMILES by returning None.
                continue

            pubchem_cano_smi = MolToSmiles(mol, isomericSmiles = False)
            pubchem_iso_smi = MolToSmiles(mol, isomericSmiles = True)
        except Exception:
            # Best-effort: skip compounds RDKit chokes on, but do not swallow
            # KeyboardInterrupt/SystemExit so the long loop stays interruptible.
            continue

        results_dict[cid] = [pubchem_cano_smi, pubchem_iso_smi]

    return results_dict

In [7]:
# Sorted, de-duplicated union of the CIDs appearing in either dataset.
total_CID_ids = np.union1d(Ki_df.CID.values, IC50_df.CID.values)
n_unique_cids = len(total_CID_ids)
print(f"Unique CID: {n_unique_cids}")

Unique CID: 759723


In [None]:
# Maps each CID to [non-isomeric SMILES, isomeric SMILES]; CIDs for which add_smiles
# fails are simply absent from the dict.
# NOTE(review): presumably a long-running cell given the CID count above — consider caching the result.
smiles_dict = add_smiles(SMILES_df, total_CID_ids)

### 03. Remove samples based on SMILES

In [9]:
def make_df(df, smiles_dict):
    """Re-assemble a step-1 table with SMILES columns attached.

    Input columns are read by position: 0 = sequence, 1 = CID, 2 = label,
    3 = reactant id, 5 = UniProt id. Rows whose CID is not a key of
    ``smiles_dict`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table with at least six columns in the layout above.
    smiles_dict : dict
        ``{cid: [smiles, isomeric_smiles]}``.

    Returns
    -------
    pandas.DataFrame
        Columns ``UniProt_IDs, CID, Labels, Seqs, SMILES, SMILES_iso, Reactant_IDs``.
    """
    column_names = ("UniProt_IDs", "CID", "Labels", "Seqs", "SMILES", "SMILES_iso", "Reactant_IDs")

    kept_rows = []
    for row in df.values:
        compound_id = row[1]
        if compound_id not in smiles_dict:
            continue

        smiles_pair = smiles_dict[compound_id]
        # Tuple order mirrors column_names.
        kept_rows.append((row[5], compound_id, row[2], row[0], smiles_pair[0], smiles_pair[1], row[3]))

    # Pivot the collected rows into one list per output column.
    columns = {
        name: [kept[position] for kept in kept_rows]
        for position, name in enumerate(column_names)
    }
    return pd.DataFrame(columns)

In [10]:
# make_df reads the step-1 columns by position and returns a frame with a different
# column order, so this cell must run exactly once on the freshly loaded IC50_df.
IC50_df = make_df(IC50_df, smiles_dict)
IC50_df

Unnamed: 0,UniProt_IDs,CID,Labels,Seqs,SMILES,SMILES_iso,Reactant_IDs
0,A0A0B4J268,377972,4.780127,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=C1C=CC(=O)c2c(O)c(N3CCN(CCO)CC3)c(Cl)c(O)c21,O=C1C=CC(=O)c2c(O)c(N3CCN(CCO)CC3)c(Cl)c(O)c21,209690
1,A0A0B4J268,653978,4.541816,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,CC1CCN(S(=O)(=O)c2ccc(NC(=O)Cn3nnc(-c4ccccc4F)...,CC1CCN(S(=O)(=O)c2ccc(NC(=O)Cn3nnc(-c4ccccc4F)...,209465
2,A0A0B4J268,676291,5.204120,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=c1cc(-c2ccc(O)c(O)c2)oc2c1ccc1ccccc12,O=c1cc(-c2ccc(O)c(O)c2)oc2c1ccc1ccccc12,209668
3,A0A0B4J268,704725,4.946998,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,Nc1ncnc2oc(-c3ccco3)c(-c3ccco3)c12,Nc1ncnc2oc(-c3ccco3)c(-c3ccco3)c12,209467
4,A0A0B4J268,741311,4.609065,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=c1c(CO)cc(CO)cc(CO)c1O,O=c1c(CO)cc(CO)cc(CO)c1O,209538
...,...,...,...,...,...,...,...
843164,V9GZ37,7113816,4.000000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,O=C(CC(=NCCO)C(F)(F)F)c1ccc(Cl)cc1,O=C(CC(=NCCO)C(F)(F)F)c1ccc(Cl)cc1,57067
843165,V9GZ37,9551519,4.000000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,CCOC(=O)CNC(=O)CNC(=O)Cn1ccc2ccccc2c1=O,CCOC(=O)CNC(=O)CNC(=O)Cn1ccc2ccccc2c1=O,57065
843166,V9GZ37,91896129,4.000000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,CC(=O)N1CCC2(CC1)c1c(n[nH]c1C)OC(=N)C2C#N,CC(=O)N1CCC2(CC1)c1c(n[nH]c1C)OC(=N)C2C#N,57036
843167,V9GZ37,91896130,4.708853,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,C=c1[nH]n(-c2cccc(C(=O)O)c2)c(=O)c1=CC=Cc1ccco1,C=c1[nH]n(-c2cccc(C(=O)O)c2)c(=O)/c1=C\C=Cc1ccco1,5706276570


In [11]:
# Keep only rows whose SMILES string is at most 150 characters long.
IC50_df = IC50_df.loc[IC50_df["SMILES"].str.len() <= 150]
IC50_df

Unnamed: 0,UniProt_IDs,CID,Labels,Seqs,SMILES,SMILES_iso,Reactant_IDs
0,A0A0B4J268,377972,4.780127,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=C1C=CC(=O)c2c(O)c(N3CCN(CCO)CC3)c(Cl)c(O)c21,O=C1C=CC(=O)c2c(O)c(N3CCN(CCO)CC3)c(Cl)c(O)c21,209690
1,A0A0B4J268,653978,4.541816,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,CC1CCN(S(=O)(=O)c2ccc(NC(=O)Cn3nnc(-c4ccccc4F)...,CC1CCN(S(=O)(=O)c2ccc(NC(=O)Cn3nnc(-c4ccccc4F)...,209465
2,A0A0B4J268,676291,5.204120,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=c1cc(-c2ccc(O)c(O)c2)oc2c1ccc1ccccc12,O=c1cc(-c2ccc(O)c(O)c2)oc2c1ccc1ccccc12,209668
3,A0A0B4J268,704725,4.946998,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,Nc1ncnc2oc(-c3ccco3)c(-c3ccco3)c12,Nc1ncnc2oc(-c3ccco3)c(-c3ccco3)c12,209467
4,A0A0B4J268,741311,4.609065,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=c1c(CO)cc(CO)cc(CO)c1O,O=c1c(CO)cc(CO)cc(CO)c1O,209538
...,...,...,...,...,...,...,...
843164,V9GZ37,7113816,4.000000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,O=C(CC(=NCCO)C(F)(F)F)c1ccc(Cl)cc1,O=C(CC(=NCCO)C(F)(F)F)c1ccc(Cl)cc1,57067
843165,V9GZ37,9551519,4.000000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,CCOC(=O)CNC(=O)CNC(=O)Cn1ccc2ccccc2c1=O,CCOC(=O)CNC(=O)CNC(=O)Cn1ccc2ccccc2c1=O,57065
843166,V9GZ37,91896129,4.000000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,CC(=O)N1CCC2(CC1)c1c(n[nH]c1C)OC(=N)C2C#N,CC(=O)N1CCC2(CC1)c1c(n[nH]c1C)OC(=N)C2C#N,57036
843167,V9GZ37,91896130,4.708853,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,C=c1[nH]n(-c2cccc(C(=O)O)c2)c(=O)c1=CC=Cc1ccco1,C=c1[nH]n(-c2cccc(C(=O)O)c2)c(=O)/c1=C\C=Cc1ccco1,5706276570


In [12]:
# make_df reads the step-1 columns by position and returns a frame with a different
# column order, so this cell must run exactly once on the freshly loaded Ki_df.
Ki_df = make_df(Ki_df, smiles_dict)
Ki_df

Unnamed: 0,UniProt_IDs,CID,Labels,Seqs,SMILES,SMILES_iso,Reactant_IDs
0,A0A087WW23,77050673,8.886057,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1ccc(COc2ccc3nc(C4CCCCC4C(=O)O)n(Cc4ccc(-c5c...,Cc1ccc(COc2ccc3nc([C@@H]4CCCC[C@@H]4C(=O)O)n(C...,282585
1,A0A087WW23,85470875,8.958607,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1cnc(COc2ccc3nc(C4CCCCC4C(=O)O)n(Cc4ccc(N5CC...,Cc1cnc(COc2ccc3nc([C@@H]4CCCC[C@@H]4C(=O)O)n(C...,282615
2,A0A087WW23,86267448,7.397940,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(OC...,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(OC...,282543
3,A0A087WW23,86267449,6.366532,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cn1ccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)c...,Cn1ccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)c...,282545
4,A0A087WW23,86267659,5.649752,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1cccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)...,Cc1cccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)...,282546
...,...,...,...,...,...,...,...
333737,Q9Z5X1,449343,3.259637,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,O=C(NO)C(O)C(O)CO,O=C(NO)[C@H](O)[C@H](O)CO,50345961
333738,Q9Z5X1,6852192,4.892790,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,NC(CCSCC(O)C(O)C(=O)NO)C(=O)O,N[C@@H](CCSC[C@H](O)[C@@H](O)C(=O)NO)C(=O)O,50345969
333739,Q9Z5X1,6852195,5.013228,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,NC(CCSCC(O)C(O)C(=O)NO)C(=O)O,N[C@@H](CCSC[C@@H](O)[C@@H](O)C(=O)NO)C(=O)O,50345974
333740,Q9Z7P3,137637574,6.399027,MNRRWNLVLATVALALSVASCDVRSKDKDKDQGSLVEYKDNKDTND...,CCC1CN(Cc2ccccn2)C(=O)C2CCCC1N2S(=O)(=O)c1cc(C...,CC[C@H]1CN(Cc2ccccn2)C(=O)[C@@H]2CCC[C@H]1N2S(...,50867036


In [13]:
# Keep only rows whose SMILES string is at most 150 characters long.
Ki_df = Ki_df.loc[Ki_df["SMILES"].str.len() <= 150]
Ki_df

Unnamed: 0,UniProt_IDs,CID,Labels,Seqs,SMILES,SMILES_iso,Reactant_IDs
0,A0A087WW23,77050673,8.886057,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1ccc(COc2ccc3nc(C4CCCCC4C(=O)O)n(Cc4ccc(-c5c...,Cc1ccc(COc2ccc3nc([C@@H]4CCCC[C@@H]4C(=O)O)n(C...,282585
1,A0A087WW23,85470875,8.958607,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1cnc(COc2ccc3nc(C4CCCCC4C(=O)O)n(Cc4ccc(N5CC...,Cc1cnc(COc2ccc3nc([C@@H]4CCCC[C@@H]4C(=O)O)n(C...,282615
2,A0A087WW23,86267448,7.397940,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(OC...,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(OC...,282543
3,A0A087WW23,86267449,6.366532,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cn1ccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)c...,Cn1ccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)c...,282545
4,A0A087WW23,86267659,5.649752,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1cccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)...,Cc1cccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)...,282546
...,...,...,...,...,...,...,...
333737,Q9Z5X1,449343,3.259637,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,O=C(NO)C(O)C(O)CO,O=C(NO)[C@H](O)[C@H](O)CO,50345961
333738,Q9Z5X1,6852192,4.892790,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,NC(CCSCC(O)C(O)C(=O)NO)C(=O)O,N[C@@H](CCSC[C@H](O)[C@@H](O)C(=O)NO)C(=O)O,50345969
333739,Q9Z5X1,6852195,5.013228,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,NC(CCSCC(O)C(O)C(=O)NO)C(=O)O,N[C@@H](CCSC[C@@H](O)[C@@H](O)C(=O)NO)C(=O)O,50345974
333740,Q9Z7P3,137637574,6.399027,MNRRWNLVLATVALALSVASCDVRSKDKDKDQGSLVEYKDNKDTND...,CCC1CN(Cc2ccccn2)C(=O)C2CCCC1N2S(=O)(=O)c1cc(C...,CC[C@H]1CN(Cc2ccccn2)C(=O)[C@@H]2CCC[C@H]1N2S(...,50867036


In [14]:
# CIDs removed from both datasets in the following cells; each entry is
# annotated with the label recorded for that CID.
check_list = [
    259,      # Br
    312,      # Cl
    934,      # Ni
    23994,    # Zn
    28179,    # F
    28486,    # Li
    30165,    # I
    433294,   # [Cl].[Li]
    260,      # Br
    313,      # Cl
    24841,    # I
    5047209,  # S
]

In [15]:
# Drop every row whose CID is listed in check_list, then renumber the index from 0.
IC50_df = IC50_df.loc[~IC50_df["CID"].isin(check_list)].reset_index(drop=True)
IC50_df

Unnamed: 0,UniProt_IDs,CID,Labels,Seqs,SMILES,SMILES_iso,Reactant_IDs
0,A0A0B4J268,377972,4.780127,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=C1C=CC(=O)c2c(O)c(N3CCN(CCO)CC3)c(Cl)c(O)c21,O=C1C=CC(=O)c2c(O)c(N3CCN(CCO)CC3)c(Cl)c(O)c21,209690
1,A0A0B4J268,653978,4.541816,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,CC1CCN(S(=O)(=O)c2ccc(NC(=O)Cn3nnc(-c4ccccc4F)...,CC1CCN(S(=O)(=O)c2ccc(NC(=O)Cn3nnc(-c4ccccc4F)...,209465
2,A0A0B4J268,676291,5.204120,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=c1cc(-c2ccc(O)c(O)c2)oc2c1ccc1ccccc12,O=c1cc(-c2ccc(O)c(O)c2)oc2c1ccc1ccccc12,209668
3,A0A0B4J268,704725,4.946998,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,Nc1ncnc2oc(-c3ccco3)c(-c3ccco3)c12,Nc1ncnc2oc(-c3ccco3)c(-c3ccco3)c12,209467
4,A0A0B4J268,741311,4.609065,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=c1c(CO)cc(CO)cc(CO)c1O,O=c1c(CO)cc(CO)cc(CO)c1O,209538
...,...,...,...,...,...,...,...
837150,V9GZ37,7113816,4.000000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,O=C(CC(=NCCO)C(F)(F)F)c1ccc(Cl)cc1,O=C(CC(=NCCO)C(F)(F)F)c1ccc(Cl)cc1,57067
837151,V9GZ37,9551519,4.000000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,CCOC(=O)CNC(=O)CNC(=O)Cn1ccc2ccccc2c1=O,CCOC(=O)CNC(=O)CNC(=O)Cn1ccc2ccccc2c1=O,57065
837152,V9GZ37,91896129,4.000000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,CC(=O)N1CCC2(CC1)c1c(n[nH]c1C)OC(=N)C2C#N,CC(=O)N1CCC2(CC1)c1c(n[nH]c1C)OC(=N)C2C#N,57036
837153,V9GZ37,91896130,4.708853,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,C=c1[nH]n(-c2cccc(C(=O)O)c2)c(=O)c1=CC=Cc1ccco1,C=c1[nH]n(-c2cccc(C(=O)O)c2)c(=O)/c1=C\C=Cc1ccco1,5706276570


In [16]:
# Drop every row whose CID is listed in check_list, then renumber the index from 0.
Ki_df = Ki_df.loc[~Ki_df["CID"].isin(check_list)].reset_index(drop=True)
Ki_df

Unnamed: 0,UniProt_IDs,CID,Labels,Seqs,SMILES,SMILES_iso,Reactant_IDs
0,A0A087WW23,77050673,8.886057,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1ccc(COc2ccc3nc(C4CCCCC4C(=O)O)n(Cc4ccc(-c5c...,Cc1ccc(COc2ccc3nc([C@@H]4CCCC[C@@H]4C(=O)O)n(C...,282585
1,A0A087WW23,85470875,8.958607,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1cnc(COc2ccc3nc(C4CCCCC4C(=O)O)n(Cc4ccc(N5CC...,Cc1cnc(COc2ccc3nc([C@@H]4CCCC[C@@H]4C(=O)O)n(C...,282615
2,A0A087WW23,86267448,7.397940,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(OC...,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(OC...,282543
3,A0A087WW23,86267449,6.366532,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cn1ccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)c...,Cn1ccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)c...,282545
4,A0A087WW23,86267659,5.649752,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1cccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)...,Cc1cccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)...,282546
...,...,...,...,...,...,...,...
328117,Q9Z5X1,449343,3.259637,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,O=C(NO)C(O)C(O)CO,O=C(NO)[C@H](O)[C@H](O)CO,50345961
328118,Q9Z5X1,6852192,4.892790,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,NC(CCSCC(O)C(O)C(=O)NO)C(=O)O,N[C@@H](CCSC[C@H](O)[C@@H](O)C(=O)NO)C(=O)O,50345969
328119,Q9Z5X1,6852195,5.013228,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,NC(CCSCC(O)C(O)C(=O)NO)C(=O)O,N[C@@H](CCSC[C@@H](O)[C@@H](O)C(=O)NO)C(=O)O,50345974
328120,Q9Z7P3,137637574,6.399027,MNRRWNLVLATVALALSVASCDVRSKDKDKDQGSLVEYKDNKDTND...,CCC1CN(Cc2ccccn2)C(=O)C2CCCC1N2S(=O)(=O)c1cc(C...,CC[C@H]1CN(Cc2ccccn2)C(=O)[C@@H]2CCC[C@H]1N2S(...,50867036


In [17]:
# Round the labels to four decimal places.
IC50_df["Labels"] = IC50_df["Labels"].round(decimals=4)
IC50_df

Unnamed: 0,UniProt_IDs,CID,Labels,Seqs,SMILES,SMILES_iso,Reactant_IDs
0,A0A0B4J268,377972,4.7801,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=C1C=CC(=O)c2c(O)c(N3CCN(CCO)CC3)c(Cl)c(O)c21,O=C1C=CC(=O)c2c(O)c(N3CCN(CCO)CC3)c(Cl)c(O)c21,209690
1,A0A0B4J268,653978,4.5418,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,CC1CCN(S(=O)(=O)c2ccc(NC(=O)Cn3nnc(-c4ccccc4F)...,CC1CCN(S(=O)(=O)c2ccc(NC(=O)Cn3nnc(-c4ccccc4F)...,209465
2,A0A0B4J268,676291,5.2041,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=c1cc(-c2ccc(O)c(O)c2)oc2c1ccc1ccccc12,O=c1cc(-c2ccc(O)c(O)c2)oc2c1ccc1ccccc12,209668
3,A0A0B4J268,704725,4.9470,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,Nc1ncnc2oc(-c3ccco3)c(-c3ccco3)c12,Nc1ncnc2oc(-c3ccco3)c(-c3ccco3)c12,209467
4,A0A0B4J268,741311,4.6091,MRQVARVIVFLTLSTLSLAKTTQPISMDSYEGQEVNITCSHNNIAT...,O=c1c(CO)cc(CO)cc(CO)c1O,O=c1c(CO)cc(CO)cc(CO)c1O,209538
...,...,...,...,...,...,...,...
837150,V9GZ37,7113816,4.0000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,O=C(CC(=NCCO)C(F)(F)F)c1ccc(Cl)cc1,O=C(CC(=NCCO)C(F)(F)F)c1ccc(Cl)cc1,57067
837151,V9GZ37,9551519,4.0000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,CCOC(=O)CNC(=O)CNC(=O)Cn1ccc2ccccc2c1=O,CCOC(=O)CNC(=O)CNC(=O)Cn1ccc2ccccc2c1=O,57065
837152,V9GZ37,91896129,4.0000,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,CC(=O)N1CCC2(CC1)c1c(n[nH]c1C)OC(=N)C2C#N,CC(=O)N1CCC2(CC1)c1c(n[nH]c1C)OC(=N)C2C#N,57036
837153,V9GZ37,91896130,4.7089,MAKAAAIGIDLGTTYSCVGVFQHGKGERNVLIFDLGGGTFDVSILT...,C=c1[nH]n(-c2cccc(C(=O)O)c2)c(=O)c1=CC=Cc1ccco1,C=c1[nH]n(-c2cccc(C(=O)O)c2)c(=O)/c1=C\C=Cc1ccco1,5706276570


In [18]:
# Round the labels to four decimal places.
Ki_df["Labels"] = Ki_df["Labels"].round(decimals=4)
Ki_df

Unnamed: 0,UniProt_IDs,CID,Labels,Seqs,SMILES,SMILES_iso,Reactant_IDs
0,A0A087WW23,77050673,8.8861,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1ccc(COc2ccc3nc(C4CCCCC4C(=O)O)n(Cc4ccc(-c5c...,Cc1ccc(COc2ccc3nc([C@@H]4CCCC[C@@H]4C(=O)O)n(C...,282585
1,A0A087WW23,85470875,8.9586,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1cnc(COc2ccc3nc(C4CCCCC4C(=O)O)n(Cc4ccc(N5CC...,Cc1cnc(COc2ccc3nc([C@@H]4CCCC[C@@H]4C(=O)O)n(C...,282615
2,A0A087WW23,86267448,7.3979,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(OC...,CCC(CC)(Cc1nc2ccc(OCc3ccc(C)cn3)cc2n1Cc1ccc(OC...,282543
3,A0A087WW23,86267449,6.3665,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cn1ccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)c...,Cn1ccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)c...,282545
4,A0A087WW23,86267659,5.6498,MLTFNHDAPWHTQKTLKTSEFGKSFGTLGHIGNISHQCWAGCAAGG...,Cc1cccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)...,Cc1cccc(COc2ccc3nc(CC(C)(C)C(=O)O)n(Cc4ccc(Br)...,282546
...,...,...,...,...,...,...,...
328117,Q9Z5X1,449343,3.2596,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,O=C(NO)C(O)C(O)CO,O=C(NO)[C@H](O)[C@H](O)CO,50345961
328118,Q9Z5X1,6852192,4.8928,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,NC(CCSCC(O)C(O)C(=O)NO)C(=O)O,N[C@@H](CCSC[C@H](O)[C@@H](O)C(=O)NO)C(=O)O,50345969
328119,Q9Z5X1,6852195,5.0132,MPLLDSFTVDHTRMNAPAVRVAKTMQTPKGDTITVFDLRFTAPNKD...,NC(CCSCC(O)C(O)C(=O)NO)C(=O)O,N[C@@H](CCSC[C@@H](O)[C@@H](O)C(=O)NO)C(=O)O,50345974
328120,Q9Z7P3,137637574,6.3990,MNRRWNLVLATVALALSVASCDVRSKDKDKDQGSLVEYKDNKDTND...,CCC1CN(Cc2ccccn2)C(=O)C2CCCC1N2S(=O)(=O)c1cc(C...,CC[C@H]1CN(Cc2ccccn2)C(=O)[C@@H]2CCC[C@H]1N2S(...,50867036


In [19]:
# Write the filtered IC50 and Ki tables as tab-separated files without the index column.
output_dir = "../input_data/BindingDB"
# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
IC50_df.to_csv(f"{output_dir}/IC50_data.tsv", sep = "\t", index = False)
Ki_df.to_csv(f"{output_dir}/Ki_data.tsv", sep = "\t", index = False)

### 04. Download SDF files

In [5]:
# Manual-step reminder; the string below is the cell's displayed value, not executed logic.
'''
Download the SDF files for the compounds from the link below.
https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/
'''

'\nDownload the SDF files for the compounds from the link below.\nhttps://ftp.ncbi.nlm.nih.gov/pubchem/Compound/\n'

In [None]:
# First space-separated token of every line in the listing, skipping lines that
# mention "md5" and blank lines (a blank line would otherwise yield an empty file name).
# The context manager closes the listing file once it has been read.
with open("./data/supplementary/Pubchem_CID_list", "r") as cid_list_file:
    compound_list = [
        entry.strip().split(" ")[0].strip()
        for entry in cid_list_file
        if "md5" not in entry and entry.strip()
    ]

In [None]:
def fwrite(lines, fw):
    """Write every item of ``lines`` to ``fw`` and close ``fw``.

    ``fw`` is closed even when a write fails, so the handle never leaks.
    """
    try:
        for line in lines:
            fw.write(f"{line}")
    finally:
        fw.close()


def extract_save_sdf(lines, smiles_dict, fw_path):
    """Split a multi-record SDF stream and save the records whose CID is wanted.

    A record ends at a line containing ``$$$$``. Its CID is taken from the line
    that follows a line containing ``PUBCHEM_COMPOUND_CID``. Records whose
    integer CID is a key of ``smiles_dict`` are written to
    ``{fw_path}/{cid}.sdf``; all other records are discarded.

    Parameters
    ----------
    lines : iterable of str
        SDF lines (a list or an open text file both work).
    smiles_dict : dict
        Mapping keyed by integer CID; only key membership is used.
    fw_path : str
        Existing directory that receives the per-compound files.
    """
    record, cid, expect_cid = list(), None, False
    for line in lines:
        if expect_cid:
            # The line right after the CID tag carries the CID value.
            expect_cid = False
            cid = line.strip()

        record.append(line)

        if "PUBCHEM_COMPOUND_CID" in line:
            expect_cid = True

        if "$$$$" in line:
            # Skip records without a usable CID instead of raising or reusing
            # the CID left over from the previous record.
            if cid is not None and cid.isdecimal() and int(cid) in smiles_dict:
                fwrite(record, open(f"{fw_path}/{cid}.sdf", "w"))
            # Reset all per-record state before the next record starts.
            record, cid, expect_cid = list(), None, False

In [None]:
# NOTE(review): relative chdir — every relative path used after this cell resolves against
# ./data/sdf, and re-running the cell would look for ./data/sdf inside the new working directory.
os.chdir("./data/sdf")

In [None]:
# Download each PubChem SDF archive, keep only the records for CIDs in smiles_dict, then
# delete the bulky decompressed file.
# wget and gzip operate in the current working directory, so every path below is
# cwd-relative as well; the previous "./data/sdf/..." paths pointed at a nested directory
# that does not exist once the working directory is already ./data/sdf.
# NOTE(review): file names are interpolated into shell commands — keep the listing trusted.
for file in compound_list:
    sdf_name = file[:-3]  # presumably strips a ".gz" suffix — confirm against the listing

    command = f"wget https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/{file}"
    if os.system(command) != 0:
        print(f"Download failed, skipping: {file}")
        continue

    command = f"gzip -d {file}"
    if os.system(command) != 0:
        print(f"Decompression failed, skipping: {file}")
        continue

    # Stream the file line by line instead of loading the whole SDF into memory.
    with open(sdf_name, "r") as sdf_file:
        extract_save_sdf(sdf_file, smiles_dict, ".")

    os.remove(sdf_name)