In [32]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from rdkit import DataStructs

# Load the CSV files
pharmacologically_active = pd.read_csv('raw/pharmacologically_active.csv')
all_targets = pd.read_csv('raw/all.csv')
dti = pd.read_csv('raw/dti_dataset.csv')
drugbank = pd.read_csv('raw/drugbank.csv')
target_labels = pd.read_csv('raw/target_labels.csv')
drugs = list(target_labels.ID)


In [33]:
len(drugs)

2572

In [36]:
drugbank_filtered = drugbank[drugbank['Drug id'].isin(drugs)]
drugbank_filtered.index = np.arange(0, drugbank_filtered.shape[0])
drugbank_filtered.to_csv("raw/drugbank.csv")

In [37]:
new_drugs = list(drugbank_filtered['Drug id'])

In [38]:
len(new_drugs)

1207

In [39]:
target_labels = target_labels[target_labels['ID'].isin(new_drugs)]

In [40]:
target_labels.shape

(1841, 721)

In [41]:
# Assuming your DataFrame is called 'df'
target_labels = target_labels.drop_duplicates(subset='ID', keep='first')


In [42]:
prot = pd.read_csv("raw/protein_sequences.csv")
prot.head()

Unnamed: 0,pdb_id,sequence
0,1EVU,ETSRTAFGGRRAVPPNNSNAAEDDLPTVEEFLNVTSVHLFKERWDT...
1,1NSI,RHVRIKNWGSGMTFQDTLHHKAKGILTCRSKSCLGSIMTPKSLTRG...
2,1DJL,PMEISGTHTEINLDNAIDMIREANSIIITPGYGLCAAKAQYPIADL...
3,1AB2,GSGNSLEKHSWYHGPVSRNAAEYLLSSGINGSFLVRESESSPGQRS...
4,1CFG,TRYLRIHPQSWVHQIALRMEVL


In [43]:
targets = list(prot['pdb_id'])
targets

['1EVU',
 '1NSI',
 '1DJL',
 '1AB2',
 '1CFG',
 '4BSJ',
 '1Q7D',
 '1FLT',
 '1GAG',
 '1C1Y',
 '1CZS',
 '2AJP',
 '2CSA',
 '2VGB',
 '1A2C',
 '1E51',
 '1GKC',
 '1LVQ',
 '4D1N',
 '1I0Z',
 '2F2S',
 '1LE6',
 '1I7G',
 '1A22',
 '1BZY',
 '1H9U',
 '4V06',
 '3B6R',
 '2FFX',
 '1BYW',
 '2KI9',
 '1BPX',
 '1F0X',
 '2IAG',
 '1A52',
 '1A00',
 '1E3G',
 '1CW3',
 '3QNZ',
 '1EXA',
 '1RJB',
 '3ERY',
 '1BJ1',
 '1YTV',
 '1QK1',
 '2LSQ',
 '1RKP',
 '2HGS',
 '5UH5',
 '2M3O',
 '1A27',
 '2KBI',
 '1X9N',
 '1GQ5',
 '1HRA',
 '4TWK',
 '1B2I',
 '1FM6',
 '1C5M',
 '4MQE',
 '1O86',
 '1HSO',
 '2MKV',
 '1A7C',
 '1G5M',
 '1M73',
 '4CFE',
 '3IYD',
 '1M9J',
 '1AZM',
 '1BBS',
 '1A81',
 '1ANT',
 '1HLL',
 '4JQI',
 '1GER',
 '1ALQ',
 '1BSX',
 '12CA',
 '1HVY',
 '2WGH',
 '1XU7',
 '1BOZ',
 '1BJ4',
 '1BVR',
 '1BF9',
 '1A5K',
 '1BM7',
 '1YYP',
 '1BP3',
 '3HI7',
 '3BXK',
 '1AUI',
 '2KUP',
 '1CFH',
 '2HQW',
 '1VR2',
 '1NB0',
 '1F0Y',
 '1AUT',
 '1D3G',
 '2KS9',
 '1N6U',
 '1KIL',
 '4IB4',
 '1B0F',
 '1FHA',
 '1DEH',
 '1D6G',
 '1D7K',
 '2LE3',
 

In [45]:
targets.insert(0, "ID")

In [46]:
target_labels = target_labels.loc[:, targets]
target_labels

Unnamed: 0,ID,1EVU,1NSI,1DJL,1AB2,1CFG,4BSJ,1Q7D,1FLT,1GAG,...,1TVB,1T5Q,6U6U,7RY7,2MDP,1G2C,4BPU,2X18,2N80,2KR6
4,DB05383,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,DB08814,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,DB09092,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,DB00619,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,DB01254,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2548,DB16690,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2549,DB09401,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2552,DB15011,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2556,DB02587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
target_labels.to_csv("raw/target_labels.csv")

In [None]:
df = pd.read_csv('raw/target_labels.csv')

drug_ids = df['ID'].tolist()
pdb_ids = df.columns[1:].tolist()
labels = df.iloc[:, 1:].values

confirmed_interactions = []

for i, drug_id in enumerate(drug_ids):
    for j, pdb_id in enumerate(pdb_ids):
        if labels[i][j] == 1:
            confirmed_interactions.append((drug_id, pdb_id))

confirmed_df = pd.DataFrame(confirmed_interactions, columns=['Drug ID', 'PDB ID'])

print(confirmed_df)

confirmed_df.to_csv('confirmed_interactions.csv', index=False)


      Drug ID PDB ID
0     DB05383   1NSI
1     DB05383   3HI7
2     DB08814   1NSI
3     DB08814   1MDI
4     DB09092   1DJL
...       ...    ...
1690  DB09401   2WZ1
1691  DB09401   3UVJ
1692  DB15011   5O9H
1693  DB02587   4CLF
1694  DB12005   2KR6

[1695 rows x 2 columns]


In [73]:
drugs = confirmed_df['Drug ID'].value_counts().index[0:304]

In [74]:
targets = confirmed_df['PDB ID'].value_counts().index[0:405]

In [75]:
confirmed_d1 = confirmed_df[confirmed_df["Drug ID"].isin(drugs)]
confirmed_d1

Unnamed: 0,Drug ID,PDB ID
0,DB05383,1NSI
1,DB05383,3HI7
2,DB08814,1NSI
3,DB08814,1MDI
4,DB09092,1DJL
...,...,...
1680,DB00562,3FE4
1684,DB15797,4WUR
1685,DB15797,4R3D
1690,DB09401,2WZ1


In [76]:
confirmed_d2 = confirmed_d1[confirmed_d1["PDB ID"].isin(targets)]
confirmed_d2

Unnamed: 0,Drug ID,PDB ID
0,DB05383,1NSI
2,DB08814,1NSI
3,DB08814,1MDI
4,DB09092,1DJL
5,DB09092,3ERY
...,...,...
1680,DB00562,3FE4
1684,DB15797,4WUR
1685,DB15797,4R3D
1690,DB09401,2WZ1


In [77]:
drugbank_filtered = drugbank[drugbank['Drug id'].isin(drugs)]
drugbank_filtered

Unnamed: 0.1,Unnamed: 0,Drug name,Drug id,Key idx,smiles
1,1,Adenosine phosphate,DB00131,15,Nc1ncnc2c1ncn2[C@H]1O[C@H](COP(=O)(O)O)[C@@H](...
3,3,Riboflavin,DB00140,24,Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(C[C@H](O)[C@H](O)...
6,6,Creatine,DB00148,31,CN(CC(=O)[O-])C(=N)[NH3+]
10,10,Icosapent,DB00159,42,CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O
19,19,Amphetamine,DB00182,64,C[C@@H](N)Cc1ccccc1
...,...,...,...,...,...
1166,1166,Futibatinib,DB15149,8641,C=CC(=O)N1CC[C@H](n2nc(C#Cc3cc(OC)cc(OC)c3)c3c...
1175,1175,Cambinol,DB15493,8807,O=c1nc(S)[nH]c(-c2ccccc2)c1Cc1c(O)ccc2ccccc12
1186,1186,GC-373,DB15797,8886,CC(C)C[C@@H](NC(=O)OCc1ccccc1)C(=O)N[C@H](C=O)...
1203,1203,Belumosudil,DB16703,9215,CC(C)NC(=O)COc1cccc(-c2nc(Nc3ccc4[nH]ncc4c3)c3...


In [82]:
drugbank_filtered.to_csv("raw/drugbank.csv")

In [79]:
prot_filtered = prot[prot["pdb_id"].isin(targets)]
prot_filtered

Unnamed: 0,pdb_id,sequence
1,1NSI,RHVRIKNWGSGMTFQDTLHHKAKGILTCRSKSCLGSIMTPKSLTRG...
2,1DJL,PMEISGTHTEINLDNAIDMIREANSIIITPGYGLCAAKAQYPIADL...
3,1AB2,GSGNSLEKHSWYHGPVSRNAAEYLLSSGINGSFLVRESESSPGQRS...
5,4BSJ,DHNPFISVEWLKGPILEATAGDELVKLPVKLAAYPPPEFQWYKDGK...
7,1FLT,EVVKFMDVYQRSYCHPIETLVDIFQEYPDEIEYIFKPSCVPLMRCG...
...,...,...
622,3UVJ,QVVQAKKFSNVTMLFSDIVGFTAICSQCSPLQVITMLNALYTRFDQ...
623,5O9H,NTLRVPDILALVIFAVVFLVGVLGNALVVWVTAFEAKRTINAIWFL...
624,3ALN,SSGKLKISPEQHWDFTAEDLKDLGEIGRGAYGSVNKMVHKPSGQIM...
627,4CLF,MNTPKEEFQDWPIVRIAAHLPDLIVYGHFSPERPFMDYFDGVLMFV...


In [83]:
prot_filtered.to_csv("raw/protein_sequences.csv")

In [84]:
confirmed_d2.to_csv("raw/confirmed_interactions.csv")

In [87]:
target_labels = pd.read_csv("raw/target_labels.csv")


In [88]:
t1 = target_labels[target_labels['ID'].isin(drugs)]
t1

Unnamed: 0.1,Unnamed: 0,ID,1EVU,1NSI,1DJL,1AB2,1CFG,4BSJ,1Q7D,1FLT,...,1TVB,1T5Q,6U6U,7RY7,2MDP,1G2C,4BPU,2X18,2N80,2KR6
0,4,DB05383,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,DB08814,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,DB09092,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7,DB00619,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,DB01254,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,2512,DB01656,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1193,2513,DB01954,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1194,2514,DB00562,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1198,2527,DB15797,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
cols = targets.insert(0, 'ID')

In [90]:
t2 = t1.loc[:, cols]
t2

PDB ID,ID,4COF,3RZE,1M2Z,1GQ4,2LSQ,1HLL,1A52,2KBI,1E3G,...,1DPK,3PTY,1F5V,1IHI,1L9K,3QNT,1AB4,1AJ6,2CBZ,4BGC
0,DB05383,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,DB08814,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,DB09092,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,DB00619,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,DB01254,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,DB01656,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1193,DB01954,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1194,DB00562,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1198,DB15797,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
t2['4COF'].value_counts()

4COF
0    296
1      8
Name: count, dtype: int64

In [92]:
t2.to_csv("raw/target_labels.csv")