In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from chembl_webresource_client.new_client import new_client
from rdkit import Chem

In [18]:
# target = new_client.target
# target_query = target.search('Influenza')
# target_info = target_query[2]
# print(target_query[0])
activity = new_client.activity
activity_query = activity.filter(target_chembl_id='CHEMBL2051')
df = pd.DataFrame.from_dict(activity_query)
len(df)

1106

In [19]:
df = df[['molecule_chembl_id', 'canonical_smiles', 'standard_type', 'standard_relation', 'standard_value']]
df = df[df['standard_type'] == 'IC50']
df = df[df['standard_relation'] == '=']
df['standard_value'] = pd.to_numeric(df['standard_value'])
df['active'] = df['standard_value'].apply(lambda x: 1 if x < 100 else 0)


In [20]:
df.head()
len(df)

711

In [21]:
def smiles_to_mol(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return Chem.RDKFingerprint(mol)
    else:
        return None

In [22]:
df['fingerprint'] = df['canonical_smiles'].apply(smiles_to_mol)

In [23]:
X = np.array(list(df['fingerprint']))
y = np.array(list(df['active']))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X

array([[1, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 1],
       [1, 1, 1, ..., 0, 0, 1],
       ...,
       [0, 1, 0, ..., 0, 1, 1],
       [0, 1, 0, ..., 0, 1, 1],
       [1, 1, 1, ..., 0, 0, 1]])

In [24]:
class DrugDiscoveryModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DrugDiscoveryModel, self).__init__() 
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [25]:
input_size = len(X_train[0])
hidden_size = 128
output_size = 1
learning_rate = 0.001
num_epochs = 100

model = DrugDiscoveryModel(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [26]:
for epoch in range(num_epochs):
    inputs = torch.Tensor(X_train)
    labels = torch.Tensor(y_train)
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels.view(-1, 1))
    loss.backward()
    optimizer.step()

with torch.no_grad():
    test_inputs = torch.Tensor(X_test)
    test_labels = torch.Tensor(y_test)
    test_outputs = model(test_inputs)
    predictions = (test_outputs > 0.5).float()
    accuracy = (predictions == test_labels.view(-1, 1)).sum().item() / len(test_labels)
    print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 82.52%


In [27]:
X[0]

array([1, 0, 0, ..., 0, 0, 1])

In [28]:
y

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [29]:
df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_type,standard_relation,standard_value,active,fingerprint
3,CHEMBL115522,CC(=O)N[C@@H]([C@H](O)[C@H](O)CO)[C@H]1OC(O)(C...,IC50,=,40000.0,0,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, ..."
4,CHEMBL119140,CCCCC(NC(C)=O)[C@@H]1CC(C(=O)O)C[C@H]1N=C(N)N,IC50,=,100.0,0,"[0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
5,CHEMBL96712,CC(=O)N[C@H]1[C@H]([C@H](O)[C@H](O)CO)OC(C(=O)...,IC50,=,10000.0,0,"[1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, ..."
6,CHEMBL139367,CCC(CC)[C@H](NC(C)=O)[C@@H]1[C@H](O)[C@@H](C(=...,IC50,=,1.4,1,"[1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, ..."
7,CHEMBL222813,CC(=O)N[C@H]1[C@H]([C@H](O)[C@H](O)CO)OC(C(=O)...,IC50,=,2.0,1,"[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, ..."
