# Classification Task

In [1]:
import torch
from torchdrug import data
import pandas as pd
import numpy as np
import random

from tqdm.notebook import tqdm
%matplotlib inline

## Reprodicubility Settings

In [2]:
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

## Data Loading

In [3]:
DATASET_TYPE = "single" #"single"
DATASET_NAME = "chembl29_predicting_target_P14416_P42336_target_1_vs_random_cpds"
CSV_DATA_PATH = "../data/"+ DATASET_NAME + ".csv"
# "" to use the whole training set, 0 for split 0 and so on
TRAINING_SET_SPLIT = 1 #"FULL" #0, 1 ,2 

smiles_df = pd.read_csv(CSV_DATA_PATH, sep = ",")
display(smiles_df.head())
print(smiles_df["label"].value_counts())

Unnamed: 0,nonstereo_aromatic_smiles,target_pair,label
0,Brc1ccc(CNCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1,P14416_P42336,0
1,Brc1ccc(N2CCN(Cc3ccccc3)CC2)c2cc[nH]c12,P14416_P42336,0
2,Brc1ccc(NCCN2CCN(CCc3c[nH]c4ccccc34)CC2)cc1,P14416_P42336,0
3,Brc1ccc(NCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1,P14416_P42336,0
4,Brc1cccc(N2CCN(Cc3cc4ccccn4n3)CC2)n1,P14416_P42336,0


0    4174
1    4174
Name: label, dtype: int64


## Define Custom Class
We need to define the ChEMBL datasets class in order to load the model

In [4]:
import os

from torchdrug.core import Registry as R
from torchdrug.utils import doc


@R.register("datasets.ChEMBL") #only first time you launch the class
#@doc.copy_args(data.MoleculeDataset.load_csv, ignore=("path", "smiles_field", "target_fields"))
class ChEMBL(data.MoleculeDataset):
    

    def __init__(self, path, smiles_field, target_fields, verbose=1, **kwargs):
        # path = os.path.expanduser(path)# if not os.path.exists(path):
        #     os.makedirs(path)
        self.path = path
        self.smiles_field = smiles_field
        self.target_fields= target_fields
        #print(self.path)
        # zip_file = utils.download(self.url, path, md5=self.md5)
        # csv_file = utils.extract(zip_file)

        self.load_csv(self.path, smiles_field=self.smiles_field, target_fields=self.target_fields,
                      verbose=verbose, **kwargs)

In [5]:
# target_fields = ["target1", "target2"] if DATASET_TYPE == "dual" else ["label"]
target_fields = ["label"]
chembl_dataset = ChEMBL(path = CSV_DATA_PATH, smiles_field = "nonstereo_aromatic_smiles", target_fields = target_fields)

Loading ../data/chembl29_predicting_target_P14416_P42336_target_1_vs_random_cpds.csv: 100%|██████████| 8349/8349 [00:00<00:00, 79726.98it/s]
Constructing molecules from SMILES: 100%|██████████| 8348/8348 [00:28<00:00, 288.66it/s]


## Obtain edge index to use with PyG

Visualize molecules using NetworkX

In [6]:
from pysmiles import read_smiles
import networkx as nx
    
smiles = chembl_dataset.smiles_list
mols = []
for i in tqdm(range(len(chembl_dataset.smiles_list))):
    mols.append(read_smiles(chembl_dataset.smiles_list[i]))

# mol = mols[0]
# print(mol.nodes(data='element'))
# labels = nx.get_node_attributes(mol, 'element') 
# nx.draw(mol, labels = labels, pos=nx.spring_layout(mol))

  0%|          | 0/8348 [00:00<?, ?it/s]

Define edge index 

In [7]:
edge_index_list = []

for mol in tqdm(mols):
    adj = nx.to_scipy_sparse_matrix(mol).tocoo()
    row = torch.from_numpy(adj.row.astype(np.int64)).to(torch.long)
    col = torch.from_numpy(adj.col.astype(np.int64)).to(torch.long)
    edge_index = torch.stack([row, col], dim=0)
    edge_index_list.append(edge_index)

display(len(mols))

  0%|          | 0/8348 [00:00<?, ?it/s]

8348

Define torchdrug dataset in order to get node features

In [8]:
mols_torchdrug_format = []
for i in tqdm(range(len(chembl_dataset.smiles_list))):
    mols_torchdrug_format.append(data.Molecule.from_smiles(chembl_dataset.smiles_list[i], with_hydrogen = False))


  0%|          | 0/8348 [00:00<?, ?it/s]



## Create Custom Dataset

In [9]:
import pandas as pd
from torch_geometric.data import InMemoryDataset, Data

from torch_geometric.loader import DataLoader
import torch_geometric.transforms as T

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data_list = []
y = torch.LongTensor(chembl_dataset.targets["label"]).to(device)

for i in tqdm(range(len(mols))):
    data_list.append(Data(x = mols_torchdrug_format[i].node_feature, edge_index = edge_index_list[i], y = y[i], smiles = chembl_dataset.smiles_list[i]))


  0%|          | 0/8348 [00:00<?, ?it/s]

In [10]:
class ChEMBLDatasetPyG(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None, data_list = None):
        super().__init__(root, transform, pre_transform, pre_filter)
        #self.data, self.slices = torch.load(self.processed_paths[0])
        self.data_list = data_list

        # Read data into huge `Data` list.
        data_list = self.data_list

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        self.data, self.slices = self.collate(data_list)
        

In [11]:
dataset = ChEMBLDatasetPyG(".", data_list = data_list)

Split data in train/val/test (0.8/0.1/0.1)

In [12]:
lengths = [int(0.8 * len(chembl_dataset)), int(0.1 * len(chembl_dataset))]
lengths += [len(chembl_dataset) - sum(lengths)]

print(lengths)
dataset = dataset.shuffle()
train_data = dataset[:lengths[0]]
val_data = dataset[lengths[0]+1:lengths[0] + lengths[1]+1]
test_data = dataset[lengths[0] + lengths[1] + 1: ]
len(train_data), len(val_data), len(test_data)

[6678, 834, 836]


(6678, 834, 835)

## Training set selection for stability check

In [13]:
training_sets_lengths = [int(0.33 * len(train_data)), int(0.33 * len(train_data))]
training_sets_lengths += [len(train_data) - sum(training_sets_lengths)]

train_set_0 = train_data[:training_sets_lengths[0]]
train_set_1 = train_data[training_sets_lengths[0]+1:training_sets_lengths[0] + training_sets_lengths[1]+1]
train_set_2 = train_data[training_sets_lengths[0] + training_sets_lengths[1] + 1: ]

if TRAINING_SET_SPLIT == "FULL":
    pass
elif TRAINING_SET_SPLIT == 0:
    train_data = train_set_0
elif TRAINING_SET_SPLIT == 1:
    train_data = train_set_1
elif TRAINING_SET_SPLIT == 2:
    train_data = train_set_2


In [14]:
for i in range(len(train_data)):
    print(train_data[i].smiles)

CCS(=O)(=O)NC1CCC(CCN2CCC(c3coc4ccccc34)CC2)CC1
O=C1c2ccc(F)cc2C(=O)N1CCCCN1CCN(c2cccc3sccc23)CC1
Clc1cccc(N2CCN(CCCOc3ccc4cn[nH]c4c3)CC2)c1Cl
COCCn1ncc(NC(=O)c2nc(C3CC3)cnc2Nc2cncnc2)c1C(=O)N(C)C
Fc1ccc2cccc(N3CCN(CCCOc4ccc5c(c4)CNC5)CC3)c2c1
CCCN(CCC)C1CCc2ccc3[nH]c(C(N)=O)cc3c2C1
COc1ccccc1N1CCN(Cc2cn(-c3ccc(OCCCOCCF)cc3)nn2)CC1
COc1c(-c2cc(-c3cccnc3)nn2-c2ccc(S(N)(=O)=O)cc2)c(O)cc2occc12
COc1cccc(N2CCN(CCCCNC(=O)c3ccc(-n4ccnc4)cc3)CC2)c1
COC1C(OC(=O)NC(=O)CCl)CCC2(CO2)C1C1(C)OC1CC=C(C)C
O=C1NCc2ccc(OCCCN3CCN(c4cccc5ccccc45)CC3)cc21
Cc1c(S(N)(=O)=O)no[n+]1[O-]
COc1cc(OC)c(-c2cn3ccc(N(C)C)cc3n2)cc1Cl
Cc1nc2n(c(=O)c1CCN1CCN(c3cccc4sccc34)CC1)CCC=C2
CN1C(=O)CCc2c(NC(=O)NC3CCC(c4ccccc4F)C3)cccc21
CN(C)C(=O)C1CNC(C(=O)N2CCCC2C#N)C1
CN1CCc2c1nc(N1CCC3(CC1)C(=O)N(CCO)c1cc(F)cc(F)c13)[nH]c2=O
CC(C(=O)O)c1ccc(=O)n(C(F)F)c1
C=CC(=O)NC1CN(S(=O)(=O)c2ccc(NC(=O)OCc3ccccc3)cc2)C1
CCN(CC)CC(C)NC(=O)c1ccc(-c2noc(C(F)(F)F)n2)cc1
COc1cc(CCn2cnc3sc4c(c3c2=O)CCC(N2CCSC2)C4)cc(OC)c1
O=C1CC2(CCCC2)CC(=O)

In [15]:
batch_size= 32
train_loader = DataLoader(train_data, batch_size=batch_size)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

## GCN Definition

In [16]:
from torch_geometric.nn import GCNConv, Linear
from torch_geometric.nn import global_mean_pool
import torch.nn.functional as F

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(chembl_dataset.node_feature_dim, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.conv4 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        x = x.relu()
        x = self.conv4(x, edge_index)
        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

model = GCN(hidden_channels=256).to(device)
print(model)

GCN(
  (conv1): GCNConv(69, 256)
  (conv2): GCNConv(256, 256)
  (conv3): GCNConv(256, 256)
  (conv4): GCNConv(256, 256)
  (lin): Linear(256, 2, bias=True)
)


## Training Phase

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(hidden_channels=256).to(device)

lr = lr=1e-3

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
epochs = 100
criterion = torch.nn.CrossEntropyLoss() #Note that this case is equivalent to the combination of LogSoftmax torch.nn.NLLLoss.

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         data = data.to(device)
         
         out = model(data.x, data.edge_index, data.batch)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(epochs):
    train()
    train_acc = test(train_loader)
    val_acc = test(val_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')
test_acc = test(test_loader)    
print(f'Test Acc: {test_acc:.4f}')

Epoch: 000, Train Acc: 0.8498, Val Acc: 0.8177
Epoch: 001, Train Acc: 0.8883, Val Acc: 0.8765
Epoch: 002, Train Acc: 0.9128, Val Acc: 0.8957
Epoch: 003, Train Acc: 0.9201, Val Acc: 0.8945
Epoch: 004, Train Acc: 0.9237, Val Acc: 0.8957
Epoch: 005, Train Acc: 0.9292, Val Acc: 0.9029
Epoch: 006, Train Acc: 0.9351, Val Acc: 0.9077
Epoch: 007, Train Acc: 0.9410, Val Acc: 0.9077
Epoch: 008, Train Acc: 0.9496, Val Acc: 0.9149
Epoch: 009, Train Acc: 0.9528, Val Acc: 0.9209
Epoch: 010, Train Acc: 0.9532, Val Acc: 0.9209
Epoch: 011, Train Acc: 0.9578, Val Acc: 0.9257
Epoch: 012, Train Acc: 0.9587, Val Acc: 0.9305
Epoch: 013, Train Acc: 0.9564, Val Acc: 0.9269
Epoch: 014, Train Acc: 0.9650, Val Acc: 0.9353
Epoch: 015, Train Acc: 0.9628, Val Acc: 0.9376
Epoch: 016, Train Acc: 0.9646, Val Acc: 0.9376
Epoch: 017, Train Acc: 0.9669, Val Acc: 0.9341
Epoch: 018, Train Acc: 0.9669, Val Acc: 0.9293
Epoch: 019, Train Acc: 0.9650, Val Acc: 0.9281
Epoch: 020, Train Acc: 0.9664, Val Acc: 0.9269
Epoch: 021, T

## Save the Model

In [18]:
import os

MODEL_NAME = "../models/PyG/" + DATASET_NAME + "_training_set_" + str(TRAINING_SET_SPLIT) + "_model_%s.ckpt"
num_file = 0
while os.path.exists(MODEL_NAME % num_file):
    num_file += 1

MODEL_PATH = MODEL_NAME % num_file

#MODEL_PATH = "../models/PyG/" + DATASET_NAME + "_model.ckpt"
torch.save(model.state_dict(), MODEL_PATH)