In [2]:
import torch
import torch.nn as nn
from torch.optim import Adam

from torch_geometric.data import Data, InMemoryDataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool

from rdkit import Chem

# Using ESOL data

import pandas as pd
data_train = pd.read_csv('train_data.csv')
data_train = data_train[['smiles','measured log solubility in mols per litre']]
data_train.columns = ['smiles','solubility']

data_test = pd.read_csv('test_data.csv')
data_test = data_test[['smiles','measured log solubility in mols per litre']]
data_test.columns = ['smiles','solubility']

data_val= pd.read_csv('valid_data.csv')
data_val = data_val[['smiles','measured log solubility in mols per litre']]
data_val.columns = ['smiles','solubility']

In [3]:
def to_graph(smiles, solubility):
    
    mol = Chem.MolFromSmiles(smiles)
    atoms = mol.GetAtoms()
    
    # atoms - nodes
    # node feature vector, just a single feature atomic number
    atm_numbers = [atom.GetAtomicNum() for atom in atoms]
    atomic_numbers = torch.tensor(atm_numbers, dtype=torch.long).unsqueeze(1)
    
    # bonds - edges
    edge_index = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_index += [[i, j], [j, i]]
        
    edge_index = torch.tensor(edge_index, dtype=torch.long).t()
    sol_target = torch.tensor(solubility, dtype=torch.float)
    
    return Data(x=atomic_numbers, edge_index=edge_index, y=sol_target)



In [4]:
class ESOLData(InMemoryDataset):
    
    def __init__(self, data):
        super().__init__()
        self.data_list = [to_graph(j.smiles, j.solubility) for i,j in data.iterrows()]
    
    def len(self):
        return len(self.data_list)
    
    def get(self, idx):
        return self.data_list[idx]

train_data = ESOLData(data_train)
test_data = ESOLData(data_test)
val_data = ESOLData(data_val)

# load data to create mini batches
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)
val_loader = DataLoader(val_data, batch_size=32)



In [5]:
class GNNModel(nn.Module):
    
    def __init__(self, hidden_layer=64):
        # super(GNNModel, self).__init__(hidden_layer)
        super().__init__()
        
        self.conv1 = GCNConv(1, hidden_layer)
        self.conv2 = GCNConv(hidden_layer, hidden_layer)
        # self.conv3 = GCNConv(hidden_layer, hidden_layer)
        self.lin = nn.Linear(hidden_layer,1)
    
    def forward(self, data):
        x, edge_index, batch = data.x.float(), data.edge_index, data.batch
        
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.relu(self.conv2(x, edge_index))
        # x = torch.relu(self.conv3(x, edge_index))
        
        x = global_mean_pool(x, batch)
        x = self.lin(x)
        
        return x

In [6]:
model = GNNModel()
opti = Adam(model.parameters(), lr=0.0005)
loss_fx = nn.MSELoss()

def train():
    model.train()
    total_loss = 0
    for data in train_loader:
        opti.zero_grad()
        out = model(data)
        loss = loss_fx(out, data.y)
        loss.backward()
        opti.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def test():
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data in test_loader:
            out = model(data)
            loss = loss_fx(out, data.y)
            total_loss += loss.item()
    return total_loss / len(test_loader)

In [7]:
epochs = 100
for epoch in range(1, epochs + 1):
    train_loss = train()
    test_loss = test()
    
    print(f"Epoch {epoch:03d} | Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 001 | Train Loss: 11.2758 | Test Loss: 14.1498
Epoch 002 | Train Loss: 7.3335 | Test Loss: 9.6821
Epoch 003 | Train Loss: 5.4856 | Test Loss: 6.9601
Epoch 004 | Train Loss: 4.9911 | Test Loss: 6.4547
Epoch 005 | Train Loss: 5.0145 | Test Loss: 6.4956
Epoch 006 | Train Loss: 5.0098 | Test Loss: 6.3579
Epoch 007 | Train Loss: 5.1782 | Test Loss: 6.4147
Epoch 008 | Train Loss: 4.9459 | Test Loss: 6.6341
Epoch 009 | Train Loss: 4.9192 | Test Loss: 6.4864
Epoch 010 | Train Loss: 4.8949 | Test Loss: 6.5615
Epoch 011 | Train Loss: 4.9028 | Test Loss: 6.4182
Epoch 012 | Train Loss: 4.8977 | Test Loss: 6.6631
Epoch 013 | Train Loss: 5.2108 | Test Loss: 6.3807
Epoch 014 | Train Loss: 4.8535 | Test Loss: 6.4427
Epoch 015 | Train Loss: 5.0982 | Test Loss: 6.4421
Epoch 016 | Train Loss: 5.0502 | Test Loss: 6.6380
Epoch 017 | Train Loss: 5.0587 | Test Loss: 6.4442
Epoch 018 | Train Loss: 4.8772 | Test Loss: 6.3088
Epoch 019 | Train Loss: 5.1883 | Test Loss: 6.6644
Epoch 020 | Train Loss: 4.883

In [28]:
# Validate first 2 rows of validation set
val_test = data_val.head(2)
print("Experimental Data ->",[a.solubility for i,a in val_test.iterrows()])
val_test1 = ESOLData(val_test)
val_test2 = DataLoader(val_test1)
print("Predicted data")
for data in val_test2:
    out = model(data)
    print(out)


Experimental Data -> [-2.12, -3.401]
Predicted data
tensor([[-2.8470]], grad_fn=<AddmmBackward0>)
tensor([[-2.8418]], grad_fn=<AddmmBackward0>)
