In [1]:
import rdkit
from torch_geometric.datasets import MoleculeNet
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from torch_geometric.data import DataLoader
import warnings
warnings.filterwarnings("ignore")

#Load the dataset
def LoadData():
    data = MoleculeNet(root=".", name="ESOL")
    molecule = Chem.MolFromSmiles(data[0]["smiles"])
    return data, molecule

def WrapData_toLoader(data,
                      data_size,
                      NUM_GRAPHS_PER_BATCH=64,
                      split_ratio=0.8):
    
    loader = DataLoader(data[:int(data_size *split_ratio)],
                        batch_size = NUM_GRAPHS_PER_BATCH, shuffle=True)
    test_loader = DataLoader(data[int(data_size*split_ratio):],
                            batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)
    return loader, test_loader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data, molecule = LoadData()
MoleculeNet(root=".", name="ESOL").num_features

9

In [2]:
import torch
from torch.nn import Linear
import torch.nn.functional as F 
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp



embedding_size = 64

class GCN(torch.nn.Module):
    def __init__(self):
        # Init parent
        super(GCN, self).__init__()
        torch.manual_seed(42)


        # GCN layers
        self.initial_conv = GCNConv(data.num_features, embedding_size)
        self.conv1 = GCNConv(embedding_size, embedding_size)
        self.conv2 = GCNConv(embedding_size, embedding_size)
        self.conv3 = GCNConv(embedding_size, embedding_size)

        # Output layer
        self.out = Linear(embedding_size*2, 1)

    def forward(self, x, edge_index, batch_index):
        # First Conv layer
        hidden = self.initial_conv(x, edge_index)
        hidden = F.tanh(hidden)

        # Other Conv layers
        hidden = self.conv1(hidden, edge_index)
        hidden = F.tanh(hidden)
        hidden = self.conv2(hidden, edge_index)
        hidden = F.tanh(hidden)
        hidden = self.conv3(hidden, edge_index)
        hidden = F.tanh(hidden)
          
        # Global Pooling (stack different aggregations)
        # 
        hidden = torch.cat([gmp(hidden, batch_index), 
                            gap(hidden, batch_index)], dim=1)

        # Apply a final (linear) classifier.
        out = self.out(hidden)

        return out, hidden

Downloading https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv
Processing...
Done!


In [3]:
model = GCN()

#Define Loss Function and Optimizer to be used
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0007)

# Use GPU for training
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#Load the Data
data_size = len(data)
loader, test_loader = WrapData_toLoader(data, data_size)

#Define function the train the Data

def train(data):
    #Enumerate through each batch
    for batch in loader:
        batch.to(device)  #Use GPU
        optimizer.zero_grad() #Reset the gradients for each batch
        #Passing node features and connection info
        pred, embedding = model(batch.x.float(), batch.edge_index, batch.batch)
        #Calculating the loss and gradients
        loss = loss_fn(pred, batch.y)
        loss.backward()
        #Update using the gradients
        optimizer.step()
    return loss, embedding

In [4]:
type(data)

torch_geometric.datasets.molecule_net.MoleculeNet

In [4]:
# Training the data
num_epochs = 1000
print("Start Training......")
losses = []
for epoch in range(num_epochs):
    loss,h = train(data)
    losses.append(loss)
    print(f"Epoch {epoch+1} / {num_epochs}  |  Train Loss: {loss}")

Start Training......
Epoch 1 / 1000  |  Train Loss: 11.665949821472168
Epoch 2 / 1000  |  Train Loss: 10.07497501373291
Epoch 3 / 1000  |  Train Loss: 2.6209933757781982
Epoch 4 / 1000  |  Train Loss: 4.1252760887146
Epoch 5 / 1000  |  Train Loss: 1.7706319093704224
Epoch 6 / 1000  |  Train Loss: 2.27337908744812
Epoch 7 / 1000  |  Train Loss: 2.6324708461761475
Epoch 8 / 1000  |  Train Loss: 2.713014602661133
Epoch 9 / 1000  |  Train Loss: 6.059814453125
Epoch 10 / 1000  |  Train Loss: 2.9248058795928955
Epoch 11 / 1000  |  Train Loss: 1.592408537864685
Epoch 12 / 1000  |  Train Loss: 8.007978439331055
Epoch 13 / 1000  |  Train Loss: 0.7166104316711426
Epoch 14 / 1000  |  Train Loss: 7.643935680389404
Epoch 15 / 1000  |  Train Loss: 4.460394859313965
Epoch 16 / 1000  |  Train Loss: 6.1365203857421875
Epoch 17 / 1000  |  Train Loss: 0.8607726693153381
Epoch 18 / 1000  |  Train Loss: 0.9701941609382629
Epoch 19 / 1000  |  Train Loss: 11.737007141113281
Epoch 20 / 1000  |  Train Loss: 2.

KeyboardInterrupt: 

In [None]:
print(losses)

In [None]:
#Visualizing the training loss
import seaborn as sns
import matplotlib.pyplot as plt
losses_float = [float(loss.cpu().detach().numpy()) for loss in losses] 
loss_indices = [i for i,l in enumerate(losses_float)] 


fig = plt.figure(figsize=(18, 8))
plt.plot(loss_indices, losses_float)
plt.xlabel('Batch Number')
plt.ylabel('Loss')
plt.show()

In [None]:
#Getting a prediction

import pandas as pd

results = pd.DataFrame(columns=["y_real", "y_pred"]) # create an empty DataFrame to store results
with torch.no_grad():
    for test_batch in test_loader: # iterate over all batches in test_loader
        test_batch.to(device)
        pred, embed = model(test_batch.x.float(), test_batch.edge_index, test_batch.batch)
        df = pd.DataFrame()
        df["y_real"] = test_batch.y.tolist()
        df["y_pred"] = pred.tolist()
        df["y_real"] = df["y_real"].apply(lambda row: row[0])
        df["y_pred"] = df["y_pred"].apply(lambda row: row[0])
        results = pd.concat([results, df]) # concatenate the results into a single DataFrame
results.reset_index(drop=True, inplace=True) # reset the index of the DataFrame

In [None]:
results["Diff"] = results["y_real"]-results["y_pred"]
results["Diff Sq"] = results["Diff"].apply(lambda x: x*x)
test_mse = results["Diff Sq"].mean()
print(f"Test MSE Error is {test_mse}")

In [None]:
import numpy as np

plt.scatter(results.loc[:,"y_real"],results.loc[:,"y_pred"],marker ='.')
plt.xlabel("Actual Data")
plt.ylabel("Predicted Data")
line = np.linspace(1.2*results["y_real"].min(),1.2*results["y_real"].max())
plt.plot(line,line,'-r')
plt.show
