In [96]:
import pandas as pd
import numpy as np
import torch
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GINConv
from torch.nn import Linear, Sequential, BatchNorm1d, ReLU
import torch.nn.functional as Fun

from torch_geometric.datasets import QM9

import math

In [86]:
qm9 = QM9(root="/home/sardorbek/MyResearch/data_prep/data/QM9")
qm9.data

Data(x=[2359210, 11], edge_index=[2, 4883516], edge_attr=[4883516, 4], y=[130831, 19], pos=[2359210, 3], z=[2359210], smiles=[130831], name=[130831], idx=[130831])

# Data split

In [87]:
y_target = pd.DataFrame(qm9.data.y.numpy())
y_target = y_target[3]

In [82]:
qm9.data.y = torch.Tensor(y_target)

In [83]:
qm9 = qm9.shuffle()

# data split
data_size = 3000
train_index = int(data_size*0.8)
test_index = train_index + int(data_size*0.1)
val_index = test_index + int(data_size*0.1)

# normalizing the data
data_mean = qm9.data.y[0:train_index].mean()
data_std = qm9.data.y[0:train_index].std()

qm9.data.y = (qm9.data.y - data_mean)/data_std

# dataset into DataLoader
train_loader = DataLoader(qm9[0:train_index], batch_size=64, shuffle=True)
test_loader = DataLoader(qm9[train_index:test_index], batch_size=64, shuffle=True)
val_loader = DataLoader(qm9[test_index:val_index], batch_size=64, shuffle=True)




In [39]:
# qm9.data.y[:, 3] [this is homo energy target]

In [88]:
qm9.data

Data(x=[2359210, 11], edge_index=[2, 4883516], edge_attr=[4883516, 4], y=[130831, 19], pos=[2359210, 3], z=[2359210], smiles=[130831], name=[130831], idx=[130831])

# model [GNN] architecture

In [92]:
class HOMOnet(torch.nn.Module):
    def __init__(self, dim_h):
        super(HOMOnet, self).__init__()
        self.conv1 = GINConv(
            Sequential(
                Linear(11, dim_h), BatchNorm1d(dim_h), ReLU(), Linear(dim_h, dim_h), ReLU()
            )
        )
        self.conv2 = GINConv(
            Sequential(
                Linear(dim_h, dim_h), BatchNorm1d(dim_h), ReLU(), Linear(dim_h, dim_h), ReLU()
            )
        )
        self.conv3 = GINConv(
            Sequential(
                Linear(dim_h, dim_h), BatchNorm1d(dim_h), ReLU(), Linear(dim_h, dim_h), ReLU()
            )
        )
        self.lin1 = Linear(dim_h, dim_h)
        self.lin2 = Linear(dim_h, 1)

    def forward(self, data):
        x = data.x
        edge_index= data.edge_index
        batch = data.batch

        # Node Embaddings
        h = self.lin1()
        h = h.relu()
        h = Fun.dropout(h, p=0.5, training=self.training)
        h = self.lin2(h)

        return h


# training 

In [93]:
def training(loader, model, loss, optimizer):
    """Training one epoch

    Args:
        loader(DataLoader): training data devidev into batchs
        model(nn.Module): GNN model to train on
        loss: loss function to use during training
        optimizer(torch.optim): optimizer for training

    Returns:
        training loss
    """
    model.train()

    current_loss = 0
    for d in loader:
        optimizer.zero_grad()
        d.x = d.x.float()

        out = model(d)

        l = loss(out, torch.reshape(d.y, (len(d.y), 1)))
        current_loss += 1/len(loader)
        l.backward()
        optimizer.step()
    return current_loss, model

In [94]:
# validation
def validation(loader, model, loss):
    model.eval()
    val_loss = 0
    for d in loader:
        out = model(d)
        l = loss(out, torch.reshape(d.y, (len(d.y), 1)))
        val_loss += 1/len(loader)
    return val_loss

In [95]:
@torch.no_grad()

def testing(loader, model):
    loss = torch.nn.MSELoss()
    test_loss = 0
    test_target = np.empty((0))
    test_y_target = np.empty((0))
    for d in loader:
        out = model(d)
        l = loss(out, torch.reshape(d.y, (len(d.y), 1)))
        test_loss += 1/len(loader)

        # save prediction vs ground_truth values for plotting
        test_target = np.concatenate((test_target, out.detach().numpy()[:, 0]))
        test_y_target = np.concatenate((test_y_target, d.y.detach().numpy()))
    return test_loss, test_target, test_y_target

In [97]:
def train_epochs(epochs, model, train_loader, val_loader, path):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, weight_decay=5e-4)
    loss = torch.nn.MSELoss()

    train_target = np.empty((0))
    train_y_target = np.empty((0))
    train_loss = np.empty(epochs)
    val_loss = np.empty(epochs)
    best_loss = math.inf

    for e in range(epochs):
        epoch_loss, model = training(train_loader, model, loss, optimizer)
        v_loss = validation(val_loss, model, loss)
        if v_loss < best_loss:
            torch.save(model.state_dict(), path)
        for d in train_loader:
            out = model(d)
            if e == epochs - 1:
                # record truly vs predicted value
                train_target = np.concatenate((train_target, out.detach().numpy()[:, 0]))
                train_y_target = np.concatenate((train_y_target, d.y.detach().numpy()))
        
        train_loss[epochs] = epoch_loss.detach().numpy()
        val_loss[epochs] = v_loss.detach().numpy()

        # print current train and val loss
        if e % 2 == 0:
            print(
                "Epoch: ",
                str(e)
                + ", Train Loss: "
                + str(epoch_loss.item())
                + ", Val Loss: "
                + str(v_loss.item())
            )
    return train_loss, val_loss, train_target, train_y_target

In [100]:
model = HOMOnet(dim_h=64)
model

HOMOnet(
  (conv1): GINConv(nn=Sequential(
    (0): Linear(in_features=11, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=64, bias=True)
    (4): ReLU()
  ))
  (conv2): GINConv(nn=Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=64, bias=True)
    (4): ReLU()
  ))
  (conv3): GINConv(nn=Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=64, bias=True)
    (4): ReLU()
  ))
  (lin1): Linear(in_features=64, out_features=64, bias=True)
  (lin2): Linear(in_features=64, out_features=1, bias=True)
)

# Epoch Train

In [101]:
train_loss, val_loss, train_target, train_y_target = train_epochs(
    epochs=10, model= model, train_loader=train_loader, val_loader=test_loader, path="/home/sardorbek/MyResearch/TainHOMOnet"
)

TypeError: Linear.forward() missing 1 required positional argument: 'input'