In [62]:
import typing as t

import deepchem as dc
import lightning as L
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch_geometric
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.Draw import IPythonConsole
from sklearn.metrics import r2_score as sklearn_r2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.nn import BatchNorm1d, HuberLoss, L1Loss, Linear, ModuleList, MSELoss
from torch_geometric.loader import DataLoader
from torch_geometric.nn import (
    GATv2Conv,
    GCN2Conv,
    GraphNorm,
    TopKPooling,
    global_max_pool,
    global_mean_pool,
    summary,
)
from torcheval.metrics import R2Score
from torcheval.metrics.functional import r2_score
from twinning import twin


In [14]:
def _generate_graph_list(df: pd.DataFrame):
    data_list = []
    for _, row in df.iterrows():
        smiles = row["smiles"]
        label = row["MP"]
        featurizer = dc.feat.MolGraphConvFeaturizer(
            use_edges=True, use_partial_charge=True
        )
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            try:
                f = featurizer._featurize(mol)
                graph = f.to_pyg_graph()
                graph.y = float(label)
                graph.smiles = smiles
                data_list.append(graph)
            except IndexError:
                pass
    return data_list


# Nodes


def generate_graph_loader(df: pd.DataFrame):
    data_list = _generate_graph_list(df)
    graph_loader = DataLoader(
        data_list,
        batch_size=32,
        shuffle=True,
        drop_last=True
    )
    return graph_loader

In [21]:
dataloder = generate_graph_loader(df)

[15:10:07] Explicit valence for atom # 25 H, 2, is greater than permitted
[15:10:07] Explicit valence for atom # 25 H, 2, is greater than permitted
[15:10:07] Explicit valence for atom # 25 H, 2, is greater than permitted


In [53]:
# # class nn.module
# class GNN(torch.nn.Module):
#     def __init__(self, model_parameters):
#         # Loading params
#         super().__init__()
#         # hidden_size = model_parameters["hidden_size"]
#         # n_heads = model_parameters["n_heads"]
#         # self.n_layers = model_parameters["n_layers"]
#         # dropout_rate = model_parameters["dropout_rate"]
#         # top_k_ratio = model_parameters["top_k_ratio"]
#         # dense_size = model_parameters["dense_size"]
#         hidden_size = 16
#         n_heads = 3
#         self.n_layers = 0
#         dropout_rate = 0.2
#         top_k_ratio = 0.5
#         dense_size = 8
#         # Module lists
#         self.conv_layers = ModuleList([])
#         self.transf_layers = ModuleList([])
#         self.pooling_layers = ModuleList([])
#         self.gn_layers = ModuleList([])
#         # Initial aggregation layer
#         self.conv1 = GATv2Conv(
#             in_channels=31,
#             out_channels=hidden_size,
#             heads=n_heads,
#             dropout=dropout_rate,
#             edge_dim=11,
#         )
#         self.transf1 = Linear(hidden_size * 3, hidden_size)
#         self.gn1 = GraphNorm(hidden_size)
#         self.pooling_layer1 = TopKPooling(hidden_size, ratio=top_k_ratio)
#         # Internal layers
#         for _ in range(self.n_layers):
#             self.conv_layers.append(
#                 GATv2Conv(
#                     in_channels=hidden_size,
#                     out_channels=hidden_size,
#                     heads=n_heads,
#                     dropout=dropout_rate,
#                     edge_dim=11,
#                 )
#             )
#             self.transf_layers.append(Linear(hidden_size * n_heads, hidden_size))
#             self.gn_layers.append(GraphNorm(hidden_size))
#             self.pooling_layers.append(TopKPooling(hidden_size, ratio=top_k_ratio))
#         # Linear layers
#         self.linear1 = Linear(2 * hidden_size, dense_size)
#         self.linear2 = Linear(dense_size, int(dense_size / 2))
#         self.linear3 = Linear(int(dense_size / 2), 1)

#     def forward(self, x, edge_attr, edge_index, batch_index):
#         global_representation = []
#         # Aggregation block
#         x = self.conv1(x, edge_index, edge_attr)
#         x = torch.relu(self.transf1(x))
#         x = self.gn1(x, batch_index)
#         # x, edge_index, edge_attr, batch_index, _, _ = self.pooling_layer1(
#         #     x, edge_index, edge_attr, batch_index
#         # )
#         global_representation.append(
#             torch.cat(
#                 [global_mean_pool(x, batch_index), global_max_pool(x, batch_index)],
#                 dim=1,
#             )
#         )
#         # Internal layers
#         for i in range(self.n_layers):
#             x = self.conv_layers[i](x, edge_index, edge_attr)
#             x = F.leaky_relu(self.transf_layers[i](x))
#             x = self.gn_layers[i](x, batch_index)
#             x, edge_index, edge_attr, batch_index, _, _ = self.pooling_layers[i](
#                 x, edge_index, edge_attr, batch_index
#             )
#             global_representation.append(
#                 torch.cat(
#                     [global_mean_pool(x, batch_index), global_max_pool(x, batch_index)],
#                     dim=1,
#                 )
#             )
#         # Output block
#         x = sum(global_representation)
#         x = torch.relu(self.linear1(x))
#         x = F.dropout(x, p=0.2, training=self.training)
#         x = torch.relu(self.linear2(x))
#         x = F.dropout(x, p=0.2, training=self.training)
#         x = self.linear3(x)
#         return x

import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.nn import GCNConv, global_mean_pool


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(31, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, 1)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

model = GCN(hidden_channels=64)

class GNN_L(L.LightningModule):
    def __init__(self, model, trainer_parameters):
        super().__init__()
        self.model = model
        self.lr = trainer_parameters["lr"]
        self.weight_decay = trainer_parameters["weight_decay"]
        self.scheduler_gamma = trainer_parameters["scheduler_gamma"]
        self.loss_fn = MSELoss()

    def forward(self, x, edge_index, batch_index):
        return self.model(
            x.float(), edge_index, batch_index
        ).squeeze()

    def training_step(self, batch, batch_nb):
        preds = self(
            batch.x.float(), batch.edge_index, batch.batch
        ).squeeze()
        target = batch.y.float()
        loss = self.loss_fn(preds, target)
        r2 = r2_score(preds, target)
        self.log("r2", r2)
        self.log("loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
        scheduler = torch.optim.lr_scheduler.ExponentialLR(
            optimizer, gamma=self.scheduler_gamma
        )
        return [optimizer], [{"scheduler": scheduler, "interval": "epoch"}]


def train_model(train_dataloader, trainer_parameters):
    L.seed_everything(42)
    model = GNN_L(GCN(64), trainer_parameters)
    early_stopping = EarlyStopping("loss", patience=100)
    lr_monitor = LearningRateMonitor(logging_interval="epoch")
    trainer = L.Trainer(
        max_epochs=100,
        callbacks=[early_stopping, lr_monitor],
        log_every_n_steps=5,
        logger=True,
        deterministic=True,
        accumulate_grad_batches=1,
    )
    trainer.fit(model=model, train_dataloaders=train_dataloader)
    return model


In [54]:
trainer_parameters = {"lr": 0.001,
                      "weight_decay": 0.,
                      "scheduler_gamma": 0.99}
model_parameters = {}

In [57]:
model = train_model(dataloder, trainer_parameters)

Seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | model   | GCN     | 10.4 K | train
1 | loss_fn | MSELoss | 0      | train
--------------------------------------------
10.4 K    Trainable params
0         Non-trainable params
10.4 K    Total params
0.042     Total estimated model params size (MB)
c:\Users\01121272\Desktop\Projects\gnn-mp-model\.venv\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 99: 100%|██████████| 29/29 [00:00<00:00, 32.33it/s, v_num=15]

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 29/29 [00:00<00:00, 31.83it/s, v_num=15]


In [65]:
batch[0]

Data(x=[39, 31], edge_index=[2, 76], edge_attr=[76, 11], y=[1], smiles='CCCCCCCCCCCCCC[N+](C)(C)Cc1ccccc1.FC(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F')

In [73]:
model(batch.x.float(), batch.edge_index, batch.batch
        ).squeeze()

tensor([289.4050, 292.0450, 289.4050, 297.7302, 287.9824, 315.9386, 289.4050,
        343.5855, 344.1520, 285.9215, 312.0751, 314.0991, 354.7935, 289.4050,
        309.6705, 289.4050, 267.6820, 281.7561, 313.6778, 324.7451, 289.4050,
        285.7552, 315.2130, 369.7042, 276.5990, 312.0751, 289.4050, 289.4050,
        276.5990, 307.1325, 289.4050, 315.1769], grad_fn=<SqueezeBackward0>)

In [74]:
batch.y

tensor([288.0000, 243.2000, 287.9000, 318.2000, 275.2000, 298.2000, 253.2000,
        432.2000, 414.4000, 306.2000, 283.2000, 250.8000, 355.2000, 325.2000,
        335.0000, 314.2000, 259.7000, 342.2000, 217.6000, 304.2000, 282.2000,
        428.0000, 307.1000, 390.2000, 262.2000, 294.7000, 299.1000, 302.2000,
        285.2000, 371.2000, 260.2000, 254.9000])

In [72]:
r2_score(input=preds, target=target)

tensor(0.2251)

In [75]:
from sklearn.metrics import root_mean_squared_error

In [77]:
root_mean_squared_error(target.numpy(), preds.detach().numpy())

46.350647