<a href="https://colab.research.google.com/github/FlyingWorkshop/AdaptiveThresholding/blob/main/toy_gnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

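Task: implement a GNN to predict one (invariant) molecular property (e.g. molecular weight) of the molecules in the QM9 dataset. This notebook uses `e_homo`, the energy of the highest occupied molecular orbital, as the prediction target.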

In [None]:
!pip install torch_geometric
!pip install rdkit



In [None]:
# Load the QM9 molecular dataset through PyTorch Geometric.
# On first use the raw files are downloaded and extracted under the root directory.
# Documentation: https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.QM9.html?highlight=qm9#torch_geometric.datasets.QM9

from torch_geometric.datasets import QM9

qm9_root = './data/QM9/'
dataset = QM9(root=qm9_root)

Downloading https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/molnet_publish/qm9.zip
Extracting data/QM9/raw/qm9.zip
Downloading https://ndownloader.figshare.com/files/3195404


In [None]:
# Inspect a single example (one molecule stored as a graph) to learn the data layout.

example = dataset[7]
print(f"{example['name']=}")
print(f"{example.num_node_features=}")
print(f"{example.x.shape=}") # x is (num_nodes, num_node_features): one feature row per atom
print(f"{example.y.shape=}") # y is (1, 19): a single row holding all 19 targets of this graph
# Dump the raw attribute dictionary to see every stored tensor (x, edge_index, edge_attr, y, ...).
example.__dict__

example['name']='gdb_8'
example.num_node_features=11
example.x.shape=torch.Size([6, 11])
example.y.shape=torch.Size([1, 19])


{'_edge_attr_cls': torch_geometric.data.data.DataEdgeAttr,
 '_tensor_attr_cls': torch_geometric.data.data.DataTensorAttr,
 '_store': {'x': tensor([[0., 1., 0., 0., 0., 6., 0., 0., 0., 0., 3.],
         [0., 0., 0., 1., 0., 8., 0., 0., 0., 0., 1.],
         [1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]]), 'edge_index': tensor([[0, 0, 0, 0, 1, 1, 2, 3, 4, 5],
         [1, 2, 3, 4, 0, 5, 0, 0, 0, 1]]), 'edge_attr': tensor([[1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.]]), 'y': tensor([[ 1.5258e+00,  1.6970e+01, -7.2192e+00,  2.1334e+00,  9.3526e+00,
           8.3794e+01,  1.3934e+00, -3.1478e+03, -3.1477e+03, -3.1477e

In [None]:
# Prediction target: e_homo (energy of the highest occupied molecular orbital).
# It is read from column `target_idx` of `y`; as a sanity check, pull that
# column out of the example inspected above and display it.

target_idx = 2
example_target = example.y[:, target_idx]
example_target

tensor([-10.5499])

In [None]:
# With the target feature chosen, check how many examples the whole dataset holds
# before splitting it into mini-batches.

print(f"{len(dataset)=}")

len(dataset)=130831


In [None]:
# Build one mini-batch to see how PyG collates several graphs into a single object.

from torch_geometric.loader import DataLoader

batch_size = 32

loader = DataLoader(dataset, batch_size=batch_size)

# Only the first mini-batch is needed for inspection.
batch = next(iter(loader))
x = batch.x                 # node features of every graph in the batch, stacked row-wise
y = batch.y[:, target_idx]  # one target value per graph

print(batch)
print(x.shape)
print(y.shape)

DataBatch(x=[230, 11], edge_index=[2, 400], edge_attr=[400, 4], y=[32, 19], pos=[230, 3], idx=[32], name=[32], z=[230], batch=[230], ptr=[33])
torch.Size([230, 11])
torch.Size([32])


In [None]:
# create a GCN w/ PyG

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network producing one output vector per node.

    The output of ``forward`` has one row per node, so graph-level targets
    require an additional readout (pooling over the nodes of each graph)
    applied to the result.

    Args:
        in_channels: Number of input features per node. ``None`` (the default)
            falls back to ``dataset.num_node_features``, matching the
            previously hard-coded behavior.
        hidden_channels: Width of the hidden layer. Defaults to 16.
        out_channels: Number of outputs per node. Defaults to 1.
        dropout: Dropout probability applied after the hidden layer while
            the module is in training mode. Defaults to 0.5, which is the
            default of ``F.dropout``.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=1, dropout=0.5):
        super().__init__()
        if in_channels is None:
            # Keep `GCN()` working exactly as before: size the input layer
            # from the notebook-level dataset.
            in_channels = dataset.num_node_features
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Run both graph convolutions on ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``
                (graph connectivity).

        Returns:
            Tensor with one row per node and ``out_channels`` columns.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active in training mode (`model.train()`).
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)

        return x

In [None]:
# Train the GCN to regress one graph-level target (column `target_idx` of `y`).
#
# Notes on what this cell has to take care of:
#   * QM9 graphs carry no `train_mask` (that attribute belongs to node-classification
#     datasets), so we iterate over mini-batches of whole graphs instead.
#   * The target is a continuous value, so the loss is mean squared error rather
#     than the classification loss `nll_loss`.
#   * The model emits one value per node; the values are averaged per graph to
#     obtain a single prediction per molecule.
from torch_geometric.loader import DataLoader
from torch_geometric.nn import global_mean_pool

torch.manual_seed(0)  # reproducible weight initialisation and shuffling

num_epochs = 5  # each epoch is a full pass over the dataset; raise for a better fit

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        node_out = model(batch)  # one row per node
        # `batch.batch` maps every node to its graph; average the node outputs per graph.
        pred = global_mean_pool(node_out, batch.batch).squeeze(-1)
        target = batch.y[:, target_idx]
        loss = F.mse_loss(pred, target)
        loss.backward()
        optimizer.step()
        # Weight by graph count so the epoch average is exact even for a short last batch.
        running_loss += loss.item() * batch.num_graphs
    print(f"epoch {epoch + 1:02d}: train MSE = {running_loss / len(dataset):.4f}")

In [None]:
# Train the GCN to regress one graph-level target (column `target_idx` of `y`).
#
# Notes on what this cell has to take care of:
#   * QM9 graphs carry no `train_mask` (that attribute belongs to node-classification
#     datasets), so we iterate over mini-batches of whole graphs instead.
#   * The target is a continuous value, so the loss is mean squared error rather
#     than the classification loss `nll_loss`.
#   * The model emits one value per node; the values are averaged per graph to
#     obtain a single prediction per molecule.
from torch_geometric.loader import DataLoader
from torch_geometric.nn import global_mean_pool

torch.manual_seed(0)  # reproducible weight initialisation and shuffling

num_epochs = 5  # each epoch is a full pass over the dataset; raise for a better fit

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        node_out = model(batch)  # one row per node
        # `batch.batch` maps every node to its graph; average the node outputs per graph.
        pred = global_mean_pool(node_out, batch.batch).squeeze(-1)
        target = batch.y[:, target_idx]
        loss = F.mse_loss(pred, target)
        loss.backward()
        optimizer.step()
        # Weight by graph count so the epoch average is exact even for a short last batch.
        running_loss += loss.item() * batch.num_graphs
    print(f"epoch {epoch + 1:02d}: train MSE = {running_loss / len(dataset):.4f}")

AttributeError: ignored