In [3]:
! pip install -i  https://pypi.tuna.tsinghua.edu.cn/simple rdkit-pypi

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting rdkit-pypi
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/2c/c0/d049fb64f3d8b0410bba8d8754ade7ea7a3b234199976d08e865ab00a26e/rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [16]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from rdkit import Chem
from rdkit.Chem import AllChem

In [17]:
def smile2graph(smiles, target):
    mol = Chem.MolFromSmiles(smiles)
    AllChem.Compute2DCoords(mol)
    atom_features = torch.tensor(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=256), dtype=torch.float32).view(-1, 1)
    adj = torch.tensor(Chem.GetAdjacencyMatrix(mol), dtype=torch.float32)
    x = atom_features
    edge_index = torch.nonzero(adj, as_tuple=False).t().contiguous()
    y = torch.tensor([target], dtype=torch.float32)
    return Data(x=x, edge_index=edge_index, y=y)

In [8]:
import pandas as pd
df = pd.read_csv("/home/sardorbek/MyResearch/data_prep/data/qm9.csv")
df.head()

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,zpve,u0,u298,h298,g298,cv,u0_atom,u298_atom,h298_atom,g298_atom
0,gdb_1,C,157.7118,157.70997,157.70699,0.0,13.21,-0.3877,0.1171,0.5048,...,0.044749,-40.47893,-40.476062,-40.475117,-40.498597,6.469,-395.999595,-398.64329,-401.014647,-372.471772
1,gdb_2,N,293.60975,293.54111,191.39397,1.6256,9.46,-0.257,0.0829,0.3399,...,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316,-276.861363,-278.620271,-280.399259,-259.338802
2,gdb_3,O,799.58812,437.90386,282.94545,1.8511,6.31,-0.2928,0.0687,0.3615,...,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002,-213.087624,-213.974294,-215.159658,-201.407171
3,gdb_4,C#C,0.0,35.610036,35.610036,0.0,16.28,-0.2845,0.0506,0.3351,...,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574,-385.501997,-387.237686,-389.016047,-365.800724
4,gdb_5,C#N,0.0,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,...,0.016601,-93.411888,-93.40937,-93.408425,-93.431246,6.278,-301.820534,-302.906752,-304.091489,-288.720028


In [13]:
dataset = df[["smiles", "mu"]]
dataset = dataset[:100]
dataset.shape

(100, 2)

In [19]:
garph_data_list = [smile2graph(smiles, target) for smiles, target in dataset.values]
garph_data_list[0]

Data(x=[256, 1], edge_index=[2, 0], y=[1])

In [20]:
train_data, test_data = train_test_split(garph_data_list, test_size=0.2, random_state=42)
# dataloader for batching and shuffling
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=True)



In [21]:
# GNN model 

class GNNModel(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.conv1 = GCNConv(1, 64)
        self.conv2 = GCNConv(64, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x
    
model = GNNModel()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
epochs = 50

for e in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = F.mse_loss(out, batch.y)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        test_loss = 0
        for batch in test_loader:
            out = model(batch)
            test_loss += F.mse_loss(out, batch.y).item()
            test_loss /= len(test_loader)
    print(f'Epoch {e + 1}/{epochs}, Test loss: {test_loss: .4f}')