In [1]:
import os, sys

sys.path.append(os.pardir)

from src.datamodules.components.dataset2d import TrainDataset, TestDataset
from src.datamodules.datamodule2d import BaseDataModule

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR

import torch_geometric.nn as gnn
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
kfold = KFold(n_splits=5, shuffle=True, random_state=123456789)
                for i, (train_idx, val_idx) in enumerate(kfold.split(self.full_data)):
                    if i == self.hparams.fold:
                        break

In [12]:
from typing import Union
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix

from tqdm import tqdm

from sklearn.model_selection import KFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR

import torch_geometric.nn as gnn
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset


from ogb.utils.features import (allowable_features, atom_to_feature_vector,
 bond_to_feature_vector, atom_feature_vector_to_dict, bond_feature_vector_to_dict) 


def mol2graph(mol):
    """
    Converts SMILES string to graph Data object
    :input: SMILES string (str)
    :return: graph object
    """

    # atoms
    atom_features_list = []
    for atom in mol.GetAtoms():
        atom_features_list.append(atom_to_feature_vector(atom))
    x = np.array(atom_features_list, dtype = np.int64)

    # bonds
    num_bond_features = 3  # bond type, bond stereo, is_conjugated
    if len(mol.GetBonds()) > 0: # mol has bonds
        edges_list = []
        edge_features_list = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()

            edge_feature = bond_to_feature_vector(bond)

            # add edges in both directions
            edges_list.append((i, j))
            edge_features_list.append(edge_feature)
            edges_list.append((j, i))
            edge_features_list.append(edge_feature)

        # data.edge_index: Graph connectivity in COO format with shape [2, num_edges]
        edge_index = np.array(edges_list, dtype = np.int64).T

        # data.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]
        edge_attr = np.array(edge_features_list, dtype = np.int64)

    else:   # mol has no bonds
        edge_index = np.empty((2, 0), dtype = np.int64)
        edge_attr = np.empty((0, num_bond_features), dtype = np.int64)

    return x, edge_attr, edge_index


def get_coordinate_features(mol):
    conf = mol.GetConformer()
    return conf.GetPositions()

def get_mol_data(root, prefix, y=None):
    if prefix.startswith("train"):
        set_dir = "train_set"
    else:
        set_dir = "test_set"
        
    ex = Chem.MolFromMolFile(f"{root}/{set_dir}/{prefix}_ex.mol", removeHs=False)
    g = Chem.MolFromMolFile(f"{root}/{set_dir}/{prefix}_g.mol", removeHs=False)
    
    # Atom features
    X, edge_attr, edge_index = mol2graph(ex)
    
    # Atom 3D coordinates
    co_ex = get_coordinate_features(ex)
    co_g = get_coordinate_features(g)
            
    X = np.concatenate([X, co_ex, co_g], axis=1)
    
    X = torch.tensor(X, dtype=torch.float)
    edge_index = torch.tensor(edge_index, dtype=torch.long)
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)
    y = torch.tensor([y], dtype=torch.float)
            
    return Data(x=X, edge_index=edge_index, edge_attr=edge_attr, y=y)
        

def get_datalist(df, root):
    data_list = []
    if "Reorg_g" in df.columns:
        for _, item in tqdm(df.iterrows()):
            y = [item.Reorg_g, item.Reorg_ex]
            data = get_mol_data(root, item[0], y)
            data_list.append(data)
    else:
        for _, item in tqdm(df.iterrows()):
            data = get_mol_data(root, item[0])
            data_list.append(data)
        
    return data_list


class TrainDataset(InMemoryDataset):
    def __init__(
        self,
        root="/data/project/danyoung/reorg/data/mol_files",
        fold: Union[int, str] = 0,
        train: bool = True,
        transform=None,
        pre_transform=None,
        pre_filter=None
    ):
        super().__init__(root, transform, pre_transform, pre_filter)
        data, slices = torch.load(self.processed_paths[0])
        self.data, self.slices = data, slices

    @property
    def raw_file_names(self):
        mol_list = os.listdir(os.path.join(self.root, "train_set"))
        mol_list = [os.path.join(self.root, "train_set", file) for file in mol_list]
            
        return mol_list

    @property
    def processed_file_names(self):
        return ["2d_dataset_train.pt"]

    def process(self):
        # Read data into huge `Data` list.
        df = pd.read_csv(f"{self.root}/../train_set.ReorgE.csv")
        data_list = get_datalist(df, self.root)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
        
    
class TestDataset(InMemoryDataset):
    def __init__(
        self,
        root="/data/project/danyoung/reorg/data/mol_files", 
        transform=None,
        pre_transform=None,
        pre_filter=None
    ):
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        mol_list = os.listdir(os.path.join(self.root, "test_set"))
        mol_list = [os.path.join(self.root, "test_set", file) for file in mol_list]
            
        return mol_list

    @property
    def processed_file_names(self):
        return ["2d_dataset_test.pt"]

    def process(self):
        # Read data into huge `Data` list.
        df = pd.read_csv(f"{self.root}/../test_set.csv")
        data_list = get_datalist(df, self.root)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [None]:
from torch_geometric.data import LightningDataset

def get_datamodule(train_data, test_data, fold=0, batch_size=32, num_workers=4):
    if type(fold) != str:
        
    

In [3]:
from torch import nn
import torch_geometric.nn as gnn


class GATNet(nn.Module):
    def __init__(
        self,
        input_dim: int = 15,
        gat_hidden_dim: int = 64,
        edge_dim: int = 3,
        heads: int = 4,
        n_gat_layers: int = 3,
        n_fc_layers: int = 3,
        fc_hidden_dim: int = 256,
        fc_dropout: float = 0.5
    ):
        super().__init__()
        self.gat1 = gnn.GATv2Conv(in_channels=input_dim, 
                                  out_channels=gat_hidden_dim, heads=heads, edge_dim=edge_dim)
        self.gat_list = nn.ModuleList([
            gnn.GATv2Conv(in_channels=gat_hidden_dim*heads, 
                          out_channels=gat_hidden_dim, heads=heads, edge_dim=edge_dim)
            for _ in range(n_gat_layers - 1)
        ])
        self.fc1 = nn.Sequential(
            nn.Linear(gat_hidden_dim * heads, fc_hidden_dim),
            nn.BatchNorm1d(fc_hidden_dim),
            nn.ReLU(),
            nn.Dropout(fc_dropout)
        )
        self.fc_list = nn.ModuleList([
            nn.Sequential(
                nn.Linear(fc_hidden_dim, fc_hidden_dim),
                nn.BatchNorm1d(fc_hidden_dim),
                nn.ReLU(),
                nn.Dropout(fc_dropout)
            ) for _ in range(n_fc_layers - 2)
        ])
        self.do = nn.Dropout(fc_dropout)
        self.fc_last = nn.Linear(fc_hidden_dim, 2)

    def forward(self, x, edge_index, edge_attr, batch):
        x = F.relu(self.gat1(x, edge_index, edge_attr))
        
        for gat_layer in self.gat_list:
            x = gat_layer(x, edge_index, edge_attr)
            x = F.relu(x)
            
        x = gnn.global_mean_pool(x, batch)
        x = self.fc1(x)
        
        for fc_layer in self.fc_list:
            x = fc_layer(x)
            
        x = self.fc_last(x)
        
        return x

In [16]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torchmetrics import MeanSquaredError, MinMetric


class BaseNet(pl.LightningModule):
    def __init__(
        self,
        net: nn.Module,
        lr: float = 1e-3,
        weight_decay: float = 1e-5,
        max_epochs: int = 30
    ):
        super().__init__()
        self.save_hyperparameters(ignore=["net"])
        self.net = net
        self.criterion = nn.MSELoss()
        
        self.train_rmse = MeanSquaredError(squared=False)
        self.val_rmse = MeanSquaredError(squared=False)
        
        self.val_rmse_best = MinMetric()
        
    def forward(self, batch):
        return self.net(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
    
    def on_train_start(self):
        self.val_rmse_best.reset()
        
    def step(self, batch):
        pred = self(batch)
        loss = self.criterion(pred, batch.y)
        
        return loss, pred, batch.y
    
    def training_step(self, batch, batch_idx):
        loss, pred, target = self.step(batch)
        
        return loss
        
    def validation_step(self, batch, batch_idx):
        loss, preds, targets = self.step(batch)
        self.val_rmse.update(preds, targets)
        
    def validation_epoch_end(self, outputs):
        # get val metric from current epoch
        epoch_rmse = self.val_rmse.compute()
        
        # log epoch metrics
        metrics = {"val/rmse": epoch_rmse}
        self.log_dict(metrics, on_epoch=True, prog_bar=True)

        # log best metric
        self.val_rmse_best.update(epoch_rmse)
        self.log("val/rmse_best", self.val_rmse_best.compute(), on_epoch=True, prog_bar=True)

        # reset val metrics
        self.val_rmse.reset()
    
    def predict_step(self, batch, batch_idx):
        _, preds, _ = self.step(batch)
        
        return preds
    
    def on_predict_epoch_end(self, outputs):
        preds = np.array(torch.cat(outputs[0]))
        
        sub_df = pd.read_csv("../data/sample_submission.csv")
        sub_df["Reorg_g"] = preds[:, 0]
        sub_df["Reorg_ex"] = preds[:, 1]
        sub_df.to_csv("submission.csv", sep=",", index=False)

        print("Saved submission file!")
        
    def configure_optimizers(self):
        n_steps = len(self.trainer._data_connector._train_dataloader_source.dataloader())
        
        optimizer = torch.optim.Adam(
            self.parameters(), 
            lr=self.hparams.lr, 
            weight_decay=self.hparams.weight_decay
        )
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=self.hparams.max_epochs * n_steps
        )
        
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [18]:
train_data = TrainDataset()
test_data = TestDataset()

dm = BaseDataModule(train_data, test_data, batch_size=32, fold=0)

net = GATNet()
model = BaseNet(net, lr=1e-3, weight_decay=1e-5, max_epochs=30)

trainer = pl.Trainer(max_epochs=30, gpus=[1])

trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name          | Type             | Params
---------------------------------------------------
0 | net           | GATNet           | 408 K 
1 | criterion     | MSELoss          | 0     
2 | train_rmse    | MeanSquaredError | 0     
3 | val_rmse      | MeanSquaredError | 0     
4 | val_rmse_best | MinMetric        | 0     
---------------------------------------------------
408 K     Trainable params
0         Non-trainable params
408 K     Total params
1.633     Total estimated model params size (MB)


Epoch 0:  80%|███████▉  | 453/567 [00:15<00:03, 28.99it/s, loss=0.129, v_num=4]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/114 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/114 [00:00<?, ?it/s][A
Epoch 0:  80%|████████  | 454/567 [00:16<00:03, 28.32it/s, loss=0.129, v_num=4]
Epoch 0:  80%|████████  | 455/567 [00:16<00:03, 28.34it/s, loss=0.129, v_num=4]
Epoch 0:  80%|████████  | 456/567 [00:16<00:03, 28.32it/s, loss=0.129, v_num=4]
Epoch 0:  81%|████████  | 457/567 [00:16<00:03, 28.34it/s, loss=0.129, v_num=4]
Epoch 0:  81%|████████  | 458/567 [00:16<00:03, 28.36it/s, loss=0.129, v_num=4]
Epoch 0:  81%|████████  | 459/567 [00:16<00:03, 28.38it/s, loss=0.129, v_num=4]
Epoch 0:  81%|████████  | 460/567 [00:16<00:03, 28.41it/s, loss=0.129, v_num=4]
Epoch 0:  81%|████████▏ | 461/567 [00:16<00:03, 28.43it/s, loss=0.129, v_num=4]
Epoch 0:  81%|████████▏ | 462/567 [00:16<00:03, 28.44it/s, loss=0.129, v_num=4]
Epoch 0:  82%|████████▏ | 463/567 [00:16<00

In [12]:
num_epochs = 30

model = GATNet(15, 2)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs * len(train_dataloader))
device = torch.device("cuda:1")

model.to(device)


for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    train_loss, val_loss = 0., 0.
    
    # train
    model.train()

    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()

        batch = batch.to(device)
        pred = model(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
        loss = criterion(pred, batch.y)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()
    
    train_loss /= len(train_dataloader)
    
    # validation
    model.eval()
    
    for batch in val_dataloader:
        batch = batch.to(device)
        pred = model(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
        loss = criterion(pred, batch.y)
        val_loss += loss * len(batch.y)
    
    val_loss /= len(val_data)
    
    print(f"Train Loss: {train_loss}")
    print(f"Val Loss: {val_loss}")

Epoch 0


100%|██████████| 453/453 [00:11<00:00, 38.35it/s]


Train Loss: 0.12264837247317463
Val Loss: 0.11823252588510513
Epoch 1


100%|██████████| 453/453 [00:12<00:00, 37.67it/s]


Train Loss: 0.1135789750690755
Val Loss: 0.11279676854610443
Epoch 2


100%|██████████| 453/453 [00:12<00:00, 37.02it/s]


Train Loss: 0.09695399670181158
Val Loss: 0.08205020427703857
Epoch 3


100%|██████████| 453/453 [00:12<00:00, 36.06it/s]


Train Loss: 0.0808124654652925
Val Loss: 0.09489650279283524
Epoch 4


100%|██████████| 453/453 [00:12<00:00, 35.82it/s]


Train Loss: 0.07588006033835558
Val Loss: 0.08151715993881226
Epoch 5


100%|██████████| 453/453 [00:12<00:00, 35.95it/s]


Train Loss: 0.07270524785679171
Val Loss: 0.07278463989496231
Epoch 6


100%|██████████| 453/453 [00:13<00:00, 34.21it/s]


Train Loss: 0.0707416209098256
Val Loss: 0.07197262346744537
Epoch 7


100%|██████████| 453/453 [00:13<00:00, 33.80it/s]


Train Loss: 0.06920491806570662
Val Loss: 0.07027502357959747
Epoch 8


100%|██████████| 453/453 [00:13<00:00, 33.55it/s]


Train Loss: 0.06744053234525074
Val Loss: 0.07098870724439621
Epoch 9


100%|██████████| 453/453 [00:13<00:00, 33.59it/s]


Train Loss: 0.06588058465594224
Val Loss: 0.06632950156927109
Epoch 10


100%|██████████| 453/453 [00:12<00:00, 35.05it/s]


Train Loss: 0.06486200984921019
Val Loss: 0.0678791031241417
Epoch 11


100%|██████████| 453/453 [00:12<00:00, 34.90it/s]


Train Loss: 0.06371644361327027
Val Loss: 0.06562931090593338
Epoch 12


100%|██████████| 453/453 [00:13<00:00, 34.47it/s]


Train Loss: 0.06215264864068552
Val Loss: 0.06718893349170685
Epoch 13


100%|██████████| 453/453 [00:13<00:00, 34.36it/s]


Train Loss: 0.06086020463580064
Val Loss: 0.061521340161561966
Epoch 14


100%|██████████| 453/453 [00:11<00:00, 38.91it/s]


Train Loss: 0.059979709070874895
Val Loss: 0.06226282939314842
Epoch 15


100%|██████████| 453/453 [00:11<00:00, 38.48it/s]


Train Loss: 0.059344057733846815
Val Loss: 0.0634029284119606
Epoch 16


100%|██████████| 453/453 [00:12<00:00, 34.91it/s]


Train Loss: 0.05834461612499445
Val Loss: 0.060086771845817566
Epoch 17


100%|██████████| 453/453 [00:12<00:00, 36.73it/s]


Train Loss: 0.05679242212169086
Val Loss: 0.05918416753411293
Epoch 18


100%|██████████| 453/453 [00:12<00:00, 37.61it/s]


Train Loss: 0.05571853669635771
Val Loss: 0.058793921023607254
Epoch 19


100%|██████████| 453/453 [00:11<00:00, 37.77it/s]


Train Loss: 0.054527673138424784
Val Loss: 0.05806384235620499
Epoch 20


100%|██████████| 453/453 [00:11<00:00, 37.97it/s]


Train Loss: 0.05377750939979459
Val Loss: 0.058353133499622345
Epoch 21


100%|██████████| 453/453 [00:12<00:00, 36.44it/s]


Train Loss: 0.05272164444539852
Val Loss: 0.05835919827222824
Epoch 22


100%|██████████| 453/453 [00:12<00:00, 35.13it/s]


Train Loss: 0.05194892519242058
Val Loss: 0.057384613901376724
Epoch 23


100%|██████████| 453/453 [00:12<00:00, 35.65it/s]


Train Loss: 0.05119385129091624
Val Loss: 0.05701125040650368
Epoch 24


100%|██████████| 453/453 [00:12<00:00, 35.18it/s]


Train Loss: 0.05046723569202634
Val Loss: 0.05732369422912598
Epoch 25


100%|██████████| 453/453 [00:12<00:00, 36.71it/s]


Train Loss: 0.0499665065777486
Val Loss: 0.05713200941681862
Epoch 26


100%|██████████| 453/453 [00:12<00:00, 35.90it/s]


Train Loss: 0.04949362321487457
Val Loss: 0.0570257306098938
Epoch 27


100%|██████████| 453/453 [00:12<00:00, 36.69it/s]


Train Loss: 0.04914069553183404
Val Loss: 0.056887321174144745
Epoch 28


100%|██████████| 453/453 [00:12<00:00, 37.17it/s]


Train Loss: 0.04893808028605205
Val Loss: 0.05693376436829567
Epoch 29


100%|██████████| 453/453 [00:12<00:00, 36.22it/s]


Train Loss: 0.04880983686736614
Val Loss: 0.056932494044303894


In [None]:
preds = []

model.eval()
for batch in tqdm(test_dataloader):
    batch = batch.to(device)
    pred = model(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
    preds.append(pred)

preds = torch.cat(preds).detach().cpu().numpy()

sub_df = pd.read_csv("data/sample_submission.csv")
sub_df["Reorg_g"] = preds[:, 0]
sub_df["Reorg_ex"] = preds[:, 1]
sub_df.to_csv("submission.csv", sep=",", index=False)