In [None]:
!pip install datamol
!pip install rdkit-pypi
!pip install pandas
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910454 sha256=96f17a0231d0dac11b4059c25801930c247251f1221e9ef825f650ff4463d34f
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Successfully built torch_geometric
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.3.1


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from rdkit.Chem import rdmolops

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Linear

In [None]:
from torch.nn import BatchNorm1d
from torch_geometric.nn import GCNConv, GlobalAttention
from torch_geometric.nn import global_add_pool, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

In [None]:
from scipy.stats import spearmanr

In [None]:
# Mount Google Drive (Colab-only API) so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
import os
# Make the project folder the working directory: later cells use relative
# paths such as "preprocess_data/..." and "models/..." that resolve against it.
os.chdir("/content/drive/MyDrive/Project/ALVS")

Mounted at /content/drive


In [None]:
def one_hot(x, allowable_set):
    """Encode ``x`` as a one-hot list of booleans over ``allowable_set``.

    A value that does not occur in ``allowable_set`` is encoded as the last
    entry of the set, which therefore doubles as the catch-all bucket.

    Returns a list with one boolean per entry of ``allowable_set``.
    """
    value = x if x in allowable_set else allowable_set[-1]
    return [value == candidate for candidate in allowable_set]


def get_bond_pair(mol):
    """Build a bidirectional edge list from the bonds of ``mol``.

    Every bond contributes two directed edges (begin -> end and end -> begin).
    The result is ``[sources, targets]``: two parallel lists of atom indices.
    """
    sources, targets = [], []
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        # Forward edge followed by its reverse.
        sources.extend((begin, end))
        targets.extend((end, begin))
    return [sources, targets]


def get_atom_features(mol):
    """Compute a 27-dimensional feature vector for every atom of ``mol``.

    Per-atom layout (in order):
      * 10 - one-hot element symbol (unknown symbols fall into the last slot)
      *  1 - degree
      *  5 - one-hot hybridization (unknown values fall into the last slot)
      *  1 - implicit valence
      *  1 - aromaticity flag
      *  6 - membership in a ring of size 3, 4, 5, 6, 7, 8
      *  1 - hydrogen-bond donor flag
      *  1 - hydrogen-bond acceptor flag
      *  1 - formal charge

    Args:
        mol: an RDKit molecule.

    Returns:
        A list with one feature list per atom, indexed by atom index.

    Note:
        ``GetSubstructMatches`` returns tuples of atom indices. Testing an
        integer atom index against those tuples is always False, which would
        make the donor/acceptor flags constant. The matches are therefore
        flattened into sets of atom indices before the membership test.
        Weights trained while these two flags were constant should be retrained.
    """
    acceptor_smarts_one = '[!$([#1,#6,F,Cl,Br,I,o,s,nX3,#7v5,#15v5,#16v4,#16v6,*+1,*+2,*+3])]'
    acceptor_smarts_two = "[$([O,S;H1;v2;!$(*-*=[O,N,P,S])]),$([O,S;H0;v2]),$([O,S;-]),$([N;v3;!$(N-*=[O,N,P,S])]),n&H0&+0,$([o,s;+0;!$([o,s]:n);!$([o,s]:c:n)])]"
    donor_smarts_one = "[$([N;!H0;v3,v4&+1]),$([O,S;H1;+0]),n&H1&+0]"
    donor_smarts_two = "[!$([#6,H0,-,-2,-3]),$([!H0;#7,#8,#9])]"

    # Sets of atom indices (not match tuples) so that `atom_idx in ...` works.
    hydrogen_donor_atoms = _atoms_matching_any(
        mol, (donor_smarts_one, donor_smarts_two))
    hydrogen_acceptor_atoms = _atoms_matching_any(
        mol, (acceptor_smarts_one, acceptor_smarts_two))

    ring = mol.GetRingInfo()

    m = []
    for atom_idx in range(mol.GetNumAtoms()):
        atom = mol.GetAtomWithIdx(atom_idx)

        o = []
        o += one_hot(atom.GetSymbol(), ['C', 'H', 'O', 'N', 'S', 'Cl', 'F', 'Br', 'P',
                                        'I'])
        o += [atom.GetDegree()]
        o += one_hot(atom.GetHybridization(), [Chem.rdchem.HybridizationType.SP,
                                               Chem.rdchem.HybridizationType.SP2,
                                               Chem.rdchem.HybridizationType.SP3,
                                               Chem.rdchem.HybridizationType.SP3D,
                                               Chem.rdchem.HybridizationType.SP3D2])
        o += [atom.GetImplicitValence()]
        o += [atom.GetIsAromatic()]
        o += [ring.IsAtomInRingOfSize(atom_idx, size) for size in (3, 4, 5, 6, 7, 8)]

        o += [atom_idx in hydrogen_donor_atoms]
        o += [atom_idx in hydrogen_acceptor_atoms]
        o += [atom.GetFormalCharge()]
        m.append(o)
    return m


def _atoms_matching_any(mol, smarts_patterns):
    """Return the set of atom indices of ``mol`` matched by any SMARTS pattern."""
    matched = set()
    for smarts in smarts_patterns:
        query = Chem.MolFromSmarts(smarts)
        for match in mol.GetSubstructMatches(query):
            # Each match is a tuple of atom indices; collect the indices themselves.
            matched.update(match)
    return matched


def mol2vec(mol, score=None):
    """Convert an RDKit molecule into a PyTorch Geometric ``Data`` graph.

    Args:
        mol: an RDKit molecule.
        score: optional regression target. When given it is stored as a
            ``(1, 1)`` float tensor under ``data.score``. When omitted the
            returned graph carries no ``score`` attribute (building a tensor
            from ``None`` would raise).

    Returns:
        ``Data`` with node features ``x`` (float32) and ``edge_index`` (long),
        plus ``score`` if a target was supplied.
    """
    node_f = get_atom_features(mol)
    edge_index = get_bond_pair(mol)

    fields = {
        "x": torch.tensor(node_f, dtype=torch.float32),
        "edge_index": torch.tensor(edge_index, dtype=torch.long),
    }
    if score is not None:
        fields["score"] = torch.tensor([[score]], dtype=torch.float)
    return Data(**fields)

In [None]:
def generate_datasets(df, test_size, random_state=None):
    """Turn a table of molecules into train/validation lists of graphs.

    The SMILES string is read from the second column of ``df`` and the
    regression target from the third column (by position, via ``.iloc``, so
    the lookup does not depend on how the columns are labelled). Rows whose
    SMILES cannot be parsed by RDKit are skipped.

    Args:
        df: DataFrame with SMILES in column position 1 and score in position 2.
        test_size: forwarded to ``train_test_split``.
        random_state: optional seed forwarded to ``train_test_split`` to make
            the split reproducible. ``None`` keeps the split random.

    Returns:
        ``(train_dataset, valid_dataset)`` lists of ``Data`` objects.
    """
    datasets = []
    for smiles, score in zip(df.iloc[:, 1], df.iloc[:, 2]):
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            # Unparseable SMILES: RDKit returned no molecule.
            continue
        datasets.append(mol2vec(mol, score=score))

    train_dataset, valid_dataset = train_test_split(
        datasets, test_size=test_size, random_state=random_state)
    return train_dataset, valid_dataset

In [None]:
def graph_from_smiles(smi):
    """Build an unlabeled single-molecule graph from a SMILES string.

    Returns ``np.nan`` when RDKit cannot parse ``smi``. Otherwise returns a
    ``Data`` object with node features, edge index and an all-zero ``batch``
    vector, i.e. every atom is assigned to graph 0.
    """
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        return np.nan

    atom_features = get_atom_features(mol)
    bond_pairs = get_bond_pair(mol)

    # One entry per atom, all pointing at the same (only) graph.
    single_graph_batch = torch.zeros(len(atom_features), dtype=torch.long)
    return Data(
        x=torch.tensor(atom_features, dtype=torch.float32),
        edge_index=torch.tensor(bond_pairs, dtype=torch.long),
        batch=single_graph_batch,
    )

In [None]:
# Length of the per-atom feature vector produced by get_atom_features.
n_features = 27
hidden = 1024


class GCNNet(torch.nn.Module):
    """Graph-convolutional regressor producing one scalar per graph.

    Architecture: five ``GCNConv`` layers (n_features -> 1024 -> 512 -> 256 ->
    512 -> 1024), each followed by ReLU, batch normalization and dropout
    (p=0.2); mean pooling over the nodes of each graph; then a fully connected
    head 1024 -> 128 -> 16 -> 1 with ReLU and dropout between the layers.
    """

    def __init__(self):
        super(GCNNet, self).__init__()
        # cached=False: with cached=True every batch would need the same shape.
        self.conv1 = GCNConv(n_features, 1024, cached=False)
        self.bn1 = BatchNorm1d(1024)
        self.dropout1 = nn.Dropout(p=0.2)
        self.conv2 = GCNConv(1024, 512, cached=False)
        self.bn2 = BatchNorm1d(512)
        self.dropout2 = nn.Dropout(p=0.2)
        self.conv3 = GCNConv(512, 256, cached=False)
        self.bn3 = BatchNorm1d(256)
        self.dropout3 = nn.Dropout(p=0.2)
        self.conv4 = GCNConv(256, 512, cached=False)
        self.bn4 = BatchNorm1d(512)
        self.dropout4 = nn.Dropout(p=0.2)
        self.conv5 = GCNConv(512, 1024, cached=False)
        self.bn5 = BatchNorm1d(1024)
        self.dropout5 = nn.Dropout(p=0.2)

        self.fc2 = Linear(1024, 128)
        self.dropout6 = nn.Dropout(p=0.2)
        self.fc3 = Linear(128, 16)
        self.dropout7 = nn.Dropout(p=0.2)
        self.fc4 = Linear(16, 1)

    def reset_parameters(self):
        """Re-initialize every learnable layer of the network.

        Only layers that actually exist on the module are reset; there is no
        attention pooling layer, so nothing named ``att`` is touched.
        """
        learnable_layers = (
            self.conv1, self.conv2, self.conv3, self.conv4, self.conv5,
            self.bn1, self.bn2, self.bn3, self.bn4, self.bn5,
            self.fc2, self.fc3, self.fc4,
        )
        for layer in learnable_layers:
            layer.reset_parameters()

    def forward(self, data):
        """Predict one value per graph.

        Args:
            data: object exposing ``x`` (node features), ``edge_index`` and
                ``batch`` (graph id of every node).

        Returns:
            Tensor with one row per graph and a single column.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch

        conv_stack = (
            (self.conv1, self.bn1, self.dropout1),
            (self.conv2, self.bn2, self.dropout2),
            (self.conv3, self.bn3, self.dropout3),
            (self.conv4, self.bn4, self.dropout4),
            (self.conv5, self.bn5, self.dropout5),
        )
        for conv, bn, dropout in conv_stack:
            x = F.relu(conv(x, edge_index))
            x = bn(x)
            x = dropout(x)

        # Collapse node embeddings into one embedding per graph.
        x = global_mean_pool(x, batch)

        x = F.relu(self.fc2(x))
        x = self.dropout6(x)
        x = F.relu(self.fc3(x))
        x = self.dropout7(x)
        x = self.fc4(x)
        return x

In [None]:
def train_epoch(loader, model, optimizer, device):
    """Run one optimization pass over ``loader``.

    Each batch is moved to ``device``, the MSE between the model output and
    ``batch.score`` is back-propagated, and the optimizer takes one step.

    Returns the mean of the per-batch losses.
    """
    model.train()

    batch_losses = []
    for batch in loader:
        batch = batch.to(device)

        optimizer.zero_grad()
        prediction = model(batch)
        batch_loss = F.mse_loss(prediction, batch.score)
        batch_loss.backward()

        batch_losses.append(batch_loss.item())
        optimizer.step()
    return sum(batch_losses) / len(batch_losses)

In [None]:
def test_epoch(loader, model,  device):
    """Evaluate ``model`` on every batch of ``loader`` without gradients.

    Predictions and targets (``data.score``) are flattened to 1-D before being
    collected. Flattening with ``reshape(-1)`` rather than ``squeeze()`` keeps
    a batch containing a single graph as a one-element list instead of a bare
    float, which could not be passed to ``list.extend``.

    Returns:
        ``(MAE, RMSE, R2, Sp)``: mean absolute error, root mean squared error,
        coefficient of determination and Spearman rank correlation.
    """
    model.eval()

    trues, preds = [], []
    with torch.no_grad():
        for data in loader:
            data = data.to(device)

            output = model(data)
            preds.extend(output.cpu().reshape(-1).tolist())
            trues.extend(data.score.cpu().reshape(-1).tolist())
    MAE = mean_absolute_error(trues, preds)
    RMSE = np.sqrt(mean_squared_error(trues, preds))
    R2 = r2_score(trues, preds)
    Sp = spearmanr(trues, preds)[0]
    return MAE, RMSE, R2, Sp

In [None]:
def init_model(device):
    """Create a fresh ``GCNNet`` on ``device`` together with its Adam optimizer.

    Args:
        device: target device (string or ``torch.device``). The requested
            device is honoured so that the model lives where the caller will
            send its batches. ``None`` selects CUDA when available, else CPU.

    Returns:
        ``(model, optimizer)`` where the optimizer is Adam with lr=1e-4.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model = GCNNet().to(torch.device(device))
    lr = 0.0001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    return model, optimizer

In [None]:
def prepare_dataloader(dff, batch_size, test_size=0.2):
    """Split ``dff`` into train/validation graphs and wrap them in loaders.

    The training loader shuffles its graphs; the validation loader keeps the
    order produced by the split. Both use ``batch_size``.

    Returns ``(train_loader, valid_loader)``.
    """
    train_graphs, valid_graphs = generate_datasets(dff, test_size)
    train_loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_graphs, batch_size=batch_size, shuffle=False)
    return train_loader, valid_loader

In [None]:
def train_step(dff, epochs, batch_size, device="cuda"):
    """Train a new model on ``dff`` and checkpoint the best validation epoch.

    After every epoch the model is evaluated on the validation split; whenever
    the validation RMSE is the lowest seen so far, the state dict is written to
    ``models/aa2ar/gnn/base_gnn/weight_gnn.pth``. Per-epoch metrics are printed.

    Args:
        dff: DataFrame passed to ``prepare_dataloader``.
        epochs: number of training epochs.
        batch_size: batch size for both loaders.
        device: device for the model and the batches. If CUDA is requested but
            not available, training falls back to the CPU.

    Returns:
        Path of the checkpoint file. The path is returned even if no epoch
        produced a checkpoint (e.g. ``epochs == 0``), in which case the file
        may not exist.
    """
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        device = "cpu"

    model, optimizer = init_model(device=device)
    # Guarantee the model sits on the same device the batches are sent to.
    model = model.to(device)
    train_loader, valid_loader = prepare_dataloader(dff,  batch_size=batch_size)

    model_folder = "models/aa2ar/gnn/base_gnn"
    os.makedirs(model_folder, exist_ok=True)
    # Bound up-front so the return value exists on every code path.
    weight_path = os.path.join(model_folder, "weight_gnn.pth")

    hist = {"train-loss":[], "test-mae":[], "test-rmse":[], "test-r2":[], "test-sp":[]}
    for epoch in range(epochs):
        train_loss = train_epoch(train_loader, model, optimizer, device)
        test_mae, test_rmse, test_r2, test_sp = test_epoch(valid_loader, model, device)
        hist["train-loss"].append(train_loss)
        hist["test-mae"].append(test_mae)
        hist["test-rmse"].append(test_rmse)
        hist["test-r2"].append(test_r2)
        hist["test-sp"].append(test_sp)

        # The current RMSE is already in the history, so "<= min" means
        # "at least as good as every epoch so far".
        if test_rmse <= min(hist["test-rmse"]):
            torch.save(model.state_dict(), weight_path)

        print(f'Epoch: {epoch}, Train loss: {train_loss:.3}, Test mae: {test_mae:.3}, Test rmse: {test_rmse:.3}, Test r2: {test_r2:.3}, Test sp: {test_sp:.3}')
    return weight_path

In [None]:
def load_model(best_model_path, device="cuda"):
    """Instantiate ``GCNNet`` on ``device`` and restore weights from disk.

    The state dict stored at ``best_model_path`` is mapped onto ``device``
    while loading, and the model is returned in evaluation mode.
    """
    state_dict = torch.load(best_model_path, map_location=device)
    network = GCNNet().to(device)
    network.load_state_dict(state_dict)
    network.eval()
    return network

In [None]:
def predict(data, model, device):
    """Return the model's prediction for a single graph.

    ``data`` is moved to ``device`` and passed through ``model`` with gradient
    tracking disabled; the element at row 0, column 0 of the output is
    returned as a NumPy scalar.
    """
    graph = data.to(device)
    with torch.no_grad():
        output_array = model(graph).cpu().numpy()
    return output_array[0][0]

In [None]:
def eval_model(dff, best_model_path, device=None):
    """Score a saved model against the graphs and targets stored in ``dff``.

    Args:
        dff: DataFrame with a ``"graph"`` column (one graph per row) and a
            ``"score"`` column with the true values. A ``"pred"`` column with
            the model predictions is added to ``dff`` in place.
        best_model_path: path of the state dict to load.
        device: device used for both the model and the graphs. ``None``
            (default) selects CUDA when available and the CPU otherwise.

    Returns:
        ``(MAE, RMSE, R2, Sp)`` computed between ``score`` and ``pred``.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model = load_model(best_model_path, device=device)
    dff["pred"] = dff["graph"].map(lambda graph: predict(graph, model, device=device))

    y_true, y_pred = dff["score"], dff["pred"]
    MAE = mean_absolute_error(y_true, y_pred)
    RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
    R2 = r2_score(y_true, y_pred)
    Sp = spearmanr(y_true, y_pred)[0]
    return MAE, RMSE, R2, Sp

In [None]:
# Training table (tab-separated); it is split into train/validation graphs
# later by train_step.
df = pd.read_csv("preprocess_data/aa2ar/aa2ar_train.csv", sep="\t")

# Test table: build one graph per SMILES and drop rows RDKit could not parse
# (graph_from_smiles returns NaN for those).
df_test = pd.read_csv("preprocess_data/aa2ar/aa2ar_test.csv", sep="\t")
df_test["graph"] = df_test["smiles"].map(graph_from_smiles)
# .copy() detaches the filtered frame from its parent so that a later column
# assignment on df_test does not raise SettingWithCopyWarning.
df_test = df_test[df_test["graph"].notna()].copy()

In [None]:

# Train for 100 epochs with batches of 32 graphs; train_step prints the
# per-epoch metrics and returns the path of the saved checkpoint.
weight_path = train_step(df, epochs=100, batch_size=32, device="cuda")

Epoch: 0, Train loss: 23.4, Test mae: 1.13, Test rmse: 1.69, Test r2: -1.83, Test sp: 0.278
Epoch: 1, Train loss: 5.89, Test mae: 0.849, Test rmse: 1.21, Test r2: -0.454, Test sp: 0.496
Epoch: 2, Train loss: 4.63, Test mae: 0.82, Test rmse: 1.12, Test r2: -0.228, Test sp: 0.535
Epoch: 3, Train loss: 4.41, Test mae: 0.783, Test rmse: 1.03, Test r2: -0.054, Test sp: 0.575
Epoch: 4, Train loss: 4.04, Test mae: 0.673, Test rmse: 0.906, Test r2: 0.19, Test sp: 0.605
Epoch: 5, Train loss: 3.95, Test mae: 0.671, Test rmse: 0.895, Test r2: 0.209, Test sp: 0.638
Epoch: 6, Train loss: 3.7, Test mae: 0.611, Test rmse: 0.862, Test r2: 0.267, Test sp: 0.645
Epoch: 7, Train loss: 3.57, Test mae: 0.651, Test rmse: 0.842, Test r2: 0.3, Test sp: 0.684
Epoch: 8, Train loss: 3.53, Test mae: 0.659, Test rmse: 0.856, Test r2: 0.276, Test sp: 0.664
Epoch: 9, Train loss: 3.43, Test mae: 0.769, Test rmse: 0.936, Test r2: 0.135, Test sp: 0.649
Epoch: 10, Train loss: 3.32, Test mae: 0.715, Test rmse: 0.898, Tes

In [None]:
# Score the saved checkpoint on the test table loaded from aa2ar_test.csv.
MAE, RMSE, R2, Sp = eval_model(df_test, weight_path)

In [None]:
# Test-set metrics in order: MAE, RMSE, R2, Spearman correlation.
print(MAE, RMSE, R2, Sp)

0.4319217328804947 0.5796868518238731 0.6763401792529892 0.8229489586541638
