In [1]:
# Install third-party packages into the current runtime via shell commands.
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones shown in the recorded output below.
!pip install datamol
!pip install rdkit-pypi
!pip install pandas

Collecting datamol
  Downloading datamol-0.11.3-py3-none-any.whl (381 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/381.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/381.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.7/381.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting loguru (from datamol)
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting selfies (from datamol)
  Downloading selfies-2.1.1-py3-none-any.whl (35 kB)
Installing collected packages: selfies, loguru, datamol
Successfully installed datamol-0.11.3 loguru-0.7.2 selfies-2.1.1
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Linear
from torch.utils.data import DataLoader
import tqdm
from scipy.stats import spearmanr

In [3]:
# Mount Google Drive at /content/drive so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')
import os
# Make the project folder the working directory; the relative data and model
# paths used by later cells are resolved against it.
os.chdir("/content/drive/MyDrive/Project/ALVS")

Mounted at /content/drive


In [4]:
def ecfp_from_smiles(smiles,
                     R = 2,
                     L = 2**11,
                     use_features = True,
                     use_chirality = False):
    """
    Convert a SMILES string into a Morgan (ECFP-style) bit-vector fingerprint.

    Inputs:

    - smiles ... SMILES string of input compound
    - R ... maximum radius of circular substructures
    - L ... fingerprint-length
    - use_features ... if false then use standard DAYLIGHT atom features, if true then use pharmacophoric atom features
    - use_chirality ... if true then append tetrahedral chirality flags to atom features

    Outputs:
    - np.array(feature_list) ... ECFP with length L and maximum radius R,
      or np.nan when `smiles` is not a string or cannot be parsed by RDKit
      (callers drop such rows with `isna()`).
    """

    # A blank cell in a CSV is read by pandas as float NaN (not a string), and
    # RDKit raises on non-string input. Return the same "invalid" sentinel as
    # for an unparsable SMILES so the caller's isna() filter handles both cases.
    if not isinstance(smiles, str):
        return np.nan

    molecule = Chem.MolFromSmiles(smiles)
    if not molecule:
        # RDKit signals a parse failure by returning None.
        return np.nan
    feature_list = AllChem.GetMorganFingerprintAsBitVect(molecule,
                                                         radius = R,
                                                         nBits = L,
                                                         useFeatures = use_features,
                                                         useChirality = use_chirality)
    return np.array(feature_list)

In [5]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each fingerprint with its score.

    `fps` and `scores` are index-aligned sequences; the dataset length is
    taken from `scores`.
    """

    def __init__(self, fps, scores):
        self.fps, self.scores = fps, scores
        self.dataLen = len(scores)

    def __len__(self):
        return self.dataLen

    def __getitem__(self, index):
        # Each item is a (fingerprint array, score array of shape (1,)) pair.
        features = np.array(self.fps[index])
        target = np.array(self.scores[index]).reshape((1,))
        return features, target

In [6]:
class DNN(nn.Module):
    """Fully connected regressor mapping a fingerprint vector to one score.

    Architecture: in_features -> 1024 -> 516 -> 256 -> 128 -> 1, with ReLU and
    Dropout(p=0.2) after each hidden layer and a linear output layer.

    Args:
        in_features: Length of the input fingerprint. Defaults to 2048, which
            matches the default fingerprint length of `ecfp_from_smiles`.
    """

    def __init__(self, in_features=2048):
        super(DNN, self).__init__()
        # Layer names and creation order are kept fixed so that state_dict
        # keys (and seeded initialisation) stay compatible with saved weights.
        self.fc1 = Linear(in_features, 1024)
        self.dropout1 = nn.Dropout(p=0.2)
        # NOTE(review): 516 looks like it may have been meant as 512; it is
        # left unchanged because altering it would invalidate checkpoints.
        self.fc2 = Linear(1024, 516)
        self.dropout2 = nn.Dropout(p=0.2)
        self.fc3 = Linear(516, 256)
        self.dropout3 = nn.Dropout(p=0.2)
        self.fc4 = Linear(256, 128)
        self.dropout4 = nn.Dropout(p=0.2)
        self.fc5 = Linear(128, 1)


    def forward(self, data):
        """Return predictions of shape (batch, 1) for `data` of shape (batch, in_features)."""
        x = F.relu(self.fc1(data))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = F.relu(self.fc3(x))
        x = self.dropout3(x)
        x = F.relu(self.fc4(x))
        x = self.dropout4(x)
        # No activation on the output layer: this is a regression head.
        x = self.fc5(x)
        return x

In [7]:
def train_epoch(loader, model, optimizer, device):
    """Run one training pass over `loader` and return the mean per-batch MSE.

    Each batch is cast to float, moved to `device`, and used for a single
    optimizer step. The returned value is the unweighted average of the
    batch losses.
    """
    model.train()

    batch_losses = []
    for batch_fps, batch_scores in loader:
        inputs = batch_fps.float().to(device)
        targets = batch_scores.float().to(device)

        optimizer.zero_grad()
        batch_loss = F.mse_loss(model(inputs), targets)
        batch_loss.backward()

        # Record the loss of this batch before the weights are updated.
        batch_losses.append(batch_loss.item())
        optimizer.step()
    return sum(batch_losses) / len(batch_losses)

In [8]:
def test_epoch(loader, model,  device):
    """Evaluate `model` on `loader` and return (MAE, RMSE, R2, Spearman rho).

    The model is put in eval mode and run without gradient tracking. All
    predictions and targets are collected across batches and the metrics are
    computed once over the full set.
    """
    model.eval()

    trues, preds = [], []
    with torch.no_grad():
        for fps, scores in loader:
            fps = fps.float().to(device)
            scores = scores.float().to(device)

            output = model(fps)
            # reshape(-1) always yields a 1-D tensor. squeeze() would collapse
            # a final batch of size 1 to a 0-d tensor, whose tolist() is a bare
            # float that list.extend() cannot iterate.
            pred = output.cpu().reshape(-1).numpy().tolist()
            true = scores.cpu().reshape(-1).numpy().tolist()

            trues.extend(true)
            preds.extend(pred)
    MAE = mean_absolute_error(trues, preds)
    RMSE = np.sqrt(mean_squared_error(trues, preds))
    R2 = r2_score(trues, preds)
    # spearmanr returns (correlation, p-value); only the correlation is kept.
    Sp = spearmanr(trues, preds)[0]
    return MAE, RMSE, R2, Sp

In [9]:
def init_model(device, lr=0.0001):
    """Create a fresh DNN on `device` together with its Adam optimizer.

    Args:
        device: Requested device (string or torch.device). If None, CUDA is
            used when available and CPU otherwise. A CUDA request on a machine
            without CUDA falls back to CPU.
        lr: Adam learning rate.

    Returns:
        (model, optimizer)
    """
    if device is None:
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        # Honor the caller's choice instead of silently overriding it, so the
        # model ends up on the same device the caller moves its batches to.
        target = torch.device(device)
        if target.type == "cuda" and not torch.cuda.is_available():
            target = torch.device("cpu")
    model = DNN().to(target)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    return model, optimizer

In [10]:
def prepare_dataloader(dff, batch_size, test_size=0.8, random_state=None):
    """Split `dff` into train/validation sets and wrap them in DataLoaders.

    Args:
        dff: DataFrame with an "ecfp" column (fingerprints) and a "score"
            column (regression targets).
        batch_size: Batch size for both loaders.
        test_size: Fraction of rows held out for validation, passed straight
            to `train_test_split`.
            NOTE(review): the default 0.8 holds out 80% of the rows and trains
            on only 20% - confirm this is intended and not meant to be 0.2.
        random_state: Seed for the split, forwarded to `train_test_split`.
            None (the default) keeps the previous unseeded behavior. The
            shuffle order of the training loader is not affected by it.

    Returns:
        (train_loader, valid_loader); only the training loader shuffles.
    """
    train_fps, valid_fps, train_scores, valid_scores = train_test_split(
                                                                        dff["ecfp"],
                                                                        dff["score"],
                                                                        test_size=test_size,
                                                                        random_state=random_state)
    # Convert to plain lists so Dataset indexes by position rather than by the
    # (shuffled) pandas index labels.
    train_dataset = Dataset(fps=list(train_fps),
                            scores = list(train_scores))
    valid_dataset = Dataset(fps=list(valid_fps),
                            scores = list(valid_scores))
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=batch_size,
                              shuffle=False)
    return train_loader, valid_loader

In [11]:
def train_step(dff, epochs, batch_size, device="cuda"):
    """Train a new DNN on `dff` and save the weights of the best epoch.

    After every epoch the model is evaluated on the validation loader; when
    the validation RMSE is the lowest seen so far the state dict is written
    to "models/src/ecfp/base_ecfp/weight_ecfp.pth" (overwriting any previous
    file).

    Args:
        dff: DataFrame with "ecfp" and "score" columns.
        epochs: Number of training epochs (must be >= 1).
        batch_size: Batch size for the train and validation loaders.
        device: Device requested for the model.

    Returns:
        Path of the saved weight file.

    Raises:
        ValueError: If `epochs` is smaller than 1 (nothing would be saved).
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    model, optimizer = init_model(device=device)
    # Send batches to the device the model actually lives on; it may differ
    # from the requested one (e.g. when CUDA is not available).
    device = next(model.parameters()).device
    train_loader, valid_loader = prepare_dataloader(dff,  batch_size=batch_size)

    model_folder = "models/src/ecfp/base_ecfp"
    os.makedirs(model_folder, exist_ok=True)
    # Bound before the loop so the return value never depends on which branch ran.
    weight_path = os.path.join(model_folder, "weight_ecfp.pth")

    hist = {"train-loss":[], "test-mae":[], "test-rmse":[], "test-r2":[],  "test-sp":[]}
    for epoch in range(epochs):
        train_loss = train_epoch(train_loader, model, optimizer, device)
        test_mae, test_rmse, test_r2, test_sp = test_epoch(valid_loader, model, device)
        hist["train-loss"].append(train_loss)
        hist["test-mae"].append(test_mae)
        hist["test-rmse"].append(test_rmse)
        hist["test-r2"].append(test_r2)
        hist["test-sp"].append(test_sp)

        # The current RMSE is already in the history, so this is true exactly
        # when this epoch is the best (or tied best) so far.
        if test_rmse <= min(hist["test-rmse"]):
            torch.save(model.state_dict(), weight_path)

        print(f'Epoch: {epoch}, Train loss: {train_loss:.3}, Test mae: {test_mae:.3}, Test rmse: {test_rmse:.3}, Test r2: {test_r2:.3}, Test sp: {test_sp:.3}')
    return weight_path

In [12]:
def load_model(best_model_path, device="cuda"):
    """Build a DNN, load weights from `best_model_path`, and return it in eval mode.

    Args:
        best_model_path: Path to a state dict saved with `torch.save`.
        device: Target device (string or torch.device). A CUDA request on a
            machine without CUDA falls back to CPU.

    Returns:
        The loaded model, placed on the resolved device with eval mode set.
    """
    target = torch.device(device)
    if target.type == "cuda" and not torch.cuda.is_available():
        target = torch.device("cpu")
    model = DNN().to(target)
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load(best_model_path, map_location=target))
    model.eval()
    return model

In [13]:
def predict(fpnp, model, device):
    """Return the model's scalar prediction for a single fingerprint.

    Args:
        fpnp: NumPy array holding one fingerprint; it is reshaped into a
            single-row batch, so any length the model accepts is supported.
        model: Module mapping a (1, n_features) float tensor to a (1, 1) tensor.
        device: Device the input tensor is moved to (should match the model's).

    Returns:
        The single predicted value as a NumPy scalar.
    """
    # -1 lets the row width follow the fingerprint length instead of
    # hard-coding 2048.
    fpnp = fpnp.reshape(1, -1)
    fp = torch.from_numpy(fpnp).float().to(device)

    with torch.no_grad():
        output = model(fp)
        pred = output.cpu().numpy()[0][0]
    return pred

In [14]:
def eval_model(dff, best_model_path, device="cuda"):
    """Score every row of `dff` with the saved model and compute metrics.

    Side effect: a "pred" column holding the per-row predictions is added to
    (or overwritten on) `dff` in place.

    Args:
        dff: DataFrame with "ecfp" (fingerprint arrays) and "score" columns.
        best_model_path: Path to the saved model weights.
        device: Device used for both the model and the inputs. A CUDA request
            on a machine without CUDA falls back to CPU.

    Returns:
        (MAE, RMSE, R2, Spearman rho) between "score" and "pred".
    """
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        device = "cpu"
    # Use one device for both loading and inference so the model and its
    # inputs can never end up on different devices.
    model = load_model(best_model_path, device=device)
    dff["pred"] = dff["ecfp"].map(lambda x: predict(x,  model, device=device))
    MAE = mean_absolute_error(dff["score"], dff["pred"])
    RMSE = np.sqrt(mean_squared_error(dff["score"], dff["pred"]))
    R2 = r2_score(dff["score"], dff["pred"])
    # spearmanr returns (correlation, p-value); only the correlation is kept.
    Sp = spearmanr(dff["score"], dff["pred"])[0]
    return MAE, RMSE, R2, Sp

In [15]:
# Read the tab-separated training set, featurise each SMILES, and keep only
# the rows whose fingerprint could be computed.
df = pd.read_csv("preprocess_data/src/src_train.csv", sep="\t")
df["ecfp"] = df["smiles"].map(ecfp_from_smiles)
df = df[df["ecfp"].notna()]

In [16]:
# Read the tab-separated test set, featurise each SMILES, and keep only the
# rows whose fingerprint could be computed.
df_test = pd.read_csv("preprocess_data/src/src_test.csv", sep="\t")
df_test["ecfp"] = df_test["smiles"].map(lambda x: ecfp_from_smiles(x))
# .copy() gives an independent frame: eval_model later assigns a "pred" column
# to it, which on a filtered slice triggers pandas' SettingWithCopyWarning.
df_test = df_test[~df_test["ecfp"].isna()].copy()

In [17]:
# Train on the featurised training frame for 100 epochs with batch size 32;
# train_step returns the path of the weight file it saved.
best_model_path = train_step(df, epochs=100, batch_size=32, device="cuda")

Epoch: 0, Train loss: 24.4, Test mae: 0.858, Test rmse: 1.1, Test r2: -0.631, Test sp: 0.354
Epoch: 1, Train loss: 1.77, Test mae: 0.793, Test rmse: 0.989, Test r2: -0.323, Test sp: 0.511
Epoch: 2, Train loss: 1.38, Test mae: 0.686, Test rmse: 0.888, Test r2: -0.0678, Test sp: 0.555
Epoch: 3, Train loss: 1.14, Test mae: 0.651, Test rmse: 0.844, Test r2: 0.0367, Test sp: 0.601
Epoch: 4, Train loss: 1.06, Test mae: 0.598, Test rmse: 0.773, Test r2: 0.191, Test sp: 0.608
Epoch: 5, Train loss: 0.929, Test mae: 0.602, Test rmse: 0.777, Test r2: 0.183, Test sp: 0.632
Epoch: 6, Train loss: 0.847, Test mae: 0.575, Test rmse: 0.74, Test r2: 0.26, Test sp: 0.638
Epoch: 7, Train loss: 0.818, Test mae: 0.557, Test rmse: 0.719, Test r2: 0.301, Test sp: 0.656
Epoch: 8, Train loss: 0.761, Test mae: 0.565, Test rmse: 0.725, Test r2: 0.289, Test sp: 0.645
Epoch: 9, Train loss: 0.733, Test mae: 0.666, Test rmse: 0.845, Test r2: 0.034, Test sp: 0.651
Epoch: 10, Train loss: 0.71, Test mae: 0.572, Test rms

In [18]:
# Evaluate the saved weights on the separate test frame.
MAE, RMSE, R2, Sp = eval_model(df_test, best_model_path)

In [19]:
# Test-set metrics, in order: MAE, RMSE, R2, Spearman correlation.
print(MAE, RMSE, R2, Sp)

0.5090402475468345 0.6507218383195484 0.4302843930050321 0.6856614069698486
