In [None]:
!pip install datamol
!pip install rdkit-pypi
!pip install pandas

Collecting datamol
  Downloading datamol-0.11.3-py3-none-any.whl (381 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/381.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/381.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.7/381.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting loguru (from datamol)
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting selfies (from datamol)
  Downloading selfies-2.1.1-py3-none-any.whl (35 kB)
Installing collected packages: selfies, loguru, datamol
Successfully installed datamol-0.11.3 loguru-0.7.2 selfies-2.1.1
Collecting rdkit-pypi
  

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Linear
from torch.utils.data import DataLoader
import tqdm
from scipy.stats import spearmanr

In [None]:
def ecfp_from_smiles(smiles, R=2, L=2**11, use_features=True, use_chirality=False):
    """
    Turn a SMILES string into a Morgan (circular) fingerprint bit array.

    Inputs:

    - smiles ... SMILES string of the compound to encode
    - R ... largest radius of the circular substructures
    - L ... number of bits in the fingerprint
    - use_features ... False selects the standard DAYLIGHT atom features,
      True selects pharmacophoric atom features
    - use_chirality ... True adds tetrahedral chirality flags to the atom features

    Outputs:
    - np.array of length L holding the fingerprint bits, or np.nan when
      the SMILES string could not be turned into a molecule
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(
            mol,
            radius=R,
            nBits=L,
            useFeatures=use_features,
            useChirality=use_chirality,
        )
        return np.array(bit_vect)

    # No molecule came back from the parser: signal "missing" to the caller.
    return np.nan

In [None]:
# Colab-only setup: mount Google Drive and make the project folder the working
# directory, so the relative paths used further down ("preprocess_data/...",
# "models/...") resolve inside it.
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/Project/ALVS")

Mounted at /content/drive


In [None]:
class Dataset(torch.utils.data.Dataset):
    """Index-aligned pairs of fingerprints and scores for use with a DataLoader.

    `fps` and `scores` are indexable sequences of equal length; item `i`
    is returned as `(np.array(fps[i]), np.array(scores[i]) reshaped to (1,))`.
    """

    def __init__(self, fps, scores):
        self.fps, self.scores = fps, scores
        # Length is taken from the scores sequence.
        self.dataLen = len(scores)

    def __len__(self):
        return self.dataLen

    def __getitem__(self, index):
        fingerprint = np.array(self.fps[index])
        # Give the target an explicit length-1 axis.
        target = np.reshape(np.array(self.scores[index]), (1,))
        return fingerprint, target

In [None]:
class DNN(nn.Module):
    """Fully connected regressor: in_features -> 1024 -> 516 -> 256 -> 128 -> 1.

    Every hidden layer is followed by ReLU and dropout; the last layer is a
    plain linear output with a single value per input row.

    Args:
        in_features: width of the input vector (default 2048, the previously
            hard-coded value).
        dropout_p: dropout probability used after every hidden layer
            (default 0.5, the previously hard-coded value).
    """

    def __init__(self, in_features=2048, dropout_p=0.5):
        super().__init__()
        # Attribute names (fc1..fc5) are part of the state_dict keys and must
        # stay as they are so existing checkpoints keep loading.
        self.fc1 = Linear(in_features, 1024)
        self.dropout1 = nn.Dropout(p=dropout_p)
        # NOTE(review): 516 looks like it may have been meant as 512; it is
        # kept unchanged because saved weights depend on this shape.
        self.fc2 = Linear(1024, 516)
        self.dropout2 = nn.Dropout(p=dropout_p)
        self.fc3 = Linear(516, 256)
        self.dropout3 = nn.Dropout(p=dropout_p)
        self.fc4 = Linear(256, 128)
        self.dropout4 = nn.Dropout(p=dropout_p)
        self.fc5 = Linear(128, 1)

    def forward(self, data):
        """Map a (batch, in_features) float tensor to a (batch, 1) prediction."""
        hidden_stack = (
            (self.fc1, self.dropout1),
            (self.fc2, self.dropout2),
            (self.fc3, self.dropout3),
            (self.fc4, self.dropout4),
        )
        x = data
        for linear, dropout in hidden_stack:
            x = dropout(F.relu(linear(x)))
        return self.fc5(x)

In [None]:
def train_epoch(loader, model, optimizer, device):
    """Run one optimisation pass over `loader` and return the mean batch MSE.

    The model is switched to training mode. Each batch of (fingerprints,
    scores) is cast to float, moved to `device`, and used for one optimizer
    step on the MSE loss. The returned value is the unweighted average of the
    per-batch losses.
    """
    model.train()

    batch_losses = []
    for batch_fps, batch_scores in loader:
        inputs = batch_fps.float().to(device)
        targets = batch_scores.float().to(device)

        optimizer.zero_grad()
        batch_loss = F.mse_loss(model(inputs), targets)
        batch_loss.backward()

        batch_losses.append(batch_loss.item())
        optimizer.step()

    return sum(batch_losses) / len(batch_losses)

In [None]:
def test_epoch(loader, model,  device):
    model.eval()

    MSE, MAE = 0, 0
    trues, preds = [], []
    with torch.no_grad():
        for fps, scores in loader:
            fps = fps.float().to(device)
            scores = scores.float().to(device)

            output = model(fps)
            pred = output.cpu().squeeze().numpy().tolist()
            true = scores.cpu().squeeze().numpy().tolist()

            trues.extend(true)
            preds.extend(pred)
    MAE = mean_absolute_error(trues, preds)
    RMSE = np.sqrt(mean_squared_error(trues, preds))
    R2 = r2_score(trues, preds)
    Sp = spearmanr(trues, preds)[0]
    return MAE, RMSE, R2, Sp

In [None]:
def init_model(device, lr=0.0001):
    """Create a fresh DNN plus an Adam optimizer for it.

    Args:
        device: where to place the model (string or torch.device). `None`
            means "CUDA if available, otherwise CPU". A CUDA request on a host
            without CUDA falls back to CPU instead of failing.
        lr: Adam learning rate.

    Returns:
        (model, optimizer)
    """
    # The argument used to be discarded and replaced by auto-detection, so a
    # caller asking for "cpu" on a GPU host got a CUDA model anyway.
    if device is None:
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        target = torch.device(device)
        if target.type == "cuda" and not torch.cuda.is_available():
            target = torch.device("cpu")

    model = DNN().to(target)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    return model, optimizer

In [None]:
def prepare_dataloader(dff, batch_size, test_size=0.8, random_state=None):
    """Split `dff` into train/validation sets and wrap both in DataLoaders.

    Args:
        dff: DataFrame with an "ecfp" column (fingerprints) and a "score"
            column (targets).
        batch_size: batch size for both loaders.
        test_size: forwarded to train_test_split; the share of rows that goes
            to the validation loader.
            NOTE(review): the 0.8 default leaves only 20% of the rows for
            training — confirm that this is intended.
        random_state: optional seed forwarded to train_test_split so the split
            can be reproduced. `None` keeps the previous unseeded behaviour.

    Returns:
        (train_loader, valid_loader); only the training loader shuffles.
    """
    train_fps, valid_fps, train_scores, valid_scores = train_test_split(
        dff["ecfp"],
        dff["score"],
        test_size=test_size,
        random_state=random_state,
    )

    train_dataset = Dataset(fps=list(train_fps), scores=list(train_scores))
    valid_dataset = Dataset(fps=list(valid_fps), scores=list(valid_scores))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, valid_loader

In [None]:
def train_step(dff, epochs, batch_size, step_time, device="cuda"):
    """Train a fresh model on `dff` and return the path of its best checkpoint.

    A new model/optimizer pair is created, `dff` is split into train and
    validation loaders, and the model is trained for `epochs` epochs. Whenever
    the validation RMSE is at least as good as the best seen so far, the
    weights are saved to
    "models/dpp4/ecfp/models_dropout/<step_time>/weight_<epoch>.pth".

    Args:
        dff: DataFrame with "ecfp" and "score" columns.
        epochs: number of training epochs; must be at least 1.
        batch_size: batch size for both loaders.
        step_time: identifier of the current round, used as sub-folder name.
        device: device requested for the model.

    Returns:
        Path of the most recently saved (i.e. best-RMSE) checkpoint.

    Raises:
        ValueError: if `epochs` is smaller than 1.
        RuntimeError: if no epoch produced a checkpoint.
    """
    if epochs < 1:
        # With zero epochs nothing is ever saved and there is no path to return.
        raise ValueError("epochs must be >= 1, got {}".format(epochs))

    model, optimizer = init_model(device=device)
    # Move batches to wherever the model actually lives, so a "cuda" request
    # on a CPU-only host cannot cause a model/input device mismatch.
    device = next(model.parameters()).device
    train_loader, valid_loader = prepare_dataloader(dff,  batch_size=batch_size)

    model_folder = "models/dpp4/ecfp/models_dropout/{}".format(step_time)
    os.makedirs(model_folder, exist_ok=True)

    hist = {"train-loss":[], "test-mae":[], "test-rmse":[], "test-r2":[], "test-sp":[]}
    best_rmse = float("inf")
    weight_path = None
    for epoch in range(epochs):
        train_loss = train_epoch(train_loader, model, optimizer, device)
        test_mae, test_rmse, test_r2, test_sp = test_epoch(valid_loader, model, device)
        hist["train-loss"].append(train_loss)
        hist["test-mae"].append(test_mae)
        hist["test-rmse"].append(test_rmse)
        hist["test-r2"].append(test_r2)
        hist["test-sp"].append(test_sp)

        # "<=" keeps the original tie rule: a later epoch that matches the
        # best RMSE replaces the earlier one as the returned checkpoint.
        if test_rmse <= best_rmse:
            best_rmse = test_rmse
            weight_path = os.path.join(model_folder, "weight_{}.pth".format(epoch))
            torch.save(model.state_dict(), weight_path)

    print("---------------------------------\nvalidation min rmse: {}\n---------------------------------\n".format(min(hist["test-rmse"])))
    if weight_path is None:
        raise RuntimeError("no checkpoint was written; validation RMSE was never comparable")
    return weight_path

In [None]:
def random_select(df, n_samples):
    """Draw `n_samples` random rows from `df`.

    Returns a pair of independent copies: the sampled rows, and every row of
    `df` whose "name" value is not among the sampled names.
    """
    picked = df.sample(n=n_samples).copy()
    picked_names = picked["name"].tolist()
    was_picked = df["name"].isin(picked_names)
    leftover = df[~was_picked].copy()
    return picked, leftover

In [None]:
def enable_dropout(model):
    """ Put every dropout layer of `model` back into training mode.

    A layer counts as dropout when its class name starts with 'Dropout';
    all other sub-modules are left untouched.
    """
    dropout_layers = [
        layer for layer in model.modules()
        if layer.__class__.__name__.startswith('Dropout')
    ]
    for layer in dropout_layers:
        layer.train()

In [None]:
def load_model(best_model_path, device="cuda", enable_dp=True):
    """Rebuild a DNN from a saved state_dict and prepare it for inference.

    Args:
        best_model_path: path of a checkpoint written with torch.save.
        device: device requested for the model. A CUDA request on a host
            without CUDA falls back to CPU instead of raising.
        enable_dp: when True, dropout layers are switched back to training
            mode after `model.eval()`, so repeated forward passes differ.

    Returns:
        The loaded model, in eval mode (with dropout re-enabled if requested).
    """
    target = torch.device(device)
    if target.type == "cuda" and not torch.cuda.is_available():
        # The "cuda" default would otherwise make this unusable on CPU hosts.
        target = torch.device("cpu")

    model = DNN().to(target)
    model.load_state_dict(torch.load(best_model_path, map_location=target))
    model.eval()
    if enable_dp:
        enable_dropout(model)
    return model

In [None]:
def predict_with_uncertainty(fpnp, model, device, times=5):
    """Run `model` on one fingerprint `times` times and summarise the outputs.

    `fpnp` is a NumPy array that is reshaped to (1, 2048), cast to float and
    moved to `device`. The forward pass is repeated without gradients, and the
    result is `[mean, variance]` of the collected scalar predictions.
    """
    batch = torch.from_numpy(fpnp.reshape(1, 2048)).float().to(device)

    with torch.no_grad():
        samples = [model(batch).cpu().numpy()[0][0] for _ in range(times)]

    return [np.mean(samples), np.var(samples)]

In [None]:
def uncertainty_select(dff, best_model_path, n_samples):
    """Pick the `n_samples` rows of `dff` with the highest predictive variance.

    The checkpoint is loaded with dropout left active, every fingerprint in
    the "ecfp" column is scored with predict_with_uncertainty, and rows are
    ranked by the resulting variance (highest first).

    Args:
        dff: candidate pool with at least "name" and "ecfp" columns. It is
            not modified.
        best_model_path: checkpoint to load.
        n_samples: number of rows to select.

    Returns:
        (dff_select, dff_remaining). `dff_select` holds the selected rows
        without the helper columns "mean_var", "mean" and "var".
        `dff_remaining` holds all other rows, sorted by variance, and keeps
        those helper columns.
    """
    # Use one device for model and inputs; the hard-coded "cuda" made this
    # unusable on CPU-only hosts.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = load_model(best_model_path, device=device, enable_dp=True)

    helper_cols = ["mean_var", "mean", "var"]
    # Work on a copy so the caller's frame does not silently gain columns.
    scored = dff.copy()
    scored["mean_var"] = scored["ecfp"].map(
        lambda x: predict_with_uncertainty(x, model, device=device))
    scored["mean"] = scored["mean_var"].map(lambda x: x[0])
    scored["var"] = scored["mean_var"].map(lambda x: x[1])

    dff_sort = scored.sort_values(by="var", ascending=False)
    # Drop the helper columns by name. The old positional `:4` slice only
    # worked when the frame had exactly four leading data columns and would
    # otherwise cut real columns (possibly "ecfp") from the selection.
    dff_select = dff_sort.iloc[:n_samples].drop(columns=helper_cols).copy()

    sample_indexs = dff_select["name"].tolist()
    dff_remaining = dff_sort[~dff_sort["name"].isin(sample_indexs)].copy()
    return dff_select, dff_remaining

In [None]:
def predict(fpnp, model, device):
    """Return the model's scalar prediction for a single fingerprint.

    `fpnp` is a NumPy array that is reshaped to (1, 2048), cast to float and
    moved to `device`; the forward pass runs without gradient tracking.
    """
    batch = torch.from_numpy(fpnp.reshape(1, 2048)).float().to(device)
    with torch.no_grad():
        prediction = model(batch).cpu().numpy()[0][0]
    return prediction

In [None]:
def eval_model(dff, best_model_path):
    """Score a saved model on `dff` and return (MAE, RMSE, R2, Spearman).

    The checkpoint is loaded with dropout disabled, each fingerprint in the
    "ecfp" column is predicted, and the predictions are compared with the
    "score" column.

    Side effect: the predictions are written to `dff["pred"]`, overwriting any
    existing column of that name.
    """
    # Resolve one device for both the model and its inputs; the hard-coded
    # "cuda" failed on hosts without a GPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = load_model(best_model_path, device=device, enable_dp=False)

    dff["pred"] = dff["ecfp"].map(lambda x: predict(x, model, device=device))

    y_true, y_pred = dff["score"], dff["pred"]
    MAE = mean_absolute_error(y_true, y_pred)
    RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
    R2 = r2_score(y_true, y_pred)
    Sp = spearmanr(y_true, y_pred)[0]
    return MAE, RMSE, R2, Sp

In [None]:
def run_step(dfs, epochs, stime):
    """Train on a copy of `dfs` for one round and return the best checkpoint path.

    Delegates to train_step with a fixed batch size of 32, using `stime` as
    the round identifier.
    """
    return train_step(dfs.copy(), epochs, batch_size=32, step_time=stime)

In [None]:
def run_iterations(df, df_test, epochs=200, iterations=100, n_samples=100):
    """Run the uncertainty-driven active-learning loop and record test metrics.

    Round 0 trains on `n_samples` randomly chosen molecules. Every later round
    adds the `n_samples` remaining molecules with the highest dropout
    variance under the previous round's best model, retrains from scratch and
    evaluates on `df_test`.

    Args:
        df: training pool with "name", "smiles" and "score" columns. It is not
            modified; fingerprints are computed on an internal copy.
        df_test: held-out frame with "ecfp" and "score" columns (eval_model
            adds a "pred" column to it).
        epochs: training epochs per round.
        iterations: maximum number of rounds.
        n_samples: molecules added per round.

    Returns:
        DataFrame with one row per completed round and the columns "RMSE",
        "R2" and "Sp". The same table is written to
        'ecfp_dropout_results.csv' after every round, so an interrupted run
        still leaves the results obtained so far on disk.
    """
    results = {"RMSE": [], "R2": [], "Sp": []}

    print("len(df):", len(df))

    # assign() builds a new frame, leaving the caller's `df` without an extra
    # "ecfp" column; rows whose SMILES could not be parsed (NaN) are dropped.
    pool = df.assign(ecfp=df["smiles"].map(ecfp_from_smiles))
    pool = pool[pool["ecfp"].notna()]

    dfs = dfr = best_model_path = None
    for stime in range(iterations):
        if stime < 1:
            dfs, dfr = random_select(pool, n_samples)
        else:
            if len(dfr) == 0:
                # Nothing left to acquire: further rounds would only retrain
                # on the same data.
                print("candidate pool exhausted, stopping before step:", stime)
                break
            labelled_so_far = dfs
            dfs, dfr = uncertainty_select(dfr, best_model_path, n_samples)
            dfs = pd.concat([dfs, labelled_so_far], axis=0).reset_index(drop=True)
        print("number of mol pool:", len(dfs))
        best_model_path = run_step(dfs, epochs, stime)
        MAE, RMSE, R2, Sp = eval_model(df_test, best_model_path)
        print("MAE:{}, RMSE:{}, R2:{}, Sp:{}, step:{}".format(MAE, RMSE, R2, Sp, stime))

        results["RMSE"].append(RMSE)
        results["R2"].append(R2)
        results["Sp"].append(Sp)

        # Checkpoint the metrics each round; a full run is long and used to
        # lose everything if it was interrupted before the final write.
        pd.DataFrame(results).to_csv('ecfp_dropout_results.csv', index=False)

    # Convert results dictionary to DataFrame
    results_df = pd.DataFrame(results)

    results_df.to_csv('ecfp_dropout_results.csv', index=False)
    return results_df


In [None]:
# Load the tab-separated train and test tables for the DPP4 target.
df = pd.read_csv("preprocess_data/dpp4/dpp4_train.csv", sep="\t")
df_test = pd.read_csv("preprocess_data/dpp4/dpp4_test.csv", sep="\t")

# Fingerprint the test molecules once up front and drop the rows whose SMILES
# could not be parsed (ecfp_from_smiles returns NaN for those).
df_test["ecfp"] = df_test["smiles"].map(ecfp_from_smiles)
# .copy() detaches the filtered frame from the original, so the "pred" column
# that eval_model writes into it later does not raise SettingWithCopyWarning.
df_test = df_test[df_test["ecfp"].notna()].copy()

In [22]:
# Run the active-learning loop with its default settings
# (epochs=200, iterations=100, n_samples=100).
run_iterations(df, df_test)

len(df): 24865
number of mol pool: 100
---------------------------------
validation min rmse: 1.383199298025133
---------------------------------

MAE:1.0603895248130804, RMSE:1.3383103186723486, R2:-0.48818745390980633, Sp:0.5417137174281076, step:0
number of mol pool: 200
---------------------------------
validation min rmse: 1.2148833875775444
---------------------------------

MAE:1.0716670790571619, RMSE:1.358009077999684, R2:-0.5323195135137317, Sp:0.5699087830728118, step:1
number of mol pool: 300
---------------------------------
validation min rmse: 1.316134035632068
---------------------------------

MAE:1.1822090062742527, RMSE:1.4796357220828098, R2:-0.8190875494621062, Sp:0.5750198577000786, step:2
number of mol pool: 400
---------------------------------
validation min rmse: 1.0358342024124383
---------------------------------

MAE:1.084862570673553, RMSE:1.3641204620437248, R2:-0.5461421965795794, Sp:0.6149513377694065, step:3
number of mol pool: 500
--------------------