In [None]:
!pip install datamol
!pip install rdkit-pypi
!pip install pandas

Collecting datamol
  Downloading datamol-0.11.3-py3-none-any.whl (381 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/381.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/381.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.7/381.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting loguru (from datamol)
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting selfies (from datamol)
  Downloading selfies-2.1.1-py3-none-any.whl (35 kB)
Installing collected packages: selfies, loguru, datamol
Successfully installed datamol-0.11.3 loguru-0.7.2 selfies-2.1.1
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Linear
from torch.utils.data import DataLoader
import tqdm
from scipy.stats import spearmanr

In [None]:
def ecfp_from_smiles(smiles,
                     R = 2,
                     L = 2**11,
                     use_features = True,
                     use_chirality = False):
    """
    Compute a Morgan (ECFP-style) bit-vector fingerprint for one compound.

    Inputs:

    - smiles ... SMILES string of input compound
    - R ... maximum radius of circular substructures
    - L ... fingerprint-length
    - use_features ... if false then use standard DAYLIGHT atom features, if true then use pharmacophoric atom features
    - use_chirality ... if true then append tetrahedral chirality flags to atom features

    Outputs:
    - np.array(feature_list) ... ECFP with length L and maximum radius R
    - np.nan ... when `smiles` is not a string or RDKit cannot build a molecule from it
    """

    # Missing values read from a CSV arrive as float NaN / None. Handing those
    # to RDKit raises instead of yielding a falsy molecule, so treat them the
    # same way as an unparsable SMILES and let callers filter on isna().
    if not isinstance(smiles, str):
        return np.nan

    molecule = Chem.MolFromSmiles(smiles)
    if not molecule:
        return np.nan
    feature_list = AllChem.GetMorganFingerprintAsBitVect(molecule,
                                                         radius = R,
                                                         nBits = L,
                                                         useFeatures = use_features,
                                                         useChirality = use_chirality)
    return np.array(feature_list)

In [None]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
import os
# Every relative path used further down (preprocess_data/..., models/..., the
# results CSV) is resolved against this project folder.
os.chdir("/content/drive/MyDrive/Project/ALVS")

Mounted at /content/drive


In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each fingerprint with its score.

    `fps` and `scores` are indexable sequences; the dataset length is taken
    from `scores`.
    """

    def __init__(self, fps, scores):
        self.fps, self.scores = fps, scores
        self.dataLen = len(scores)

    def __len__(self):
        return self.dataLen

    def __getitem__(self, index):
        # The score becomes a length-1 array so a collated batch has shape
        # (batch, 1), matching the single output unit of the network.
        fingerprint = np.array(self.fps[index])
        target = np.array(self.scores[index]).reshape((1,))
        return fingerprint, target

In [None]:
class DNN(nn.Module):
    """Fully connected regressor: in_features -> 1024 -> 516 -> 256 -> 128 -> 1.

    Every hidden layer is followed by ReLU and dropout; the output layer is
    linear and produces one value per input row.

    Parameters
    ----------
    in_features : int, default 2048
        Length of the input vector.
    dropout : float, default 0.2
        Drop probability used by all four dropout layers.

    Layer attribute names (fc1..fc5) and hidden widths are unchanged, so
    state dicts saved from the default configuration still load.
    """

    def __init__(self, in_features=2048, dropout=0.2):
        super().__init__()
        self.fc1 = Linear(in_features, 1024)
        self.dropout1 = nn.Dropout(p=dropout)
        # NOTE(review): 516 may have been meant as 512; it is kept because
        # changing it would make existing checkpoints incompatible.
        self.fc2 = Linear(1024, 516)
        self.dropout2 = nn.Dropout(p=dropout)
        self.fc3 = Linear(516, 256)
        self.dropout3 = nn.Dropout(p=dropout)
        self.fc4 = Linear(256, 128)
        self.dropout4 = nn.Dropout(p=dropout)
        self.fc5 = Linear(128, 1)


    def forward(self, data):
        """Return the network output of shape (..., 1) for `data` of shape (..., in_features)."""
        x = F.relu(self.fc1(data))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = F.relu(self.fc3(x))
        x = self.dropout3(x)
        x = F.relu(self.fc4(x))
        x = self.dropout4(x)
        x = self.fc5(x)
        return x

In [None]:
def train_epoch(loader, model, optimizer, device):
    """Run one optimisation pass over `loader` and return the mean batch loss.

    Each batch is cast to float, moved to `device`, and scored with MSE
    against its targets. The returned value is the unweighted average of the
    per-batch losses; an empty loader raises ZeroDivisionError.
    """
    model.train()

    running_loss = 0
    num_batches = 0
    for num_batches, (batch_fps, batch_scores) in enumerate(loader, start=1):
        inputs = batch_fps.float().to(device)
        targets = batch_scores.float().to(device)

        optimizer.zero_grad()
        batch_loss = F.mse_loss(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        # The loss value was fixed at forward time, so reading it after the
        # parameter update gives the same number.
        running_loss += batch_loss.item()
    return running_loss / num_batches

In [None]:
def test_epoch(loader, model,  device):
    model.eval()

    MSE, MAE = 0, 0
    trues, preds = [], []
    with torch.no_grad():
        for fps, scores in loader:
            fps = fps.float().to(device)
            scores = scores.float().to(device)

            output = model(fps)
            pred = output.cpu().squeeze().numpy().tolist()
            true = scores.cpu().squeeze().numpy().tolist()

            trues.extend(true)
            preds.extend(pred)
    MAE = mean_absolute_error(trues, preds)
    RMSE = np.sqrt(mean_squared_error(trues, preds))
    R2 = r2_score(trues, preds)
    Sp = spearmanr(trues, preds)[0]
    return MAE, RMSE, R2, Sp

In [None]:
def rand_float(l, h):
    """Draw one random float from the interval starting at `l` with width `h - l`.

    Returns None when `l` is greater than `h`. Otherwise a single value is
    drawn from NumPy's global random state, scaled by the width and shifted
    by the lower bound.
    """
    if l > h:
        return None

    span = h - l
    offset = h - span  # lower bound, recomputed from h as in the original arithmetic
    values = (np.random.rand(1) * span + offset).tolist()
    return np.array(values)[0]

In [None]:
def init_model(device):
    """Create a fresh DNN on `device` together with an Adam optimizer.

    Parameters
    ----------
    device : str, torch.device or None
        Where to place the model. A CUDA request on a machine without CUDA,
        or None, falls back to automatic selection (CUDA if available, else
        CPU).

    Returns
    -------
    (model, optimizer)
        The optimizer's learning rate is drawn by rand_float between 0.0001
        and 0.0008, so every call gets a different rate.
    """
    # The argument used to be overwritten unconditionally, so an explicit
    # "cpu" request still put the model on the GPU while the training loop
    # moved the batches to the CPU.
    target = torch.device(device) if device is not None else None
    if target is None or (target.type == "cuda" and not torch.cuda.is_available()):
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = DNN().to(target)
    lr = rand_float(l=0.0001, h=0.0008)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    return model, optimizer

In [None]:
def prepare_dataloader(dff, batch_size, test_size=0.8):
    """Split `dff` into training and validation DataLoaders.

    The "ecfp" column supplies the inputs and the "score" column the targets.
    `test_size` is forwarded to train_test_split, so it is the fraction of
    rows that goes to the validation loader.

    NOTE(review): with the default of 0.8, 80% of the rows are used for
    validation and only 20% for training - confirm this is intended.

    Returns (train_loader, valid_loader); only the training loader shuffles.
    """
    split = train_test_split(dff["ecfp"], dff["score"], test_size=test_size)
    train_fps, valid_fps, train_scores, valid_scores = split

    def _make_loader(fps, scores, shuffle):
        # Series are materialised as lists so Dataset can index them by position.
        dataset = Dataset(fps=list(fps), scores=list(scores))
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    train_loader = _make_loader(train_fps, train_scores, True)
    valid_loader = _make_loader(valid_fps, valid_scores, False)
    return train_loader, valid_loader

In [None]:
def train_step(dff, epochs, batch_size, step_time, model_index, device="cuda"):
    """Train one ensemble member and return the path of its best checkpoint.

    A new model and optimizer are created, `dff` is split into train and
    validation loaders, and the model is trained for `epochs` epochs. Whenever
    the validation RMSE is at least as good as the best seen so far, the
    weights are written to
    models/dpp4/ecfp/models_ensemble/<step_time>/weight_<model_index>_<epoch>.pth.
    Earlier checkpoints are not removed, so one file per improving epoch stays
    on disk.

    Returns
    -------
    str
        Path of the last checkpoint written, i.e. the best validation RMSE.
        If no epoch qualified (epochs == 0, or the RMSE was NaN every epoch),
        the current weights are saved as weight_<model_index>_final.pth and
        that path is returned.
    """
    model, optimizer = init_model(device=device)
    train_loader, valid_loader = prepare_dataloader(dff,  batch_size=batch_size)

    model_folder = "models/dpp4/ecfp/models_ensemble/{}".format(step_time)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(model_folder, exist_ok=True)

    hist = {"train-loss":[], "test-mae":[], "test-rmse":[], "test-r2":[], "test-sp":[]}
    weight_path = None
    best_rmse = float("inf")
    for epoch in range(epochs):
        train_loss = train_epoch(train_loader, model, optimizer, device)
        test_mae, test_rmse, test_r2, test_sp = test_epoch(valid_loader, model, device)
        hist["train-loss"].append(train_loss)
        hist["test-mae"].append(test_mae)
        hist["test-rmse"].append(test_rmse)
        hist["test-r2"].append(test_r2)
        hist["test-sp"].append(test_sp)

        # Same rule as comparing against min(hist["test-rmse"]), without
        # rescanning the history each epoch. A NaN RMSE never qualifies.
        if test_rmse <= best_rmse:
            best_rmse = test_rmse
            weight_path = os.path.join(model_folder, "weight_{}_{}.pth".format(model_index, epoch))
            torch.save(model.state_dict(), weight_path)

    if weight_path is None:
        # No checkpoint was written. Save the current weights so the caller
        # gets a loadable path instead of an UnboundLocalError.
        weight_path = os.path.join(model_folder, "weight_{}_final.pth".format(model_index))
        torch.save(model.state_dict(), weight_path)

    print("---------------------------------\nmodel index: {}, validation min rmse: {}\n---------------------------------\n".format(model_index, best_rmse))
    return weight_path

In [None]:
def train_ensemble(dff, epochs, batch_size, step_time, num_models, device="cuda"):
    """Train `num_models` independent models on `dff` and return their checkpoint paths.

    Each member is trained by train_step with model indices 0..num_models-1.
    `device` is forwarded to every member; previously the literal "cuda" was
    passed regardless of this argument.

    Returns a list with one checkpoint path per model, in index order.
    """
    return [
        train_step(dff=dff,
                   epochs=epochs,
                   batch_size=batch_size,
                   step_time=step_time,
                   model_index=midx,
                   device=device)
        for midx in range(num_models)
    ]

In [None]:
def random_select(df, n_samples):
    """Randomly pick `n_samples` rows and return (picked, rest).

    Rows are drawn with DataFrame.sample; `rest` holds every row of `df`
    whose "name" value is not among the picked names. Both results are
    independent copies.
    """
    picked = df.sample(n=n_samples).copy()
    picked_names = picked["name"].tolist()
    is_picked = df["name"].isin(picked_names)
    rest = df.loc[~is_picked].copy()
    return picked, rest

In [None]:
def load_model(best_model_path, device="cuda"):
    """Build a DNN on `device`, load the weights stored at `best_model_path`,
    and return it in evaluation mode."""
    net = DNN().to(device)
    state_dict = torch.load(best_model_path, map_location=device)
    net.load_state_dict(state_dict)
    net.eval()
    return net

In [None]:
def load_ensemble_model(best_model_paths, device="cuda"):
    """Load every checkpoint in `best_model_paths` and return the models as a list.

    `device` is forwarded to load_model; previously it was accepted but not
    passed on, so models always loaded onto load_model's default device.
    """
    return [load_model(best_model_path=model_path, device=device)
            for model_path in best_model_paths]

In [None]:
def predict_with_uncertainty(fpnp, models, device, times=10):
    """Predict one fingerprint with every model and summarise the spread.

    Parameters
    ----------
    fpnp : array-like
        A single fingerprint; it is flattened into one row of any length
        (the length is no longer fixed to 2048).
    models : iterable of torch modules
        Ensemble members; each must return a (1, 1) output for one row.
    device : str or torch.device
        Device the input row is moved to before calling the models.
    times : int
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    [mean, variance]
        Mean and population variance (np.var) of the per-model predictions.
    """
    row = np.asarray(fpnp).reshape(1, -1)
    fp = torch.from_numpy(row).float().to(device)
    predictions = []
    with torch.no_grad():
        for model in models:
            output = model(fp)
            predictions.append(output.cpu().numpy()[0][0])
    mean = np.mean(predictions)
    variance = np.var(predictions)
    return [mean, variance]

In [None]:
def uncertainty_select(dff, best_model_paths, n_samples, device="cuda"):
    """Pick the `n_samples` rows whose ensemble predictions disagree the most.

    Every fingerprint in the "ecfp" column is scored with the ensemble loaded
    from `best_model_paths`; rows are ranked by prediction variance, highest
    first.

    Parameters
    ----------
    dff : DataFrame
        Candidate pool with at least "name" and "ecfp" columns. It is not
        modified; scoring happens on a copy.
    best_model_paths : list of str
        Checkpoint paths of the ensemble members.
    n_samples : int
        Number of rows to select.
    device : str, default "cuda"
        Device used for loading the models and for prediction.

    Returns
    -------
    (dff_select, dff_remaining)
        `dff_select` holds the selected rows with the pool's own columns (the
        helper columns "mean_var", "mean" and "var" are dropped).
        `dff_remaining` holds all other rows, sorted by variance, and keeps
        the helper columns.
    """
    helper_cols = ("mean_var", "mean", "var")
    models = load_ensemble_model(best_model_paths, device=device)

    scored = dff.copy()
    scored["mean_var"] = scored["ecfp"].map(lambda x: predict_with_uncertainty(x, models, device=device))
    scored["mean"] = scored["mean_var"].map(lambda x: x[0])
    scored["var"] = scored["mean_var"].map(lambda x: x[1])
    dff_sort = scored.sort_values(by="var", ascending=False)

    # Select columns by name rather than by position (first four): with more
    # than three original columns the positional slice dropped "ecfp", and
    # the selected rows could then no longer be used for training.
    keep_cols = [col for col in dff_sort.columns if col not in helper_cols]
    dff_select = dff_sort.iloc[:n_samples][keep_cols].copy()

    sample_indexs = dff_select["name"].tolist()
    dff_remaining = dff_sort[~dff_sort["name"].isin(sample_indexs)].copy()
    return dff_select, dff_remaining

In [None]:
def predict(fpnp, models, device):
    """Return the ensemble-mean prediction for a single fingerprint.

    Parameters
    ----------
    fpnp : array-like
        A single fingerprint; it is flattened into one row of any length
        (the length is no longer fixed to 2048).
    models : iterable of torch modules
        Ensemble members; each must return a (1, 1) output for one row.
    device : str or torch.device
        Device the input row is moved to before calling the models.

    Returns
    -------
    The np.mean of the per-model predictions.
    """
    row = np.asarray(fpnp).reshape(1, -1)
    fp = torch.from_numpy(row).float().to(device)
    preds = []
    with torch.no_grad():
        for model in models:
            output = model(fp)
            preds.append(output.cpu().numpy()[0][0])
    return np.mean(preds)

In [None]:
def eval_model(dff, best_model_paths):
    """Score the ensemble stored at `best_model_paths` on `dff`.

    The ensemble-mean prediction for every row's "ecfp" value is written to
    a "pred" column on `dff` itself (the frame is modified in place) and
    compared with the "score" column.

    Returns (MAE, RMSE, R2, Spearman correlation).
    """
    ensemble = load_ensemble_model(best_model_paths)
    dff["pred"] = dff["ecfp"].map(lambda fp: predict(fp, ensemble, device="cuda"))

    y_true, y_pred = dff["score"], dff["pred"]
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    rho = spearmanr(y_true, y_pred)[0]
    return mae, rmse, r2, rho

In [None]:
def run_step(dfs, epochs, stime, batch_size=32, num_models=5):
    """Train one ensemble on a copy of `dfs` and return its checkpoint paths.

    Parameters
    ----------
    dfs : DataFrame
        Labelled samples to train on; a copy is handed to the trainer.
    epochs : int
        Training epochs per ensemble member.
    stime : int
        Iteration index, forwarded as `step_time`.
    batch_size : int, default 32
        Mini-batch size (previously hard-coded).
    num_models : int, default 5
        Number of ensemble members (previously hard-coded).
    """
    dfs_new = dfs.copy()

    best_model_paths = train_ensemble(dfs_new,
                                      epochs,
                                      batch_size=batch_size,
                                      step_time=stime,
                                      num_models=num_models)
    return best_model_paths

In [None]:
def run_iterations(df, df_test, epochs=200, iterations=100, n_samples=100,
                   results_path='ecfp_ensemble_results.csv'):
    """Iteratively grow the training pool, retrain the ensemble and evaluate it.

    Iteration 0 draws `n_samples` random rows from `df`. Every later iteration
    adds the `n_samples` rows of the remaining pool with the highest ensemble
    variance. After each selection an ensemble is trained on the accumulated
    rows and evaluated on `df_test`.

    Parameters
    ----------
    df : DataFrame
        Candidate pool with a "smiles" column. An "ecfp" column is added to
        this frame in place; rows whose fingerprint is missing are ignored.
    df_test : DataFrame
        Held-out frame passed to eval_model.
    epochs, iterations, n_samples : int
        Epochs per model, number of iterations, rows added per iteration.
    results_path : str
        CSV file receiving the RMSE / R2 / Sp history. It is rewritten after
        every iteration so an interrupted run keeps the metrics collected so
        far.

    Returns
    -------
    DataFrame
        One row per completed iteration with columns RMSE, R2 and Sp.
    """
    results = {"RMSE": [], "R2": [], "Sp": []}

    print("len(df):", len(df))

    df["ecfp"] = df["smiles"].map(lambda x: ecfp_from_smiles(x))
    df = df[~df["ecfp"].isna()]

    dfs, dfr, best_model_paths = None, None, None
    for stime in range(iterations):
        if stime < 1:
            dfs, dfr = random_select(df, n_samples)
        else:
            df_temp = dfs.copy()
            dfs, dfr = uncertainty_select(dfr, best_model_paths, n_samples)
            dfs = pd.concat([dfs, df_temp], axis=0)
            dfs = dfs.reset_index(drop=True)
        print("number of mol pool:", len(dfs))
        best_model_paths = run_step(dfs, epochs, stime)
        MAE, RMSE, R2, Sp = eval_model(df_test, best_model_paths)
        print("MAE:{}, RMSE:{}, R2:{}, Sp:{}, step: {}".format(MAE, RMSE, R2, Sp, stime))

        results["RMSE"].append(RMSE)
        results["R2"].append(R2)
        results["Sp"].append(Sp)

        # Persist progress now: a failure in a later iteration (for example
        # while saving a checkpoint) would otherwise lose every metric.
        pd.DataFrame(results).to_csv(results_path, index=False)

    # Convert results dictionary to DataFrame
    results_df = pd.DataFrame(results)

    results_df.to_csv(results_path, index=False)
    return results_df

In [None]:
# Load the tab-separated training pool and held-out test set (paths are
# relative to the project folder selected with os.chdir above).
df = pd.read_csv("preprocess_data/dpp4/dpp4_train.csv", sep="\t")
df_test = pd.read_csv("preprocess_data/dpp4/dpp4_test.csv", sep="\t")
df_test["ecfp"] = df_test["smiles"].map(lambda x: ecfp_from_smiles(x))
# Drop rows without a fingerprint. The explicit copy makes df_test an
# independent frame, so the df_test["pred"] assignment done later by
# eval_model does not write into a slice of the unfiltered frame.
df_test = df_test[~df_test["ecfp"].isna()].copy()

In [None]:
# Run the select / train / evaluate loop with its default settings
# (200 epochs per model, 100 iterations, 100 rows added per iteration).
run_iterations(df, df_test)

len(df): 24865
number of mol pool: 100
---------------------------------
model index: 0, validation min rmse: 1.4663361779790904
---------------------------------

---------------------------------
model index: 1, validation min rmse: 1.267600302822738
---------------------------------

---------------------------------
model index: 2, validation min rmse: 1.4319162548535886
---------------------------------

---------------------------------
model index: 3, validation min rmse: 1.3672111510659817
---------------------------------

---------------------------------
model index: 4, validation min rmse: 1.4573718468322598
---------------------------------

MAE:0.9831650690948153, RMSE:1.2434486374992917, R2:-0.28469397555314013, Sp:0.5457433445503245, step: 0
number of mol pool: 200
---------------------------------
model index: 0, validation min rmse: 1.3326185096050625
---------------------------------

---------------------------------
model index: 1, validation min rmse: 1.2915851856

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-24-d7f2db78b0b6>", line 1, in <cell line: 1>
    run_iterations(df, df_test)
  File "<ipython-input-22-a3c64730747f>", line 18, in run_iterations
    best_model_path = run_step(dfs, epochs, stime)
  File "<ipython-input-21-bfab7243e2f3>", line 4, in run_step
    best_model_paths = train_ensemble(dfs_new, epochs, batch_size=32, step_time=stime, num_models=5)
  File "<ipython-input-13-0c2f5a1c5fe5>", line 4, in train_ensemble
    path = train_step(dff=dff, epochs=epochs, batch_size=batch_size, step_time=step_time, model_index=midx, device="cuda")
  File "<ipython-input-12-ffb0ccb90b5e>", line 21, in train_step
    torch.save(model.state_dict(), weight_path)
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 440, in save
    with _open_zipfile_writ