In [None]:
# Install the third-party packages this notebook needs (versions are not pinned).
# NOTE(review): datamol is not imported in any cell visible here — confirm it is still required.
!pip install datamol
!pip install rdkit-pypi
!pip install pandas



In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Linear
from torch.utils.data import DataLoader
import tqdm

# Load data

In [None]:
def ecfp_from_smiles(smiles,
                     R = 3,
                     L = 2**11,
                     use_features = True,
                     use_chirality = False):
    """
    Turn a SMILES string into a fixed-length Morgan (ECFP-style) bit fingerprint.

    Inputs:

    - smiles ... SMILES string of the input compound
    - R ... maximum radius of the circular substructures
    - L ... number of bits in the fingerprint
    - use_features ... False -> standard DAYLIGHT atom features,
                       True -> pharmacophoric atom features
    - use_chirality ... True -> tetrahedral chirality flags are added to the atom features

    Outputs:
    - np.array of length L built from the bit vector, or np.nan when RDKit
      does not return a usable molecule for the SMILES string
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fingerprint_options = dict(radius = R,
                                   nBits = L,
                                   useFeatures = use_features,
                                   useChirality = use_chirality)
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, **fingerprint_options)
        return np.array(bit_vector)

    # Parsing failed: return NaN so callers can filter the row out with isna().
    return np.nan

In [None]:
# Mount Google Drive (Colab-only) and switch the working directory to the
# project folder, so the relative paths used in later cells (the CSV file and
# the "models/" checkpoint folder) resolve inside it.
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/Project/ALVS")

Mounted at /content/drive


In [None]:
# Tab-separated table for the aa2ar target (a DUD-E target per the original note).
# Later cells read its "name", "smiles" and "score" columns.
df = pd.read_csv("aa2ar_processed.csv", sep="\t")

In [None]:
# df["ecfp"] = df["smiles"].map(lambda x: ecfp_from_smiles(x))

In [None]:
# dff = df[~df["ecfp"].isna()]

# ECFP Fingerprint DNN Regressor

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each fingerprint with its score.

    fps and scores are indexable sequences of equal length; item i is the
    tuple (fingerprint array, score array of shape (1,)).
    """

    def __init__(self, fps, scores):
        self.fps, self.scores = fps, scores
        # The length is taken from the scores sequence once, at construction.
        self.dataLen = len(scores)

    def __len__(self):
        return self.dataLen

    def __getitem__(self, index):
        fingerprint = np.array(self.fps[index])
        # Give the scalar target an explicit length-1 axis.
        target = np.reshape(np.array(self.scores[index]), (1,))
        return fingerprint, target

In [None]:
class DNN(nn.Module):
    """Fully connected regressor over 2048-wide input vectors.

    Layer widths are 2048 -> 1024 -> 516 -> 256 -> 128 -> 1. Every hidden
    layer is followed by ReLU and dropout (p=0.2); the output layer is linear.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in the same order and under the same attribute
        # names as before, so state_dict keys and seeded initialisation are
        # unchanged.
        self.fc1 = nn.Linear(2048, 1024)
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(1024, 516)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(516, 256)
        self.dropout3 = nn.Dropout(0.2)
        self.fc4 = nn.Linear(256, 128)
        self.dropout4 = nn.Dropout(0.2)
        self.fc5 = nn.Linear(128, 1)

    def forward(self, data):
        """Return predictions with a trailing axis of size 1 for a batch of inputs."""
        hidden_stack = (
            (self.fc1, self.dropout1),
            (self.fc2, self.dropout2),
            (self.fc3, self.dropout3),
            (self.fc4, self.dropout4),
        )
        activations = data
        for dense, drop in hidden_stack:
            activations = drop(F.relu(dense(activations)))
        return self.fc5(activations)

In [None]:
def train_epoch(loader, model, optimizer, device):
    """Run one optimisation pass over loader and return the mean batch MSE.

    loader yields (fingerprints, scores) batches. Each batch is cast to float,
    moved to device, and used for a single optimizer step. The returned value
    is the sum of per-batch losses divided by the number of batches.
    """
    model.train()

    running_loss = 0
    n_batches = 0
    for n_batches, (batch_fps, batch_scores) in enumerate(loader, start=1):
        inputs = batch_fps.float().to(device)
        targets = batch_scores.float().to(device)

        optimizer.zero_grad()
        batch_loss = F.mse_loss(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / n_batches

In [None]:
def test_epoch(loader, model, device):
    """Evaluate model on loader and return (MAE, RMSE, R2).

    loader yields (fingerprints, scores) batches. Predictions are computed in
    eval mode without gradient tracking, gathered across all batches, and
    scored once at the end with the sklearn metric functions.
    """
    model.eval()

    trues, preds = [], []
    with torch.no_grad():
        for fps, scores in loader:
            fps = fps.float().to(device)
            scores = scores.float().to(device)

            output = model(fps)
            # Flatten with reshape(-1) instead of squeeze(): squeeze() turns a
            # batch holding a single sample into a 0-d tensor, whose tolist()
            # is a bare float that list.extend() cannot iterate over.
            preds.extend(output.cpu().reshape(-1).tolist())
            trues.extend(scores.cpu().reshape(-1).tolist())

    MAE = mean_absolute_error(trues, preds)
    RMSE = np.sqrt(mean_squared_error(trues, preds))
    R2 = r2_score(trues, preds)
    return MAE, RMSE, R2

In [None]:
def init_model(device, lr=0.001):
    """Create a fresh DNN on the requested device together with its Adam optimizer.

    Parameters:
    - device ... device name or torch.device for the model; None means
                 "CUDA if available, otherwise CPU"
    - lr ... learning rate passed to Adam

    Returns:
    - (model, optimizer)

    The device argument used to be overwritten by an auto-detected device.
    It is now honoured, with a fallback to CPU when CUDA is requested but not
    available, so passing "cuda" behaves exactly as before.
    """
    cuda_available = torch.cuda.is_available()
    if device is None:
        device = "cuda" if cuda_available else "cpu"
    device = torch.device(device)
    if device.type == "cuda" and not cuda_available:
        device = torch.device("cpu")

    model = DNN().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    return model, optimizer

In [None]:
def prepare_dataloader(dff, batch_size, test_size=0.8):
    """Split dff into training and validation DataLoaders.

    Parameters:
    - dff ... frame with an "ecfp" column (inputs) and a "score" column (targets)
    - batch_size ... batch size used for both loaders
    - test_size ... forwarded to train_test_split; it sizes the validation
                    part, so the default 0.8 leaves 20% of rows for training

    Returns:
    - (train_loader, valid_loader); only the training loader shuffles
    """
    split = train_test_split(dff["ecfp"], dff["score"], test_size=test_size)
    train_fps, valid_fps, train_scores, valid_scores = split

    def _as_loader(fps, scores, shuffle):
        # Materialise the pandas slices as plain lists so that Dataset can
        # index them positionally.
        dataset = Dataset(fps=list(fps), scores=list(scores))
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    train_loader = _as_loader(train_fps, train_scores, True)
    valid_loader = _as_loader(valid_fps, valid_scores, False)
    return train_loader, valid_loader

In [None]:
def train_step(dff, epochs, batch_size, step_time, device="cuda"):
    """Train a fresh model on dff and checkpoint it whenever validation MAE improves.

    Parameters:
    - dff ... frame with "ecfp" and "score" columns
    - epochs ... number of training epochs
    - batch_size ... batch size for the training and validation loaders
    - step_time ... identifier of this round; checkpoints go to "models/<step_time>/"
    - device ... device requested for the model

    Returns:
    - path of the most recently saved (lowest validation MAE) checkpoint, or
      None when no checkpoint was written (e.g. epochs == 0)
    """
    model, optimizer = init_model(device=device)
    # Send batches to wherever the model actually lives. Using the requested
    # name directly crashes when "cuda" is asked for on a CPU-only machine
    # while the model was placed on the CPU.
    device = next(model.parameters()).device
    train_loader, valid_loader = prepare_dataloader(dff, batch_size=batch_size)

    model_folder = "models/{}".format(step_time)
    os.makedirs(model_folder, exist_ok=True)

    hist = {"train-loss": [], "test-mae": [], "test-rmse": [], "test-r2": []}
    weight_path = None  # stays None if no epoch ever produces a checkpoint
    best_mae = float("inf")
    for epoch in range(epochs):
        train_loss = train_epoch(train_loader, model, optimizer, device)
        test_mae, test_rmse, test_r2 = test_epoch(valid_loader, model, device)
        hist["train-loss"].append(train_loss)
        hist["test-mae"].append(test_mae)
        hist["test-rmse"].append(test_rmse)
        hist["test-r2"].append(test_r2)

        # "<=" keeps the original tie behaviour: an epoch that equals the best
        # MAE so far is also checkpointed.
        if test_mae <= best_mae:
            best_mae = test_mae
            weight_path = os.path.join(model_folder, "weight_{}.pth".format(epoch))
            torch.save(model.state_dict(), weight_path)

        print(f'Epoch: {epoch}, Train loss: {train_loss:.3}, Test mae: {test_mae:.3}, Test rmse: {test_rmse:.3}, Test r2: {test_r2:.3}')
    print("---------------------------------\nmin mae: {}\n---------------------------------\n".format(best_mae))
    return weight_path

In [None]:
# train_fps, valid_fps, train_scores, valid_scores = train_test_split(dff["ecfp"], dff["score"], test_size=0.8)

In [None]:
# train_dataset = Dataset(fps=list(train_fps), scores = list(train_scores))
# valid_dataset = Dataset(fps=list(valid_fps), scores = list(valid_scores))

In [None]:
# train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
# valid_dataloader = DataLoader(valid_dataset, batch_size=2, shuffle=True)

In [None]:
# train_step(train_dataloader, valid_dataloader, epochs=50, step_time=1)

In [None]:
def random_select(df, n_samples):
    """Randomly split df into a selected sample and the remaining pool.

    Parameters:
    - df ... frame with a "name" column used to identify rows
    - n_samples ... number of rows to draw; clamped to len(df) so a nearly
                    exhausted pool yields a smaller sample instead of raising

    Returns:
    - (df_select, df_remaining), both copies; df_remaining holds every row of
      df whose "name" does not occur in df_select
    """
    n_draw = min(n_samples, len(df))
    df_select = df.sample(n=n_draw).copy()
    selected_names = df_select["name"].tolist()
    df_remaining = df[~df["name"].isin(selected_names)].copy()
    return df_select, df_remaining

In [None]:
# dfs, dfr = random_select(dff, n_samples=50)

In [None]:
def run_step(dfs, epochs, stime):
    """Featurise the selected molecules and train one model on them.

    Works on a copy of dfs: adds an "ecfp" column computed from "smiles",
    drops rows whose fingerprint is missing, then trains with batch size 32.
    Returns the checkpoint path reported by train_step.
    """
    featurised = dfs.copy()
    featurised["ecfp"] = featurised["smiles"].map(ecfp_from_smiles)
    has_fingerprint = featurised["ecfp"].notna()
    trainable = featurised[has_fingerprint]

    return train_step(trainable, epochs, batch_size=32, step_time=stime)

In [None]:
def run_iterations(df, epochs=30, iterations=10, n_samples=500):
    """Grow the training pool by random draws and retrain after every draw.

    Each round draws n_samples rows from the not-yet-selected part of df, puts
    them in front of the rows selected so far, trains a fresh model on the
    combined pool through run_step, and prints the checkpoint path.
    """
    print("len(df):", len(df))
    dfs = None   # rows selected so far
    dfr = df     # rows still available for selection
    for stime in range(iterations):
        newly_drawn, dfr = random_select(dfr, n_samples)
        if dfs is None:
            dfs = newly_drawn
        else:
            # The newest draw goes first, followed by the previous pool.
            dfs = pd.concat([newly_drawn, dfs], axis=0).reset_index(drop=True)
        print("number of mol pool:", len(dfs))
        best_model_path = run_step(dfs, epochs, stime)
        print(best_model_path)

In [None]:
# Lookup table from a strategy name to the function stored under it
# (presumably the available query strategies — confirm against its users).
# NOTE(review): dropout_score is not defined in any cell visible above, so this
# cell raises NameError on a fresh kernel unless it is defined elsewhere — confirm.
querry_fcn = {
    'random_sample': random_select,
    'dropout_score': dropout_score,
}

In [None]:
# Run the sample-and-train loop on the loaded table with the default settings
# (epochs=30, iterations=10, n_samples=500).
run_iterations(df)

len(df): 31971
number of mol pool: 500
Epoch: 0, Train loss: 73.3, Test mae: 8.23, Test rmse: 8.29, Test r2: -64.7
Epoch: 1, Train loss: 55.3, Test mae: 4.46, Test rmse: 4.59, Test r2: -19.1
Epoch: 2, Train loss: 15.6, Test mae: 1.95, Test rmse: 2.43, Test r2: -4.65
Epoch: 3, Train loss: 7.34, Test mae: 3.6, Test rmse: 3.76, Test r2: -12.5
Epoch: 4, Train loss: 10.1, Test mae: 2.93, Test rmse: 3.13, Test r2: -8.37
Epoch: 5, Train loss: 4.3, Test mae: 1.2, Test rmse: 1.54, Test r2: -1.26
Epoch: 6, Train loss: 6.36, Test mae: 1.63, Test rmse: 1.93, Test r2: -2.57
Epoch: 7, Train loss: 4.27, Test mae: 2.86, Test rmse: 3.06, Test r2: -7.96
Epoch: 8, Train loss: 3.96, Test mae: 1.41, Test rmse: 1.73, Test r2: -1.86
Epoch: 9, Train loss: 2.93, Test mae: 1.14, Test rmse: 1.45, Test r2: -1.02
Epoch: 10, Train loss: 2.21, Test mae: 1.97, Test rmse: 2.24, Test r2: -3.79
Epoch: 11, Train loss: 2.3, Test mae: 2.08, Test rmse: 2.34, Test r2: -4.25
Epoch: 12, Train loss: 1.57, Test mae: 1.19, Test r