# Generate Fingerprint

In [None]:
!pip install datamol
!pip install rdkit-pypi
!pip install pandas

Collecting datamol
  Downloading datamol-0.11.1-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting loguru (from datamol)
  Downloading loguru-0.7.0-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting selfies (from datamol)
  Downloading selfies-2.1.1-py3-none-any.whl (35 kB)
Installing collected packages: selfies, loguru, datamol
Successfully installed datamol-0.11.1 loguru-0.7.0 selfies-2.1.1
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np

In [None]:
def ecfp_from_smiles(smiles,
                     R = 2,
                     L = 2**11,
                     use_features = True,
                     use_chirality = False):
    """
    Compute a Morgan (circular) fingerprint bit vector for one compound.

    Inputs:

    - smiles ... SMILES string of input compound
    - R ... maximum radius of circular substructures
    - L ... fingerprint-length
    - use_features ... if false then use standard DAYLIGHT atom features, if true then use pharmacophoric atom features
    - use_chirality ... if true then append tetrahedral chirality flags to atom features

    Outputs:
    - np.array(feature_list) ... fingerprint with length L and maximum radius R
    - np.nan ... returned instead when `smiles` is not a string (e.g. None or a
      float NaN from a pandas column) or cannot be parsed, so callers can drop
      all failures uniformly with `.isna()`

    NOTE(review): with the default use_features=True the RDKit documentation
    describes the result as feature-based (FCFP-like) rather than classic ECFP;
    confirm that this default is intended.
    """

    # Missing values are not valid input for the SMILES parser; treat them the
    # same way as an unparsable string instead of letting the parser raise.
    if not isinstance(smiles, str):
        return np.nan

    molecule = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None.
    if molecule is None:
        return np.nan
    feature_list = AllChem.GetMorganFingerprintAsBitVect(molecule,
                                                         radius = R,
                                                         nBits = L,
                                                         useFeatures = use_features,
                                                         useChirality = use_chirality)
    return np.array(feature_list)

In [None]:
# Mount Google Drive at /content/drive (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')
import os
# Make the project folder the working directory: later cells use paths relative
# to it ("aa2ar_processed.csv", "models/...").
os.chdir("/content/drive/MyDrive/Project/ALVS")

Mounted at /content/drive


In [None]:
# Load the dataset; the file is tab-separated despite its .csv extension.
df = pd.read_csv("aa2ar_processed.csv", sep="\t")

In [None]:
# Preview the first rows (columns shown in the output: name, smiles, score).
df.head()

Unnamed: 0,name,smiles,score
0,CHEMBL100382,O=C(CCc1ccccc1)Nc1nc2ccc(Cl)cc2c2nc(-c3ccco3)nn12,-9.294887
1,CHEMBL106265,Cn1c(=O)c2[nH]c(C3CCCC3)nc2n(C)c1=O,-7.728042
2,CHEMBL1079801,N#Cc1ccc(NC(=O)C2CC2)nc1-c1ccco1,-7.387805
3,CHEMBL1082005,Cc1ccc(-c2nc(NC(=O)C3CC3)ccc2-c2ccncc2)o1,-8.844135
4,CHEMBL1086846,Nc1nc(-c2cccs2)c2c(n1)-c1ccccc1C2=O,-9.619289


In [None]:
# Fingerprint every compound; ecfp_from_smiles yields NaN for SMILES it cannot parse.
df["ecfp"] = df["smiles"].map(ecfp_from_smiles)

[11:10:05] SMILES Parse Error: syntax error while parsing: error
[11:10:05] SMILES Parse Error: Failed parsing SMILES 'error' for input: 'error'


In [None]:
# Keep only the rows that have a fingerprint (ecfp is NaN where parsing failed).
# .copy() makes dff an independent frame, so assigning columns to it later does
# not raise pandas' SettingWithCopyWarning.
dff = df[df["ecfp"].notna()].copy()

In [None]:
# Check that the fingerprint column is present on the filtered frame.
dff.head()

Unnamed: 0,name,smiles,score,ecfp
0,CHEMBL100382,O=C(CCc1ccccc1)Nc1nc2ccc(Cl)cc2c2nc(-c3ccco3)nn12,-9.294887,"[1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,CHEMBL106265,Cn1c(=O)c2[nH]c(C3CCCC3)nc2n(C)c1=O,-7.728042,"[1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,CHEMBL1079801,N#Cc1ccc(NC(=O)C2CC2)nc1-c1ccco1,-7.387805,"[1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,CHEMBL1082005,Cc1ccc(-c2nc(NC(=O)C3CC3)ccc2-c2ccncc2)o1,-8.844135,"[1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,CHEMBL1086846,Nc1nc(-c2cccs2)c2c(n1)-c1ccccc1C2=O,-9.619289,"[1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Convert each fingerprint from a NumPy array to a plain Python list.
# .assign() returns a new frame instead of writing into a filtered slice (the
# cause of SettingWithCopyWarning), and np.asarray() keeps the cell re-runnable
# once the column already holds lists.
dff = dff.assign(ecfp=dff["ecfp"].map(lambda fp: np.asarray(fp).tolist()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff["ecfp"] = dff["ecfp"].map(lambda x: x.tolist())


# ECFP Molecule Fingerprint

# Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Linear
from torch.utils.data import DataLoader

In [None]:
class Dataset(torch.utils.data.Dataset):
    """
    Map-style dataset that pairs each fingerprint with its score.

    Inputs:

    - fps ... indexable collection of fingerprints, one per sample
    - scores ... indexable collection of scalar targets, aligned with fps

    Raises:
    - ValueError ... if fps and scores do not have the same length
    """

    def __init__(self, fps, scores):
        # Fail fast: a mismatch would otherwise either silently ignore the
        # surplus fingerprints or surface as an IndexError during training.
        if len(fps) != len(scores):
            raise ValueError(
                "fps and scores must have the same length, got {} and {}".format(
                    len(fps), len(scores)))
        self.fps = fps
        self.scores = scores
        self.dataLen = len(scores)

    def __getitem__(self, index):
        """Return (fingerprint, score) as NumPy arrays; score has shape (1,)."""
        fp = np.array(self.fps[index])
        # Shape (1,) so that a collated batch of scores is (batch, 1), matching
        # the shape of a single-output regression head.
        score = np.array(self.scores[index]).reshape(1,)
        return fp, score

    def __len__(self):
        """Number of samples."""
        return self.dataLen

In [None]:
class DNN(nn.Module):
    """
    Fully connected regression network: 2048 inputs -> 1 output.

    Four hidden layers (1024, 516, 256 and 128 units), each followed by ReLU
    and dropout with p=0.2, then a linear output layer without activation.

    NOTE(review): the 516-unit width is kept exactly as written; checkpoints
    saved from this class depend on the layer shapes and attribute names.
    """

    def __init__(self):
        super().__init__()
        # Each linear layer is immediately followed by its dropout layer, in
        # fc1..fc5 order, so parameter names and registration order stay stable.
        self.fc1 = nn.Linear(2048, 1024)
        self.dropout1 = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(1024, 516)
        self.dropout2 = nn.Dropout(p=0.2)
        self.fc3 = nn.Linear(516, 256)
        self.dropout3 = nn.Dropout(p=0.2)
        self.fc4 = nn.Linear(256, 128)
        self.dropout4 = nn.Dropout(p=0.2)
        self.fc5 = nn.Linear(128, 1)

    def forward(self, data):
        """Map a (batch, 2048) float tensor to a (batch, 1) prediction."""
        hidden_stack = (
            (self.fc1, self.dropout1),
            (self.fc2, self.dropout2),
            (self.fc3, self.dropout3),
            (self.fc4, self.dropout4),
        )
        x = data
        for linear, dropout in hidden_stack:
            x = dropout(F.relu(linear(x)))
        return self.fc5(x)

In [None]:
def train_step(loader, model, optimizer, device):
    """
    Run one training epoch and return the mean of the per-batch MSE losses.

    Inputs:

    - loader ... iterable of (fps, scores) tensor batches
    - model ... network to optimise; it is switched to training mode
    - optimizer ... optimizer bound to the model's parameters
    - device ... device the batches are moved to before the forward pass

    Outputs:
    - float ... average of the batch losses (each batch weighs the same,
      regardless of its size)

    Raises:
    - ValueError ... if the loader yields no batches
    """
    model.train()

    batch_losses = []
    for fps, scores in loader:
        fps = fps.float().to(device)
        scores = scores.float().to(device)

        optimizer.zero_grad()
        output = model(fps)
        loss = F.mse_loss(output, scores)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # An empty loader used to end in an opaque ZeroDivisionError.
    if not batch_losses:
        raise ValueError("train_step received an empty loader; there is nothing to train on")
    return sum(batch_losses) / len(batch_losses)

In [None]:
def test_step(loader, model,  device):
    """
    Evaluate the model on every sample of the loader.

    Inputs:

    - loader ... iterable of (fps, scores) tensor batches
    - model ... network to evaluate; it is switched to evaluation mode
    - device ... device the batches are moved to before the forward pass

    Outputs:
    - (MAE, MSE, R2) ... mean absolute error, mean squared error and R^2 score
      computed over all samples seen
    """
    model.eval()

    trues, preds = [], []
    with torch.no_grad():
        for fps, scores in loader:
            fps = fps.float().to(device)
            scores = scores.float().to(device)

            output = model(fps)
            # Flatten the whole batch. Indexing [0][0] kept only the first
            # sample of each batch, which is only correct for batch_size=1.
            preds.extend(output.cpu().numpy().reshape(-1))
            trues.extend(scores.cpu().numpy().reshape(-1))
    MAE = mean_absolute_error(trues, preds)
    MSE = mean_squared_error(trues, preds)
    R2 = r2_score(trues, preds)
    return MAE, MSE, R2

In [None]:
def train(train_loader, test_loader, epochs):
    """
    Train a fresh DNN and checkpoint it whenever the test MAE improves.

    Inputs:

    - train_loader ... loader passed to train_step each epoch
    - test_loader ... loader passed to test_step each epoch
    - epochs ... number of passes over train_loader

    Side effects:
    - writes "models/weight_ap_<epoch>.pth" for every epoch whose test MAE is
      the lowest seen so far (ties included); the folder is created if missing
    - prints one metrics line per epoch and the best MAE at the end

    Outputs:
    - None
    """
    # Imported locally so this cell does not depend on which earlier cell imported os.
    import os

    model = DNN()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # torch.save does not create missing directories, so make sure the
    # checkpoint folder exists before the first save.
    os.makedirs("models", exist_ok=True)

    hist = {"train-loss":[], "test-mae":[], "test-mse":[], "test-r2":[]}
    for epoch in range(epochs):
        weight_path = "models/weight_ap_{}.pth".format(epoch)
        train_loss = train_step(train_loader, model, optimizer, device)
        test_mae, test_mse, test_r2 = test_step(test_loader, model, device)
        hist["train-loss"].append(train_loss)
        hist["test-mae"].append(test_mae)
        hist["test-mse"].append(test_mse)
        hist["test-r2"].append(test_r2)

        # The current MAE is already in the history, so this is true exactly
        # when the epoch matches or beats every earlier one.
        if test_mae <= min(hist["test-mae"]):
            torch.save(model.state_dict(), weight_path)

        print(f'Epoch: {epoch}, Train loss: {train_loss:.3}, Test mae: {test_mae:.3}, Test mse: {test_mse:.3}, Test r2: {test_r2:.3}')
    print("---------------------------------\nmin mae: {}\n---------------------------------\n".format(min(hist["test-mae"])))
    return

In [None]:
# Hold out 80% of the compounds for validation and train on the remaining 20%.
# NOTE(review): test_size=0.8 is kept as written; confirm the small training
# fraction is intentional.
# random_state fixes the shuffle so the split, and the metrics reported from it,
# is the same on every run.
train_fps, valid_fps, train_scores, valid_scores = train_test_split(dff["ecfp"], dff["score"], test_size=0.8, random_state=42)

In [None]:
# Wrap both splits for PyTorch. Converting the pandas Series to plain lists gives
# Dataset positional (0..n-1) indexing, independent of the pandas index labels.
train_dataset = Dataset(fps=list(train_fps), scores = list(train_scores))
valid_dataset = Dataset(fps=list(valid_fps), scores = list(valid_scores))

In [None]:
# Only the training batches are shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
# Validation is iterated in dataset order so that results collected from this
# loader line up with positions in valid_dataset (shuffling made row i of the
# collected predictions an arbitrary compound). batch_size stays 1 because the
# prediction code reads a single value per batch.
valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=False)

In [None]:
! rm models/*.pth

In [None]:
# Train for 20 epochs; train() writes a checkpoint under models/ for each epoch
# whose validation MAE is the lowest so far.
train(train_dataloader, valid_dataloader, epochs=20)

Epoch: 0, Train loss: 13.0, Test mae: 0.946, Test mse: 1.47, Test r2: -0.41
Epoch: 1, Train loss: 1.89, Test mae: 0.755, Test mse: 0.959, Test r2: 0.0785
Epoch: 2, Train loss: 1.46, Test mae: 0.698, Test mse: 0.826, Test r2: 0.206
Epoch: 3, Train loss: 1.22, Test mae: 0.669, Test mse: 0.758, Test r2: 0.271
Epoch: 4, Train loss: 1.11, Test mae: 0.664, Test mse: 0.748, Test r2: 0.282
Epoch: 5, Train loss: 1.04, Test mae: 0.643, Test mse: 0.706, Test r2: 0.322
Epoch: 6, Train loss: 0.987, Test mae: 0.639, Test mse: 0.693, Test r2: 0.334
Epoch: 7, Train loss: 0.919, Test mae: 0.627, Test mse: 0.675, Test r2: 0.352
Epoch: 8, Train loss: 0.89, Test mae: 0.651, Test mse: 0.715, Test r2: 0.312
Epoch: 9, Train loss: 0.908, Test mae: 0.621, Test mse: 0.662, Test r2: 0.363
Epoch: 10, Train loss: 0.873, Test mae: 0.635, Test mse: 0.686, Test r2: 0.341
Epoch: 11, Train loss: 0.898, Test mae: 0.633, Test mse: 0.685, Test r2: 0.341
Epoch: 12, Train loss: 0.795, Test mae: 0.703, Test mse: 0.805, Test 

# Enable Dropout

In [None]:
def enable_dropout(model):
    """
    Put every dropout layer of `model` back into training mode.

    Layers are recognised by class name (any module whose class name starts
    with "Dropout"); all other modules keep the mode they are in.
    """
    dropout_layers = [
        layer for layer in model.modules()
        if type(layer).__name__.startswith('Dropout')
    ]
    for layer in dropout_layers:
        layer.train()

In [None]:
def predict_with_uncertainty(fps, model, device, n_samples=10):
    """
    Estimate a prediction and its spread from repeated forward passes.

    The model is called n_samples times on the same input; the passes only
    differ if the model is stochastic at call time (e.g. dropout layers left
    in training mode), otherwise the variance is 0.

    Inputs:

    - fps ... input tensor; only the first row's first output is used
      (assumes a batch of one sample -- TODO confirm for other callers)
    - model ... network to call
    - device ... device the input is moved to
    - n_samples ... number of forward passes, must be >= 1

    Outputs:
    - (mean, variance) ... mean and variance of the n_samples predictions

    Raises:
    - ValueError ... if n_samples is smaller than 1
    """
    # With zero passes the mean/variance of an empty list would silently be NaN.
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1, got {}".format(n_samples))

    # The input is the same for every pass, so convert and move it only once.
    fps = fps.float().to(device)

    dropout_predictions = []
    with torch.no_grad():
        for _ in range(n_samples):
            output = model(fps)
            pred = output.cpu().numpy()[0][0]
            dropout_predictions.append(pred)
    mean = np.mean(dropout_predictions)
    variance = np.var(dropout_predictions)
    return mean, variance

In [None]:
def load_model(model_file, device="cpu"):
    """
    Build a DNN on `device` and fill it with the weights stored in `model_file`.

    Inputs:

    - model_file ... path to a state_dict saved with torch.save
    - device ... device for the model and for mapping the stored tensors

    Outputs:
    - the DNN with the loaded weights (its train/eval mode is not changed here)
    """
    network = DNN()
    network = network.to(device)
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    state_dict = torch.load(model_file, map_location=device)
    network.load_state_dict(state_dict)
    return network

In [None]:
!ls models

weight_ap_0.pth   weight_ap_1.pth  weight_ap_4.pth  weight_ap_7.pth
weight_ap_14.pth  weight_ap_2.pth  weight_ap_5.pth  weight_ap_9.pth
weight_ap_17.pth  weight_ap_3.pth  weight_ap_6.pth


In [None]:
# Checkpoint to analyse, chosen from the files saved under models/.
model_file = "models/weight_ap_17.pth"
# Use the GPU when there is one and fall back to CPU otherwise; a hard-coded
# "cuda" makes this cell fail on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_model(model_file, device=device)

In [None]:
# Put the whole model in evaluation mode first, then switch only the dropout
# layers back to training mode so that repeated forward passes differ.
model.eval()
enable_dropout(model)

In [None]:
# For every validation batch: mean and variance over repeated forward passes,
# plus the true score of its first sample.
all_preds = []
for fp, score in valid_dataloader:
    mean, var = predict_with_uncertainty(fp, model, device)
    all_preds.append([mean, var, score.cpu().numpy()[0][0]])
# Naming the columns at construction also works when no rows were collected;
# assigning three names to an empty frame afterwards raises a length mismatch.
dfp = pd.DataFrame(all_preds, columns=["mean", "var", "score"])

In [None]:
# Rank the predictions from largest to smallest variance (most uncertain first).
dfp = dfp.sort_values(by="var", ascending=False)

In [None]:
# Show the predictions with the highest variance.
dfp.head()

Unnamed: 0,mean,var,score
11052,-9.941459,2.660612,-8.025254
16261,-9.882836,2.346683,-10.39535
1295,-9.19338,2.29285,-8.035105
13998,-9.111752,2.240069,-9.024943
25367,-10.843528,2.211563,-8.342705
