In [None]:
%reload_ext autoreload
%autoreload 2
# Master switch for GPU usage: set to False to keep every CUDA device hidden
# from this process.
use_gpu = True
if not use_gpu:
    import os
    os.environ.update(CUDA_VISIBLE_DEVICES="-1")
import seaborn as sns

In [None]:
import sys

In [None]:
import desmiles
from desmiles.data import Vocab, FpSmilesList, DesmilesLoader, DataBunch
from desmiles.learner import desmiles_model_learner
from desmiles.models import Desmiles, RecurrentDESMILES, get_fp_to_embedding_model, get_embedded_fp_to_smiles_model
from desmiles.utils import load_old_pretrained_desmiles, load_pretrained_desmiles

In [None]:
from pathlib import Path
import numpy as np
import torch
import pandas as pd
# Pick the torch device string: "cuda" when PyTorch reports a usable GPU, "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Display the selected device string ("cuda" or "cpu") as the cell output.
device

In [None]:
from desmiles.config import DATA_DIR

In [None]:
# Location of the pretrained weights, relative to the project's DATA_DIR.
model_fn = Path(DATA_DIR) / 'pretrained/model_2000_400_2000_5.h5'
# Keyword arguments forwarded to the loader. The first four values mirror the numbers in the
# checkpoint file name (2000_400_2000_5); 'clip', 'alpha' and 'beta' are passed along with them.
architecture = {'fp_emb_sz': 2000, 'emb_sz': 400, 'nh': 2000, 'nl': 5, 'clip':0.3, 'alpha':2., 'beta':1.}
# load fastai learner
# return_learner=True asks the loader for a learner object (not just the bare model).
# NOTE(review): `learner` is re-bound by a later cell before this one is used -- confirm this
# cell is only meant as a loading demonstration.
learner = load_old_pretrained_desmiles(model_fn, return_learner=True, **architecture)

In [None]:
from desmiles.learner import OriginalFastaiOneCycleScheduler, Learner

# Toy regression problem used only to visualise the learning-rate schedule:
# learn y = x**2 + Gaussian noise on n evenly spaced points in [-1, 1].
n = 1000
sigma = 0.1
x = np.linspace(-1, 1, n)
y = x**2 + (sigma * np.random.randn(n))

# Single-precision column tensors (one sample per row).
x_t = torch.tensor(x, dtype=torch.float32)[:, None]
y_t = torch.tensor(y, dtype=torch.float32)[:, None]

# Wrap the tensors in datasets/loaders; note that training and validation
# are built from the very same tensors.
trn_ds = torch.utils.data.TensorDataset(x_t, y_t)
val_ds = torch.utils.data.TensorDataset(x_t, y_t)
trn_loader = torch.utils.data.DataLoader(trn_ds, batch_size=10, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_ds, batch_size=10, shuffle=False)
db = DataBunch(trn_loader, val_loader)

# Two-layer perceptron trained with mean-squared error.
# NOTE(review): the trailing ReLU restricts predictions to >= 0 although the noisy
# targets can be negative near x = 0 -- confirm this is intended for the demo.
model = torch.nn.Sequential(
    torch.nn.Linear(1, 100),
    torch.nn.ReLU(),
    torch.nn.Linear(100, 1),
    torch.nn.ReLU(),
)
learner = Learner(db, model, loss_func=torch.nn.functional.mse_loss)

# Train for 5 epochs with the old fastai one-cycle policy (peak lr 0.002),
# then plot the learning rates that were recorded.
div_factor = 10
one_cycle_linear_cb = OriginalFastaiOneCycleScheduler(learner, 0.002, div_factor=div_factor)
learner.fit(5, callbacks=[one_cycle_linear_cb])
learner.recorder.plot_lr()

In [None]:
# The current fastai library uses the following for their one cycle training policy
# A new learner is created, but it wraps the same `model` object that the previous cell
# already trained, so only the learning-rate schedules (not the losses) are comparable.
learner = Learner(db, model, loss_func=torch.nn.functional.mse_loss)
# Same number of epochs (5) and the same learning-rate argument (0.002) as the previous cell.
learner.fit_one_cycle(5, 0.002)
# Plot the recorded learning rates for comparison with the plot from the previous cell.
learner.recorder.plot_lr()

In [None]:
import scipy.sparse
import os
# Folder (below DATA_DIR) that holds the data files used by this notebook.
MYDATA = os.path.join(DATA_DIR, 'notebooks')

# Training split 0: encoded SMILES (.npy) and the matching fingerprints (sparse .npz).
trn_smiles_fn = os.path.join(MYDATA, 'training.enc8000.split_0.npy')
trn_smiles = np.load(trn_smiles_fn)
trn_fps_fn = os.path.join(MYDATA, 'training_fp.split_0.npz')
trn_fps = scipy.sparse.load_npz(trn_fps_fn)

# Validation set in the same two formats.
val_smiles_fn = os.path.join(MYDATA, 'validation.enc8000.npy')
val_smiles = np.load(val_smiles_fn)
val_fps_fn = os.path.join(MYDATA, 'validation_fp.npz')
val_fps = scipy.sparse.load_npz(val_fps_fn)

# Read the token table: one token per line, surrounding whitespace stripped.
# The list position of each entry is used as its index (itos = index-to-string).
itos_fn = os.path.join(DATA_DIR, 'pretrained', 'id.dec8000')
# Use a context manager so the file handle is closed as soon as the list is built
# (the bare open() used previously left the handle open until garbage collection).
with open(itos_fn, encoding='utf-8') as itos_file:
    itos = [line.strip() for line in itos_file]
vocab = Vocab(itos)

In [None]:
# Let's train DESMILES on 1% of 1/4 of the data
# (split_0 is presumably one of four training splits -- TODO confirm).

# Training subset: shuffle all row indices, then keep the first 1%.
num_trn_smiles = len(trn_smiles)
num_to_keep = int(num_trn_smiles * 0.01)
trn_inds = np.random.permutation(np.arange(num_trn_smiles))[:num_to_keep]

# Validation subset: same recipe; `num_to_keep` ends up holding the validation count.
num_val_smiles = len(val_smiles)
num_to_keep = int(num_val_smiles * 0.01)
val_inds = np.random.permutation(np.arange(num_val_smiles))[:num_to_keep]

In [None]:
# create data bunch
# Batch size handed to both loaders below.
bs=200
# Pair the sub-sampled encoded SMILES with their fingerprints (same row indices for both),
# together with the vocabulary.
trn_ds = FpSmilesList(trn_smiles[trn_inds], trn_fps[trn_inds], vocab)
val_ds = FpSmilesList(val_smiles[val_inds], val_fps[val_inds], vocab)
# One loader per dataset, sharing the batch size and vocabulary.
trn_dl = DesmilesLoader(trn_ds, bs=bs, vocab=vocab)
val_dl = DesmilesLoader(val_ds, bs=bs, vocab=vocab)
# Bundle the training and validation loaders for the learner.
db = DataBunch(trn_dl, val_dl)

In [None]:
from desmiles.utils import accuracy4
# Much smaller network than the pretrained one (200/200/200, a single layer), so that
# the demonstration on the sub-sampled data stays cheap.
architecture = {'fp_emb_sz': 200, 'emb_sz': 200, 'nh': 200, 'nl': 1}
# Kept separate from the architecture here; both dicts are unpacked into the learner factory.
regularization = {'clip':0.3, 'alpha':2., 'beta':1.}

# Training parameters
max_lr = 0.001
div_factor = 10.

# 1) Create learner object
learner = desmiles_model_learner(db, **architecture, **regularization)

# Report accuracy4 (imported from desmiles.utils) as the metric during training.
learner.metrics = [accuracy4]
# 2) Specify training schedule
one_cycle_linear_cb = OriginalFastaiOneCycleScheduler(learner, max_lr, div_factor=div_factor)
# 3) Train 
# 50 epochs with the one-cycle callback defined above.
learner.fit(50, callbacks=[one_cycle_linear_cb])
# 4) Save model
# (disabled -- uncomment to persist the trained weights under the name 'model_1')
#learner.save('model_1')

In [None]:
# Plot the learning rates recorded by the learner during the training run.
learner.recorder.plot_lr()

In [None]:
from desmiles.utils import decoder, image_of_mols
from functools import partial

# Bind the vocabulary list once so that `decoder` only needs the token indices.
decoder = partial(decoder, itos=itos)


def smiles_idx_to_string(smiles_idx, decoder=decoder):
    """Turn an index-encoded SMILES array/tensor into a SMILES string.

    Only strictly positive entries are kept before decoding (presumably index 0
    is padding -- TODO confirm against the vocabulary). The surviving indices are
    converted to a plain Python list and handed to ``decoder``.

    Parameters
    ----------
    smiles_idx : array or tensor of token indices supporting boolean masking and ``.tolist()``.
    decoder : callable taking a list of indices; defaults to the vocabulary-bound decoder above.

    Returns
    -------
    Whatever ``decoder`` returns for the filtered index list.
    """
    kept_tokens = smiles_idx[smiles_idx > 0]
    return decoder(kept_tokens.tolist())

In [None]:
from desmiles.decoding.astar import AstarTreeParallelHybrid as AstarTree

# (disabled -- uncomment to restore weights saved earlier under the name 'model_1')
#learner.load('model_1')

# Lets see if we at least fit our training set a bit
# Take the first batch from the training loader.
(smiles_idx, fps, lengths), y = next(iter(trn_dl))
# Use the last example of the batch: last column of the index array and last fingerprint row
# (assumes smiles_idx is laid out as (sequence, batch) -- TODO confirm).
test_smiles_idx = smiles_idx[:,-1]
test_fp = fps[-1]
# Decode the indices back to a SMILES string and draw the molecule.
test_smiles = smiles_idx_to_string(test_smiles_idx)
image_of_mols([test_smiles])

In [None]:
# Put the trained network in evaluation mode and wrap it for decoding.
model = learner.model
model.eval()
rdesmiles = RecurrentDESMILES(model)

# Move the fingerprint to the notebook-wide `device` rather than a hard-coded 'cuda',
# so this cell also runs on CPU-only machines or when use_gpu is False.
astar = AstarTree(test_fp.unsqueeze(0).to(device), rdesmiles, num_expand=100)
# First solution produced by the A* search, with its negative log-probability.
neg_log_prob, smiles_idx =  next(astar)
smiles = smiles_idx_to_string(smiles_idx)
# Draw the decoded molecule next to the original one.
image_of_mols([smiles, test_smiles])

In [None]:
from desmiles.decoding.astar import AstarTreeParallelHybrid as AstarTree

# (disabled -- uncomment to restore weights saved earlier under the name 'model_1')
#learner.load('model_1')

# Now look at an example from the validation set instead of the training set
# Take the first batch from the validation loader.
(smiles_idx, fps, lengths), y = next(iter(val_dl))
# Use the last example of the batch: last column of the index array and last fingerprint row
# (assumes smiles_idx is laid out as (sequence, batch) -- TODO confirm).
test_smiles_idx = smiles_idx[:,-1]
test_fp = fps[-1]
# Decode the indices back to a SMILES string and draw the molecule.
test_smiles = smiles_idx_to_string(test_smiles_idx)
image_of_mols([test_smiles])

In [None]:
# Put the trained network in evaluation mode and wrap it for decoding.
model = learner.model
model.eval()
rdesmiles = RecurrentDESMILES(model)

# Move the fingerprint to the notebook-wide `device` rather than a hard-coded 'cuda',
# so this cell also runs on CPU-only machines or when use_gpu is False.
astar = AstarTree(test_fp.unsqueeze(0).to(device), rdesmiles, num_expand=100)
# First solution produced by the A* search, with its negative log-probability.
neg_log_prob, smiles_idx =  next(astar)
smiles = smiles_idx_to_string(smiles_idx)
# Draw the decoded molecule next to the original one.
image_of_mols([smiles, test_smiles])

In [None]:
# Switch back to the large pretrained network: same checkpoint path and keyword
# arguments as the loading cell near the top of the notebook.
model_fn = Path(DATA_DIR) / 'pretrained/model_2000_400_2000_5.h5'
architecture = {'fp_emb_sz': 2000, 'emb_sz': 400, 'nh': 2000, 'nl': 5, 'clip':0.3, 'alpha':2., 'beta':1.}
learner = load_old_pretrained_desmiles(model_fn, return_learner=True, **architecture)
# Pull the network out of the learner and put it in evaluation mode.
model = learner.model
model.eval()
# make a RecurrentDESMILES model
# From here on `model` refers to the RecurrentDESMILES wrapper, not the raw network.
model = RecurrentDESMILES(model)

In [None]:
from desmiles.utils import smiles_to_fingerprint
# Read the validation SMILES (one per line, whitespace stripped). The context manager
# closes the file handle right away instead of leaving it open until garbage collection.
validation_smiles_fn = os.path.join(DATA_DIR, 'pretrained', 'validation_smiles_10k.smi')
with open(validation_smiles_fn) as smi_file:
    validation_smiles = [s.strip() for s in smi_file]
# Random order over the validation molecules; change `i` to look at a different one.
inds = np.random.permutation(np.arange(len(validation_smiles)))
i = 0
smiles_to_invert = validation_smiles[inds[i]]
# Fingerprint of the chosen molecule as a tensor, then draw the molecule itself.
fp = smiles_to_fingerprint(smiles_to_invert, as_tensor=True)
image_of_mols([smiles_to_invert])

In [None]:
# Run the A* decoder on the fingerprint (with a leading dimension of size 1 added)
# using the pretrained RecurrentDESMILES wrapper.
astar = AstarTree(fp.unsqueeze(0), model, num_expand=100)
# Take the first solution from the search; `nlp` is presumably its negative
# log-probability (a later cell computes np.exp(-nlp)) -- TODO confirm.
nlp, smiles_idx = next(astar)
smiles = smiles_idx_to_string(smiles_idx)
# Draw the decoded molecule next to the molecule whose fingerprint was inverted.
image_of_mols([smiles, smiles_to_invert])

In [None]:
# Model size: number of elements of every parameter tensor, summed over the whole model.
param_counts = [np.prod(p.shape) for p in model.parameters()]
np.sum(param_counts)

In [None]:
%%time
from collections import defaultdict

# Use the fast variant of A* (larger num_expand, capped number of branches) and
# pull the first 1000 solutions from it.
astar = AstarTree(fp.unsqueeze(0), model, num_expand=1000, max_branches=5000)

# scores: probability mass accumulated per decoded SMILES string
# all_leaf_nodes: every decoded string, in the order the search returned them
scores = defaultdict(float)
all_leaf_nodes = []
for _ in range(1000):
    nlp, smiles_idx = next(astar)
    smiles = smiles_idx_to_string(smiles_idx)
    # exp(-nlp) is evaluated once and reused for both the printout and the running total
    prob = np.exp(-nlp)
    print(smiles,  prob)
    scores[smiles] += prob
    all_leaf_nodes.append(smiles)

In [None]:
# (smiles, accumulated score) pairs, highest score first; ties keep their insertion order.
sorted(scores.items(), key=lambda item: item[1], reverse=True)

Don't forget to generate the DRD2 dataset if you haven't already done so.
To do so, run `DESMILES/tests/download_drd2_dataset.sh <PATH/TO>/DESMILES/data/notebooks`.

In [None]:
from drd2 import *

In [None]:
# Batch size for the DRD2 fine-tuning data.
bs=200
# load_training_data and create_databunch are not imported by name in this notebook;
# presumably they come from the `drd2` star import -- TODO confirm.
# NOTE(review): `original_smile` is not used by the cells that follow.
original_smile, train_fp, train_enc = load_training_data(raise_prob=True)
# Build the data bunch from the fingerprints and encodings, using the vocabulary file at itos_fn.
db = create_databunch(train_fp, train_enc, itos_fn, bs)

In [None]:
# Start again from the pretrained checkpoint.
model_fn = Path(os.path.join(DATA_DIR, 'pretrained', 'model_2000_400_2000_5.h5'))
# NOTE(review): unlike the earlier loading cells, no architecture keyword arguments are passed
# here, so the loader's defaults apply -- confirm they match this checkpoint.
learner = load_old_pretrained_desmiles(model_fn, return_learner=True)
learner.metrics = [accuracy4]
# Swap in the DRD2 data bunch so that training runs on it.
learner.data = db

# Fine-tuning schedule: 5 epochs of the old one-cycle policy with peak lr 0.001.
num_epochs = 5
max_lr = 0.001
div_factor = 7

one_cycle_linear_cb = OriginalFastaiOneCycleScheduler(learner, max_lr, div_factor=div_factor)
learner.fit(num_epochs, callbacks=[one_cycle_linear_cb])

In [None]:
# First batch of the DRD2 validation loader; only the encoded SMILES and the fingerprints are kept.
# Note: this re-binds `val_fps`, which an earlier cell used for the sparse validation fingerprints.
(val_smiles_idx, val_fps, _), _ = next(iter(db.valid_dl))

In [None]:
# Put the fine-tuned network in evaluation mode and wrap it for decoding;
# `model` refers to the RecurrentDESMILES wrapper afterwards.
model = learner.model
model.eval()
model = RecurrentDESMILES(model)
# Decode from the first fingerprint of the validation batch (leading dimension of size 1 added).
astar = AstarTree(val_fps[0].unsqueeze(0), model, num_expand=100)
# Collect and print the first 100 solutions returned by the search.
# NOTE(review): next() raises StopIteration if the search yields fewer than 100 -- confirm it cannot.
all_leaf_nodes = []
for _ in range(100):
    nlp, smiles_idx = next(astar)
    smiles = smiles_idx_to_string(smiles_idx)
    print(smiles)
    all_leaf_nodes.append(smiles)