In [123]:
import torch
import pickle
import numpy as np
import time
import os
from shutil import copyfile

from model_reinvent import RNN
#from data_structs import Vocabulary, Experience
#from scoring_functions import get_scoring_function
from utils import Variable, seq_to_smiles, unique
from vizard_logger import VizardLog
import rdkit
from rdkit.Chem.Crippen import MolLogP, MolMR
from rdkit.Chem.Lipinski import  NumHAcceptors, NumHDonors
from rdkit.Chem.rdMolDescriptors import CalcNumRings, CalcNumRotatableBonds, CalcExactMolWt
from providers import robust_standardizer
import requests
from rdkit import Chem

from legogram.legogram.base import LegoGram
from legogram.legogram.rnn_sampler import LegoGramRNNSampler


In [131]:
def scoring_func(smiles, loyality):
    """Score SMILES strings against a fixed checklist of property filters.

    Predicted endpoints are fetched from the Syntelly backend and combined
    with RDKit descriptors computed locally. Each molecule earns one point
    for each of twelve checks it passes and five extra points when its
    predicted boiling point is below 50, for a maximum of 17 points. The
    returned score is the point total divided by 17.

    Parameters
    ----------
    smiles : sequence of str
        SMILES strings to score. An empty sequence returns ``[]`` without
        contacting the backend.
    loyality : any
        Accepted for interface compatibility; not used by the scoring.

    Returns
    -------
    list of (str, float)
        ``(smiles, score)`` pairs in the order returned by the backend,
        using the SMILES strings echoed back in the backend response.

    Raises
    ------
    requests.HTTPError
        If either backend call answers with an error status.
    requests.Timeout
        If the backend does not answer within the timeout.
    KeyError
        If a prediction lacks one of the endpoints used by the checklist or
        refers to an unknown endpoint id.
    """
    # Nothing to score: skip both network round-trips.
    if len(smiles) == 0:
        return []

    # Generous limit so slow predictions still complete, while a dead
    # connection can no longer hang the notebook forever.
    timeout_s = 600
    max_points = 17  # twelve one-point checks + one five-point bonus

    endpoints_resp = requests.get("https://backend.syntelly.com/endpoints",
                                  timeout=timeout_s)
    endpoints_resp.raise_for_status()
    endpoints_id2name = {e['id']: e['view'] for e in endpoints_resp.json()}

    predict_resp = requests.post("https://backend.syntelly.com/tempSmilesArrToPredict",
                                 json={'smiles': smiles}, timeout=timeout_s)
    predict_resp.raise_for_status()
    vals_another = predict_resp.json()

    scored = []
    for entry in vals_another:
        smi = entry['smiles']
        # Translate endpoint ids to their display names: {name: value}.
        props = {endpoints_id2name[e['endpoint_id']]: e['value']
                 for e in entry['data']}

        mol = Chem.MolFromSmiles(robust_standardizer(smi))
        if mol is None:
            # RDKit could not parse the standardized SMILES; give it the
            # lowest score instead of failing the whole batch.
            scored.append((smi, 0.0))
            continue

        mol_weight = CalcExactMolWt(mol)
        logp = MolLogP(mol)
        atom_count = mol.GetNumAtoms()
        molar_refractivity = MolMR(mol)
        boiling_point = props['Boiling point']

        val = 0
        if 160 <= mol_weight <= 480:
            val += 1
        if -0.4 <= logp <= 5.6:
            val += 1
        if 20 <= atom_count <= 70:
            val += 1
        if 40 <= molar_refractivity <= 130:
            val += 1
        if props['Bioconcentration factor'] < 3:
            val += 1
        if props['Developmental toxicity'] == 'Negative':
            val += 1
        # The temperature thresholds are written as (kelvin - 273.15);
        # presumably the backend reports degrees Celsius -- TODO confirm.
        if props['Flash point'] > (350 - 273.15):
            val += 1
        if boiling_point > (300 - 273.15):
            val += 1
        if CalcNumRings(mol) > 0:
            val += 1
        if CalcNumRotatableBonds(mol) < 5:
            val += 1
        if NumHAcceptors(mol) <= 10:
            val += 1
        if NumHDonors(mol) <= 5:
            val += 1
        # Heavily weighted target property: boiling point below 50.
        if boiling_point < 50:
            val += 5

        scored.append((smi, val / max_points))

    return scored

In [125]:
# Load the serialized dataset object; later cells use dataset.model.encode / dataset.model.decode.
# NOTE(review): torch.load unpickles the file, so 'lg.bin' must come from a trusted source.
dataset = torch.load('lg.bin')
# Build the agent network with RNN's default constructor arguments.
Agent = RNN()

In [126]:
# Restore the weights stored in the checkpoint file into Agent.rnn.
Agent.rnn.load_state_dict(torch.load("Agent_bp_less_than_50.ckpt"))

<All keys matched successfully>

In [127]:
# Sample sequences from the agent and decode them to SMILES strings.
# `mols` receives the SMILES of every sequence that decodes to a molecule
# RDKit can parse, and None for every sequence whose decoding raised.
N_SAMPLE_BATCHES = 30     # number of Agent.sample calls
SAMPLE_BATCH_SIZE = 100   # argument passed to each Agent.sample call
END_TOKEN = 2972          # presumably the end-of-sequence token id -- TODO confirm

mols = []
for _ in range(N_SAMPLE_BATCHES):
    seqs, _, _ = Agent.sample(SAMPLE_BATCH_SIZE)
    for s in seqs.cpu().numpy():
        try:
            s = list(s)
            # Keep only the tokens before the first END_TOKEN; a sequence
            # without one raises ValueError and is recorded as None below.
            s = s[:s.index(END_TOKEN)]
            mol = dataset.model.decode(s)
            if Chem.MolFromSmiles(mol):
                mols.append(mol)
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C
            # (KeyboardInterrupt) still stops a long sampling run.
            mols.append(None)

In [128]:
# Map every decoded SMILES string to its position in `mols`
# (for duplicates the last position wins).
smiles2id = {smi: pos for pos, smi in enumerate(mols) if smi is not None}
# Drop the None placeholders left by sequences that failed to decode.
correct_smiles = [smi for smi in mols if smi is not None]

In [129]:
# Share of sampled sequences that decoded to an RDKit-parseable SMILES string.
# 3000 presumably equals the number of sampled sequences (30 calls to Agent.sample(100));
# keep it in sync with the sampling cell.
fraction_valid_smiles = len(correct_smiles)/3000

In [132]:
# Display the valid-SMILES fraction.
fraction_valid_smiles

0.9903333333333333

In [133]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of length ``n``.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in offsets)

In [134]:
from tqdm import tqdm

In [135]:
# Score the valid SMILES in fixed-size chunks, one backend request per chunk.
# Results are appended as they arrive, so chunks that already finished are
# kept in `vals` if a later request fails.
CHUNK_SIZE = 50
# chunks() is a generator, so tqdm needs the chunk count (ceiling division)
# to show a real progress bar with an ETA.
n_chunks = (len(correct_smiles) + CHUNK_SIZE - 1) // CHUNK_SIZE
vals = []
for s in tqdm(chunks(correct_smiles, CHUNK_SIZE), total=n_chunks):
    res = scoring_func(s, 10)
    vals.append(res)


0it [00:00, ?it/s][A
1it [00:11, 11.25s/it][A
2it [00:21, 11.10s/it][A
3it [00:32, 11.03s/it][A
4it [00:43, 10.91s/it][A
5it [00:54, 10.86s/it][A
6it [01:04, 10.70s/it][A
7it [01:14, 10.56s/it][A
8it [01:25, 10.52s/it][A
9it [01:35, 10.51s/it][A
10it [02:14, 19.14s/it][A
11it [02:51, 24.24s/it][A
12it [03:27, 27.74s/it][A
13it [04:03, 30.40s/it][A
14it [04:39, 32.04s/it][A
15it [05:17, 33.75s/it][A
16it [05:52, 34.34s/it][A
17it [06:31, 35.50s/it][A
18it [07:06, 35.31s/it][A
19it [07:40, 35.04s/it][A
20it [08:14, 34.67s/it][A
21it [08:50, 35.04s/it][A
22it [09:25, 35.21s/it][A
23it [09:58, 34.33s/it][A
24it [10:33, 34.81s/it][A
25it [11:11, 35.71s/it][A
26it [11:46, 35.51s/it][A
27it [12:21, 35.31s/it][A
28it [12:54, 34.52s/it][A
29it [13:31, 35.19s/it][A
30it [14:04, 34.57s/it][A
31it [14:40, 35.08s/it][A
32it [15:14, 34.74s/it][A
33it [15:36, 31.06s/it][A
34it [15:48, 25.14s/it][A
35it [15:59, 20.96s/it][A
36it [16:09, 17.80s/it][A
37it [16:21, 1

In [136]:
# Flatten the per-chunk results into a single list of (SMILES, score) pairs.
mols_score = [(smi, score) for batch in vals for smi, score in batch]

In [137]:
# Rank by score, best first, then keep at most the 1000 top-scoring pairs.
mols_score = sorted(mols_score, key=lambda pair: pair[1], reverse=True)
mols_score = mols_score[:1000]

In [138]:
# NOTE(review): this import would normally live in the notebook's top import cell.
import pandas as pd
# Tabulate the ranked pairs. No column names are given, so the columns get the
# default integer labels 0 (SMILES) and 1 (score), which later cells rely on.
df = pd.DataFrame(mols_score)

In [139]:
from rdkit.Chem.Draw import MolsToGridImage


In [140]:
# Draw the 15 best-scoring molecules as a grid with five molecules per row.
img = MolsToGridImage(
    [Chem.MolFromSmiles(smi) for smi, _score in mols_score[:15]],
    molsPerRow=5,
)

In [141]:
# Write the grid image to disk in the current working directory.
img.save("reinvent.png")

In [143]:
# Export the ranked molecules with descriptive column headers and no index column.
df.rename(
    columns={0: "SMILES BP less 50", 1: "Score"},
).to_csv(
    "SMILES_BP_less_50.csv",
    index=False,
)

In [42]:
# Sanity check: encode the SMILES of toluene with the loaded model and inspect the result.
# NOTE(review): this cell's execution count predates the cells above; re-run it in order.
dataset.model.encode("Cc1ccccc1")

[7, 5, 9, 0, 0, 0, 0, 0]

In [120]:
# Rescale the score column (label 1) by 12/17, overwriting it in place.
# NOTE(review): not idempotent -- every re-run shrinks the scores again. The execution
# count also shows this cell ran before the cells above, so confirm which `df` it applied to
# and why the 12/17 factor is needed.
df[1] = df[1]*12/17

In [121]:
# Display the score table (pandas truncates the middle rows of long frames).
df

Unnamed: 0,0,1
0,O=C(Nc1ccc(F)cc1O)C(=O)Nc1cnccc1Cl,1.000000
1,Cc1ccccc1NC(=O)NNc1cc(Cl)cc(N)c1O,1.000000
2,Cc1ccccc1NC(=O)CNc1cc(Cl)cc(Br)c1,1.000000
3,O=C(NCc1ccc(O)c(Cl)c1)c1cc(Cl)ccc1O,1.000000
4,CCCc1ccccc1NC(=O)Nc1ccc(Cl)cn1,1.000000
...,...,...
995,O=C(NCc1cc(Cl)cc(Cl)c1)NNC(=O)Nc1ccc(F)cn1,0.941176
996,O=C(NCc1cncc(Cl)c1)c1cc(Cl)c(Cl)cc1Cl,0.941176
997,Cc1ccc(Cl)cc1NC(=O)NNC(=O)NNc1ccc(Cl)c(Cl)c1,0.941176
998,Nc1cc(NC(=O)Nc2cc(Cl)c(F)c(Cl)c2)ccc1Cl,0.941176
