In [1]:
import os
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from lightgbm import LGBMClassifier, LGBMRegressor
import optuna
import deepchem as dc
seed = 19

In [2]:
def get_inputs(sm):
    seq_len = 220
    sm = sm.split()
    if len(sm)>218:
        print('SMILES is too long ({:d})'.format(len(sm)))
        sm = sm[:109]+sm[-109:]
    ids = [vocab.stoi.get(token, unk_index) for token in sm]
    ids = [sos_index] + ids + [eos_index]
    seg = [1]*len(ids)
    padding = [pad_index]*(seq_len - len(ids))
    ids.extend(padding), seg.extend(padding)
    return ids, seg

def get_array(smiles):
    x_id, x_seg = [], []
    for sm in smiles:
        a,b = get_inputs(sm)
        x_id.append(a)
        x_seg.append(b)
    return torch.tensor(x_id), torch.tensor(x_seg)

In [3]:
import torch
from pretrain_trfm import TrfmSeq2seq
from build_vocab import WordVocab
from utils import split

pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

vocab = WordVocab.load_vocab('data/vocab.pkl')

trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 4)
trfm.load_state_dict(torch.load('.save/trfm_12_23000.pkl', map_location=torch.device('cpu') ))
trfm.eval()
print('Total parameters:', sum(p.numel() for p in trfm.parameters()))

Total parameters: 4245037


In [10]:
## Optuna

def objective_regressor(trial):
    model_name = trial.suggest_categorical('regressor', ['MLP', 'ridge', 'LGBM'])
    if model_name == 'MLP':
        n_layers = trial.suggest_int('n_layers', 1, 3)
        layers = []
        for i in range(n_layers):
            n_units = int(trial.suggest_loguniform('n_units_l{}'.format(i), 4, 500))
            layers.append(n_units)
        lr = trial.suggest_loguniform('lr', 1e-5, 1e-2)
        alpha = trial.suggest_loguniform('alpha', 1e-6, 0.1)
        reg = MLPRegressor(hidden_layer_sizes=n_layers, learning_rate_init=lr, alpha=alpha, max_iter=1000)
    elif model_name=='ridge':
        alpha = trial.suggest_loguniform('lr', 1e-4, 10)
        reg = Ridge(alpha=alpha, max_iter=1000)
    else:
        num_leaves = trial.suggest_int('num_leaves', 5, 100)
        min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 10, 100)
        max_depth = trial.suggest_int('max_depth', -1, 10)
        lr = trial.suggest_loguniform('lr', 1e-5, 1.0)
        max_bin = trial.suggest_int('max_bin', 10, 500)
        reg = LGBMRegressor(num_leaves=num_leaves, min_data_in_leaf=min_data_in_leaf, max_depth=max_depth,
                           learning_rate=lr, max_bin=max_bin)
    
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_val)
    return mean_squared_error(y_pred, y_val)**0.5

def objective_classifier(trial):
    model_name = trial.suggest_categorical('classifier', ['MLP', 'ridge', 'LGBM'])
    if model_name == 'MLP':
        n_layers = trial.suggest_int('n_layers', 1, 3)
        layers = []
        for i in range(n_layers):
            n_units = int(trial.suggest_loguniform('n_units_l{}'.format(i), 4, 500))
            layers.append(n_units)
        lr = trial.suggest_loguniform('lr', 1e-5, 1e-2)
        alpha = trial.suggest_loguniform('alpha', 1e-6, 0.1)
        clf = MLPClassifier(hidden_layer_sizes=n_layers, learning_rate_init=lr, alpha=alpha, max_iter=1000)
    elif model_name=='ridge':
        C = trial.suggest_loguniform('C', 0.1, 1e5)
        clf = LogisticRegression(penalty='l2', C=C, solver='lbfgs', max_iter=1000)
    else:
        num_leaves = trial.suggest_int('num_leaves', 5, 100)
        min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 10, 100)
        max_depth = trial.suggest_int('max_depth', -1, 10)
        lr = trial.suggest_loguniform('lr', 1e-5, 1.0)
        max_bin = trial.suggest_int('max_bin', 10, 500)
        clf = LGBMClassifier(num_leaves=num_leaves, min_data_in_leaf=min_data_in_leaf, max_depth=max_depth,
                           learning_rate=lr, max_bin=max_bin)
    
    clf.fit(X_train, y_train)
    y_score = clf.predict_proba(X_val)
    return - roc_auc_score(y_val, y_score[:,1])

# requires X, y
def objective_classifier_multi(trial):
    model_name = trial.suggest_categorical('classifier', ['MLP', 'ridge', 'LGBM'])
    if model_name == 'MLP':
        n_layers = trial.suggest_int('n_layers', 1, 3)
        layers = []
        for i in range(n_layers):
            n_units = int(trial.suggest_loguniform('n_units_l{}'.format(i), 4, 500))
            layers.append(n_units)
        lr = trial.suggest_loguniform('lr', 1e-5, 1e-2)
        alpha = trial.suggest_loguniform('alpha', 1e-6, 0.1)
        clf = MLPClassifier(hidden_layer_sizes=n_layers, learning_rate_init=lr, alpha=alpha, max_iter=100)
    elif model_name=='ridge':
        C = trial.suggest_loguniform('C', 0.1, 1e5)
        clf = LogisticRegression(penalty='l2', C=C, solver='lbfgs', max_iter=1000)
    else:
        num_leaves = trial.suggest_int('num_leaves', 5, 100)
        min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 10, 100)
        max_depth = trial.suggest_int('max_depth', -1, 10)
        lr = trial.suggest_loguniform('lr', 1e-5, 1.0)
        max_bin = trial.suggest_int('max_bin', 10, 500)
        clf = LGBMClassifier(num_leaves=num_leaves, min_data_in_leaf=min_data_in_leaf, max_depth=max_depth,
                           learning_rate=lr, max_bin=max_bin)
    
    aucs = np.empty(len(KEYS))
    for i,key in enumerate(KEYS):
        _clf = copy.deepcopy(clf)
        _clf.fit(X_train[df_train[key].notna()], df_train[key].dropna().values)
        y_score = _clf.predict_proba(X_val[df_val[key].notna()])
        aucs[i] = roc_auc_score(df_val[key].dropna().values, y_score[:,1])
    return - np.mean(aucs)

In [18]:
def evaluate_regression(model, X_train, X_val, X_test, y_train, y_val, y_test):
    reg = copy.deepcopy(model)
    reg.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))
    y_pred = reg.predict(X_test)
    rmse = mean_squared_error(y_pred, y_test)**0.5
    r2 = r2_score(y_pred, y_test)
    return rmse, r2

def evaluate_classification(model, X_train, X_val, X_test, y_train, y_val, y_test):
    clf = copy.deepcopy(model)
    clf.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))
    y_score = clf.predict_proba(X_test)
    auc = roc_auc_score(y_test, y_score[:,1])
    return auc

from tqdm import tqdm_notebook as tqdm
def evaluate_classification_multi(model, X_train, X_val, X_test):
    aucs = np.empty(len(KEYS))
    for i,key in tqdm(enumerate(KEYS)):
        clf = copy.deepcopy(model)
        _X_train = np.concatenate([X_train[df_train[key].notna()], X_val[df_val[key].notna()]])
        _y_train = np.concatenate([df_train[key].dropna().values, df_val[key].dropna().values])
        clf.fit(_X_train, _y_train)
        y_score = clf.predict_proba(X_test[df_test[key].notna()])
        aucs[i] = roc_auc_score(df_test[key].dropna().values, y_score[:,1])
    return np.mean(aucs)

In [6]:
def train_val_test_split(X, y, train_size=0.8, val_size=0.1):
    X_train, X_res, y_train, y_res = train_test_split(X, y, train_size=train_size)
    X_val, X_test, y_val, y_test = train_test_split(X_res, y_res, train_size=val_size/(1-train_size))
    return X_train, X_val, X_test, y_train, y_val, y_test

def df_split(df, train_size=0.8, val_size=0.1):
    df_train, df_res = train_test_split(df, train_size=train_size)
    df_val, df_test = train_test_split(df_res, train_size=val_size/(1-train_size))
    return df_train, df_val, df_test

def load_csv(task_name):
    if task_name=='HIV':
        featurizer = dc.feat.ConvMolFeaturizer()
        loader = dc.data.CSVLoader(tasks=['HIV_active'], smiles_field='smiles', featurizer=featurizer)
        dataset = loader.featurize('data/hiv.csv')
    elif task_name=='BACE':
        featurizer = dc.feat.ConvMolFeaturizer()
        loader = dc.data.CSVLoader(tasks=['Class'], smiles_field='mol', featurizer=featurizer)
        dataset = loader.featurize('data/bace.csv')
    elif task_name=='BBBP':
        featurizer = dc.feat.ConvMolFeaturizer()
        loader = dc.data.CSVLoader(tasks=['p_np'], smiles_field='smiles', featurizer=featurizer)
        dataset = loader.featurize('data/bbbp.csv')
    return dataset

def scaffold_split(dataset):
    splitter = dc.splits.ScaffoldSplitter()
    train, val, test = splitter.train_valid_test_split(dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    return train.ids, val.ids, test.ids, train.y, val.y, test.y

def transform_for_scaffold(X):
    x_split = [split(sm) for sm in X]
    xid, xseg = get_array(x_split)
    X = trfm.encode(torch.t(xid))[:,:256]
    return X

## ESOL

In [82]:
df = pd.read_csv('data/esol.csv')
print(df.shape)
df.head()

(1128, 10)


Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.77,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.3,Cc1occc1C(=O)Nc2ccccc2
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.06,CC(C)=CCCC(C)=CC(=O)
3,Picene,-6.618,2,278.354,0,5,0,0.0,-7.87,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4,Thiophene,-2.232,2,84.143,0,1,0,0.0,-1.33,c1ccsc1


In [83]:
x_split = [split(sm) for sm in df['smiles'].values]
xid, xseg = get_array(x_split)
X = trfm.encode(torch.t(xid))[:,:256]
print(X.shape)

There are 1128 molecules. It will take a little time.
(1128, 256)


In [124]:
# R1
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, df['measured log solubility in mols per litre'].values, 0.8, 0.1)
study = optuna.create_study()
study.optimize(objective_regressor, n_trials=100, n_jobs=8)
print(study.best_params)

[I 2019-08-18 15:37:08,461] Finished a trial resulted in value: 0.8062984182516997. Current best value is 0.8062984182516997 with parameters: {'regressor': 'MLP', 'n_layers': 1, 'n_units_l0': 29.82175928576803, 'lr': 0.005043479327380707, 'alpha': 0.0002520699999128908}.
[I 2019-08-18 15:37:14,761] Finished a trial resulted in value: 0.7650370453171574. Current best value is 0.7650370453171574 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 320.7555553362534, 'n_units_l1': 4.662924888719898, 'n_units_l2': 297.06445011941446, 'lr': 0.001779548241739313, 'alpha': 0.020178001607915507}.
[I 2019-08-18 15:37:16,013] Finished a trial resulted in value: 0.8143507679520788. Current best value is 0.7650370453171574 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 320.7555553362534, 'n_units_l1': 4.662924888719898, 'n_units_l2': 297.06445011941446, 'lr': 0.001779548241739313, 'alpha': 0.020178001607915507}.
[I 2019-08-18 15:37:21,478] Finished a trial resulte

[I 2019-08-18 15:37:54,678] Finished a trial resulted in value: 2.136578608915249. Current best value is 0.7650370453171574 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 320.7555553362534, 'n_units_l1': 4.662924888719898, 'n_units_l2': 297.06445011941446, 'lr': 0.001779548241739313, 'alpha': 0.020178001607915507}.
[I 2019-08-18 15:37:59,725] Finished a trial resulted in value: 0.8571274018244209. Current best value is 0.7650370453171574 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 320.7555553362534, 'n_units_l1': 4.662924888719898, 'n_units_l2': 297.06445011941446, 'lr': 0.001779548241739313, 'alpha': 0.020178001607915507}.
[I 2019-08-18 15:38:01,069] Finished a trial resulted in value: 2.2394589384596335. Current best value is 0.7650370453171574 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 320.7555553362534, 'n_units_l1': 4.662924888719898, 'n_units_l2': 297.06445011941446, 'lr': 0.001779548241739313, 'alpha': 0.02017800

[I 2019-08-18 15:39:51,574] Finished a trial resulted in value: 1.5297913492872184. Current best value is 0.7306629020435753 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 60.837748829623166, 'n_units_l1': 27.675266310899964, 'n_units_l2': 152.13327076618842, 'lr': 0.0009568041958378149, 'alpha': 0.0010053023765605766}.
[I 2019-08-18 15:39:52,007] Finished a trial resulted in value: 4.23710881745223. Current best value is 0.7306629020435753 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 60.837748829623166, 'n_units_l1': 27.675266310899964, 'n_units_l2': 152.13327076618842, 'lr': 0.0009568041958378149, 'alpha': 0.0010053023765605766}.
[I 2019-08-18 15:40:01,354] Finished a trial resulted in value: 2.002050415546741. Current best value is 0.7306629020435753 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 60.837748829623166, 'n_units_l1': 27.675266310899964, 'n_units_l2': 152.13327076618842, 'lr': 0.0009568041958378149, 'alpha': 0

[I 2019-08-18 15:40:38,508] Finished a trial resulted in value: 2.42383938476398. Current best value is 0.7306629020435753 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 60.837748829623166, 'n_units_l1': 27.675266310899964, 'n_units_l2': 152.13327076618842, 'lr': 0.0009568041958378149, 'alpha': 0.0010053023765605766}.
[I 2019-08-18 15:40:49,424] Finished a trial resulted in value: 0.8893231073461816. Current best value is 0.7306629020435753 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 60.837748829623166, 'n_units_l1': 27.675266310899964, 'n_units_l2': 152.13327076618842, 'lr': 0.0009568041958378149, 'alpha': 0.0010053023765605766}.
[I 2019-08-18 15:40:49,538] Finished a trial resulted in value: 0.8566780511152926. Current best value is 0.7306629020435753 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 60.837748829623166, 'n_units_l1': 27.675266310899964, 'n_units_l2': 152.13327076618842, 'lr': 0.0009568041958378149, 'alpha': 

[I 2019-08-18 15:42:30,402] Finished a trial resulted in value: 1.609593295860789. Current best value is 0.7306629020435753 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 60.837748829623166, 'n_units_l1': 27.675266310899964, 'n_units_l2': 152.13327076618842, 'lr': 0.0009568041958378149, 'alpha': 0.0010053023765605766}.
[I 2019-08-18 15:42:31,856] Finished a trial resulted in value: 3.0566703187957334. Current best value is 0.7306629020435753 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 60.837748829623166, 'n_units_l1': 27.675266310899964, 'n_units_l2': 152.13327076618842, 'lr': 0.0009568041958378149, 'alpha': 0.0010053023765605766}.


{'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 60.837748829623166, 'n_units_l1': 27.675266310899964, 'n_units_l2': 152.13327076618842, 'lr': 0.0009568041958378149, 'alpha': 0.0010053023765605766}


In [130]:
rmses, r2s = [], []
rmse, r2 = evaluate_regression(MLPRegressor(hidden_layer_sizes=(61, 28, 152), learning_rate_init=0.001, alpha=0.001, max_iter=1000),
                    X_train, X_val, X_test, y_train, y_val, y_test)
rmses.append(rmse)
r2s.append(r2)
print(rmse, r2)

0.660794779541726 0.8642114094993025


In [131]:
# R2
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, df['measured log solubility in mols per litre'].values, 0.8, 0.1)
study = optuna.create_study()
study.optimize(objective_regressor, n_trials=100, n_jobs=8)
print(study.best_params)

[I 2019-08-18 15:46:45,522] Finished a trial resulted in value: 2.140931227727507. Current best value is 2.140931227727507 with parameters: {'regressor': 'MLP', 'n_layers': 1, 'n_units_l0': 142.68295738022042, 'lr': 0.001785514920725926, 'alpha': 1.977303531901813e-05}.
[I 2019-08-18 15:46:49,469] Finished a trial resulted in value: 0.8199581676754127. Current best value is 0.8199581676754127 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 13.144594235838024, 'n_units_l1': 107.45462708422734, 'n_units_l2': 233.740445016075, 'lr': 0.00673495540233891, 'alpha': 4.221274419519423e-05}.
[I 2019-08-18 15:46:55,768] Finished a trial resulted in value: 0.7942510591530612. Current best value is 0.7942510591530612 with parameters: {'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 159.07193446217943, 'n_units_l1': 46.379182388601244, 'lr': 0.0016439138463031772, 'alpha': 0.08506603564315626}.
[I 2019-08-18 15:47:02,398] Finished a trial resulted in value: 2.4193138105521417. C

[I 2019-08-18 15:48:34,319] Finished a trial resulted in value: 0.8978021381196148. Current best value is 0.7910534715966173 with parameters: {'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 77.45417274362354, 'n_units_l1': 31.59177146032885, 'lr': 0.0006296291325761092, 'alpha': 0.0015792971375486618}.
[I 2019-08-18 15:48:37,756] Finished a trial resulted in value: 0.7993690176485719. Current best value is 0.7910534715966173 with parameters: {'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 77.45417274362354, 'n_units_l1': 31.59177146032885, 'lr': 0.0006296291325761092, 'alpha': 0.0015792971375486618}.
[I 2019-08-18 15:48:38,583] Finished a trial resulted in value: 2.2636250009005168. Current best value is 0.7910534715966173 with parameters: {'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 77.45417274362354, 'n_units_l1': 31.59177146032885, 'lr': 0.0006296291325761092, 'alpha': 0.0015792971375486618}.
[I 2019-08-18 15:48:39,834] Finished a trial resulted in value: 0.7980334686818655

[I 2019-08-18 15:50:20,523] Finished a trial resulted in value: 0.7937955783551642. Current best value is 0.762054950488069 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 17.641383032314938, 'n_units_l1': 121.12189095435465, 'n_units_l2': 23.989834911643005, 'lr': 0.0056120333760818885, 'alpha': 0.00019379954510889284}.
[I 2019-08-18 15:50:25,088] Finished a trial resulted in value: 0.8019730061003272. Current best value is 0.762054950488069 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 17.641383032314938, 'n_units_l1': 121.12189095435465, 'n_units_l2': 23.989834911643005, 'lr': 0.0056120333760818885, 'alpha': 0.00019379954510889284}.
[I 2019-08-18 15:50:27,160] Finished a trial resulted in value: 3.523571896964662. Current best value is 0.762054950488069 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 17.641383032314938, 'n_units_l1': 121.12189095435465, 'n_units_l2': 23.989834911643005, 'lr': 0.0056120333760818885, 'alpha': 

{'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 17.641383032314938, 'n_units_l1': 121.12189095435465, 'n_units_l2': 23.989834911643005, 'lr': 0.0056120333760818885, 'alpha': 0.00019379954510889284}


In [132]:
rmse, r2 = evaluate_regression(MLPRegressor(hidden_layer_sizes=(18, 121, 24), learning_rate_init=0.0056, alpha=0.0002, max_iter=1000),
                    X_train, X_val, X_test, y_train, y_val, y_test)
rmses.append(rmse)
r2s.append(r2)
print(rmse, r2)

0.7828080358778235 0.8571673815180736


In [133]:
# R3
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, df['measured log solubility in mols per litre'].values, 0.8, 0.1)
study = optuna.create_study()
study.optimize(objective_regressor, n_trials=100, n_jobs=8)
print(study.best_params)

[I 2019-08-18 15:55:25,408] Finished a trial resulted in value: 0.8156356244847911. Current best value is 0.8156356244847911 with parameters: {'regressor': 'MLP', 'n_layers': 1, 'n_units_l0': 6.319873816307988, 'lr': 0.009405724637900733, 'alpha': 1.0313616668458457e-06}.
[I 2019-08-18 15:55:26,642] Finished a trial resulted in value: 0.7685166120580087. Current best value is 0.7685166120580087 with parameters: {'regressor': 'MLP', 'n_layers': 1, 'n_units_l0': 55.49864896757794, 'lr': 0.005584038660423042, 'alpha': 0.006611507327003259}.
[I 2019-08-18 15:55:29,898] Finished a trial resulted in value: 2.033501450739607. Current best value is 0.7685166120580087 with parameters: {'regressor': 'MLP', 'n_layers': 1, 'n_units_l0': 55.49864896757794, 'lr': 0.005584038660423042, 'alpha': 0.006611507327003259}.
[I 2019-08-18 15:55:41,651] Finished a trial resulted in value: 0.7801978045063838. Current best value is 0.7685166120580087 with parameters: {'regressor': 'MLP', 'n_layers': 1, 'n_units

[I 2019-08-18 15:57:16,080] Finished a trial resulted in value: 0.7736573217695947. Current best value is 0.7571146071076595 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 61.941740604212505, 'n_units_l1': 13.717320051328366, 'n_units_l2': 16.416621792082225, 'lr': 0.0035136260064463744, 'alpha': 2.541000590518668e-05}.
[I 2019-08-18 15:57:18,268] Finished a trial resulted in value: 2.301567980225644. Current best value is 0.7571146071076595 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 61.941740604212505, 'n_units_l1': 13.717320051328366, 'n_units_l2': 16.416621792082225, 'lr': 0.0035136260064463744, 'alpha': 2.541000590518668e-05}.
[I 2019-08-18 15:57:31,704] Finished a trial resulted in value: 2.0340399545894416. Current best value is 0.7571146071076595 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 61.941740604212505, 'n_units_l1': 13.717320051328366, 'n_units_l2': 16.416621792082225, 'lr': 0.0035136260064463744, 'alpha':

[I 2019-08-18 15:59:04,009] Finished a trial resulted in value: 1.079885988876427. Current best value is 0.7571146071076595 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 61.941740604212505, 'n_units_l1': 13.717320051328366, 'n_units_l2': 16.416621792082225, 'lr': 0.0035136260064463744, 'alpha': 2.541000590518668e-05}.
[I 2019-08-18 15:59:06,358] Finished a trial resulted in value: 0.7919574468695977. Current best value is 0.7571146071076595 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 61.941740604212505, 'n_units_l1': 13.717320051328366, 'n_units_l2': 16.416621792082225, 'lr': 0.0035136260064463744, 'alpha': 2.541000590518668e-05}.
[I 2019-08-18 15:59:09,433] Finished a trial resulted in value: 0.7948798752598903. Current best value is 0.7571146071076595 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 61.941740604212505, 'n_units_l1': 13.717320051328366, 'n_units_l2': 16.416621792082225, 'lr': 0.0035136260064463744, 'alpha':

[I 2019-08-18 16:00:01,005] Finished a trial resulted in value: 0.7672279616838401. Current best value is 0.7561884576522679 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 126.99250184497143, 'n_units_l1': 53.111556314795806, 'n_units_l2': 14.80166300803984, 'lr': 0.00031876392033144464, 'alpha': 1.1224925887191937e-06}.
[I 2019-08-18 16:00:04,250] Finished a trial resulted in value: 0.7821638135879333. Current best value is 0.7561884576522679 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 126.99250184497143, 'n_units_l1': 53.111556314795806, 'n_units_l2': 14.80166300803984, 'lr': 0.00031876392033144464, 'alpha': 1.1224925887191937e-06}.
[I 2019-08-18 16:00:06,359] Finished a trial resulted in value: 0.7235384582271918. Current best value is 0.7235384582271918 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 105.54077769552725, 'n_units_l1': 122.67705872448927, 'n_units_l2': 35.301060529322676, 'lr': 0.0003802655304484301, 'alph

{'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 105.54077769552725, 'n_units_l1': 122.67705872448927, 'n_units_l2': 35.301060529322676, 'lr': 0.0003802655304484301, 'alpha': 4.891680797779517e-06}


In [134]:
rmse, r2 = evaluate_regression(MLPRegressor(hidden_layer_sizes=(106, 123, 35), learning_rate_init=1e-4, alpha=4e-4, max_iter=1000),
                    X_train, X_val, X_test, y_train, y_val, y_test)
rmses.append(rmse)
r2s.append(r2)
print(rmse, r2)

0.7104176213028807 0.8789203069112204


In [138]:
print('RMSE | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(rmses), np.std(rmses)))
print('R2   | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(r2s), np.std(r2s)))

RMSE | MEAN: 0.7180 | STD: 0.0501
R2   | MEAN: 0.8668 | STD: 0.0091


## FreeSolv

In [139]:
df = pd.read_csv('data/freesolv.csv')
print(df.shape)
df.head()

(642, 4)


Unnamed: 0,iupac,smiles,expt,calc
0,"4-methoxy-N,N-dimethyl-benzamide",CN(C)C(=O)c1ccc(cc1)OC,-11.01,-9.625
1,methanesulfonyl chloride,CS(=O)(=O)Cl,-4.87,-6.219
2,3-methylbut-1-ene,CC(C)C=C,1.83,2.452
3,2-ethylpyrazine,CCc1cnccn1,-5.45,-5.809
4,heptan-1-ol,CCCCCCCO,-4.21,-2.917


In [140]:
x_split = [split(sm) for sm in df['smiles'].values]
xid, xseg = get_array(x_split)
X = trfm.encode(torch.t(xid))[:,:256]
print(X.shape)

There are 642 molecules. It will take a little time.
(642, 256)


In [142]:
# R1
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, df['expt'].values, 0.8, 0.1)
study = optuna.create_study()
study.optimize(objective_regressor, n_trials=100, n_jobs=8)
print(study.best_params)

[I 2019-08-18 16:14:43,117] Finished a trial resulted in value: 2.239786012505541. Current best value is 2.239786012505541 with parameters: {'regressor': 'ridge', 'lr': 0.001962386393838839}.
[I 2019-08-18 16:14:43,147] Finished a trial resulted in value: 2.4937679280309646. Current best value is 2.239786012505541 with parameters: {'regressor': 'ridge', 'lr': 0.001962386393838839}.
[I 2019-08-18 16:14:43,500] Finished a trial resulted in value: 3.741885907364826. Current best value is 2.239786012505541 with parameters: {'regressor': 'ridge', 'lr': 0.001962386393838839}.
[I 2019-08-18 16:14:43,784] Finished a trial resulted in value: 2.096047724464638. Current best value is 2.096047724464638 with parameters: {'regressor': 'LGBM', 'num_leaves': 16, 'min_data_in_leaf': 100, 'max_depth': 8, 'lr': 0.12920672579032424, 'max_bin': 280}.
[I 2019-08-18 16:14:44,005] Finished a trial resulted in value: 3.286690305994146. Current best value is 2.096047724464638 with parameters: {'regressor': 'LGB

[I 2019-08-18 16:15:27,538] Finished a trial resulted in value: 1.6520964664685023. Current best value is 1.619933116635565 with parameters: {'regressor': 'ridge', 'lr': 0.21932610014229692}.
[I 2019-08-18 16:15:27,635] Finished a trial resulted in value: 1.6598593659088088. Current best value is 1.619933116635565 with parameters: {'regressor': 'ridge', 'lr': 0.21932610014229692}.
[I 2019-08-18 16:15:27,728] Finished a trial resulted in value: 1.674079747675599. Current best value is 1.619933116635565 with parameters: {'regressor': 'ridge', 'lr': 0.21932610014229692}.
[I 2019-08-18 16:15:27,761] Finished a trial resulted in value: 1.722028359644771. Current best value is 1.619933116635565 with parameters: {'regressor': 'ridge', 'lr': 0.21932610014229692}.
[I 2019-08-18 16:15:27,900] Finished a trial resulted in value: 1.6272137614122808. Current best value is 1.619933116635565 with parameters: {'regressor': 'ridge', 'lr': 0.21932610014229692}.
[I 2019-08-18 16:15:27,921] Finished a tri

{'regressor': 'ridge', 'lr': 0.23306784850656748}


In [144]:
rmses, r2s = [], []
rmse, r2 = evaluate_regression(Ridge(alpha=0.233, max_iter=1000),
                    X_train, X_val, X_test, y_train, y_val, y_test)
rmses.append(rmse)
r2s.append(r2)
print(rmse, r2)

1.3315136782418537 0.8162533636387914


In [145]:
# R2
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, df['expt'].values, 0.8, 0.1)
study = optuna.create_study()
study.optimize(objective_regressor, n_trials=100, n_jobs=8)
print(study.best_params)

[I 2019-08-18 16:17:49,569] Finished a trial resulted in value: 1.9771329556093928. Current best value is 1.9771329556093928 with parameters: {'regressor': 'ridge', 'lr': 0.08339779796353189}.
[I 2019-08-18 16:17:49,592] Finished a trial resulted in value: 1.8879978858411761. Current best value is 1.8879978858411761 with parameters: {'regressor': 'ridge', 'lr': 1.8048265469337135}.
[I 2019-08-18 16:17:50,294] Finished a trial resulted in value: 3.518870198107538. Current best value is 1.8879978858411761 with parameters: {'regressor': 'ridge', 'lr': 1.8048265469337135}.
[I 2019-08-18 16:17:51,373] Finished a trial resulted in value: 3.45533119176083. Current best value is 1.8879978858411761 with parameters: {'regressor': 'ridge', 'lr': 1.8048265469337135}.
[I 2019-08-18 16:17:51,399] Finished a trial resulted in value: 3.0231223165377528. Current best value is 1.8879978858411761 with parameters: {'regressor': 'ridge', 'lr': 1.8048265469337135}.
[I 2019-08-18 16:17:51,440] Finished a tri

[I 2019-08-18 16:18:49,303] Finished a trial resulted in value: 4.233008388423114. Current best value is 1.5018229999061394 with parameters: {'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 13.320665246375722, 'n_units_l1': 69.41150491552574, 'lr': 0.005584532769073192, 'alpha': 1.708113387062951e-05}.
[I 2019-08-18 16:18:52,500] Finished a trial resulted in value: 4.501597483837352. Current best value is 1.5018229999061394 with parameters: {'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 13.320665246375722, 'n_units_l1': 69.41150491552574, 'lr': 0.005584532769073192, 'alpha': 1.708113387062951e-05}.
[I 2019-08-18 16:18:53,178] Finished a trial resulted in value: 2.442603111079527. Current best value is 1.5018229999061394 with parameters: {'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 13.320665246375722, 'n_units_l1': 69.41150491552574, 'lr': 0.005584532769073192, 'alpha': 1.708113387062951e-05}.
[I 2019-08-18 16:18:53,189] Finished a trial resulted in value: 2.0326356203785445. C

[I 2019-08-18 16:19:13,662] Finished a trial resulted in value: 5.808025795054196. Current best value is 1.5018229999061394 with parameters: {'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 13.320665246375722, 'n_units_l1': 69.41150491552574, 'lr': 0.005584532769073192, 'alpha': 1.708113387062951e-05}.
[I 2019-08-18 16:19:15,516] Finished a trial resulted in value: 2.5422817603414187. Current best value is 1.5018229999061394 with parameters: {'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 13.320665246375722, 'n_units_l1': 69.41150491552574, 'lr': 0.005584532769073192, 'alpha': 1.708113387062951e-05}.
[I 2019-08-18 16:19:18,984] Finished a trial resulted in value: 2.030256236926934. Current best value is 1.5018229999061394 with parameters: {'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 13.320665246375722, 'n_units_l1': 69.41150491552574, 'lr': 0.005584532769073192, 'alpha': 1.708113387062951e-05}.
[I 2019-08-18 16:19:21,714] Finished a trial resulted in value: 3.721877549509574. C

{'regressor': 'MLP', 'n_layers': 2, 'n_units_l0': 13.320665246375722, 'n_units_l1': 69.41150491552574, 'lr': 0.005584532769073192, 'alpha': 1.708113387062951e-05}


In [146]:
rmse, r2 = evaluate_regression(MLPRegressor(hidden_layer_sizes=(13, 69), learning_rate_init=6e-3, alpha=1.7e-5, max_iter=1000),
                    X_train, X_val, X_test, y_train, y_val, y_test)
rmses.append(rmse)
r2s.append(r2)
print(rmse, r2)

1.4936558323751365 0.7541146212763161


In [147]:
# R3
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, df['expt'].values, 0.8, 0.1)
study = optuna.create_study()
study.optimize(objective_regressor, n_trials=100, n_jobs=8)
print(study.best_params)

[I 2019-08-18 16:22:57,793] Finished a trial resulted in value: 1.7090753497008506. Current best value is 1.7090753497008506 with parameters: {'regressor': 'ridge', 'lr': 0.518126012332248}.
[I 2019-08-18 16:22:57,804] Finished a trial resulted in value: 1.7407598535788895. Current best value is 1.7090753497008506 with parameters: {'regressor': 'ridge', 'lr': 0.518126012332248}.
[I 2019-08-18 16:22:57,803] Finished a trial resulted in value: 2.5088305920220018. Current best value is 1.7090753497008506 with parameters: {'regressor': 'ridge', 'lr': 0.518126012332248}.
[I 2019-08-18 16:22:58,270] Finished a trial resulted in value: 3.533981111531126. Current best value is 1.7090753497008506 with parameters: {'regressor': 'ridge', 'lr': 0.518126012332248}.
[I 2019-08-18 16:22:58,548] Finished a trial resulted in value: 2.526004418494889. Current best value is 1.7090753497008506 with parameters: {'regressor': 'ridge', 'lr': 0.518126012332248}.
[I 2019-08-18 16:22:58,669] Finished a trial re

[I 2019-08-18 16:23:19,973] Finished a trial resulted in value: 2.9442242473970768. Current best value is 1.7090753497008506 with parameters: {'regressor': 'ridge', 'lr': 0.518126012332248}.
[I 2019-08-18 16:23:20,894] Finished a trial resulted in value: 2.2441180999083628. Current best value is 1.7090753497008506 with parameters: {'regressor': 'ridge', 'lr': 0.518126012332248}.
[I 2019-08-18 16:23:21,048] Finished a trial resulted in value: 1.7123441122065715. Current best value is 1.7090753497008506 with parameters: {'regressor': 'ridge', 'lr': 0.518126012332248}.
[I 2019-08-18 16:23:21,106] Finished a trial resulted in value: 2.4150483731313757. Current best value is 1.7090753497008506 with parameters: {'regressor': 'ridge', 'lr': 0.518126012332248}.
[I 2019-08-18 16:23:21,192] Finished a trial resulted in value: 2.247982312228734. Current best value is 1.7090753497008506 with parameters: {'regressor': 'ridge', 'lr': 0.518126012332248}.
[I 2019-08-18 16:23:21,256] Finished a trial r

[I 2019-08-18 16:24:12,349] Finished a trial resulted in value: 1.7407686259512336. Current best value is 1.6595424580510483 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 13.119719506912197, 'n_units_l1': 104.0894932302287, 'n_units_l2': 192.0026833998186, 'lr': 0.0014441751455692398, 'alpha': 0.0815726312202126}.
[I 2019-08-18 16:24:14,195] Finished a trial resulted in value: 2.029607463960102. Current best value is 1.6595424580510483 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 13.119719506912197, 'n_units_l1': 104.0894932302287, 'n_units_l2': 192.0026833998186, 'lr': 0.0014441751455692398, 'alpha': 0.0815726312202126}.
[I 2019-08-18 16:24:14,477] Finished a trial resulted in value: 1.830740949858326. Current best value is 1.6595424580510483 with parameters: {'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 13.119719506912197, 'n_units_l1': 104.0894932302287, 'n_units_l2': 192.0026833998186, 'lr': 0.0014441751455692398, 'alpha': 0.0815726312

{'regressor': 'MLP', 'n_layers': 3, 'n_units_l0': 13.119719506912197, 'n_units_l1': 104.0894932302287, 'n_units_l2': 192.0026833998186, 'lr': 0.0014441751455692398, 'alpha': 0.0815726312202126}


In [148]:
rmse, r2 = evaluate_regression(MLPRegressor(hidden_layer_sizes=(13, 104, 192), learning_rate_init=1e-3, alpha=0.08, max_iter=1000),
                    X_train, X_val, X_test, y_train, y_val, y_test)
rmses.append(rmse)
r2s.append(r2)
print(rmse, r2)

2.1321125120833133 0.7630091700966755


In [149]:
print('RMSE | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(rmses), np.std(rmses)))
print('R2   | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(r2s), np.std(r2s)))

RMSE | MEAN: 1.6524 | STD: 0.3456
R2   | MEAN: 0.7778 | STD: 0.0274


## Lipo

In [150]:
df = pd.read_csv('data/lipo.csv')
print(df.shape)
df.head()

(4200, 3)


Unnamed: 0,CMPD_CHEMBLID,exp,smiles
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...


In [151]:
x_split = [split(sm) for sm in df['smiles'].values]
xid, xseg = get_array(x_split)
X = trfm.encode(torch.t(xid))[:,:256]
print(X.shape)
rmses, r2s = [], []

SMILES is too long (251)
SMILES is too long (267)
There are 4200 molecules. It will take a little time.
(4200, 256)


In [152]:
# R1
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, df['exp'].values, 0.8, 0.1)
study = optuna.create_study()
study.optimize(objective_regressor, n_trials=100, n_jobs=8)
print(study.best_params)

  overwrite_a=True).T
  overwrite_a=True).T
[I 2019-08-18 16:29:34,709] Finished a trial resulted in value: 1.0572785064992871. Current best value is 1.0572785064992871 with parameters: {'regressor': 'ridge', 'lr': 0.00020635203126048333}.
[I 2019-08-18 16:29:34,717] Finished a trial resulted in value: 1.0495996024506158. Current best value is 1.0495996024506158 with parameters: {'regressor': 'ridge', 'lr': 1.2480972599297566}.
[I 2019-08-18 16:29:34,747] Finished a trial resulted in value: 1.0571394039696356. Current best value is 1.0495996024506158 with parameters: {'regressor': 'ridge', 'lr': 1.2480972599297566}.
[I 2019-08-18 16:29:34,756] Finished a trial resulted in value: 1.0517048151019113. Current best value is 1.0495996024506158 with parameters: {'regressor': 'ridge', 'lr': 1.2480972599297566}.
  overwrite_a=True).T
[I 2019-08-18 16:29:35,653] Finished a trial resulted in value: 1.0572845302186462. Current best value is 1.0495996024506158 with parameters: {'regressor': 'ridge

[I 2019-08-18 16:32:16,845] Finished a trial resulted in value: 1.0585540548984256. Current best value is 0.9776029610520721 with parameters: {'regressor': 'LGBM', 'num_leaves': 29, 'min_data_in_leaf': 46, 'max_depth': 8, 'lr': 0.08427898777956587, 'max_bin': 107}.
[I 2019-08-18 16:32:21,375] Finished a trial resulted in value: 1.0104908805067427. Current best value is 0.9776029610520721 with parameters: {'regressor': 'LGBM', 'num_leaves': 29, 'min_data_in_leaf': 46, 'max_depth': 8, 'lr': 0.08427898777956587, 'max_bin': 107}.
[I 2019-08-18 16:32:23,639] Finished a trial resulted in value: 1.0532974644495465. Current best value is 0.9776029610520721 with parameters: {'regressor': 'LGBM', 'num_leaves': 29, 'min_data_in_leaf': 46, 'max_depth': 8, 'lr': 0.08427898777956587, 'max_bin': 107}.
[I 2019-08-18 16:32:25,889] Finished a trial resulted in value: 1.0599472079841965. Current best value is 0.9776029610520721 with parameters: {'regressor': 'LGBM', 'num_leaves': 29, 'min_data_in_leaf': 

{'regressor': 'LGBM', 'num_leaves': 31, 'min_data_in_leaf': 19, 'max_depth': 5, 'lr': 0.130073706289035, 'max_bin': 147}


In [154]:
rmse, r2 = evaluate_regression(LGBMRegressor(num_leaves=31, min_data_in_leaf=19, max_depth=5,
                           learning_rate=0.13, max_bin=147),
                    X_train, X_val, X_test, y_train, y_val, y_test)
rmses.append(rmse)
r2s.append(r2)
print(rmse, r2)

0.9114038773899895 -0.5169241818778088


In [155]:
# R2
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, df['exp'].values, 0.8, 0.1)
study = optuna.create_study()
study.optimize(objective_regressor, n_trials=100, n_jobs=8)
print(study.best_params)

  overwrite_a=True).T
[I 2019-08-18 16:35:12,315] Finished a trial resulted in value: 0.9505819100337827. Current best value is 0.9505819100337827 with parameters: {'regressor': 'ridge', 'lr': 0.23343699523240924}.
[I 2019-08-18 16:35:12,318] Finished a trial resulted in value: 0.9651631051647794. Current best value is 0.9505819100337827 with parameters: {'regressor': 'ridge', 'lr': 0.23343699523240924}.
[I 2019-08-18 16:35:12,344] Finished a trial resulted in value: 0.9602388641549501. Current best value is 0.9505819100337827 with parameters: {'regressor': 'ridge', 'lr': 0.23343699523240924}.
[I 2019-08-18 16:35:12,823] Finished a trial resulted in value: 1.159460623302985. Current best value is 0.9505819100337827 with parameters: {'regressor': 'ridge', 'lr': 0.23343699523240924}.
[I 2019-08-18 16:35:13,211] Finished a trial resulted in value: 1.158608165763756. Current best value is 0.9505819100337827 with parameters: {'regressor': 'ridge', 'lr': 0.23343699523240924}.
[I 2019-08-18 1

[I 2019-08-18 16:37:58,897] Finished a trial resulted in value: 0.9309270035818991. Current best value is 0.8688252803162235 with parameters: {'regressor': 'LGBM', 'num_leaves': 100, 'min_data_in_leaf': 44, 'max_depth': 0, 'lr': 0.07566593298688909, 'max_bin': 471}.
[I 2019-08-18 16:38:01,580] Finished a trial resulted in value: 1.0609522720058746. Current best value is 0.8688252803162235 with parameters: {'regressor': 'LGBM', 'num_leaves': 100, 'min_data_in_leaf': 44, 'max_depth': 0, 'lr': 0.07566593298688909, 'max_bin': 471}.
[I 2019-08-18 16:38:27,429] Finished a trial resulted in value: 0.9599224934587656. Current best value is 0.8688252803162235 with parameters: {'regressor': 'LGBM', 'num_leaves': 100, 'min_data_in_leaf': 44, 'max_depth': 0, 'lr': 0.07566593298688909, 'max_bin': 471}.
[I 2019-08-18 16:39:06,854] Finished a trial resulted in value: 0.9490649883017466. Current best value is 0.8688252803162235 with parameters: {'regressor': 'LGBM', 'num_leaves': 100, 'min_data_in_lea

{'regressor': 'LGBM', 'num_leaves': 100, 'min_data_in_leaf': 44, 'max_depth': 0, 'lr': 0.07566593298688909, 'max_bin': 471}


In [156]:
rmse, r2 = evaluate_regression(LGBMRegressor(num_leaves=100, min_data_in_leaf=44, max_depth=0,
                           learning_rate=0.076, max_bin=471),
                    X_train, X_val, X_test, y_train, y_val, y_test)
rmses.append(rmse)
r2s.append(r2)
print(rmse, r2)

0.9417347448303828 -0.5839789817112275


In [157]:
# R3
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, df['exp'].values, 0.8, 0.1)
study = optuna.create_study()
study.optimize(objective_regressor, n_trials=100, n_jobs=8)
print(study.best_params)

[I 2019-08-18 16:41:12,900] Finished a trial resulted in value: 1.0340343546629056. Current best value is 1.0340343546629056 with parameters: {'regressor': 'ridge', 'lr': 0.19609048976211932}.
[I 2019-08-18 16:41:12,905] Finished a trial resulted in value: 1.0360290145268145. Current best value is 1.0340343546629056 with parameters: {'regressor': 'ridge', 'lr': 0.19609048976211932}.
[I 2019-08-18 16:41:14,557] Finished a trial resulted in value: 1.055183024326468. Current best value is 1.0340343546629056 with parameters: {'regressor': 'ridge', 'lr': 0.19609048976211932}.
  overwrite_a=True).T
[I 2019-08-18 16:41:14,607] Finished a trial resulted in value: 1.0480545842804339. Current best value is 1.0340343546629056 with parameters: {'regressor': 'ridge', 'lr': 0.19609048976211932}.
[I 2019-08-18 16:41:16,795] Finished a trial resulted in value: 0.9988159775267536. Current best value is 0.9988159775267536 with parameters: {'regressor': 'LGBM', 'num_leaves': 12, 'min_data_in_leaf': 13, '

[I 2019-08-18 16:42:55,201] Finished a trial resulted in value: 1.150381154454286. Current best value is 0.9486874400747575 with parameters: {'regressor': 'LGBM', 'num_leaves': 69, 'min_data_in_leaf': 77, 'max_depth': 8, 'lr': 0.07710836899579111, 'max_bin': 56}.
[I 2019-08-18 16:43:24,672] Finished a trial resulted in value: 1.020783336824812. Current best value is 0.9486874400747575 with parameters: {'regressor': 'LGBM', 'num_leaves': 69, 'min_data_in_leaf': 77, 'max_depth': 8, 'lr': 0.07710836899579111, 'max_bin': 56}.
[I 2019-08-18 16:43:42,219] Finished a trial resulted in value: 1.017331920179582. Current best value is 0.9486874400747575 with parameters: {'regressor': 'LGBM', 'num_leaves': 69, 'min_data_in_leaf': 77, 'max_depth': 8, 'lr': 0.07710836899579111, 'max_bin': 56}.
[I 2019-08-18 16:43:43,454] Finished a trial resulted in value: 1.0124274839514233. Current best value is 0.9486874400747575 with parameters: {'regressor': 'LGBM', 'num_leaves': 69, 'min_data_in_leaf': 77, 'm

{'regressor': 'LGBM', 'num_leaves': 95, 'min_data_in_leaf': 12, 'max_depth': 0, 'lr': 0.031342906079087025, 'max_bin': 144}


In [158]:
rmse, r2 = evaluate_regression(LGBMRegressor(num_leaves=95, min_data_in_leaf=12, max_depth=0,
                           learning_rate=0.03, max_bin=144),
                    X_train, X_val, X_test, y_train, y_val, y_test)
rmses.append(rmse)
r2s.append(r2)
print(rmse, r2)

0.9197848251994364 -1.034599012765328


In [159]:
print('RMSE | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(rmses), np.std(rmses)))
print('R2   | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(r2s), np.std(r2s)))

RMSE | MEAN: 0.9211 | STD: 0.0124
R2   | MEAN: -0.6631 | STD: 0.2162


## HIV

In [24]:
dataset = load_csv('HIV')

Loading raw samples now.
shard_size: 8192
About to start loading CSV from data/hiv.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 35.115 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 1 took 36.497 s
Loading shard 3 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 2 took 38.041 s
Loading shard 4 of size 8192.
Featurizing sample 0
Featuri

In [25]:
aucs = []

In [26]:
# R1
X_train, X_val, X_test, y_train, y_val, y_test = scaffold_split(dataset)
X_train = transform_for_scaffold(X_train)
X_val = transform_for_scaffold(X_val)
X_test = transform_for_scaffold(X_test)
study = optuna.create_study()
study.optimize(objective_classifier, n_trials=100, n_jobs=8)
print(study.best_params)

TIMING: dataset construction took 53.094 s
Loading dataset from disk.
TIMING: dataset construction took 15.898 s
Loading dataset from disk.
TIMING: dataset construction took 11.779 s
Loading dataset from disk.
SMILES is too long (226)
SMILES is too long (244)
SMILES is too long (243)
SMILES is too long (346)
SMILES is too long (240)
SMILES is too long (370)
SMILES is too long (224)
SMILES is too long (283)
SMILES is too long (265)
SMILES is too long (240)
SMILES is too long (219)
SMILES is too long (246)
SMILES is too long (243)
SMILES is too long (284)
SMILES is too long (270)
SMILES is too long (232)
SMILES is too long (260)
SMILES is too long (284)
SMILES is too long (284)
SMILES is too long (439)
SMILES is too long (491)
SMILES is too long (439)
SMILES is too long (296)
SMILES is too long (341)
SMILES is too long (285)
SMILES is too long (327)
SMILES is too long (341)
SMILES is too long (400)
SMILES is too long (263)
SMILES is too long (238)
SMILES is too long (383)
SMILES is too l

  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:29:12,619] Finished trial#0 resulted in value: -0.7400609935332159. Current best value is -0.7400609935332159 with parameters: {'C': 8.46381444440547, 'classifier': 'ridge'}.
[I 2019-08-18 18:29:18,980] Finished trial#1 resulted in value: -0.750153096217911. Current best value is -0.750153096217911 with parameters: {'C': 0.6231597036256558, 'classifier': 'ridge'}.
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:29:26,210] Finished trial#2 resulted in value: -0.7177824319028023. Current best value is -0.750153096217911 with parameters: {'C': 0.6231597036256558, 'classifier': 'ridge'}.
[I 2019-08-18 18:29:39,042] Finished trial#3 resulted in value: -0.6852066186556925. Current best value is -0.750153096217911 with parameters: {'C': 0.6231597036256558, 'classifier': 'ridge'}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:29:43,016] Finished trial#4 resulted in value: -0.7233903463648835. Current best 

  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:31:51,704] Finished trial#25 resulted in value: -0.7426758769351361. Current best value is -0.7750434793258867 with parameters: {'max_depth': -1, 'num_leaves': 86, 'max_bin': 286, 'lr': 0.11162737852297755, 'classifier': 'LGBM', 'min_data_in_leaf': 66}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:31:55,454] Finished trial#26 resulted in value: -0.7481659073094259. Current best value is -0.7750434793258867 with parameters: {'max_depth': -1, 'num_leaves': 86, 'max_bin': 286, 'lr': 0.11162737852297755, 'classifier': 'LGBM', 'min_data_in_leaf': 66}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:31:57,065] Finished trial#27 resulted in value: -0.7275698731138546. Current best value is -0.7750434793258867 with parameters: {'max_depth': -1, 'num_leaves': 86, 'max_bin': 286, 'lr': 0.11162737852297755, 'classifier': 'LGBM', 'min_data_in_leaf': 66}.
  y = column_or_1d(y, 

  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:33:45,156] Finished trial#39 resulted in value: -0.7697004213207916. Current best value is -0.8031427591612776 with parameters: {'max_depth': -1, 'num_leaves': 100, 'max_bin': 374, 'lr': 0.055484165866305536, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:33:51,894] Finished trial#40 resulted in value: -0.7994898834019204. Current best value is -0.8031427591612776 with parameters: {'max_depth': -1, 'num_leaves': 100, 'max_bin': 374, 'lr': 0.055484165866305536, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:33:57,918] Finished trial#41 resulted in value: -0.7993643445032333. Current best value is -0.8031427591612776 with parameters: {'max_depth': -1, 'num_leaves': 100, 'max_bin': 374, 'lr': 0.055484165866305536, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
  y = column_or_

[I 2019-08-18 18:35:19,320] Finished trial#58 resulted in value: -0.7428167254556144. Current best value is -0.8054208308837939 with parameters: {'max_depth': -1, 'num_leaves': 93, 'max_bin': 382, 'lr': 0.10518384033697922, 'classifier': 'LGBM', 'min_data_in_leaf': 72}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:35:31,418] Finished trial#59 resulted in value: -0.7681265922006664. Current best value is -0.8054208308837939 with parameters: {'max_depth': -1, 'num_leaves': 93, 'max_bin': 382, 'lr': 0.10518384033697922, 'classifier': 'LGBM', 'min_data_in_leaf': 72}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:35:38,800] Finished trial#60 resulted in value: -0.8035959239662943. Current best value is -0.8054208308837939 with parameters: {'max_depth': -1, 'num_leaves': 93, 'max_bin': 382, 'lr': 0.10518384033697922, 'classifier': 'LGBM', 'min_data_in_leaf': 72}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, 

[I 2019-08-18 18:36:16,773] Finished trial#68 resulted in value: -0.754843964334705. Current best value is -0.8054208308837939 with parameters: {'max_depth': -1, 'num_leaves': 93, 'max_bin': 382, 'lr': 0.10518384033697922, 'classifier': 'LGBM', 'min_data_in_leaf': 72}.
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:36:27,664] Finished trial#69 resulted in value: -0.7339861356065059. Current best value is -0.8054208308837939 with parameters: {'max_depth': -1, 'num_leaves': 93, 'max_bin': 382, 'lr': 0.10518384033697922, 'classifier': 'LGBM', 'min_data_in_leaf': 72}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:36:34,267] Finished trial#70 resulted in value: -0.5234160665294925. Current best value is -0.8054208308837939 with parameters: {'max_depth': -1, 'num_leaves': 93, 'max_bin': 382, 'lr': 0.10518384033697922, 'classifier': 'LGBM', 'min_data_in_leaf': 72}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:36:40,

[I 2019-08-18 18:38:21,639] Finished trial#88 resulted in value: -0.7817521555947482. Current best value is -0.8054208308837939 with parameters: {'max_depth': -1, 'num_leaves': 93, 'max_bin': 382, 'lr': 0.10518384033697922, 'classifier': 'LGBM', 'min_data_in_leaf': 72}.
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:38:24,351] Finished trial#89 resulted in value: -0.7521586566725456. Current best value is -0.8054208308837939 with parameters: {'max_depth': -1, 'num_leaves': 93, 'max_bin': 382, 'lr': 0.10518384033697922, 'classifier': 'LGBM', 'min_data_in_leaf': 72}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:38:32,517] Finished trial#90 resulted in value: -0.793503821281599. Current best value is -0.8054208308837939 with parameters: {'max_depth': -1, 'num_leaves': 93, 'max_bin': 382, 'lr': 0.10518384033697922, 'classifier': 'LGBM', 'min_data_in_leaf': 72}.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[I 2019-08-18 18:38:42,

{'max_depth': -1, 'num_leaves': 93, 'max_bin': 382, 'lr': 0.10518384033697922, 'classifier': 'LGBM', 'min_data_in_leaf': 72}


In [27]:
auc = evaluate_classification(LGBMClassifier(num_leaves=93, min_data_in_leaf=72, max_depth=-1,
                           learning_rate=0.1, max_bin=382),
                    X_train, X_val, X_test, y_train, y_val, y_test)
aucs.append(auc)
print(auc)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.7420266903570946


In [32]:
# R2
X_train, X_val, X_test, y_train, y_val, y_test = scaffold_split(dataset)
X_train = transform_for_scaffold(X_train)
X_val = transform_for_scaffold(X_val)
X_test = transform_for_scaffold(X_test)
y_train, y_val, y_test = y_train.flatten(), y_val.flatten(), y_test.flatten()
study = optuna.create_study()
study.optimize(objective_classifier, n_trials=100, n_jobs=8)
print(study.best_params)

TIMING: dataset construction took 52.165 s
Loading dataset from disk.
TIMING: dataset construction took 16.157 s
Loading dataset from disk.
TIMING: dataset construction took 12.073 s
Loading dataset from disk.
SMILES is too long (226)
SMILES is too long (244)
SMILES is too long (243)
SMILES is too long (346)
SMILES is too long (240)
SMILES is too long (370)
SMILES is too long (224)
SMILES is too long (283)
SMILES is too long (265)
SMILES is too long (240)
SMILES is too long (219)
SMILES is too long (246)
SMILES is too long (243)
SMILES is too long (284)
SMILES is too long (270)
SMILES is too long (232)
SMILES is too long (260)
SMILES is too long (284)
SMILES is too long (284)
SMILES is too long (439)
SMILES is too long (491)
SMILES is too long (439)
SMILES is too long (296)
SMILES is too long (341)
SMILES is too long (285)
SMILES is too long (327)
SMILES is too long (341)
SMILES is too long (400)
SMILES is too long (263)
SMILES is too long (238)
SMILES is too long (383)
SMILES is too l

  **self._backend_args)
[I 2019-08-18 18:54:42,284] Finished trial#6 resulted in value: -0.7667946551048402. Current best value is -0.7667946551048402 with parameters: {'max_depth': 3, 'num_leaves': 31, 'max_bin': 100, 'lr': 0.05196371475095054, 'classifier': 'LGBM', 'min_data_in_leaf': 17}.
[I 2019-08-18 18:54:47,147] Finished trial#0 resulted in value: -0.6704098691945913. Current best value is -0.7667946551048402 with parameters: {'max_depth': 3, 'num_leaves': 31, 'max_bin': 100, 'lr': 0.05196371475095054, 'classifier': 'LGBM', 'min_data_in_leaf': 17}.
[I 2019-08-18 18:54:56,781] Finished trial#2 resulted in value: -0.5. Current best value is -0.7667946551048402 with parameters: {'max_depth': 3, 'num_leaves': 31, 'max_bin': 100, 'lr': 0.05196371475095054, 'classifier': 'LGBM', 'min_data_in_leaf': 17}.
[I 2019-08-18 18:55:05,477] Finished trial#9 resulted in value: -0.7128068048206937. Current best value is -0.7667946551048402 with parameters: {'max_depth': 3, 'num_leaves': 31, 'max_

[I 2019-08-18 19:00:24,941] Finished trial#56 resulted in value: -0.7398619684499315. Current best value is -0.7733471732314324 with parameters: {'max_depth': -1, 'num_leaves': 24, 'max_bin': 314, 'lr': 0.030901030437331257, 'classifier': 'LGBM', 'min_data_in_leaf': 55}.
[I 2019-08-18 19:00:26,089] Finished trial#53 resulted in value: -0.7606769302371152. Current best value is -0.7733471732314324 with parameters: {'max_depth': -1, 'num_leaves': 24, 'max_bin': 314, 'lr': 0.030901030437331257, 'classifier': 'LGBM', 'min_data_in_leaf': 55}.
[I 2019-08-18 19:00:31,201] Finished trial#61 resulted in value: -0.7376451352145796. Current best value is -0.7733471732314324 with parameters: {'max_depth': -1, 'num_leaves': 24, 'max_bin': 314, 'lr': 0.030901030437331257, 'classifier': 'LGBM', 'min_data_in_leaf': 55}.
[I 2019-08-18 19:00:36,628] Finished trial#60 resulted in value: -0.7534875318440133. Current best value is -0.7733471732314324 with parameters: {'max_depth': -1, 'num_leaves': 24, 'ma

{'max_depth': 3, 'num_leaves': 33, 'max_bin': 80, 'lr': 0.08438026049287423, 'classifier': 'LGBM', 'min_data_in_leaf': 74}


In [33]:
auc = evaluate_classification(LGBMClassifier(num_leaves=33, min_data_in_leaf=74, max_depth=3,
                           learning_rate=0.08, max_bin=80),
                    X_train, X_val, X_test, y_train, y_val, y_test)
aucs.append(auc)
print(auc)

0.7132853087158886


In [34]:
# R3
X_train, X_val, X_test, y_train, y_val, y_test = scaffold_split(dataset)
X_train = transform_for_scaffold(X_train)
X_val = transform_for_scaffold(X_val)
X_test = transform_for_scaffold(X_test)
y_train, y_val, y_test = y_train.flatten(), y_val.flatten(), y_test.flatten()
study = optuna.create_study()
study.optimize(objective_classifier, n_trials=100, n_jobs=8)
print(study.best_params)

TIMING: dataset construction took 54.427 s
Loading dataset from disk.
TIMING: dataset construction took 16.149 s
Loading dataset from disk.
TIMING: dataset construction took 11.532 s
Loading dataset from disk.
SMILES is too long (226)
SMILES is too long (244)
SMILES is too long (243)
SMILES is too long (346)
SMILES is too long (240)
SMILES is too long (370)
SMILES is too long (224)
SMILES is too long (283)
SMILES is too long (265)
SMILES is too long (240)
SMILES is too long (219)
SMILES is too long (246)
SMILES is too long (243)
SMILES is too long (284)
SMILES is too long (270)
SMILES is too long (232)
SMILES is too long (260)
SMILES is too long (284)
SMILES is too long (284)
SMILES is too long (439)
SMILES is too long (491)
SMILES is too long (439)
SMILES is too long (296)
SMILES is too long (341)
SMILES is too long (285)
SMILES is too long (327)
SMILES is too long (341)
SMILES is too long (400)
SMILES is too long (263)
SMILES is too long (238)
SMILES is too long (383)
SMILES is too l

  **self._backend_args)
[I 2019-08-18 19:23:33,412] Finished trial#6 resulted in value: -0.7685797570056829. Current best value is -0.7685797570056829 with parameters: {'max_depth': 7, 'num_leaves': 18, 'max_bin': 333, 'lr': 0.12661128192332385, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-18 19:23:50,400] Finished trial#8 resulted in value: -0.7647860327258474. Current best value is -0.7685797570056829 with parameters: {'max_depth': 7, 'num_leaves': 18, 'max_bin': 333, 'lr': 0.12661128192332385, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-18 19:24:03,708] Finished trial#0 resulted in value: -0.697215485988634. Current best value is -0.7685797570056829 with parameters: {'max_depth': 7, 'num_leaves': 18, 'max_bin': 333, 'lr': 0.12661128192332385, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-18 19:24:33,549] Finished trial#7 resulted in value: -0.7344515481089556. Current best value is -0.7685797570056829 with parameters: {'max_depth': 7, 'num_l

[I 2019-08-18 19:31:14,200] Finished trial#62 resulted in value: -0.6923546810699588. Current best value is -0.7972056878306878 with parameters: {'max_depth': 5, 'num_leaves': 69, 'max_bin': 103, 'lr': 0.17587268572211712, 'classifier': 'LGBM', 'min_data_in_leaf': 70}.
[I 2019-08-18 19:32:18,760] Finished trial#60 resulted in value: -0.678605109739369. Current best value is -0.7972056878306878 with parameters: {'max_depth': 5, 'num_leaves': 69, 'max_bin': 103, 'lr': 0.17587268572211712, 'classifier': 'LGBM', 'min_data_in_leaf': 70}.
[I 2019-08-18 19:32:22,557] Finished trial#64 resulted in value: -0.6767710170487948. Current best value is -0.7972056878306878 with parameters: {'max_depth': 5, 'num_leaves': 69, 'max_bin': 103, 'lr': 0.17587268572211712, 'classifier': 'LGBM', 'min_data_in_leaf': 70}.
[I 2019-08-18 19:32:46,685] Finished trial#66 resulted in value: -0.7616475602586713. Current best value is -0.7972056878306878 with parameters: {'max_depth': 5, 'num_leaves': 69, 'max_bin': 

{'max_depth': 5, 'num_leaves': 69, 'max_bin': 103, 'lr': 0.17587268572211712, 'classifier': 'LGBM', 'min_data_in_leaf': 70}


In [37]:
auc = evaluate_classification(LGBMClassifier(num_leaves=69, min_data_in_leaf=70, max_depth=5,
                           learning_rate=0.18, max_bin=103),
                    X_train, X_val, X_test, y_train, y_val, y_test)
aucs.append(auc)
print(auc)

0.7305548581471252


In [38]:
print('AUC  | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(aucs), np.std(aucs)))

AUC  | MEAN: 0.7291 | STD: 0.0103


## BACE

In [39]:
dataset = load_csv('BACE')
aucs = []

Loading raw samples now.
shard_size: 8192
About to start loading CSV from data/bace.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 0 took 8.568 s
TIMING: dataset construction took 10.969 s
Loading dataset from disk.


In [44]:
# R1
X_train, X_val, X_test, y_train, y_val, y_test = scaffold_split(dataset)
X_train = transform_for_scaffold(X_train)
X_val = transform_for_scaffold(X_val)
X_test = transform_for_scaffold(X_test)
y_train, y_val, y_test = y_train.flatten(), y_val.flatten(), y_test.flatten()
study = optuna.create_study()
study.optimize(objective_classifier, n_trials=100, n_jobs=1)
print(study.best_params)

TIMING: dataset construction took 2.438 s
Loading dataset from disk.
TIMING: dataset construction took 1.119 s
Loading dataset from disk.
TIMING: dataset construction took 1.122 s
Loading dataset from disk.
There are 1210 molecules. It will take a little time.
There are 151 molecules. It will take a little time.
There are 152 molecules. It will take a little time.


[I 2019-08-19 19:33:03,817] Finished trial#0 resulted in value: -0.5119047619047619. Current best value is -0.5119047619047619 with parameters: {'n_layers': 1, 'classifier': 'MLP', 'lr': 1.620007446839598e-05, 'alpha': 0.01050190705183941, 'n_units_l0': 5.290230888118444}.
[I 2019-08-19 19:33:04,072] Finished trial#1 resulted in value: -0.47219260838663835. Current best value is -0.5119047619047619 with parameters: {'n_layers': 1, 'classifier': 'MLP', 'lr': 1.620007446839598e-05, 'alpha': 0.01050190705183941, 'n_units_l0': 5.290230888118444}.
[I 2019-08-19 19:33:04,816] Finished trial#2 resulted in value: -0.582089552238806. Current best value is -0.582089552238806 with parameters: {'n_layers': 1, 'classifier': 'MLP', 'lr': 0.0009966865813156288, 'alpha': 5.7744926773770855e-06, 'n_units_l0': 19.66849070960734}.
[I 2019-08-19 19:33:10,447] Finished trial#3 resulted in value: -0.5426439232409381. Current best value is -0.582089552238806 with parameters: {'n_layers': 1, 'classifier': 'ML

[I 2019-08-19 19:33:29,876] Finished trial#30 resulted in value: -0.6902985074626865. Current best value is -0.6983830845771144 with parameters: {'max_depth': 7, 'num_leaves': 83, 'max_bin': 253, 'lr': 0.04339243955525369, 'classifier': 'LGBM', 'min_data_in_leaf': 58}.
[I 2019-08-19 19:33:30,277] Finished trial#31 resulted in value: -0.6736851457000711. Current best value is -0.6983830845771144 with parameters: {'max_depth': 7, 'num_leaves': 83, 'max_bin': 253, 'lr': 0.04339243955525369, 'classifier': 'LGBM', 'min_data_in_leaf': 58}.
[I 2019-08-19 19:33:30,823] Finished trial#32 resulted in value: -0.6874555792466239. Current best value is -0.6983830845771144 with parameters: {'max_depth': 7, 'num_leaves': 83, 'max_bin': 253, 'lr': 0.04339243955525369, 'classifier': 'LGBM', 'min_data_in_leaf': 58}.
[I 2019-08-19 19:33:31,257] Finished trial#33 resulted in value: -0.6759950248756218. Current best value is -0.6983830845771144 with parameters: {'max_depth': 7, 'num_leaves': 83, 'max_bin':

[I 2019-08-19 19:34:10,202] Finished trial#60 resulted in value: -0.6753731343283582. Current best value is -0.7003375977256574 with parameters: {'max_depth': 7, 'num_leaves': 59, 'max_bin': 339, 'lr': 0.15158733847740805, 'classifier': 'LGBM', 'min_data_in_leaf': 73}.
[I 2019-08-19 19:34:13,293] Finished trial#61 resulted in value: -0.6608919687277897. Current best value is -0.7003375977256574 with parameters: {'max_depth': 7, 'num_leaves': 59, 'max_bin': 339, 'lr': 0.15158733847740805, 'classifier': 'LGBM', 'min_data_in_leaf': 73}.
[I 2019-08-19 19:34:16,103] Finished trial#62 resulted in value: -0.687455579246624. Current best value is -0.7003375977256574 with parameters: {'max_depth': 7, 'num_leaves': 59, 'max_bin': 339, 'lr': 0.15158733847740805, 'classifier': 'LGBM', 'min_data_in_leaf': 73}.
[I 2019-08-19 19:34:19,195] Finished trial#63 resulted in value: -0.6903873489694385. Current best value is -0.7003375977256574 with parameters: {'max_depth': 7, 'num_leaves': 59, 'max_bin': 

[I 2019-08-19 19:34:48,008] Finished trial#90 resulted in value: -0.6940298507462687. Current best value is -0.7146410803127221 with parameters: {'max_depth': 2, 'num_leaves': 71, 'max_bin': 286, 'lr': 0.15117751742537208, 'classifier': 'LGBM', 'min_data_in_leaf': 63}.
[I 2019-08-19 19:34:49,971] Finished trial#91 resulted in value: -0.6899431414356788. Current best value is -0.7146410803127221 with parameters: {'max_depth': 2, 'num_leaves': 71, 'max_bin': 286, 'lr': 0.15117751742537208, 'classifier': 'LGBM', 'min_data_in_leaf': 63}.
[I 2019-08-19 19:34:51,852] Finished trial#92 resulted in value: -0.6909203980099502. Current best value is -0.7146410803127221 with parameters: {'max_depth': 2, 'num_leaves': 71, 'max_bin': 286, 'lr': 0.15117751742537208, 'classifier': 'LGBM', 'min_data_in_leaf': 63}.
[I 2019-08-19 19:34:52,356] Finished trial#93 resulted in value: -0.7295664534470505. Current best value is -0.7295664534470505 with parameters: {'max_depth': 2, 'num_leaves': 71, 'max_bin':

{'max_depth': 2, 'num_leaves': 71, 'max_bin': 268, 'lr': 0.9264504476679438, 'classifier': 'LGBM', 'min_data_in_leaf': 53}


In [45]:
auc = evaluate_classification(LGBMClassifier(num_leaves=71, min_data_in_leaf=53, max_depth=2,
                           learning_rate=0.93, max_bin=268),
                    X_train, X_val, X_test, y_train, y_val, y_test)
aucs.append(auc)
print(auc)

0.7132246376811594


In [46]:
# R2
X_train, X_val, X_test, y_train, y_val, y_test = scaffold_split(dataset)
X_train = transform_for_scaffold(X_train)
X_val = transform_for_scaffold(X_val)
X_test = transform_for_scaffold(X_test)
y_train, y_val, y_test = y_train.flatten(), y_val.flatten(), y_test.flatten()
study = optuna.create_study()
study.optimize(objective_classifier, n_trials=100, n_jobs=1)
print(study.best_params)

TIMING: dataset construction took 2.521 s
Loading dataset from disk.
TIMING: dataset construction took 1.182 s
Loading dataset from disk.
TIMING: dataset construction took 1.190 s
Loading dataset from disk.
There are 1210 molecules. It will take a little time.
There are 151 molecules. It will take a little time.
There are 152 molecules. It will take a little time.


[I 2019-08-19 19:36:45,287] Finished trial#0 resulted in value: -0.6353056147832268. Current best value is -0.6353056147832268 with parameters: {'n_units_l2': 5.6251949404687105, 'lr': 0.0036299200369930723, 'n_units_l0': 122.39645231210204, 'n_layers': 3, 'n_units_l1': 48.50889906625687, 'classifier': 'MLP', 'alpha': 0.00012799311180379443}.
[I 2019-08-19 19:36:45,713] Finished trial#1 resulted in value: -0.665955934612651. Current best value is -0.665955934612651 with parameters: {'C': 3.8267800564996213, 'classifier': 'ridge'}.
[I 2019-08-19 19:36:46,362] Finished trial#2 resulted in value: -0.6162046908315565. Current best value is -0.665955934612651 with parameters: {'C': 3.8267800564996213, 'classifier': 'ridge'}.
[I 2019-08-19 19:36:46,847] Finished trial#3 resulted in value: -0.6106965174129353. Current best value is -0.665955934612651 with parameters: {'C': 3.8267800564996213, 'classifier': 'ridge'}.
[I 2019-08-19 19:36:47,413] Finished trial#4 resulted in value: -0.6174484719

[I 2019-08-19 19:37:30,523] Finished trial#39 resulted in value: -0.7034470504619759. Current best value is -0.7034470504619759 with parameters: {'max_depth': 2, 'num_leaves': 82, 'max_bin': 379, 'lr': 0.15112184913932025, 'classifier': 'LGBM', 'min_data_in_leaf': 96}.
[I 2019-08-19 19:37:30,982] Finished trial#40 resulted in value: -0.6923418621179815. Current best value is -0.7034470504619759 with parameters: {'max_depth': 2, 'num_leaves': 82, 'max_bin': 379, 'lr': 0.15112184913932025, 'classifier': 'LGBM', 'min_data_in_leaf': 96}.
[I 2019-08-19 19:37:31,729] Finished trial#41 resulted in value: -0.6810589907604834. Current best value is -0.7034470504619759 with parameters: {'max_depth': 2, 'num_leaves': 82, 'max_bin': 379, 'lr': 0.15112184913932025, 'classifier': 'LGBM', 'min_data_in_leaf': 96}.
[I 2019-08-19 19:37:32,249] Finished trial#42 resulted in value: -0.716684434968017. Current best value is -0.716684434968017 with parameters: {'max_depth': 0, 'num_leaves': 100, 'max_bin': 

[I 2019-08-19 19:37:54,062] Finished trial#69 resulted in value: -0.6756396588486141. Current best value is -0.716684434968017 with parameters: {'max_depth': 0, 'num_leaves': 100, 'max_bin': 39, 'lr': 0.2970471450384615, 'classifier': 'LGBM', 'min_data_in_leaf': 74}.
[I 2019-08-19 19:37:54,535] Finished trial#70 resulted in value: -0.634683724235963. Current best value is -0.716684434968017 with parameters: {'max_depth': 0, 'num_leaves': 100, 'max_bin': 39, 'lr': 0.2970471450384615, 'classifier': 'LGBM', 'min_data_in_leaf': 74}.
[I 2019-08-19 19:37:56,256] Finished trial#71 resulted in value: -0.693230277185501. Current best value is -0.716684434968017 with parameters: {'max_depth': 0, 'num_leaves': 100, 'max_bin': 39, 'lr': 0.2970471450384615, 'classifier': 'LGBM', 'min_data_in_leaf': 74}.
[I 2019-08-19 19:37:57,033] Finished trial#72 resulted in value: -0.6646233120113717. Current best value is -0.716684434968017 with parameters: {'max_depth': 0, 'num_leaves': 100, 'max_bin': 39, 'lr

[I 2019-08-19 19:38:25,612] Finished trial#99 resulted in value: -0.6825692963752665. Current best value is -0.7437810945273632 with parameters: {'max_depth': 2, 'num_leaves': 48, 'max_bin': 346, 'lr': 0.5006902838774282, 'classifier': 'LGBM', 'min_data_in_leaf': 89}.


{'max_depth': 2, 'num_leaves': 48, 'max_bin': 346, 'lr': 0.5006902838774282, 'classifier': 'LGBM', 'min_data_in_leaf': 89}


In [47]:
auc = evaluate_classification(LGBMClassifier(num_leaves=48, min_data_in_leaf=89, max_depth=2,
                           learning_rate=0.5, max_bin=346),
                    X_train, X_val, X_test, y_train, y_val, y_test)
aucs.append(auc)
print(auc)

0.7327898550724639


In [48]:
# R3
X_train, X_val, X_test, y_train, y_val, y_test = scaffold_split(dataset)
X_train = transform_for_scaffold(X_train)
X_val = transform_for_scaffold(X_val)
X_test = transform_for_scaffold(X_test)
y_train, y_val, y_test = y_train.flatten(), y_val.flatten(), y_test.flatten()
study = optuna.create_study()
study.optimize(objective_classifier, n_trials=100, n_jobs=1)
print(study.best_params)

TIMING: dataset construction took 2.538 s
Loading dataset from disk.
TIMING: dataset construction took 1.163 s
Loading dataset from disk.
TIMING: dataset construction took 1.182 s
Loading dataset from disk.
There are 1210 molecules. It will take a little time.
There are 151 molecules. It will take a little time.
There are 152 molecules. It will take a little time.


[I 2019-08-19 19:40:02,335] Finished trial#0 resulted in value: -0.5569474058280027. Current best value is -0.5569474058280027 with parameters: {'max_depth': 6, 'num_leaves': 95, 'max_bin': 29, 'lr': 3.3524054566204284e-05, 'classifier': 'LGBM', 'min_data_in_leaf': 45}.
[I 2019-08-19 19:40:03,197] Finished trial#1 resulted in value: -0.5904406538734898. Current best value is -0.5904406538734898 with parameters: {'lr': 0.001146754816516162, 'classifier': 'MLP', 'n_layers': 2, 'n_units_l1': 11.501489377561954, 'n_units_l0': 5.1124720819474, 'alpha': 0.0001474814192002621}.
[I 2019-08-19 19:40:04,023] Finished trial#2 resulted in value: -0.5708066808813077. Current best value is -0.5904406538734898 with parameters: {'lr': 0.001146754816516162, 'classifier': 'MLP', 'n_layers': 2, 'n_units_l1': 11.501489377561954, 'n_units_l0': 5.1124720819474, 'alpha': 0.0001474814192002621}.
[I 2019-08-19 19:40:04,641] Finished trial#3 resulted in value: -0.6883439943141436. Current best value is -0.68834

[I 2019-08-19 19:40:43,312] Finished trial#30 resulted in value: -0.6603589196872779. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin': 422, 'lr': 0.9097699444782519, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-19 19:40:43,759] Finished trial#31 resulted in value: -0.7043354655294954. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin': 422, 'lr': 0.9097699444782519, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-19 19:40:44,497] Finished trial#32 resulted in value: -0.7095771144278606. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin': 422, 'lr': 0.9097699444782519, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-19 19:40:45,203] Finished trial#33 resulted in value: -0.6830135039090264. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin':

[I 2019-08-19 19:41:24,420] Finished trial#60 resulted in value: -0.662224591329069. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin': 422, 'lr': 0.9097699444782519, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-19 19:41:25,108] Finished trial#61 resulted in value: -0.6703091684434969. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin': 422, 'lr': 0.9097699444782519, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-19 19:41:25,771] Finished trial#62 resulted in value: -0.6701314854299928. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin': 422, 'lr': 0.9097699444782519, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-19 19:41:26,627] Finished trial#63 resulted in value: -0.6812366737739872. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin': 

[I 2019-08-19 19:42:00,931] Finished trial#90 resulted in value: -0.6695984363894811. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin': 422, 'lr': 0.9097699444782519, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-19 19:42:01,866] Finished trial#91 resulted in value: -0.7054015636105188. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin': 422, 'lr': 0.9097699444782519, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-19 19:42:03,894] Finished trial#92 resulted in value: -0.6802594171997157. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin': 422, 'lr': 0.9097699444782519, 'classifier': 'LGBM', 'min_data_in_leaf': 100}.
[I 2019-08-19 19:42:04,920] Finished trial#93 resulted in value: -0.7113539445628997. Current best value is -0.7210376687988629 with parameters: {'max_depth': 8, 'num_leaves': 47, 'max_bin':

{'max_depth': 8, 'num_leaves': 47, 'max_bin': 422, 'lr': 0.9097699444782519, 'classifier': 'LGBM', 'min_data_in_leaf': 100}


In [49]:
auc = evaluate_classification(LGBMClassifier(num_leaves=47, min_data_in_leaf=100, max_depth=8,
                           learning_rate=0.9, max_bin=422),
                    X_train, X_val, X_test, y_train, y_val, y_test)
aucs.append(auc)
print(auc)

0.7248188405797101


In [50]:
print('AUC  | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(aucs), np.std(aucs)))

AUC  | MEAN: 0.7008 | STD: 0.0287


## BBBP

In [51]:
dataset = load_csv('BBBP')
aucs = []

Loading raw samples now.
shard_size: 8192
About to start loading CSV from data/bbbp.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
TIMING: featurizing shard 0 took 9.117 s
TIMING: dataset construction took 11.216 s
Loading dataset from disk.


In [52]:
# R1
X_train, X_val, X_test, y_train, y_val, y_test = scaffold_split(dataset)
X_train = transform_for_scaffold(X_train)
X_val = transform_for_scaffold(X_val)
X_test = transform_for_scaffold(X_test)
y_train, y_val, y_test = y_train.flatten(), y_val.flatten(), y_test.flatten()
study = optuna.create_study()
study.optimize(objective_classifier, n_trials=100, n_jobs=1)
print(study.best_params)

TIMING: dataset construction took 2.465 s
Loading dataset from disk.
TIMING: dataset construction took 1.149 s
Loading dataset from disk.
TIMING: dataset construction took 1.137 s
Loading dataset from disk.
There are 1631 molecules. It will take a little time.
SMILES is too long (380)
SMILES is too long (332)
There are 204 molecules. It will take a little time.
SMILES is too long (256)
SMILES is too long (239)
SMILES is too long (258)
There are 204 molecules. It will take a little time.


[I 2019-08-19 19:49:31,139] Finished trial#0 resulted in value: -0.9665178571428572. Current best value is -0.9665178571428572 with parameters: {'n_layers': 1, 'classifier': 'MLP', 'lr': 0.0008550603348564699, 'alpha': 9.730116049171839e-06, 'n_units_l0': 65.52805169326021}.
[I 2019-08-19 19:49:39,247] Finished trial#1 resulted in value: -0.984083850931677. Current best value is -0.984083850931677 with parameters: {'lr': 0.0003674812688028495, 'classifier': 'MLP', 'n_layers': 2, 'n_units_l1': 31.57866388038685, 'n_units_l0': 46.603788913269234, 'alpha': 9.004648557542518e-05}.
[I 2019-08-19 19:49:39,855] Finished trial#2 resulted in value: -0.9605007763975156. Current best value is -0.984083850931677 with parameters: {'lr': 0.0003674812688028495, 'classifier': 'MLP', 'n_layers': 2, 'n_units_l1': 31.57866388038685, 'n_units_l0': 46.603788913269234, 'alpha': 9.004648557542518e-05}.
[I 2019-08-19 19:49:40,114] Finished trial#3 resulted in value: -0.9833074534161491. Current best value is 

[I 2019-08-19 19:50:04,296] Finished trial#29 resulted in value: -0.6217003105590062. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin': 343, 'lr': 0.13060745356784811, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
[I 2019-08-19 19:50:05,269] Finished trial#30 resulted in value: -0.9908773291925466. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin': 343, 'lr': 0.13060745356784811, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
[I 2019-08-19 19:50:05,865] Finished trial#31 resulted in value: -0.9925271739130435. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin': 343, 'lr': 0.13060745356784811, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
[I 2019-08-19 19:50:06,431] Finished trial#32 resulted in value: -0.9899068322981366. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin':

[I 2019-08-19 19:50:36,171] Finished trial#59 resulted in value: -0.9932065217391305. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin': 343, 'lr': 0.13060745356784811, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
[I 2019-08-19 19:50:36,902] Finished trial#60 resulted in value: -0.9922360248447204. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin': 343, 'lr': 0.13060745356784811, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
[I 2019-08-19 19:50:37,516] Finished trial#61 resulted in value: -0.9902950310559007. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin': 343, 'lr': 0.13060745356784811, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
[I 2019-08-19 19:50:38,166] Finished trial#62 resulted in value: -0.9908773291925467. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin':

[I 2019-08-19 19:51:00,696] Finished trial#89 resulted in value: -0.9902950310559007. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin': 343, 'lr': 0.13060745356784811, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
[I 2019-08-19 19:51:01,350] Finished trial#90 resulted in value: -0.9921389751552796. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin': 343, 'lr': 0.13060745356784811, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
[I 2019-08-19 19:51:01,972] Finished trial#91 resulted in value: -0.9908773291925467. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin': 343, 'lr': 0.13060745356784811, 'classifier': 'LGBM', 'min_data_in_leaf': 94}.
[I 2019-08-19 19:51:02,508] Finished trial#92 resulted in value: -0.9879658385093169. Current best value is -0.9942740683229814 with parameters: {'max_depth': 7, 'num_leaves': 99, 'max_bin':

{'max_depth': 7, 'num_leaves': 99, 'max_bin': 343, 'lr': 0.13060745356784811, 'classifier': 'LGBM', 'min_data_in_leaf': 94}


In [53]:
auc = evaluate_classification(LGBMClassifier(num_leaves=99, min_data_in_leaf=94, max_depth=7,
                           learning_rate=0.13, max_bin=343),
                    X_train, X_val, X_test, y_train, y_val, y_test)
aucs.append(auc)
print(auc)

0.7054629540418152


In [54]:
# R2
X_train, X_val, X_test, y_train, y_val, y_test = scaffold_split(dataset)
X_train = transform_for_scaffold(X_train)
X_val = transform_for_scaffold(X_val)
X_test = transform_for_scaffold(X_test)
y_train, y_val, y_test = y_train.flatten(), y_val.flatten(), y_test.flatten()
study = optuna.create_study()
study.optimize(objective_classifier, n_trials=100, n_jobs=1)
print(study.best_params)

TIMING: dataset construction took 2.368 s
Loading dataset from disk.
TIMING: dataset construction took 1.182 s
Loading dataset from disk.
TIMING: dataset construction took 1.157 s
Loading dataset from disk.
There are 1631 molecules. It will take a little time.
SMILES is too long (380)
SMILES is too long (332)
There are 204 molecules. It will take a little time.
SMILES is too long (256)
SMILES is too long (239)
SMILES is too long (258)
There are 204 molecules. It will take a little time.


[I 2019-08-19 19:53:35,195] Finished trial#0 resulted in value: -0.9817546583850932. Current best value is -0.9817546583850932 with parameters: {'C': 1.800362262893563, 'classifier': 'ridge'}.
[I 2019-08-19 19:53:38,591] Finished trial#1 resulted in value: -0.9781638198757764. Current best value is -0.9817546583850932 with parameters: {'C': 1.800362262893563, 'classifier': 'ridge'}.
[I 2019-08-19 19:53:39,218] Finished trial#2 resulted in value: -0.9927212732919255. Current best value is -0.9927212732919255 with parameters: {'max_depth': -1, 'num_leaves': 83, 'max_bin': 61, 'lr': 0.29767086440348356, 'classifier': 'LGBM', 'min_data_in_leaf': 28}.
[I 2019-08-19 19:53:39,976] Finished trial#3 resulted in value: -0.90625. Current best value is -0.9927212732919255 with parameters: {'max_depth': -1, 'num_leaves': 83, 'max_bin': 61, 'lr': 0.29767086440348356, 'classifier': 'LGBM', 'min_data_in_leaf': 28}.
[I 2019-08-19 19:53:40,727] Finished trial#4 resulted in value: -0.9546292701863355. Cu

[I 2019-08-19 19:54:01,929] Finished trial#30 resulted in value: -0.9874805900621118. Current best value is -0.9927212732919255 with parameters: {'max_depth': -1, 'num_leaves': 83, 'max_bin': 61, 'lr': 0.29767086440348356, 'classifier': 'LGBM', 'min_data_in_leaf': 28}.
[I 2019-08-19 19:54:02,415] Finished trial#31 resulted in value: -0.9897127329192547. Current best value is -0.9927212732919255 with parameters: {'max_depth': -1, 'num_leaves': 83, 'max_bin': 61, 'lr': 0.29767086440348356, 'classifier': 'LGBM', 'min_data_in_leaf': 28}.
[I 2019-08-19 19:54:02,835] Finished trial#32 resulted in value: -0.9895186335403727. Current best value is -0.9927212732919255 with parameters: {'max_depth': -1, 'num_leaves': 83, 'max_bin': 61, 'lr': 0.29767086440348356, 'classifier': 'LGBM', 'min_data_in_leaf': 28}.
[I 2019-08-19 19:54:03,505] Finished trial#33 resulted in value: -0.9758346273291926. Current best value is -0.9927212732919255 with parameters: {'max_depth': -1, 'num_leaves': 83, 'max_bin'

[I 2019-08-19 19:54:31,423] Finished trial#60 resulted in value: -0.9905861801242235. Current best value is -0.9943711180124224 with parameters: {'max_depth': 6, 'num_leaves': 56, 'max_bin': 262, 'lr': 0.11063395000603944, 'classifier': 'LGBM', 'min_data_in_leaf': 91}.
[I 2019-08-19 19:54:32,029] Finished trial#61 resulted in value: -0.9922360248447205. Current best value is -0.9943711180124224 with parameters: {'max_depth': 6, 'num_leaves': 56, 'max_bin': 262, 'lr': 0.11063395000603944, 'classifier': 'LGBM', 'min_data_in_leaf': 91}.
[I 2019-08-19 19:54:32,628] Finished trial#62 resulted in value: -0.9880628881987576. Current best value is -0.9943711180124224 with parameters: {'max_depth': 6, 'num_leaves': 56, 'max_bin': 262, 'lr': 0.11063395000603944, 'classifier': 'LGBM', 'min_data_in_leaf': 91}.
[I 2019-08-19 19:54:33,646] Finished trial#63 resulted in value: -0.9918478260869565. Current best value is -0.9943711180124224 with parameters: {'max_depth': 6, 'num_leaves': 56, 'max_bin':

[I 2019-08-19 19:54:55,204] Finished trial#90 resulted in value: -0.9915566770186336. Current best value is -0.9943711180124224 with parameters: {'max_depth': 6, 'num_leaves': 56, 'max_bin': 262, 'lr': 0.11063395000603944, 'classifier': 'LGBM', 'min_data_in_leaf': 91}.
[I 2019-08-19 19:54:55,616] Finished trial#91 resulted in value: -0.985442546583851. Current best value is -0.9943711180124224 with parameters: {'max_depth': 6, 'num_leaves': 56, 'max_bin': 262, 'lr': 0.11063395000603944, 'classifier': 'LGBM', 'min_data_in_leaf': 91}.
[I 2019-08-19 19:54:56,704] Finished trial#92 resulted in value: -0.9915566770186336. Current best value is -0.9943711180124224 with parameters: {'max_depth': 6, 'num_leaves': 56, 'max_bin': 262, 'lr': 0.11063395000603944, 'classifier': 'LGBM', 'min_data_in_leaf': 91}.
[I 2019-08-19 19:54:57,219] Finished trial#93 resulted in value: -0.9908773291925466. Current best value is -0.9943711180124224 with parameters: {'max_depth': 6, 'num_leaves': 56, 'max_bin': 

{'max_depth': 6, 'num_leaves': 56, 'max_bin': 262, 'lr': 0.11063395000603944, 'classifier': 'LGBM', 'min_data_in_leaf': 91}


In [55]:
auc = evaluate_classification(LGBMClassifier(num_leaves=56, min_data_in_leaf=91, max_depth=6,
                           learning_rate=0.11, max_bin=262),
                    X_train, X_val, X_test, y_train, y_val, y_test)
aucs.append(auc)
print(auc)

0.7027651989594373


In [56]:
# R3
X_train, X_val, X_test, y_train, y_val, y_test = scaffold_split(dataset)
X_train = transform_for_scaffold(X_train)
X_val = transform_for_scaffold(X_val)
X_test = transform_for_scaffold(X_test)
y_train, y_val, y_test = y_train.flatten(), y_val.flatten(), y_test.flatten()
study = optuna.create_study()
study.optimize(objective_classifier, n_trials=100, n_jobs=1)
print(study.best_params)

TIMING: dataset construction took 2.531 s
Loading dataset from disk.
TIMING: dataset construction took 1.243 s
Loading dataset from disk.
TIMING: dataset construction took 1.238 s
Loading dataset from disk.
There are 1631 molecules. It will take a little time.
SMILES is too long (380)
SMILES is too long (332)
There are 204 molecules. It will take a little time.
SMILES is too long (256)
SMILES is too long (239)
SMILES is too long (258)
There are 204 molecules. It will take a little time.


[I 2019-08-19 19:56:21,595] Finished trial#0 resulted in value: -0.49621506211180133. Current best value is -0.49621506211180133 with parameters: {'lr': 2.3612377600561096e-05, 'classifier': 'MLP', 'n_layers': 2, 'n_units_l1': 12.514520390859674, 'n_units_l0': 4.998237308511305, 'alpha': 0.0006486760264130237}.
[I 2019-08-19 19:56:25,319] Finished trial#1 resulted in value: -0.9790372670807455. Current best value is -0.9790372670807455 with parameters: {'n_layers': 1, 'classifier': 'MLP', 'lr': 2.2718717845796408e-05, 'alpha': 7.58250977006782e-05, 'n_units_l0': 196.31922069411448}.
[I 2019-08-19 19:56:26,870] Finished trial#2 resulted in value: -0.9589479813664596. Current best value is -0.9790372670807455 with parameters: {'n_layers': 1, 'classifier': 'MLP', 'lr': 2.2718717845796408e-05, 'alpha': 7.58250977006782e-05, 'n_units_l0': 196.31922069411448}.
[I 2019-08-19 19:56:27,139] Finished trial#3 resulted in value: -0.9831133540372671. Current best value is -0.9831133540372671 with p

[I 2019-08-19 19:56:54,768] Finished trial#29 resulted in value: -0.9924301242236024. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306, 'lr': 0.32493520479150906, 'classifier': 'LGBM', 'min_data_in_leaf': 82}.
[I 2019-08-19 19:56:55,597] Finished trial#30 resulted in value: -0.9878687888198758. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306, 'lr': 0.32493520479150906, 'classifier': 'LGBM', 'min_data_in_leaf': 82}.
[I 2019-08-19 19:56:56,000] Finished trial#31 resulted in value: -0.9880628881987576. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306, 'lr': 0.32493520479150906, 'classifier': 'LGBM', 'min_data_in_leaf': 82}.
[I 2019-08-19 19:56:56,560] Finished trial#32 resulted in value: -0.9916537267080745. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306

[I 2019-08-19 19:57:26,092] Finished trial#58 resulted in value: -0.9561335403726708. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306, 'lr': 0.32493520479150906, 'classifier': 'LGBM', 'min_data_in_leaf': 82}.
[I 2019-08-19 19:57:26,519] Finished trial#59 resulted in value: -0.9874805900621118. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306, 'lr': 0.32493520479150906, 'classifier': 'LGBM', 'min_data_in_leaf': 82}.
[I 2019-08-19 19:57:27,053] Finished trial#60 resulted in value: -0.9886451863354038. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306, 'lr': 0.32493520479150906, 'classifier': 'LGBM', 'min_data_in_leaf': 82}.
[I 2019-08-19 19:57:27,554] Finished trial#61 resulted in value: -0.9903920807453417. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306

[I 2019-08-19 19:57:45,268] Finished trial#88 resulted in value: -0.9879658385093167. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306, 'lr': 0.32493520479150906, 'classifier': 'LGBM', 'min_data_in_leaf': 82}.
[I 2019-08-19 19:57:45,799] Finished trial#89 resulted in value: -0.9906832298136645. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306, 'lr': 0.32493520479150906, 'classifier': 'LGBM', 'min_data_in_leaf': 82}.
[I 2019-08-19 19:57:46,324] Finished trial#90 resulted in value: -0.9915566770186335. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306, 'lr': 0.32493520479150906, 'classifier': 'LGBM', 'min_data_in_leaf': 82}.
[I 2019-08-19 19:57:46,877] Finished trial#91 resulted in value: -0.9909743788819876. Current best value is -0.9939829192546584 with parameters: {'max_depth': 7, 'num_leaves': 7, 'max_bin': 306

{'max_depth': 5, 'num_leaves': 48, 'max_bin': 258, 'lr': 0.7175296638285248, 'classifier': 'LGBM', 'min_data_in_leaf': 78}


In [57]:
auc = evaluate_classification(LGBMClassifier(num_leaves=48, min_data_in_leaf=78, max_depth=5,
                           learning_rate=0.72, max_bin=258),
                    X_train, X_val, X_test, y_train, y_val, y_test)
aucs.append(auc)
print(auc)

0.7047885152712207


In [58]:
print('AUC  | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(aucs), np.std(aucs)))

AUC  | MEAN: 0.7043 | STD: 0.0011


## Tox21

In [35]:
df = pd.read_csv('data/tox21.csv')
print(df.shape)
KEYS = df.columns[:-2]
df.head()

(7831, 14)


Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [11]:
# R1
df_train, df_val, df_test = df_split(df)
x_split = [split(sm) for sm in df_train['smiles'].values]
xid, xseg = get_array(x_split)
X_train = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_val['smiles'].values]
xid, xseg = get_array(x_split)
X_val = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_test['smiles'].values]
xid, xseg = get_array(x_split)
X_test = trfm.encode(torch.t(xid))[:,:256]
study = optuna.create_study()
study.optimize(objective_classifier_multi, n_trials=100, n_jobs=6)
print(study.best_params)



SMILES is too long (325)
SMILES is too long (255)
SMILES is too long (273)
SMILES is too long (251)
SMILES is too long (251)
SMILES is too long (284)
SMILES is too long (271)
SMILES is too long (264)
SMILES is too long (253)
SMILES is too long (219)
SMILES is too long (275)
SMILES is too long (226)
SMILES is too long (271)
SMILES is too long (340)
SMILES is too long (221)
SMILES is too long (230)
SMILES is too long (225)
SMILES is too long (251)
SMILES is too long (306)
SMILES is too long (277)
SMILES is too long (225)
SMILES is too long (231)
SMILES is too long (235)
There are 6264 molecules. It will take a little time.
SMILES is too long (233)
SMILES is too long (248)
SMILES is too long (264)
There are 783 molecules. It will take a little time.
SMILES is too long (227)
SMILES is too long (243)
SMILES is too long (263)
SMILES is too long (311)
There are 784 molecules. It will take a little time.


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[I 2019-08-19 21:24:59,453] Finished trial#3 resulted in value: -0.6819011070575639. Current best value is -0.6819011070575639 with parameters: {'lr': 1.0476338704385186e-05, 'max_depth': 3, 'num_leaves': 77, 'max_bin': 468, 'min_data_in_leaf': 33, 'classifier': 'LGBM'}.
[I 2019-08-19 21:25:29,664] Finished trial#5 resulted in value: -0.7861894333870304. Current best value is -0.7861894333870304 with parameters: {'C': 3.170819617273813, 'classifier': 'ridge'}.
[I 2019-08-19 21:26:06,525] Finished trial#2 resulted in value: -0.7449635644455018. Current best value is -0.7861894333870304 with parameters: {'C': 3.170819617273813, 'classifier': 'ridge'}.
[I 2019-08-19 21:26:16,562] Finished trial#1 resulted in value: -0.78179986350

[I 2019-08-19 21:32:31,656] Finished trial#7 resulted in value: -0.752561032410117. Current best value is -0.7861894333870304 with parameters: {'C': 3.170819617273813, 'classifier': 'ridge'}.
[I 2019-08-19 21:33:16,668] Finished trial#10 resulted in value: -0.6521476868221222. Current best value is -0.7861894333870304 with parameters: {'C': 3.170819617273813, 'classifier': 'ridge'}.
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[W 2019-08-19 21:38:00,348] Setting status of trial#17 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension is 6264 but corresponding boolean dimension is 140270081380192',)
Traceback (most recent call last):
  File "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/optuna/study.py", line 469, in _run_

[I 2019-08-19 21:41:35,496] Finished trial#15 resulted in value: -0.7511683256633583. Current best value is -0.7861894333870304 with parameters: {'C': 3.170819617273813, 'classifier': 'ridge'}.
[I 2019-08-19 21:42:47,323] Finished trial#24 resulted in value: -0.7870489587704368. Current best value is -0.7870489587704368 with parameters: {'C': 0.3677908505061795, 'classifier': 'ridge'}.
[I 2019-08-19 21:43:31,113] Finished trial#21 resulted in value: -0.7758866570274899. Current best value is -0.7870489587704368 with parameters: {'C': 0.3677908505061795, 'classifier': 'ridge'}.
[I 2019-08-19 21:44:06,245] Finished trial#19 resulted in value: -0.7614472316003443. Current best value is -0.7870489587704368 with parameters: {'C': 0.3677908505061795, 'classifier': 'ridge'}.
[I 2019-08-19 21:44:49,192] Finished trial#20 resulted in value: -0.7494019228311593. Current best value is -0.7870489587704368 with parameters: {'C': 0.3677908505061795, 'classifier': 'ridge'}.
[I 2019-08-19 21:45:16,479

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[I 2019-08-19 21:48:43,476] Finished trial#37 resulted in value: -0.7853342451475744. Current best value is -0.7883281794594894 with parameters: {'C': 0.8476648979097382, 'classifier': 'ridge'}.
  **self._backend_args)
[I 2019-08-19 21:48:47,653] Finished trial#36 resulted in value: -0.7877516571550472. Current best value is -0.7883281794594894 with parameters: {'C': 0.8476648979097382, 'classifier': 'ridge'}.
[I 2019-08-19 21:48:54,963] Finished trial#27 resulted in value: -0.7496318648321768. Current best value is -0.7883281794594894 with parameters: {'C': 0.8476648979097382, 'classifier': 'ridge'}.
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[I 2019-08-19 21:49:42,273] Finished trial#40 resulted in value: -0.7883785241479974. Current best value is -0.7883785241479974 with parameters: {'C': 1.550055939615112, 'classifier': 'ridge'}.
[I 2019-08-19 21:49:44,596] Finished trial#41 resulte

[I 2019-08-19 21:52:02,483] Finished trial#50 resulted in value: -0.6119684026508208. Current best value is -0.794587007804795 with parameters: {'lr': 0.1951107848937753, 'max_depth': 0, 'num_leaves': 61, 'max_bin': 34, 'min_data_in_leaf': 25, 'classifier': 'LGBM'}.
[I 2019-08-19 21:52:35,371] Finished trial#51 resulted in value: -0.6220787277157337. Current best value is -0.794587007804795 with parameters: {'lr': 0.1951107848937753, 'max_depth': 0, 'num_leaves': 61, 'max_bin': 34, 'min_data_in_leaf': 25, 'classifier': 'LGBM'}.
[I 2019-08-19 21:53:31,453] Finished trial#48 resulted in value: -0.7490705636467375. Current best value is -0.794587007804795 with parameters: {'lr': 0.1951107848937753, 'max_depth': 0, 'num_leaves': 61, 'max_bin': 34, 'min_data_in_leaf': 25, 'classifier': 'LGBM'}.
[I 2019-08-19 21:55:18,832] Finished trial#55 resulted in value: -0.7982981613567085. Current best value is -0.7982981613567085 with parameters: {'lr': 0.0933301944054477, 'max_depth': 5, 'num_leaves

[I 2019-08-19 22:07:01,099] Finished trial#79 resulted in value: -0.7650020446003681. Current best value is -0.8028080150107136 with parameters: {'lr': 0.12516499497072375, 'max_depth': 4, 'num_leaves': 21, 'max_bin': 426, 'min_data_in_leaf': 83, 'classifier': 'LGBM'}.
[I 2019-08-19 22:07:47,636] Finished trial#80 resulted in value: -0.7614950228041995. Current best value is -0.8028080150107136 with parameters: {'lr': 0.12516499497072375, 'max_depth': 4, 'num_leaves': 21, 'max_bin': 426, 'min_data_in_leaf': 83, 'classifier': 'LGBM'}.
[I 2019-08-19 22:08:09,220] Finished trial#76 resulted in value: -0.7998112707181578. Current best value is -0.8028080150107136 with parameters: {'lr': 0.12516499497072375, 'max_depth': 4, 'num_leaves': 21, 'max_bin': 426, 'min_data_in_leaf': 83, 'classifier': 'LGBM'}.
[I 2019-08-19 22:09:00,148] Finished trial#81 resulted in value: -0.7522030663968874. Current best value is -0.8028080150107136 with parameters: {'lr': 0.12516499497072375, 'max_depth': 4, '

[I 2019-08-19 22:17:40,219] Finished trial#86 resulted in value: -0.7965828513095367. Current best value is -0.8028080150107136 with parameters: {'lr': 0.12516499497072375, 'max_depth': 4, 'num_leaves': 21, 'max_bin': 426, 'min_data_in_leaf': 83, 'classifier': 'LGBM'}.
[I 2019-08-19 22:18:30,086] Finished trial#90 resulted in value: -0.5464045016754754. Current best value is -0.8028080150107136 with parameters: {'lr': 0.12516499497072375, 'max_depth': 4, 'num_leaves': 21, 'max_bin': 426, 'min_data_in_leaf': 83, 'classifier': 'LGBM'}.
[I 2019-08-19 22:19:28,830] Finished trial#89 resulted in value: -0.7372093961205556. Current best value is -0.8028080150107136 with parameters: {'lr': 0.12516499497072375, 'max_depth': 4, 'num_leaves': 21, 'max_bin': 426, 'min_data_in_leaf': 83, 'classifier': 'LGBM'}.
[W 2019-08-19 22:21:34,149] Setting status of trial#95 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension i

{'lr': 0.12516499497072375, 'max_depth': 4, 'num_leaves': 21, 'max_bin': 426, 'min_data_in_leaf': 83, 'classifier': 'LGBM'}


In [12]:
study.trials_dataframe()

Unnamed: 0_level_0,number,state,value,datetime_start,datetime_complete,params,params,params,params,params,params,params,params,params,params,params,params,system_attrs,system_attrs
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,C,alpha,classifier,lr,max_bin,max_depth,min_data_in_leaf,n_layers,n_units_l0,n_units_l1,n_units_l2,num_leaves,_number,fail_reason
0,0,TrialState.COMPLETE,-0.727007,2019-08-19 21:23:27.633123,2019-08-19 21:31:48.446544,,0.002824,MLP,0.000172,,,,3.0,120.899893,106.969472,64.142597,,0,
1,1,TrialState.COMPLETE,-0.781800,2019-08-19 21:23:27.649527,2019-08-19 21:26:16.559119,6.701652,,ridge,,,,,,,,,,1,
2,2,TrialState.COMPLETE,-0.744964,2019-08-19 21:23:27.666230,2019-08-19 21:26:06.522768,,0.001117,MLP,0.006748,,,,3.0,59.490594,9.298771,5.211710,,2,
3,3,TrialState.COMPLETE,-0.681901,2019-08-19 21:23:27.678596,2019-08-19 21:24:59.451012,,,LGBM,0.000010,468.0,3.0,33.0,,,,,77.0,3,
4,4,TrialState.COMPLETE,-0.749359,2019-08-19 21:23:27.690583,2019-08-19 21:29:00.635838,29203.950497,,ridge,,,,,,,,,,4,
5,5,TrialState.COMPLETE,-0.786189,2019-08-19 21:23:27.694607,2019-08-19 21:25:29.653438,3.170820,,ridge,,,,,,,,,,5,
6,6,TrialState.COMPLETE,-0.715308,2019-08-19 21:24:59.491569,2019-08-19 21:31:52.839321,,0.000726,MLP,0.000040,,,,2.0,292.258773,75.303848,,,6,
7,7,TrialState.COMPLETE,-0.752561,2019-08-19 21:25:29.694171,2019-08-19 21:32:31.610560,,0.000001,MLP,0.000094,,,,3.0,312.799300,17.816207,4.462947,,7,
8,8,TrialState.COMPLETE,-0.782649,2019-08-19 21:26:06.571808,2019-08-19 21:29:37.495249,5.979129,,ridge,,,,,,,,,,8,
9,9,TrialState.COMPLETE,-0.782310,2019-08-19 21:26:16.612601,2019-08-19 21:29:14.071299,6.224763,,ridge,,,,,,,,,,9,


In [19]:
aucs = []
auc = evaluate_classification_multi(LGBMClassifier(num_leaves=21, min_data_in_leaf=83, max_depth=4,
                           learning_rate=0.13, max_bin=426),
                    X_train, X_val, X_test)
aucs.append(auc)
print(auc)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


0.8128625514235361


In [20]:
# R2
df_train, df_val, df_test = df_split(df)
x_split = [split(sm) for sm in df_train['smiles'].values]
xid, xseg = get_array(x_split)
X_train = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_val['smiles'].values]
xid, xseg = get_array(x_split)
X_val = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_test['smiles'].values]
xid, xseg = get_array(x_split)
X_test = trfm.encode(torch.t(xid))[:,:256]
study = optuna.create_study()
study.optimize(objective_classifier_multi, n_trials=100)
print(study.best_params)



SMILES is too long (271)
SMILES is too long (231)
SMILES is too long (221)
SMILES is too long (235)
SMILES is too long (225)
SMILES is too long (284)
SMILES is too long (233)
SMILES is too long (277)
SMILES is too long (275)
SMILES is too long (264)
SMILES is too long (226)
SMILES is too long (251)
SMILES is too long (230)
SMILES is too long (251)
SMILES is too long (225)
SMILES is too long (251)
SMILES is too long (243)
SMILES is too long (273)
SMILES is too long (263)
SMILES is too long (340)
SMILES is too long (253)
SMILES is too long (255)
SMILES is too long (306)
SMILES is too long (271)
There are 6264 molecules. It will take a little time.
SMILES is too long (219)
SMILES is too long (248)
There are 783 molecules. It will take a little time.
SMILES is too long (227)
SMILES is too long (264)
SMILES is too long (325)
SMILES is too long (311)
There are 784 molecules. It will take a little time.


[I 2019-08-20 20:35:34,944] Finished trial#0 resulted in value: -0.7851436216704112. Current best value is -0.7851436216704112 with parameters: {'C': 1.932400809582752, 'classifier': 'ridge'}.
[I 2019-08-20 20:52:34,400] Finished trial#1 resulted in value: -0.7843442129235143. Current best value is -0.7851436216704112 with parameters: {'C': 1.932400809582752, 'classifier': 'ridge'}.


KeyboardInterrupt: 

In [21]:
study = optuna.create_study()
study.optimize(objective_classifier_multi, n_trials=100, n_jobs=8)
print(study.best_params)

  **self._backend_args)
  **self._backend_args)
[I 2019-08-20 21:04:37,434] Finished trial#6 resulted in value: -0.7838451549157198. Current best value is -0.7838451549157198 with parameters: {'C': 0.2925610672009905, 'classifier': 'ridge'}.
[W 2019-08-20 21:04:58,347] Setting status of trial#5 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding boolean dimension is 140270075145280',)
Traceback (most recent call last):
  File "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/optuna/study.py", line 469, in _run_trial
    result = func(trial)
  File "<ipython-input-10-be611a7cb47b>", line 85, in objective_classifier_multi
    y_score = _clf.predict_proba(X_val[df_val[key].notna()])
IndexError: boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding boolean dimension is 140270075145280
  **self._backend_args)
  **self._backend_ar

[I 2019-08-20 21:11:57,069] Finished trial#2 resulted in value: -0.7691662690976385. Current best value is -0.8007844349412699 with parameters: {'lr': 0.008743113860993584, 'max_depth': 5, 'num_leaves': 25, 'max_bin': 261, 'min_data_in_leaf': 97, 'classifier': 'LGBM'}.
  **self._backend_args)
[I 2019-08-20 21:12:04,713] Finished trial#4 resulted in value: -0.7067388663666744. Current best value is -0.8007844349412699 with parameters: {'lr': 0.008743113860993584, 'max_depth': 5, 'num_leaves': 25, 'max_bin': 261, 'min_data_in_leaf': 97, 'classifier': 'LGBM'}.
  **self._backend_args)
  **self._backend_args)
[I 2019-08-20 21:12:47,328] Finished trial#8 resulted in value: -0.7606065634722823. Current best value is -0.8007844349412699 with parameters: {'lr': 0.008743113860993584, 'max_depth': 5, 'num_leaves': 25, 'max_bin': 261, 'min_data_in_leaf': 97, 'classifier': 'LGBM'}.
  **self._backend_args)
  **self._backend_args)
[I 2019-08-20 21:14:21,047] Finished trial#0 resulted in value: -0.705

[W 2019-08-20 21:21:04,617] Setting status of trial#18 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension is 6264 but corresponding boolean dimension is 140269605405760',)
Traceback (most recent call last):
  File "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/optuna/study.py", line 469, in _run_trial
    result = func(trial)
  File "<ipython-input-10-be611a7cb47b>", line 84, in objective_classifier_multi
    _clf.fit(X_train[df_train[key].notna()], df_train[key].dropna().values)
IndexError: boolean index did not match indexed array along dimension 0; dimension is 6264 but corresponding boolean dimension is 140269605405760
[I 2019-08-20 21:21:23,639] Finished trial#15 resulted in value: -0.7283026313087522. Current best value is -0.8007844349412699 with parameters: {'lr': 0.008743113860993584, 'max_depth': 5, 'num_leaves': 25, 'max_bin': 261, 'min_data_in_leaf': 97, 'classifier': 'LGBM'

[I 2019-08-20 21:27:16,274] Finished trial#31 resulted in value: -0.6845282538232267. Current best value is -0.8007844349412699 with parameters: {'lr': 0.008743113860993584, 'max_depth': 5, 'num_leaves': 25, 'max_bin': 261, 'min_data_in_leaf': 97, 'classifier': 'LGBM'}.
[I 2019-08-20 21:29:13,316] Finished trial#26 resulted in value: -0.7473726916740024. Current best value is -0.8007844349412699 with parameters: {'lr': 0.008743113860993584, 'max_depth': 5, 'num_leaves': 25, 'max_bin': 261, 'min_data_in_leaf': 97, 'classifier': 'LGBM'}.
[I 2019-08-20 21:29:52,853] Finished trial#20 resulted in value: -0.7988405943071659. Current best value is -0.8007844349412699 with parameters: {'lr': 0.008743113860993584, 'max_depth': 5, 'num_leaves': 25, 'max_bin': 261, 'min_data_in_leaf': 97, 'classifier': 'LGBM'}.
[I 2019-08-20 21:31:10,649] Finished trial#32 resulted in value: -0.7739231893899027. Current best value is -0.8007844349412699 with parameters: {'lr': 0.008743113860993584, 'max_depth': 

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[I 2019-08-20 21:46:22,715] Finished trial#50 resulted in value: -0.8097355840015407. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 82, 'min_data_in_leaf': 86, 'classifier': 'LGBM'}.
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[I 2019-08-20 21:47:12,448] Finished trial#52 resulted in value: -0.8024783276718616. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 82, 'min_data_in_leaf': 86, 'classifier': 'LGBM'}.
  **self._backend_args)
[I 2019-08-20 21:47:26,144] Finished trial#57 resulted in value: -0.7851844525467683. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 8

  **self._backend_args)
  **self._backend_args)
[I 2019-08-20 21:52:40,073] Finished trial#56 resulted in value: -0.7616029691624733. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 82, 'min_data_in_leaf': 86, 'classifier': 'LGBM'}.
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[I 2019-08-20 21:54:51,177] Finished trial#62 resulted in value: -0.8059678969754095. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 82, 'min_data_in_leaf': 86, 'classifier': 'LGBM'}.
[I 2019-08-20 21:54:53,518] Finished trial#64 resulted in value: -0.8002458993995877. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 8

IndexError: boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding boolean dimension is 140269891923872
[I 2019-08-20 21:59:09,505] Finished trial#69 resulted in value: -0.6543030067009018. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 82, 'min_data_in_leaf': 86, 'classifier': 'LGBM'}.
[W 2019-08-20 21:59:31,592] Setting status of trial#73 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding boolean dimension is 140269616754240',)
Traceback (most recent call last):
  File "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/optuna/study.py", line 469, in _run_trial
    result = func(trial)
  File "<ipython-input-10-be611a7cb47b>", line 85, in objective_classifier_multi
    y_score = _clf.predict_proba(X_val[df_val[key].notna()])
IndexError: boo

[I 2019-08-20 22:08:42,670] Finished trial#88 resulted in value: -0.782849346352187. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 82, 'min_data_in_leaf': 86, 'classifier': 'LGBM'}.
[I 2019-08-20 22:09:13,836] Finished trial#93 resulted in value: -0.8021706606314835. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 82, 'min_data_in_leaf': 86, 'classifier': 'LGBM'}.
[I 2019-08-20 22:09:39,713] Finished trial#89 resulted in value: -0.8013704355427532. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 82, 'min_data_in_leaf': 86, 'classifier': 'LGBM'}.
[I 2019-08-20 22:09:42,314] Finished trial#90 resulted in value: -0.8004505515493147. Current best value is -0.8124470072032896 with parameters: {'lr': 0.03049588832173513, 'max_depth': 10, '

{'lr': 0.03049588832173513, 'max_depth': 10, 'num_leaves': 50, 'max_bin': 82, 'min_data_in_leaf': 86, 'classifier': 'LGBM'}


In [22]:
auc = evaluate_classification_multi(LGBMClassifier(num_leaves=50, min_data_in_leaf=86, max_depth=10,
                           learning_rate=0.03, max_bin=82),
                    X_train, X_val, X_test)
aucs.append(auc)
print(auc)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


0.7902974992473704


In [36]:
# R3
df_train, df_val, df_test = df_split(df)
x_split = [split(sm) for sm in df_train['smiles'].values]
xid, xseg = get_array(x_split)
X_train = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_val['smiles'].values]
xid, xseg = get_array(x_split)
X_val = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_test['smiles'].values]
xid, xseg = get_array(x_split)
X_test = trfm.encode(torch.t(xid))[:,:256]
study = optuna.create_study()
study.optimize(objective_classifier_multi, n_trials=100, n_jobs=6)
print(study.best_params)



SMILES is too long (226)
SMILES is too long (221)
SMILES is too long (225)
SMILES is too long (251)
SMILES is too long (235)
SMILES is too long (231)
SMILES is too long (230)
SMILES is too long (251)
SMILES is too long (271)
SMILES is too long (248)
SMILES is too long (306)
SMILES is too long (340)
SMILES is too long (264)
SMILES is too long (311)
SMILES is too long (325)
SMILES is too long (275)
SMILES is too long (225)
SMILES is too long (243)
SMILES is too long (284)
SMILES is too long (271)
SMILES is too long (255)
SMILES is too long (263)
SMILES is too long (273)
There are 6264 molecules. It will take a little time.
SMILES is too long (233)
SMILES is too long (227)
SMILES is too long (251)
SMILES is too long (253)
There are 783 molecules. It will take a little time.
SMILES is too long (277)
SMILES is too long (264)
SMILES is too long (219)
There are 784 molecules. It will take a little time.


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[W 2019-08-21 09:38:00,723] Setting status of trial#4 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding boolean dimension is 140269958098144',)
Traceback (most recent call last):
  File "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/optuna/study.py", line 469, in _run_trial
    result = func(trial)
  File "<ipython-input-10-be611a7cb47b>", line 85, in objective_classifier_multi
    y_score = _clf.predict_proba(X_val[df_val[key].notna()])
IndexError: boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding boolean dimension is 140269958098144
[W 2019-08-21 09:38:09,525] Setting status of trial#2 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding b

[W 2019-08-21 09:40:26,145] Setting status of trial#8 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding boolean dimension is 140269818844608',)
Traceback (most recent call last):
  File "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/optuna/study.py", line 469, in _run_trial
    result = func(trial)
  File "<ipython-input-10-be611a7cb47b>", line 85, in objective_classifier_multi
    y_score = _clf.predict_proba(X_val[df_val[key].notna()])
IndexError: boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding boolean dimension is 140269818844608
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[I 2019-08-21 09:41:22,805] Finished trial#14 resulted in value: -0.7435502553534423. Current best value is -0.7921837847651805 with parameters: {'C': 38.844843825853

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[I 2019-08-21 09:45:09,303] Finished trial#16 resulted in value: -0.7075748006541289. Current best value is -0.7921837847651805 with parameters: {'C': 38.844843825853445, 'classifier': 'ridge'}.
[I 2019-08-21 09:45:49,810] Finished trial#22 resulted in value: -0.7878379875143678. Current best value is -0.7921837847651805 with parameters: {'C': 38.844843825853445, 'classifier': 'ridge'}.
[I 2019-08-21 09:46:09,332] Finished trial#21 resulted in value: -0.7942913064877284. Current best value is -0.7942913064877284 with parameters: {'C': 8.509783428387417, 'classifier': 'ridge'}.
[I 2019-08-21 09:46:21,363] Finished trial#17 resulted in value: -0.7724972687940337. Current best value is -0.7942913064877284 with parameters: {'C': 8.509783428387417, 'classifier': 'ridge'}.
[I 2019-08-21 09:46:29,017] Finished trial#19 resulted in value: -0.7939616374777517. Current best value is -0.794291306487728

IndexError: boolean index did not match indexed array along dimension 0; dimension is 6264 but corresponding boolean dimension is 140269751757632
[W 2019-08-21 09:53:39,473] Setting status of trial#47 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding boolean dimension is 140269554453040',)
Traceback (most recent call last):
  File "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/optuna/study.py", line 469, in _run_trial
    result = func(trial)
  File "<ipython-input-10-be611a7cb47b>", line 85, in objective_classifier_multi
    y_score = _clf.predict_proba(X_val[df_val[key].notna()])
IndexError: boolean index did not match indexed array along dimension 0; dimension is 783 but corresponding boolean dimension is 140269554453040
[I 2019-08-21 09:53:44,909] Finished trial#42 resulted in value: -0.780422973839944. Current best value is -0.7942913064877284 with paramet

[I 2019-08-21 09:55:32,757] Finished trial#60 resulted in value: -0.7906950760812211. Current best value is -0.7942913064877284 with parameters: {'C': 8.509783428387417, 'classifier': 'ridge'}.
[I 2019-08-21 09:55:39,412] Finished trial#62 resulted in value: -0.7903615903577158. Current best value is -0.7942913064877284 with parameters: {'C': 8.509783428387417, 'classifier': 'ridge'}.
[I 2019-08-21 09:56:13,236] Finished trial#65 resulted in value: -0.7948134386152459. Current best value is -0.7948134386152459 with parameters: {'lr': 0.052391623908701626, 'max_depth': 2, 'num_leaves': 88, 'max_bin': 277, 'min_data_in_leaf': 37, 'classifier': 'LGBM'}.
[I 2019-08-21 09:56:25,645] Finished trial#67 resulted in value: -0.7960907190264276. Current best value is -0.7960907190264276 with parameters: {'lr': 0.05901220161919727, 'max_depth': 2, 'num_leaves': 75, 'max_bin': 300, 'min_data_in_leaf': 30, 'classifier': 'LGBM'}.
[I 2019-08-21 09:56:29,727] Finished trial#68 resulted in value: -0.797

[I 2019-08-21 10:05:53,751] Finished trial#93 resulted in value: -0.7396357995060457. Current best value is -0.8211311239883017 with parameters: {'lr': 0.053153327151045135, 'max_depth': 10, 'num_leaves': 35, 'max_bin': 120, 'min_data_in_leaf': 24, 'classifier': 'LGBM'}.
[I 2019-08-21 10:06:17,150] Finished trial#88 resulted in value: -0.7587799062885422. Current best value is -0.8211311239883017 with parameters: {'lr': 0.053153327151045135, 'max_depth': 10, 'num_leaves': 35, 'max_bin': 120, 'min_data_in_leaf': 24, 'classifier': 'LGBM'}.
[I 2019-08-21 10:06:21,701] Finished trial#94 resulted in value: -0.8144145724653913. Current best value is -0.8211311239883017 with parameters: {'lr': 0.053153327151045135, 'max_depth': 10, 'num_leaves': 35, 'max_bin': 120, 'min_data_in_leaf': 24, 'classifier': 'LGBM'}.
[I 2019-08-21 10:06:27,263] Finished trial#95 resulted in value: -0.6134766927370391. Current best value is -0.8211311239883017 with parameters: {'lr': 0.053153327151045135, 'max_depth

{'lr': 0.053153327151045135, 'max_depth': 10, 'num_leaves': 35, 'max_bin': 120, 'min_data_in_leaf': 24, 'classifier': 'LGBM'}


In [37]:
auc = evaluate_classification_multi(LGBMClassifier(num_leaves=35, min_data_in_leaf=24, max_depth=10,
                           learning_rate=0.053, max_bin=120),
                    X_train, X_val, X_test)
aucs.append(auc)
print(auc)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


0.804013116979409


In [38]:
aucs = [0.8128625514235361, 0.7902974992473704, 0.804013116979409]
print('AUC  | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(aucs), np.std(aucs)))

AUC  | MEAN: 0.8024 | STD: 0.0093


### ClinTox

In [23]:
df = pd.read_csv('data/clintox.csv')
KEYS  = df.keys()[1:]
print(df.shape)
df.head()

(1484, 3)


Unnamed: 0,smiles,FDA_APPROVED,CT_TOX
0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,1,0
1,[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)C...,1,0
2,[C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)...,1,0
3,[H]/[NH+]=C(/C1=CC(=O)/C(=C\C=c2ccc(=C([NH3+])...,1,0
4,[H]/[NH+]=C(\N)/c1ccc(cc1)OCCCCCOc2ccc(cc2)/C(...,1,0


In [24]:
# R1
df_train, df_val, df_test = df_split(df)
x_split = [split(sm) for sm in df_train['smiles'].values]
xid, xseg = get_array(x_split)
X_train = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_val['smiles'].values]
xid, xseg = get_array(x_split)
X_val = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_test['smiles'].values]
xid, xseg = get_array(x_split)
X_test = trfm.encode(torch.t(xid))[:,:256]
study = optuna.create_study()
study.optimize(objective_classifier_multi, n_trials=100, n_jobs=6)
print(study.best_params)



SMILES is too long (284)
SMILES is too long (219)
SMILES is too long (255)
SMILES is too long (238)
SMILES is too long (261)
SMILES is too long (251)
SMILES is too long (236)
SMILES is too long (339)
SMILES is too long (240)
SMILES is too long (314)
SMILES is too long (227)
SMILES is too long (221)
SMILES is too long (263)
There are 1187 molecules. It will take a little time.
SMILES is too long (253)
SMILES is too long (253)
SMILES is too long (318)
SMILES is too long (279)
SMILES is too long (225)
There are 148 molecules. It will take a little time.
SMILES is too long (271)
SMILES is too long (227)
SMILES is too long (230)
There are 149 molecules. It will take a little time.


  **self._backend_args)
[I 2019-08-21 09:11:01,864] Finished trial#1 resulted in value: -0.9089927003410059. Current best value is -0.9089927003410059 with parameters: {'lr': 0.00012115264593546623, 'max_depth': 2, 'num_leaves': 8, 'max_bin': 22, 'min_data_in_leaf': 80, 'classifier': 'LGBM'}.
[I 2019-08-21 09:11:03,505] Finished trial#0 resulted in value: -0.7192988064791134. Current best value is -0.9089927003410059 with parameters: {'lr': 0.00012115264593546623, 'max_depth': 2, 'num_leaves': 8, 'max_bin': 22, 'min_data_in_leaf': 80, 'classifier': 'LGBM'}.
  **self._backend_args)
[I 2019-08-21 09:11:04,154] Finished trial#3 resulted in value: -0.9467178175618074. Current best value is -0.9467178175618074 with parameters: {'lr': 0.00020016802758969932, 'max_depth': 4, 'num_leaves': 31, 'max_bin': 137, 'min_data_in_leaf': 71, 'classifier': 'LGBM'}.
[I 2019-08-21 09:11:04,945] Finished trial#7 resulted in value: -0.9911684782608696. Current best value is -0.9911684782608696 with paramete

[I 2019-08-21 09:11:36,132] Finished trial#32 resulted in value: -0.9921435421994885. Current best value is -0.9921435421994885 with parameters: {'C': 0.11756550518674178, 'classifier': 'ridge'}.
[I 2019-08-21 09:11:36,226] Finished trial#25 resulted in value: -0.9876305413469736. Current best value is -0.9921435421994885 with parameters: {'C': 0.11756550518674178, 'classifier': 'ridge'}.
[I 2019-08-21 09:11:37,298] Finished trial#36 resulted in value: -0.9921435421994885. Current best value is -0.9921435421994885 with parameters: {'C': 0.11756550518674178, 'classifier': 'ridge'}.
[I 2019-08-21 09:11:38,021] Finished trial#37 resulted in value: -0.9921435421994885. Current best value is -0.9921435421994885 with parameters: {'C': 0.11756550518674178, 'classifier': 'ridge'}.
[I 2019-08-21 09:11:38,847] Finished trial#34 resulted in value: -0.9921435421994885. Current best value is -0.9921435421994885 with parameters: {'C': 0.11756550518674178, 'classifier': 'ridge'}.
[I 2019-08-21 09:11:

[I 2019-08-21 09:12:24,888] Finished trial#52 resulted in value: -0.7434782608695653. Current best value is -0.9942055626598465 with parameters: {'n_units_l0': 484.22857897444146, 'lr': 0.008541220591296259, 'n_layers': 2, 'alpha': 0.00040391913984482195, 'n_units_l1': 11.38626161364681, 'classifier': 'MLP'}.
[W 2019-08-21 09:12:25,655] Setting status of trial#55 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension is 1187 but corresponding boolean dimension is 140269755736960',)
Traceback (most recent call last):
  File "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/optuna/study.py", line 469, in _run_trial
    result = func(trial)
  File "<ipython-input-10-be611a7cb47b>", line 84, in objective_classifier_multi
    _clf.fit(X_train[df_train[key].notna()], df_train[key].dropna().values)
IndexError: boolean index did not match indexed array along dimension 0; dimension is 1187 but correspo

[I 2019-08-21 09:12:55,917] Finished trial#79 resulted in value: -0.99086210571185. Current best value is -0.9942055626598465 with parameters: {'n_units_l0': 484.22857897444146, 'lr': 0.008541220591296259, 'n_layers': 2, 'alpha': 0.00040391913984482195, 'n_units_l1': 11.38626161364681, 'classifier': 'MLP'}.
[I 2019-08-21 09:12:56,382] Finished trial#77 resulted in value: -0.9914748508098892. Current best value is -0.9942055626598465 with parameters: {'n_units_l0': 484.22857897444146, 'lr': 0.008541220591296259, 'n_layers': 2, 'alpha': 0.00040391913984482195, 'n_units_l1': 11.38626161364681, 'classifier': 'MLP'}.
[I 2019-08-21 09:12:57,362] Finished trial#64 resulted in value: -0.7417279411764706. Current best value is -0.9942055626598465 with parameters: {'n_units_l0': 484.22857897444146, 'lr': 0.008541220591296259, 'n_layers': 2, 'alpha': 0.00040391913984482195, 'n_units_l1': 11.38626161364681, 'classifier': 'MLP'}.
[I 2019-08-21 09:12:57,441] Finished trial#81 resulted in value: -0.9

{'n_units_l0': 484.22857897444146, 'lr': 0.008541220591296259, 'n_layers': 2, 'alpha': 0.00040391913984482195, 'n_units_l1': 11.38626161364681, 'classifier': 'MLP'}


In [27]:
auc = []
auc = evaluate_classification_multi(MLPClassifier(hidden_layer_sizes=(484, 11), learning_rate_init=0.009, alpha=0.0004, max_iter=1000),
                    X_train, X_val, X_test)
aucs.append(auc)
print(auc)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


0.9689671718252779


In [28]:
# R2
df_train, df_val, df_test = df_split(df)
x_split = [split(sm) for sm in df_train['smiles'].values]
xid, xseg = get_array(x_split)
X_train = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_val['smiles'].values]
xid, xseg = get_array(x_split)
X_val = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_test['smiles'].values]
xid, xseg = get_array(x_split)
X_test = trfm.encode(torch.t(xid))[:,:256]
study = optuna.create_study()
study.optimize(objective_classifier_multi, n_trials=100, n_jobs=6)
print(study.best_params)



SMILES is too long (339)
SMILES is too long (253)
SMILES is too long (263)
SMILES is too long (219)
SMILES is too long (238)
SMILES is too long (318)
SMILES is too long (314)
SMILES is too long (240)
SMILES is too long (279)
SMILES is too long (227)
SMILES is too long (271)
SMILES is too long (251)
SMILES is too long (225)
SMILES is too long (236)
SMILES is too long (227)
SMILES is too long (261)
SMILES is too long (255)
There are 1187 molecules. It will take a little time.
SMILES is too long (230)
There are 148 molecules. It will take a little time.
SMILES is too long (253)
SMILES is too long (221)
SMILES is too long (284)
There are 149 molecules. It will take a little time.


  **self._backend_args)
  **self._backend_args)
[I 2019-08-21 09:17:36,534] Finished trial#2 resulted in value: -0.9979360766045549. Current best value is -0.9979360766045549 with parameters: {'C': 0.22095030983454514, 'classifier': 'ridge'}.
  **self._backend_args)
[I 2019-08-21 09:17:37,984] Finished trial#5 resulted in value: -0.9980201863354037. Current best value is -0.9980201863354037 with parameters: {'C': 3.2735589189405196, 'classifier': 'ridge'}.
  **self._backend_args)
[I 2019-08-21 09:17:38,759] Finished trial#4 resulted in value: -0.7414660973084887. Current best value is -0.9980201863354037 with parameters: {'C': 3.2735589189405196, 'classifier': 'ridge'}.
  **self._backend_args)
[I 2019-08-21 09:17:39,966] Finished trial#3 resulted in value: -0.9773291925465839. Current best value is -0.9980201863354037 with parameters: {'C': 3.2735589189405196, 'classifier': 'ridge'}.
  **self._backend_args)
[I 2019-08-21 09:17:41,273] Finished trial#8 resulted in value: -0.987060041407

[I 2019-08-21 09:18:18,231] Finished trial#25 resulted in value: -0.7242753623188405. Current best value is -0.9980201863354037 with parameters: {'C': 3.2735589189405196, 'classifier': 'ridge'}.
[I 2019-08-21 09:18:20,341] Finished trial#23 resulted in value: -0.4893957039337474. Current best value is -0.9980201863354037 with parameters: {'C': 3.2735589189405196, 'classifier': 'ridge'}.
[I 2019-08-21 09:18:21,194] Finished trial#24 resulted in value: -0.9976287525879917. Current best value is -0.9980201863354037 with parameters: {'C': 3.2735589189405196, 'classifier': 'ridge'}.
[W 2019-08-21 09:18:22,791] Setting status of trial#29 as TrialState.FAIL because of the following error: IndexError('boolean index did not match indexed array along dimension 0; dimension is 1187 but corresponding boolean dimension is 140269538309104',)
Traceback (most recent call last):
  File "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/optuna/study.py", line 469, in _run_trial
    result 

[I 2019-08-21 09:18:58,334] Finished trial#42 resulted in value: -0.9956359989648034. Current best value is -0.9989130434782609 with parameters: {'n_units_l0': 291.05181142991347, 'lr': 0.009855849412471479, 'n_layers': 3, 'alpha': 0.0038415732302038387, 'n_units_l1': 4.742371602606811, 'n_units_l2': 147.32022945973043, 'classifier': 'MLP'}.
[I 2019-08-21 09:18:58,346] Finished trial#45 resulted in value: -0.9989130434782609. Current best value is -0.9989130434782609 with parameters: {'n_units_l0': 291.05181142991347, 'lr': 0.009855849412471479, 'n_layers': 3, 'alpha': 0.0038415732302038387, 'n_units_l1': 4.742371602606811, 'n_units_l2': 147.32022945973043, 'classifier': 'MLP'}.
[I 2019-08-21 09:19:04,478] Finished trial#44 resulted in value: -0.9854101966873706. Current best value is -0.9989130434782609 with parameters: {'n_units_l0': 291.05181142991347, 'lr': 0.009855849412471479, 'n_layers': 3, 'alpha': 0.0038415732302038387, 'n_units_l1': 4.742371602606811, 'n_units_l2': 147.320229

[I 2019-08-21 09:19:33,456] Finished trial#65 resulted in value: -0.9975737577639752. Current best value is -0.9989130434782609 with parameters: {'n_units_l0': 291.05181142991347, 'lr': 0.009855849412471479, 'n_layers': 3, 'alpha': 0.0038415732302038387, 'n_units_l1': 4.742371602606811, 'n_units_l2': 147.32022945973043, 'classifier': 'MLP'}.
  **self._backend_args)
[I 2019-08-21 09:19:34,991] Finished trial#58 resulted in value: -0.9956521739130435. Current best value is -0.9989130434782609 with parameters: {'n_units_l0': 291.05181142991347, 'lr': 0.009855849412471479, 'n_layers': 3, 'alpha': 0.0038415732302038387, 'n_units_l1': 4.742371602606811, 'n_units_l2': 147.32022945973043, 'classifier': 'MLP'}.
[I 2019-08-21 09:19:36,143] Finished trial#68 resulted in value: -0.9979360766045549. Current best value is -0.9989130434782609 with parameters: {'n_units_l0': 291.05181142991347, 'lr': 0.009855849412471479, 'n_layers': 3, 'alpha': 0.0038415732302038387, 'n_units_l1': 4.742371602606811, 

[I 2019-08-21 09:20:08,612] Finished trial#90 resulted in value: -0.9979360766045549. Current best value is -0.9989130434782609 with parameters: {'n_units_l0': 291.05181142991347, 'lr': 0.009855849412471479, 'n_layers': 3, 'alpha': 0.0038415732302038387, 'n_units_l1': 4.742371602606811, 'n_units_l2': 147.32022945973043, 'classifier': 'MLP'}.
[I 2019-08-21 09:20:11,070] Finished trial#87 resulted in value: -0.9980201863354037. Current best value is -0.9989130434782609 with parameters: {'n_units_l0': 291.05181142991347, 'lr': 0.009855849412471479, 'n_layers': 3, 'alpha': 0.0038415732302038387, 'n_units_l1': 4.742371602606811, 'n_units_l2': 147.32022945973043, 'classifier': 'MLP'}.
[I 2019-08-21 09:20:13,534] Finished trial#80 resulted in value: -0.9927212732919255. Current best value is -0.9989130434782609 with parameters: {'n_units_l0': 291.05181142991347, 'lr': 0.009855849412471479, 'n_layers': 3, 'alpha': 0.0038415732302038387, 'n_units_l1': 4.742371602606811, 'n_units_l2': 147.320229

{'n_units_l0': 291.05181142991347, 'lr': 0.009855849412471479, 'n_layers': 3, 'alpha': 0.0038415732302038387, 'n_units_l1': 4.742371602606811, 'n_units_l2': 147.32022945973043, 'classifier': 'MLP'}


In [29]:
auc = evaluate_classification_multi(MLPClassifier(hidden_layer_sizes=(291, 5, 147), learning_rate_init=0.01, alpha=0.004, max_iter=1000),
                    X_train, X_val, X_test)
aucs.append(auc)
print(auc)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


0.9468271536616861


In [30]:
# R3
df_train, df_val, df_test = df_split(df)
x_split = [split(sm) for sm in df_train['smiles'].values]
xid, xseg = get_array(x_split)
X_train = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_val['smiles'].values]
xid, xseg = get_array(x_split)
X_val = trfm.encode(torch.t(xid))[:,:256]
x_split = [split(sm) for sm in df_test['smiles'].values]
xid, xseg = get_array(x_split)
X_test = trfm.encode(torch.t(xid))[:,:256]
study = optuna.create_study()
study.optimize(objective_classifier_multi, n_trials=100, n_jobs=6)
print(study.best_params)



SMILES is too long (253)
SMILES is too long (318)
SMILES is too long (240)
SMILES is too long (314)
SMILES is too long (221)
SMILES is too long (279)
SMILES is too long (263)
SMILES is too long (253)
SMILES is too long (261)
SMILES is too long (251)
SMILES is too long (227)
SMILES is too long (230)
SMILES is too long (219)
SMILES is too long (236)
SMILES is too long (238)
SMILES is too long (284)
SMILES is too long (255)
There are 1187 molecules. It will take a little time.
SMILES is too long (225)
SMILES is too long (339)
There are 148 molecules. It will take a little time.
SMILES is too long (271)
SMILES is too long (227)
There are 149 molecules. It will take a little time.


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
[I 2019-08-21 09:25:12,512] Finished trial#4 resulted in value: -0.9741040611439183. Current best value is -0.9741040611439183 with parameters: {'C': 2.648771159588512, 'classifier': 'ridge'}.
[I 2019-08-21 09:25:13,382] Finished trial#5 resulted in value: -0.7077443552788824. Current best value is -0.9741040611439183 with parameters: {'C': 2.648771159588512, 'classifier': 'ridge'}.
  **self._backend_args)
[I 2019-08-21 09:25:13,974] Finished trial#1 resulted in value: -0.9713139906124206. Current best value is -0.9741040611439183 with parameters: {'C': 2.648771159588512, 'classifier': 'ridge'}.
[I 2019-08-21 09:25:15,389] Finished trial#0 resulted in value: -0.9612848540013377. Current best value is -0.9741040611439183 with parameters: {'C': 2.648771159588512, 'classifier': 'ridge'}.
[I 2019-08-21 09:25:16,197] Finished trial#9 resulted in value: -0.9664051108151303. Current best value is -0.9741040611439183 with 

[I 2019-08-21 09:25:44,193] Finished trial#24 resulted in value: -0.9765175251968299. Current best value is -0.9887718226215312 with parameters: {'lr': 0.40778282473907684, 'max_depth': 10, 'num_leaves': 15, 'max_bin': 141, 'min_data_in_leaf': 16, 'classifier': 'LGBM'}.
[I 2019-08-21 09:25:47,266] Finished trial#23 resulted in value: -0.9845559517880056. Current best value is -0.9887718226215312 with parameters: {'lr': 0.40778282473907684, 'max_depth': 10, 'num_leaves': 15, 'max_bin': 141, 'min_data_in_leaf': 16, 'classifier': 'LGBM'}.
[I 2019-08-21 09:25:47,878] Finished trial#25 resulted in value: -0.9792999044692581. Current best value is -0.9887718226215312 with parameters: {'lr': 0.40778282473907684, 'max_depth': 10, 'num_leaves': 15, 'max_bin': 141, 'min_data_in_leaf': 16, 'classifier': 'LGBM'}.
[I 2019-08-21 09:25:57,279] Finished trial#26 resulted in value: -0.984571334306145. Current best value is -0.9887718226215312 with parameters: {'lr': 0.40778282473907684, 'max_depth': 10

[I 2019-08-21 09:26:43,429] Finished trial#49 resulted in value: -0.9854308988111435. Current best value is -0.988832026614939 with parameters: {'lr': 0.0014688842298596092, 'max_depth': 10, 'num_leaves': 41, 'max_bin': 169, 'min_data_in_leaf': 14, 'classifier': 'LGBM'}.
[I 2019-08-21 09:26:44,950] Finished trial#54 resulted in value: -0.9715778803632609. Current best value is -0.988832026614939 with parameters: {'lr': 0.0014688842298596092, 'max_depth': 10, 'num_leaves': 41, 'max_bin': 169, 'min_data_in_leaf': 14, 'classifier': 'LGBM'}.
[I 2019-08-21 09:26:46,148] Finished trial#52 resulted in value: -0.9693911758449909. Current best value is -0.988832026614939 with parameters: {'lr': 0.0014688842298596092, 'max_depth': 10, 'num_leaves': 41, 'max_bin': 169, 'min_data_in_leaf': 14, 'classifier': 'LGBM'}.
[I 2019-08-21 09:26:46,303] Finished trial#48 resulted in value: -0.9797751712366006. Current best value is -0.988832026614939 with parameters: {'lr': 0.0014688842298596092, 'max_depth

[I 2019-08-21 09:27:21,867] Finished trial#75 resulted in value: -0.979383182239875. Current best value is -0.988832026614939 with parameters: {'lr': 0.0014688842298596092, 'max_depth': 10, 'num_leaves': 41, 'max_bin': 169, 'min_data_in_leaf': 14, 'classifier': 'LGBM'}.
[I 2019-08-21 09:27:22,036] Finished trial#76 resulted in value: -0.9255577621512611. Current best value is -0.988832026614939 with parameters: {'lr': 0.0014688842298596092, 'max_depth': 10, 'num_leaves': 41, 'max_bin': 169, 'min_data_in_leaf': 14, 'classifier': 'LGBM'}.
[I 2019-08-21 09:27:25,987] Finished trial#78 resulted in value: -0.9717059796091462. Current best value is -0.988832026614939 with parameters: {'lr': 0.0014688842298596092, 'max_depth': 10, 'num_leaves': 41, 'max_bin': 169, 'min_data_in_leaf': 14, 'classifier': 'LGBM'}.
[I 2019-08-21 09:27:27,757] Finished trial#80 resulted in value: -0.9810344159973945. Current best value is -0.988832026614939 with parameters: {'lr': 0.0014688842298596092, 'max_depth'

{'lr': 0.0014688842298596092, 'max_depth': 10, 'num_leaves': 41, 'max_bin': 169, 'min_data_in_leaf': 14, 'classifier': 'LGBM'}


In [31]:
auc = evaluate_classification_multi(LGBMClassifier(num_leaves=41, min_data_in_leaf=14, max_depth=10,
                           learning_rate=0.001, max_bin=169),
                    X_train, X_val, X_test)
aucs.append(auc)
print(auc)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


0.9665467625899281


In [34]:
aucs = [0.9689671718252779, 0.9468271536616861, 0.9468271536616861]
print('AUC  | MEAN: {:.4f} | STD: {:.4f}'.format(np.mean(aucs), np.std(aucs)))

AUC  | MEAN: 0.9542 | STD: 0.0104


In [33]:
aucs

[0.8128625514235361,
 0.7902974992473704,
 0.9689671718252779,
 0.9468271536616861,
 0.9665467625899281]