# Performance comparaison

| Fingerprint | MLP | RF |  
|:-:|:-:|:-:|  
|BERT on GDB17 EP1|0.833|0.734| 
|ECFP| 0.715 | 0.639 |
|Can2Can|0.642|0.618|
|Enum2Enum|0.676|0.640|
|Transformer|0.862||
| NFP| 0.8845 |  |

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import torch
import optuna

from bert import BERT
from build_vocab import WordVocab
from utils import split

In [2]:
pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

In [3]:
df_train = pd.read_csv('data/sol_train.csv')
df_test = pd.read_csv('data/sol_test.csv')
df_train.head()

Unnamed: 0,SMILES,unknown,solubility,processed_smiles,spaced
0,[nH0]1c(SC)c2c([nH0]cc[nH0]2)[nH0]c1,6966-78-5,-2.36,[ n H 0 ] 1 c ( S C ) c 2 c ( [ n H 0 ] c c [ ...,[ n H 0 ] 1 c ( S C ) c 2 c ( [ n H 0 ] c c [ ...
1,CCC(C)Cl,78-86-4,-1.96,C C C ( C ) Cl,C C C ( C ) C l
2,O=C(NC(=O)c1ccccc1)c1ccccc1,614-28-8,-2.27,O = C ( N C ( = O ) c 1 c c c c c 1 ) c 1 c c ...,O = C ( N C ( = O ) c 1 c c c c c 1 ) c 1 c c ...
3,CC(C(C)(C)C)O,464-07-3,-0.62,C C ( C ( C ) ( C ) C ) O,C C ( C ( C ) ( C ) C ) O
4,[O-][N+](c1c(O)cccc1)=O,88-75-5,-1.74,[ O- ] [ N+ ] ( c 1 c ( O ) c c c c 1 ) = O,[ O - ] [ N + ] ( c 1 c ( O ) c c c c 1 ) = O


In [4]:
x_train = [split(sm) for sm in df_train['SMILES']]
y_train = df_train['solubility']
x_test = [split(sm) for sm in df_test['SMILES']]
y_test = df_test['solubility']

In [5]:
vocab = WordVocab.load_vocab('data/vocab.pkl')

In [6]:
def get_inputs(sm):
    seq_len = 220
    ids = [vocab.stoi.get(token, unk_index) for token in sm.split()]
    ids = [sos_index] + ids + [eos_index] + [eos_index]
    seg = [1]*len(ids)
    padding = [pad_index]*(seq_len - len(ids))
    ids.extend(padding), seg.extend(padding)
    
    return ids, seg

In [7]:
def get_array(x):
    x_id, x_seg = [], []
    for sm in x:
        a,b = get_inputs(sm)
        x_id.append(a)
        x_seg.append(b)
    return torch.tensor(x_id), torch.tensor(x_seg)
    
xid_train, xseg_train = get_array(x_train)
xid_test, xseg_test = get_array(x_test)

In [8]:
print(xid_test.shape)
print(xseg_test.shape)

torch.Size([322, 220])
torch.Size([322, 220])


# Encode to fingerprint

In [20]:
model = BERT(len(vocab), hidden=256, n_layers=4, attn_heads=4, dropout=0)
model.load_state_dict(torch.load('../result/GDB17_embed20/ep_02.pkl'))
model.eval()

BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(45, 256, padding_idx=0)
    (position): PositionalEmbedding()
    (segment): SegmentEmbedding(3, 256, padding_idx=0)
    (dropout): Dropout(p=0)
  )
  (transformer_blocks): ModuleList(
    (0): TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0): Linear(in_features=256, out_features=256, bias=True)
          (1): Linear(in_features=256, out_features=256, bias=True)
          (2): Linear(in_features=256, out_features=256, bias=True)
        )
        (output_linear): Linear(in_features=256, out_features=256, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=256, out_features=1024, bias=True)
        (w_2): Linear(in_features=1024, out_features=256, bias=True)
        (dropout): Dropout(p=0)
        (activation): GELU()
      )
      (input_sublayer): Sub

In [34]:
# -b 16 -e 30 --hidden 256 -l 7 --n_head 4 --lr 1e-4 --lr-decay 5 --final-lr 0.01
model = BERT(len(vocab), hidden=256, n_layers=7, attn_heads=4, dropout=0)
model.load_state_dict(torch.load('../result/chembl/ep_29.pkl'))
model.eval()

RuntimeError: Error(s) in loading state_dict for BERT:
	size mismatch for embedding.position.pe: copying a param with shape torch.Size([1, 512, 256]) from checkpoint, the shape in current model is torch.Size([1, 220, 256]).

In [10]:
del df_test, df_train, x_train, x_test

In [21]:
st,ed = 0,100
X_train = model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()
while ed<len(xid_train):
    st += 100
    ed += 100
    X_train = np.vstack([X_train, model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()])

In [22]:
st,ed = 0,100
X_test = model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()
while ed<len(xid_test):
    st += 100
    ed += 100
    X_test = np.vstack([X_test, model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()])

In [23]:
X_train.shape

(968, 220, 512)

In [24]:
# X_train = np.mean(X_train, axis=1)
# X_test = np.mean(X_test, axis=1)
X_train = X_train[:,0,:]
X_test = X_test[:,0,:]
#y_train = df_train['solubility']
#y_test = df_test['solubility']

# Prediction
### MLP

In [25]:
# Default
n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor()
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))



Test R2: 0.5302 ± 0.0647
Test MSE: 1.9180 ± 0.2641


In [18]:
def objective_mlp(trial):
    n_layers = trial.suggest_int('n_layers', 1,3)
    layers = []
    for i in range(n_layers):
        n_units = int(trial.suggest_loguniform('n_units_l{}'.format(i), 1, 1000))
        layers.append(n_units)
    
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        mlp = MLPRegressor(hidden_layer_sizes=layers)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        mlp.fit(X_trn, y_trn)
        y_pred = mlp.predict(X_val)
        score += mean_squared_error(y_val, y_pred)
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_mlp, n_trials=100)

[I 2019-03-27 16:10:40,632] Finished a trial resulted in value: 1.078223806803261. Current best value is 1.078223806803261 with parameters: {'n_layers': 3, 'n_units_l0': 522.5154959925112, 'n_units_l1': 359.05574364726056, 'n_units_l2': 4.538790729315909}.
[I 2019-03-27 16:10:47,855] Finished a trial resulted in value: 0.9503060803017159. Current best value is 0.9503060803017159 with parameters: {'n_layers': 2, 'n_units_l0': 81.89920997875582, 'n_units_l1': 19.637541301665927}.
[I 2019-03-27 16:11:03,436] Finished a trial resulted in value: 6.106243949126198. Current best value is 0.9503060803017159 with parameters: {'n_layers': 2, 'n_units_l0': 81.89920997875582, 'n_units_l1': 19.637541301665927}.
[I 2019-03-27 16:11:06,130] Finished a trial resulted in value: 6.415340249305028. Current best value is 0.9503060803017159 with parameters: {'n_layers': 2, 'n_units_l0': 81.89920997875582, 'n_units_l1': 19.637541301665927}.
[I 2019-03-27 16:11:10,751] Finished a trial resulted in value: 2.0

[I 2019-03-27 16:12:56,012] Finished a trial resulted in value: 5.5226727558886. Current best value is 0.8982407051899326 with parameters: {'n_layers': 2, 'n_units_l0': 133.73754618018938, 'n_units_l1': 67.43930779550047}.
[I 2019-03-27 16:13:00,256] Finished a trial resulted in value: 1.9286163310444426. Current best value is 0.8982407051899326 with parameters: {'n_layers': 2, 'n_units_l0': 133.73754618018938, 'n_units_l1': 67.43930779550047}.
[I 2019-03-27 16:13:10,855] Finished a trial resulted in value: 1.1382062131974746. Current best value is 0.8982407051899326 with parameters: {'n_layers': 2, 'n_units_l0': 133.73754618018938, 'n_units_l1': 67.43930779550047}.
[I 2019-03-27 16:13:26,957] Finished a trial resulted in value: 1.1673541157058316. Current best value is 0.8982407051899326 with parameters: {'n_layers': 2, 'n_units_l0': 133.73754618018938, 'n_units_l1': 67.43930779550047}.
[I 2019-03-27 16:13:31,998] Finished a trial resulted in value: 0.8674629990726801. Current best va

[I 2019-03-27 16:14:25,322] Finished a trial resulted in value: 1.172661811604669. Current best value is 0.8674629990726801 with parameters: {'n_layers': 2, 'n_units_l0': 17.929948762271206, 'n_units_l1': 86.35801638136803}.
[I 2019-03-27 16:14:27,875] Finished a trial resulted in value: 6.847016090848998. Current best value is 0.8674629990726801 with parameters: {'n_layers': 2, 'n_units_l0': 17.929948762271206, 'n_units_l1': 86.35801638136803}.
[I 2019-03-27 16:14:33,286] Finished a trial resulted in value: 0.9141492195843076. Current best value is 0.8674629990726801 with parameters: {'n_layers': 2, 'n_units_l0': 17.929948762271206, 'n_units_l1': 86.35801638136803}.
[I 2019-03-27 16:14:37,506] Finished a trial resulted in value: 0.9950803607664038. Current best value is 0.8674629990726801 with parameters: {'n_layers': 2, 'n_units_l0': 17.929948762271206, 'n_units_l1': 86.35801638136803}.
[I 2019-03-27 16:14:42,563] Finished a trial resulted in value: 1.0573623905852405. Current best v

[I 2019-03-27 16:20:04,008] Finished a trial resulted in value: 1.1848326896295627. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:20:06,845] Finished a trial resulted in value: 2.1499802263927217. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:20:10,645] Finished a trial resulted in value: 1.006322138395831. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:20:13,563] Finished a trial resulted in value: 1.414063644474485. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:20:18,093] Finished a trial resulted in value: 1.0008543661452203. Current best v

[I 2019-03-27 16:21:43,693] Finished a trial resulted in value: 4.838294104130707. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:21:46,966] Finished a trial resulted in value: 1.4481927537106303. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:21:50,200] Finished a trial resulted in value: 12.718568251150364. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:21:56,118] Finished a trial resulted in value: 0.9423611045126628. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:21:58,891] Finished a trial resulted in value: 1.4361544663492158. Current best 

[I 2019-03-27 16:23:58,346] Finished a trial resulted in value: 0.9186332112601087. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:24:11,618] Finished a trial resulted in value: 0.9397975921579987. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:24:16,566] Finished a trial resulted in value: 0.9813025588773188. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:24:43,199] Finished a trial resulted in value: 1.0388699988360073. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:24:54,412] Finished a trial resulted in value: 0.9899729775950588. Current best

[I 2019-03-27 16:28:16,298] Finished a trial resulted in value: 1.3572627982180647. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:28:19,125] Finished a trial resulted in value: 3.003730798345056. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:28:39,689] Finished a trial resulted in value: 0.8980734446302894. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:28:42,979] Finished a trial resulted in value: 3.9764941996847534. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:28:54,202] Finished a trial resulted in value: 1.1355848371661468. Current best 

[I 2019-03-27 16:29:41,026] Finished a trial resulted in value: 7.258754099765501. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:29:44,758] Finished a trial resulted in value: 1.0536733215007894. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:29:47,675] Finished a trial resulted in value: 2.582640929413974. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:29:52,914] Finished a trial resulted in value: 0.9629598585114967. Current best value is 0.8625507264707821 with parameters: {'n_layers': 2, 'n_units_l0': 983.8650934412274, 'n_units_l1': 30.579448664774333}.
[I 2019-03-27 16:30:00,226] Finished a trial resulted in value: 1.0255930358131033. Current best v

In [19]:
# Optimized

n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor((983, 30))
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))

Test R2: 0.7609 ± 0.0430
Test MSE: 0.9762 ± 0.1754


### RF

In [17]:
#Default
n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    RF = RandomForestRegressor()
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))



Test R2: 0.7537 ± 0.0081
Test MSE: 1.0056 ± 0.0332


In [45]:
def objective_rf(trial):
    max_depth = int(trial.suggest_loguniform('max_depth', 2, 100))
    n_estimators = int(trial.suggest_loguniform('n_estimators', 2, 1000))
    max_features = trial.suggest_int('max_features', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        rf = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features,
                              min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        rf.fit(X_trn, y_trn)
        y_pred = rf.predict(X_val)
        score += mean_squared_error(y_val, y_pred)
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_rf, n_trials=100)

[I 2019-03-27 16:00:50,059] Finished a trial resulted in value: 1.5218836980010286. Current best value is 1.5218836980010286 with parameters: {'max_depth': 15.380039789241337, 'n_estimators': 147.90052040104513, 'max_features': 3, 'min_samples_split': 8, 'min_samples_leaf': 9}.
[I 2019-03-27 16:00:50,973] Finished a trial resulted in value: 1.4251916881928215. Current best value is 1.4251916881928215 with parameters: {'max_depth': 21.895441401945902, 'n_estimators': 88.38222724853792, 'max_features': 4, 'min_samples_split': 7, 'min_samples_leaf': 8}.
[I 2019-03-27 16:00:51,085] Finished a trial resulted in value: 1.348715189557296. Current best value is 1.348715189557296 with parameters: {'max_depth': 90.10057829792703, 'n_estimators': 7.645999324433651, 'max_features': 6, 'min_samples_split': 9, 'min_samples_leaf': 8}.
[I 2019-03-27 16:00:51,581] Finished a trial resulted in value: 1.8121345064864918. Current best value is 1.348715189557296 with parameters: {'max_depth': 90.1005782979

[I 2019-03-27 16:02:02,007] Finished a trial resulted in value: 1.34094073328218. Current best value is 1.0388982715660036 with parameters: {'max_depth': 19.83583123349122, 'n_estimators': 223.2113536895508, 'max_features': 7, 'min_samples_split': 7, 'min_samples_leaf': 4}.
[I 2019-03-27 16:02:06,455] Finished a trial resulted in value: 1.1079117090422448. Current best value is 1.0388982715660036 with parameters: {'max_depth': 19.83583123349122, 'n_estimators': 223.2113536895508, 'max_features': 7, 'min_samples_split': 7, 'min_samples_leaf': 4}.
[I 2019-03-27 16:02:08,196] Finished a trial resulted in value: 1.2082863202072889. Current best value is 1.0388982715660036 with parameters: {'max_depth': 19.83583123349122, 'n_estimators': 223.2113536895508, 'max_features': 7, 'min_samples_split': 7, 'min_samples_leaf': 4}.
[I 2019-03-27 16:02:09,685] Finished a trial resulted in value: 1.1652544246089114. Current best value is 1.0388982715660036 with parameters: {'max_depth': 19.835831233491

[I 2019-03-27 16:03:00,300] Finished a trial resulted in value: 1.2987897176560308. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.27709115303057, 'n_estimators': 40.736098033085035, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-27 16:03:00,637] Finished a trial resulted in value: 1.3344446829133374. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.27709115303057, 'n_estimators': 40.736098033085035, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-27 16:03:09,057] Finished a trial resulted in value: 1.1569104959044276. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.27709115303057, 'n_estimators': 40.736098033085035, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-27 16:03:10,472] Finished a trial resulted in value: 1.1966306525547787. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.2770911

[I 2019-03-27 16:03:39,866] Finished a trial resulted in value: 1.2628017019083921. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.27709115303057, 'n_estimators': 40.736098033085035, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-27 16:03:40,466] Finished a trial resulted in value: 1.16977474926273. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.27709115303057, 'n_estimators': 40.736098033085035, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-27 16:03:40,986] Finished a trial resulted in value: 1.2579088138941756. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.27709115303057, 'n_estimators': 40.736098033085035, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-27 16:03:41,382] Finished a trial resulted in value: 1.2529060451549971. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.277091153

In [47]:
# Optimized

n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    RF = RandomForestRegressor(max_depth=26, n_estimators=41, max_features=5,
                              min_samples_split=6, min_samples_leaf=4)
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))

Test R2: 0.7342 ± 0.0045
Test MSE: 1.0853 ± 0.0185
