# Performance comparaison

| Fingerprint | MLP | RF |  
|:-:|:-:|:-:|  
|BERT on ChEMBL|0.842|0.797| 
|ECFP| 0.715 | 0.639 |
|Can2Can|0.642|0.618|
|Enum2Enum|0.676|0.640|
|Transformer|0.842|0.772|
| NFP| 0.885 |  |

In [23]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import torch
import optuna

from bert import BERT
from build_vocab import WordVocab
from utils import split

In [24]:
pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

In [25]:
df_train = pd.read_csv('data/sol_train.csv')
df_test = pd.read_csv('data/sol_test.csv')
df_train.head()

Unnamed: 0,SMILES,unknown,solubility,processed_smiles,spaced
0,[nH0]1c(SC)c2c([nH0]cc[nH0]2)[nH0]c1,6966-78-5,-2.36,[ n H 0 ] 1 c ( S C ) c 2 c ( [ n H 0 ] c c [ ...,[ n H 0 ] 1 c ( S C ) c 2 c ( [ n H 0 ] c c [ ...
1,CCC(C)Cl,78-86-4,-1.96,C C C ( C ) Cl,C C C ( C ) C l
2,O=C(NC(=O)c1ccccc1)c1ccccc1,614-28-8,-2.27,O = C ( N C ( = O ) c 1 c c c c c 1 ) c 1 c c ...,O = C ( N C ( = O ) c 1 c c c c c 1 ) c 1 c c ...
3,CC(C(C)(C)C)O,464-07-3,-0.62,C C ( C ( C ) ( C ) C ) O,C C ( C ( C ) ( C ) C ) O
4,[O-][N+](c1c(O)cccc1)=O,88-75-5,-1.74,[ O- ] [ N+ ] ( c 1 c ( O ) c c c c 1 ) = O,[ O - ] [ N + ] ( c 1 c ( O ) c c c c 1 ) = O


In [26]:
x_train = [split(sm) for sm in df_train['SMILES']]
y_train = df_train['solubility']
x_test = [split(sm) for sm in df_test['SMILES']]
y_test = df_test['solubility']

In [27]:
vocab = WordVocab.load_vocab('data/vocab.pkl')

In [28]:
def get_inputs(sm):
    seq_len = 220
    ids = [vocab.stoi.get(token, unk_index) for token in sm.split()]
    ids = [sos_index] + ids + [eos_index] + [eos_index]
    seg = [1]*len(ids)
    padding = [pad_index]*(seq_len - len(ids))
    ids.extend(padding), seg.extend(padding)
    
    return ids, seg

In [29]:
def get_array(x):
    x_id, x_seg = [], []
    for sm in x:
        a,b = get_inputs(sm)
        x_id.append(a)
        x_seg.append(b)
    return torch.tensor(x_id), torch.tensor(x_seg)
    
xid_train, xseg_train = get_array(x_train)
xid_test, xseg_test = get_array(x_test)

In [30]:
print(xid_test.shape)
print(xseg_test.shape)

torch.Size([322, 220])
torch.Size([322, 220])


# Encode to fingerprint

In [31]:
model = BERT(len(vocab), hidden=256, n_layers=4, attn_heads=4, dropout=0)
model.load_state_dict(torch.load('../result/GDB17/ep00_it020000.pkl'))
model.eval()

BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(45, 256, padding_idx=0)
    (position): PositionalEmbedding()
    (segment): SegmentEmbedding(3, 256, padding_idx=0)
    (dropout): Dropout(p=0)
  )
  (transformer_blocks): ModuleList(
    (0): TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0): Linear(in_features=256, out_features=256, bias=True)
          (1): Linear(in_features=256, out_features=256, bias=True)
          (2): Linear(in_features=256, out_features=256, bias=True)
        )
        (output_linear): Linear(in_features=256, out_features=256, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=256, out_features=1024, bias=True)
        (w_2): Linear(in_features=1024, out_features=256, bias=True)
        (dropout): Dropout(p=0)
        (activation): GELU()
      )
      (input_sublayer): Sub

In [56]:
# -b 16 -e 30 --hidden 256 -l 7 --n_head 4 --lr 1e-4 --lr-decay 5 --final-lr 0.01
model = BERT(len(vocab), hidden=256, n_layers=8, attn_heads=8, dropout=0)
model.load_state_dict(torch.load('../result/chembl/ep00_it008000.pkl'))
model.eval()

BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(45, 256, padding_idx=0)
    (position): PositionalEmbedding()
    (segment): SegmentEmbedding(3, 256, padding_idx=0)
    (dropout): Dropout(p=0)
  )
  (transformer_blocks): ModuleList(
    (0): TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0): Linear(in_features=256, out_features=256, bias=True)
          (1): Linear(in_features=256, out_features=256, bias=True)
          (2): Linear(in_features=256, out_features=256, bias=True)
        )
        (output_linear): Linear(in_features=256, out_features=256, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=256, out_features=1024, bias=True)
        (w_2): Linear(in_features=1024, out_features=256, bias=True)
        (dropout): Dropout(p=0)
        (activation): GELU()
      )
      (input_sublayer): Sub

In [40]:
del df_test, df_train, x_train, x_test

In [57]:
st,ed = 0,100
X_train = model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()
while ed<len(xid_train):
    st += 100
    ed += 100
    X_train = np.vstack([X_train, model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()])

In [58]:
st,ed = 0,100
X_test = model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()
while ed<len(xid_test):
    st += 100
    ed += 100
    X_test = np.vstack([X_test, model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()])

In [59]:
X_train.shape

(968, 220, 512)

In [60]:
# X_train = np.mean(X_train, axis=1)
# X_test = np.mean(X_test, axis=1)
X_train = X_train[:,0,:]
X_test = X_test[:,0,:]
#y_train = df_train['solubility']
#y_test = df_test['solubility']

# Prediction
### MLP

In [61]:
# Default
n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor()
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))



Test R2: 0.8237 ± 0.0147
Test MSE: 0.7199 ± 0.0601


In [51]:
# Default
n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor()
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))



Test R2: 0.8360 ± 0.0189
Test MSE: 0.6694 ± 0.0773


In [52]:
def objective_mlp(trial):
    n_layers = trial.suggest_int('n_layers', 1,3)
    layers = []
    for i in range(n_layers):
        n_units = int(trial.suggest_loguniform('n_units_l{}'.format(i), 1, 1000))
        layers.append(n_units)
    
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        mlp = MLPRegressor(hidden_layer_sizes=layers)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        mlp.fit(X_trn, y_trn)
        y_pred = mlp.predict(X_val)
        score += mean_squared_error(y_val, y_pred)
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_mlp, n_trials=100)

[I 2019-04-02 15:36:37,622] Finished a trial resulted in value: 0.8559266200353248. Current best value is 0.8559266200353248 with parameters: {'n_layers': 1, 'n_units_l0': 15.928423836660851}.
[I 2019-04-02 15:36:43,502] Finished a trial resulted in value: 0.8199343369919891. Current best value is 0.8199343369919891 with parameters: {'n_layers': 1, 'n_units_l0': 15.232459464743572}.
[I 2019-04-02 15:36:59,718] Finished a trial resulted in value: 0.6314810944529718. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:38:09,384] Finished a trial resulted in value: 0.7282005567277781. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:38:29,786] Finished a trial resulted in value: 0.7181770457953779. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_u

[I 2019-04-02 15:43:31,103] Finished a trial resulted in value: 7.043994374354233. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:43:34,885] Finished a trial resulted in value: 2.6282653313456636. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:43:44,509] Finished a trial resulted in value: 0.6952471100346782. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:43:51,520] Finished a trial resulted in value: 6.214041581886516. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:43:58,538] Finished a trial resulted in value: 0.749634433538562. Current best va

[I 2019-04-02 15:48:20,071] Finished a trial resulted in value: 0.762569315219106. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:48:25,103] Finished a trial resulted in value: 2.0186445793998744. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:48:33,636] Finished a trial resulted in value: 0.8156975306639547. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:48:40,275] Finished a trial resulted in value: 0.7067313455316933. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:50:03,267] Finished a trial resulted in value: 0.7624154870661504. Current best 

[I 2019-04-02 15:52:47,861] Finished a trial resulted in value: 2.4258538736072817. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:53:01,272] Finished a trial resulted in value: 0.8650374933033049. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:53:09,958] Finished a trial resulted in value: 0.6625937012636202. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:53:25,631] Finished a trial resulted in value: 0.6537118954549249. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 15:53:42,781] Finished a trial resulted in value: 0.6671243398304865. Current best

[I 2019-04-02 16:01:35,889] Finished a trial resulted in value: 0.7051897493082673. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 16:01:43,756] Finished a trial resulted in value: 0.6850041214099772. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 16:01:49,242] Finished a trial resulted in value: 0.7620382512354136. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 16:01:58,703] Finished a trial resulted in value: 0.746036908779865. Current best value is 0.6314810944529718 with parameters: {'n_layers': 2, 'n_units_l0': 199.25078735241402, 'n_units_l1': 33.32414153966739}.
[I 2019-04-02 16:02:28,796] Finished a trial resulted in value: 0.757561070536608. Current best v

[I 2019-04-02 16:07:27,222] Finished a trial resulted in value: 3.5736347711974763. Current best value is 0.6220692612471702 with parameters: {'n_layers': 1, 'n_units_l0': 411.7338203518689}.
[I 2019-04-02 16:07:48,546] Finished a trial resulted in value: 0.6213386132054782. Current best value is 0.6213386132054782 with parameters: {'n_layers': 1, 'n_units_l0': 319.0851006146218}.
[I 2019-04-02 16:07:52,102] Finished a trial resulted in value: 0.8617475260535498. Current best value is 0.6213386132054782 with parameters: {'n_layers': 1, 'n_units_l0': 319.0851006146218}.
[I 2019-04-02 16:07:59,584] Finished a trial resulted in value: 0.8732462678206554. Current best value is 0.6213386132054782 with parameters: {'n_layers': 1, 'n_units_l0': 319.0851006146218}.
[I 2019-04-02 16:08:18,194] Finished a trial resulted in value: 0.7198834346969369. Current best value is 0.6213386132054782 with parameters: {'n_layers': 1, 'n_units_l0': 319.0851006146218}.
[I 2019-04-02 16:08:29,415] Finished a t

[I 2019-04-02 16:10:28,555] Finished a trial resulted in value: 0.8126493737879572. Current best value is 0.6213386132054782 with parameters: {'n_layers': 1, 'n_units_l0': 319.0851006146218}.
[I 2019-04-02 16:10:33,682] Finished a trial resulted in value: 0.7052973724637451. Current best value is 0.6213386132054782 with parameters: {'n_layers': 1, 'n_units_l0': 319.0851006146218}.


In [55]:
# Optimized

n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor((100))
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))



Test R2: 0.8416 ± 0.0117
Test MSE: 0.6465 ± 0.0478


### RF

In [62]:
#Default
n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    RF = RandomForestRegressor()
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))



Test R2: 0.7754 ± 0.0095
Test MSE: 0.9171 ± 0.0387


In [45]:
def objective_rf(trial):
    max_depth = int(trial.suggest_loguniform('max_depth', 2, 100))
    n_estimators = int(trial.suggest_loguniform('n_estimators', 2, 1000))
    max_features = trial.suggest_int('max_features', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        rf = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features,
                              min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        rf.fit(X_trn, y_trn)
        y_pred = rf.predict(X_val)
        score += mean_squared_error(y_val, y_pred)
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_rf, n_trials=100)

[I 2019-03-27 16:00:50,059] Finished a trial resulted in value: 1.5218836980010286. Current best value is 1.5218836980010286 with parameters: {'max_depth': 15.380039789241337, 'n_estimators': 147.90052040104513, 'max_features': 3, 'min_samples_split': 8, 'min_samples_leaf': 9}.
[I 2019-03-27 16:00:50,973] Finished a trial resulted in value: 1.4251916881928215. Current best value is 1.4251916881928215 with parameters: {'max_depth': 21.895441401945902, 'n_estimators': 88.38222724853792, 'max_features': 4, 'min_samples_split': 7, 'min_samples_leaf': 8}.
[I 2019-03-27 16:00:51,085] Finished a trial resulted in value: 1.348715189557296. Current best value is 1.348715189557296 with parameters: {'max_depth': 90.10057829792703, 'n_estimators': 7.645999324433651, 'max_features': 6, 'min_samples_split': 9, 'min_samples_leaf': 8}.
[I 2019-03-27 16:00:51,581] Finished a trial resulted in value: 1.8121345064864918. Current best value is 1.348715189557296 with parameters: {'max_depth': 90.1005782979

[I 2019-03-27 16:03:00,300] Finished a trial resulted in value: 1.2987897176560308. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.27709115303057, 'n_estimators': 40.736098033085035, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-27 16:03:00,637] Finished a trial resulted in value: 1.3344446829133374. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.27709115303057, 'n_estimators': 40.736098033085035, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-27 16:03:09,057] Finished a trial resulted in value: 1.1569104959044276. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.27709115303057, 'n_estimators': 40.736098033085035, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-27 16:03:10,472] Finished a trial resulted in value: 1.1966306525547787. Current best value is 1.0300176525975921 with parameters: {'max_depth': 26.2770911

In [47]:
# Optimized

n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    RF = RandomForestRegressor(max_depth=26, n_estimators=41, max_features=5,
                              min_samples_split=6, min_samples_leaf=4)
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))

Test R2: 0.7342 ± 0.0045
Test MSE: 1.0853 ± 0.0185
