# Performance comparaison

Delaney's aquaous solubility  

| Fingerprint | MLP R2| MLP RMSE| RF R2 |RF RMSE | 
|:-:|:-:|:-:| :-:| :-:|
|BERT on ChEMBL|0.842| - |0.797|-| 
|ECFP| 0.715 |- | 0.639 | -|
|Can2Can|0.642|-|0.618|-|
|Enum2Enum|0.676|-|0.640|-|
|Transformer|0.842|-|0.772|-|
|MPNN| 0.903 | 0.662 |-|-|
| Graph Convolution | 0.877 | 0.744|-|-|
| Weave |0.897 | 0.681|-|-|

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import torch
import optuna

from bert import BERT
from build_vocab import WordVocab
from utils import split

In [2]:
pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

In [3]:
df_train = pd.read_csv('data/ESOL_train.csv')
df_test = pd.read_csv('data/ESOL_test.csv')
df_train.head()

Unnamed: 0,SMILES,unknown,solubility,processed_smiles
0,C=CC1C2CCN(C1)C(C2)C(c1cc[nH0]c2ccc(cc12)OC)O,56-54-2,-3.37,C = C C 1 C 2 C C N ( C 1 ) C ( C 2 ) C ( c 1 ...
1,Brc1c(Br)cccc1,583-53-9,-3.5,Br c 1 c ( Br ) c c c c 1
2,c1c2cccc3c4c(cccc4)c(c23)cc1,206-44-0,-6.0,c 1 c 2 c c c c 3 c 4 c ( c c c c 4 ) c ( c 2 ...
3,c1c(C)c2Cc3c(cccc3)c2cc1,1730-37-6,-5.22,c 1 c ( C ) c 2 C c 3 c ( c c c c 3 ) c 2 c c 1
4,CCCC(C)(C)O,590-36-3,-0.49,C C C C ( C ) ( C ) O


In [4]:
x_train = [split(sm) for sm in df_train['SMILES']]
y_train = df_train['solubility']
x_test = [split(sm) for sm in df_test['SMILES']]
y_test = df_test['solubility']

In [5]:
vocab = WordVocab.load_vocab('data/vocab.pkl')

In [6]:
def get_inputs(sm):
    seq_len = 220
    ids = [vocab.stoi.get(token, unk_index) for token in sm.split()]
    ids = [sos_index] + ids + [eos_index]
    seg = [1]*len(ids)
    padding = [pad_index]*(seq_len - len(ids))
    ids.extend(padding), seg.extend(padding)
    
    return ids, seg

In [7]:
def get_array(x):
    x_id, x_seg = [], []
    for sm in x:
        a,b = get_inputs(sm)
        x_id.append(a)
        x_seg.append(b)
    return torch.tensor(x_id), torch.tensor(x_seg)
    
xid_train, xseg_train = get_array(x_train)
xid_test, xseg_test = get_array(x_test)

In [8]:
print(xid_test.shape)
print(xseg_test.shape)

torch.Size([129, 220])
torch.Size([129, 220])


# Encode to fingerprint

In [18]:
# model = BERT(len(vocab), hidden=256, n_layers=4, attn_heads=4, dropout=0)
# model.load_state_dict(torch.load('../result/GDB17/ep00_it020000.pkl'))
# model.eval()

In [28]:
# -b 16 -e 30 --hidden 256 -l 7 --n_head 4 --lr 1e-4 --lr-decay 5 --final-lr 0.01
model = BERT(len(vocab), hidden=256, n_layers=8, attn_heads=8, dropout=0)
model.load_state_dict(torch.load('../result/chembl_msm/it008000.pkl'))
model.eval()

BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(45, 256, padding_idx=0)
    (position): PositionalEmbedding()
    (segment): SegmentEmbedding(3, 256, padding_idx=0)
    (dropout): Dropout(p=0)
  )
  (transformer_blocks): ModuleList(
    (0): TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0): Linear(in_features=256, out_features=256, bias=True)
          (1): Linear(in_features=256, out_features=256, bias=True)
          (2): Linear(in_features=256, out_features=256, bias=True)
        )
        (output_linear): Linear(in_features=256, out_features=256, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=256, out_features=1024, bias=True)
        (w_2): Linear(in_features=1024, out_features=256, bias=True)
        (dropout): Dropout(p=0)
        (activation): GELU()
      )
      (input_sublayer): Sub

In [29]:
st,ed = 0,100
X_train = model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()
while ed<len(xid_train):
    st += 100
    ed += 100
    X_train = np.vstack([X_train, model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()])

In [30]:
st,ed = 0,100
X_test = model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()
while ed<len(xid_test):
    st += 100
    ed += 100
    X_test = np.vstack([X_test, model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()])

In [31]:
# X_train = np.hstack([np.mean(X_train, axis=1)[:,-512:-256], np.max(X_train, axis=1)[:,-512:-256], X_train[:,0,-512:-256] ])
# X_test = np.hstack([np.mean(X_test, axis=1)[:,-512:-256], np.max(X_test, axis=1)[:,-512:-256], X_test[:,0,-512:-256] ])
X_train = X_train[:,0,:]
X_test = X_test[:,0,:]
# X_train = np.mean(X_train, axis=1)
# X_test = np.mean(X_test, axis=1)
#y_train = df_train['solubility']
#y_test = df_test['solubility']

In [32]:
X_train.shape

(1161, 1024)

# Prediction
### MLP

In [33]:
# Default
n = 10
r2 = np.zeros(n)
rmse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor()
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    rmse[i] = mean_squared_error(y_test, y_pred)**0.5

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test RMSE: {:.4f} ± {:.4f}".format(np.mean(rmse), np.std(rmse)))



Test R2: 0.6359 ± 0.0439
Test RMSE: 1.1311 ± 0.0669


In [35]:
def objective_mlp(trial):
    n_layers = trial.suggest_int('n_layers', 1,3)
    layers = []
    for i in range(n_layers):
        n_units = int(trial.suggest_loguniform('n_units_l{}'.format(i), 1, 1000))
        layers.append(n_units)
    
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        mlp = MLPRegressor(hidden_layer_sizes=layers)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        mlp.fit(X_trn, y_trn)
        y_pred = mlp.predict(X_val)
        score += mean_squared_error(y_val, y_pred)**0.5
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_mlp, n_trials=100)

[I 2019-04-09 14:52:59,492] Finished a trial resulted in value: 0.9547148560167998. Current best value is 0.9547148560167998 with parameters: {'n_layers': 3, 'n_units_l0': 13.994330602458989, 'n_units_l1': 13.314157136091186, 'n_units_l2': 149.8621882773519}.
[I 2019-04-09 14:53:06,531] Finished a trial resulted in value: 2.0566061632697084. Current best value is 0.9547148560167998 with parameters: {'n_layers': 3, 'n_units_l0': 13.994330602458989, 'n_units_l1': 13.314157136091186, 'n_units_l2': 149.8621882773519}.
[I 2019-04-09 14:53:14,674] Finished a trial resulted in value: 1.0541698777181172. Current best value is 0.9547148560167998 with parameters: {'n_layers': 3, 'n_units_l0': 13.994330602458989, 'n_units_l1': 13.314157136091186, 'n_units_l2': 149.8621882773519}.
[I 2019-04-09 14:53:23,571] Finished a trial resulted in value: 0.959573111477458. Current best value is 0.9547148560167998 with parameters: {'n_layers': 3, 'n_units_l0': 13.994330602458989, 'n_units_l1': 13.314157136091

[I 2019-04-09 14:56:28,417] Finished a trial resulted in value: 1.9000587314043083. Current best value is 0.8900698736668329 with parameters: {'n_layers': 2, 'n_units_l0': 11.873958368640187, 'n_units_l1': 424.56134367562777}.
[I 2019-04-09 14:59:19,919] Finished a trial resulted in value: 3.027004203982276. Current best value is 0.8900698736668329 with parameters: {'n_layers': 2, 'n_units_l0': 11.873958368640187, 'n_units_l1': 424.56134367562777}.
[I 2019-04-09 14:59:25,012] Finished a trial resulted in value: 2.1408220123468533. Current best value is 0.8900698736668329 with parameters: {'n_layers': 2, 'n_units_l0': 11.873958368640187, 'n_units_l1': 424.56134367562777}.
[I 2019-04-09 14:59:48,100] Finished a trial resulted in value: 0.8999047522134882. Current best value is 0.8900698736668329 with parameters: {'n_layers': 2, 'n_units_l0': 11.873958368640187, 'n_units_l1': 424.56134367562777}.
[I 2019-04-09 15:00:02,809] Finished a trial resulted in value: 0.9285464961834614. Current b

[I 2019-04-09 15:09:25,276] Finished a trial resulted in value: 0.9355038141556224. Current best value is 0.8639864382139183 with parameters: {'n_layers': 2, 'n_units_l0': 52.71598428487625, 'n_units_l1': 362.97673611620104}.
[I 2019-04-09 15:09:33,653] Finished a trial resulted in value: 0.984209681019353. Current best value is 0.8639864382139183 with parameters: {'n_layers': 2, 'n_units_l0': 52.71598428487625, 'n_units_l1': 362.97673611620104}.
[I 2019-04-09 15:09:45,916] Finished a trial resulted in value: 1.4375844909731785. Current best value is 0.8639864382139183 with parameters: {'n_layers': 2, 'n_units_l0': 52.71598428487625, 'n_units_l1': 362.97673611620104}.
[I 2019-04-09 15:09:59,389] Finished a trial resulted in value: 0.9719293302547026. Current best value is 0.8639864382139183 with parameters: {'n_layers': 2, 'n_units_l0': 52.71598428487625, 'n_units_l1': 362.97673611620104}.
[I 2019-04-09 15:10:10,243] Finished a trial resulted in value: 0.909645244916985. Current best v

[I 2019-04-09 15:20:24,103] Finished a trial resulted in value: 1.4657272972643314. Current best value is 0.8362812636995722 with parameters: {'n_layers': 2, 'n_units_l0': 120.18576019570071, 'n_units_l1': 466.1906946243364}.
[I 2019-04-09 15:21:39,388] Finished a trial resulted in value: 2.3159573573223167. Current best value is 0.8362812636995722 with parameters: {'n_layers': 2, 'n_units_l0': 120.18576019570071, 'n_units_l1': 466.1906946243364}.
[I 2019-04-09 15:22:06,016] Finished a trial resulted in value: 0.951787349761438. Current best value is 0.8362812636995722 with parameters: {'n_layers': 2, 'n_units_l0': 120.18576019570071, 'n_units_l1': 466.1906946243364}.
[I 2019-04-09 15:22:24,843] Finished a trial resulted in value: 0.8631854185076593. Current best value is 0.8362812636995722 with parameters: {'n_layers': 2, 'n_units_l0': 120.18576019570071, 'n_units_l1': 466.1906946243364}.
[I 2019-04-09 15:22:42,856] Finished a trial resulted in value: 0.9205855575094471. Current best 

In [36]:
# Optimized

n = 10
r2 = np.zeros(n)
rmse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor((120,466))
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    rmse[i] = mean_squared_error(y_test, y_pred)**0.5

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test RMSE: {:.4f} ± {:.4f}".format(np.mean(rmse), np.std(rmse)))

Test R2: 0.7627 ± 0.0505
Test RMSE: 0.9096 ± 0.0960


### RF

In [30]:
#Default
n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    RF = RandomForestRegressor()
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))



Test R2: 0.7391 ± 0.0205
Test MSE: 0.9198 ± 0.0722


In [31]:
def objective_rf(trial):
    max_depth = int(trial.suggest_loguniform('max_depth', 2, 100))
    n_estimators = int(trial.suggest_loguniform('n_estimators', 2, 1000))
    max_features = trial.suggest_int('max_features', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        rf = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features,
                              min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        rf.fit(X_trn, y_trn)
        y_pred = rf.predict(X_val)
        score += mean_squared_error(y_val, y_pred)
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_rf, n_trials=100)

[I 2019-04-08 21:51:12,302] Finished a trial resulted in value: 1.081967059900994. Current best value is 1.081967059900994 with parameters: {'max_depth': 80.91560273968723, 'n_estimators': 6.834087692986803, 'max_features': 8, 'min_samples_split': 5, 'min_samples_leaf': 6}.
[I 2019-04-08 21:51:12,474] Finished a trial resulted in value: 1.1332188948688922. Current best value is 1.081967059900994 with parameters: {'max_depth': 80.91560273968723, 'n_estimators': 6.834087692986803, 'max_features': 8, 'min_samples_split': 5, 'min_samples_leaf': 6}.
[I 2019-04-08 21:51:13,806] Finished a trial resulted in value: 0.963376185436431. Current best value is 0.963376185436431 with parameters: {'max_depth': 11.843241737364552, 'n_estimators': 60.16827086733581, 'max_features': 9, 'min_samples_split': 10, 'min_samples_leaf': 5}.
[I 2019-04-08 21:51:13,887] Finished a trial resulted in value: 2.1024562434587355. Current best value is 0.963376185436431 with parameters: {'max_depth': 11.84324173736455

[I 2019-04-08 21:55:31,480] Finished a trial resulted in value: 1.2846315936432882. Current best value is 0.7752219808362674 with parameters: {'max_depth': 29.75897168245926, 'n_estimators': 120.74289975596233, 'max_features': 10, 'min_samples_split': 3, 'min_samples_leaf': 4}.
[I 2019-04-08 21:55:31,657] Finished a trial resulted in value: 1.2675906733684776. Current best value is 0.7752219808362674 with parameters: {'max_depth': 29.75897168245926, 'n_estimators': 120.74289975596233, 'max_features': 10, 'min_samples_split': 3, 'min_samples_leaf': 4}.
[I 2019-04-08 21:55:32,456] Finished a trial resulted in value: 1.108516719284507. Current best value is 0.7752219808362674 with parameters: {'max_depth': 29.75897168245926, 'n_estimators': 120.74289975596233, 'max_features': 10, 'min_samples_split': 3, 'min_samples_leaf': 4}.
[I 2019-04-08 21:55:39,997] Finished a trial resulted in value: 0.868256722867016. Current best value is 0.7752219808362674 with parameters: {'max_depth': 29.758971

In [47]:
# Optimized

n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    RF = RandomForestRegressor(max_depth=26, n_estimators=41, max_features=5,
                              min_samples_split=6, min_samples_leaf=4)
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))

Test R2: 0.7342 ± 0.0045
Test MSE: 1.0853 ± 0.0185
