# Performance comparaison

| Fingerprint | MLP | RF |  
|:-:|:-:|:-:|  
|BERT on GDB17_30 FIRST|0.278|0.668| 
|BERT MSM|0.784||  
|ECFP| 0.740 | 0.659 |
|Can2Can|0.7176||
|Enum2Enum|0.725||
|Transformer|0.862||
| NFP| 0.8845 |  |

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import torch
import optuna

from bert import BERT
from build_vocab import WordVocab
from utils import split

In [2]:
pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

In [3]:
df_train = pd.read_csv('data/sol_train.csv')
df_test = pd.read_csv('data/sol_test.csv')
df_train.head()

Unnamed: 0,SMILES,unknown,solubility,processed_smiles,spaced
0,[nH0]1c(SC)c2c([nH0]cc[nH0]2)[nH0]c1,6966-78-5,-2.36,[ n H 0 ] 1 c ( S C ) c 2 c ( [ n H 0 ] c c [ ...,[ n H 0 ] 1 c ( S C ) c 2 c ( [ n H 0 ] c c [ ...
1,CCC(C)Cl,78-86-4,-1.96,C C C ( C ) Cl,C C C ( C ) C l
2,O=C(NC(=O)c1ccccc1)c1ccccc1,614-28-8,-2.27,O = C ( N C ( = O ) c 1 c c c c c 1 ) c 1 c c ...,O = C ( N C ( = O ) c 1 c c c c c 1 ) c 1 c c ...
3,CC(C(C)(C)C)O,464-07-3,-0.62,C C ( C ( C ) ( C ) C ) O,C C ( C ( C ) ( C ) C ) O
4,[O-][N+](c1c(O)cccc1)=O,88-75-5,-1.74,[ O- ] [ N+ ] ( c 1 c ( O ) c c c c 1 ) = O,[ O - ] [ N + ] ( c 1 c ( O ) c c c c 1 ) = O


In [4]:
x_train = [split(sm) for sm in df_train['SMILES']]
y_train = df_train['solubility']
x_test = [split(sm) for sm in df_test['SMILES']]
y_test = df_test['solubility']

In [5]:
vocab = WordVocab.load_vocab('data/vocab.pkl')

In [6]:
def get_inputs(sm):
    seq_len = 220
    ids = [vocab.stoi.get(token, unk_index) for token in sm.split()]
    ids = [sos_index] + ids + [eos_index] + [eos_index]
    seg = [1]*len(ids)
    padding = [pad_index]*(seq_len - len(ids))
    ids.extend(padding), seg.extend(padding)
    
    return ids, seg

In [7]:
def get_array(x):
    x_id, x_seg = [], []
    for sm in x:
        a,b = get_inputs(sm)
        x_id.append(a)
        x_seg.append(b)
    return torch.tensor(x_id), torch.tensor(x_seg)
    
xid_train, xseg_train = get_array(x_train)
xid_test, xseg_test = get_array(x_test)

In [8]:
print(xid_test.shape)
print(xseg_test.shape)

torch.Size([322, 220])
torch.Size([322, 220])


# Encode to fingerprint

In [10]:
model = BERT(len(vocab), hidden=256, n_layers=4, attn_heads=4, dropout=0)
model.load_state_dict(torch.load('../result/GDB17_embed20/ep_09.pkl'))
model.eval()

BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(45, 256, padding_idx=0)
    (position): PositionalEmbedding()
    (segment): SegmentEmbedding(3, 256, padding_idx=0)
    (dropout): Dropout(p=0)
  )
  (transformer_blocks): ModuleList(
    (0): TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0): Linear(in_features=256, out_features=256, bias=True)
          (1): Linear(in_features=256, out_features=256, bias=True)
          (2): Linear(in_features=256, out_features=256, bias=True)
        )
        (output_linear): Linear(in_features=256, out_features=256, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=256, out_features=1024, bias=True)
        (w_2): Linear(in_features=1024, out_features=256, bias=True)
        (dropout): Dropout(p=0)
        (activation): GELU()
      )
      (input_sublayer): Sub

In [9]:
# -b 16 -e 30 --hidden 256 -l 7 --n_head 4 --lr 1e-4 --lr-decay 5 --final-lr 0.01
model = BERT(len(vocab), hidden=256, n_layers=7, attn_heads=4, dropout=0)
model.load_state_dict(torch.load('../result/chembl/ep_29.pkl'))
model.eval()

RuntimeError: Error(s) in loading state_dict for BERT:
	size mismatch for embedding.position.pe: copying a param with shape torch.Size([1, 512, 256]) from checkpoint, the shape in current model is torch.Size([1, 220, 256]).

In [11]:
del df_test, df_train, x_train, x_test

In [12]:
st,ed = 0,100
X_train = model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()
while ed<len(xid_train):
    st += 100
    ed += 100
    X_train = np.vstack([X_train, model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()])

In [13]:
st,ed = 0,100
X_test = model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()
while ed<len(xid_test):
    st += 100
    ed += 100
    X_test = np.vstack([X_test, model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()])

In [14]:
X_train.shape

(968, 220, 512)

In [15]:
# X_train = np.mean(X_train, axis=1)
# X_test = np.mean(X_test, axis=1)
X_train = X_train[:,0,:]
X_test = X_test[:,0,:]
#y_train = df_train['solubility']
#y_test = df_test['solubility']

# Prediction
### MLP

In [16]:
# Default
n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor()
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))

Test R2: 0.2623 ± 0.0470
Test MSE: 3.0118 ± 0.1917


In [17]:
def objective_mlp(trial):
    n_layers = trial.suggest_int('n_layers', 1,3)
    layers = []
    for i in range(n_layers):
        n_units = int(trial.suggest_loguniform('n_units_l{}'.format(i), 1, 1000))
        layers.append(n_units)
    
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        mlp = MLPRegressor(hidden_layer_sizes=layers)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        mlp.fit(X_trn, y_trn)
        y_pred = mlp.predict(X_val)
        score += mean_squared_error(y_val, y_pred)
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_mlp, n_trials=100)

[I 2019-03-27 12:51:56,490] Finished a trial resulted in value: 3.9256684188152144. Current best value is 3.9256684188152144 with parameters: {'n_layers': 1, 'n_units_l0': 5.480629971111014}.
[I 2019-03-27 12:51:59,016] Finished a trial resulted in value: 3.7524683326132813. Current best value is 3.7524683326132813 with parameters: {'n_layers': 2, 'n_units_l0': 3.9368491313706793, 'n_units_l1': 6.536836999182889}.
[I 2019-03-27 12:52:16,463] Finished a trial resulted in value: 2.9943524227140372. Current best value is 2.9943524227140372 with parameters: {'n_layers': 1, 'n_units_l0': 531.787693605594}.
[I 2019-03-27 12:52:19,991] Finished a trial resulted in value: 2.9013518957118545. Current best value is 2.9013518957118545 with parameters: {'n_layers': 3, 'n_units_l0': 8.356904904140531, 'n_units_l1': 6.695005478441894, 'n_units_l2': 91.60787247486093}.
[I 2019-03-27 12:52:31,905] Finished a trial resulted in value: 4.872801503656569. Current best value is 2.9013518957118545 with para

[I 2019-03-27 12:53:39,825] Finished a trial resulted in value: 6.228817863581016. Current best value is 2.9013518957118545 with parameters: {'n_layers': 3, 'n_units_l0': 8.356904904140531, 'n_units_l1': 6.695005478441894, 'n_units_l2': 91.60787247486093}.
[I 2019-03-27 12:53:44,215] Finished a trial resulted in value: 3.077280878260577. Current best value is 2.9013518957118545 with parameters: {'n_layers': 3, 'n_units_l0': 8.356904904140531, 'n_units_l1': 6.695005478441894, 'n_units_l2': 91.60787247486093}.
[I 2019-03-27 12:53:55,187] Finished a trial resulted in value: 3.2241218960290876. Current best value is 2.9013518957118545 with parameters: {'n_layers': 3, 'n_units_l0': 8.356904904140531, 'n_units_l1': 6.695005478441894, 'n_units_l2': 91.60787247486093}.
[I 2019-03-27 12:54:06,357] Finished a trial resulted in value: 3.2871861498569026. Current best value is 2.9013518957118545 with parameters: {'n_layers': 3, 'n_units_l0': 8.356904904140531, 'n_units_l1': 6.695005478441894, 'n_u

[I 2019-03-27 12:54:44,928] Finished a trial resulted in value: 3.0150933835007745. Current best value is 2.9013518957118545 with parameters: {'n_layers': 3, 'n_units_l0': 8.356904904140531, 'n_units_l1': 6.695005478441894, 'n_units_l2': 91.60787247486093}.
[I 2019-03-27 12:54:46,367] Finished a trial resulted in value: 5.454904404604364. Current best value is 2.9013518957118545 with parameters: {'n_layers': 3, 'n_units_l0': 8.356904904140531, 'n_units_l1': 6.695005478441894, 'n_units_l2': 91.60787247486093}.
[I 2019-03-27 12:54:49,405] Finished a trial resulted in value: 6.708878077420245. Current best value is 2.9013518957118545 with parameters: {'n_layers': 3, 'n_units_l0': 8.356904904140531, 'n_units_l1': 6.695005478441894, 'n_units_l2': 91.60787247486093}.
[I 2019-03-27 12:54:52,221] Finished a trial resulted in value: 3.14097592982358. Current best value is 2.9013518957118545 with parameters: {'n_layers': 3, 'n_units_l0': 8.356904904140531, 'n_units_l1': 6.695005478441894, 'n_uni

[I 2019-03-27 12:56:14,788] Finished a trial resulted in value: 6.229836148112872. Current best value is 2.9010149384597455 with parameters: {'n_layers': 2, 'n_units_l0': 27.16518371446747, 'n_units_l1': 33.25709946798767}.
[I 2019-03-27 12:56:24,293] Finished a trial resulted in value: 3.3012723909796864. Current best value is 2.9010149384597455 with parameters: {'n_layers': 2, 'n_units_l0': 27.16518371446747, 'n_units_l1': 33.25709946798767}.
[I 2019-03-27 12:56:29,198] Finished a trial resulted in value: 6.463173844573573. Current best value is 2.9010149384597455 with parameters: {'n_layers': 2, 'n_units_l0': 27.16518371446747, 'n_units_l1': 33.25709946798767}.
[I 2019-03-27 12:56:32,918] Finished a trial resulted in value: 3.082468927118054. Current best value is 2.9010149384597455 with parameters: {'n_layers': 2, 'n_units_l0': 27.16518371446747, 'n_units_l1': 33.25709946798767}.
[I 2019-03-27 12:56:35,478] Finished a trial resulted in value: 3.1590927237448008. Current best value 

[I 2019-03-27 12:59:05,575] Finished a trial resulted in value: 3.38557405590739. Current best value is 2.776295649090273 with parameters: {'n_layers': 2, 'n_units_l0': 4.952259299483751, 'n_units_l1': 991.4548756639401}.
[I 2019-03-27 12:59:07,891] Finished a trial resulted in value: 3.37788013413828. Current best value is 2.776295649090273 with parameters: {'n_layers': 2, 'n_units_l0': 4.952259299483751, 'n_units_l1': 991.4548756639401}.
[I 2019-03-27 12:59:09,566] Finished a trial resulted in value: 3.9389376689979256. Current best value is 2.776295649090273 with parameters: {'n_layers': 2, 'n_units_l0': 4.952259299483751, 'n_units_l1': 991.4548756639401}.
[I 2019-03-27 12:59:13,183] Finished a trial resulted in value: 2.985882483477571. Current best value is 2.776295649090273 with parameters: {'n_layers': 2, 'n_units_l0': 4.952259299483751, 'n_units_l1': 991.4548756639401}.
[I 2019-03-27 12:59:18,488] Finished a trial resulted in value: 3.0429487420205072. Current best value is 2.7

[I 2019-03-27 13:01:20,452] Finished a trial resulted in value: 7.151908473803012. Current best value is 2.776295649090273 with parameters: {'n_layers': 2, 'n_units_l0': 4.952259299483751, 'n_units_l1': 991.4548756639401}.
[I 2019-03-27 13:01:26,069] Finished a trial resulted in value: 3.18821781823379. Current best value is 2.776295649090273 with parameters: {'n_layers': 2, 'n_units_l0': 4.952259299483751, 'n_units_l1': 991.4548756639401}.
[I 2019-03-27 13:01:29,456] Finished a trial resulted in value: 3.580537682677979. Current best value is 2.776295649090273 with parameters: {'n_layers': 2, 'n_units_l0': 4.952259299483751, 'n_units_l1': 991.4548756639401}.
[I 2019-03-27 13:01:33,059] Finished a trial resulted in value: 2.8596867511946513. Current best value is 2.776295649090273 with parameters: {'n_layers': 2, 'n_units_l0': 4.952259299483751, 'n_units_l1': 991.4548756639401}.
[I 2019-03-27 13:01:37,250] Finished a trial resulted in value: 3.0546023468510346. Current best value is 2.

In [18]:
# Optimized

n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor((5, 991))
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))

Test R2: 0.1948 ± 0.1323
Test MSE: 3.2874 ± 0.5402


### RF

In [19]:
#Default
n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    RF = RandomForestRegressor()
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))



Test R2: 0.6540 ± 0.0206
Test MSE: 1.4126 ± 0.0842


In [20]:
def objective_rf(trial):
    max_depth = int(trial.suggest_loguniform('max_depth', 2, 100))
    n_estimators = int(trial.suggest_loguniform('n_estimators', 2, 1000))
    max_features = trial.suggest_int('max_features', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        rf = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features,
                              min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        rf.fit(X_trn, y_trn)
        y_pred = rf.predict(X_val)
        score += mean_squared_error(y_val, y_pred)
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_rf, n_trials=100)

[I 2019-03-27 13:13:44,239] Finished a trial resulted in value: 1.9858887117561346. Current best value is 1.9858887117561346 with parameters: {'max_depth': 8.805167701303105, 'n_estimators': 12.484846460673685, 'max_features': 4, 'min_samples_split': 9, 'min_samples_leaf': 7}.
[I 2019-03-27 13:13:48,196] Finished a trial resulted in value: 2.4970896879382862. Current best value is 1.9858887117561346 with parameters: {'max_depth': 8.805167701303105, 'n_estimators': 12.484846460673685, 'max_features': 4, 'min_samples_split': 9, 'min_samples_leaf': 7}.
[I 2019-03-27 13:14:01,168] Finished a trial resulted in value: 1.4760623942197728. Current best value is 1.4760623942197728 with parameters: {'max_depth': 30.722686935078347, 'n_estimators': 776.6583629827223, 'max_features': 7, 'min_samples_split': 9, 'min_samples_leaf': 3}.
[I 2019-03-27 13:14:13,953] Finished a trial resulted in value: 1.2997838328997737. Current best value is 1.2997838328997737 with parameters: {'max_depth': 18.5498115

[I 2019-03-27 13:15:42,703] Finished a trial resulted in value: 1.6467039225180407. Current best value is 1.2997838328997737 with parameters: {'max_depth': 18.54981159094242, 'n_estimators': 710.7957377150005, 'max_features': 8, 'min_samples_split': 9, 'min_samples_leaf': 3}.
[I 2019-03-27 13:15:46,547] Finished a trial resulted in value: 1.450441934986444. Current best value is 1.2997838328997737 with parameters: {'max_depth': 18.54981159094242, 'n_estimators': 710.7957377150005, 'max_features': 8, 'min_samples_split': 9, 'min_samples_leaf': 3}.
[I 2019-03-27 13:15:49,147] Finished a trial resulted in value: 1.5373220315419398. Current best value is 1.2997838328997737 with parameters: {'max_depth': 18.54981159094242, 'n_estimators': 710.7957377150005, 'max_features': 8, 'min_samples_split': 9, 'min_samples_leaf': 3}.
[I 2019-03-27 13:15:53,668] Finished a trial resulted in value: 2.792963148999718. Current best value is 1.2997838328997737 with parameters: {'max_depth': 18.549811590942

[I 2019-03-27 13:17:39,473] Finished a trial resulted in value: 1.6384744048694815. Current best value is 1.2997838328997737 with parameters: {'max_depth': 18.54981159094242, 'n_estimators': 710.7957377150005, 'max_features': 8, 'min_samples_split': 9, 'min_samples_leaf': 3}.
[I 2019-03-27 13:17:44,829] Finished a trial resulted in value: 2.6484743419545325. Current best value is 1.2997838328997737 with parameters: {'max_depth': 18.54981159094242, 'n_estimators': 710.7957377150005, 'max_features': 8, 'min_samples_split': 9, 'min_samples_leaf': 3}.
[I 2019-03-27 13:18:00,552] Finished a trial resulted in value: 1.371448509095101. Current best value is 1.2997838328997737 with parameters: {'max_depth': 18.54981159094242, 'n_estimators': 710.7957377150005, 'max_features': 8, 'min_samples_split': 9, 'min_samples_leaf': 3}.
[I 2019-03-27 13:18:04,343] Finished a trial resulted in value: 1.606727048046625. Current best value is 1.2997838328997737 with parameters: {'max_depth': 18.549811590942

[I 2019-03-27 13:20:50,837] Finished a trial resulted in value: 1.624694120209438. Current best value is 1.286971368503684 with parameters: {'max_depth': 41.11523186638494, 'n_estimators': 389.5991272932358, 'max_features': 9, 'min_samples_split': 6, 'min_samples_leaf': 2}.
[I 2019-03-27 13:21:03,735] Finished a trial resulted in value: 1.6620758906737343. Current best value is 1.286971368503684 with parameters: {'max_depth': 41.11523186638494, 'n_estimators': 389.5991272932358, 'max_features': 9, 'min_samples_split': 6, 'min_samples_leaf': 2}.
[I 2019-03-27 13:21:11,329] Finished a trial resulted in value: 1.5093046967457309. Current best value is 1.286971368503684 with parameters: {'max_depth': 41.11523186638494, 'n_estimators': 389.5991272932358, 'max_features': 9, 'min_samples_split': 6, 'min_samples_leaf': 2}.
[I 2019-03-27 13:21:12,425] Finished a trial resulted in value: 1.451151120841423. Current best value is 1.286971368503684 with parameters: {'max_depth': 41.11523186638494, 

In [25]:
# Optimized

n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    RF = RandomForestRegressor(max_depth=99, n_estimators=69, max_features=8,
                              min_samples_split=7, min_samples_leaf=3)
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))

Test R2: 0.6679 ± 0.0064
Test MSE: 1.3556 ± 0.0262
