# Performance comparaison

| Fingerprint | MLP | RF |  
|:-:|:-:|:-:|  
|BERT on GDB17|0.214|0.655| 
|BERT MSM|0.784||  
|ECFP| 0.740 | 0.659 |
|Can2Can|0.7176||
|Enum2Enum|0.725||
|Transformer|0.862||
| NFP| 0.8845 |  |

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import torch
import optuna

from bert import BERT
from build_vocab import WordVocab
from utils import split

In [2]:
pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

In [3]:
df_train = pd.read_csv('data/sol_train.csv')
df_test = pd.read_csv('data/sol_test.csv')
df_train.head()

Unnamed: 0,SMILES,unknown,solubility,processed_smiles,spaced
0,[nH0]1c(SC)c2c([nH0]cc[nH0]2)[nH0]c1,6966-78-5,-2.36,[ n H 0 ] 1 c ( S C ) c 2 c ( [ n H 0 ] c c [ ...,[ n H 0 ] 1 c ( S C ) c 2 c ( [ n H 0 ] c c [ ...
1,CCC(C)Cl,78-86-4,-1.96,C C C ( C ) Cl,C C C ( C ) C l
2,O=C(NC(=O)c1ccccc1)c1ccccc1,614-28-8,-2.27,O = C ( N C ( = O ) c 1 c c c c c 1 ) c 1 c c ...,O = C ( N C ( = O ) c 1 c c c c c 1 ) c 1 c c ...
3,CC(C(C)(C)C)O,464-07-3,-0.62,C C ( C ( C ) ( C ) C ) O,C C ( C ( C ) ( C ) C ) O
4,[O-][N+](c1c(O)cccc1)=O,88-75-5,-1.74,[ O- ] [ N+ ] ( c 1 c ( O ) c c c c 1 ) = O,[ O - ] [ N + ] ( c 1 c ( O ) c c c c 1 ) = O


In [4]:
x_train = [split(sm) for sm in df_train['SMILES']]
y_train = df_train['solubility']
x_test = [split(sm) for sm in df_test['SMILES']]
y_test = df_test['solubility']

In [5]:
vocab = WordVocab.load_vocab('data/vocab.pkl')

In [6]:
def get_inputs(sm):
    seq_len = 220
    ids = [vocab.stoi.get(token, unk_index) for token in sm.split()]
    ids = [sos_index] + ids + [eos_index] + [eos_index]
    seg = [1]*len(ids)
    padding = [pad_index]*(seq_len - len(ids))
    ids.extend(padding), seg.extend(padding)
    
    return ids, seg

In [7]:
def get_array(x):
    x_id, x_seg = [], []
    for sm in x:
        a,b = get_inputs(sm)
        x_id.append(a)
        x_seg.append(b)
    return torch.tensor(x_id), torch.tensor(x_seg)
    
xid_train, xseg_train = get_array(x_train)
xid_test, xseg_test = get_array(x_test)

In [8]:
print(xid_test.shape)
print(xseg_test.shape)

torch.Size([322, 220])
torch.Size([322, 220])


# Encode to fingerprint

In [32]:
# model = BERT(len(vocab), hidden=256, n_layers=4, attn_heads=4, dropout=0)
# model.load_state_dict(torch.load('../result/GDB17_embed20/ep_29.pkl'))
# model.eval()

BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(45, 256, padding_idx=0)
    (position): PositionalEmbedding()
    (segment): SegmentEmbedding(3, 256, padding_idx=0)
    (dropout): Dropout(p=0)
  )
  (transformer_blocks): ModuleList(
    (0): TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0): Linear(in_features=256, out_features=256, bias=True)
          (1): Linear(in_features=256, out_features=256, bias=True)
          (2): Linear(in_features=256, out_features=256, bias=True)
        )
        (output_linear): Linear(in_features=256, out_features=256, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=256, out_features=1024, bias=True)
        (w_2): Linear(in_features=1024, out_features=256, bias=True)
        (dropout): Dropout(p=0)
        (activation): GELU()
      )
      (input_sublayer): Sub

In [9]:
model = BERT(len(vocab), hidden=256, n_layers=7, attn_heads=4, dropout=0)
model.load_state_dict(torch.load('../result/chembl/ep_29.pkl'))
model.eval()

RuntimeError: Error(s) in loading state_dict for BERT:
	size mismatch for embedding.position.pe: copying a param with shape torch.Size([1, 512, 256]) from checkpoint, the shape in current model is torch.Size([1, 220, 256]).

In [11]:
del df_test, df_train, x_train, x_test

In [12]:
st,ed = 0,100
X_train = model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()
while ed<len(xid_train):
    st += 100
    ed += 100
    X_train = np.vstack([X_train, model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()])

In [13]:
st,ed = 0,100
X_test = model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()
while ed<len(xid_test):
    st += 100
    ed += 100
    X_test = np.vstack([X_test, model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()])

In [14]:
X_train.shape

(968, 220, 512)

In [15]:
# X_train = np.mean(X_train, axis=1)
# X_test = np.mean(X_test, axis=1)
X_train = X_train[:,0,:]
X_test = X_test[:,0,:]
#y_train = df_train['solubility']
#y_test = df_test['solubility']

# Prediction
### MLP

In [27]:
# Default
mse = 0
r2 = 0
n = 10

for _ in range(n):
    MLP = MLPRegressor()
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2 += r2_score(y_test, y_pred)
    mse += mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f}".format(r2/n))
print("Test MSE: {:.4f}".format(mse/n))

Test R2: 0.2714
Test MSE: 2.9747


In [17]:
def objective_mlp(trial):
    n_layers = trial.suggest_int('n_layers', 1,3)
    layers = []
    for i in range(n_layers):
        n_units = int(trial.suggest_loguniform('n_units_l{}'.format(i), 4, 1000))
        layers.append(n_units)
    
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        mlp = MLPRegressor(hidden_layer_sizes=layers)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        mlp.fit(X_trn, y_trn)
        y_pred = mlp.predict(X_val)
        score += mean_squared_error(y_val, y_pred)
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_mlp, n_trials=100)

[I 2019-03-25 19:48:41,781] Finished a trial resulted in value: 3.294587534860959. Current best value is 3.294587534860959 with parameters: {'n_layers': 1, 'n_units_l0': 21.548070178061735}.
[I 2019-03-25 19:48:48,786] Finished a trial resulted in value: 3.346919251293162. Current best value is 3.294587534860959 with parameters: {'n_layers': 1, 'n_units_l0': 21.548070178061735}.
[I 2019-03-25 19:49:06,497] Finished a trial resulted in value: 3.4249313592484554. Current best value is 3.294587534860959 with parameters: {'n_layers': 1, 'n_units_l0': 21.548070178061735}.
[I 2019-03-25 19:49:20,443] Finished a trial resulted in value: 3.210218557039477. Current best value is 3.210218557039477 with parameters: {'n_layers': 3, 'n_units_l0': 18.20941407884273, 'n_units_l1': 36.34818064821991, 'n_units_l2': 305.77959014321476}.
[I 2019-03-25 19:49:31,757] Finished a trial resulted in value: 3.067322517603933. Current best value is 3.067322517603933 with parameters: {'n_layers': 2, 'n_units_l0':

[I 2019-03-25 19:56:03,943] Finished a trial resulted in value: 3.4075579143710337. Current best value is 2.9154720642441094 with parameters: {'n_layers': 3, 'n_units_l0': 39.01260510723468, 'n_units_l1': 7.167384526782892, 'n_units_l2': 10.735430377407239}.
[I 2019-03-25 19:56:13,751] Finished a trial resulted in value: 5.558386902000768. Current best value is 2.9154720642441094 with parameters: {'n_layers': 3, 'n_units_l0': 39.01260510723468, 'n_units_l1': 7.167384526782892, 'n_units_l2': 10.735430377407239}.
[I 2019-03-25 19:56:28,202] Finished a trial resulted in value: 4.6013758518671475. Current best value is 2.9154720642441094 with parameters: {'n_layers': 3, 'n_units_l0': 39.01260510723468, 'n_units_l1': 7.167384526782892, 'n_units_l2': 10.735430377407239}.
[I 2019-03-25 19:56:42,420] Finished a trial resulted in value: 3.2327268766807578. Current best value is 2.9154720642441094 with parameters: {'n_layers': 3, 'n_units_l0': 39.01260510723468, 'n_units_l1': 7.167384526782892, 

[I 2019-03-25 20:01:33,255] Finished a trial resulted in value: 3.003129973970149. Current best value is 2.7681550104473565 with parameters: {'n_layers': 2, 'n_units_l0': 533.8938559801433, 'n_units_l1': 24.776650285968895}.
[I 2019-03-25 20:02:02,365] Finished a trial resulted in value: 3.555589318935312. Current best value is 2.7681550104473565 with parameters: {'n_layers': 2, 'n_units_l0': 533.8938559801433, 'n_units_l1': 24.776650285968895}.
[I 2019-03-25 20:03:33,255] Finished a trial resulted in value: 3.52852278192413. Current best value is 2.7681550104473565 with parameters: {'n_layers': 2, 'n_units_l0': 533.8938559801433, 'n_units_l1': 24.776650285968895}.
[I 2019-03-25 20:04:34,155] Finished a trial resulted in value: 2.837711386456215. Current best value is 2.7681550104473565 with parameters: {'n_layers': 2, 'n_units_l0': 533.8938559801433, 'n_units_l1': 24.776650285968895}.
[I 2019-03-25 20:05:21,668] Finished a trial resulted in value: 2.9907613581773123. Current best valu

[I 2019-03-25 20:18:05,462] Finished a trial resulted in value: 3.944210662106177. Current best value is 2.7681550104473565 with parameters: {'n_layers': 2, 'n_units_l0': 533.8938559801433, 'n_units_l1': 24.776650285968895}.
[I 2019-03-25 20:19:12,448] Finished a trial resulted in value: 2.9291225844654956. Current best value is 2.7681550104473565 with parameters: {'n_layers': 2, 'n_units_l0': 533.8938559801433, 'n_units_l1': 24.776650285968895}.
[I 2019-03-25 20:19:19,044] Finished a trial resulted in value: 3.126356485493972. Current best value is 2.7681550104473565 with parameters: {'n_layers': 2, 'n_units_l0': 533.8938559801433, 'n_units_l1': 24.776650285968895}.
[I 2019-03-25 20:19:57,626] Finished a trial resulted in value: 2.910526769398953. Current best value is 2.7681550104473565 with parameters: {'n_layers': 2, 'n_units_l0': 533.8938559801433, 'n_units_l1': 24.776650285968895}.
[I 2019-03-25 20:20:24,935] Finished a trial resulted in value: 3.575375830638615. Current best val

In [28]:
# Optimized

mse = 0
r2 = 0
n = 10

for _ in range(n):
    MLP = MLPRegressor((240, 17, 267))
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2 += r2_score(y_test, y_pred)
    mse += mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f}".format(r2/n))
print("Test MSE: {:.4f}".format(mse/n))

Test R2: 0.2135
Test MSE: 3.2111


### RF

In [30]:
#Default
mse = 0
r2 = 0
n = 10

for _ in range(n):
    RF = RandomForestRegressor()
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2 += r2_score(y_test, y_pred)
    mse += mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f}".format(r2/n))
print("Test MSE: {:.4f}".format(mse/n))



Test R2: 0.6477
Test MSE: 1.4384


In [24]:
def objective_rf(trial):
    max_depth = int(trial.suggest_loguniform('max_depth', 2, 100))
    n_estimators = int(trial.suggest_loguniform('n_estimators', 2, 1000))
    max_features = trial.suggest_int('max_features', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        rf = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features,
                              min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        rf.fit(X_trn, y_trn)
        y_pred = rf.predict(X_val)
        score += mean_squared_error(y_val, y_pred)
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_rf, n_trials=100)

[I 2019-03-25 20:31:49,668] Finished a trial resulted in value: 2.080513543423525. Current best value is 2.080513543423525 with parameters: {'max_depth': 3.9736566109412204, 'n_estimators': 42.187951451281585, 'max_features': 9, 'min_samples_split': 8, 'min_samples_leaf': 4}.
[I 2019-03-25 20:31:49,711] Finished a trial resulted in value: 2.392552191739588. Current best value is 2.080513543423525 with parameters: {'max_depth': 3.9736566109412204, 'n_estimators': 42.187951451281585, 'max_features': 9, 'min_samples_split': 8, 'min_samples_leaf': 4}.
[I 2019-03-25 20:31:49,949] Finished a trial resulted in value: 1.6618575213509814. Current best value is 1.6618575213509814 with parameters: {'max_depth': 32.507252783445615, 'n_estimators': 15.582863255659985, 'max_features': 6, 'min_samples_split': 10, 'min_samples_leaf': 4}.
[I 2019-03-25 20:31:50,018] Finished a trial resulted in value: 2.7467569968798466. Current best value is 1.6618575213509814 with parameters: {'max_depth': 32.5072527

[I 2019-03-25 20:33:25,765] Finished a trial resulted in value: 1.4599064147217689. Current best value is 1.3139396675163089 with parameters: {'max_depth': 51.02816455307841, 'n_estimators': 242.85367871258023, 'max_features': 8, 'min_samples_split': 7, 'min_samples_leaf': 1}.
[I 2019-03-25 20:33:35,819] Finished a trial resulted in value: 1.6326428241899742. Current best value is 1.3139396675163089 with parameters: {'max_depth': 51.02816455307841, 'n_estimators': 242.85367871258023, 'max_features': 8, 'min_samples_split': 7, 'min_samples_leaf': 1}.
[I 2019-03-25 20:33:36,377] Finished a trial resulted in value: 1.430045729629065. Current best value is 1.3139396675163089 with parameters: {'max_depth': 51.02816455307841, 'n_estimators': 242.85367871258023, 'max_features': 8, 'min_samples_split': 7, 'min_samples_leaf': 1}.
[I 2019-03-25 20:33:39,014] Finished a trial resulted in value: 1.3774127358088526. Current best value is 1.3139396675163089 with parameters: {'max_depth': 51.02816455

[I 2019-03-25 20:35:10,294] Finished a trial resulted in value: 1.6918122856369404. Current best value is 1.2851880098694446 with parameters: {'max_depth': 28.384584800685214, 'n_estimators': 569.0131803934628, 'max_features': 5, 'min_samples_split': 7, 'min_samples_leaf': 1}.
[I 2019-03-25 20:35:15,406] Finished a trial resulted in value: 1.7118526688274596. Current best value is 1.2851880098694446 with parameters: {'max_depth': 28.384584800685214, 'n_estimators': 569.0131803934628, 'max_features': 5, 'min_samples_split': 7, 'min_samples_leaf': 1}.
[I 2019-03-25 20:35:18,988] Finished a trial resulted in value: 1.553454153835395. Current best value is 1.2851880098694446 with parameters: {'max_depth': 28.384584800685214, 'n_estimators': 569.0131803934628, 'max_features': 5, 'min_samples_split': 7, 'min_samples_leaf': 1}.
[I 2019-03-25 20:35:23,843] Finished a trial resulted in value: 1.8344136664507453. Current best value is 1.2851880098694446 with parameters: {'max_depth': 28.38458480

[I 2019-03-25 20:36:41,879] Finished a trial resulted in value: 1.5962131382114821. Current best value is 1.2842948017849365 with parameters: {'max_depth': 68.0941085313078, 'n_estimators': 448.96558455530123, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-25 20:36:43,208] Finished a trial resulted in value: 1.6800950397749421. Current best value is 1.2842948017849365 with parameters: {'max_depth': 68.0941085313078, 'n_estimators': 448.96558455530123, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-25 20:36:43,608] Finished a trial resulted in value: 1.737747295394195. Current best value is 1.2842948017849365 with parameters: {'max_depth': 68.0941085313078, 'n_estimators': 448.96558455530123, 'max_features': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}.
[I 2019-03-25 20:36:46,037] Finished a trial resulted in value: 1.8236396996031397. Current best value is 1.2842948017849365 with parameters: {'max_depth': 68.09410853130

In [29]:
# Optimized

mse = 0
r2 = 0
n = 10

for _ in range(n):
    RF = RandomForestRegressor(max_depth=68, n_estimators=449, max_features=5,
                              min_samples_split=6, min_samples_leaf=4)
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2 += r2_score(y_test, y_pred)
    mse += mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f}".format(r2/n))
print("Test MSE: {:.4f}".format(mse/n))

Test R2: 0.6551
Test MSE: 1.4081
