# Performance comparaison

Delaney's aquaous solubility  

| Fingerprint | MLP R2| MLP RMSE| RF R2 |RF RMSE | 
|:-:|:-:|:-:| :-:| :-:|
|BERT on ChEMBL|0.842| - |0.797|-| 
|ECFP| 0.715 |- | 0.639 | -|
|Can2Can|0.642|-|0.618|-|
|Enum2Enum|0.676|-|0.640|-|
|Transformer|0.842|-|0.772|-|
|MPNN| 0.903 | 0.662 |-|-|
| Graph Convolution | 0.877 | 0.744|-|-|
| Weave |0.897 | 0.681|-|-|

In [43]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import torch
import optuna
import xgboost

from bert import BERT
from build_vocab import WordVocab
from utils import split

In [2]:
pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

In [3]:
df_train = pd.read_csv('data/ESOL_train.csv')
df_test = pd.read_csv('data/ESOL_test.csv')
df_train.head()

Unnamed: 0,SMILES,unknown,solubility,processed_smiles
0,C=CC1C2CCN(C1)C(C2)C(c1cc[nH0]c2ccc(cc12)OC)O,56-54-2,-3.37,C = C C 1 C 2 C C N ( C 1 ) C ( C 2 ) C ( c 1 ...
1,Brc1c(Br)cccc1,583-53-9,-3.5,Br c 1 c ( Br ) c c c c 1
2,c1c2cccc3c4c(cccc4)c(c23)cc1,206-44-0,-6.0,c 1 c 2 c c c c 3 c 4 c ( c c c c 4 ) c ( c 2 ...
3,c1c(C)c2Cc3c(cccc3)c2cc1,1730-37-6,-5.22,c 1 c ( C ) c 2 C c 3 c ( c c c c 3 ) c 2 c c 1
4,CCCC(C)(C)O,590-36-3,-0.49,C C C C ( C ) ( C ) O


In [4]:
x_train = [split(sm) for sm in df_train['SMILES']]
y_train = df_train['solubility']
x_test = [split(sm) for sm in df_test['SMILES']]
y_test = df_test['solubility']

In [5]:
vocab = WordVocab.load_vocab('data/vocab.pkl')

In [6]:
def get_inputs(sm):
    seq_len = 220
    ids = [vocab.stoi.get(token, unk_index) for token in sm.split()]
    ids = [sos_index] + ids + [eos_index]
    seg = [1]*len(ids)
    padding = [pad_index]*(seq_len - len(ids))
    ids.extend(padding), seg.extend(padding)
    
    return ids, seg

In [7]:
def get_array(x):
    x_id, x_seg = [], []
    for sm in x:
        a,b = get_inputs(sm)
        x_id.append(a)
        x_seg.append(b)
    return torch.tensor(x_id), torch.tensor(x_seg)
    
xid_train, xseg_train = get_array(x_train)
xid_test, xseg_test = get_array(x_test)

In [8]:
print(xid_test.shape)
print(xseg_test.shape)

torch.Size([129, 220])
torch.Size([129, 220])


# Encode to fingerprint

../result/chembl_msm_256/it080000.pkl  
8 layers 8 heads  
MLP: 0.7820 | 0.8751  
XGB: 0.7915 | 0.8574

In [48]:
model = BERT(len(vocab), hidden=256, n_layers=8, attn_heads=8, dropout=0)
model.load_state_dict(torch.load('../result/chembl/ep00_it010000.pkl'))
model.eval()

BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(45, 256, padding_idx=0)
    (position): PositionalEmbedding()
    (segment): SegmentEmbedding(3, 256, padding_idx=0)
    (dropout): Dropout(p=0)
  )
  (transformer_blocks): ModuleList(
    (0): TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0): Linear(in_features=256, out_features=256, bias=True)
          (1): Linear(in_features=256, out_features=256, bias=True)
          (2): Linear(in_features=256, out_features=256, bias=True)
        )
        (output_linear): Linear(in_features=256, out_features=256, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=256, out_features=1024, bias=True)
        (w_2): Linear(in_features=1024, out_features=256, bias=True)
        (dropout): Dropout(p=0)
        (activation): GELU()
      )
      (input_sublayer): Sub

In [49]:
st,ed = 0,100
X_train = model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()
while ed<len(xid_train):
    st += 100
    ed += 100
    X_train = np.vstack([X_train, model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()])

In [50]:
st,ed = 0,100
X_test = model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()
while ed<len(xid_test):
    st += 100
    ed += 100
    X_test = np.vstack([X_test, model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()])

In [51]:
X_train.shape

(1161, 220, 1024)

In [52]:
# 0.618
X_train = np.hstack([np.mean(X_train, axis=1)[:,-256:], np.max(X_train, axis=1)[:,-256:], X_train[:,0,-512:] ])
X_test = np.hstack([np.mean(X_test, axis=1)[:,-256:], np.max(X_test, axis=1)[:,-256:], X_test[:,0,-512:] ])
# 0.347
# X_train = X_train[:,0,:]
# X_test = X_test[:,0,:]
# 0.535
# X_train = np.mean(X_train, axis=1)
# X_test = np.mean(X_test, axis=1)

In [53]:
X_train.shape

(1161, 1024)

# Prediction
### MLP

In [54]:
# Default
n = 10
r2 = np.zeros(n)
rmse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor(max_iter=1000)
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    rmse[i] = mean_squared_error(y_test, y_pred)**0.5

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test RMSE: {:.4f} ± {:.4f}".format(np.mean(rmse), np.std(rmse)))

Test R2: 0.7693 ± 0.0376
Test RMSE: 0.8992 ± 0.0709


In [55]:
def objective_mlp(trial):
    n_layers = trial.suggest_int('n_layers', 1,3)
    layers = []
    for i in range(n_layers):
        n_units = int(trial.suggest_loguniform('n_units_l{}'.format(i), 1, 1000))
        layers.append(n_units)
    
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        mlp = MLPRegressor(hidden_layer_sizes=layers, max_iter=1000)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        mlp.fit(X_trn, y_trn)
        y_pred = mlp.predict(X_val)
        score += mean_squared_error(y_val, y_pred)**0.5
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_mlp, n_trials=100)

[I 2019-05-17 13:39:18,234] Finished a trial resulted in value: 0.9521812716245708. Current best value is 0.9521812716245708 with parameters: {'n_layers': 1, 'n_units_l0': 213.01326378939115}.
[I 2019-05-17 13:39:32,823] Finished a trial resulted in value: 0.940283384369967. Current best value is 0.940283384369967 with parameters: {'n_layers': 3, 'n_units_l0': 122.6807162665721, 'n_units_l1': 12.797398162279107, 'n_units_l2': 791.8161703205594}.
[I 2019-05-17 13:39:48,091] Finished a trial resulted in value: 0.9886601252402463. Current best value is 0.940283384369967 with parameters: {'n_layers': 3, 'n_units_l0': 122.6807162665721, 'n_units_l1': 12.797398162279107, 'n_units_l2': 791.8161703205594}.
[I 2019-05-17 13:40:07,289] Finished a trial resulted in value: 1.0451926916492187. Current best value is 0.940283384369967 with parameters: {'n_layers': 3, 'n_units_l0': 122.6807162665721, 'n_units_l1': 12.797398162279107, 'n_units_l2': 791.8161703205594}.
[I 2019-05-17 13:41:26,618] Finish

[I 2019-05-17 14:11:22,755] Finished a trial resulted in value: 1.6689373885252934. Current best value is 0.8545109209669626 with parameters: {'n_layers': 2, 'n_units_l0': 375.84513915391545, 'n_units_l1': 262.95709021355725}.
[I 2019-05-17 14:13:11,635] Finished a trial resulted in value: 0.9677068643221982. Current best value is 0.8545109209669626 with parameters: {'n_layers': 2, 'n_units_l0': 375.84513915391545, 'n_units_l1': 262.95709021355725}.
[I 2019-05-17 14:14:51,761] Finished a trial resulted in value: 1.7541993335198514. Current best value is 0.8545109209669626 with parameters: {'n_layers': 2, 'n_units_l0': 375.84513915391545, 'n_units_l1': 262.95709021355725}.
[I 2019-05-17 14:15:04,535] Finished a trial resulted in value: 0.978318445877267. Current best value is 0.8545109209669626 with parameters: {'n_layers': 2, 'n_units_l0': 375.84513915391545, 'n_units_l1': 262.95709021355725}.
[I 2019-05-17 14:15:48,129] Finished a trial resulted in value: 0.9971935055959008. Current b

[I 2019-05-17 14:33:23,808] Finished a trial resulted in value: 1.1624861652616456. Current best value is 0.8545109209669626 with parameters: {'n_layers': 2, 'n_units_l0': 375.84513915391545, 'n_units_l1': 262.95709021355725}.
[I 2019-05-17 14:34:06,637] Finished a trial resulted in value: 0.8715214899111463. Current best value is 0.8545109209669626 with parameters: {'n_layers': 2, 'n_units_l0': 375.84513915391545, 'n_units_l1': 262.95709021355725}.
[I 2019-05-17 14:34:54,933] Finished a trial resulted in value: 0.9488067599266141. Current best value is 0.8545109209669626 with parameters: {'n_layers': 2, 'n_units_l0': 375.84513915391545, 'n_units_l1': 262.95709021355725}.
[I 2019-05-17 14:39:48,018] Finished a trial resulted in value: 1.493581212970006. Current best value is 0.8545109209669626 with parameters: {'n_layers': 2, 'n_units_l0': 375.84513915391545, 'n_units_l1': 262.95709021355725}.
[I 2019-05-17 14:40:30,678] Finished a trial resulted in value: 1.0000101759830669. Current b

[I 2019-05-17 15:00:13,860] Finished a trial resulted in value: 1.2711879434300952. Current best value is 0.8545109209669626 with parameters: {'n_layers': 2, 'n_units_l0': 375.84513915391545, 'n_units_l1': 262.95709021355725}.
[I 2019-05-17 15:02:42,939] Finished a trial resulted in value: 1.2281821525215935. Current best value is 0.8545109209669626 with parameters: {'n_layers': 2, 'n_units_l0': 375.84513915391545, 'n_units_l1': 262.95709021355725}.


In [56]:
# Optimized

n = 10
r2 = np.zeros(n)
rmse = np.zeros(n)

for i in range(n):
    MLP = MLPRegressor((376,263), max_iter=1000)
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    rmse[i] = mean_squared_error(y_test, y_pred)**0.5

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test RMSE: {:.4f} ± {:.4f}".format(np.mean(rmse), np.std(rmse)))

Test R2: 0.7596 ± 0.0473
Test RMSE: 0.9167 ± 0.0864


### XGBoost

In [57]:
# Default
n = 10
r2 = np.zeros(n)
rmse = np.zeros(n)

for i in range(n):
    XGB = xgboost.XGBRegressor()
    XGB.fit(X_train, y_train)
    y_pred = XGB.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    rmse[i] = mean_squared_error(y_test, y_pred)**0.5

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test RMSE: {:.4f} ± {:.4f}".format(np.mean(rmse), np.std(rmse)))

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getat

Test R2: 0.8193 ± 0.0000
Test RMSE: 0.7983 ± 0.0000


In [58]:
# チューニング
def objective_xgb(trial):
    n_estimators = trial.suggest_int('n_estimators', 0, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
    learning_rate = trial.suggest_discrete_uniform('learning_rate', 0.001, 0.01, 0.001)
    scale_pos_weight = trial.suggest_int('scale_pos_weight', 1, 100)
    subsample = trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1)
    colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1)
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        xgb = xgboost.XGBRegressor(
            n_estimators = n_estimators,
            max_depth = max_depth,
            min_child_weight = min_child_weight,
            learning_rate = learning_rate,
            scale_pos_weight = scale_pos_weight,
            subsample = subsample,
            colsample_bytree = colsample_bytree)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        xgb.fit(X_trn, y_trn)
        y_pred = xgb.predict(X_val)
        score += mean_squared_error(y_val, y_pred)**0.5
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_xgb, n_trials=100)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 15:16:34,886] Finished a trial resulted in value: 2.0692773802026694. Current best value is 2.0692773802026694 with parameters: {'n_estimators': 368, 'max_depth': 10, 'min_child_weight': 10, 'learning_rate': 0.002, 'scale_pos_weight': 91, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.6000000000000001}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 15:20:53,534] Finished a trial resulted in value: 1.4698103838358365. Current best value is 1.4698103838358365 with parameters: {'n_estimators': 616, 'max_depth': 7, 'min_child_weight': 13, 'learning_rate': 0.002, 'scale_pos_weight': 15, 'subsample': 

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 15:44:27,710] Finished a trial resulted in value: 2.6058880115367296. Current best value is 0.7739681644955543 with parameters: {'n_estimators': 531, 'max_depth': 5, 'min_child_weight': 1, 'learning_rate': 0.01, 'scale_pos_weight': 76, 'subsample': 0.9, 'colsample_bytree': 0.9}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 15:48:26,331] Finished a trial resulted in value: 2.346997981400598. Current best value is 0.7739681644955543 with parameters: {'n_estimators': 531, 'max_depth': 5, 'min_child_weight': 1, 'learning_rate': 0.01, 'scale_pos_weight': 76, 'subsample': 0.9, 'colsample_bytree': 0.9}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if 

[I 2019-05-17 15:58:59,997] Finished a trial resulted in value: 1.0549053596942346. Current best value is 0.7290063269620637 with parameters: {'n_estimators': 791, 'max_depth': 5, 'min_child_weight': 4, 'learning_rate': 0.01, 'scale_pos_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 15:59:03,131] Finished a trial resulted in value: 3.5348286709875874. Current best value is 0.7290063269620637 with parameters: {'n_estimators': 791, 'max_depth': 5, 'min_child_weight': 4, 'learning_rate': 0.01, 'scale_pos_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 201

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 16:44:37,827] Finished a trial resulted in value: 0.725276423330971. Current best value is 0.725276423330971 with parameters: {'n_estimators': 886, 'max_depth': 18, 'min_child_weight': 7, 'learning_rate': 0.008, 'scale_pos_weight': 31, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.6000000000000001}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 16:51:34,246] Finished a trial resulted in value: 0.7335360416810047. Current best value is 0.725276423330971 with parameters: {'n_estimators': 886, 'max_depth': 18, 'min_child_weight': 7, 'learning_rate': 0.008, 'scale_pos_weight': 31, 'subsample': 0.7000000000000001, 'colsample_bytree': 0.6000000000000

  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 17:20:03,602] Finished a trial resulted in value: 1.8577604994006924. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 17:24:49,123] Finished a trial resulted in value: 0.7336408591407002. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 

[I 2019-05-17 17:45:28,985] Finished a trial resulted in value: 0.8150289272667317. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 17:49:49,765] Finished a trial resulted in value: 0.8107876759299169. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 18:20:50,070] Finished a trial resulted in value: 0.7360566723835879. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 18:22:12,690] Finished a trial resulted in value: 1.3423348704561129. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 

  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 18:38:40,641] Finished a trial resulted in value: 0.8416061726696675. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 18:45:32,850] Finished a trial resulted in value: 0.7558269041998827. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 

[I 2019-05-17 19:05:52,871] Finished a trial resulted in value: 2.1117437802728256. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 19:10:46,315] Finished a trial resulted in value: 1.0414057329687778. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 20:00:08,634] Finished a trial resulted in value: 0.7653419731144769. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 20:05:59,549] Finished a trial resulted in value: 0.7371945196748392. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 

  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 20:35:48,108] Finished a trial resulted in value: 0.7707247955473802. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 20:45:08,678] Finished a trial resulted in value: 0.7096437131104941. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 

[I 2019-05-17 21:20:14,547] Finished a trial resulted in value: 0.7710338189948263. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 21:25:29,603] Finished a trial resulted in value: 0.7799537489327873. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 21:53:11,650] Finished a trial resulted in value: 0.9291386824764829. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
[I 2019-05-17 21:59:14,239] Finished a trial resulted in value: 0.7586038172435847. Current best value is 0.7003524822371853 with parameters: {'n_estimators': 592, 'max_depth': 18, 'min_child_weight': 12, 'learning_rate': 0.007, 'scale_pos_weight': 15, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.5}.
  if getattr(data, 

KeyboardInterrupt: 

In [59]:
# Default
n = 2
r2 = np.zeros(n)
rmse = np.zeros(n)

for i in range(n):
    XGB = xgboost.XGBRegressor(n_estimators=592, max_depth=18, min_child_weight=12, 
                               learning_rate=0.007, scale_pos_weight=15, subsample=0.6, colsample_bytree=0.5)
    XGB.fit(X_train, y_train)
    y_pred = XGB.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    rmse[i] = mean_squared_error(y_test, y_pred)**0.5

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test RMSE: {:.4f} ± {:.4f}".format(np.mean(rmse), np.std(rmse)))

  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Test R2: 0.8321 ± 0.0000
Test RMSE: 0.7695 ± 0.0000


### RF

In [133]:
#Default
n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    RF = RandomForestRegressor()
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))



Test R2: 0.7626 ± 0.0153
Test MSE: 0.8371 ± 0.0539


In [31]:
def objective_rf(trial):
    max_depth = int(trial.suggest_loguniform('max_depth', 2, 100))
    n_estimators = int(trial.suggest_loguniform('n_estimators', 2, 1000))
    max_features = trial.suggest_int('max_features', 1, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        
    n_folds = 4
    score = 0
    for _ in range(n_folds):
        rf = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features,
                              min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train)
        rf.fit(X_trn, y_trn)
        y_pred = rf.predict(X_val)
        score += mean_squared_error(y_val, y_pred)
    return score/n_folds

study = optuna.create_study()
study.optimize(objective_rf, n_trials=100)

[I 2019-04-08 21:51:12,302] Finished a trial resulted in value: 1.081967059900994. Current best value is 1.081967059900994 with parameters: {'max_depth': 80.91560273968723, 'n_estimators': 6.834087692986803, 'max_features': 8, 'min_samples_split': 5, 'min_samples_leaf': 6}.
[I 2019-04-08 21:51:12,474] Finished a trial resulted in value: 1.1332188948688922. Current best value is 1.081967059900994 with parameters: {'max_depth': 80.91560273968723, 'n_estimators': 6.834087692986803, 'max_features': 8, 'min_samples_split': 5, 'min_samples_leaf': 6}.
[I 2019-04-08 21:51:13,806] Finished a trial resulted in value: 0.963376185436431. Current best value is 0.963376185436431 with parameters: {'max_depth': 11.843241737364552, 'n_estimators': 60.16827086733581, 'max_features': 9, 'min_samples_split': 10, 'min_samples_leaf': 5}.
[I 2019-04-08 21:51:13,887] Finished a trial resulted in value: 2.1024562434587355. Current best value is 0.963376185436431 with parameters: {'max_depth': 11.84324173736455

[I 2019-04-08 21:55:31,480] Finished a trial resulted in value: 1.2846315936432882. Current best value is 0.7752219808362674 with parameters: {'max_depth': 29.75897168245926, 'n_estimators': 120.74289975596233, 'max_features': 10, 'min_samples_split': 3, 'min_samples_leaf': 4}.
[I 2019-04-08 21:55:31,657] Finished a trial resulted in value: 1.2675906733684776. Current best value is 0.7752219808362674 with parameters: {'max_depth': 29.75897168245926, 'n_estimators': 120.74289975596233, 'max_features': 10, 'min_samples_split': 3, 'min_samples_leaf': 4}.
[I 2019-04-08 21:55:32,456] Finished a trial resulted in value: 1.108516719284507. Current best value is 0.7752219808362674 with parameters: {'max_depth': 29.75897168245926, 'n_estimators': 120.74289975596233, 'max_features': 10, 'min_samples_split': 3, 'min_samples_leaf': 4}.
[I 2019-04-08 21:55:39,997] Finished a trial resulted in value: 0.868256722867016. Current best value is 0.7752219808362674 with parameters: {'max_depth': 29.758971

In [47]:
# Optimized

n = 10
r2 = np.zeros(n)
mse = np.zeros(n)

for i in range(n):
    RF = RandomForestRegressor(max_depth=26, n_estimators=41, max_features=5,
                              min_samples_split=6, min_samples_leaf=4)
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    r2[i] = r2_score(y_test, y_pred)
    mse[i] = mean_squared_error(y_test, y_pred)

print("Test R2: {:.4f} ± {:.4f}".format(np.mean(r2), np.std(r2)))
print("Test MSE: {:.4f} ± {:.4f}".format(np.mean(mse), np.std(mse)))

Test R2: 0.7342 ± 0.0045
Test MSE: 1.0853 ± 0.0185
