# Performance comparaison

| Fingerprint | R2 | RMSE |  
|:-:|:-:|:-:|  
|BERT|0.437|2.730| 
|BERT MSM|0.784|0.883|  
| ECFP| 0.765 | 0.9808 |
|Can2Can|0.7176|1.073|
|Enum2Enum|0.725|1.059|
|Transformer|0.862|0.750|
| NFP| 0.8845 | 0.6868 |

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import torch

from bert import BERT
from build_vocab import WordVocab
from utils import split

In [2]:
pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

In [3]:
df_train = pd.read_csv('data/sol_train.csv')
df_test = pd.read_csv('data/sol_test.csv')
df_train.head()

Unnamed: 0,SMILES,unknown,solubility,processed_smiles,spaced
0,[nH0]1c(SC)c2c([nH0]cc[nH0]2)[nH0]c1,6966-78-5,-2.36,[ n H 0 ] 1 c ( S C ) c 2 c ( [ n H 0 ] c c [ ...,[ n H 0 ] 1 c ( S C ) c 2 c ( [ n H 0 ] c c [ ...
1,CCC(C)Cl,78-86-4,-1.96,C C C ( C ) Cl,C C C ( C ) C l
2,O=C(NC(=O)c1ccccc1)c1ccccc1,614-28-8,-2.27,O = C ( N C ( = O ) c 1 c c c c c 1 ) c 1 c c ...,O = C ( N C ( = O ) c 1 c c c c c 1 ) c 1 c c ...
3,CC(C(C)(C)C)O,464-07-3,-0.62,C C ( C ( C ) ( C ) C ) O,C C ( C ( C ) ( C ) C ) O
4,[O-][N+](c1c(O)cccc1)=O,88-75-5,-1.74,[ O- ] [ N+ ] ( c 1 c ( O ) c c c c 1 ) = O,[ O - ] [ N + ] ( c 1 c ( O ) c c c c 1 ) = O


In [4]:
x_train = [split(sm) for sm in df_train['SMILES']]
y_train = df_train['solubility']
x_test = [split(sm) for sm in df_test['SMILES']]
y_test = df_test['solubility']

In [5]:
vocab = WordVocab.load_vocab('data/vocab.pkl')

In [6]:
def get_inputs(sm):
    seq_len = 220
    ids = [vocab.stoi.get(token, unk_index) for token in sm.split()]
    ids = [sos_index] + ids + [eos_index] + [eos_index]
    seg = [1]*len(ids)
    padding = [pad_index]*(seq_len - len(ids))
    ids.extend(padding), seg.extend(padding)
    
    return ids, seg

In [7]:
def get_array(x):
    x_id, x_seg = [], []
    for sm in x:
        a,b = get_inputs(sm)
        x_id.append(a)
        x_seg.append(b)
    return torch.tensor(x_id), torch.tensor(x_seg)
    
xid_train, xseg_train = get_array(x_train)
xid_test, xseg_test = get_array(x_test)

In [8]:
print(xid_test.shape)
print(xseg_test.shape)

torch.Size([322, 220])
torch.Size([322, 220])


# Encode to fingerprint

In [9]:
model = BERT(len(vocab), hidden=256, n_layers=4, attn_heads=4, dropout=0)
model.load_state_dict(torch.load('../result/GDB17_shuffle//ep_29.pkl'))
model.eval()

BERT(
  (embedding): BERTEmbedding(
    (token): TokenEmbedding(45, 256, padding_idx=0)
    (position): PositionalEmbedding()
    (segment): SegmentEmbedding(3, 256, padding_idx=0)
    (dropout): Dropout(p=0)
  )
  (transformer_blocks): ModuleList(
    (0): TransformerBlock(
      (attention): MultiHeadedAttention(
        (linear_layers): ModuleList(
          (0): Linear(in_features=256, out_features=256, bias=True)
          (1): Linear(in_features=256, out_features=256, bias=True)
          (2): Linear(in_features=256, out_features=256, bias=True)
        )
        (output_linear): Linear(in_features=256, out_features=256, bias=True)
        (attention): Attention()
        (dropout): Dropout(p=0)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=256, out_features=1024, bias=True)
        (w_2): Linear(in_features=1024, out_features=256, bias=True)
        (dropout): Dropout(p=0)
        (activation): GELU()
      )
      (input_sublayer): Sub

In [9]:
#model = BERT(len(vocab), hidden=256, n_layers=2, attn_heads=2, dropout=0)
# model = torch.load('../result/MSM_on_GDM17/ep_48.pkl')
# model.eval()

AttributeError: 'collections.OrderedDict' object has no attribute 'eval'

In [10]:
del df_test, df_train, x_train, x_test

In [14]:
st,ed = 0,100
X_train = model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()
while ed<len(xid_train):
    st += 100
    ed += 100
    X_train = np.vstack([X_train, model.encode(xid_train[st:ed], xseg_train[st:ed]).detach().numpy()])

In [16]:
st,ed = 0,100
X_test = model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()
while ed<len(xid_test):
    st += 100
    ed += 100
    X_test = np.vstack([X_test, model.encode(xid_test[st:ed], xseg_test[st:ed]).detach().numpy()])

In [15]:
X_train.shape

(968, 220, 512)

In [17]:
# X_train = np.mean(X_train, axis=1)
# X_test = np.mean(X_test, axis=1)
X_train = X_train[:,0,:]
X_test = X_test[:,0,:]
#y_train = df_train['solubility']
#y_test = df_test['solubility']

# Prediction

In [18]:
# MEAN
MLP = MLPRegressor((1000,1000,1000))
MLP.fit(X_train, y_train)

y_train_pred = MLP.predict(X_train)
print("Train R2: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Train MSE: {:.4f}".format(mean_squared_error(y_train, y_train_pred)))

y_pred = MLP.predict(X_test)
print("Test R2: {:.4f}".format(r2_score(y_test, y_pred)))
print("Test MSE: {:.4f}".format(mean_squared_error(y_test, y_pred)))

Train R2: 0.1364
Train MSE: 3.5770
Test R2: 0.1432
Test MSE: 3.4981


In [77]:
# MEAN
MLP = MLPRegressor((1000, 1000, 1000))
MLP.fit(X_train, y_train)

y_train_pred = MLP.predict(X_train)
print("Train R2: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Train MSE: {:.4f}".format(mean_squared_error(y_train, y_train_pred)))

y_pred = MLP.predict(X_test)
print("Test R2: {:.4f}".format(r2_score(y_test, y_pred)))
print("Test MSE: {:.4f}".format(mean_squared_error(y_test, y_pred)))

Train R2: 0.9552
Train MSE: 0.1857
Test R2: 0.7837
Test MSE: 0.8831


In [84]:
# RAW
MLP = MLPRegressor((1000, 1000, 1000))
MLP.fit(X_train, y_train)

y_train_pred = MLP.predict(X_train)
print("Train R2: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Train MSE: {:.4f}".format(mean_squared_error(y_train, y_train_pred)))

y_pred = MLP.predict(X_test)
print("Test R2: {:.4f}".format(r2_score(y_test, y_pred)))
print("Test MSE: {:.4f}".format(mean_squared_error(y_test, y_pred)))

Train R2: 0.7708
Train MSE: 0.9495
Test R2: 0.6932
Test MSE: 1.2527


In [82]:

y_pred = MLP.predict(X_test)
print("Test R2: {:.4f}".format(r2_score(y_test, y_pred)))
print("Test MSE: {:.4f}".format(mean_squared_error(y_test, y_pred)))

Test R2: 0.7592
Test MSE: 0.9832


In [29]:
0.5636**0.5

0.7507329751649384

In [23]:
param_grid = { "hidden_layer_sizes": [(10, 10), (50,50), (100, 100), (500, 500), (1000, 1000),(10,10,10), (50,50,50), (100,100,100), (500,500,500), (1000,1000,1000)]}
MLP_grid = GridSearchCV(estimator=MLPRegressor(), param_grid=param_grid, cv=5, verbose=1, n_jobs=8)
MLP_grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   38.7s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:  3.0min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=8,
       param_grid={'hidden_layer_sizes': [(10, 10), (50, 50), (100, 100), (500, 500), (1000, 1000), (10, 10, 10), (50, 50, 50), (100, 100, 100), (500, 500, 500), (1000, 1000, 1000)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [26]:
MLP_grid.best_params_

{'hidden_layer_sizes': (100, 100, 100)}

In [27]:
MLP_grid.best_estimator_.fit(X_train, y_train)
y_train_pred = MLP_grid.best_estimator_.predict(X_train)
y_test_pred = MLP_grid.best_estimator_.predict(X_test)

print('MSE train : %.3f, test : %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)) )
print('R2 train : %.3f, test : %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)) )

MSE train : 2.362, test : 2.382
R2 train : 0.430, test : 0.417


In [38]:
1.122**0.5

1.059245014149229

In [21]:
RF = RandomForestRegressor()
RF.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [22]:
y_train_pred = RF.predict(X_train)
y_test_pred = RF.predict(X_test)

print('MSE train : %.3f, test : %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)) )
print('R2 train : %.3f, test : %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)) )

MSE train : 0.224, test : 1.200
R2 train : 0.946, test : 0.706


In [21]:
param_grid = { "max_depth": [2,5,10, None],
                "n_estimators": [10,50,100,300],
                "max_features": [1, 3, 10],
                "min_samples_split": [2, 3, 10],
                "min_samples_leaf": [1, 3, 10]
}

In [22]:
RF_grid = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=5, n_jobs=8, verbose=1)
RF_grid.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:   10.4s
[Parallel(n_jobs=8)]: Done 852 tasks      | elapsed:   26.9s
[Parallel(n_jobs=8)]: Done 1532 tasks      | elapsed:   55.6s
[Parallel(n_jobs=8)]: Done 1982 tasks      | elapsed:  1.2min
[Parallel(n_jobs=8)]: Done 2160 out of 2160 | elapsed:  1.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=8,
       param_grid={'max_depth': [2, 5, 10, None], 'n_estimators': [10, 50, 100, 300], 'max_features': [1, 3, 10], 'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [23]:
RF_grid.best_params_

{'max_depth': None,
 'max_features': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

In [24]:
RF_grid.best_estimator_.fit(X_train, y_train)
y_train_pred = RF_grid.best_estimator_.predict(X_train)
y_test_pred = RF_grid.best_estimator_.predict(X_test)

print('MSE train : %.3f, test : %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)) )
print('R2 train : %.3f, test : %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)) )

MSE train : 0.163, test : 1.177
R2 train : 0.961, test : 0.712
