In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import chainer
from seq2seq import Seq2seq, load_vocabulary

UNK, EOS = 0, 1

In [42]:
df_sol = pd.read_csv('data/solubility.csv')
df_sol.head()

Unnamed: 0,SMILES,unknown,solubility,processed_smiles
0,O=C(C)N,60-35-5,1.58,O = C ( C ) N
1,NNC,60-34-4,1.34,N N C
2,O=C(C)O,64-19-7,1.22,O = C ( C ) O
3,N1CCCC1,123-75-1,1.15,N 1 C C C C 1
4,O=C(N)NO,127-07-1,1.12,O = C ( N ) N O


In [3]:
x_train,x_test,y_train,y_test = train_test_split(df_sol['processed_smiles'], df_sol['solubility'])

In [14]:
def load_data(vocabulary, lst):
    data = []
    for l in lst:
        words = l.strip().split()
        array = np.array([vocabulary.get(w, UNK) for w in words], np.int32)
        data.append(array)
    return data

In [21]:
source_ids = load_vocabulary('vocab_src.txt')
xnum_train = load_data(source_ids, x_train)
xnum_test = load_data(source_ids, x_test)

# Encode to fingerprint

In [None]:
model = Seq2seq(1, 46, 46, 256)
chainer.serializers.load_npz('result/model_iter_132000.npz', model)

In [48]:
X_train = model.encode(xnum_train)[0].data
X_test = model.encode(xnum_test)[0].data

# Prediction

In [49]:
RF = RandomForestRegressor()
RF.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [50]:
y_train_pred = RF.predict(X_train)
y_test_pred = RF.predict(X_test)

print('MSE train : %.3f, test : %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)) )
print('R2 train : %.3f, test : %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)) )

MSE train : 0.223, test : 1.209
R2 train : 0.948, test : 0.677


In [58]:
param_grid = { "max_depth": [2,5,10, None],
                "n_estimators": [10,50,100,300],
                "max_features": [1, 3, 10],
                "min_samples_split": [2, 3, 10],
                "min_samples_leaf": [1, 3, 10]
}

In [59]:
RF_grid = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=5, n_jobs=8, verbose=1)
RF_grid.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    6.8s
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    7.0s
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    7.2s
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=8)]: Done  45 tasks      | elapsed:    7.7s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    8.0s
[Parallel(n_jobs=8)]: Done  69 tasks      | elapsed:    8.4s
[Parallel(n_jobs=8)]: Done  82 tasks      | elapsed:    8.6s
[Parallel(n_jobs=8)]: Done  97 tasks      | elapsed:    9.0s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    9.4s
[Parallel(n_jobs=8)]: Done 129 tasks      | elapsed:    9.9s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   10.3s
[Parallel(n_jobs=8)]: Done 165 tasks      | elapsed:   10.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   11.3s
[Parallel(n_jobs=8)]: Done 205 tasks      | elapsed:   11.8s
[Parallel(n_jobs=8)]: Do

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'max_depth': [2, 5, 10, None], 'n_estimators': [10, 50, 100, 300], 'max_features': [1, 3, 10], 'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [61]:
RF_grid.best_params_

{'max_depth': None,
 'max_features': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 300}

In [62]:
RF_grid.best_estimator_.fit(X_train, y_train)
y_train_pred = RF_grid.best_estimator_.predict(X_train)
y_test_pred = RF_grid.best_estimator_.predict(X_test)

print('MSE train : %.3f, test : %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)) )
print('R2 train : %.3f, test : %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)) )

MSE train : 0.178, test : 1.090
R2 train : 0.958, test : 0.708


In [None]:
RF = RandomForestRegressor()
RF.fit(X_train, y_train)