In [1]:
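#pip install tensorflow==2.2.0  # NOTE(review): the outputs recorded in this notebook were produced with TensorFlow 2.4.1 -- confirm which version should be pinned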

In [2]:
from loaders.Loaders import CSVLoader
from compoundFeaturization.rdkitFingerprints import MorganFingerprint
from splitters.splitters import SingletaskStratifiedSplitter

from sklearn.ensemble import RandomForestRegressor
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import r2_score, mean_absolute_error, mean_squared_error, median_absolute_error

import tensorflow as tf
print(tf.version.VERSION)

from parameterOptimization.HyperparameterOpt import HyperparamOpt_Valid, HyperparamOpt_CV


2.4.1


Using TensorFlow backend.


In [3]:
#Load Dataset
# Read the PC-3 CSV: molecules come from the 'smiles' column and the
# regression target from the 'pIC50' column. The loader is kept under its own
# name so that `dataset` only ever refers to the dataset object itself.
csv_loader = CSVLoader(
    dataset_path='data/PC-3.csv',
    mols_field='smiles',
    labels_fields='pIC50',
)
dataset = csv_loader.create_dataset()
dataset.get_shape()

Mols_shape:  4294
Features_shape:  X not defined!
Labels_shape:  (4294,)


In [4]:
#Featurization
# Replace `dataset` with its Morgan-fingerprint featurized version; the name
# is kept because the following cells read the features from `dataset`.
morgan_featurizer = MorganFingerprint()
dataset = morgan_featurizer.featurize(dataset)
dataset.get_shape()

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Featurizing datapoint 3000
Featurizing datapoint 4000
Mols_shape:  4294
Features_shape:  (4294, 1024)
Labels_shape:  (4294,)


In [5]:
#Data Split
# 60 / 20 / 20 partition into training, validation and test subsets.
split_fractions = {'frac_train': 0.6, 'frac_valid': 0.2, 'frac_test': 0.2}
splitter = SingletaskStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset=dataset, **split_fractions)

# Report the size of each subset.
train_dataset.get_shape()
valid_dataset.get_shape()
test_dataset.get_shape()

Mols_shape:  2574
Features_shape:  (2574, 1024)
Labels_shape:  (2574,)
Mols_shape:  858
Features_shape:  (858, 1024)
Labels_shape:  (858,)
Mols_shape:  858
Features_shape:  (858, 1024)
Labels_shape:  (858,)


In [6]:
#Scikit-Learn Random Forest
#rf = RandomForestRegressor()
#model = SklearnModel(model=rf)

In [7]:
from models.kerasModels import KerasModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


# Width of the network's input layer, taken from the featurized training
# matrix so the model always matches the fingerprint size actually in use.
input_dim = train_dataset.X.shape[1]
print(input_dim)

def build_model():
    """Build and compile the feed-forward regression network.

    Architecture: two 64-unit ReLU hidden layers followed by a single linear
    output unit. The network is compiled with RMSprop (learning rate 0.001),
    mean-squared-error loss, and reports MAE and MSE as metrics.

    The input width is read from the module-level ``input_dim`` instead of a
    hard-coded 1024, so changing the fingerprint size does not require
    editing this function.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    network = Sequential()
    network.add(Dense(64, activation='relu', input_shape=[input_dim]))
    network.add(Dense(64, activation='relu'))
    network.add(Dense(1))  # single linear unit: one regression target

    optimizer = tf.keras.optimizers.RMSprop(0.001)

    network.compile(loss='mse', optimizer=optimizer, metrics=['mae', 'mse'])
    return network

# Wrap the builder (not a built network) in the project's Keras model wrapper.
model = KerasModel(build_model, mode='regression', epochs=5, verbose=1)

1024


In [8]:
#cross validation
# 3-fold cross-validation over the full featurized dataset, scored with r2.
# The call is left as the cell's last expression so its result is displayed.
cv_metric = Metric(r2_score)
model.cross_validate(dataset, cv_metric, folds=3)

Computing K-fold split
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 
r2_score: 
 0.6991572217810277
Test Score: 
r2_score: 
 -0.19650720201362493
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 
r2_score: 
 0.7570900561419888
Test Score: 
r2_score: 
 -0.36062229113626465
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 
r2_score: 
 0.5285901450193853
Test Score: 
r2_score: 
 -0.9485151157854774


(None,
 0,
 0,
 [0.6991572217810277, 0.7570900561419888, 0.5285901450193853],
 [-0.19650720201362493, -0.36062229113626465, -0.9485151157854774],
 0.661612474314134,
 -0.501881536311789)

In [9]:
# model training
# Fit the Keras regression model on the training split only; the validation
# and test splits are not passed to fit() here.
model.fit(train_dataset)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
# Regression metrics reported for every split.
metrics = [
    Metric(metric_fn)
    for metric_fn in (mean_absolute_error, mean_squared_error,
                      median_absolute_error, r2_score)
]

# evaluate the model
# Each split is announced with a banner line and a heading, then evaluated
# with the same metric list.
evaluation_plan = [
    ('Training Dataset: ', train_dataset),
    ('Validation Dataset: ', valid_dataset),
    ('Test Dataset: ', test_dataset),
]
split_scores = []
for heading, subset in evaluation_plan:
    print("#############################")
    print(heading)
    split_scores.append(model.evaluate(subset, metrics))

train_score, valid_score, test_score = split_scores


#############################
Training Dataset: 
<tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x7f852f0a8f60>
<class 'tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor'>
mean_absolute_error: 
 0.3705001472916861
mean_squared_error: 
 0.23957549714516277
median_absolute_error: 
 0.29531271934509284
r2_score: 
 0.7735511282249079
#############################
Validation Dataset: 
<tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x7f852f0a8f60>
<class 'tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor'>
mean_absolute_error: 
 0.5801882118204311
mean_squared_error: 
 0.6024357394891615
median_absolute_error: 
 0.4437227535247805
r2_score: 
 0.43360654537889887
#############################
Test Dataset: 
<tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x7f852f0a8f60>
<class 'tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor'>
mean_absolute_error: 
 0.5773673188761778
mean_squared_erro

In [11]:
#Build a model function for hyperparameter optimization
def rf_model_builder(n_estimators=10, max_features='auto', criterion='mse'):
    """Return a new, unfitted RandomForestRegressor.

    The three arguments are forwarded unchanged to the estimator's
    constructor under the same keyword names, so this function can be used
    as a factory by a hyperparameter search over those three settings.
    """
    hyperparameters = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'criterion': criterion,
    }
    return RandomForestRegressor(**hyperparameters)

# Search space for the random forest: one list of candidate values per
# keyword argument accepted by rf_model_builder.
params_dict_rf = dict(
    n_estimators=[10, 100],
    max_features=["auto", "sqrt", "log2", None],
    criterion=["mse", "mae"],
)

# NOTE(review): this rebinds `model`, which previously held the Keras model,
# and the wrapper is given the builder function rather than an estimator
# instance. The later cells visible in this file do not use it -- confirm
# whether this line is still needed.
model = SklearnModel(rf_model_builder, 'regression')

In [12]:
#Hyperparameter Optimization
#optimizer = HyperparamOpt_Valid(rf_model_builder)

#best_rf, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict_rf, 
#                                                                     train_dataset, 
#                                                                     valid_dataset, 
#                                                                     Metric(r2_score),
#                                                                     n_iter_search=15)

#print('#################')
#print(best_hyperparams)
#print(best_rf)

In [13]:
#Evaluate model
#best_rf.evaluate(test_dataset, metrics)

In [14]:
#Hyperparameter Optimization with CV
optimizer = HyperparamOpt_CV(rf_model_builder)

# Search over params_dict_rf on the training split, scored with 'r2'.
# NOTE(review): the recorded run reports strongly negative cross-validated r2
# while the test-set r2 is positive -- worth checking how the CV folds are
# formed.
search_settings = dict(cv=3, n_iter_search=10, n_jobs=8)
search_outcome = optimizer.hyperparam_search('sklearn',
                                             params_dict_rf,
                                             train_dataset,
                                             'r2',
                                             **search_settings)
best_rf, best_hyperparams, all_results = search_outcome

print('#################')
print(best_hyperparams)
print(best_rf)

#Evaluate model
# Left as the last expression so the returned scores are displayed.
best_rf.evaluate(test_dataset, metrics)

MODEL TYPE:  sklearn
Fitting 10 random models from a space of 16 possible models.
RandomForestRegressor(max_features='sqrt')

 
 Best r2: -10.912881 using {'n_estimators': 100, 'max_features': 'sqrt', 'criterion': 'mse'}

 r2: -10.912881 (9.040730) with: {'n_estimators': 100, 'max_features': 'sqrt', 'criterion': 'mse'} 


 r2: -12.762050 (9.291542) with: {'n_estimators': 10, 'max_features': None, 'criterion': 'mae'} 


 r2: -12.448017 (9.870005) with: {'n_estimators': 100, 'max_features': 'auto', 'criterion': 'mae'} 


 r2: -12.071812 (9.915886) with: {'n_estimators': 10, 'max_features': 'log2', 'criterion': 'mae'} 


 r2: -11.526627 (8.939953) with: {'n_estimators': 100, 'max_features': 'auto', 'criterion': 'mse'} 


 r2: -11.209829 (9.477486) with: {'n_estimators': 100, 'max_features': 'sqrt', 'criterion': 'mae'} 


 r2: -11.342046 (8.413157) with: {'n_estimators': 10, 'max_features': 'log2', 'criterion': 'mse'} 


 r2: -12.925393 (9.393455) with: {'n_estimators': 10, 'max_features':

{'mean_absolute_error': 0.45336192873851944,
 'mean_squared_error': 0.3995771320347623,
 'median_absolute_error': 0.33906666666674834,
 'r2_score': 0.6218294021812234}