In [1]:
from Dataset.Dataset import CSVLoader
from compoundFeaturization.rdkitFingerprints import MorganFingerprint
from featureSelection.baseFeatureSelector import LowVarianceFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report
from parameterOptimization.HyperparameterOpt import GridHyperparamOpt

from sklearn.ensemble import RandomForestClassifier

In [2]:
#Load Dataset
dataset = CSVLoader('preprocessed_dataset_wfoodb.csv', 'Smiles', ['Class'], 'ID')#, chunk_size=2000)
dataset.get_shape()

(23290,) (23290,) (0,) (23290,)


((23290,), (23290,), (0,), (23290,))

In [3]:
#Featurization
dataset = MorganFingerprint().featurize(dataset)
dataset.get_shape()

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Featurizing datapoint 3000
Featurizing datapoint 4000
Featurizing datapoint 5000
Featurizing datapoint 6000


RDKit ERROR: [18:11:49] Explicit valence for atom # 1 Cl, 4, is greater than permitted


error in smile: O=[Cl]=O
Featurizing datapoint 7000


RDKit ERROR: [18:11:52] Explicit valence for atom # 3 B, 4, is greater than permitted
RDKit ERROR: [18:11:52] Explicit valence for atom # 1 Cl, 9, is greater than permitted


error in smile: OB1O[B]2(O)OB(O)O[B](O)(O1)O2
error in smile: O=[Cl-](=O)(=O)=O
Featurizing datapoint 8000
Featurizing datapoint 9000
Featurizing datapoint 10000
Featurizing datapoint 11000
Featurizing datapoint 12000
Featurizing datapoint 13000
Featurizing datapoint 14000


RDKit ERROR: [18:12:06] Explicit valence for atom # 0 P, 11, is greater than permitted


error in smile: [P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C
Featurizing datapoint 15000
Featurizing datapoint 16000
Featurizing datapoint 17000
Featurizing datapoint 18000
Featurizing datapoint 19000
Featurizing datapoint 20000
Featurizing datapoint 21000
Featurizing datapoint 22000
Featurizing datapoint 23000
Elements with indexes:  [6257, 7708, 7709, 14244]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl]=O' 'OB1O[B]2(O)OB(O)O[B](O)(O1)O2' 'O=[Cl-](=O)(=O)=O'
 '[P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C']
(23286,) (23286,) (23286, 1024) (23286,)


((23286,), (23286,), (23286, 1024), (23286,))

In [4]:
#Feature Selection
dataset = LowVarianceFS(0.15).featureSelection(dataset)
dataset.get_shape()

(23286,) (23286,) (23286, 49) (23286,)


((23286,), (23286,), (23286, 49), (23286,))

In [5]:
#Data Split
splitter = SingletaskStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset=dataset, frac_train=0.6, 
                                                                             frac_valid=0.2, frac_test=0.2)

In [6]:
#Scikit-Learn Random Forest
rf = RandomForestClassifier()
model = SklearnModel(model=rf)

In [7]:
#cross validation
model.cross_validate(dataset, Metric(roc_auc_score), folds=3)

Computing Stratified K-fold split
Train Score: 
Models Class line 158 --> evaluator
roc_auc_score: 
 0.8593012490704063
Test Score: 
Models Class line 158 --> evaluator
roc_auc_score: 
 0.641215452807231
Train Score: 
Models Class line 158 --> evaluator
roc_auc_score: 
 0.855670210177402
Test Score: 
Models Class line 158 --> evaluator
roc_auc_score: 
 0.6608069353498354
Train Score: 
Models Class line 158 --> evaluator
roc_auc_score: 
 0.8548083044642593
Test Score: 
Models Class line 158 --> evaluator
roc_auc_score: 
 0.6323529275116685


(SklearnModel(model=RandomForestClassifier(), model_dir='/tmp/tmpjyfeik55'),
 0.855670210177402,
 0.6608069353498354,
 [0.8593012490704063, 0.855670210177402, 0.8548083044642593],
 [0.641215452807231, 0.6608069353498354, 0.6323529275116685],
 0.8565932545706891,
 0.6447917718895783)

In [8]:
# model training
model.fit(train_dataset)

RandomForestClassifier()

In [9]:
metrics = [Metric(roc_auc_score), Metric(precision_score), Metric(accuracy_score), Metric(confusion_matrix), 
           Metric(classification_report)]
print("#############################")
# evaluate the model
print('Training Dataset: ')
train_score = model.evaluate(train_dataset, metrics)
print("#############################")
print('Validation Dataset: ')
valid_score = model.evaluate(valid_dataset, metrics)
print("#############################")
print('Test Dataset: ')
test_score = model.evaluate(test_dataset, metrics)
print("#############################")

#############################
Training Dataset: 
Models Class line 158 --> evaluator
roc_auc_score: 
 0.8633686562375728
precision_score: 
 0.9232
accuracy_score: 
 0.9813144329896907
confusion_matrix: 
 [[13130    48]
 [  213   577]]
classification_report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     13178
           1       0.92      0.73      0.82       790

    accuracy                           0.98     13968
   macro avg       0.95      0.86      0.90     13968
weighted avg       0.98      0.98      0.98     13968

#############################
Validation Dataset: 
Models Class line 158 --> evaluator
roc_auc_score: 
 0.667928323577347
precision_score: 
 0.543859649122807
accuracy_score: 
 0.9467353951890034
confusion_matrix: 
 [[4315   78]
 [ 170   93]]
classification_report: 
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      4393
           1       0.54      0.35   

In [10]:
#Build a model function for hyperparameter optimization
def rf_model_builder(n_estimators, max_features, class_weight, model_dir=None):
    rf_model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, 
                                      class_weight=class_weight)
    return SklearnModel(rf_model, model_dir)

params_dict_rf = {"n_estimators": [10, 100],
                  "max_features": ["auto", "sqrt", "log2", None],
                  "class_weight": [{0: 1., 1: 1.}, {0: 1., 1: 5}, {0: 1., 1: 10}]
                  }
    


In [11]:
#Hyperparameter Optimization
optimizer = GridHyperparamOpt(rf_model_builder)

best_rf, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict_rf, train_dataset, valid_dataset, 'f1')

print('#################')
print(best_hyperparams)
print(best_rf)

Using accuracy instead and  roc_auc_score  on validation!
 

Fitting 15 random models from a space of 24 possible models.
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
Models Class line 158 --> evaluator
roc_auc_score: 
 0.6664486968985397
Model 1/15, Metric roc_auc_score, Validation set 1: 0.666449
	best_validation_score so far: 0.666449
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 5}}


Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/keras/wrappers/scikit_learn.py", line 223, in fit
    return super(KerasClassifier, self).fit(x, y, **kwargs)
  File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/keras/wrappers/scikit_learn.py", line 159, in fit
    if (losses.is_categorical_crossentropy(self.model.loss) and
AttributeError: 'SklearnModel' object has no attribute 'loss'



Models Class line 158 --> evaluator
roc_auc_score: 
 0.7055547236832881
Model 2/15, Metric roc_auc_score, Validation set 2: 0.705555
	best_validation_score so far: 0.705555
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 10}}
Models Class line 158 --> evaluator
roc_auc_score: 
 0.69784889372048
Model 3/15, Metric roc_auc_score, Validation set 3: 0.697849
	best_validation_score so far: 0.705555
Fitting model 4/15
hyperparameters: {'n_estimators': 10, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 1.0}}
Models Class line 158 --> evaluator
roc_auc_score: 
 0.6678945678356251
Model 4/15, Metric roc_auc_score, Validation set 4: 0.667895
	best_validation_score so far: 0.705555
Fitting model 5/15
hyperparameters: {'n_estimators': 10, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 5}}
Models Class line 158 --> evaluator
roc_auc_score: 
 0.6871124040233382
Model 5/15, Metric roc_auc_score, Validation set 5: 0.687112
	best_va

In [12]:
#Evaluate model
best_rf.evaluate(test_dataset, metrics)

Models Class line 158 --> evaluator
roc_auc_score: 
 0.6753641076063803
precision_score: 
 0.3849056603773585
accuracy_score: 
 0.9304123711340206
confusion_matrix: 
 [[4230  163]
 [ 161  102]]
classification_report: 
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      4393
           1       0.38      0.39      0.39       263

    accuracy                           0.93      4656
   macro avg       0.67      0.68      0.67      4656
weighted avg       0.93      0.93      0.93      4656



{'roc_auc_score': 0.6753641076063803,
 'precision_score': 0.3849056603773585,
 'accuracy_score': 0.9304123711340206,
 'confusion_matrix': 1164.0,
 'classification_report': None}