In [12]:
from Dataset.Dataset import CSVLoader, NumpyDataset
from compoundFeaturization.rdkitFingerprints import MorganFingerprint
from featureSelection.baseFeatureSelector import LowVarianceFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report
from parameterOptimization.HyperparameterOpt import GridHyperparamOpt

from sklearn.ensemble import RandomForestClassifier

import numpy as np
import random

In [2]:
#Load Dataset
dataset = CSVLoader('preprocessed_dataset_wfoodb.csv', 'Smiles', ['Class'], 'ID')#, chunk_size=2000)
print(dataset.get_shape())

(23290,) (23290,) (0,) (23290,)
((23290,), (23290,), (0,), (23290,))


In [3]:
#Separate sweeteners from non sweeteners
sweets_indexes = np.where(dataset.y == 1)
sweets = dataset.select(sweets_indexes)

non_sweets_indexes = np.where(dataset.y == 0)
non_sweets = dataset.select(non_sweets_indexes)

In [4]:
#Ratio between sweeteners and non sweeteners
n_balanced_datasets = int(np.floor(len(non_sweets.y)/len(sweets.y)))
n_balanced_datasets

16

In [6]:
indexes = list(range(0, len(non_sweets.y)))

def partition (list_in, n):
    random.shuffle(list_in)
    return [list_in[i::n] for i in range(n)]

dataset_indexes = partition(indexes, n_balanced_datasets)


# TODO: Check merge method
balanced_datasets = []
for i, di in enumerate(dataset_indexes):
    ds = non_sweets.select(di)
    ds = ds.merge([sweets])
    balanced_datasets.append(ds)
    

In [14]:
for dataset in balanced_datasets:
    #Featurization
    #dataset = balanced_datasets[0]
    dataset = MorganFingerprint().featurize(dataset)
    dataset.get_shape()

    #Feature Selection
    dataset = LowVarianceFS(0.15).featureSelection(dataset)
    dataset.get_shape()

    #Data Split
    splitter = SingletaskStratifiedSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset=dataset, frac_train=0.6, 
                                                                                 frac_valid=0.2, frac_test=0.2)

    #metrics
    metrics = [Metric(roc_auc_score), Metric(precision_score), Metric(accuracy_score), Metric(confusion_matrix), 
               Metric(classification_report)]

    #Build a model function for hyperparameter optimization
    def rf_model_builder(n_estimators, max_features, class_weight, model_dir=None):
        rf_model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, 
                                          class_weight=class_weight)
        return SklearnModel(rf_model, model_dir)

    params_dict_rf = {"n_estimators": [10, 100],
                      "max_features": ["auto", "sqrt", "log2", None],
                      "class_weight": [{0: 1., 1: 1.}, {0: 1., 1: 5}, {0: 1., 1: 10}]
                      }

    #Hyperparameter Optimization
    optimizer = GridHyperparamOpt(rf_model_builder)

    best_rf, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict_rf, train_dataset, 
                                                                         valid_dataset, Metric(roc_auc_score))

    print('#################')
    print(best_hyperparams)
    print(best_rf)

    #Evaluate best model
    best_rf.evaluate(test_dataset, metrics)

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8098237124092637
Model 1/15, Metric roc_auc_score, Validation set 1: 0.809824
	best_validation_score so far: 0.809824
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.835361216730038
Model 2/15, Metric roc_auc_score, Validation set 2: 0.835361
	best_validation_score so far: 0.835361
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8147805046664361
Model 3/15, Metric roc_auc_score, Validation set 3: 0.814781
	best_validation_score so far: 0.835361
Fitting model 4/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1:

roc_auc_score: 
 0.8180850328378846
Model 14/15, Metric roc_auc_score, Validation set 14: 0.818085
	best_validation_score so far: 0.829326
Fitting model 15/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8033736605599722
Model 15/15, Metric roc_auc_score, Validation set 15: 0.803374
	best_validation_score so far: 0.829326
roc_auc_score: 
 0.9642223495490735
Best hyperparameters: (100, 'log2', {0: 1.0, 1: 1.0})
train_score: 0.964222
validation_score: 0.829326
#################
(100, 'log2', {0: 1.0, 1: 1.0})
SklearnModel(model=RandomForestClassifier(class_weight={0: 1.0, 1: 1.0},
                                          max_features='log2'),
             model_dir='/tmp/tmpvwjv6xd9')
roc_auc_score: 
 0.7743657103352921
precision_score: 
 0.7406143344709898
accuracy_score: 
 0.7732342007434945
confusion_matrix: 
 [[199  76]
 [ 46 217]]
classification_report: 
               precision    recall  f1-score   support

     

RDKit ERROR: [17:09:20] Explicit valence for atom # 3 B, 4, is greater than permitted


error in smile: OB1O[B]2(O)OB(O)O[B](O)(O1)O2
Featurizing datapoint 2000
Elements with indexes:  [1259]  were removed due to the presence of NAs!
The elements in question are:  ['OB1O[B]2(O)OB(O)O[B](O)(O1)O2']
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8048669201520912
Model 1/15, Metric roc_auc_score, Validation set 1: 0.804867
	best_validation_score so far: 0.804867
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.7969305219495334
Model 2/15, Metric roc_auc_score, Validation set 2: 0.796931
	best_validation_score so far: 0.804867
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8063532665053579
Model 3/15, Metric roc_auc_score, Validation set 3: 0.806353
	best_validation

roc_auc_score: 
 0.8211264100862641
Model 13/15, Metric roc_auc_score, Validation set 13: 0.821126
	best_validation_score so far: 0.822606
Fitting model 14/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8019105286441053
Model 14/15, Metric roc_auc_score, Validation set 14: 0.801911
	best_validation_score so far: 0.822606
Fitting model 15/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8110346162353461
Model 15/15, Metric roc_auc_score, Validation set 15: 0.811035
	best_validation_score so far: 0.822606
roc_auc_score: 
 0.9593446601941747
Best hyperparameters: (100, 'sqrt', {0: 1.0, 1: 10})
train_score: 0.959345
validation_score: 0.822606
#################
(100, 'sqrt', {0: 1.0, 1: 10})
SklearnModel(model=RandomForestClassifier(class_weight={0: 1.0, 1: 10},
                                          max_features='sqrt'),
             model_dir='/tmp/tmp

roc_auc_score: 
 0.8393054633930547
Model 8/15, Metric roc_auc_score, Validation set 8: 0.839305
	best_validation_score so far: 0.839305
Fitting model 9/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8301122539261225
Model 9/15, Metric roc_auc_score, Validation set 9: 0.830112
	best_validation_score so far: 0.839305
Fitting model 10/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.822536496350365
Model 10/15, Metric roc_auc_score, Validation set 10: 0.822536
	best_validation_score so far: 0.839305
Fitting model 11/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8282874364078745
Model 11/15, Metric roc_auc_score, Validation set 11: 0.828287
	best_validation_score so far: 0.839305
Fitting model 12/15
hyperparameters: {'n_estimators': 100, 'max_features': 'log2', 'class_weight': {0: 

roc_auc_score: 
 0.7873418596612513
Model 6/15, Metric roc_auc_score, Validation set 6: 0.787342
	best_validation_score so far: 0.789575
Fitting model 7/15
hyperparameters: {'n_estimators': 10, 'max_features': None, 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.7908123055651572
Model 7/15, Metric roc_auc_score, Validation set 7: 0.790812
	best_validation_score so far: 0.790812
Fitting model 8/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8076736951261667
Model 8/15, Metric roc_auc_score, Validation set 8: 0.807674
	best_validation_score so far: 0.807674
Fitting model 9/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8169305219495333
Model 9/15, Metric roc_auc_score, Validation set 9: 0.816931
	best_validation_score so far: 0.816931
Fitting model 10/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 

RDKit ERROR: [17:10:13] Explicit valence for atom # 0 P, 11, is greater than permitted


error in smile: [P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C
Featurizing datapoint 2000
Elements with indexes:  [1218]  were removed due to the presence of NAs!
The elements in question are:  ['[P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C']
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8151123401313516
Model 1/15, Metric roc_auc_score, Validation set 1: 0.815112
	best_validation_score so far: 0.815112
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8039543726235742
Model 2/15, Metric roc_auc_score, Validation set 2: 0.803954
	best_validation_score so far: 0.815112
Fitting model 3/15
hyperparameters: {'n_estimators': 10

roc_auc_score: 
 0.8200895819508958
Model 13/15, Metric roc_auc_score, Validation set 13: 0.820090
	best_validation_score so far: 0.827527
Fitting model 14/15
hyperparameters: {'n_estimators': 100, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8113802256138023
Model 14/15, Metric roc_auc_score, Validation set 14: 0.811380
	best_validation_score so far: 0.827527
Fitting model 15/15
hyperparameters: {'n_estimators': 100, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8060440168104401
Model 15/15, Metric roc_auc_score, Validation set 15: 0.806044
	best_validation_score so far: 0.827527
roc_auc_score: 
 0.9617922158917225
Best hyperparameters: (100, 'sqrt', {0: 1.0, 1: 1.0})
train_score: 0.961792
validation_score: 0.827527
#################
(100, 'sqrt', {0: 1.0, 1: 1.0})
SklearnModel(model=RandomForestClassifier(class_weight={0: 1.0, 1: 1.0},
                                          max_features='sqrt'),
             model_dir='/t

roc_auc_score: 
 0.8061822605618225
Model 9/15, Metric roc_auc_score, Validation set 9: 0.806182
	best_validation_score so far: 0.806182
Fitting model 10/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8059748949347489
Model 10/15, Metric roc_auc_score, Validation set 10: 0.805975
	best_validation_score so far: 0.806182
Fitting model 11/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8136197743861977
Model 11/15, Metric roc_auc_score, Validation set 11: 0.813620
	best_validation_score so far: 0.813620
Fitting model 12/15
hyperparameters: {'n_estimators': 100, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8113111037381111
Model 12/15, Metric roc_auc_score, Validation set 12: 0.811311
	best_validation_score so far: 0.813620
Fitting model 13/15
hyperparameters: {'n_estimators': 100, 'max_features': 'log2', 'class_weight':

RDKit ERROR: [17:10:46] Explicit valence for atom # 1 Cl, 9, is greater than permitted


error in smile: O=[Cl-](=O)(=O)=O
Featurizing datapoint 1000
Featurizing datapoint 2000
Elements with indexes:  [384]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl-](=O)(=O)=O']
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.7740544127405442
Model 1/15, Metric roc_auc_score, Validation set 1: 0.774054
	best_validation_score so far: 0.774054
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.7869663791196637
Model 2/15, Metric roc_auc_score, Validation set 2: 0.786966
	best_validation_score so far: 0.786966
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.7737779252377792
Model 3/15, Metric roc_auc_score, Validation set 3: 0.773778
	best_validat

RDKit ERROR: [17:10:55] Explicit valence for atom # 1 Cl, 4, is greater than permitted


error in smile: O=[Cl]=O
Featurizing datapoint 1000
Featurizing datapoint 2000
Elements with indexes:  [697]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl]=O']
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.7402399911523999
Model 1/15, Metric roc_auc_score, Validation set 1: 0.740240
	best_validation_score so far: 0.740240
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.7576310550763106
Model 2/15, Metric roc_auc_score, Validation set 2: 0.757631
	best_validation_score so far: 0.757631
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.7579075425790754
Model 3/15, Metric roc_auc_score, Validation set 3: 0.757908
	best_validation_score so far: 

roc_auc_score: 
 0.776432205264322
Model 13/15, Metric roc_auc_score, Validation set 13: 0.776432
	best_validation_score so far: 0.783870
Fitting model 14/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.7816301703163018
Model 14/15, Metric roc_auc_score, Validation set 14: 0.781630
	best_validation_score so far: 0.783870
Fitting model 15/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.7942656491926564
Model 15/15, Metric roc_auc_score, Validation set 15: 0.794266
	best_validation_score so far: 0.794266
roc_auc_score: 
 0.9586385721505455
Best hyperparameters: (100, None, {0: 1.0, 1: 10})
train_score: 0.958639
validation_score: 0.794266
#################
(100, None, {0: 1.0, 1: 10})
SklearnModel(model=RandomForestClassifier(class_weight={0: 1.0, 1: 10},
                                          max_features=None),
             model_dir='/tmp/tmp9rkzfa9c