In [1]:
from Dataset.Dataset import CSVLoader, NumpyDataset
from compoundFeaturization.rdkitFingerprints import MorganFingerprint
from featureSelection.baseFeatureSelector import LowVarianceFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report
from parameterOptimization.HyperparameterOpt import GridHyperparamOpt

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import sklearn

import numpy as np
import pandas as pd
import random

In [2]:
# Load the preprocessed dataset from CSV.
# The positional arguments are passed exactly as before; presumably they are the
# file path, the SMILES column, the label column(s) and the ID column --
# TODO confirm against the CSVLoader signature.
# (A chunk_size=2000 argument was tried earlier and is left disabled.)
dataset = CSVLoader(
    'preprocessed_dataset_wfoodb.csv',
    'Smiles',
    ['Class'],
    'ID',
)

# Show the shape tuple reported by the loaded dataset.
dataset_shape = dataset.get_shape()
print(dataset_shape)

(23290,) (23290,) (0,) (23290,)
((23290,), (23290,), (0,), (23290,))


In [3]:
# Separate sweeteners (label 1) from non-sweeteners (label 0) and hold out a
# fixed number of molecules of each class for the final ensemble test set.
#
# The hold-out is the FIRST N_TEST_PER_CLASS rows of each class in file order
# (no shuffling happens here).
# A fractional hold-out (10% of sweeteners / 15% of non-sweeteners) was used
# in an earlier version of this cell and replaced by the fixed count below.
N_TEST_PER_CLASS = 130

# Row positions of every sweetener in the full dataset.
all_sweets_indexes = np.flatnonzero(dataset.y == 1)
test_sweets_indexes = all_sweets_indexes[:N_TEST_PER_CLASS]
sweets_indexes = all_sweets_indexes[N_TEST_PER_CLASS:]

sweets = dataset.select(sweets_indexes)
sweets_test = dataset.select(test_sweets_indexes)

# Row positions of every non-sweetener in the full dataset.
all_non_sweets_indexes = np.flatnonzero(dataset.y == 0)
test_non_sweets_indexes = all_non_sweets_indexes[:N_TEST_PER_CLASS]
non_sweets_indexes = all_non_sweets_indexes[N_TEST_PER_CLASS:]

non_sweets = dataset.select(non_sweets_indexes)
non_sweets_test = dataset.select(test_non_sweets_indexes)

In [4]:
# How many times the sweetener set fits into the non-sweetener set (rounded
# down). This is the number of balanced sub-datasets built further below.
n_non_sweets = len(non_sweets.y)
n_sweets = len(sweets.y)
n_balanced_datasets = n_non_sweets // n_sweets
n_balanced_datasets

18

In [5]:
# Positions 0..N-1 of every non-sweetener left after the hold-out split.
indexes = [*range(len(non_sweets.y))]

def partition(list_in, n, seed=None):
    """Randomly split ``list_in`` into ``n`` disjoint, near-equal-sized lists.

    The elements are shuffled and then dealt round-robin: sub-list ``i``
    receives every ``n``-th shuffled element starting at offset ``i``, so the
    sub-list sizes differ by at most one and together contain every element
    exactly once.

    Parameters
    ----------
    list_in : sequence
        Elements to distribute. It is copied first, so the caller's object is
        NOT reordered.
    n : int
        Number of sub-lists to produce; must be >= 1.
    seed : optional
        When given, a private ``random.Random(seed)`` drives the shuffle so
        the split is reproducible. When ``None`` (default) the module-level
        ``random`` state is used.

    Returns
    -------
    list of list
        ``n`` lists of elements.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    # Work on a copy: shuffling in place would silently reorder the caller's list.
    shuffled = list(list_in)
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(shuffled)

    return [shuffled[offset::n] for offset in range(n)]

# Deal the non-sweetener positions into n_balanced_datasets random groups.
dataset_indexes = partition(indexes, n_balanced_datasets)

# Each balanced dataset = one group of non-sweeteners merged with the full
# (non-held-out) sweetener set.
balanced_datasets = [
    non_sweets.select(group_indexes).merge([sweets])
    for group_indexes in dataset_indexes
]

# Hold-out set used to score the ensemble: held-out sweeteners merged with
# held-out non-sweeteners, featurized with Morgan fingerprints.
merged_holdout = sweets_test.merge([non_sweets_test])
test_bl_dataset = MorganFingerprint().featurize(merged_holdout)
test_bl_dataset.get_shape()
# Low-variance feature selection on the hold-out set is currently disabled:
#test_bl_dataset = LowVarianceFS(0.15).featureSelection(test_bl_dataset)
#test_bl_dataset.get_shape()
#test_bl_dataset.features2keep

Featurizing datapoint 0


((260,), (260,), (260, 1024), (260,))

In [6]:
# Train one tuned random forest per balanced dataset and average their
# predictions on the shared hold-out set (test_bl_dataset).

estimators = []
# Running per-molecule sum of element 0 of every model's prediction on the
# hold-out set; divided by the number of models after the loop.
prediction_sums = [0]*len(test_bl_dataset.X)

# ---- Loop-invariant pieces (defined once instead of on every iteration) ----

# Metrics reported for each model on its own internal test split.
metrics = [Metric(roc_auc_score), Metric(precision_score), Metric(accuracy_score), Metric(confusion_matrix),
           Metric(classification_report)]


def rf_model_builder(n_estimators, max_features, class_weight, model_dir=None):
    """Build a SklearnModel wrapping a RandomForestClassifier.

    Used as the model factory for GridHyperparamOpt; the keyword names match
    the keys of ``params_dict_rf``.
    """
    rf_model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features,
                                      class_weight=class_weight)
    return SklearnModel(rf_model, model_dir)


# Hyperparameter search space.
# NOTE(review): per the scikit-learn documentation, max_features='auto' is
# deprecated (1.1) and later removed, and for classifiers it meant the same as
# 'sqrt'. It is kept here so the search space is unchanged; drop it when
# upgrading scikit-learn.
params_dict_rf = {"n_estimators": [10, 100],
                  "max_features": ["auto", "sqrt", "log2", None],
                  "class_weight": [{0: 1., 1: 1.}, {0: 1., 1: 5}, {0: 1., 1: 10}]
                  }

# The loop variable is deliberately NOT called `dataset`: that name holds the
# full dataset loaded at the top of the notebook, and overwriting it here would
# make earlier cells operate on a balanced subset when re-run.
for i, balanced_dataset in enumerate(balanced_datasets):
    #Featurization
    featurized_dataset = MorganFingerprint().featurize(balanced_dataset)
    featurized_dataset.get_shape()  # return value is discarded

    #Feature Selection (currently disabled)
    #featurized_dataset = LowVarianceFS(0.15).featureSelection(featurized_dataset)
    #featurized_dataset.get_shape()

    #Data Split
    splitter = SingletaskStratifiedSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset=featurized_dataset,
                                                                                 frac_train=0.6,
                                                                                 frac_valid=0.2,
                                                                                 frac_test=0.2)

    #Hyperparameter Optimization (model selected by ROC-AUC on the validation split)
    optimizer = GridHyperparamOpt(rf_model_builder)

    best_rf, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict_rf, train_dataset,
                                                                         valid_dataset, Metric(roc_auc_score))

    print('#################')
    print(best_hyperparams)
    print(best_rf)

    #Evaluate best model on this balanced dataset's own test split
    best_rf.evaluate(test_dataset, metrics)

    # Accumulate element 0 of each hold-out prediction.
    # NOTE(review): presumably predict() returns per-class scores and element 0
    # is the class-0 score -- the thresholding cell further down maps an
    # average >= 0.5 to label 0, which is consistent with that. Confirm
    # against SklearnModel.predict.
    prdt = best_rf.predict(test_bl_dataset)
    for j, p in enumerate(prdt):
        prediction_sums[j] += p[0]

    estimator_name = 'rf_'+str(i)
    estimators.append((estimator_name, best_rf))


# Average over all trained models.
predictions = [x / len(estimators) for x in prediction_sums]
print(predictions)

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8442725425847788
Model 1/15, Metric roc_auc_score, Validation set 1: 0.844273
	best_validation_score so far: 0.844273
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8447413658384124
Model 2/15, Metric roc_auc_score, Validation set 2: 0.844741
	best_validation_score so far: 0.844741
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8694327238631037
Model 3/15, Metric roc_auc_score, Validation set 3: 0.869433
	best_validation_score so far: 0.869433
Fitting model 4/15
hyperparameters: {'n_estimators': 10, 'max_features': 'log2', 'class_weight': {0: 1.0, 1:

roc_auc_score: 
 0.871561544873576
Model 14/15, Metric roc_auc_score, Validation set 14: 0.871562
	best_validation_score so far: 0.875799
Fitting model 15/15
hyperparameters: {'n_estimators': 100, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8713531536537927
Model 15/15, Metric roc_auc_score, Validation set 15: 0.871353
	best_validation_score so far: 0.875799
roc_auc_score: 
 0.9972527472527473
Best hyperparameters: (100, 'log2', {0: 1.0, 1: 1.0})
train_score: 0.997253
validation_score: 0.875799
#################
(100, 'log2', {0: 1.0, 1: 1.0})
SklearnModel(model=RandomForestClassifier(class_weight={0: 1.0, 1: 1.0},
                                          max_features='log2'),
             model_dir='/tmp/tmp3_wx1ecy')
roc_auc_score: 
 0.8567782484894785
precision_score: 
 0.8141263940520446
accuracy_score: 
 0.85625
confusion_matrix: 
 [[192  50]
 [ 19 219]]
classification_report: 
               precision    recall  f1-score   support

           0   

RDKit ERROR: [13:02:46] Explicit valence for atom # 3 B, 4, is greater than permitted


error in smile: OB1O[B]2(O)OB(O)O[B](O)(O1)O2
Featurizing datapoint 1000
Featurizing datapoint 2000
Elements with indexes:  [645]  were removed due to the presence of NAs!
The elements in question are:  ['OB1O[B]2(O)OB(O)O[B](O)(O1)O2']
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8054904412147731
Model 1/15, Metric roc_auc_score, Validation set 1: 0.805490
	best_validation_score so far: 0.805490
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.807079231129864
Model 2/15, Metric roc_auc_score, Validation set 2: 0.807079
	best_validation_score so far: 0.807079
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8117153721935719
Model 3/15, Metric roc_auc_score, Validation set 

roc_auc_score: 
 0.8588842006563526
Model 13/15, Metric roc_auc_score, Validation set 13: 0.858884
	best_validation_score so far: 0.858884
Fitting model 14/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8506016565088295
Model 14/15, Metric roc_auc_score, Validation set 14: 0.850602
	best_validation_score so far: 0.858884
Fitting model 15/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8293483356774496
Model 15/15, Metric roc_auc_score, Validation set 15: 0.829348
	best_validation_score so far: 0.858884
roc_auc_score: 
 0.9965706447187929
Best hyperparameters: (100, 'log2', {0: 1.0, 1: 5})
train_score: 0.996571
validation_score: 0.858884
#################
(100, 'log2', {0: 1.0, 1: 5})
SklearnModel(model=RandomForestClassifier(class_weight={0: 1.0, 1: 5},
                                          max_features='log2'),
             model_dir='/tmp/tmphp

RDKit ERROR: [13:04:37] Explicit valence for atom # 0 P, 11, is greater than permitted


error in smile: [P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C
Featurizing datapoint 1000
Featurizing datapoint 2000
Elements with indexes:  [761]  were removed due to the presence of NAs!
The elements in question are:  ['[P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C']
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8239634696853948
Model 1/15, Metric roc_auc_score, Validation set 1: 0.823963
	best_validation_score so far: 0.823963
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8173831516077504
Model 2/15, Metric roc_auc_score, Validation set 2: 0.817383
	best_validation_score so far: 0.823963
Fitting model 3/15
hyperpa

roc_auc_score: 
 0.8501979304118341
Model 13/15, Metric roc_auc_score, Validation set 13: 0.850198
	best_validation_score so far: 0.867144
Fitting model 14/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8334953816237239
Model 14/15, Metric roc_auc_score, Validation set 14: 0.833495
	best_validation_score so far: 0.867144
Fitting model 15/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8125217028960344
Model 15/15, Metric roc_auc_score, Validation set 15: 0.812522
	best_validation_score so far: 0.867144
roc_auc_score: 
 0.9965706447187929
Best hyperparameters: (100, 'log2', {0: 1.0, 1: 10})
train_score: 0.996571
validation_score: 0.867144
#################
(100, 'log2', {0: 1.0, 1: 10})
SklearnModel(model=RandomForestClassifier(class_weight={0: 1.0, 1: 10},
                                          max_features='log2'),
             model_dir='/tmp/tmpy

RDKit ERROR: [13:06:26] Explicit valence for atom # 1 Cl, 4, is greater than permitted


error in smile: O=[Cl]=O


RDKit ERROR: [13:06:27] Explicit valence for atom # 1 Cl, 9, is greater than permitted


error in smile: O=[Cl-](=O)(=O)=O
Featurizing datapoint 1000
Featurizing datapoint 2000
Elements with indexes:  [188, 968]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl]=O' 'O=[Cl-](=O)(=O)=O']
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8421418154038476
Model 1/15, Metric roc_auc_score, Validation set 1: 0.842142
	best_validation_score so far: 0.842142
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8317070629904855
Model 2/15, Metric roc_auc_score, Validation set 2: 0.831707
	best_validation_score so far: 0.842142
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8587401902909924
Model 3/15, Metric roc_auc_score, Validation set 3: 0.85874

roc_auc_score: 
 0.8399715258004028
Model 13/15, Metric roc_auc_score, Validation set 13: 0.839972
	best_validation_score so far: 0.877370
Fitting model 14/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8500243072435586
Model 14/15, Metric roc_auc_score, Validation set 14: 0.850024
	best_validation_score so far: 0.877370
Fitting model 15/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8331828599208279
Model 15/15, Metric roc_auc_score, Validation set 15: 0.833183
	best_validation_score so far: 0.877370
roc_auc_score: 
 0.9993141289437586
Best hyperparameters: (100, 'sqrt', {0: 1.0, 1: 5})
train_score: 0.999314
validation_score: 0.877370
#################
(100, 'sqrt', {0: 1.0, 1: 5})
SklearnModel(model=RandomForestClassifier(class_weight={0: 1.0, 1: 5},
                                          max_features='sqrt'),
             model_dir='/tmp/tmpcawi

roc_auc_score: 
 0.8549252487367819
Model 8/15, Metric roc_auc_score, Validation set 8: 0.854925
	best_validation_score so far: 0.865370
Fitting model 9/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8652133145804033
Model 9/15, Metric roc_auc_score, Validation set 9: 0.865213
	best_validation_score so far: 0.865370
Fitting model 10/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8632598843569308
Model 10/15, Metric roc_auc_score, Validation set 10: 0.863260
	best_validation_score so far: 0.865370
Fitting model 11/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.857034953378132
Model 11/15, Metric roc_auc_score, Validation set 11: 0.857035
	best_validation_score so far: 0.865370
Fitting model 12/15
hyperparameters: {'n_estimators': 100, 'max_features': 'log2', 'class_weight': {0:

roc_auc_score: 
 0.8093270365997639
Model 4/15, Metric roc_auc_score, Validation set 4: 0.809327
	best_validation_score so far: 0.846587
Fitting model 5/15
hyperparameters: {'n_estimators': 10, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8361865407319953
Model 5/15, Metric roc_auc_score, Validation set 5: 0.836187
	best_validation_score so far: 0.846587
Fitting model 6/15
hyperparameters: {'n_estimators': 10, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8403534967706091
Model 6/15, Metric roc_auc_score, Validation set 6: 0.840353
	best_validation_score so far: 0.846587
Fitting model 7/15
hyperparameters: {'n_estimators': 10, 'max_features': None, 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 0.8210292381415377
Model 7/15, Metric roc_auc_score, Validation set 7: 0.821029
	best_validation_score so far: 0.846587
Fitting model 8/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}

Featurizing datapoint 1000
Featurizing datapoint 2000
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8463822472261291
Model 1/15, Metric roc_auc_score, Validation set 1: 0.846382
	best_validation_score so far: 0.846382
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
roc_auc_score: 
 0.8215346147835599
Model 2/15, Metric roc_auc_score, Validation set 2: 0.821535
	best_validation_score so far: 0.846382
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8253372922852529
Model 3/15, Metric roc_auc_score, Validation set 3: 0.825337
	best_validation_score so far: 0.846382
Fitting model 4/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 10}}
roc_auc_score: 
 

roc_auc_score: 
 0.8235814987151885
Model 14/15, Metric roc_auc_score, Validation set 14: 0.823581
	best_validation_score so far: 0.863150
Fitting model 15/15
hyperparameters: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 5}}
roc_auc_score: 
 0.8251614695464964
Model 15/15, Metric roc_auc_score, Validation set 15: 0.825161
	best_validation_score so far: 0.863150
roc_auc_score: 
 0.9972373132485493
Best hyperparameters: (100, 'auto', {0: 1.0, 1: 1.0})
train_score: 0.997237
validation_score: 0.863150
#################
(100, 'auto', {0: 1.0, 1: 1.0})
SklearnModel(model=RandomForestClassifier(class_weight={0: 1.0, 1: 1.0}),
             model_dir='/tmp/tmpdhwo0s54')
roc_auc_score: 
 0.8777673594832526
precision_score: 
 0.8371212121212122
accuracy_score: 
 0.8770833333333333
confusion_matrix: 
 [[200  43]
 [ 16 221]]
classification_report: 
               precision    recall  f1-score   support

           0       0.93      0.82      0.87       243
           1   

In [7]:
# Turn the averaged ensemble scores into hard class labels.
# `predictions` is the per-molecule mean of element 0 (`p[0]`) of each model's
# prediction, so a high value votes for label 0: >= 0.5 -> 0, otherwise -> 1.
int_preds = [0 if score >= 0.5 else 1 for score in predictions]

print(int_preds)
print(test_bl_dataset.y)

[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [8]:
# Score the ensemble's hard predictions against the hold-out labels, printing
# each scikit-learn metric under its own name.
for metric_name in ('precision_score', 'accuracy_score', 'confusion_matrix', 'classification_report'):
    metric_fn = getattr(sklearn.metrics, metric_name)
    print(metric_name + ':\n ', metric_fn(test_bl_dataset.y, int_preds))

precision_score:
  0.8394160583941606
accuracy_score:
  0.8576923076923076
confusion_matrix:
  [[108  22]
 [ 15 115]]
classification_report:
                precision    recall  f1-score   support

           0       0.88      0.83      0.85       130
           1       0.84      0.88      0.86       130

    accuracy                           0.86       260
   macro avg       0.86      0.86      0.86       260
weighted avg       0.86      0.86      0.86       260



In [9]:
#TODO: implement voting classifier in our pipeline
#ensemble = VotingClassifier(estimators, voting='hard')

#ensemble.fit(train_dataset.X, train_dataset.y)

#ensemble.score(test_dataset.X, test_dataset.y)