In [2]:
from loaders.Loaders import CSVLoader
from compoundFeaturization.rdkitFingerprints import MorganFingerprint
from featureSelection.baseFeatureSelector import LowVarianceFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report
from parameterOptimization.HyperparameterOpt import HyperparamOpt_Valid, HyperparamOpt_CV

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import sklearn

import numpy as np
import pandas as pd
import random

In [3]:
#Load Dataset
dataset = CSVLoader(dataset_path='preprocessed_dataset_wfoodb.csv', 
                    mols_field='Smiles', 
                    labels_fields='Class', 
                    id_field='ID')#, shard_size=4000)
dataset = dataset.create_dataset()
dataset.get_shape()

Mols_shape:  23290
Features_shape:  X not defined!
Labels_shape:  (23290,)


In [4]:
#Separate sweeteners from non sweeteners
sweets_indexes = np.where(dataset.y == 1)
#test_sweets_indexes = sweets_indexes[0][:int(len(sweets_indexes[0])*0.1)]
#sweets_indexes = sweets_indexes[0][int(len(sweets_indexes[0])*0.1):]
test_sweets_indexes = sweets_indexes[0][:130]
sweets_indexes = sweets_indexes[0][130:]

sweets = dataset.select(sweets_indexes)
sweets_test = dataset.select(test_sweets_indexes)

non_sweets_indexes = np.where(dataset.y == 0)
#test_non_sweets_indexes = non_sweets_indexes[0][:int(len(non_sweets_indexes[0])*0.15)]
#non_sweets_indexes = non_sweets_indexes[0][int(len(non_sweets_indexes[0])*0.15):]
test_non_sweets_indexes = non_sweets_indexes[0][:130]
non_sweets_indexes = non_sweets_indexes[0][130:]

non_sweets = dataset.select(non_sweets_indexes)
non_sweets_test = dataset.select(test_non_sweets_indexes)

In [5]:
#Ratio between sweeteners and non sweeteners
n_balanced_datasets = int(np.floor(len(non_sweets.y)/len(sweets.y)))
n_balanced_datasets

18

In [6]:
indexes = list(range(0, len(non_sweets.y)))

def partition (list_in, n):
    random.shuffle(list_in)
    return [list_in[i::n] for i in range(n)]

dataset_indexes = partition(indexes, n_balanced_datasets)


balanced_datasets = []
for i, di in enumerate(dataset_indexes):
    ds = non_sweets.select(di)
    ds = ds.merge([sweets])
    balanced_datasets.append(ds)
    
test_bl_dataset = sweets_test.merge([non_sweets_test])
test_bl_dataset = MorganFingerprint().featurize(test_bl_dataset)
test_bl_dataset.get_shape()
#test_bl_dataset = LowVarianceFS(0.15).featureSelection(test_bl_dataset)
#test_bl_dataset.get_shape()
#test_bl_dataset.features2keep

Featurizing datapoint 0
Mols_shape:  260
Features_shape:  (260, 1024)
Labels_shape:  (260,)


In [9]:
estimators = []
predictions = [0]*len(test_bl_dataset.X)

for i, dataset in enumerate(balanced_datasets):
    #Featurization
    #dataset = balanced_datasets[0]
    dataset = MorganFingerprint().featurize(dataset)
    dataset.get_shape()

    #Feature Selection
    #dataset = LowVarianceFS(0.15).featureSelection(dataset)
    #dataset.get_shape()

    #Data Split
    splitter = SingletaskStratifiedSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset=dataset, frac_train=0.6, 
                                                                                 frac_valid=0.2, frac_test=0.2)

    #metrics
    metrics = [Metric(roc_auc_score), Metric(precision_score), Metric(accuracy_score), Metric(confusion_matrix), 
               Metric(classification_report)]

    #Build a model function for hyperparameter optimization
    def rf_model_builder(n_estimators, max_features, class_weight):
        rf_model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, 
                                          class_weight=class_weight)
        return rf_model

    params_dict_rf = {"n_estimators": [10, 100],
                      "max_features": ["auto", "sqrt", "log2", None],
                      "class_weight": [{0: 1., 1: 1.}, {0: 1., 1: 5}, {0: 1., 1: 10}]
                      }

    #Hyperparameter Optimization
    optimizer = HyperparamOpt_Valid(rf_model_builder)

    best_rf, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict_rf, train_dataset, 
                                                                         valid_dataset, Metric(roc_auc_score))

    print('#################')
    print(best_hyperparams)
    print(best_rf)

    #Evaluate best model
    best_rf.evaluate(test_dataset, metrics)
    
    prdt = best_rf.predict(test_bl_dataset)
    for j, p in enumerate(prdt):
        predictions[j] += p[0]
        
    estimator_name = 'rf_'+str(i)
    estimators.append((estimator_name, best_rf))

    
predictions = [x / len(estimators) for x in predictions]
print(predictions)

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Mols_shape:  2406
Features_shape:  (2406, 1024)
Labels_shape:  (2406,)
MODE:  classification
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8314580403187999
Model 1/15, Metric roc_auc_score, Validation set 1: 0.831458
	best_validation_score so far: 0.831458
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8274990883992291
Model 2/15, Metric roc_auc_score, Validation set 2: 0.827499
	best_validation_score so far: 0.831458
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike objec

expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8187894992707827
Model 8/15, Metric roc_auc_score, Validation set 8: 0.818789
	best_validation_score so far: 0.865216
Fitting model 9/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8794013473157858
Model 9/15, Metric roc_auc_score, Validation set 9: 0.879401
	best_validation_score so far: 0.879401
Fitting model 10/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8751649420098618
Model 10/15, Metric roc_auc_score, Validation set 10: 0.875165
	best_validation_score so far: 0.879401
Fitting model 11/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0

RDKit ERROR: [13:14:57] Explicit valence for atom # 0 P, 11, is greater than permitted


error in smile: [P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C
Featurizing datapoint 1000
Featurizing datapoint 2000
Elements with indexes:  [728]  were removed due to the presence of NAs!
The elements in question are:  ['[P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C']
Mols_shape:  2405
Features_shape:  (2405, 1024)
Labels_shape:  (2405,)
MODE:  classification
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8444162789082575
Model 1/15, Metric roc_auc_score, Validation set 1: 0.844416
	best_validation_score so far: 0.844416
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, by

expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8651612231077772
Model 7/15, Metric roc_auc_score, Validation set 7: 0.865161
	best_validation_score so far: 0.867323
Fitting model 8/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8755534718966504
Model 8/15, Metric roc_auc_score, Validation set 8: 0.875553
	best_validation_score so far: 0.875553
Fitting model 9/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8711777881960723
Model 9/15, Metric roc_auc_score, Validation set 9: 0.871178
	best_validation_score so far: 0.875553
Fitting model 10/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.

expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8251614695464964
Model 15/15, Metric roc_auc_score, Validation set 15: 0.825161
	best_validation_score so far: 0.863150
roc_auc_score: 
 0.9979395604395604
Best hyperparameters: (100, 'log2', {0: 1.0, 1: 10})
train_score: 0.997940
validation_score: 0.863150
#################
(100, 'log2', {0: 1.0, 1: 10})
SklearnModel(mode='/tmp/tmppdwimkx6',
             model=RandomForestClassifier(class_weight={0: 1.0, 1: 10},
                                          max_features='log2'))
roc_auc_score: 
 0.8696410897536073
precision_score: 
 0.8198529411764706
accuracy_score: 
 0.86875
confusion_matrix: 
 [[194  49]
 [ 14 223]]
classification_report: 
               precision    recall  f1-score   support

           0       0.93      0.80      0.86       243
           1       0.82      0.94      0.88       237

    accuracy                           0.87       480
   macro avg       0.88      0.87      0.87       480
weig

RDKit ERROR: [13:16:25] Explicit valence for atom # 1 Cl, 4, is greater than permitted


Featurizing datapoint 1000
Featurizing datapoint 2000
Elements with indexes:  [45]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl]=O']
Mols_shape:  2404
Features_shape:  (2404, 1024)
Labels_shape:  (2404,)
MODE:  classification
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8257517883186332
Model 1/15, Metric roc_auc_score, Validation set 1: 0.825752
	best_validation_score so far: 0.825752
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8403534967706091
Model 2/15, Metric roc_auc_score, Validation set 2: 0.840353
	best_validation_score so far: 0.840353
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_f

expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8693138412389749
Model 8/15, Metric roc_auc_score, Validation set 8: 0.869314
	best_validation_score so far: 0.869314
Fitting model 9/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8691402180706993
Model 9/15, Metric roc_auc_score, Validation set 9: 0.869140
	best_validation_score so far: 0.869314
Fitting model 10/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8692096673380096
Model 10/15, Metric roc_auc_score, Validation set 10: 0.869210
	best_validation_score so far: 0.869314
Fitting model 11/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 10}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 


Featurizing datapoint 1000
Featurizing datapoint 2000
Mols_shape:  2405
Features_shape:  (2405, 1024)
Labels_shape:  (2405,)
MODE:  classification
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.7940042715007555
Model 1/15, Metric roc_auc_score, Validation set 1: 0.794004
	best_validation_score so far: 0.794004
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8278116372349844
Model 2/15, Metric roc_auc_score, Validation set 2: 0.827812
	best_validation_score so far: 0.827812
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_

expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8693832905062853
Model 8/15, Metric roc_auc_score, Validation set 8: 0.869383
	best_validation_score so far: 0.869383
Fitting model 9/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8567782484894785
Model 9/15, Metric roc_auc_score, Validation set 9: 0.856778
	best_validation_score so far: 0.869383
Fitting model 10/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8525071185498994
Model 10/15, Metric roc_auc_score, Validation set 10: 0.852507
	best_validation_score so far: 0.869383
Fitting model 11/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 


Featurizing datapoint 1000
Featurizing datapoint 2000
Mols_shape:  2405
Features_shape:  (2405, 1024)
Labels_shape:  (2405,)
MODE:  classification
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8349541539316476
Model 1/15, Metric roc_auc_score, Validation set 1: 0.834954
	best_validation_score so far: 0.834954
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 10}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8402681300361211
Model 2/15, Metric roc_auc_score, Validation set 2: 0.840268
	best_validation_score so far: 0.840268
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_au

expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8546774081533439
Model 9/15, Metric roc_auc_score, Validation set 9: 0.854677
	best_validation_score so far: 0.860910
Fitting model 10/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8546426835196889
Model 10/15, Metric roc_auc_score, Validation set 10: 0.854643
	best_validation_score so far: 0.860910
Fitting model 11/15
hyperparameters: {'n_estimators': 100, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8547468574206543
Model 11/15, Metric roc_auc_score, Validation set 11: 0.854747
	best_validation_score so far: 0.860910
Fitting model 12/15
hyperparameters: {'n_estimators': 100, 'max_features': 'log2', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_scor

Featurizing datapoint 1000
Featurizing datapoint 2000
Mols_shape:  2405
Features_shape:  (2405, 1024)
Labels_shape:  (2405,)
MODE:  classification
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8220034380371933
Model 1/15, Metric roc_auc_score, Validation set 1: 0.822003
	best_validation_score so far: 0.822003
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8401052247747045
Model 2/15, Metric roc_auc_score, Validation set 2: 0.840105
	best_validation_score so far: 0.840105
Fitting model 3/15
hyperparameters: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_au

RDKit ERROR: [13:22:24] Explicit valence for atom # 1 Cl, 9, is greater than permitted


error in smile: O=[Cl-](=O)(=O)=O
Featurizing datapoint 1000
Featurizing datapoint 2000
Elements with indexes:  [666]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl-](=O)(=O)=O']
Mols_shape:  2404
Features_shape:  (2404, 1024)
Labels_shape:  (2404,)
MODE:  classification
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8276095562191819
Model 1/15, Metric roc_auc_score, Validation set 1: 0.827610
	best_validation_score so far: 0.827610
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8650079866657407
Model 2/15, Metric roc_auc_score, Validation set 2: 0.865008
	best_validation_score so far: 0.865008
Fitting model 3/15


RDKit ERROR: [13:23:03] Explicit valence for atom # 3 B, 4, is greater than permitted


error in smile: OB1O[B]2(O)OB(O)O[B](O)(O1)O2
Featurizing datapoint 1000
Featurizing datapoint 2000
Elements with indexes:  [809]  were removed due to the presence of NAs!
The elements in question are:  ['OB1O[B]2(O)OB(O)O[B](O)(O1)O2']
Mols_shape:  2404
Features_shape:  (2404, 1024)
Labels_shape:  (2404,)
MODE:  classification
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8217584554482951
Model 1/15, Metric roc_auc_score, Validation set 1: 0.821758
	best_validation_score so far: 0.821758
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.7986839363844711
Model 2/15, Metric roc_auc_score, Validation set 2: 0.798684
	best_validation_score so far: 0.82

expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8171589310829817
Model 8/15, Metric roc_auc_score, Validation set 8: 0.817159
	best_validation_score so far: 0.836094
Fitting model 9/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8445329999479085
Model 9/15, Metric roc_auc_score, Validation set 9: 0.844533
	best_validation_score so far: 0.844533
Fitting model 10/15
hyperparameters: {'n_estimators': 100, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.8484919518674793
Model 10/15, Metric roc_auc_score, Validation set 10: 0.848492
	best_validation_score so far: 0.848492
Fitting model 11/15
hyperparameters: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 

In [10]:
int_preds = []

for p in predictions:
    if p >= 0.5:
        int_preds.append(0)
    else: int_preds.append(1)
        
print(int_preds)
print(test_bl_dataset.y)

[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [11]:
print('precision_score:\n ', sklearn.metrics.precision_score(test_bl_dataset.y, int_preds))

print('accuracy_score:\n ', sklearn.metrics.accuracy_score(test_bl_dataset.y, int_preds))

print('confusion_matrix:\n ', sklearn.metrics.confusion_matrix(test_bl_dataset.y, int_preds))

print('classification_report:\n ', sklearn.metrics.classification_report(test_bl_dataset.y, int_preds))

precision_score:
  0.855072463768116
accuracy_score:
  0.8769230769230769
confusion_matrix:
  [[110  20]
 [ 12 118]]
classification_report:
                precision    recall  f1-score   support

           0       0.90      0.85      0.87       130
           1       0.86      0.91      0.88       130

    accuracy                           0.88       260
   macro avg       0.88      0.88      0.88       260
weighted avg       0.88      0.88      0.88       260



In [None]:
#TODO: implement voing classifier in our pipeline
#ensemble = VotingClassifier(estimators, voting='hard')

#ensemble.fit(train_dataset.X, train_dataset.y)

#ensemble.score(test_dataset.X, test_dataset.y)