In [1]:
from loaders.Loaders import CSVLoader
from compoundFeaturization.rdkitFingerprints import MorganFingerprint
from featureSelection.baseFeatureSelector import LowVarianceFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import r2_score, roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report, f1_score
from parameterOptimization.HyperparameterOpt import HyperparamOpt_CV, HyperparamOpt_Valid

from sklearn.ensemble import RandomForestClassifier

In [2]:
#Load Dataset
dataset = CSVLoader(dataset_path='preprocessed_dataset_wfoodb.csv', 
                    mols_field='Smiles', 
                    labels_fields='Class', 
                    id_field='ID')#, shard_size=4000)
dataset = dataset.create_dataset()
dataset.get_shape()

Mols_shape:  23290
Features_shape:  X not defined!
Labels_shape:  (23290,)


In [3]:
#Featurization
dataset = MorganFingerprint().featurize(dataset)
dataset.get_shape()

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Featurizing datapoint 3000
Featurizing datapoint 4000
Featurizing datapoint 5000
Featurizing datapoint 6000


RDKit ERROR: [11:40:33] Explicit valence for atom # 1 Cl, 4, is greater than permitted


error in smile: O=[Cl]=O
Featurizing datapoint 7000


RDKit ERROR: [11:40:36] Explicit valence for atom # 3 B, 4, is greater than permitted
RDKit ERROR: [11:40:36] Explicit valence for atom # 1 Cl, 9, is greater than permitted


error in smile: OB1O[B]2(O)OB(O)O[B](O)(O1)O2
error in smile: O=[Cl-](=O)(=O)=O
Featurizing datapoint 8000
Featurizing datapoint 9000
Featurizing datapoint 10000
Featurizing datapoint 11000
Featurizing datapoint 12000
Featurizing datapoint 13000
Featurizing datapoint 14000


RDKit ERROR: [11:40:51] Explicit valence for atom # 0 P, 11, is greater than permitted


error in smile: [P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C
Featurizing datapoint 15000
Featurizing datapoint 16000
Featurizing datapoint 17000
Featurizing datapoint 18000
Featurizing datapoint 19000
Featurizing datapoint 20000
Featurizing datapoint 21000
Featurizing datapoint 22000
Featurizing datapoint 23000
Elements with indexes:  [6257, 7708, 7709, 14244]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl]=O' 'OB1O[B]2(O)OB(O)O[B](O)(O1)O2' 'O=[Cl-](=O)(=O)=O'
 '[P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C']
Mols_shape:  23286
Features_shape:  (23286, 1024)
Labels_shape:  (23286,)


In [4]:
#Feature Selection
dataset = LowVarianceFS(0.15).featureSelection(dataset)
dataset.get_shape()

Mols_shape:  23286
Features_shape:  (23286, 49)
Labels_shape:  (23286,)


In [5]:
#Data Split
splitter = SingletaskStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset=dataset, frac_train=0.6, 
                                                                             frac_valid=0.2, frac_test=0.2)

In [6]:
#Scikit-Learn Random Forest
rf = RandomForestClassifier()
model = SklearnModel(model=rf)

In [7]:
#cross validation
model.cross_validate(dataset, Metric(roc_auc_score), folds=3)

Computing Stratified K-fold split

Split 1 :
Train Score: 
0 [0.96 0.04]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  y should be a 1d array, got an array of shape (15524, 2) instead.
roc_auc_score: 
 0.861705416128114
Test Score: 
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  y should be a 1d array, got an array of shape (7762, 2) instead.
roc_auc_score: 
 0.649716297087234

Split 2 :
Train Score: 
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  y should be a 1d array, got an array of shape (15524, 2) instead.
roc_auc_score: 
 0.8471359560723425
Test Score: 
0 [0.91 0.09]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  y should be a 1d array, got an array of shape (7762, 2) instead.
roc_auc_score: 
 0.6495646202751009

Split 3 :
Train Score: 
0 [0.

(SklearnModel(model=RandomForestClassifier()),
 0.861705416128114,
 0.649716297087234,
 [0.861705416128114, 0.8471359560723425, 0.8531739858112289],
 [0.649716297087234, 0.6495646202751009, 0.6440733193518722],
 0.8540051193372284,
 0.6477847455714024)

In [8]:
# model training
model.fit(train_dataset)

RandomForestClassifier()

In [9]:
metrics = [Metric(roc_auc_score), Metric(precision_score), Metric(accuracy_score), Metric(confusion_matrix), 
           Metric(classification_report)]
print("#############################")
# evaluate the model
print('Training Dataset: ')
train_score = model.evaluate(train_dataset, metrics)
print("#############################")
print('Validation Dataset: ')
valid_score = model.evaluate(valid_dataset, metrics)
print("#############################")
print('Test Dataset: ')
test_score = model.evaluate(test_dataset, metrics)
print("#############################")

#############################
Training Dataset: 
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  y should be a 1d array, got an array of shape (13968, 2) instead.
roc_auc_score: 
 0.8490777902510533
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  Classification metrics can't handle a mix of binary and continuous-multioutput targets
precision_score: 
 0.9388794567062818
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  Classification metrics can't handle a mix of binary and continuous-multioutput targets
accuracy_score: 
 0.9805269186712485
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  Classification metrics can't handle a mix of binary and continuous-multioutput targets
confusion_matrix: 
 [[13143    36]
 [  236   553]]
0 [1. 0.]
<cla

In [10]:
#Build a model function for hyperparameter optimization
def rf_model_builder(n_estimators=10, max_features='auto', class_weight={0: 1., 1: 1.}):
    rf_model = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, 
                                      class_weight=class_weight)
    return rf_model

params_dict_rf = {"n_estimators": [10, 100],
                  "max_features": ["auto", "sqrt", "log2", None],
                  "class_weight": [{0: 1., 1: 1.}, {0: 1., 1: 5}, {0: 1., 1: 10}]
                  }


In [11]:
#Hyperparameter Optimization
optimizer = HyperparamOpt_Valid(rf_model_builder)

best_rf, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict_rf, 
                                                                     train_dataset, 
                                                                     valid_dataset, 
                                                                     Metric(accuracy_score))

print('#################')
print(best_hyperparams)
print(best_rf)

#Evaluate model
best_rf.evaluate(test_dataset, metrics)

MODE:  classification
Fitting 15 random models from a space of 24 possible models.
Fitting model 1/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}
expected str, bytes or os.PathLike object, not NoneType
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  Classification metrics can't handle a mix of binary and continuous-multioutput targets
accuracy_score: 
 0.9383591065292096
Model 1/15, Metric accuracy_score, Validation set 1: 0.938359
	best_validation_score so far: 0.938359
Fitting model 2/15
hyperparameters: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}}
expected str, bytes or os.PathLike object, not NoneType
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  Classification metrics can't handle a mix of binary and continuous-multioutput targets
accuracy_score: 
 0.9301975945017182
Mod

0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  y should be a 1d array, got an array of shape (4656, 2) instead.
roc_auc_score: 
 0.6559343434343434
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  Classification metrics can't handle a mix of binary and continuous-multioutput targets
precision_score: 
 0.5850340136054422
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  Classification metrics can't handle a mix of binary and continuous-multioutput targets
accuracy_score: 
 0.948668384879725
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.int64'> <class 'numpy.float64'>
ERROR:  Classification metrics can't handle a mix of binary and continuous-multioutput targets
confusion_matrix: 
 [[4331   61]
 [ 178   86]]
0 [1. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'num

{'roc_auc_score': 0.6559343434343434,
 'precision_score': 0.5850340136054422,
 'accuracy_score': 0.948668384879725,
 'confusion_matrix': 1164.0,
 'classification_report': None}

In [12]:
#Hyperparameter Optimization with CV
optimizer = HyperparamOpt_CV(rf_model_builder)

best_rf, best_hyperparams, all_results = optimizer.hyperparam_search('sklearn',
                                                                     params_dict_rf, 
                                                                     train_dataset,  
                                                                     'accuracy', 
                                                                     cv=3,
                                                                     n_iter_search=10)

print('#################')
print(best_hyperparams)
print(best_rf)

MODEL TYPE:  sklearn
asdasdas
Fitting 10 random models from a space of 24 possible models.

 
 Best accuracy: 0.938001 using {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 1.0}}

 accuracy: 0.927262 (0.005071) with: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 5}} 


 accuracy: 0.920604 (0.008565) with: {'n_estimators': 100, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 10}} 


 accuracy: 0.936283 (0.004829) with: {'n_estimators': 10, 'max_features': 'sqrt', 'class_weight': {0: 1.0, 1: 1.0}} 


 accuracy: 0.920747 (0.007467) with: {'n_estimators': 10, 'max_features': None, 'class_weight': {0: 1.0, 1: 5}} 


 accuracy: 0.920604 (0.010247) with: {'n_estimators': 10, 'max_features': 'auto', 'class_weight': {0: 1.0, 1: 10}} 


 accuracy: 0.924900 (0.006539) with: {'n_estimators': 100, 'max_features': None, 'class_weight': {0: 1.0, 1: 5}} 


 accuracy: 0.926260 (0.007028) with: {'n_estimators': 10, 'max_features': 'log2', 'class_we

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

In [None]:
#Evaluate model
best_rf.evaluate(test_dataset, metrics)

In [None]:
import sklearn
sklearn.metrics.SCORERS.keys()