In [1]:
from Dataset.Dataset import CSVLoader
from compoundFeaturization.rdkitFingerprints import MorganFingerprint
from featureSelection.baseFeatureSelector import LowVarianceFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report
from parameterOptimization.HyperparameterOpt import GridHyperparamOpt

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
# Load the molecules from CSV. The positional arguments after the file path are
# 'Smiles', ['Class'] and 'ID' -- presumably the structure, label and identifier
# columns (TODO confirm against the CSVLoader signature).
# No chunk_size is passed, so the loader's own default applies; the recorded run
# logs "Loading shard 1 of size 2000.".
csv_path = 'preprocessed_dataset_wfoodb.csv'
dataset = CSVLoader(csv_path, 'Smiles', ['Class'], 'ID')
dataset_shape = dataset.get_shape()
print(dataset_shape)

Loading shard 1 of size 2000.
(2000,) (2000,) (0,) (2000,)
((2000,), (2000,), (0,), (2000,))


In [3]:
# Featurization: run the MorganFingerprint featurizer over the loaded dataset.
# In the recorded run the feature array goes from shape (0,) to (2000, 1024).
fingerprinter = MorganFingerprint()
dataset = fingerprinter.featurize(dataset)
dataset.get_shape()

Featurizing datapoint 0
Featurizing datapoint 1000
(2000,) (2000,) (2000, 1024) (2000,)


((2000,), (2000,), (2000, 1024), (2000,))

In [4]:
# Feature selection: apply LowVarianceFS, constructed with the argument 0.15
# (presumably a variance threshold -- TODO confirm), to the featurized dataset.
# In the recorded run this keeps 39 of the 1024 fingerprint columns.
low_variance_selector = LowVarianceFS(0.15)
dataset = low_variance_selector.featureSelection(dataset)
dataset.get_shape()

(2000,) (2000,) (2000, 39) (2000,)


((2000,), (2000,), (2000, 39), (2000,))

In [5]:
# Data split: request 60% train / 20% validation / 20% test from the
# single-task stratified splitter.
split_fractions = {'frac_train': 0.6, 'frac_valid': 0.2, 'frac_test': 0.2}
splitter = SingletaskStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset=dataset, **split_fractions
)

Computing train/valid/test indices


In [6]:
# Scikit-Learn Random Forest, wrapped in the project's SklearnModel.
# random_state is pinned so that Restart & Run All trains the same forest every
# time; an unseeded forest produces different scores on each run.
rf = RandomForestClassifier(random_state=42)
model = SklearnModel(model=rf)

In [7]:
# 3-fold cross-validation of the wrapped random forest, scored with ROC AUC.
# NOTE(review): the recorded run aborts with "ValueError: Only one class present
# in y_true" right after printing "Test Score:", which suggests at least one
# held-out fold contained a single class. Check the class balance of the folds
# before relying on this cell.
cv_metric = Metric(roc_auc_score)
model.cross_validate(dataset, cv_metric, folds=3)

Computing Stratified K-fold split
[(<Dataset.Dataset.NumpyDataset object at 0x7f24b8358ba8>, <Dataset.Dataset.NumpyDataset object at 0x7f24b8358cc0>), (<Dataset.Dataset.NumpyDataset object at 0x7f24b8358c50>, <Dataset.Dataset.NumpyDataset object at 0x7f24b8358c18>), (<Dataset.Dataset.NumpyDataset object at 0x7f24b8358be0>, <Dataset.Dataset.NumpyDataset object at 0x7f24b8322080>)]




Train Score: 
roc_auc_score: 
 1.0
Test Score: 


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
# Model training: fit the wrapped model on the training split only, leaving
# the validation and test splits untouched for evaluation.
print('Fitting Model: ')
model.fit(train_dataset)

In [None]:
# Metrics reported for every split; this list is reused by later cells.
metrics = [
    Metric(roc_auc_score),
    Metric(precision_score),
    Metric(accuracy_score),
    Metric(confusion_matrix),
    Metric(classification_report),
]
banner = "#############################"


def _evaluate_split(title, split):
    """Print the banner and the split title, then return model.evaluate's result for that split."""
    print(banner)
    print(title)
    return model.evaluate(split, metrics)


# evaluate the model on each split in turn, then close with a final banner
train_score = _evaluate_split('Training Dataset: ', train_dataset)
valid_score = _evaluate_split('Validation Dataset: ', valid_dataset)
test_score = _evaluate_split('Test Dataset: ', test_dataset)
print(banner)

In [None]:
# Model factory handed to the hyperparameter optimizer.
def rf_model_builder(n_estimators, max_features, class_weight, model_dir=None):
    """Build a SklearnModel around a freshly configured RandomForestClassifier.

    Parameters
    ----------
    n_estimators, max_features, class_weight
        Forwarded unchanged to ``RandomForestClassifier`` as keyword arguments
        of the same name.
    model_dir : optional
        Passed through as the second positional argument of ``SklearnModel``.

    Returns
    -------
    SklearnModel
        The wrapper holding the (unfitted) random forest.
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_features=max_features,
        class_weight=class_weight,
    )
    return SklearnModel(forest, model_dir)

# Hyperparameter grid for the random forest; each key matches a keyword
# argument of rf_model_builder.
# "auto" is intentionally absent from max_features: for RandomForestClassifier
# it was only an alias of "sqrt" (so it duplicated a grid point), it was
# deprecated in scikit-learn 1.1, and it is rejected from 1.3 onwards.
# Additional class weightings {0: 1., 1: 5} and {0: 1., 1: 10} are currently
# disabled; append them to the "class_weight" list to widen the search.
params_dict_rf = {
    "n_estimators": [10, 100],
    "max_features": ["sqrt", "log2", None],
    "class_weight": [{0: 1., 1: 1.}],
}
    


In [None]:
# Hyperparameter optimization: grid-search params_dict_rf with models produced
# by rf_model_builder, fitting on the training split and scoring each candidate
# on the validation split with ROC AUC.
optimizer = GridHyperparamOpt(rf_model_builder)
search_metric = Metric(roc_auc_score)

best_rf, best_hyperparams, all_results = optimizer.hyperparam_search(
    params_dict_rf,
    train_dataset,
    valid_dataset,
    search_metric,
)

# Report the winning configuration, then the model built from it.
print('#################')
for search_result in (best_hyperparams, best_rf):
    print(search_result)

In [None]:
# Evaluate the model returned by the hyperparameter search on the test split,
# using the same metrics list as the baseline evaluation.
best_rf_test_score = best_rf.evaluate(test_dataset, metrics)
best_rf_test_score