# DeepMol - Testing with RandomForest

In [1]:
from IPython.display import clear_output 

In [2]:
!pip install kora
import kora.install.rdkit
clear_output()

In [3]:
!pip install git+https://github.com/samoturk/mol2vec
clear_output()

In [4]:
from compoundFeaturization.Mol2vec.mol2vec_v2 import Mol2Vec  

In [5]:
from loaders.Loaders import CSVLoader
dataset = CSVLoader(dataset_path='../data/HIV.csv', 
                    mols_field='smiles', 
                    labels_fields='HIV_active')#, shard_size=4000)
dataset = dataset.create_dataset()
dataset.get_shape()

Mols_shape:  41127
Features_shape:  X not defined!
Labels_shape:  (41127,)


In [6]:
dataset = Mol2Vec().featurize(dataset)
dataset.get_shape()

Mols_shape:  41127
Features_shape:  (41127, 300)
Labels_shape:  (41127,)


In [7]:
from splitters.splitters import SingletaskStratifiedSplitter
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report, f1_score
from parameterOptimization.HyperparameterOpt import GridHyperparamOpt

from sklearn.ensemble import RandomForestClassifier

In [8]:
#Data Split
splitter = SingletaskStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset=dataset, frac_train=0.6, 
                                                                             frac_valid=0.2, frac_test=0.2)

In [9]:
#Scikit-Learn Random Forest
rf = RandomForestClassifier()
model = SklearnModel(model=rf)

In [10]:
#cross validation
model.cross_validate(dataset, Metric(roc_auc_score), folds=3)

Computing Stratified K-fold split
Train Score: 
roc_auc_score: 
 0.9994802494802495
Test Score: 
roc_auc_score: 
 0.5863353234837965
Train Score: 
roc_auc_score: 
 0.9994802494802495
Test Score: 
roc_auc_score: 
 0.6026649512437235
Train Score: 
roc_auc_score: 
 0.9994802494802495
Test Score: 
roc_auc_score: 
 0.5883387283447762


(SklearnModel(model=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                           class_weight=None, criterion='gini',
                                           max_depth=None, max_features='auto',
                                           max_leaf_nodes=None, max_samples=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=100, n_jobs=None,
                                           oob_score=False, random_state=None,
                                           verbose=0, warm_start=False),
              model_dir='/tmp/tmp1lnmf7h3'),
 0.9994802494802495,
 0.6026649512437235,
 [0.9994802494802495, 0.999480249

In [11]:
# model training
model.fit(train_dataset)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
metrics = [Metric(roc_auc_score), Metric(precision_score), Metric(accuracy_score), Metric(confusion_matrix), 
           Metric(classification_report)]
print("#############################")
# evaluate the model
print('Training Dataset: ')
train_score = model.evaluate(train_dataset, metrics)
print("#############################")
print('Validation Dataset: ')
valid_score = model.evaluate(valid_dataset, metrics)
print("#############################")
print('Test Dataset: ')
test_score = model.evaluate(test_dataset, metrics)
print("#############################")

#############################
Training Dataset: 
roc_auc_score: 
 1.0
precision_score: 
 1.0
accuracy_score: 
 1.0
confusion_matrix: 
 [[23811     0]
 [    0   861]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     23811
           1       1.00      1.00      1.00       861

    accuracy                           1.00     24672
   macro avg       1.00      1.00      1.00     24672
weighted avg       1.00      1.00      1.00     24672

#############################
Validation Dataset: 
roc_auc_score: 
 0.5960881496415771
precision_score: 
 0.7567567567567568
accuracy_score: 
 0.9696011673151751
confusion_matrix: 
 [[7918   18]
 [ 232   56]]
classification_report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      7936
           1       0.76      0.19      0.31       288

    accuracy                           0.97      8224
   macro avg       0.86      0