# Random Forest with SMILES Embeddings from BERT

Make sure that the parent directory is on our python path

In [13]:
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

Install necessary packages

In [14]:
! pip install deepchem



Load dataset

In [15]:
from loaders.Loaders import CSVLoader

dataset = CSVLoader(dataset_path='../data/HIV_featurized.csv',
                    mols_field='mols',
                    features_fields='X', 
                    labels_fields='y')

dataset = dataset.create_dataset()
dataset.splitFeatures()
dataset.get_shape()

Mols_shape:  40358
Features_shape:  (40358, 768)
Labels_shape:  (40358,)


Data split

In [16]:
from splitters.splitters import SingletaskStratifiedSplitter

splitter = SingletaskStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset=dataset, 
                                                                             frac_train=0.6, 
                                                                             frac_valid=0.2, 
                                                                             frac_test=0.2)

Random Forest from Scikit-Learn

In [17]:
from sklearn.ensemble import RandomForestClassifier
from models.sklearnModels import SklearnModel

rf = RandomForestClassifier()
model = SklearnModel(model=rf)

Cross validation

In [18]:
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score

model.cross_validate(dataset, Metric(roc_auc_score), folds=3)

Computing Stratified K-fold split

Split 1 :
Train Score: 
roc_auc_score: 
 1.0
Test Score: 
roc_auc_score: 
 0.5587288714505411

Split 2 :
Train Score: 
roc_auc_score: 
 0.9994419642857143
Test Score: 
roc_auc_score: 
 0.547452816911078

Split 3 :
Train Score: 
roc_auc_score: 
 1.0
Test Score: 
roc_auc_score: 
 0.553803156721009


(SklearnModel(model=RandomForestClassifier()),
 1.0,
 0.5587288714505411,
 [1.0, 0.9994419642857143, 1.0],
 [0.5587288714505411, 0.547452816911078, 0.553803156721009],
 0.9998139880952381,
 0.5533282816942093)

Model training

In [19]:
model.fit(train_dataset)

RandomForestClassifier()

In [20]:
from metrics.metricsFunctions import precision_score, accuracy_score, confusion_matrix, classification_report, f1_score

metrics = [Metric(roc_auc_score), Metric(precision_score), Metric(accuracy_score), Metric(confusion_matrix), 
           Metric(classification_report)]
print("#############################")
# evaluate the model
print('Training Dataset: ')
train_score = model.evaluate(train_dataset, metrics)
print("#############################")
print('Validation Dataset: ')
valid_score = model.evaluate(valid_dataset, metrics)
print("#############################")
print('Test Dataset: ')
test_score = model.evaluate(test_dataset, metrics)
print("#############################")

#############################
Training Dataset: 
roc_auc_score: 
 0.9993757802746567
precision_score: 
 1.0
accuracy_score: 
 0.9999586947542338
confusion_matrix: 
 [[23409     0]
 [    1   800]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     23409
           1       1.00      1.00      1.00       801

    accuracy                           1.00     24210
   macro avg       1.00      1.00      1.00     24210
weighted avg       1.00      1.00      1.00     24210

#############################
Validation Dataset: 
roc_auc_score: 
 0.5556671519309053
precision_score: 
 0.7894736842105263
accuracy_score: 
 0.969640644361834
confusion_matrix: 
 [[7795    8]
 [ 237   30]]
classification_report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      7803
           1       0.79      0.11      0.20       267

    accuracy                           0.97      8070
 