# Random Forest with SMILES Embeddings from BERT

In [None]:
from splitters.splitters import SingletaskStratifiedSplitter
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report, f1_score
from parameterOptimization.HyperparameterOpt import GridHyperparamOpt
from sklearn.ensemble import RandomForestClassifier

Load dataset

In [None]:
from loaders.Loaders import CSVLoader

# Configure a loader over the HIV embeddings CSV, naming 'processed_smiles'
# as the molecule field and 'HIV_active' as the label field.
loader = CSVLoader(
    dataset_path='../data/HIV_embeddings.csv',
    mols_field='processed_smiles',
    labels_fields='HIV_active',
)

# Materialise the dataset object used by every later cell, then display its
# shape as the cell output.
dataset = loader.create_dataset()
dataset.get_shape()

Data split

In [None]:
# Partition the full dataset 60% / 20% / 20% into train / validation / test
# using the project's SingletaskStratifiedSplitter.
splitter = SingletaskStratifiedSplitter()
split_fractions = {'frac_train': 0.6, 'frac_valid': 0.2, 'frac_test': 0.2}
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset=dataset,
    **split_fractions,
)

Random Forest from Scikit-Learn

In [None]:
# Scikit-learn random forest with its default hyperparameters.
# NOTE(review): no random_state is set, so results may differ between runs.
rf = RandomForestClassifier()
# Wrap the estimator in the project's SklearnModel, which the later cells use
# for cross-validation, fitting and evaluation.
model = SklearnModel(model=rf)

Cross validation

In [None]:
# 3-fold cross-validation scored with ROC AUC.
# NOTE(review): this runs on the full `dataset`, which also contains the rows
# later held out as the test split -- confirm that is intended.
cv_metric = Metric(roc_auc_score)
model.cross_validate(dataset, cv_metric, folds=3)

In [None]:
Model training

In [None]:
# Fit the wrapped random forest on the training split only.
model.fit(train_dataset)

In [None]:
# Metrics reported for every split, each wrapped in the project's Metric class.
metric_functions = (
    roc_auc_score,
    precision_score,
    accuracy_score,
    confusion_matrix,
    classification_report,
)
metrics = [Metric(function) for function in metric_functions]

separator = "#############################"


def _evaluate_split(title, split):
    """Print a separator and the split title, then evaluate the model on `split`."""
    print(separator)
    print(title)
    return model.evaluate(split, metrics)


# Evaluate the model on each split in turn; a closing separator ends the report.
train_score = _evaluate_split('Training Dataset: ', train_dataset)
valid_score = _evaluate_split('Validation Dataset: ', valid_dataset)
test_score = _evaluate_split('Test Dataset: ', test_dataset)
print(separator)