In [1]:
from Dataset.Dataset import CSVLoader
from compoundFeaturization.rdkitFingerprints import MACCSkeysFingerprint
from featureSelection.baseFeatureSelector import SelectFromModelFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report
from parameterOptimization.HyperparameterOpt import GridHyperparamOpt

from sklearn.svm import SVC

In [2]:
# Load the preprocessed CSV: SMILES strings come from the 'Smiles' column,
# the label from 'Class' and the molecule identifiers from 'ID'.
dataset = CSVLoader('preprocessed_dataset_wfoodb.csv', 'Smiles', ['Class'], 'ID')
# get_shape() already prints the shapes itself, so it is left as the cell's
# last expression (as in the following cells) instead of being wrapped in print().
dataset.get_shape()

(23290,) (23290,) (0,) (23290,)
((23290,), (23290,), (0,), (23290,))


In [3]:
# Featurization: replace the raw SMILES with MACCS-keys fingerprints.
# The run logs every SMILES it fails on and reports which elements were
# removed because of NAs, so the dataset can come back with fewer rows.
maccs_featurizer = MACCSkeysFingerprint()
dataset = maccs_featurizer.featurize(dataset)
dataset.get_shape()

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Featurizing datapoint 3000
Featurizing datapoint 4000
Featurizing datapoint 5000
Featurizing datapoint 6000


RDKit ERROR: [12:02:38] Explicit valence for atom # 1 Cl, 4, is greater than permitted


error in smile: O=[Cl]=O
Featurizing datapoint 7000


RDKit ERROR: [12:02:42] Explicit valence for atom # 3 B, 4, is greater than permitted
RDKit ERROR: [12:02:42] Explicit valence for atom # 1 Cl, 9, is greater than permitted


error in smile: OB1O[B]2(O)OB(O)O[B](O)(O1)O2
error in smile: O=[Cl-](=O)(=O)=O
Featurizing datapoint 8000
Featurizing datapoint 9000
Featurizing datapoint 10000
Featurizing datapoint 11000
Featurizing datapoint 12000
Featurizing datapoint 13000
Featurizing datapoint 14000


RDKit ERROR: [12:02:57] Explicit valence for atom # 0 P, 11, is greater than permitted


error in smile: [P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C
Featurizing datapoint 15000
Featurizing datapoint 16000
Featurizing datapoint 17000
Featurizing datapoint 18000
Featurizing datapoint 19000
Featurizing datapoint 20000
Featurizing datapoint 21000
Featurizing datapoint 22000
Featurizing datapoint 23000
Elements with indexes:  [6257, 7708, 7709, 14244]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl]=O' 'OB1O[B]2(O)OB(O)O[B](O)(O1)O2' 'O=[Cl-](=O)(=O)=O'
 '[P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C']
(23286,) (23286,) (23286, 167) (23286,)


((23286,), (23286,), (23286, 167), (23286,))

In [4]:
# Feature selection with the project's SelectFromModelFS (default settings).
# NOTE(review): this runs on the full dataset, before the train/valid/test
# split below. If the selector is fitted against the labels, information from
# the validation/test compounds leaks into the selected features — confirm.
feature_selector = SelectFromModelFS()
dataset = feature_selector.featureSelection(dataset)
dataset.get_shape()

(23286,) (23286,) (23286, 61) (23286,)


((23286,), (23286,), (23286, 61), (23286,))

In [5]:
# Data split: 60 % train / 20 % validation / 20 % test,
# produced by SingletaskStratifiedSplitter.
splitter = SingletaskStratifiedSplitter()
split_fractions = {'frac_train': 0.6, 'frac_valid': 0.2, 'frac_test': 0.2}
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset=dataset,
    **split_fractions,
)

In [6]:
# Baseline classifier: a scikit-learn SVC with all-default hyperparameters,
# wrapped in the project's SklearnModel so the following cells can call
# cross_validate / fit / evaluate on it.
svm = SVC()
model = SklearnModel(model=svm)

In [7]:
# 3-fold cross-validation of the default-SVC model, scored with roc_auc_score.
# Note that it runs on the full `dataset`, not on `train_dataset`.
# The returned tuple is only displayed as the cell output; it is not stored.
# NOTE(review): the meaning of each tuple element is not visible here —
# confirm against SklearnModel.cross_validate before relying on positions.
model.cross_validate(dataset, Metric(roc_auc_score), folds=3)

Computing Stratified K-fold split
Train Score: 
roc_auc_score: 
 0.5812570213169272
Test Score: 
roc_auc_score: 
 0.5674989446499963
Train Score: 
roc_auc_score: 
 0.5854931943245846
Test Score: 
roc_auc_score: 
 0.5608830063920117
Train Score: 
roc_auc_score: 
 0.5651984503215888
Test Score: 
roc_auc_score: 
 0.5702947845804989


(SklearnModel(model=SVC(), model_dir='/tmp/tmpcpvkql06'),
 0.5651984503215888,
 0.5702947845804989,
 [0.5812570213169272, 0.5854931943245846, 0.5651984503215888],
 [0.5674989446499963, 0.5608830063920117, 0.5702947845804989],
 0.5773162219877002,
 0.5662255785408356)

In [8]:
# Model training: fit the default-SVC model on the training split only.
model.fit(train_dataset)

In [9]:
# Metrics reported for every split; this list is reused by later cells.
metrics = [
    Metric(roc_auc_score),
    Metric(precision_score),
    Metric(accuracy_score),
    Metric(confusion_matrix),
    Metric(classification_report),
]

separator = "#############################"
splits_to_score = (
    ('Training Dataset: ', train_dataset),
    ('Validation Dataset: ', valid_dataset),
    ('Test Dataset: ', test_dataset),
)

# Evaluate the fitted model on each split in turn, printing a separator and
# a heading before each evaluation and a closing separator at the end.
split_scores = []
for heading, split_dataset in splits_to_score:
    print(separator)
    print(heading)
    split_scores.append(model.evaluate(split_dataset, metrics))
print(separator)

# Keep the per-split results available under their individual names.
train_score, valid_score, test_score = split_scores

#############################
Training Dataset: 
roc_auc_score: 
 0.5764684524072533
precision_score: 
 0.9758064516129032
accuracy_score: 
 0.9518900343642611
confusion_matrix: 
 [[13175     3]
 [  669   121]]
classification_report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.98     13178
           1       0.98      0.15      0.26       790

    accuracy                           0.95     13968
   macro avg       0.96      0.58      0.62     13968
weighted avg       0.95      0.95      0.93     13968

#############################
Validation Dataset: 
roc_auc_score: 
 0.5716742588234479
precision_score: 
 0.8837209302325582
accuracy_score: 
 0.9506013745704467
confusion_matrix: 
 [[4388    5]
 [ 225   38]]
classification_report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      4393
           1       0.88      0.14      0.25       263

    accuracy                           

In [10]:
# Model builder handed to the hyperparameter optimizer.
def svm_model_builder(C, gamma, kernel, model_dir=None):
    """Build a SklearnModel around an SVC with the given hyperparameters.

    Args:
        C: value forwarded to ``SVC(C=...)``.
        gamma: value forwarded to ``SVC(gamma=...)``.
        kernel: value forwarded to ``SVC(kernel=...)``.
        model_dir: passed through as the second positional argument of
            ``SklearnModel``; defaults to ``None``.

    Returns:
        A ``SklearnModel`` wrapping the newly created, unfitted ``SVC``.
    """
    return SklearnModel(SVC(C=C, gamma=gamma, kernel=kernel), model_dir)

# Search space for the SVC hyperparameters (5 x 2 x 2 = 20 combinations).
# NOTE(review): scikit-learn documents `gamma` as the coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels, so the 'linear' entries differing only
# in gamma are presumably duplicate fits — confirm and prune if so.
params_dict_svm = dict(
    C=[1.0, 0.7, 0.5, 0.3, 0.1],
    gamma=["scale", "auto"],
    kernel=["linear", "rbf"],
)

In [11]:
# Hyperparameter optimization: candidates built by svm_model_builder from
# params_dict_svm are fitted on the training split and compared on the
# validation split using roc_auc_score.
optimizer = GridHyperparamOpt(svm_model_builder)

selection_metric = Metric(roc_auc_score)
search_outcome = optimizer.hyperparam_search(
    params_dict_svm,
    train_dataset,
    valid_dataset,
    selection_metric,
)

# NOTE: `best_rf` is a leftover name — the models searched here are SVC-based,
# not random forests. The name is kept because later cells refer to it.
best_rf, best_hyperparams, all_results = search_outcome

Fitting 15 random models from a space of 20 possible models.
Fitting model 1/15
hyperparameters: {'C': 1.0, 'gamma': 'scale', 'kernel': 'linear'}
roc_auc_score: 
 0.5
Model 1/15, Metric roc_auc_score, Validation set 1: 0.500000
	best_validation_score so far: 0.500000
Fitting model 2/15
hyperparameters: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
roc_auc_score: 
 0.5716742588234479
Model 2/15, Metric roc_auc_score, Validation set 2: 0.571674
	best_validation_score so far: 0.571674
Fitting model 3/15
hyperparameters: {'C': 1.0, 'gamma': 'auto', 'kernel': 'linear'}
roc_auc_score: 
 0.5
Model 3/15, Metric roc_auc_score, Validation set 3: 0.500000
	best_validation_score so far: 0.571674
Fitting model 4/15
hyperparameters: {'C': 1.0, 'gamma': 'auto', 'kernel': 'rbf'}
roc_auc_score: 
 0.5228136882129277
Model 4/15, Metric roc_auc_score, Validation set 4: 0.522814
	best_validation_score so far: 0.571674
Fitting model 5/15
hyperparameters: {'C': 0.7, 'gamma': 'auto', 'kernel': 'linear'}
roc_a

In [12]:
# Evaluate best model: score the model returned by the hyperparameter search
# on the test split, using the `metrics` list defined in the evaluation cell.
# Despite its name, `best_rf` is an SVC-based model (see svm_model_builder).
best_rf.evaluate(test_dataset, metrics)

roc_auc_score: 
 0.5664261065175412
precision_score: 
 0.9722222222222222
accuracy_score: 
 0.9508161512027491
confusion_matrix: 
 [[4392    1]
 [ 228   35]]
classification_report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      4393
           1       0.97      0.13      0.23       263

    accuracy                           0.95      4656
   macro avg       0.96      0.57      0.60      4656
weighted avg       0.95      0.95      0.93      4656



{'roc_auc_score': 0.5664261065175412,
 'precision_score': 0.9722222222222222,
 'accuracy_score': 0.9508161512027491,
 'confusion_matrix': 1164.0,
 'classification_report': None}