In [2]:
from loaders.Loaders import CSVLoader
from compoundFeaturization.rdkitFingerprints import MACCSkeysFingerprint
from featureSelection.baseFeatureSelector import SelectFromModelFS
from splitters.splitters import SingletaskStratifiedSplitter
from models.sklearnModels import SklearnModel
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report
from parameterOptimization.HyperparameterOpt import HyperparamOpt_Valid

from sklearn.svm import SVC

In [3]:
# Load the preprocessed compound table: molecules are read from the 'Smiles'
# column, labels from 'Class' and identifiers from 'ID'.
loader = CSVLoader(
    dataset_path='preprocessed_dataset_wfoodb.csv',
    mols_field='Smiles',
    labels_fields='Class',
    id_field='ID',
)
dataset = loader.create_dataset()

# Display the molecule / feature / label shapes of the freshly loaded data.
dataset.get_shape()

Mols_shape:  23290
Features_shape:  X not defined!
Labels_shape:  (23290,)


In [4]:
# Featurization: compute MACCS-keys fingerprints for the loaded molecules.
# As the cell output shows, SMILES that RDKit cannot parse are reported and the
# corresponding entries are removed from the dataset.
maccs_featurizer = MACCSkeysFingerprint()
dataset = maccs_featurizer.featurize(dataset)

# Display the shapes again so the number of dropped molecules is visible.
dataset.get_shape()

Featurizing datapoint 0
Featurizing datapoint 1000
Featurizing datapoint 2000
Featurizing datapoint 3000
Featurizing datapoint 4000
Featurizing datapoint 5000
Featurizing datapoint 6000


RDKit ERROR: [13:58:18] Explicit valence for atom # 1 Cl, 4, is greater than permitted


error in smile: O=[Cl]=O
Featurizing datapoint 7000


RDKit ERROR: [13:58:22] Explicit valence for atom # 3 B, 4, is greater than permitted
RDKit ERROR: [13:58:22] Explicit valence for atom # 1 Cl, 9, is greater than permitted


error in smile: OB1O[B]2(O)OB(O)O[B](O)(O1)O2
error in smile: O=[Cl-](=O)(=O)=O
Featurizing datapoint 8000
Featurizing datapoint 9000
Featurizing datapoint 10000
Featurizing datapoint 11000
Featurizing datapoint 12000
Featurizing datapoint 13000
Featurizing datapoint 14000


RDKit ERROR: [13:58:37] Explicit valence for atom # 0 P, 11, is greater than permitted


error in smile: [P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C
Featurizing datapoint 15000
Featurizing datapoint 16000
Featurizing datapoint 17000
Featurizing datapoint 18000
Featurizing datapoint 19000
Featurizing datapoint 20000
Featurizing datapoint 21000
Featurizing datapoint 22000
Featurizing datapoint 23000
Elements with indexes:  [6257, 7708, 7709, 14244]  were removed due to the presence of NAs!
The elements in question are:  ['O=[Cl]=O' 'OB1O[B]2(O)OB(O)O[B](O)(O1)O2' 'O=[Cl-](=O)(=O)=O'
 '[P](OCC=C(C)C)(OCC=C(C)C)(=O)(OP(OCC=C(C)C)(OCC=C(C)C)=O)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)(CC=C(C)C)CC=C(C)C']
Mols_shape:  23286
Features_shape:  (23286, 167)
Labels_shape:  (23286,)


In [5]:
# Feature selection with SelectFromModelFS using its default settings.
# `dataset` is overwritten with the reduced dataset, so re-running this cell
# alone would select again from the already-reduced features.
# NOTE(review): selection runs on the full dataset before the train/valid/test
# split; if the selector uses the labels, information from the held-out
# molecules can leak into the chosen features - confirm this is acceptable.
feature_selector = SelectFromModelFS()
dataset = feature_selector.featureSelection(dataset)

# Display the shapes to see how many features were kept.
dataset.get_shape()

Mols_shape:  23286
Features_shape:  (23286, 63)
Labels_shape:  (23286,)


In [6]:
# Data split: 60% train / 20% validation / 20% test, produced by
# SingletaskStratifiedSplitter.
# NOTE(review): no seed is passed here, so the partition may differ between
# runs - check whether the splitter accepts one.
splitter = SingletaskStratifiedSplitter()
split_fractions = {'frac_train': 0.6, 'frac_valid': 0.2, 'frac_test': 0.2}
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset=dataset, **split_fractions)

In [7]:
# Baseline model: a scikit-learn SVC with its default hyperparameters, wrapped
# in the project's SklearnModel so it can be fitted, cross-validated and
# evaluated on the dataset objects created above.
# NOTE(review): the evaluation output further down shows roughly 5% positives
# and a positive-class recall of about 0.15; consider whether
# SVC(class_weight='balanced') is more appropriate for this data.
svm = SVC()
model = SklearnModel(model=svm)

In [8]:
# Cross-validation: 3 folds, scored with ROC-AUC. The call's return value is
# the last expression of the cell, so it is displayed below the per-split log.
# NOTE(review): this runs on the full `dataset`, i.e. including the molecules
# that were split off as validation/test data above - confirm that is intended.
cv_metric = Metric(roc_auc_score)
model.cross_validate(dataset, cv_metric, folds=3)

Computing Stratified K-fold split

Split 1 :
Train Score: 
roc_auc_score: 
 0.5702606362401081
Test Score: 
roc_auc_score: 
 0.5588177596781804

Split 2 :
Train Score: 
roc_auc_score: 
 0.5859241471811559
Test Score: 
roc_auc_score: 
 0.57534904869383

Split 3 :
Train Score: 
roc_auc_score: 
 0.5829840247066719
Test Score: 
roc_auc_score: 
 0.5766194289034651


(SklearnModel(model=SVC()),
 0.5829840247066719,
 0.5766194289034651,
 [0.5702606362401081, 0.5859241471811559, 0.5829840247066719],
 [0.5588177596781804, 0.57534904869383, 0.5766194289034651],
 0.5797229360426454,
 0.5702620790918251)

In [9]:
# Model training: fit the wrapped SVC on the training split only; the
# validation and test splits are kept for the evaluation cells below.
# NOTE(review): `model` was also passed through model.cross_validate(...) in the
# previous cell - presumably fit() retrains from scratch; verify.
model.fit(train_dataset)

SVC()

In [10]:
# Metrics reported for every split (also reused when evaluating the tuned model).
metrics = [
    Metric(roc_auc_score),
    Metric(precision_score),
    Metric(accuracy_score),
    Metric(confusion_matrix),
    Metric(classification_report),
]

# Evaluate the fitted model on each split in turn: a banner and the split's
# title are printed before model.evaluate(...) reports the metrics, and one
# closing banner is printed at the very end.
# NOTE(review): in the output below the roc_auc_score equals the mean of the
# per-class recalls derived from the confusion matrix (e.g. train:
# (13175/13178 + 121/790) / 2 = 0.5765), which suggests it is computed from
# hard class predictions rather than continuous scores - confirm how
# SklearnModel produces predictions for an SVC without probability estimates.
banner = "#############################"
split_scores = []
for title, split in (('Training Dataset: ', train_dataset),
                     ('Validation Dataset: ', valid_dataset),
                     ('Test Dataset: ', test_dataset)):
    print(banner)
    print(title)
    split_scores.append(model.evaluate(split, metrics))
print(banner)

# Keep the per-split results under their original names.
train_score, valid_score, test_score = split_scores

#############################
Training Dataset: 
roc_auc_score: 
 0.5764684524072533
precision_score: 
 0.9758064516129032
accuracy_score: 
 0.9518900343642611
confusion_matrix: 
 [[13175     3]
 [  669   121]]
classification_report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.98     13178
           1       0.98      0.15      0.26       790

    accuracy                           0.95     13968
   macro avg       0.96      0.58      0.62     13968
weighted avg       0.95      0.95      0.93     13968

#############################
Validation Dataset: 
roc_auc_score: 
 0.5793926389979219
precision_score: 
 0.9130434782608695
accuracy_score: 
 0.9516752577319587
confusion_matrix: 
 [[4389    4]
 [ 221   42]]
classification_report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.98      4393
           1       0.91      0.16      0.27       263

    accuracy                           

In [11]:
# Model builder handed to the hyperparameter optimizer.
def svm_model_builder(C, gamma, kernel):
    """Create an unfitted scikit-learn SVC for one hyperparameter combination.

    Parameters
    ----------
    C :
        Value forwarded to ``SVC(C=...)``.
    gamma :
        Value forwarded to ``SVC(gamma=...)``.
    kernel :
        Value forwarded to ``SVC(kernel=...)``.

    Returns
    -------
    SVC
        A new estimator configured with the given values.
    """
    return SVC(C=C, gamma=gamma, kernel=kernel)

# Search space for the SVC hyperparameters: 5 x 2 x 2 = 20 combinations.
# NOTE(review): per the scikit-learn SVC documentation `gamma` only applies to
# the 'rbf', 'poly' and 'sigmoid' kernels, so the 'linear' combinations that
# differ only in gamma should be equivalent models - consider searching the two
# kernels separately.
params_dict_svm = dict(
    C=[1.0, 0.7, 0.5, 0.3, 0.1],
    gamma=["scale", "auto"],
    kernel=["linear", "rbf"],
)

In [12]:
# Hyperparameter optimization: candidates are built with svm_model_builder from
# params_dict_svm, fitted on the training split and scored with ROC-AUC on the
# validation split.
# `best_rf` keeps its historical name because later cells use it, but the model
# it holds comes from svm_model_builder, not from a random forest.
# NOTE(review): the log below prints "expected str, bytes or os.PathLike
# object, not NoneType" for every candidate - presumably a path-like argument
# of the optimizer is left at None; verify in HyperparamOpt_Valid.
optimizer = HyperparamOpt_Valid(svm_model_builder)
search_metric = Metric(roc_auc_score)

best_rf, best_hyperparams, all_results = optimizer.hyperparam_search(
    params_dict_svm, train_dataset, valid_dataset, search_metric)

MODE:  classification
Fitting 15 random models from a space of 20 possible models.
Fitting model 1/15
hyperparameters: {'C': 1.0, 'gamma': 'scale', 'kernel': 'linear'}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.5
Model 1/15, Metric roc_auc_score, Validation set 1: 0.500000
	best_validation_score so far: 0.500000
Fitting model 2/15
hyperparameters: {'C': 1.0, 'gamma': 'auto', 'kernel': 'rbf'}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.5304182509505704
Model 2/15, Metric roc_auc_score, Validation set 2: 0.530418
	best_validation_score so far: 0.530418
Fitting model 3/15
hyperparameters: {'C': 0.7, 'gamma': 'scale', 'kernel': 'linear'}
expected str, bytes or os.PathLike object, not NoneType
roc_auc_score: 
 0.5
Model 3/15, Metric roc_auc_score, Validation set 3: 0.500000
	best_validation_score so far: 0.530418
Fitting model 4/15
hyperparameters: {'C': 0.7, 'gamma': 'scale', 'kernel': 'rbf'}
expected str, bytes or os.PathLike 

In [13]:
# Evaluate the best model found by the hyperparameter search on the test split,
# using the same `metrics` list as the baseline evaluation. `best_rf` is the
# first value returned by optimizer.hyperparam_search(...); despite the "rf" in
# its name it comes from svm_model_builder.
# NOTE(review): in the returned dict shown below, 'confusion_matrix' is a single
# float and 'classification_report' is None, so only the printed output is
# meaningful for those two metrics - presumably the Metric wrapper reduces every
# result to a scalar; confirm in metrics.Metrics.
best_rf.evaluate(test_dataset, metrics)

roc_auc_score: 
 0.5550192624110774
precision_score: 
 0.9666666666666667
accuracy_score: 
 0.9495274914089347
confusion_matrix: 
 [[4392    1]
 [ 234   29]]
classification_report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      4393
           1       0.97      0.11      0.20       263

    accuracy                           0.95      4656
   macro avg       0.96      0.56      0.59      4656
weighted avg       0.95      0.95      0.93      4656



{'roc_auc_score': 0.5550192624110774,
 'precision_score': 0.9666666666666667,
 'accuracy_score': 0.9495274914089347,
 'confusion_matrix': 1164.0,
 'classification_report': None}