In [16]:
from skfp.datasets.moleculenet import load_bace, load_bbbp, load_clintox, load_muv, load_pcba, load_tox21, load_toxcast
from skfp.model_selection import scaffold_train_test_split, butina_train_test_split#, maxmin_train_test_split, randomized_scaffold_train_test_split
from skfp.fingerprints import ECFPFingerprint, MACCSFingerprint, RDKitFingerprint
from skfp.preprocessing import MolFromSmilesTransformer
from skfp.filters import BeyondRo5Filter, BMSFilter, BrenkFilter, FAF4DruglikeFilter, FAF4LeadlikeFilter, GhoseFilter, GlaxoFilter, GSKFilter
from skfp.filters import HaoFilter, InpharmaticaFilter, LINTFilter, LipinskiFilter, MLSMRFilter, MolecularWeightFilter, NIBRFilter, NIHFilter
from skfp.filters import OpreaFilter, PAINSFilter, PfizerFilter, REOSFilter, RuleOfFourFilter, RuleOfThreeFilter, RuleOfTwoFilter, RuleOfVeberFilter
from skfp.filters import RuleOfXuFilter, SureChEMBLFilter, TiceHerbicidesFilter, TiceInsecticidesFilter, ValenceDiscoveryFilter, ZINCBasicFilter, ZINCDruglikeFilter

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, make_union
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.neural_network import MLPClassifier

import numpy as np
from sklearn.model_selection import KFold

from deepchem.feat.smiles_tokenizer import SmilesTokenizer

import itertools

In [17]:
# SMILEs tokenizer
#tokenizer = SmilesTokenizer("./vocab.txt")
#print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))

In [18]:

# model for classification
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    GaussianNB(),
    tree.DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
    svm.SVC(probability=True),
    # svm.SVR(kernel='linear'),
    # svm.SVR(kernel='poly'),
    # svm.SVR(kernel='rbf'),
    # svm.SVR(kernel='sigmoid'),
]

In [19]:
fingerprints = [
    ECFPFingerprint(),
    MACCSFingerprint(),
    RDKitFingerprint(),
    BeyondRo5Filter(),
]

filters = [
    BeyondRo5Filter, BMSFilter, BrenkFilter, FAF4DruglikeFilter, FAF4LeadlikeFilter, GhoseFilter, GlaxoFilter, GSKFilter,
    HaoFilter, InpharmaticaFilter, LINTFilter, LipinskiFilter, MLSMRFilter, MolecularWeightFilter, NIBRFilter, NIHFilter,
    OpreaFilter, PAINSFilter, PfizerFilter, REOSFilter, RuleOfFourFilter, RuleOfThreeFilter, RuleOfTwoFilter, RuleOfVeberFilter,
    RuleOfXuFilter, SureChEMBLFilter, TiceHerbicidesFilter, TiceInsecticidesFilter, ValenceDiscoveryFilter, ZINCBasicFilter, ZINCDruglikeFilter
]

filter_names = []

for filter in filters:
    filter_names.append(filter.__name__)

In [20]:
pipelines = {
    "baseline_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint()), LogisticRegression(max_iter=1000)),
    "rf_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), RandomForestClassifier(random_state=41, class_weight='balanced')),
    "gb_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), GradientBoostingClassifier()),
    "knc_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[1]),
    "svm_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[7]),
    "GNB_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[2]),
    "mlp_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[6]),
}

In [21]:
def calculate_scores(y_test, y_pred, y_proba):
    r2score = r2_score(y_test, y_pred)
    # print(f"R2 score for Linear regression model: {r2score:.4f}")

    accuracy = accuracy_score(y_test, y_pred)
    # print(f"Accuracy for Linear regression model: {accuracy:.4f}")

    roc_auc = roc_auc_score(y_test, y_proba[:, 1])
    # print(f"ROC-AUC score for Linear regression model: {roc_auc:.4f}")
    return r2score, accuracy, roc_auc

## Applied filters

In [19]:
smiles_bace, labels_bace = load_bace()
smiles_bace_train, smiles_bace_test, y_train, y_test = scaffold_train_test_split(
    smiles_bace, labels_bace, test_size=0.2
)

pipelines['baseline_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_bace_test)
print(len(smiles_bace_test))
bace_mols_mapper = dict() # smile -> label
for i in range(len(smiles_bace)):
    bace_mols_mapper[smiles_bace[i]] = labels_bace[i]

for idx, filter in enumerate(filters):
    if idx == 1:
        break

    filter_f = filter()
    filter_f.fit(smiles_bace, labels_bace)
    filtered_mols = filter_f.transform(smiles_bace)
    filtered_labels = [ bace_mols_mapper[mol] for mol in filtered_mols ]
    rest_mols = [ mol for mol in smiles_bace if mol not in filtered_mols ]
    rest_labels = [ bace_mols_mapper[mol] for mol in rest_mols ]

    filtered_mols_bace_train, filtered_mols_bace_test, y_train_filtered, y_test_filtered = scaffold_train_test_split(
        filtered_mols, filtered_labels, test_size=0.2
    )
    print("Filtered params: ", len(filtered_mols_bace_train) , len(y_train_filtered), len(filtered_mols_bace_test))

    pipelines['baseline_pipeline'].fit(filtered_mols_bace_train, y_train_filtered)
    y_pred_filtered = pipelines['baseline_pipeline'].predict(filtered_mols_bace_test)
    y_proba_filtered = pipelines['baseline_pipeline'].predict_proba(filtered_mols_bace_test)

    print("Baseline: ")
    print(len(y_test), len(y_pred), len(y_proba))
    roc_auc_baseline = calculate_scores(y_test, y_pred, y_proba)
    print("Filtered: ")
    print(len(y_test_filtered), len(y_pred_filtered), len(y_proba_filtered))
    roc_auc_filtered = calculate_scores(y_test_filtered, y_pred_filtered, y_proba_filtered)

    print("Mixed params: ", len(filtered_mols_bace_train), len(y_train_filtered), len(smiles_bace_test))
    pipelines['baseline_pipeline'].fit(filtered_mols_bace_train, y_train_filtered)
    y_pred_mixed = pipelines['baseline_pipeline'].predict(smiles_bace_test)
    y_proba_mixed = pipelines['baseline_pipeline'].predict_proba(smiles_bace_test)

    print("Mixed: ")
    print(len(y_test), len(y_pred_mixed), len(y_proba_mixed))
    roc_auc_mixed = calculate_scores(y_test, y_pred_mixed, y_proba_mixed)

    print("Rest params: ", len(filtered_mols_bace_train), len(y_train_filtered), len(rest_mols))
    pipelines['baseline_pipeline'].fit(filtered_mols_bace_train, y_train_filtered)
    y_pred_rest = pipelines['baseline_pipeline'].predict(rest_mols)
    y_proba_rest = pipelines['baseline_pipeline'].predict_proba(rest_mols)

    print("Rest: ")
    print(len(y_test), len(y_pred_rest), len(y_proba_rest))
    roc_auc_mixed = calculate_scores(rest_labels, y_pred_rest, y_proba_rest)
    

303
Filtered params:  1198 1198 300
Baseline: 
303 303 303
R2 score for Linear regression model: -0.1929
Accuracy for Linear regression model: 0.7096
ROC-AUC score for Linear regression model: 0.7980
Filtered: 
300 300 300
R2 score for Linear regression model: -0.1364
Accuracy for Linear regression model: 0.7200
ROC-AUC score for Linear regression model: 0.8042
Mixed params:  1198 1198 303
Mixed: 
303 303 303
R2 score for Linear regression model: -0.1929
Accuracy for Linear regression model: 0.7096
ROC-AUC score for Linear regression model: 0.7951
Rest params:  1198 1198 15
Rest: 
303 15 15
R2 score for Linear regression model: -0.3636
Accuracy for Linear regression model: 0.7333
ROC-AUC score for Linear regression model: 0.7955


In [6]:
import csv

filename = "results_for_classification_with_2_filters.csv"

with open(filename, "a") as file:
    # writer = csv.writer(file, delimiter=",")
    # writer.writerow(["Dataset", "Model", "Filters", "Metric", "Baseline value", "After filter application", "Improvement (%)", "Filter metric R2", "Filter metric Accuracy", "Baseline metric R2", "Baseline metric Accuracy", "Mixed metric R2", "Mixed metric Accuracy", "Mixed metric roc_auc", "Rest metric R2", "Rest metric Accuracy", "Rest metric roc_auc","Notes"])
    # writer.writerow(["bace222", "LogisticRegression", "-", "ROC-AUC", 0.4753, -1, -1, "None"])

In [23]:
filter_pairs = list(itertools.combinations(filters, r=2))
print(filter_pairs)

[(<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.bms.BMSFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.brenk.BrenkFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.faf4_druglike.FAF4DruglikeFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.faf4_leadlike.FAF4LeadlikeFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.ghose.GhoseFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.glaxo.GlaxoFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.gsk.GSKFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.hao.HaoFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.inpharmatica.InpharmaticaFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.lint.LINTFilter'>), (<class 'skfp.filte

In [24]:
# filename = "results_for_classification.csv"
filename = "results_for_classification_with_2_filters.csv"

prepared_models = {
    # "baseline_pipeline": "LogisticRegression", 
    "gb_pipeline": "GradientBoostingClassifier",
    "GNB_pipeline": "GaussianNB"
}

smiles_bace, labels_bace = load_bace()
smiles_bace_train, smiles_bace_test, y_train, y_test = scaffold_train_test_split(
    smiles_bace, labels_bace, test_size=0.2
)

bace_mols_mapper = dict() # smile -> label
for i in range(len(smiles_bace)):
    bace_mols_mapper[smiles_bace[i]] = labels_bace[i]

for pipeline_name, model_name in prepared_models.items():

    pipelines[pipeline_name].fit(smiles_bace_train, y_train)
    y_pred = pipelines[pipeline_name].predict(smiles_bace_test)
    y_proba = pipelines[pipeline_name].predict_proba(smiles_bace_test)

    r2score_baseline, accuracy_baseline, roc_auc_baseline = calculate_scores(y_test, y_pred, y_proba)

    print(f"Model name: {model_name}")
    for idx, filters in enumerate(filter_pairs):
        print(f"\tFilter names({idx}): {filters[0].__name__}, {filters[1].__name__}")
        try:
            filter_f = filters[0]()
            filter_f.fit(smiles_bace)
            filtered_mols = filter_f.transform(smiles_bace)
            filter_f = filters[1]()
            filter_f.fit(filtered_mols)
            filtered_mols = filter_f.transform(filtered_mols)
            filtered_labels = [ bace_mols_mapper[mol] for mol in filtered_mols ]
            rest_mols = [ mol for mol in smiles_bace if mol not in filtered_mols ]
            rest_labels = [ bace_mols_mapper[mol] for mol in rest_mols ]

            filtered_mols_bace_train, filtered_mols_bace_test, y_train_filtered, y_test_filtered = scaffold_train_test_split(
                filtered_mols, filtered_labels, test_size=0.2
            )


            pipelines[pipeline_name].fit(filtered_mols_bace_train, y_train_filtered)
            y_pred_filtered = pipelines[pipeline_name].predict(filtered_mols_bace_test)
            y_proba_filtered = pipelines[pipeline_name].predict_proba(filtered_mols_bace_test)


            r2score_filtered, accuracy_filtered, roc_auc_filtered = calculate_scores(y_test_filtered, y_pred_filtered, y_proba_filtered)

            pipelines[pipeline_name].fit(filtered_mols_bace_train, y_train_filtered)
            y_pred_mixed = pipelines[pipeline_name].predict(smiles_bace_test)
            y_proba_mixed = pipelines[pipeline_name].predict_proba(smiles_bace_test)

            r2score_mixed, accuracy_mixed, roc_auc_mixed = calculate_scores(y_test, y_pred_mixed, y_proba_mixed)

            pipelines[pipeline_name].fit(filtered_mols_bace_train, y_train_filtered)
            y_pred_rest = pipelines[pipeline_name].predict(rest_mols)
            y_proba_rest = pipelines[pipeline_name].predict_proba(rest_mols)

            r2score_rest, accuracy_rest, roc_auc_rest = calculate_scores(rest_labels, y_pred_rest, y_proba_rest)

            with open(filename, "a") as file:
                writer = csv.writer(file, delimiter=",")
                writer.writerow(["bace", model_name, str(filters[0].__name__)+";"+str(filters[1].__name__), "ROC-AUC", roc_auc_baseline, roc_auc_filtered, (roc_auc_filtered/roc_auc_baseline)*100, r2score_filtered, accuracy_filtered, r2score_baseline, accuracy_baseline, r2score_mixed, accuracy_mixed, roc_auc_mixed, r2score_rest, accuracy_rest, roc_auc_rest, "None"])
        except Exception as e:
            print(f"Exception: {e} for filter: {filter.__name__}")
            continue
    

Model name: GradientBoostingClassifier
	Filter names(0): BeyondRo5Filter, BMSFilter
	Filter names(1): BeyondRo5Filter, BrenkFilter
	Filter names(2): BeyondRo5Filter, FAF4DruglikeFilter
	Filter names(3): BeyondRo5Filter, FAF4LeadlikeFilter
	Filter names(4): BeyondRo5Filter, GhoseFilter
	Filter names(5): BeyondRo5Filter, GlaxoFilter
	Filter names(6): BeyondRo5Filter, GSKFilter
	Filter names(7): BeyondRo5Filter, HaoFilter
	Filter names(8): BeyondRo5Filter, InpharmaticaFilter
	Filter names(9): BeyondRo5Filter, LINTFilter
	Filter names(10): BeyondRo5Filter, LipinskiFilter
	Filter names(11): BeyondRo5Filter, MLSMRFilter
	Filter names(12): BeyondRo5Filter, MolecularWeightFilter
	Filter names(13): BeyondRo5Filter, NIBRFilter
	Filter names(14): BeyondRo5Filter, NIHFilter
	Filter names(15): BeyondRo5Filter, OpreaFilter
	Filter names(16): BeyondRo5Filter, PAINSFilter
	Filter names(17): BeyondRo5Filter, PfizerFilter
	Filter names(18): BeyondRo5Filter, REOSFilter
	Filter names(19): BeyondRo5Filter,



Exception: Only one class present in y_true. ROC AUC score is not defined in that case. for filter: ZINCDruglikeFilter
	Filter names(431): RuleOfTwoFilter, SureChEMBLFilter
Exception: Only one class present in y_true. ROC AUC score is not defined in that case. for filter: ZINCDruglikeFilter
	Filter names(432): RuleOfTwoFilter, TiceHerbicidesFilter
Exception: y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required. for filter: ZINCDruglikeFilter
	Filter names(433): RuleOfTwoFilter, TiceInsecticidesFilter




Exception: Only one class present in y_true. ROC AUC score is not defined in that case. for filter: ZINCDruglikeFilter
	Filter names(434): RuleOfTwoFilter, ValenceDiscoveryFilter
Exception: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters. for filter: ZINCDruglikeFilter
	Filter names(435): RuleOfTwoFilter, ZINCBasicFilter
Exception: Only one class present in y_true. ROC AUC score is not defined in that case. for filter: ZINCDruglikeFilter
	Filter names(436): RuleOfTwoFilter, ZINCDruglikeFilter
Exception: y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required. for filter: ZINCDruglikeFilter
	Filter names(437): RuleOfVeberFilter, RuleOfXuFilter
	Filter names(438): RuleOfVeberFilter, SureChEMBLFilter
	Filter names(439): RuleOfVeberFilter, TiceHerbicidesFilter
	Filter names(440): RuleOfVeberFilter, TiceInsecticidesFilter
	Filter names(441): Rul



Exception: index 1 is out of bounds for axis 1 with size 1 for filter: ZINCDruglikeFilter
	Filter names(106): FAF4DruglikeFilter, RuleOfVeberFilter
	Filter names(107): FAF4DruglikeFilter, RuleOfXuFilter
	Filter names(108): FAF4DruglikeFilter, SureChEMBLFilter
	Filter names(109): FAF4DruglikeFilter, TiceHerbicidesFilter
	Filter names(110): FAF4DruglikeFilter, TiceInsecticidesFilter
	Filter names(111): FAF4DruglikeFilter, ValenceDiscoveryFilter
	Filter names(112): FAF4DruglikeFilter, ZINCBasicFilter
	Filter names(113): FAF4DruglikeFilter, ZINCDruglikeFilter
	Filter names(114): FAF4LeadlikeFilter, GhoseFilter
	Filter names(115): FAF4LeadlikeFilter, GlaxoFilter
	Filter names(116): FAF4LeadlikeFilter, GSKFilter
	Filter names(117): FAF4LeadlikeFilter, HaoFilter
	Filter names(118): FAF4LeadlikeFilter, InpharmaticaFilter
	Filter names(119): FAF4LeadlikeFilter, LINTFilter
	Filter names(120): FAF4LeadlikeFilter, LipinskiFilter
	Filter names(121): FAF4LeadlikeFilter, MLSMRFilter
	Filter names(122

  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)


Exception: index 1 is out of bounds for axis 1 with size 1 for filter: ZINCDruglikeFilter
	Filter names(132): FAF4LeadlikeFilter, RuleOfVeberFilter
	Filter names(133): FAF4LeadlikeFilter, RuleOfXuFilter
	Filter names(134): FAF4LeadlikeFilter, SureChEMBLFilter
	Filter names(135): FAF4LeadlikeFilter, TiceHerbicidesFilter
	Filter names(136): FAF4LeadlikeFilter, TiceInsecticidesFilter
	Filter names(137): FAF4LeadlikeFilter, ValenceDiscoveryFilter
	Filter names(138): FAF4LeadlikeFilter, ZINCBasicFilter
	Filter names(139): FAF4LeadlikeFilter, ZINCDruglikeFilter
	Filter names(140): GhoseFilter, GlaxoFilter
	Filter names(141): GhoseFilter, GSKFilter
	Filter names(142): GhoseFilter, HaoFilter
	Filter names(143): GhoseFilter, InpharmaticaFilter
	Filter names(144): GhoseFilter, LINTFilter
	Filter names(145): GhoseFilter, LipinskiFilter
	Filter names(146): GhoseFilter, MLSMRFilter
	Filter names(147): GhoseFilter, MolecularWeightFilter
	Filter names(148): GhoseFilter, NIBRFilter
	Filter names(149):

  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)


Exception: index 1 is out of bounds for axis 1 with size 1 for filter: ZINCDruglikeFilter
	Filter names(157): GhoseFilter, RuleOfVeberFilter
	Filter names(158): GhoseFilter, RuleOfXuFilter
	Filter names(159): GhoseFilter, SureChEMBLFilter
	Filter names(160): GhoseFilter, TiceHerbicidesFilter
	Filter names(161): GhoseFilter, TiceInsecticidesFilter
	Filter names(162): GhoseFilter, ValenceDiscoveryFilter
	Filter names(163): GhoseFilter, ZINCBasicFilter
	Filter names(164): GhoseFilter, ZINCDruglikeFilter
	Filter names(165): GlaxoFilter, GSKFilter
	Filter names(166): GlaxoFilter, HaoFilter
	Filter names(167): GlaxoFilter, InpharmaticaFilter
	Filter names(168): GlaxoFilter, LINTFilter
	Filter names(169): GlaxoFilter, LipinskiFilter
	Filter names(170): GlaxoFilter, MLSMRFilter
	Filter names(171): GlaxoFilter, MolecularWeightFilter
	Filter names(172): GlaxoFilter, NIBRFilter
	Filter names(173): GlaxoFilter, NIHFilter
	Filter names(174): GlaxoFilter, OpreaFilter
	Filter names(175): GlaxoFilter,



Exception: index 1 is out of bounds for axis 1 with size 1 for filter: ZINCDruglikeFilter
	Filter names(267): LINTFilter, RuleOfVeberFilter
	Filter names(268): LINTFilter, RuleOfXuFilter
	Filter names(269): LINTFilter, SureChEMBLFilter
	Filter names(270): LINTFilter, TiceHerbicidesFilter
	Filter names(271): LINTFilter, TiceInsecticidesFilter
	Filter names(272): LINTFilter, ValenceDiscoveryFilter
	Filter names(273): LINTFilter, ZINCBasicFilter
	Filter names(274): LINTFilter, ZINCDruglikeFilter
	Filter names(275): LipinskiFilter, MLSMRFilter
	Filter names(276): LipinskiFilter, MolecularWeightFilter
	Filter names(277): LipinskiFilter, NIBRFilter
	Filter names(278): LipinskiFilter, NIHFilter
	Filter names(279): LipinskiFilter, OpreaFilter
	Filter names(280): LipinskiFilter, PAINSFilter
	Filter names(281): LipinskiFilter, PfizerFilter
	Filter names(282): LipinskiFilter, REOSFilter
	Filter names(283): LipinskiFilter, RuleOfFourFilter
	Filter names(284): LipinskiFilter, RuleOfThreeFilter
Exce



Exception: Only one class present in y_true. ROC AUC score is not defined in that case. for filter: ZINCDruglikeFilter
	Filter names(431): RuleOfTwoFilter, SureChEMBLFilter
Exception: Only one class present in y_true. ROC AUC score is not defined in that case. for filter: ZINCDruglikeFilter
	Filter names(432): RuleOfTwoFilter, TiceHerbicidesFilter


  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)


Exception: index 1 is out of bounds for axis 1 with size 1 for filter: ZINCDruglikeFilter
	Filter names(433): RuleOfTwoFilter, TiceInsecticidesFilter




Exception: Only one class present in y_true. ROC AUC score is not defined in that case. for filter: ZINCDruglikeFilter
	Filter names(434): RuleOfTwoFilter, ValenceDiscoveryFilter
Exception: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters. for filter: ZINCDruglikeFilter
	Filter names(435): RuleOfTwoFilter, ZINCBasicFilter
Exception: Only one class present in y_true. ROC AUC score is not defined in that case. for filter: ZINCDruglikeFilter
	Filter names(436): RuleOfTwoFilter, ZINCDruglikeFilter




Exception: index 1 is out of bounds for axis 1 with size 1 for filter: ZINCDruglikeFilter
	Filter names(437): RuleOfVeberFilter, RuleOfXuFilter
	Filter names(438): RuleOfVeberFilter, SureChEMBLFilter
	Filter names(439): RuleOfVeberFilter, TiceHerbicidesFilter
	Filter names(440): RuleOfVeberFilter, TiceInsecticidesFilter
	Filter names(441): RuleOfVeberFilter, ValenceDiscoveryFilter
	Filter names(442): RuleOfVeberFilter, ZINCBasicFilter
	Filter names(443): RuleOfVeberFilter, ZINCDruglikeFilter
	Filter names(444): RuleOfXuFilter, SureChEMBLFilter
	Filter names(445): RuleOfXuFilter, TiceHerbicidesFilter
	Filter names(446): RuleOfXuFilter, TiceInsecticidesFilter
	Filter names(447): RuleOfXuFilter, ValenceDiscoveryFilter
	Filter names(448): RuleOfXuFilter, ZINCBasicFilter
	Filter names(449): RuleOfXuFilter, ZINCDruglikeFilter
	Filter names(450): SureChEMBLFilter, TiceHerbicidesFilter
	Filter names(451): SureChEMBLFilter, TiceInsecticidesFilter
	Filter names(452): SureChEMBLFilter, ValenceDis

## Classification with the Logistic regression and RFC based on the clintox dataset; wihout filtering molecules

### Bace dataset split

In [21]:
smiles_bace, labels_bace = load_bace()

In [None]:
kf = KFold(n_splits=2)
kf.get_n_splits(smiles_bace)
for i, (train_index, test_index) in enumerate(kf.split(smiles_bace)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    X_train, X_test, y_train, y_test = smiles_bace[train_index], smiles_bace[test_index], labels_bace[train_index], labels_bace[test_index]
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    y_proba = classifier.predict_proba(X_test)
    print(len(X_test))
    calculate_scores(y_test, y_pred, y_proba)

Fold 0:
  Train: index=[ 757  758  759  760  761  762  763  764  765  766  767  768  769  770
  771  772  773  774  775  776  777  778  779  780  781  782  783  784
  785  786  787  788  789  790  791  792  793  794  795  796  797  798
  799  800  801  802  803  804  805  806  807  808  809  810  811  812
  813  814  815  816  817  818  819  820  821  822  823  824  825  826
  827  828  829  830  831  832  833  834  835  836  837  838  839  840
  841  842  843  844  845  846  847  848  849  850  851  852  853  854
  855  856  857  858  859  860  861  862  863  864  865  866  867  868
  869  870  871  872  873  874  875  876  877  878  879  880  881  882
  883  884  885  886  887  888  889  890  891  892  893  894  895  896
  897  898  899  900  901  902  903  904  905  906  907  908  909  910
  911  912  913  914  915  916  917  918  919  920  921  922  923  924
  925  926  927  928  929  930  931  932  933  934  935  936  937  938
  939  940  941  942  943  944  945  946  947  948  94

In [8]:
print(len(smiles_bace))
print(smiles_bace[0], labels_bace[0])
print(smiles_bace[1], labels_bace[1])

1513
O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C 1
Fc1cc(cc(F)c1)C[C@H](NC(=O)[C@@H](N1CC[C@](NC(=O)C)(CC(C)C)C1=O)CCc1ccccc1)[C@H](O)[C@@H]1[NH2+]C[C@H](OCCC)C1 1


In [14]:
bace_mols_mapper = dict() # smile -> label
for i in range(len(smiles_bace)):
    bace_mols_mapper[smiles_bace[i]] = labels_bace[i]

In [None]:
filter = MLSMRFilter()

filter.fit(smiles_bace, labels_bace)
# filtered_mols = filter.transform(smiles_bace)
filtered_mols = filter.fit_transform(smiles_bace, labels_bace)
print(len(filtered_mols), len(labels_bace))
print(filtered_mols[0], labels_bace[0])

395 1513
O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C 1


In [16]:
filtered_labels = [ bace_mols_mapper[mol] for mol in filtered_mols ]
print(len(filtered_labels))

395


In [17]:
smiles_bace_train, smiles_bace_test, y_train, y_test = scaffold_train_test_split(
    filtered_mols, filtered_labels, test_size=0.2
)

### Checking the class distibution of the dataset 

In [11]:
print(f"Class distribution: {np.bincount(labels_bace)}")

Class distribution: [822 691]


## Baseline with Logistic regression model

In [18]:
pipelines['baseline_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_bace_test) 

In [20]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.3657
Accuracy for Linear regression model: 0.6709
ROC-AUC score for Linear regression model: 0.7912


In [22]:
from skfp.datasets.moleculenet import load_bace
from skfp.fingerprints import ECFPFingerprint
from skfp.model_selection import FingerprintEstimatorGridSearch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

fp = ECFPFingerprint(n_jobs=-1)
fp_params = {"radius": [2, 3]}
clf = RandomForestClassifier(n_jobs=-1)
clf_params = {"min_samples_split": [2, 3, 4]}
clf_cv = GridSearchCV(clf, clf_params)
fp_cv = FingerprintEstimatorGridSearch(fp, fp_params, clf_cv)

In [23]:
fp_cv.fit(smiles_bace_train, y_train)
y_pred = fp_cv.predict(smiles_bace_test)
y_proba = fp_cv.predict_proba(smiles_bace_test) 

In [24]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.4807
Accuracy for Linear regression model: 0.3960
ROC-AUC score for Linear regression model: 0.4112


## RF classifier

In [25]:
pipelines['rf_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['rf_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['rf_pipeline'].predict_proba(smiles_bace_test)  

In [26]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.4943
Accuracy for Linear regression model: 0.3927
ROC-AUC score for Linear regression model: 0.3990


In [66]:
pipelines['gb_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['gb_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['gb_pipeline'].predict_proba(smiles_bace_test)
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.3316
Accuracy for Linear regression model: 0.4323
ROC-AUC score for Linear regression model: 0.4265


In [67]:
pipelines['knc_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['knc_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['knc_pipeline'].predict_proba(smiles_bace_test)
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.4265
Accuracy for Linear regression model: 0.4092
ROC-AUC score for Linear regression model: 0.4389


In [68]:
pipelines['mlp_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['mlp_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['mlp_pipeline'].predict_proba(smiles_bace_test)
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.3858
Accuracy for Linear regression model: 0.4191
ROC-AUC score for Linear regression model: 0.5000


In [69]:
pipelines['GNB_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['GNB_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['GNB_pipeline'].predict_proba(smiles_bace_test)
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.0334
Accuracy for Linear regression model: 0.5050
ROC-AUC score for Linear regression model: 0.4974


In [70]:
pipelines['svm_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['svm_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['svm_pipeline'].predict_proba(smiles_bace_test)
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.4265
Accuracy for Linear regression model: 0.4092
ROC-AUC score for Linear regression model: 0.4565


In [45]:
# filtering molecules from the test dataset 

from skfp.filters import PfizerFilter

filt = PfizerFilter(allow_one_violation=True)

In [46]:
smiles_to_label = dict(zip(smiles_train, y_train))

In [47]:
filtered_smiles = filt.transform(smiles_train)
filtered_labels = [smiles_to_label[smi] for smi in filtered_smiles]

In [None]:
pipelines['rf_pipeline'].fit(filtered_smiles, filtered_labels)

In [None]:
y_pred = pipelines['rf_pipeline'].predict(smiles_test)
y_proba = pipelines['rf_pipeline'].predict_proba(smiles_test)

In [None]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for RF classifier: -0.4234
Accuracy for RF classifier: 0.6535
ROC-AUC score for RF classifier: 0.7345


## CLINTOX

In [78]:
smiles, labels = load_clintox()

In [79]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

1477
[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)Cl)Cl)Cl)Cl)Cl [1 0]
[C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)[O-])O)O [1 0]


In [None]:
pipelines['baseline_pipeline'].fit(smiles_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_test) 

## BBBP

In [28]:
smiles_bbbp, labels_bbbp = load_bbbp()

In [29]:
print(len(smiles_bbbp))
print(smiles_bbbp[0], labels_bbbp[0])
print(smiles_bbbp[1], labels_bbbp[1])

2039
[Cl].CC(C)NCC(O)COc1cccc2ccccc12 1
C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl 1


In [30]:
smiles_bbbp_train, smiles_bbbp_test, y_train, y_test = scaffold_train_test_split(
    smiles_bbbp, labels_bbbp, test_size=0.2
)



In [31]:
print(f"Class distribution: {np.bincount(labels_bbbp)}")

Class distribution: [ 479 1560]


In [32]:
pipelines['baseline_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_bbbp_test) 



In [None]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.9813
Accuracy for Linear regression model: 0.5074
ROC-AUC score for Linear regression model: 0.3868


In [36]:
pipelines['rf_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['rf_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['rf_pipeline'].predict_proba(smiles_bbbp_test) 



In [37]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.8630
Accuracy for Linear regression model: 0.5368
ROC-AUC score for Linear regression model: 0.4356


In [38]:
pipelines['gb_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['gb_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['gb_pipeline'].predict_proba(smiles_bbbp_test) 



In [39]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.8630
Accuracy for Linear regression model: 0.5368
ROC-AUC score for Linear regression model: 0.3970


In [43]:
pipelines['knc_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['knc_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['knc_pipeline'].predict_proba(smiles_bbbp_test) 



In [44]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.8433
Accuracy for Linear regression model: 0.5417
ROC-AUC score for Linear regression model: 0.4331


In [61]:
pipelines['svm_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['svm_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['svm_pipeline'].predict_proba(smiles_bbbp_test) 



In [62]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.8630
Accuracy for Linear regression model: 0.5368
ROC-AUC score for Linear regression model: 0.4393


In [52]:
pipelines['GNB_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['GNB_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['GNB_pipeline'].predict_proba(smiles_bbbp_test) 



In [53]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.0799
Accuracy for Linear regression model: 0.4828
ROC-AUC score for Linear regression model: 0.4579


In [56]:
pipelines['mlp_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['mlp_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['mlp_pipeline'].predict_proba(smiles_bbbp_test) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [57]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.0306
Accuracy for Linear regression model: 0.4951
ROC-AUC score for Linear regression model: 0.4528


## ESOL

In [66]:
smiles, labels = load_esol()

In [67]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

1128
OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O  -0.77
Cc1occc1C(=O)Nc2ccccc2 -3.3


## LIPOP

In [68]:
smiles, labels = load_lipophilicity()

In [69]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

4200
Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14 3.54
COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23 -1.18


## MUV

In [70]:
smiles, labels = load_muv()

In [71]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

93087
Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C [nan nan nan nan nan nan nan  0. nan nan nan  0. nan nan nan nan nan]
Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1 [ 0.  0. nan nan  0.  0.  0. nan nan nan  0. nan  0. nan nan  0.  0.]


## PCBA

In [72]:
smiles, labels = load_pcba()

In [73]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

437929
CC(=O)N1CCC2(CC1)NC(=O)N(c1ccccc1)N2 [ 0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.
  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0. nan  0.  0.  0. nan  0. nan  0.  0. nan  0.  0.  0.
  0.  0.  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0. nan  0.  0.
  0.  0.  1. nan  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan]
N#Cc1nnn(-c2ccc(Cl)cc2)c1N [ 0.  0. nan  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0. nan  0.  0.  0. na

## TOX21

In [74]:
smiles, labels = load_tox21()

In [75]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

7831
CCOc1ccc2nc(S(N)(=O)=O)sc2c1 [ 0.  0.  1. nan nan  0.  0.  1.  0.  0.  0.  0.]
CCN1C(=O)NC(c2ccccc2)C1=O [ 0.  0.  0.  0.  0.  0.  0. nan  0. nan  0.  0.]


## TOXCAST

In [76]:
smiles, labels = load_toxcast()

In [77]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

8576
[O-][N+](=O)C1=CC=C(Cl)C=C1 [ 0.  0. nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  