In [1]:
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree
from skfp.model_selection import scaffold_train_test_split
from skfp.preprocessing import MolFromSmilesTransformer
from skfp.fingerprints import ECFPFingerprint, MACCSFingerprint, RDKitFingerprint

from skfp.filters import BeyondRo5Filter, BMSFilter, BrenkFilter, FAF4DruglikeFilter, FAF4LeadlikeFilter, GhoseFilter, GlaxoFilter, GSKFilter
from skfp.filters import HaoFilter, InpharmaticaFilter, LINTFilter, LipinskiFilter, MLSMRFilter, MolecularWeightFilter, NIBRFilter, NIHFilter
from skfp.filters import OpreaFilter, PAINSFilter, PfizerFilter, REOSFilter, RuleOfFourFilter, RuleOfThreeFilter, RuleOfTwoFilter, RuleOfVeberFilter
from skfp.filters import RuleOfXuFilter, SureChEMBLFilter, TiceHerbicidesFilter, TiceInsecticidesFilter, ValenceDiscoveryFilter, ZINCBasicFilter, ZINCDruglikeFilter

from skfp.datasets.moleculenet import load_esol, load_lipophilicity
from sklearn.pipeline import make_pipeline, make_union

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, r2_score, root_mean_squared_error
#from deepchem.feat.smiles_tokenizer import SmilesTokenizer

import itertools


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
models = [
    LinearRegression(),
    #LogisticRegression(max_iter=1000, random_state=0),
    svm.SVR(),
    MLPRegressor(random_state=1, max_iter=500),
    linear_model.BayesianRidge(),
    make_pipeline(StandardScaler(), 
    SGDRegressor(max_iter=1000, tol=1e-3)),
    KNeighborsRegressor(n_neighbors=2),
    tree.DecisionTreeRegressor(), 
    GradientBoostingRegressor()
]

pipelines = {
    "linear_regression_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint()), models[0]),
    "svm_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[1]),
    "mlp_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[2]),
    "bayesian_ridge_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[3]),
    "sgd_regressor_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[4]),
    "KNR_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[5]),
    "DTR_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[6]),
    "gradient_boosting_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[7]),
}

In [3]:
esol_data, esol_labels = load_esol()

X_train, X_test, y_train, y_test = scaffold_train_test_split(esol_data, esol_labels, test_size=0.2)

In [4]:
regression = pipelines["svm_pipeline"]
regression.fit(X_train, y_train)

In [5]:
y_pred = regression.predict(X_test)

#print("Accuracy: ", accuracy_score(y_pred=y_pred, y_true=y_test))
#print("Roc_auc: ", roc_auc_score(y_pred=y_pred, y_true=y_test))
print("RMSE: ", root_mean_squared_error(y_pred=y_pred, y_true=y_test))

RMSE:  1.2761601881820541


In [6]:
def calculate_scores(y_test, y_pred, y_proba):
    # r2score = r2_score(y_test, y_pred)
    # print(f"R2 score for Linear regression model: {r2score:.4f}")

    # accuracy = accuracy_score(y_test, y_pred)
    # print(f"Accuracy for Linear regression model: {accuracy:.4f}")

    # roc_auc = roc_auc_score(y_test, y_proba[:, 1])
    rmse = root_mean_squared_error(y_pred=y_pred, y_true=y_test)
    # print(f"ROC-AUC score for Linear regression model: {roc_auc:.4f}")
    return rmse #r2score, accuracy, roc_auc

In [7]:
import csv

filename = "results_for_regression_with_2_filters.csv"

with open(filename, "a") as file:
    writer = csv.writer(file, delimiter=",")
    writer.writerow(["Dataset", "Model", "Filters", "Metric", "Baseline value", "After filter application", "Improvement (%)", "Filter metric R2", "Filter metric Accuracy", "Baseline metric R2", "Baseline metric Accuracy", "Mixed metric R2", "Mixed metric Accuracy", "Mixed metric roc_auc", "Rest metric R2", "Rest metric Accuracy", "Rest metric roc_auc","Notes"])
    #writer.writerow(["bace222", "LogisticRegression", "-", "ROC-AUC", 0.4753, -1, -1, "None"])

In [8]:
filters = [
    BeyondRo5Filter, BMSFilter, BrenkFilter, FAF4DruglikeFilter, FAF4LeadlikeFilter, GhoseFilter, GlaxoFilter, GSKFilter,
    HaoFilter, InpharmaticaFilter, LINTFilter, LipinskiFilter, MLSMRFilter, MolecularWeightFilter, NIBRFilter, NIHFilter,
    OpreaFilter, PAINSFilter, PfizerFilter, REOSFilter, RuleOfFourFilter, RuleOfThreeFilter, RuleOfTwoFilter, RuleOfVeberFilter,
    RuleOfXuFilter, SureChEMBLFilter, TiceHerbicidesFilter, TiceInsecticidesFilter, ValenceDiscoveryFilter, ZINCBasicFilter, ZINCDruglikeFilter
]

filter_names = []

for filter in filters:
    filter_names.append(filter.__name__)

In [9]:
filter_pairs = list(itertools.combinations(filters, r=2))
print(filter_pairs)

[(<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.bms.BMSFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.brenk.BrenkFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.faf4_druglike.FAF4DruglikeFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.faf4_leadlike.FAF4LeadlikeFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.ghose.GhoseFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.glaxo.GlaxoFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.gsk.GSKFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.hao.HaoFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.inpharmatica.InpharmaticaFilter'>), (<class 'skfp.filters.beyond_ro5.BeyondRo5Filter'>, <class 'skfp.filters.lint.LINTFilter'>), (<class 'skfp.filte

In [10]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [11]:
import csv
import numpy as np
from sklearn.metrics import mean_squared_error

filename = "results_for_regression_with_2_filters.csv"

prepared_models = {
    "svm_pipeline" : "SVM",
    "mlp_pipeline" : "MLP",
    "sgd_regressor_pipeline" : "SGD",
    "gradient_boosting_pipeline" : "Gradient Boosting Regressor",
}

smiles_esol, labels_esol = load_esol()

smiles_esol_train, smiles_esol_test, y_train, y_test = scaffold_train_test_split(
    smiles_esol, labels_esol, test_size=0.2
)

esol_mols_mapper = {smiles_esol[i]: labels_esol[i] for i in range(len(smiles_esol))}

#with open(filename, "w", newline="") as file:
#    writer = csv.writer(file, delimiter=",")
#    writer.writerow([
#        "Dataset", "Model", "Filters", 
#        "Baseline RMSE", "Filtered RMSE", 
#       "Mixed RMSE", "Rest RMSE", 
#        "Improvement (%)"
#    ])

for pipeline_name, model_name in prepared_models.items():
    pipelines[pipeline_name].fit(smiles_esol_train, y_train)
    y_pred = pipelines[pipeline_name].predict(smiles_esol_test)
    baseline_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    for idx, filters in enumerate(filter_pairs):
        print(f"\tFilter names({idx}): {filters[0].__name__}, {filters[1].__name__}")
        try:
            filter_f = filters[0]()
            filter_f.fit(smiles_esol)
            filtered_mols = filter_f.transform(smiles_esol)

            filter_f2 = filters[1]()
            filter_f2.fit(filtered_mols)
            filtered_mols = filter_f2.transform(filtered_mols)

            filtered_labels = [esol_mols_mapper[mol] for mol in filtered_mols]
            rest_mols = [mol for mol in smiles_esol if mol not in filtered_mols]
            rest_labels = [esol_mols_mapper[mol] for mol in rest_mols]

            filtered_mols_train, filtered_mols_test, y_train_filtered, y_test_filtered = scaffold_train_test_split(
                filtered_mols, filtered_labels, test_size=0.2
            )

            pipelines[pipeline_name].fit(filtered_mols_train, y_train_filtered)
            y_pred_filtered = pipelines[pipeline_name].predict(filtered_mols_test)
            filtered_rmse = np.sqrt(mean_squared_error(y_test_filtered, y_pred_filtered))

            pipelines[pipeline_name].fit(filtered_mols_train, y_train_filtered)
            y_pred_mixed = pipelines[pipeline_name].predict(smiles_esol_test)
            mixed_rmse = np.sqrt(mean_squared_error(y_test, y_pred_mixed))

            pipelines[pipeline_name].fit(filtered_mols_train, y_train_filtered)
            y_pred_rest = pipelines[pipeline_name].predict(rest_mols)
            rest_rmse = np.sqrt(mean_squared_error(rest_labels, y_pred_rest))

            improvement_percentage = (1 - (filtered_rmse / baseline_rmse)) * 100

            with open(filename, "a") as file: 
                writer = csv.writer(file)
                writer.writerow([
                    "esol", model_name, str(filters[0].__name__)+", "+str(filters[1].__name__),
                    baseline_rmse, filtered_rmse, mixed_rmse, rest_rmse,
                    improvement_percentage
                ])
        except Exception as e:
            print(f"Exception: {e} for filters: {filters[0].__name__}")
            continue


	Filter names(0): BeyondRo5Filter, BMSFilter
	Filter names(1): BeyondRo5Filter, BrenkFilter
	Filter names(2): BeyondRo5Filter, FAF4DruglikeFilter
	Filter names(3): BeyondRo5Filter, FAF4LeadlikeFilter
	Filter names(4): BeyondRo5Filter, GhoseFilter
	Filter names(5): BeyondRo5Filter, GlaxoFilter
	Filter names(6): BeyondRo5Filter, GSKFilter
	Filter names(7): BeyondRo5Filter, HaoFilter
	Filter names(8): BeyondRo5Filter, InpharmaticaFilter
	Filter names(9): BeyondRo5Filter, LINTFilter
	Filter names(10): BeyondRo5Filter, LipinskiFilter
	Filter names(11): BeyondRo5Filter, MLSMRFilter
	Filter names(12): BeyondRo5Filter, MolecularWeightFilter
	Filter names(13): BeyondRo5Filter, NIBRFilter
	Filter names(14): BeyondRo5Filter, NIHFilter
	Filter names(15): BeyondRo5Filter, OpreaFilter
	Filter names(16): BeyondRo5Filter, PAINSFilter
	Filter names(17): BeyondRo5Filter, PfizerFilter
	Filter names(18): BeyondRo5Filter, REOSFilter
	Filter names(19): BeyondRo5Filter, RuleOfFourFilter
	Filter names(20): Be