In [1]:
from skfp.datasets.moleculenet import load_bace, load_clintox, load_bbbp, load_esol, load_lipophilicity, load_muv, load_pcba, load_tox21, load_toxcast
from skfp.model_selection import scaffold_train_test_split, butina_train_test_split, maxmin_train_test_split, randomized_scaffold_train_test_split
from skfp.fingerprints import ECFPFingerprint, MACCSFingerprint, RDKitFingerprint
from skfp.preprocessing import MolFromSmilesTransformer
from skfp.filters import BeyondRo5Filter, BMSFilter, BrenkFilter, FAF4DruglikeFilter, FAF4LeadlikeFilter, GhoseFilter, GlaxoFilter, GSKFilter
from skfp.filters import HaoFilter, InpharmaticaFilter, LINTFilter, LipinskiFilter, MLSMRFilter, MolecularWeightFilter, NIBRFilter, NIHFilter
from skfp.filters import OpreaFilter, PAINSFilter, PfizerFilter, REOSFilter, RuleOfFourFilter, RuleOfThreeFilter, RuleOfTwoFilter, RuleOfVeberFilter
from skfp.filters import RuleOfXuFilter, SureChEMBLFilter, TiceHerbicidesFilter, TiceInsecticidesFilter, ValenceDiscoveryFilter, ZINCBasicFilter, ZINCDruglikeFilter

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, make_union
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.neural_network import MLPClassifier

import numpy as np



In [2]:

# model for classification
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    svm.SVC(),
    svm.SVR(kernel='linear'),
    svm.SVR(kernel='poly'),
    svm.SVR(kernel='rbf'),
    svm.SVR(kernel='sigmoid'),
    GaussianNB(),
    tree.DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
]

In [3]:
filters = [
    ECFPFingerprint(),
    MACCSFingerprint(),
    RDKitFingerprint(),
    BeyondRo5Filter(),
    
]

In [4]:
pipelines = {
    "baseline_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint()), LogisticRegression(max_iter=1000)),
    "rf_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), RandomForestClassifier(random_state=41, class_weight='balanced')),
    "gb_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), GradientBoostingClassifier())
}

In [5]:
def calculate_scores(y_test, y_pred, y_proba):
    r2score = r2_score(y_test, y_pred)
    print(f"R2 score for Linear regression model: {r2score:.4f}")

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for Linear regression model: {accuracy:.4f}")

    roc_auc = roc_auc_score(y_test, y_proba[:, 1])
    print(f"ROC-AUC score for Linear regression model: {roc_auc:.4f}")

## Classification with the Logistic regression and RFC based on the clintox dataset; wihout filtering molecules

### Clintox dataset split

In [6]:
smiles, labels = load_bace()

In [7]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

1513
O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C 1
Fc1cc(cc(F)c1)C[C@H](NC(=O)[C@@H](N1CC[C@](NC(=O)C)(CC(C)C)C1=O)CCc1ccccc1)[C@H](O)[C@@H]1[NH2+]C[C@H](OCCC)C1 1


In [8]:
fp_cv.predict(smiles)

NameError: name 'fp_cv' is not defined

In [None]:
#labels = [l[0] for l in labels]

In [9]:
smiles_train, smiles_test, y_train, y_test = scaffold_train_test_split(
    smiles, labels, test_size=0.2
)

### Checking the class distibution of the dataset 

In [10]:
print(f"Class distribution: {np.bincount(labels)}")

Class distribution: [822 691]


## Baseline with Logistic regression model

In [12]:
pipelines['baseline_pipeline'].fit(smiles_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_test) 

In [13]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.2232
Accuracy for Linear regression model: 0.4587
ROC-AUC score for Linear regression model: 0.4749


In [14]:
from skfp.datasets.moleculenet import load_bace
from skfp.fingerprints import ECFPFingerprint
from skfp.model_selection import FingerprintEstimatorGridSearch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

smiles, labels = load_bace()

fp = ECFPFingerprint(n_jobs=-1)
fp_params = {"radius": [2, 3]}
clf = RandomForestClassifier(n_jobs=-1)
clf_params = {"min_samples_split": [2, 3, 4]}
clf_cv = GridSearchCV(clf, clf_params)
fp_cv = FingerprintEstimatorGridSearch(fp, fp_params, clf_cv)

In [15]:
smiles_train, smiles_test, y_train, y_test = scaffold_train_test_split(
    smiles, labels, test_size=0.2
)

In [16]:
fp_cv.fit(smiles_train, y_train)
y_pred = fp_cv.predict(smiles_test)
y_proba = fp_cv.predict_proba(smiles_test) 

In [18]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.4536
Accuracy for Linear regression model: 0.4026
ROC-AUC score for Linear regression model: 0.4028


## RF classifier

In [19]:
pipelines['rf_pipeline'].fit(smiles_train, y_train)
y_pred = pipelines['rf_pipeline'].predict(smiles_test)
y_proba = pipelines['rf_pipeline'].predict_proba(smiles_test)  

In [20]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.4943
Accuracy for Linear regression model: 0.3927
ROC-AUC score for Linear regression model: 0.3990


In [45]:
# filtering molecules from the test dataset 

from skfp.filters import PfizerFilter

filt = PfizerFilter(allow_one_violation=True)

In [46]:
smiles_to_label = dict(zip(smiles_train, y_train))

In [47]:
filtered_smiles = filt.transform(smiles_train)
filtered_labels = [smiles_to_label[smi] for smi in filtered_smiles]

In [None]:
pipelines['rf_pipeline'].fit(filtered_smiles, filtered_labels)

In [None]:
y_pred = pipelines['rf_pipeline'].predict(smiles_test)
y_proba = pipelines['rf_pipeline'].predict_proba(smiles_test)

In [None]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for RF classifier: -0.4234
Accuracy for RF classifier: 0.6535
ROC-AUC score for RF classifier: 0.7345


## CLINTOX

In [78]:
smiles, labels = load_clintox()

In [79]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

1477
[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)Cl)Cl)Cl)Cl)Cl [1 0]
[C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)[O-])O)O [1 0]


In [None]:
pipelines['baseline_pipeline'].fit(smiles_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_test) 

## BBBP

In [92]:
smiles, labels = load_bbbp()

In [93]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

2039
[Cl].CC(C)NCC(O)COc1cccc2ccccc12 1
C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl 1


In [None]:
smiles_train, smiles_test, y_train, y_test = scaffold_train_test_split(
    smiles, labels, test_size=0.2
)

NameError: name 'train_test_split' is not defined

In [95]:
print(f"Class distribution: {np.bincount(labels)}")

Class distribution: [ 479 1560]


In [96]:
pipelines['baseline_pipeline'].fit(smiles_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_test) 



In [97]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: 0.0241
Accuracy for Linear regression model: 0.7574
ROC-AUC score for Linear regression model: 0.8362


## ESOL

In [66]:
smiles, labels = load_esol()

In [67]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

1128
OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O  -0.77
Cc1occc1C(=O)Nc2ccccc2 -3.3


## LIPOP

In [68]:
smiles, labels = load_lipophilicity()

In [69]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

4200
Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14 3.54
COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23 -1.18


## MUV

In [70]:
smiles, labels = load_muv()

In [71]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

93087
Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C [nan nan nan nan nan nan nan  0. nan nan nan  0. nan nan nan nan nan]
Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1 [ 0.  0. nan nan  0.  0.  0. nan nan nan  0. nan  0. nan nan  0.  0.]


## PCBA

In [72]:
smiles, labels = load_pcba()

In [73]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

437929
CC(=O)N1CCC2(CC1)NC(=O)N(c1ccccc1)N2 [ 0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.
  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0. nan  0.  0.  0. nan  0. nan  0.  0. nan  0.  0.  0.
  0.  0.  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0. nan  0.  0.
  0.  0.  1. nan  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan]
N#Cc1nnn(-c2ccc(Cl)cc2)c1N [ 0.  0. nan  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0. nan  0.  0.  0. na

## TOX21

In [74]:
smiles, labels = load_tox21()

In [75]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

7831
CCOc1ccc2nc(S(N)(=O)=O)sc2c1 [ 0.  0.  1. nan nan  0.  0.  1.  0.  0.  0.  0.]
CCN1C(=O)NC(c2ccccc2)C1=O [ 0.  0.  0.  0.  0.  0.  0. nan  0. nan  0.  0.]


## TOXCAST

In [76]:
smiles, labels = load_toxcast()

In [77]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

8576
[O-][N+](=O)C1=CC=C(Cl)C=C1 [ 0.  0. nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  

### Fingerprints filtering benchmarks

In [21]:
from skfp.datasets.moleculenet import load_bace, load_clintox, load_bbbp, load_esol, load_lipophilicity, load_muv, load_pcba, load_tox21, load_toxcast
from skfp.model_selection import scaffold_train_test_split
from skfp.fingerprints import ECFPFingerprint, MACCSFingerprint, RDKitFingerprint
from skfp.preprocessing import MolFromSmilesTransformer
from skfp.filters import (
    BeyondRo5Filter, BMSFilter, BrenkFilter, FAF4DruglikeFilter, LipinskiFilter, PfizerFilter
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.metrics import accuracy_score, roc_auc_score, r2_score
import numpy as np

In [23]:
datasets = {
    "bace": load_bace(),
    "clintox": load_clintox(),
    "bbbp": load_bbbp(),
    "esol": load_esol(),
    "lipophilicity": load_lipophilicity(),
}

filters = [
    BeyondRo5Filter(),
    BMSFilter(),
    BrenkFilter(),
    FAF4DruglikeFilter(),
    LipinskiFilter(),
    PfizerFilter(allow_one_violation=True)
]

In [25]:
rf_pipeline = make_pipeline(
    MolFromSmilesTransformer(),
    make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()),
    RandomForestClassifier(random_state=41, class_weight='balanced')
)

def calculate_scores(y_test, y_pred, y_proba):
    results = {}
    results['R2'] = r2_score(y_test, y_pred)
    results['Accuracy'] = accuracy_score(y_test, y_pred)
    results['ROC-AUC'] = roc_auc_score(y_test, y_proba[:, 1]) if y_proba.shape[1] > 1 else 0.0
    return results

In [27]:
for dataset_name, (smiles, labels) in datasets.items():
    print(f"\n--- Dataset: {dataset_name.upper()} ---\n")
    smiles_train, smiles_test, y_train, y_test = scaffold_train_test_split(smiles, labels, test_size=0.2)
    
    print("Without filtering:")
    rf_pipeline.fit(smiles_train, y_train)
    y_pred = rf_pipeline.predict(smiles_test)
    y_proba = rf_pipeline.predict_proba(smiles_test)
    scores_without_filter = calculate_scores(y_test, y_pred, y_proba)
    print(scores_without_filter)
    
    for filt in filters:
        print(f"\nWith filter: {filt.__class__.__name__}")
        smiles_to_label = dict(zip(smiles_train, y_train))
        filtered_smiles = filt.transform(smiles_train)
        filtered_labels = [smiles_to_label[smi] for smi in filtered_smiles]

        rf_pipeline.fit(filtered_smiles, filtered_labels)
        y_pred = rf_pipeline.predict(smiles_test)
        y_proba = rf_pipeline.predict_proba(smiles_test)
        scores_with_filter = calculate_scores(y_test, y_pred, y_proba)
        print(scores_with_filter)


--- Dataset: BACE ---

Without filtering:
{'R2': -1.4942734430923412, 'Accuracy': 0.3927392739273927, 'ROC-AUC': 0.3990470651395848}

With filter: BeyondRo5Filter
{'R2': -1.4807176091624914, 'Accuracy': 0.39603960396039606, 'ROC-AUC': 0.41694255547602005}

With filter: BMSFilter
{'R2': -1.4807176091624914, 'Accuracy': 0.39603960396039606, 'ROC-AUC': 0.39808518253400144}

With filter: BrenkFilter
{'R2': -1.3587151037938443, 'Accuracy': 0.42574257425742573, 'ROC-AUC': 0.45177165354330706}

With filter: FAF4DruglikeFilter
{'R2': -1.3180476020042953, 'Accuracy': 0.43564356435643564, 'ROC-AUC': 0.48120973514674303}

With filter: LipinskiFilter
{'R2': -1.5756084466714393, 'Accuracy': 0.37293729372937295, 'ROC-AUC': 0.4185978883321403}

With filter: PfizerFilter
{'R2': -1.304491768074446, 'Accuracy': 0.4389438943894389, 'ROC-AUC': 0.43595651395848245}

--- Dataset: CLINTOX ---

Without filtering:


KeyboardInterrupt: 