In [239]:
import numpy as np

from skfp.datasets.moleculenet import load_bace, load_clintox
from skfp.model_selection import scaffold_train_test_split
from skfp.fingerprints import ECFPFingerprint, MACCSFingerprint, RDKitFingerprint
from skfp.preprocessing import MolFromSmilesTransformer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, make_union
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

## Classification with logistic regression and a random forest classifier (RFC) on the BACE dataset, without filtering molecules

### BACE dataset split

In [240]:
# Load the BACE dataset: SMILES strings and their class labels.
smiles, labels = load_bace()

In [241]:
#labels = [l[0] for l in labels]

In [242]:
# Scaffold-based train/test split; 20% of the data is held out for testing.
smiles_train, smiles_test, y_train, y_test = scaffold_train_test_split(smiles, labels, test_size=0.2)

### Checking the class distribution of the dataset

In [243]:
# Count the samples per class (array index = class label) to check for class
# imbalance. Requires `numpy` imported as `np` in the imports cell.
class_counts = np.bincount(labels)
print(f"Class distribution: {class_counts}")

Class distribution: [822 691]


## Baseline with Logistic regression model

In [244]:
# Baseline features: count-based ECFP and MACCS fingerprints, combined with make_union.
baseline_features = make_union(ECFPFingerprint(count=True), MACCSFingerprint())

# SMILES -> RDKit molecules -> fingerprints -> logistic regression.
baseline_pipeline = make_pipeline(
    MolFromSmilesTransformer(),
    baseline_features,
    LogisticRegression(max_iter=1000),
)

In [245]:
# Train the baseline on the training split, then score the held-out test molecules.
baseline_pipeline.fit(smiles_train, y_train)

y_pred = baseline_pipeline.predict(smiles_test)  # hard class labels (for accuracy)
y_proba = baseline_pipeline.predict_proba(smiles_test)  # class probabilities (for ROC-AUC)

In [246]:
# Accuracy of the logistic regression baseline on the test split (hard labels).
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy for Logistic regression model: {accuracy:.4f}")

# ROC-AUC is computed from the predicted probability of class 1 (second column).
roc_auc = roc_auc_score(y_test, y_proba[:, 1])
print(f"ROC-AUC score for Logistic regression model: {roc_auc:.4f}")

Accuracy for Linear regression model: 0.4587
ROC-AUC score for Linear regression model: 0.4749


## RF classifier

In [250]:
# Random forest features: count-based ECFP, MACCS and RDKit fingerprints,
# combined with make_union.
rf_features = make_union(
    ECFPFingerprint(count=True),
    MACCSFingerprint(),
    RDKitFingerprint(),
)

# Fixed seed for reproducibility; class weights balanced by the classifier.
rf_classifier = RandomForestClassifier(random_state=0, class_weight='balanced')

pipeline = make_pipeline(MolFromSmilesTransformer(), rf_features, rf_classifier)

In [251]:
# Train the random forest pipeline on the full training split and score the test split.
pipeline.fit(smiles_train, y_train)

y_pred = pipeline.predict(smiles_test)  # hard class labels (for accuracy)
y_proba = pipeline.predict_proba(smiles_test)  # class probabilities (for ROC-AUC)

In [252]:
# Evaluate the random forest on the test split: accuracy from the hard
# predictions, ROC-AUC from the predicted probability of class 1.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba[:, 1])

print(f"Accuracy for RF classifier: {accuracy:.4f}")
print(f"ROC-AUC score for RF classifier: {roc_auc:.4f}")

Accuracy for RF classifier: 0.3828
ROC-AUC score for RF classifier: 0.4300


In [253]:
#y_train_1d = [a[0] for a in y_train]

In [254]:
# Set up the Pfizer filter that is applied to the *training* molecules
# (`smiles_train`) further below; the test set is left unfiltered.

from skfp.filters import PfizerFilter

# allow_one_violation=True is passed so a single violation is tolerated
# (presumably of the filter's rules; confirm against the skfp documentation).
filt = PfizerFilter(allow_one_violation=True)

In [255]:
# Map each training SMILES string to its label. If the same SMILES string
# occurs more than once, the label of its last occurrence wins.
smiles_to_label = {smi: label for smi, label in zip(smiles_train, y_train)}

In [225]:
# Filter the training molecules and their labels together so that both stay
# positionally aligned. Recovering labels through a SMILES -> label dictionary
# would silently collapse duplicate SMILES strings onto a single label and
# depends on the filter returning the exact input strings.
# `np.asarray` makes sure the labels support boolean-mask indexing.
filtered_smiles, filtered_labels = filt.transform_x_y(
    smiles_train, np.asarray(y_train)
)

In [226]:
# Re-fit the same `pipeline` object on the filtered training subset. This
# replaces the model previously fitted on the full training split.
pipeline.fit(filtered_smiles, filtered_labels)

In [227]:
# Score the (unfiltered) test split with the re-fitted pipeline.
y_proba = pipeline.predict_proba(smiles_test)  # class probabilities (for ROC-AUC)
y_pred = pipeline.predict(smiles_test)  # hard class labels (for accuracy)

In [228]:
# Test-split metrics for the current `y_pred` / `y_proba`: accuracy from the
# hard labels, ROC-AUC from the predicted probability of class 1.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba[:, 1])

print(f"Accuracy for RF classifier: {accuracy:.4f}")
print(f"ROC-AUC score for RF classifier: {roc_auc:.4f}")

Accuracy for RF classifier: 0.4224
ROC-AUC score for RF classifier: 0.4473
