In [2]:
from deepmol.splitters import SingletaskStratifiedSplitter
from deepmol.loaders import CSVLoader
from deepmol.splitters import SimilaritySplitter
from deepmol.splitters import ScaffoldSplitter

from deepmol.compound_featurization import MorganFingerprint

from deepmol.models import SklearnModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from deepmol.metrics import Metric

In [3]:
# Build the working dataset from the CSV file: one column holds the compound
# identifiers, one the SMILES strings, and one the label to predict.
csv_loader = CSVLoader(
    "../data/CHEMBL217_reduced.csv",
    id_field="Original_Entry_ID",
    smiles_field="SMILES",
    labels_fields=["Activity_Flag"],
)
dataset = csv_loader.create_dataset()

2023-03-17 18:55:40,496 — ERROR — Molecule with smiles: ClC1=C(N2CCN(O)(CC2)=C/C=C/CNC(=O)C=3C=CC(=CC3)C4=NC=CC=C4)C=CC=C1Cl removed from dataset.
2023-03-17 18:55:40,498 — INFO — Assuming classification since there are less than 10 unique y values. If otherwise, explicitly set the mode to 'regression'!


[18:55:40] Explicit valence for atom # 6 N, 5, is greater than permitted


# Splitters

## Singletask Stratified Splitter

In [5]:
# Partition the full dataset into train / validation / test subsets (80/10/10)
# using the single-task stratified splitter.
split_fractions = {"frac_train": 0.8, "frac_valid": 0.1, "frac_test": 0.1}

splitter = SingletaskStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset, **split_fractions
)

In [5]:
# Log and return the shapes of the training split as (molecules, features, labels).
# No featurizer has been applied to this split yet at this point in the notebook.
train_dataset.get_shape()

2023-03-17 18:48:42,289 — INFO — Mols_shape: (13316,)
2023-03-17 18:48:42,290 — INFO — Features_shape: None
2023-03-17 18:48:42,290 — INFO — Labels_shape: (13316,)


((13316,), None, (13316,))

In [6]:
# Log and return the shapes of the validation split as (molecules, features, labels).
valid_dataset.get_shape()

2023-03-17 18:48:48,009 — INFO — Mols_shape: (1664,)
2023-03-17 18:48:48,010 — INFO — Features_shape: None
2023-03-17 18:48:48,011 — INFO — Labels_shape: (1664,)


((1664,), None, (1664,))

In [7]:
# Log and return the shapes of the test split as (molecules, features, labels).
test_dataset.get_shape()

2023-03-17 18:48:52,986 — INFO — Mols_shape: (1665,)
2023-03-17 18:48:52,987 — INFO — Features_shape: None
2023-03-17 18:48:52,988 — INFO — Labels_shape: (1665,)


((1665,), None, (1665,))

In [6]:
# Compute Morgan fingerprints for every split. The return value is discarded,
# so this relies on featurize() populating the dataset object it receives.
featurizer = MorganFingerprint()
for subset in (train_dataset, valid_dataset, test_dataset):
    featurizer.featurize(subset)

# Random forests are stochastic: pin the seed so that a Restart & Run All
# reproduces the same model (and therefore the same reported accuracy).
rf = RandomForestClassifier(random_state=42)
model = SklearnModel(model=rf)
model.fit(train_dataset)

In [7]:
# Score the fitted model on the held-out test split with plain accuracy.
accuracy_metric = Metric(accuracy_score, name="accuracy")
model.evaluate(test_dataset, [accuracy_metric])

({'accuracy': 0.984984984984985}, {})

## Similarity Splitter

In [11]:
# Re-split the same dataset (80/10/10), this time with the similarity-based
# splitter and a low homogeneity threshold.
# NOTE(review): the exact meaning of `homogenous_threshold` is defined by
# deepmol's SimilaritySplitter — confirm against its documentation.
similarity_splitter = SimilaritySplitter()

train_dataset, valid_dataset, test_dataset = similarity_splitter.train_valid_test_split(
    dataset,
    frac_train=0.8,
    frac_valid=0.1,
    frac_test=0.1,
    homogenous_threshold=0.15,
)

In [12]:
# Log the shapes of all three splits. Only the final call is left as the
# cell's last expression, so only the test split's tuple is echoed as output.
for subset in (train_dataset, valid_dataset):
    subset.get_shape()
test_dataset.get_shape()

2023-03-17 18:59:56,556 — INFO — Mols_shape: (13315,)
2023-03-17 18:59:56,557 — INFO — Features_shape: None
2023-03-17 18:59:56,557 — INFO — Labels_shape: (13315,)
2023-03-17 18:59:56,558 — INFO — Mols_shape: (1664,)
2023-03-17 18:59:56,558 — INFO — Features_shape: None
2023-03-17 18:59:56,559 — INFO — Labels_shape: (1664,)
2023-03-17 18:59:56,559 — INFO — Mols_shape: (1666,)
2023-03-17 18:59:56,559 — INFO — Features_shape: None
2023-03-17 18:59:56,560 — INFO — Labels_shape: (1666,)


((1666,), None, (1666,))

In [13]:
# Compute Morgan fingerprints for every split. The return value is discarded,
# so this relies on featurize() populating the dataset object it receives.
featurizer = MorganFingerprint()
for subset in (train_dataset, valid_dataset, test_dataset):
    featurizer.featurize(subset)

# Random forests are stochastic: pin the seed so that differences in accuracy
# between splitting strategies are not confounded by model randomness.
rf = RandomForestClassifier(random_state=42)
model = SklearnModel(model=rf)
model.fit(train_dataset)

# Accuracy on the test split produced by the similarity splitter (threshold 0.15).
model.evaluate(test_dataset, [Metric(accuracy_score, name="accuracy")])

({'accuracy': 0.978391356542617}, {})

In [14]:
# Same similarity-based 80/10/10 split as before, but with a much higher
# homogeneity threshold so the two settings can be compared.
train_dataset, valid_dataset, test_dataset = similarity_splitter.train_valid_test_split(
    dataset,
    frac_train=0.8,
    frac_valid=0.1,
    frac_test=0.1,
    homogenous_threshold=0.90,
)

In [15]:
# Compute Morgan fingerprints for every split. The return value is discarded,
# so this relies on featurize() populating the dataset object it receives.
featurizer = MorganFingerprint()
for subset in (train_dataset, valid_dataset, test_dataset):
    featurizer.featurize(subset)

# Random forests are stochastic: pin the seed so that differences in accuracy
# between splitting strategies are not confounded by model randomness.
rf = RandomForestClassifier(random_state=42)
model = SklearnModel(model=rf)
model.fit(train_dataset)

# Accuracy on the test split produced by the similarity splitter (threshold 0.90).
model.evaluate(test_dataset, [Metric(accuracy_score, name="accuracy")])

({'accuracy': 0.7725090036014406}, {})

## Scaffold Splitter

In [18]:
# Re-split the same dataset (80/10/10) with the scaffold-based splitter,
# requesting homogenous subsets.
# NOTE(review): the exact effect of `homogenous_datasets` is defined by
# deepmol's ScaffoldSplitter — confirm against its documentation.
scaffold_splitter = ScaffoldSplitter()

train_dataset, valid_dataset, test_dataset = scaffold_splitter.train_valid_test_split(
    dataset,
    frac_train=0.8,
    frac_valid=0.1,
    frac_test=0.1,
    homogenous_datasets=True,
)

In [19]:
# Compute Morgan fingerprints for every split. The return value is discarded,
# so this relies on featurize() populating the dataset object it receives.
featurizer = MorganFingerprint()
for subset in (train_dataset, valid_dataset, test_dataset):
    featurizer.featurize(subset)

# Random forests are stochastic: pin the seed so that differences in accuracy
# between splitting strategies are not confounded by model randomness.
rf = RandomForestClassifier(random_state=42)
model = SklearnModel(model=rf)
model.fit(train_dataset)

# Accuracy on the test split produced by the scaffold splitter (homogenous_datasets=True).
model.evaluate(test_dataset, [Metric(accuracy_score, name="accuracy")])

({'accuracy': 0.978391356542617}, {})

In [20]:
# Same scaffold-based 80/10/10 split, now with homogenous subsets switched off
# so the two settings can be compared.
train_dataset, valid_dataset, test_dataset = scaffold_splitter.train_valid_test_split(
    dataset,
    frac_train=0.8,
    frac_valid=0.1,
    frac_test=0.1,
    homogenous_datasets=False,
)

In [21]:
# Compute Morgan fingerprints for every split. The return value is discarded,
# so this relies on featurize() populating the dataset object it receives.
featurizer = MorganFingerprint()
for subset in (train_dataset, valid_dataset, test_dataset):
    featurizer.featurize(subset)

# Random forests are stochastic: pin the seed so that differences in accuracy
# between splitting strategies are not confounded by model randomness.
rf = RandomForestClassifier(random_state=42)
model = SklearnModel(model=rf)
model.fit(train_dataset)

# Accuracy on the test split produced by the scaffold splitter (homogenous_datasets=False).
model.evaluate(test_dataset, [Metric(accuracy_score, name="accuracy")])

({'accuracy': 0.963985594237695}, {})