# Load and Scale Data

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from deepchem.trans import MinMaxTransformer
from deepchem.data import NumpyDataset


# Fixed seed so the train/test split -- and therefore the scaler fitted on the
# training split and everything saved later -- is the same on every
# Restart & Run All.
SPLIT_SEED = 42

# Keep only the two columns used downstream; "Solubility" is exposed as "logS".
aqsoldb_raw = pd.read_csv("aqsoldb.csv")
aqsoldb = pd.DataFrame({
  "logS": aqsoldb_raw["Solubility"],
  "SMILES": aqsoldb_raw["SMILES"]
})

train_df, test_df = train_test_split(
  aqsoldb, test_size=0.2, random_state=SPLIT_SEED
)

# Pass plain NumPy arrays rather than pandas Series, so later positional
# indexing of X / y cannot be affected by the shuffled DataFrame index.
train = NumpyDataset(
  train_df["SMILES"].to_numpy(), y=train_df["logS"].to_numpy()
)
test = NumpyDataset(
  test_df["SMILES"].to_numpy(), y=test_df["logS"].to_numpy()
)

# The scaler is built from the training split only and then applied to both
# splits, so test targets may fall slightly outside the range seen in training.
transformer = MinMaxTransformer(transform_y=True, dataset=train)

train = transformer.transform(train)
test = transformer.transform(test)

# Sanity check of the scaled target range on both splits.
train.y.min(), test.y.min(), train.y.max(), test.y.max()

(0.0, 0.07259506024258956, 1.0, 0.962136024714161)

# Featurise SMILES
Featurising the full dataset takes approximately 2.5 minutes.

In [23]:
from deepchem.feat import MolGraphConvFeaturizer
from deepchem.feat.graph_data import GraphData
from numpy import array

def remove_pos_kwarg(mol: GraphData) -> GraphData:
    """Drop the ``'pos'`` entry from ``mol.kwargs`` and return ``mol``.

    The object is modified in place; the same instance is returned so the
    function can be used inside a comprehension.

    Parameters
    ----------
    mol
        Object exposing a mutable ``kwargs`` mapping.

    Returns
    -------
    The same ``mol`` object, with ``kwargs['pos']`` removed if it was present.
    """
    # pop() with a default instead of `del`: a graph without a 'pos' entry
    # is left unchanged rather than raising KeyError.
    mol.kwargs.pop('pos', None)
    return mol

def featurize_dataset(dataset, featurizer) -> tuple:
    """Featurize every entry of ``dataset.X`` and keep only the successes.

    Parameters
    ----------
    dataset
        Object with ``X`` (iterable of inputs, here SMILES strings) and ``y``
        (targets that can be indexed with a list of integer positions).
    featurizer
        Object whose ``featurize(x)`` returns a sequence; its first element is
        taken as the result for ``x``.

    Returns
    -------
    tuple
        ``(graphs, y)``: a 1-D object array of the ``GraphData`` results with
        their ``'pos'`` kwarg removed, and the matching rows of ``dataset.y``.
        Inputs whose result is not a ``GraphData`` are dropped from both.
    """
    # One call per input, so each result can be checked individually.
    featurized = [featurizer.featurize(x)[0] for x in dataset.X]

    # Positions of successful featurizations; anything that is not a
    # GraphData (e.g. the placeholder left by a failure) is skipped.
    indices = [
        i for i, data in enumerate(featurized) if isinstance(data, GraphData)
    ]

    # Explicit object dtype: the result is an object array even when nothing
    # was featurized successfully (a bare array([]) would be float64).
    graphs = array(
        [remove_pos_kwarg(featurized[i]) for i in indices], dtype=object
    )
    return graphs, dataset.y[indices]


# Graph featurizer with edge features enabled, shared by both splits.
featurizer = MolGraphConvFeaturizer(use_edges=True)

# Featurize the training split first, then the test split; each call yields
# the kept graphs together with their matching targets.
(train_featurized, train_y), (test_featurized, test_y) = (
    featurize_dataset(split, featurizer) for split in (train, test)
)


Failed to featurize datapoint 0, [Ca+2].[OH-].[OH-]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, C.[V]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, [F-].[F-].[F-].[La+3]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, [Gd+3].[Gd+3].[O-2].[O-2].[O-2]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, [O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[Pr+3].[Pr+3].[Pr+3].[Pr+3].[Pr+3].[Pr+3]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, [Hf+4].[O-2].[O-2]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, [Ir]. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 0,

In [25]:
# Spot-check: convert the first featurized training molecule with
# to_pyg_graph() and display the result.
train_featurized[0].to_pyg_graph()

Data(x=[6, 30], edge_index=[2, 10], edge_attr=[10, 11])

In [26]:
from deepchem.data import DiskDataset

# Persist the featurized training split under ./aqsoldb_train.
train_disk = DiskDataset.from_numpy(
  X=train_featurized, y=train_y, data_dir="aqsoldb_train"
)

# Persist the featurized test split under ./aqsoldb_test.
test_disk = DiskDataset.from_numpy(
  X=test_featurized, y=test_y, data_dir="aqsoldb_test"
)

# Show the summary of the test dataset as the cell output.
test_disk


<DiskDataset X.shape: (1963,), y.shape: (1963,), w.shape: (1963,), task_names: [0]>

In [27]:
# Number of rows of the original table that are missing from the featurized
# train + test sets, i.e. entries dropped because featurization did not
# produce a GraphData for them.
len(aqsoldb) - (len(test_featurized) + len(train_featurized))

149