# Load and Scale Data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from deepchem.trans import MinMaxTransformer
from deepchem.data import NumpyDataset


# Fixed seed so the train/test partition (and every number derived from it)
# is identical on each Restart & Run All.
RANDOM_SEED = 42

# Keep only the two columns used downstream; the target column is exposed as "logS".
aqsoldb = pd.read_csv("aqsoldb.csv")
aqsoldb = pd.DataFrame({
  "logS": aqsoldb['Solubility'],
  "SMILES": aqsoldb["SMILES"]
})

train, test = train_test_split(aqsoldb, test_size=0.2, random_state=RANDOM_SEED)

# Wrap each split as a DeepChem dataset: SMILES strings as X, logS as y.
train = NumpyDataset(train['SMILES'], y=train['logS'])
test = NumpyDataset(test['SMILES'], y=test['logS'])

# The transformer is built from the training split only and then applied to
# both splits, so nothing about the test targets leaks into the scaling.
transformer = MinMaxTransformer(transform_y=True, dataset=train)

train = transformer.transform(train)
test = transformer.transform(test)

# Display the scaled target ranges of both splits for a quick visual check.
train.y.min(), test.y.min(), train.y.max(), test.y.max()

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


FileNotFoundError: [Errno 2] No such file or directory: 'aqsoldb.csv'

# Featurise SMILES
Featurising the full dataset takes approximately 2.5 minutes.

In [23]:
from deepchem.feat import MolGraphConvFeaturizer
from deepchem.feat.graph_data import GraphData
from numpy import array

def remove_pos_kwarg(mol: "GraphData") -> "GraphData":
    """Drop the ``'pos'`` entry from ``mol.kwargs`` and return the same object.

    The graph is modified in place; the return value is only a convenience so
    the call can be used inside expressions.

    Unlike a bare ``del``, a missing ``'pos'`` key is tolerated, so calling
    this twice on the same graph (e.g. when a cell is re-run) is harmless.

    Args:
        mol: Object exposing a dict-like ``kwargs`` attribute.

    Returns:
        The same ``mol`` instance, without a ``'pos'`` key in ``kwargs``.
    """
    # pop() with a default never raises, which keeps the helper idempotent.
    mol.kwargs.pop('pos', None)
    return mol

def featurize_dataset(dataset, featurizer) -> tuple:
    """Featurize every entry of ``dataset.X`` and keep only the successes.

    Each entry is passed to ``featurizer.featurize`` on its own and the first
    element of the result is taken. Results that are not ``GraphData``
    instances (the featurizer log reports "Appending empty array" for inputs
    it cannot handle) are discarded together with their targets, so the two
    returned arrays stay aligned.

    Args:
        dataset: Object exposing ``X`` (iterable of featurizer inputs) and
            ``y`` (indexable with a list of integer positions).
        featurizer: Object exposing ``featurize(x)`` that returns an
            indexable result.

    Returns:
        A 2-tuple ``(graphs, y)``: an array of the retained ``GraphData``
        objects with their ``'pos'`` kwarg removed, and the matching rows
        of ``dataset.y``.
    """
    featurized = [featurizer.featurize(x)[0] for x in dataset.X]
    # isinstance (rather than an exact type comparison) also retains
    # GraphData subclasses instead of silently dropping them.
    kept = [
        i for i, data in enumerate(featurized) if isinstance(data, GraphData)
    ]
    graphs = array([remove_pos_kwarg(featurized[i]) for i in kept])
    return graphs, dataset.y[kept]


# Graph featurizer that also emits edge (bond) features.
featurizer = MolGraphConvFeaturizer(use_edges=True)

# Featurize the training split first, then the test split, with the same featurizer.
(train_featurized, train_y), (test_featurized, test_y) = (
    featurize_dataset(split, featurizer) for split in (train, test)
)


Failed to featurize datapoint 0, [Ca+2].[OH-].[OH-]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, C.[V]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, [F-].[F-].[F-].[La+3]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, [Gd+3].[Gd+3].[O-2].[O-2].[O-2]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, [O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[Pr+3].[Pr+3].[Pr+3].[Pr+3].[Pr+3].[Pr+3]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, [Hf+4].[O-2].[O-2]. Appending empty array
Exception message: tuple index out of range
Failed to featurize datapoint 0, [Ir]. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 0,

In [25]:
# Sanity check: convert the first featurized training molecule with
# to_pyg_graph() and display the result to inspect its tensor shapes.
train_featurized[0].to_pyg_graph()

Data(x=[6, 30], edge_index=[2, 10], edge_attr=[10, 11])

In [26]:
from deepchem.data import DiskDataset

# Persist the featurized training split under ./aqsoldb_train.
DiskDataset.from_numpy(data_dir="aqsoldb_train", X=train_featurized, y=train_y)

# Persist the featurized test split under ./aqsoldb_test; as the cell's last
# expression, the resulting dataset is also what gets displayed.
DiskDataset.from_numpy(data_dir="aqsoldb_test", X=test_featurized, y=test_y)


<DiskDataset X.shape: (1963,), y.shape: (1963,), w.shape: (1963,), task_names: [0]>

In [27]:
# Rows of the original table that ended up in neither featurized split.
len(aqsoldb) - len(train_featurized) - len(test_featurized)

149

In [15]:
from deepchem.feat import MolGraphConvFeaturizer
import torch.nn as nn
from torch_geometric.nn import GCNConv, global_add_pool

# Worked example on a two-atom molecule, small enough to read every tensor by eye.
ethylene = "C=C"
featurizer = MolGraphConvFeaturizer(use_edges=True)
mol = featurizer.featurize(ethylene)[0]
# Remove the 'pos' entry from the graph's kwargs before converting.
# NOTE(review): presumably to_pyg_graph() cannot handle it - confirm.
del mol.kwargs['pos']
pyg = mol.to_pyg_graph()
# Show the graph summary, then node features, edge index and edge features in turn.
print(pyg, "2 nodes, 30 features per node")
print(pyg.x)
print(pyg.edge_index)
print(pyg.edge_attr)

# A single graph-convolution layer taking the 30 node features to 2 outputs per node.
linear = GCNConv(30, 2)
pred = linear(pyg.x, pyg.edge_index)
# List the layer's learnable parameters and their shapes.
for name, param in linear.named_parameters():
    print(name, param.shape)
# Add-pool the per-node outputs; with batch=None the displayed result has a single row.
global_add_pool(pred, batch=None)
# NOTE(review): the parameters printed above are `bias` and `lin.weight`, so the
# disabled line below would need `linear.lin.weight` if it were re-enabled.
# print(linear.weight)

Data(x=[2, 30], edge_index=[2, 2], edge_attr=[2, 11]) 2 nodes, 30 features per node
tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.]])
tensor([[0, 1],
        [1, 0]])
tensor([[0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])
bias torch.Size([2])
lin.weight torch.Size([2, 30])


tensor([[-0.2479,  1.2631]], grad_fn=<SumBackward1>)

In [26]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def count_overlap(fp1, fp2):
    """Count the positions at which both fingerprints have a bit equal to 1.

    Positions are taken from ``range(len(fp1))``; ``fp2`` is indexed at the
    same positions, so any trailing entries of a longer ``fp2`` are ignored.

    Args:
        fp1: Indexable sequence with a length (e.g. a bit vector or list).
        fp2: Indexable sequence covering at least the set positions of ``fp1``.

    Returns:
        Number of indices ``i`` with ``fp1[i] == 1`` and ``fp2[i] == 1``.
    """
    return sum(
        1
        for position in range(len(fp1))
        if fp1[position] == 1 and fp2[position] == 1
    )


# Width of the Morgan fingerprints; used for both molecules and for the final tally.
N_BITS = 1024

ethylene = "C=C"
propylene = "CC=C"
# NOTE: despite the `_mol` suffix, these two names hold fingerprint bit vectors.
ethylene_mol = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(ethylene), 2, nBits=N_BITS)
propylene_mol = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(propylene), 2, nBits=N_BITS)
# Library value, kept so the manual calculation below can be checked against it
# (a bare expression in the middle of a cell would be silently discarded).
tanimoto = DataStructs.TanimotoSimilarity(ethylene_mol, propylene_mol)

# Manual Tanimoto: |A and B| / |A or B|, with |A or B| = |A| + |B| - |A and B|.
intersection = count_overlap(ethylene_mol, propylene_mol)
union = ethylene_mol.GetNumOnBits() + propylene_mol.GetNumOnBits() - intersection

# Calculate fraction components
fraction_intersection = intersection / union
fraction_difference = 1 - fraction_intersection

assert abs(tanimoto - fraction_intersection) < 1e-9, "manual Tanimoto disagrees with RDKit"

print(f"Intersection: {intersection}")
print(f"Union: {union}")
print(f"Fraction of Intersection: {fraction_intersection}")
print(f"Fraction of Difference: {fraction_difference}")
# Bit positions that are set in neither fingerprint.
N_BITS - union

Intersection: 1
Union: 7
Fraction of Intersection: 0.14285714285714285
Fraction of Difference: 0.8571428571428572


1017