<a href="https://colab.research.google.com/github/AmirJlr/AmirJlr/blob/main/data_handler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

"""
### **This file contains custom functions and classes for generating PyG graph data based on a CSV file (with SMILES)**
[Documentation](https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html)

"""

#### Each dataset gets passed a `root folder` which **indicates where the dataset should be stored**. We split up the root folder into two folders: `the raw_dir`, where **the dataset gets downloaded to**, and the `processed_dir`, where the **processed dataset is being saved**.

### Creating “In Memory Datasets”
In order to create a torch_geometric.data.InMemoryDataset, you need to implement four fundamental methods:

- `InMemoryDataset.raw_file_names()`: A list of files in the raw_dir which needs to be found in order to skip the download.

- `InMemoryDataset.processed_file_names()`: A list of files in the processed_dir which needs to be found in order to skip the processing.

- `InMemoryDataset.download()`: Downloads raw data into raw_dir.

- `InMemoryDataset.process()`: Processes raw data and saves it into the processed_dir.

In [None]:
import torch
torch.__version__

'2.3.0+cu121'

In [1]:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.3.0+cu121.html
!pip install torch_geometric
!pip install deepchem
!pip install rdkit
!pip install molfeat

Looking in links: https://data.pyg.org/whl/torch-2.3.0+cu121.html
Collecting pyg_lib
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/pyg_lib-0.4.0%2Bpt23cu121-cp310-cp310-linux_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_scatter
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_scatter-2.1.2%2Bpt23cu121-cp310-cp310-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_sparse
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_sparse-0.6.18%2Bpt23cu121-cp310-cp310-linux_x86_64.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_cluster
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_cluster-1.6.3%2Bp

In [4]:
import numpy as np
import pandas as pd

import torch
import torch_geometric
from torch_geometric.data import Dataset, InMemoryDataset, Data

import os
from tqdm import tqdm

import deepchem as dc
from rdkit import Chem

from molfeat.trans.graph.adj import PYGGraphTransformer
from molfeat.calc.atom import AtomCalculator
from molfeat.calc.bond import EdgeMatCalculator

In [12]:
# Read the Lipophilicity CSV directly from the DeepChem S3 bucket and preview
# the first three rows (the target is the `exp` column, the input is `smiles`).
df = pd.read_csv("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv")
df.head(3)

Unnamed: 0,CMPD_CHEMBLID,exp,smiles
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl


## **generate_graph_list** function

In [None]:
from torch_geometric.utils import from_smiles

def generate_graph_list(df, smiles_column, target_column):
    """Convert every row of ``df`` into a PyG graph built from its SMILES string.

    Args:
        df: DataFrame holding one molecule per row.
        smiles_column: Name of the column containing the SMILES strings.
        target_column: Name of the column containing the numeric target.

    Returns:
        list: One graph per row, in row order. Each graph has float node
        features ``x`` and a target tensor ``y`` of shape ``[1, 1]``.
    """
    graph_list = []

    # Pair SMILES and targets positionally. The previous lookup
    # ``df[target_column][i]`` is label-based, so with a non-default index
    # (e.g. after filtering or shuffling the frame) it either raised KeyError
    # or silently attached the target of a different row.
    rows = zip(df[smiles_column], df[target_column])

    # ``total`` lets tqdm render a real progress bar instead of a bare counter.
    for smile, target in tqdm(rows, total=len(df)):
        g = from_smiles(smile)
        g.x = g.x.float()
        g.y = torch.tensor(target, dtype=torch.float).view(1, -1)
        graph_list.append(g)

    return graph_list

# Usage :
# graph_list = generate_graph_list(df, 'smiles', 'exp')

4200it [00:06, 681.00it/s]


## **BasicDatasetHandler** class

In [None]:
class BasicDatasetHandler(InMemoryDataset):
    """In-memory PyG dataset built from a CSV file of SMILES strings and targets.

    The CSV is expected at ``<root>/raw/<filename>``; the collated graphs are
    written to ``<root>/processed/data.pt`` and loaded back on construction.

    Args:
        root: Root folder of the dataset (contains ``raw`` and ``processed``).
        filename: Name of the CSV file inside the raw directory.
        smiles_column: Column holding the SMILES strings.
        target_column: Column holding the numeric target.
        transform: Optional transform forwarded to ``InMemoryDataset``.
        pre_transform: Optional transform applied once in ``process()`` before saving.
        pre_filter: Optional predicate applied once in ``process()`` before saving.
    """

    def __init__(self, root, filename, smiles_column, target_column, transform=None, pre_transform=None, pre_filter=None):
        # These attributes are read by raw_file_names / process(), so they are
        # assigned before handing control to the base-class constructor.
        self.filename = filename
        self.smiles_column = smiles_column
        self.target_column = target_column
        super(BasicDatasetHandler, self).__init__(root, transform, pre_transform, pre_filter)
        # Populate the in-memory storage from the processed file.
        self.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Files expected in ``raw_dir``."""
        return [self.filename]

    @property
    def processed_file_names(self):
        """Files expected in ``processed_dir``."""
        return ['data.pt']

    def download(self):
        """No-op: the CSV must already be present in ``raw_dir``."""
        pass  # Implement download logic if needed

    def process(self):
        """Read the raw CSV, turn each row into a graph and save the collated result."""
        data_path = os.path.join(self.raw_dir, self.filename)
        df = pd.read_csv(data_path)

        data_list = []
        for smile, target in zip(df[self.smiles_column], df[self.target_column]):
            # Convert SMILES to graph; node features are cast to float.
            g = from_smiles(smile)
            g.x = g.x.float()

            # Target stored as a [1, 1] float tensor.
            g.y = torch.tensor(target, dtype=torch.float).view(1, -1)

            data_list.append(g)

        # Apply pre-filter first, then pre-transform on the survivors.
        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        self.save(data_list, self.processed_paths[0])

    def len(self):
        """Return the number of graphs stored in the dataset.

        The previous implementation returned ``len(self.data)``, but
        ``self.data`` is the single collated ``Data`` object, so that value was
        not the number of graphs. The base class derives the count from the
        stored slices, which is what indexing and data loaders rely on.
        """
        return super().len()


In [None]:
# dataset1 = BasicDatasetHandler(root = 'dataset1', filename='Lipophilicity.csv', smiles_column='smiles', target_column='exp')
# dataset1[0]

Processing...
Done!


## **MolGraphConvFeaturizerDataset** class

In [None]:
class MolGraphConvFeaturizerDataset(Dataset):
    """On-disk PyG dataset featurized with DeepChem's ``MolGraphConvFeaturizer``.

    Each parsable molecule of the raw CSV is stored as its own file
    ``data_<row>.pt`` (or ``data_test_<row>.pt`` when ``test=True``) inside
    ``processed_dir``, where ``<row>`` is the row position in the CSV.
    Rows whose SMILES cannot be parsed are left out of the dataset entirely.
    """

    def __init__(self, root, filename, smiles_column, label_column, test=False, transform=None, pre_transform=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data).
        """
        self.filename = filename
        self.smiles_column = smiles_column
        self.label_column = label_column
        self.test = test
        # Lazily filled list of CSV row positions with a parsable SMILES.
        # Assigned before super().__init__ because the base constructor is
        # expected to consult processed_file_names while deciding whether to
        # run process().
        self._valid_indices = None
        super(MolGraphConvFeaturizerDataset, self).__init__(root, transform, pre_transform)

    @staticmethod
    def _parse_smiles(smiles):
        """Return an RDKit molecule, or None for missing / unparsable SMILES."""
        if not isinstance(smiles, str):
            return None  # e.g. NaN from an empty CSV cell
        return Chem.MolFromSmiles(smiles)

    def _file_name(self, index):
        """Processed file name for the CSV row at position ``index``."""
        prefix = 'data_test' if self.test else 'data'
        return f'{prefix}_{index}.pt'

    def _valid_row_indices(self):
        """Row positions of the raw CSV whose SMILES parse (computed once, then cached)."""
        if self._valid_indices is None:
            smiles = pd.read_csv(self.raw_paths[0])[self.smiles_column]
            self._valid_indices = [
                position for position, smi in enumerate(smiles)
                if self._parse_smiles(smi) is not None
            ]
        return self._valid_indices

    @property
    def raw_file_names(self):
        """If this file exists in raw_dir, the download is not triggered."""
        return [self.filename]

    @property
    def processed_file_names(self):
        """If these files are found in processed_dir, processing is skipped.

        Only rows with a parsable SMILES are listed. Listing every row made the
        dataset look permanently unprocessed whenever process() skipped an
        invalid molecule, because that row's file was never written.
        """
        return [self._file_name(i) for i in self._valid_row_indices()]

    def download(self):
        """No-op: the CSV must already be present in ``raw_dir``."""
        pass  # Implement download logic if needed

    def process(self):
        """Featurize every parsable molecule and save one file per molecule."""
        df = pd.read_csv(self.raw_paths[0])

        featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            mol = self._parse_smiles(row[self.smiles_column])
            if mol is None:
                continue  # Skip invalid SMILES strings

            # Featurize molecule and convert to a PyG graph
            graph = featurizer._featurize(mol).to_pyg_graph()
            graph.y = self._get_label(row[self.label_column])
            graph.smiles = row[self.smiles_column]

            # pre_transform was accepted by __init__ but never applied before.
            if self.pre_transform is not None:
                graph = self.pre_transform(graph)

            torch.save(graph, os.path.join(self.processed_dir, self._file_name(index)))

    def _get_label(self, label):
        """Wrap a scalar label into a float tensor of shape ``[1]``."""
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.float)

    def len(self):
        """Number of molecules actually stored (rows with a parsable SMILES)."""
        return len(self._valid_row_indices())

    def get(self, idx):
        """Equivalent to __getitem__ in PyTorch.

        ``idx`` counts stored molecules; it is mapped back to the CSV row
        position that names the file on disk.
        """
        row_index = self._valid_row_indices()[idx]
        data = torch.load(os.path.join(self.processed_dir, self._file_name(row_index)))
        return data


# Initialize dataset :
# dataset = MolGraphConvFeaturizerDataset(
#     root='dataset3/',
#     filename='Lipophilicity.csv',
#     smiles_column='smiles',
#     label_column='exp'
# )

Processing...
100%|██████████| 4200/4200 [01:05<00:00, 63.65it/s]

Data(x=[24, 30], edge_index=[2, 54], edge_attr=[54, 11], y=[1], smiles='Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14')



Done!


## **DefaultDSHandlerExtended** class

In [None]:
class DefaultDSHandlerExtended(Dataset):
    """On-disk PyG dataset with hand-crafted RDKit atom and bond features.

    Each parsable molecule of the raw CSV is stored as its own file
    ``data_<row>.pt`` (or ``data_test_<row>.pt`` when ``test=True``) inside
    ``processed_dir``, where ``<row>`` is the row position in the CSV.
    Rows whose SMILES cannot be parsed are left out of the dataset entirely.
    """

    # Widths of the feature vectors built in _get_node_features / _get_edge_features.
    NUM_NODE_FEATURES = 14
    NUM_EDGE_FEATURES = 6

    def __init__(self, root, filename, smiles_column, label_column, test=False, transform=None, pre_transform=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (raw CSV) and processed_dir (processed graphs).
        """
        self.filename = filename
        self.smiles_column = smiles_column
        self.label_column = label_column
        self.test = test
        # Lazily filled list of CSV row positions with a parsable SMILES.
        # Assigned before super().__init__ because the base constructor is
        # expected to consult processed_file_names while deciding whether to
        # run process().
        self._valid_indices = None
        super(DefaultDSHandlerExtended, self).__init__(root, transform, pre_transform)

    @staticmethod
    def _parse_smiles(smiles):
        """Return an RDKit molecule, or None for missing / unparsable SMILES."""
        if not isinstance(smiles, str):
            return None  # e.g. NaN from an empty CSV cell
        return Chem.MolFromSmiles(smiles)

    def _file_name(self, index):
        """Processed file name for the CSV row at position ``index``."""
        prefix = 'data_test' if self.test else 'data'
        return f'{prefix}_{index}.pt'

    def _valid_row_indices(self):
        """Row positions of the raw CSV whose SMILES parse (computed once, then cached)."""
        if self._valid_indices is None:
            smiles = pd.read_csv(self.raw_paths[0])[self.smiles_column]
            self._valid_indices = [
                position for position, smi in enumerate(smiles)
                if self._parse_smiles(smi) is not None
            ]
        return self._valid_indices

    @property
    def raw_file_names(self):
        """If this file exists in raw_dir, the download is not triggered."""
        return [self.filename]

    @property
    def processed_file_names(self):
        """If these files are found in processed_dir, processing is skipped.

        Only rows with a parsable SMILES are listed. Listing every row made the
        dataset look permanently unprocessed whenever process() skipped an
        invalid molecule, because that row's file was never written.
        """
        return [self._file_name(i) for i in self._valid_row_indices()]

    def download(self):
        """No-op: the CSV must already be present in ``raw_dir``."""
        pass

    def process(self):
        """Build a ``Data`` object for every parsable molecule and save one file each."""
        # Named ``frame`` so the per-molecule graph below does not shadow it.
        frame = pd.read_csv(self.raw_paths[0])
        for index, row in tqdm(frame.iterrows(), total=frame.shape[0]):
            mol = self._parse_smiles(row[self.smiles_column])
            if mol is None:
                continue  # Skip invalid SMILES strings

            graph = Data(
                x=self._get_node_features(mol),
                edge_index=self._get_adjacency_info(mol),
                edge_attr=self._get_edge_features(mol),
                y=self._get_labels(row[self.label_column]),
                smiles=row[self.smiles_column]
            )

            # pre_transform was accepted by __init__ but never applied before.
            if self.pre_transform is not None:
                graph = self.pre_transform(graph)

            torch.save(graph, os.path.join(self.processed_dir, self._file_name(index)))

    def _get_node_features(self, mol):
        """Returns a matrix of shape [Number of Nodes, Node Feature size]."""
        all_node_feats = []
        for atom in mol.GetAtoms():
            node_feats = [
                atom.GetAtomicNum(),  # Atomic number
                atom.GetDegree(),  # Degree
                atom.GetFormalCharge(),  # Formal charge
                int(atom.GetHybridization()),  # Hybridization
                atom.GetIsAromatic(),  # Aromaticity
                atom.GetTotalNumHs(),  # Total number of Hs
                atom.GetNumRadicalElectrons(),  # Radical Electrons
                atom.IsInRing(),  # In Ring
                int(atom.GetChiralTag()),  # Chirality
                atom.GetMass(),  # Atomic mass
                atom.GetExplicitValence(),  # Explicit valence
                atom.GetImplicitValence(),  # Implicit valence
                atom.GetTotalValence(),  # Total valence
                atom.GetIsotope()  # Isotope
            ]
            all_node_feats.append(node_feats)
        # reshape keeps the matrix 2-D even when the molecule has no atoms.
        return torch.tensor(all_node_feats, dtype=torch.float).reshape(-1, self.NUM_NODE_FEATURES)

    def _get_edge_features(self, mol):
        """Returns a matrix of shape [Number of edges, Edge Feature size]."""
        all_edge_feats = []
        for bond in mol.GetBonds():
            edge_feats = [
                bond.GetBondTypeAsDouble(),  # Bond type
                bond.IsInRing(),  # In Ring
                bond.GetIsAromatic(),  # Aromaticity
                int(bond.GetBondDir()),  # Bond direction
                int(bond.GetStereo()),  # Stereochemistry
                # NOTE(review): falls back to 0 when the bond object has no
                # GetBondLength method; confirm whether RDKit ever provides it,
                # otherwise this column is a constant placeholder.
                bond.GetBondLength() if hasattr(bond, 'GetBondLength') else 0  # Bond length
            ]
            # Append edge features to matrix (twice, per direction)
            all_edge_feats += [edge_feats, edge_feats]
        # Without the reshape a bond-less molecule (e.g. a single atom) yields a
        # 1-D tensor of shape [0], which cannot be concatenated with the
        # [E, 6] matrices of other molecules when batching.
        return torch.tensor(all_edge_feats, dtype=torch.float).reshape(-1, self.NUM_EDGE_FEATURES)

    def _get_adjacency_info(self, mol):
        """Returns the edge index of shape [2, Number of edges] (both directions per bond)."""
        edge_indices = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_indices += [[i, j], [j, i]]
        # reshape(-1, 2) keeps the result well-formed ([2, 0]) for bond-less molecules.
        return torch.tensor(edge_indices, dtype=torch.long).reshape(-1, 2).t().contiguous()

    def _get_labels(self, label):
        """Converts label to a float tensor of shape [1]."""
        return torch.tensor([label], dtype=torch.float)

    def len(self):
        """Number of molecules actually stored (rows with a parsable SMILES)."""
        return len(self._valid_row_indices())

    def get(self, idx):
        """Equivalent to __getitem__ in PyTorch.

        ``idx`` counts stored molecules; it is mapped back to the CSV row
        position that names the file on disk.
        """
        row_index = self._valid_row_indices()[idx]
        data = torch.load(os.path.join(self.processed_dir, self._file_name(row_index)))
        return data


## **MolFeatDTset** class

In [8]:
from torch_geometric.utils import degree

class MolFeatDTset(Dataset):
    """Dataset pairing molfeat-featurized molecular graphs with their targets.

    Args:
        smiles: Sequence of SMILES strings.
        y: Sequence of target values, one per SMILES.
        featurizer: molfeat graph transformer; it is called on ``smiles`` once
            at construction time and must expose ``auto_self_loop``,
            ``atom_dim``, ``bond_dim`` and ``get_collate_fn``.
    """

    def __init__(self, smiles, y, featurizer):
        super().__init__()
        self.smiles = smiles
        self.featurizer = featurizer
        # Note: this configures the featurizer object passed in by the caller.
        self.featurizer.auto_self_loop()
        # Targets become a float tensor with a trailing dimension of size 1.
        self.y = torch.tensor(y).unsqueeze(-1).float()
        self.transformed_mols = self.featurizer(smiles)
        self._degrees = None  # cache for the ``degree`` property

    @property
    def num_atom_features(self):
        """Atom feature size reported by the featurizer."""
        return self.featurizer.atom_dim

    @property
    def num_output(self):
        """Size of the last dimension of the target tensor."""
        return self.y.shape[-1]

    def __len__(self):
        return len(self.transformed_mols)

    @property
    def num_bond_features(self):
        """Bond feature size reported by the featurizer."""
        return self.featurizer.bond_dim

    @property
    def degree(self):
        """In-degree histogram over all nodes of all graphs (computed once, then cached).

        Entry ``k`` is the number of nodes with in-degree ``k``; the tensor has
        length ``max_degree + 1`` and is empty when there are no nodes at all.
        """
        if self._degrees is None:
            # Compute each graph's in-degree vector a single time; the earlier
            # version made two full passes (one for the max, one for the counts).
            per_graph = [
                degree(data.edge_index[1], num_nodes=data.num_nodes, dtype=torch.long)
                for data in self.transformed_mols
            ]
            # Graphs without nodes contribute nothing and would previously
            # crash on ``d.max()`` of an empty tensor.
            per_graph = [d for d in per_graph if d.numel() > 0]
            if per_graph:
                self._degrees = torch.bincount(torch.cat(per_graph))
            else:
                self._degrees = torch.zeros(0, dtype=torch.long)
        return self._degrees

    def collate_fn(self, **kwargs):
        """Return the PyG collate function provided by the molfeat featurizer."""
        return self.featurizer.get_collate_fn(**kwargs)

    def __getitem__(self, index):
        """Return the ``(graph, target)`` pair at ``index``."""
        return self.transformed_mols[index], self.y[index]

Since training a network with PyTorch requires defining a dataset and a dataloader, we define a custom dataset that **takes (1) the SMILES, (2) the LogD measurement, and (3) our molfeat transformer as input to generate the data points needed for model training**:

In [9]:
# Build a molfeat PyG graph transformer from the atom and bond calculators
# (both constructed with their default settings).
featurizer = PYGGraphTransformer(
    atom_featurizer=AtomCalculator(),
    bond_featurizer=EdgeMatCalculator()
)

# Featurize the whole frame: SMILES come from `df.smiles`, targets from `df.exp`.
dataset = MolFeatDTset(df.smiles.values, df.exp.values, featurizer)

In [11]:
dataset[0]

(Data(x=[24, 82], edge_index=[2, 54], edge_attr=[54, 17]), tensor([3.5400]))