In [135]:
import pandas as pd
import torch
from torch.utils.data import IterableDataset, DataLoader
from torch.utils.data import IterableDataset
from chemprop import data
from chemprop import data, featurizers
from chemprop.data.collate import collate_batch
import math
from sklearn.preprocessing import StandardScaler
import psutil
import os
import gc
import time

# **Introduction**

This notebook illustrates how to use `torch.utils.data.IterableDataset` to load and process a dataset sequentially, one small chunk at a time.

**Context:** I want to train a ChemProp model using a dataset of 1 million compounds. While this is not an excessively large dataset, my MacBook M1 with 8GB of RAM struggles to convert the entire CSV file into `MoleculeDatapoint` objects: loading the CSV itself works fine, but holding all of the datapoints in memory at once does not. As a result, I am looking for an alternative approach that takes small subsets of the CSV data sequentially, generates the `MoleculeDatapoint` objects for each subset, wraps them in a Dataset and DataLoader, and finally trains the model. One of the challenges I face is ensuring the data is reshuffled after each training epoch. To address this, I found that `torch.utils.data.IterableDataset` is a useful class for my needs.

I started by creating some useful functions to prepare the Chemprop dataset, as outlined in the tutorial.

In [136]:
def datapoint_preparator(df,smiles_column,target_column):
    '''Build one chemprop MoleculeDatapoint per row of a dataframe.

    Parameters:
    ----------
    df (pd.DataFrame): Frame holding at least the two columns named below.
    smiles_column (str): Name of the column with the SMILES strings.
    target_column (str): Name of the column with the target values.

    Returns:
    ----------
    list: One ``data.MoleculeDatapoint`` per row, in row order.
    '''
    smiles_values = df.loc[:, smiles_column].values
    # Selecting with a one-element list keeps the targets 2-D, so every row
    # handed to from_smi is a length-1 array rather than a bare scalar.
    target_rows = df.loc[:, [target_column]].values

    datapoints = []
    for smi, y in zip(smiles_values, target_rows):
        datapoints.append(data.MoleculeDatapoint.from_smi(smi, y))
    return datapoints


def dataset_preparator(df, smiles_column, target_column, featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()):
    '''Wrap the rows of a dataframe in a chemprop MoleculeDataset.

    Parameters:
    ----------
    df (pd.DataFrame): Frame holding the SMILES and target columns.
    smiles_column (str): Name of the column with the SMILES strings.
    target_column (str): Name of the column with the target values.
    featurizer: Featurizer handed to ``data.MoleculeDataset``. The default
        instance is created once, when this function is defined, and is then
        shared by every call that does not pass its own.

    Returns:
    ----------
    data.MoleculeDataset built from one datapoint per row of ``df``.
    '''
    return data.MoleculeDataset(
        datapoint_preparator(df=df, smiles_column=smiles_column, target_column=target_column),
        featurizer=featurizer,
    )
    

# **MAIN PART: StreamingMolDataset (class `IterableMolDatapoints`)**

In [137]:
class IterableMolDatapoints(IterableDataset):
    '''Stream chemprop datapoints from a dataframe in small chunks.

    Only ``size_at_time`` rows are converted into a chemprop dataset at any one
    moment, so the full set of datapoints never has to be held in memory.
    Iterating yields the items of each chunk's dataset one at a time
    (chemprop.data.datasets.Datum objects), ready to be batched by a DataLoader.

    NOTE: there is no per-worker sharding here. PyTorch gives every DataLoader
    worker its own copy of an IterableDataset, so with ``num_workers > 0`` each
    worker would iterate the whole frame and samples would be duplicated.
    Use the default ``num_workers=0`` with this class.
    '''

    def __init__(self, df, smiles_column, target_column, scaler = None, size_at_time=100, shuffle=True):
        '''Parameters:
        ----------
        df (pd.DataFrame): A pandas dataframe containing the data.
        smiles_column (str): The column name containing SMILES strings.
        target_column (str): The column name containing the target values.
        scaler (StandardScaler): An already-fitted scaler passed to each chunk's
            ``normalize_targets``; ``None`` leaves the targets unscaled.
        size_at_time (int): The number of rows to transform into
            chemprop.data.datasets.Datum at a time. Must be >= 1.
        shuffle (boolean): If True, the rows are reshuffled at the start of every
            iteration (i.e. every epoch); if False, they are served in frame order.

        Raises:
        ----------
        ValueError: If ``size_at_time`` is smaller than 1.'''

        super().__init__()
        # Fail early: 0 would only blow up later inside range(), and a negative
        # value would silently produce an empty epoch.
        if size_at_time < 1:
            raise ValueError(f'size_at_time must be a positive integer, got {size_at_time!r}')
        self.df = df
        self.smiles_column = smiles_column
        self.target_column = target_column
        self.size_at_time = size_at_time
        self.shuffle = shuffle
        self.scaler = scaler

    def __len__(self):
        '''Number of rows in the wrapped dataframe (= samples per epoch).'''
        return len(self.df)

    def __iter__(self):
        '''Yield the datapoints of one epoch, building them chunk by chunk.

        The frame is (optionally) shuffled once, then walked in slices of
        ``size_at_time`` rows; each slice is turned into a chemprop dataset,
        optionally normalized with ``self.scaler``, and its items are yielded.
        '''
        # Chunks are taken positionally with iloc, so the shuffled frame needs no
        # reset_index, and the unshuffled frame is only read from, so no defensive
        # copy is needed either. Both would duplicate the whole frame each epoch.
        if self.shuffle:
            ordered_df = self.df.sample(frac=1)
        else:
            ordered_df = self.df

        # Convert only size_at_time rows at once: a trade-off between memory and speed.
        for start in range(0, len(ordered_df), self.size_at_time):
            chunk = ordered_df.iloc[start:start + self.size_at_time]
            chunk_dataset = dataset_preparator(df=chunk, smiles_column=self.smiles_column, target_column=self.target_column)

            if self.scaler is not None:
                chunk_dataset.normalize_targets(self.scaler)

            yield from chunk_dataset

# **Test 1: Memory usage**

In [138]:
# Prepare data
data_path = 'on_the_fly_data.csv'
smiles_column = 'smiles'
target_column = 'docking_score'

# Read the CSV and draw the 100,000-row benchmark subset in one step.
# The fixed random_state makes the subset identical on every run, so the
# memory/time figures below are measured on the same rows each time.
df = pd.read_csv(data_path).sample(n=100000, random_state=42)
# Fit the target scaler on the sampled rows.
scaler = StandardScaler().fit(df[[target_column]])

# Helper used by the benchmark cells to sample the kernel's memory footprint
def memory_record():
    '''Return the resident set size (RSS) of the current process, in MB.'''
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 * 1024)  # bytes -> MB


In [139]:
# Measure the cost of *constructing* the streaming dataset. The constructor only
# stores references to the frame and settings; no datapoints are built until the
# dataset is iterated.
gc.collect()
# perf_counter() is the clock intended for measuring short intervals: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock and
# can jump if the system time is adjusted.
start_time = time.perf_counter()
memory_before = memory_record()
iterable_dataset = IterableMolDatapoints(
    df=df,
    smiles_column=smiles_column,
    target_column=target_column,
    size_at_time=100, scaler=None, shuffle=True
)
memory_after = memory_record()
end_time = time.perf_counter()
gc.collect()

print(f'Memory usage to load streaming dataset: {memory_after-memory_before} MB ')
print(f'Time to load streaming dataset: {end_time-start_time} s ')

Memory usage to load streaming dataset: 0.0 MB 
Time to load streaming dataset: 0.00018095970153808594 s 


In [140]:
# Measure the cost of building the full chemprop dataset eagerly, i.e. converting
# every row of df into a datapoint up front.
gc.collect()
# perf_counter() is the clock intended for measuring short intervals: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock.
start_time = time.perf_counter()
memory_before = memory_record()
dataset = dataset_preparator(
    df=df,
    smiles_column=smiles_column,
    target_column=target_column
)
memory_after = memory_record()
end_time = time.perf_counter()
gc.collect()

# NOTE(review): the RSS delta is an OS-level figure and is noisy; it can even come
# out negative (as in the recorded output below), presumably because the OS
# reclaimed or compressed pages while the dataset was being built. Treat the
# memory number as indicative only.
print(f'Memory usage to load chemprop dataset: {memory_after-memory_before} MB ')
# Label fixed: this cell times the chemprop dataset, not the streaming one.
print(f'Time to load chemprop dataset: {end_time-start_time} s ')

Memory usage to load chemprop dataset: -226.921875 MB 
Time to load streaming dataset: 10.86172890663147 s 


# **Test 2: Similarity to chemprop data loader**

In this test, I aim to demonstrate that the streaming dataset behaves like the standard Chemprop dataset when passed through the Chemprop data loader. Additionally, we can apply a scaler if necessary; however, the scaler must be fitted beforehand on the entire training set (the full Pandas DataFrame), not on the individual chunks.

For illustration purposes, I will only take 10 instances from the whole dataset for examination.

In [141]:
smiles_column = 'smiles'
target_column = 'docking_score'

df_train = pd.read_csv('on_the_fly_data.csv')
# Fixed seed: the same 10 rows are drawn on every run, so the batches printed by
# the loader cells that follow can always be checked against this listing.
df_train_10 = df_train.sample(10, random_state=42)
# The scaler is fitted on the full training frame, not on the 10-row preview.
scaler = StandardScaler().fit(df_train[[target_column]])

print(f'df_train with unscaled target values: \n{df_train_10[target_column]}')
print('-'*40)
print(f'df_train with scaled target values: \n{pd.Series(scaler.transform(df_train_10[[target_column]]).reshape(-1))}')

df_train with unscaled target values: 
13632    -6.81384
49724    -7.72072
96557    -5.96453
47916    -6.02439
194343   -5.28958
100979   -6.23839
85015    -5.51530
117896   -7.26316
56923    -7.54427
21348    -7.18239
Name: docking_score, dtype: float64
----------------------------------------
df_train with scaled target values: 
0   -0.200613
1   -1.063134
2    0.607155
3    0.550223
4    1.249091
5    0.346690
6    1.034412
7   -0.627955
8   -0.895315
9   -0.551136
dtype: float64


**Chemprop Dataloader: batch_size = 5 (2 batches) and Unscaled target values:**

In [142]:
# Reference behaviour: the regular (eager) chemprop dataset, batched in frame order.
train_dataset = dataset_preparator(
    df_train_10,
    smiles_column,
    target_column,
)
train_loader = data.build_dataloader(
    train_dataset,
    batch_size=5,
    shuffle=False,
)

print('Data batches with Chemprop data loader')
# Two passes over the loader stand in for two training epochs.
for _epoch in range(2):
    for batch_number, batch in enumerate(train_loader, start=1):
        print(f'Batch {batch_number}')
        print(batch.Y)
    print('-' * 40)

Data batches with Chemprop data loader
Batch 1
tensor([[-6.8138],
        [-7.7207],
        [-5.9645],
        [-6.0244],
        [-5.2896]])
Batch 2
tensor([[-6.2384],
        [-5.5153],
        [-7.2632],
        [-7.5443],
        [-7.1824]])
----------------------------------------
Batch 1
tensor([[-6.8138],
        [-7.7207],
        [-5.9645],
        [-6.0244],
        [-5.2896]])
Batch 2
tensor([[-6.2384],
        [-5.5153],
        [-7.2632],
        [-7.5443],
        [-7.1824]])
----------------------------------------


**StreamingMolDataset Dataloader: batch_size = 5 (2 batches) and Unscaled target values:**

In [143]:
# Same 10 rows as the Chemprop loader cell, but streamed in chunks of 5, with no
# scaling and no shuffling, so the batches are directly comparable.
iterable_dataset = IterableMolDatapoints(
    df=df_train_10, smiles_column=smiles_column, target_column=target_column,
    size_at_time=5, scaler=None, shuffle=False,
)
iterable_train_loader = data.build_dataloader(iterable_dataset, batch_size=5, shuffle=False)

print('Data batches with StreamingMolDataset:')
# Two passes over the loader stand in for two training epochs.
for _epoch in range(2):
    for batch_number, batch in enumerate(iterable_train_loader, start=1):
        print(f'Batch {batch_number}')
        print(batch.Y)
    print('-' * 40)

Data batches with StreamingMolDataset:
Batch 1
tensor([[-6.8138],
        [-7.7207],
        [-5.9645],
        [-6.0244],
        [-5.2896]])
Batch 2
tensor([[-6.2384],
        [-5.5153],
        [-7.2632],
        [-7.5443],
        [-7.1824]])
----------------------------------------
Batch 1
tensor([[-6.8138],
        [-7.7207],
        [-5.9645],
        [-6.0244],
        [-5.2896]])
Batch 2
tensor([[-6.2384],
        [-5.5153],
        [-7.2632],
        [-7.5443],
        [-7.1824]])
----------------------------------------


**Point:** Without shuffling, the Chemprop dataset and the streaming dataset produce identical batches. The advantage of the streaming dataset is that it does not need to generate all of the datapoints at once.

Additionally, we can apply scaling to it, especially during training. However, this requires fitting an external scaler.

**StreamingMolDataset Dataloader: batch_size = 5 (2 batches) and Scaled target values:**

In [145]:
# Streaming loader, batch_size = 5 (2 batches), targets scaled
# The scaler is fitted on the whole training frame, then applied chunk by chunk.
scaler = StandardScaler().fit(df_train[[target_column]])

iterable_dataset = IterableMolDatapoints(
    df=df_train_10, smiles_column=smiles_column, target_column=target_column,
    size_at_time=5, scaler=scaler, shuffle=False,
)
iterable_train_loader = data.build_dataloader(iterable_dataset, batch_size=5, shuffle=False)

print('Data batches with scaled target values:')
# Two passes over the loader stand in for two training epochs.
for _epoch in range(2):
    for batch_number, batch in enumerate(iterable_train_loader, start=1):
        print(f'Batch {batch_number}')
        print(batch.Y)
    print('-' * 40)

Data batches with scaled target values:
Batch 1
tensor([[-0.2006],
        [-1.0631],
        [ 0.6072],
        [ 0.5502],
        [ 1.2491]])
Batch 2
tensor([[ 0.3467],
        [ 1.0344],
        [-0.6280],
        [-0.8953],
        [-0.5511]])
----------------------------------------
Batch 1
tensor([[-0.2006],
        [-1.0631],
        [ 0.6072],
        [ 0.5502],
        [ 1.2491]])
Batch 2
tensor([[ 0.3467],
        [ 1.0344],
        [-0.6280],
        [-0.8953],
        [-0.5511]])
----------------------------------------




**StreamingMolDataset Dataloader: batch_size = 5 (2 batches), Shuffled, and Unscaled target values:**

In this part, shuffling is enabled in the streaming dataset, so the samples in each batch differ between epochs.

In [150]:
# Shuffling is done inside the dataset (shuffle=True), once per iteration. The
# DataLoader itself keeps shuffle=False and simply batches the stream it is given.
iterable_dataset = IterableMolDatapoints(
    df=df_train_10, smiles_column=smiles_column, target_column=target_column,
    size_at_time=5, scaler=None, shuffle=True,
)
iterable_train_loader = data.build_dataloader(iterable_dataset, batch_size=5, shuffle=False)

print('Data batches with unscaled target values:')
# Two passes over the loader stand in for two training epochs.
for _epoch in range(2):
    for batch_number, batch in enumerate(iterable_train_loader, start=1):
        print(f'Batch {batch_number}')
        print(batch.Y)
    print('-' * 40)

Data batches with unscaled target values:
Batch 1
tensor([[-5.5153],
        [-7.2632],
        [-7.5443],
        [-6.2384],
        [-5.9645]])
Batch 2
tensor([[-5.2896],
        [-6.0244],
        [-6.8138],
        [-7.1824],
        [-7.7207]])
----------------------------------------
Batch 1
tensor([[-6.0244],
        [-7.1824],
        [-6.8138],
        [-5.5153],
        [-6.2384]])
Batch 2
tensor([[-7.5443],
        [-7.2632],
        [-5.2896],
        [-5.9645],
        [-7.7207]])
----------------------------------------
