In [138]:
import pandas as pd
import torch
from chemprop import data
from torch.utils.data import IterableDataset
import torch
from chemprop import data
from chemprop import data, featurizers
import math
from torch.utils.data import IterableDataset, DataLoader
from chemprop.data.collate import collate_batch
from sklearn.preprocessing import StandardScaler
import psutil
import os
import gc
import time

# **Introduction**

This notebook shows how to use `torch.utils.data.IterableDataset` to load and process a dataset sequentially, chunk by chunk, instead of building the whole dataset in memory at once.

**Context:** I want to train a ChemProp model on a dataset of 1 million compounds. While this is not an excessively large dataset, my MacBook M1 with 8GB of RAM struggles to convert the entire CSV file into `MoleculeDatapoint` objects: holding the raw CSV in memory is fine, but holding one `MoleculeDatapoint` per row is not. I am therefore looking for an alternative approach that loads small subsets of the CSV file sequentially, generates the `MoleculeDatapoint` objects for each subset, builds a Dataset and DataLoader from them, and finally trains the model. One of the challenges is ensuring that the data is reshuffled after each training epoch. `torch.utils.data.IterableDataset` turned out to be a useful class for these needs.

I started by creating some useful functions to prepare the Chemprop dataset, as outlined in the tutorial.

In [2]:
def datapoint_preparator(df,smiles_column,target_column):
    """Convert the rows of ``df`` into a list of Chemprop datapoints.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least ``smiles_column`` and ``target_column``.
    smiles_column : str
        Name of the column containing the SMILES strings.
    target_column : str
        Name of the column containing the target value.

    Returns
    -------
    list
        One ``data.MoleculeDatapoint.from_smi(smi, y)`` result per row, in row
        order.
    """
    smiles_values = df[smiles_column].values
    # Double brackets keep the targets two-dimensional, so every datapoint
    # receives a length-1 array instead of a bare scalar.
    target_rows = df[[target_column]].values

    datapoints = []
    for smiles, target_row in zip(smiles_values, target_rows):
        datapoints.append(data.MoleculeDatapoint.from_smi(smiles, target_row))
    return datapoints


def dataset_preparator(df, smiles_column, target_column, featurizer=None):
    """Build a Chemprop ``MoleculeDataset`` from the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least ``smiles_column`` and ``target_column``.
    smiles_column : str
        Name of the column containing the SMILES strings.
    target_column : str
        Name of the column containing the target value.
    featurizer : optional
        Featurizer handed to ``data.MoleculeDataset``. When ``None`` (the
        default) a new ``featurizers.SimpleMoleculeMolGraphFeaturizer()`` is
        created for this call.

    Returns
    -------
    data.MoleculeDataset
        Dataset wrapping one datapoint per row of ``df``.
    """
    # Resolve the default here rather than in the signature: a default written
    # in the signature is evaluated once, when the function is defined, and the
    # same instance would then be shared by every dataset ever built.
    if featurizer is None:
        featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

    datapoints = datapoint_preparator(df=df, smiles_column=smiles_column, target_column=target_column)
    return data.MoleculeDataset(datapoints, featurizer=featurizer)
    

# **MAIN PART: StreamingMolDataset**

In [None]:
class StreamingMolDataset(IterableDataset):
    """Stream Chemprop datapoints from a DataFrame one chunk at a time.

    Instead of converting every row up front, the frame is walked in chunks of
    ``batch_size`` rows. Each chunk is converted with ``dataset_preparator``,
    optionally target-normalized, and its items are yielded one by one, so only
    one chunk of featurized molecules is built at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is never modified.
    smiles_column : str
        Name of the column containing the SMILES strings.
    target_column : str
        Name of the column containing the target value.
    scaler : optional
        If given, passed to ``normalize_targets`` of every chunk dataset. The
        same object is reused for every chunk, so it is expected to be fitted
        beforehand on the full training targets.
    batch_size : int
        Number of rows converted per chunk. Items are yielded individually, so
        this is independent of the ``batch_size`` given to the ``DataLoader``.
    shuffle : bool
        When true, the rows are re-permuted every time iteration starts, i.e.
        once per epoch.
    featurizer : optional
        Forwarded to ``dataset_preparator``. When ``None``, that function's own
        default featurizer is used.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.

    Notes
    -----
    No per-worker sharding is done. The PyTorch documentation states that with
    ``num_workers > 0`` every worker receives its own replica of an
    ``IterableDataset``, so each worker would yield the complete data set. Use
    the default ``num_workers=0`` with this class.
    """

    def __init__(self, df, smiles_column, target_column, scaler = None, batch_size=64, shuffle=True,
                 featurizer=None):
        # Fail early: a zero step only raises once iteration starts, and a
        # negative step would make range() empty and silently yield nothing.
        if batch_size < 1:
            raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

        self.df = df
        self.smiles_column = smiles_column
        self.target_column = target_column
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.scaler = scaler
        self.featurizer = featurizer

    def __iter__(self):
        """Yield the datapoints of the frame, one chunk of rows at a time."""
        # sample(frac=1) already returns a new, permuted frame. Chunks are
        # sliced by position and only column values are read downstream, so
        # neither an index reset nor a defensive copy of the unshuffled frame
        # is needed; both would duplicate the whole frame in memory.
        frame = self.df.sample(frac=1) if self.shuffle else self.df

        # Only forward the featurizer when one was supplied, so that
        # dataset_preparator keeps using its own default otherwise.
        extra_kwargs = {} if self.featurizer is None else {'featurizer': self.featurizer}

        for start in range(0, len(frame), self.batch_size):
            chunk = frame.iloc[start:start + self.batch_size]
            chunk_dataset = dataset_preparator(
                df=chunk,
                smiles_column=self.smiles_column,
                target_column=self.target_column,
                **extra_kwargs,
            )

            if self.scaler is not None:
                chunk_dataset.normalize_targets(self.scaler)

            # Yield all the samples in the current chunk
            yield from chunk_dataset

# **Test 1: Memory usage**

In [133]:
# Prepare data
data_path = 'on_the_fly_data.csv'
smiles_column = 'smiles'
target_column = 'docking_score'
batch_size = 64

# The memory/time comparison below runs on a fixed-size subsample so that the
# eager Chemprop path stays feasible on a small machine.
n_benchmark_rows = 100_000
sample_seed = 0

df = pd.read_csv(data_path)
# Seeded so that a re-run benchmarks exactly the same rows, and capped at the
# frame length because sample() raises when asked for more rows than exist.
df = df.sample(n=min(n_benchmark_rows, len(df)), random_state=sample_seed)
scaler = StandardScaler().fit(df[[target_column]])

# Helper: how much resident memory does this process currently hold?
def memory_record():
    """Return the resident set size (RSS) of the current process in MiB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 ** 2  # bytes -> MiB


  df = pd.read_csv(data_path)


In [140]:
# Measure what it costs to *construct* the streaming dataset. Only the
# constructor runs between the two measurements; no molecule is converted
# until the dataset is iterated.
gc.collect()
# perf_counter() is the clock intended for measuring short intervals; time()
# is wall-clock time and can be too coarse for a sub-millisecond duration.
start_time = time.perf_counter()
memory_before = memory_record()
streaming_dataset = StreamingMolDataset(
    df=df,
    smiles_column=smiles_column,
    target_column=target_column,
    batch_size=batch_size, scaler=None, shuffle=True
)
memory_after = memory_record()
end_time = time.perf_counter()
gc.collect()

print(f'Memory usage to load streaming dataset: {memory_after-memory_before} MB ')
print(f'Time to load streaming dataset: {end_time-start_time} s ')

Memory usage to load streaming dataset: 0.015625 MB 
Time to load streaming dataset: 0.0001621246337890625 s 


In [143]:
# Same measurement for the eager Chemprop path: every row of `df` is converted
# into a datapoint before the dataset object is returned.
gc.collect()
# perf_counter() is the clock intended for measuring intervals.
start_time = time.perf_counter()
memory_before = memory_record()
dataset = dataset_preparator(
    df=df,
    smiles_column=smiles_column,
    target_column=target_column
)
memory_after = memory_record()
end_time = time.perf_counter()
gc.collect()

print(f'Memory usage to load chemprop dataset: {memory_after-memory_before} MB ')
# The label previously said "streaming dataset" (copy-paste slip); this cell
# times the Chemprop dataset.
print(f'Time to load chemprop dataset: {end_time-start_time} s ')

Memory usage to load chemprop dataset: 290.1875 MB 
Time to load streaming dataset: 11.422368049621582 s 


# **Test 2: Similarity to chemprop data loader**

In this test, I aim to demonstrate that `StreamingMolDataset` yields the same batches as the Chemprop data loader. Additionally, a scaler can be applied if necessary; however, because the streaming dataset only ever sees one chunk at a time, the scaler must be fitted on the entire training set (the Pandas DataFrame) beforehand.

For illustration purposes, I will only take 10 instances from the whole dataset for examination.

In [128]:
smiles_column = 'smiles'
target_column = 'docking_score'

df_train = pd.read_csv('on_the_fly_data.csv')
# Seeded so that the ten example rows, and therefore every batch printed in
# the cells below, are the same on each run.
df_train_10 = df_train.sample(10, random_state=0)
# The scaler is fitted on the full training frame, not on the 10-row excerpt.
scaler = StandardScaler().fit(df_train[[target_column]])

# Index through `target_column` instead of a hard-coded attribute so the cell
# keeps working when the target column is changed above.
print(f'df_train with unscaled target values: \n{df_train_10[target_column]}')
print('-'*40)
print(f'df_train with scaled target values: \n{pd.Series(scaler.transform(df_train_10[[target_column]]).reshape(-1))}')

df_train with unscaled target values: 
876243   -7.48446
607660   -7.04395
214620   -7.32907
864331   -8.18819
669759   -7.69349
559369   -5.37299
39825    -5.69807
681769   -7.51751
285017   -7.58660
732176   -6.32177
Name: docking_score, dtype: float64
----------------------------------------
df_train with scaled target values: 
0   -0.841697
1   -0.421607
2   -0.693510
3   -1.512807
4   -1.041038
5    1.171899
6    0.861888
7   -0.873215
8   -0.939103
9    0.267098
dtype: float64


  df_train = pd.read_csv('on_the_fly_data.csv')


**Chemprop Dataloader: batch_size = 5 (2 batches) and Unscaled target values:**

In [129]:
# Reference behaviour: the eager Chemprop dataset and loader on the same ten
# rows, without shuffling.
train_dataset = dataset_preparator(
    df=df_train_10,
    smiles_column=smiles_column,
    target_column=target_column,
)
train_loader = data.build_dataloader(
    train_dataset,
    batch_size=5,
    shuffle=False,
)

print('Data batches with Chemprop data loader')
# Two passes over the loader stand in for two training epochs.
for epoch in range(2):
    for i, batch in enumerate(train_loader):
        print(f'Batch {i+1}')
        print(batch.Y)
    print('-'*40)

Data batches with Chemprop data loader
Batch 1
tensor([[-7.4845],
        [-7.0440],
        [-7.3291],
        [-8.1882],
        [-7.6935]])
Batch 2
tensor([[-5.3730],
        [-5.6981],
        [-7.5175],
        [-7.5866],
        [-6.3218]])
----------------------------------------
Batch 1
tensor([[-7.4845],
        [-7.0440],
        [-7.3291],
        [-8.1882],
        [-7.6935]])
Batch 2
tensor([[-5.3730],
        [-5.6981],
        [-7.5175],
        [-7.5866],
        [-6.3218]])
----------------------------------------


**StreamingMolDataset Dataloader: batch_size = 5 (2 batches) and Unscaled target values:**

In [130]:
# Streaming counterpart of the Chemprop loader: same ten rows, no scaling and
# no shuffling, so the batches can be compared one to one.
train_streaming_dataset = StreamingMolDataset(
    df=df_train_10,
    smiles_column=smiles_column,
    target_column=target_column,
    batch_size=5,
    scaler=None,
    shuffle=False,
)

# The dataset yields single datapoints; the DataLoader groups them into
# batches of five and collates them with Chemprop's collate_batch.
train_streaming_loader = DataLoader(
    dataset=train_streaming_dataset,
    batch_size=5,
    collate_fn=collate_batch,
)

print('Data batches with StreamingMolDataset:')
# Two passes over the loader stand in for two training epochs.
for epoch in range(2):
    for i, batch in enumerate(train_streaming_loader):
        print(f'Batch {i+1}')
        print(batch.Y)
    print('-'*40)

Data batches with StreamingMolDataset:
Batch 1
tensor([[-7.4845],
        [-7.0440],
        [-7.3291],
        [-8.1882],
        [-7.6935]])
Batch 2
tensor([[-5.3730],
        [-5.6981],
        [-7.5175],
        [-7.5866],
        [-6.3218]])
----------------------------------------
Batch 1
tensor([[-7.4845],
        [-7.0440],
        [-7.3291],
        [-8.1882],
        [-7.6935]])
Batch 2
tensor([[-5.3730],
        [-5.6981],
        [-7.5175],
        [-7.5866],
        [-6.3218]])
----------------------------------------


**Point:** Without shuffling, the results indicated that the Chemprop dataset and the Streaming dataset behaved similarly. One advantage of the Streaming dataset is that it doesn't require generating all data points at once. 

Additionally, we can apply scaling to it, especially during training. However, this requires fitting an external scaler.

**StreamingMolDataset Dataloader: batch_size = 5 (2 batches) and Scaled target values:**

In [131]:
# Streaming loader with scaled targets: batch_size = 5 (2 batches).
# The scaler is fitted once on the whole training frame and then applied to
# every chunk by the dataset.
scaler = StandardScaler().fit(df_train[[target_column]])

train_streaming_dataset = StreamingMolDataset(
    df=df_train_10,
    smiles_column=smiles_column,
    target_column=target_column,
    batch_size=5,
    scaler=scaler,
    shuffle=False,
)

train_streaming_loader = DataLoader(
    dataset=train_streaming_dataset,
    batch_size=5,
    collate_fn=collate_batch,
)

print('Data batches with scaled target values:')
# Two passes over the loader stand in for two training epochs.
for epoch in range(2):
    for i, batch in enumerate(train_streaming_loader):
        print(f'Batch {i+1}')
        print(batch.Y)
    print('-'*40)

Data batches with scaled target values:
Batch 1
tensor([[-0.8417],
        [-0.4216],
        [-0.6935],
        [-1.5128],
        [-1.0410]])
Batch 2
tensor([[ 1.1719],
        [ 0.8619],
        [-0.8732],
        [-0.9391],
        [ 0.2671]])
----------------------------------------
Batch 1
tensor([[-0.8417],
        [-0.4216],
        [-0.6935],
        [-1.5128],
        [-1.0410]])
Batch 2
tensor([[ 1.1719],
        [ 0.8619],
        [-0.8732],
        [-0.9391],
        [ 0.2671]])
----------------------------------------




**StreamingMolDataset Dataloader: batch_size = 5 (2 batches), Unscaled target values, and shuffle = True:**

In this part, shuffling is activated: the rows are re-permuted each time the dataset is iterated, so the samples in each batch differ between epochs.

In [132]:
# Same ten rows, unscaled, but with shuffle=True: the dataset draws a new
# permutation every time it is iterated.
train_streaming_dataset = StreamingMolDataset(
    df=df_train_10,
    smiles_column=smiles_column,
    target_column=target_column,
    batch_size=5,
    scaler=None,
    shuffle=True,
)

train_loader = DataLoader(
    dataset=train_streaming_dataset,
    batch_size=5,
    collate_fn=collate_batch,
)

print('Data batches with unscaled target values:')
# Two passes over the loader stand in for two training epochs; the batch
# contents are expected to differ between them.
for epoch in range(2):
    for i, batch in enumerate(train_loader):
        print(f'Batch {i+1}')
        print(batch.Y)
    print('-'*40)

Data batches with unscaled target values:
Batch 1
tensor([[-7.0440],
        [-7.3291],
        [-7.5175],
        [-8.1882],
        [-5.3730]])
Batch 2
tensor([[-7.4845],
        [-6.3218],
        [-5.6981],
        [-7.5866],
        [-7.6935]])
----------------------------------------
Batch 1
tensor([[-7.3291],
        [-7.4845],
        [-6.3218],
        [-8.1882],
        [-7.5866]])
Batch 2
tensor([[-7.6935],
        [-5.3730],
        [-7.0440],
        [-5.6981],
        [-7.5175]])
----------------------------------------
