# This notebook reproduces the data pre-processing required to run Chemprop.

In [1]:
import numpy as np
import pandas as pd
import os
from boom.datasets.SMILESDataset import *
from boom.data.load_processed_data import _load_qm9_data

In [2]:
def dataframe_wrapper(dataset, prop_name):
    """
    Wraps the dataset into a pandas dataframe.

    Parameters
    ----------
    dataset : indexable
        Object supporting ``len(dataset)`` and ``dataset[i]``, where every
        item unpacks into exactly two values: ``(smiles, target)``.
    prop_name : str
        Name given to the target column.

    Returns
    -------
    pandas.DataFrame
        One row per sample, in dataset order, with the columns ``"smiles"``
        and ``prop_name``. An empty dataset yields an empty frame that still
        carries both columns.
    """
    # Collect all rows first and build the frame once: enlarging a frame with
    # ``df.loc[i] = ...`` re-allocates it on every insertion, which makes the
    # conversion quadratic in the number of samples.
    rows = []
    # Index explicitly (rather than iterating) so datasets that only define
    # ``__len__`` / ``__getitem__`` are handled exactly as before.
    for i in range(len(dataset)):
        smiles, target = dataset[i]
        rows.append((smiles, target))
    return pd.DataFrame(rows, columns=["smiles", prop_name])

def dataframe_wrapper_QM9(dataset,prop_name):
    """
    Wraps one QM9 split into a pandas dataframe.

    Parameters
    ----------
    dataset : mapping-like
        Object indexed by the keys ``'smiles'`` and ``prop_name``. Both
        entries must support positional access with ``[i]``; the ``'smiles'``
        entry must support ``len()`` and determines the number of rows.
    prop_name : str
        Key of the target values in ``dataset``. The output column is named
        ``'qm9_' + prop_name``.

    Returns
    -------
    pandas.DataFrame
        One row per sample with the columns ``"smiles"`` and
        ``'qm9_' + prop_name``.

    Raises
    ------
    KeyError
        If ``dataset`` is a dict that lacks ``'smiles'`` or ``prop_name``.
    """
    smiles_values = dataset['smiles']
    target_values = dataset[prop_name]
    # Build all rows up front and construct the frame in a single call;
    # enlarging a frame with ``df.loc[i] = ...`` re-allocates it on every
    # insertion, which is quadratic in the number of samples.
    rows = [
        (smiles_values[i], target_values[i])
        for i in range(len(smiles_values))
    ]
    return pd.DataFrame(rows, columns=["smiles", 'qm9_' + prop_name])

In [4]:
os.makedirs('./Data', exist_ok=True)


def _export_splits(out_dir, file_prefix, train_df, iid_test_df, ood_test_df):
    """Write the train / IID-test / OOD-test frames as CSV files in ``out_dir``.

    The directory is created if it does not exist. The files are named
    ``<file_prefix>_train.csv``, ``<file_prefix>_iid_test.csv`` and
    ``<file_prefix>_ood_test.csv``; the DataFrame index is not written.
    """
    os.makedirs(out_dir, exist_ok=True)
    splits = (
        ('train', train_df),
        ('iid_test', iid_test_df),
        ('ood_test', ood_test_df),
    )
    for split_name, frame in splits:
        frame.to_csv(out_dir + '/' + file_prefix + '_' + split_name + '.csv', index=False)


# 10k datasets. Each entry is:
#   (train class, IID-test class, OOD-test class, target column name,
#    output directory, CSV file prefix, label for the progress message)
# The density files use the short 'den_OOD' prefix while the HoF files repeat
# the directory name; both are kept as-is so the output paths do not change.
TEN_K_DATASETS = [
    (TrainDensityDataset, IIDDensityDataset, OODDensityDataset,
     'density', 'Data/10k_dft_density_OOD', 'den_OOD', '10k Density'),
    (TrainHoFDataset, IIDHoFDataset, OODHoFDataset,
     'hof', 'Data/10k_dft_hof_OOD', '10k_dft_hof_OOD', '10k HoF'),
]

for train_cls, iid_cls, ood_cls, target_name, out_dir, file_prefix, label in TEN_K_DATASETS:
    # Datasets are instantiated one group at a time, inside the loop.
    train_dataset = train_cls()
    iid_test_dataset = iid_cls()
    ood_test_dataset = ood_cls()
    train_df = dataframe_wrapper(train_dataset, target_name)
    iid_test_df = dataframe_wrapper(iid_test_dataset, target_name)
    ood_test_df = dataframe_wrapper(ood_test_dataset, target_name)
    _export_splits(out_dir, file_prefix, train_df, iid_test_df, ood_test_df)
    print('Done ' + label + '!')

# QM9 properties. For each one, _load_qm9_data(prop) is indexed with the keys
# 'train_<prop>', 'iid_<prop>' and 'ood_<prop>' to obtain the three splits.
QM9_PROPERTIES = ['alpha', 'cv', 'gap', 'homo', 'lumo', 'mu', 'r2', 'zpve']

for prop in QM9_PROPERTIES:
    data = _load_qm9_data(prop)
    train_dataset = data['train_' + prop]
    iid_test_dataset = data['iid_' + prop]
    ood_test_dataset = data['ood_' + prop]
    train_df = dataframe_wrapper_QM9(train_dataset, prop_name=prop)
    iid_test_df = dataframe_wrapper_QM9(iid_test_dataset, prop_name=prop)
    ood_test_df = dataframe_wrapper_QM9(ood_test_dataset, prop_name=prop)
    # Directory and file prefix share the same 'qm9_<prop>_OOD' stem.
    qm9_stem = 'qm9_' + prop + '_OOD'
    _export_splits('Data/' + qm9_stem, qm9_stem, train_df, iid_test_df, ood_test_df)
    print('Done QM9 ' + prop + '!')


Done 10k Density!
Done 10k HoF!
Done QM9 alpha!
Done QM9 cv!
Done QM9 gap!
Done QM9 homo!
Done QM9 lumo!
Done QM9 mu!
Done QM9 r2!
Done QM9 zpve!
