In [1]:
# install deepchem
!pip install deepchem

# install Fast-ML
!pip install fast_ml

Collecting deepchem
  Downloading deepchem-2.7.1-py3-none-any.whl (693 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/693.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/693.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m686.1/693.2 kB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.2/693.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting scipy<1.9 (from deepchem)
  Downloading scipy-1.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdkit (from deepchem)
  Downloading rdkit-2023.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
[2K     [90m━━━━━━━━━━━━━

### Splitting dataset

Once the dataset is cleaned up, we can create the train, validation and test splits.

There are libraries available to split the dataset based on the output value, molecular weight, scaffold etc. This approach requires converting the CSV file to a library-dependent format, which is sometimes cumbersome.

For simplicity, we will first randomly split the dataset. We will use the QM9 dataset with ```gap``` as the output (target).

In [2]:
# pandas provides the DataFrame used throughout this section
import pandas as pd

# Read the QM9 table directly from its public URL.
# (When working from a file uploaded to Colab, pass the file name instead.)
df = pd.read_csv(
    filepath_or_buffer="https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm9.csv"
)

# preview the first 5 rows
df.head(5)

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,zpve,u0,u298,h298,g298,cv,u0_atom,u298_atom,h298_atom,g298_atom
0,gdb_1,C,157.7118,157.70997,157.70699,0.0,13.21,-0.3877,0.1171,0.5048,...,0.044749,-40.47893,-40.476062,-40.475117,-40.498597,6.469,-395.999595,-398.64329,-401.014647,-372.471772
1,gdb_2,N,293.60975,293.54111,191.39397,1.6256,9.46,-0.257,0.0829,0.3399,...,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316,-276.861363,-278.620271,-280.399259,-259.338802
2,gdb_3,O,799.58812,437.90386,282.94545,1.8511,6.31,-0.2928,0.0687,0.3615,...,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002,-213.087624,-213.974294,-215.159658,-201.407171
3,gdb_4,C#C,0.0,35.610036,35.610036,0.0,16.28,-0.2845,0.0506,0.3351,...,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574,-385.501997,-387.237686,-389.016047,-365.800724
4,gdb_5,C#N,0.0,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,...,0.016601,-93.411888,-93.40937,-93.408425,-93.431246,6.278,-301.820534,-302.906752,-304.091489,-288.720028


The [Fast-ML](https://pypi.org/project/fast-ml/) package has built-in functionality to analyze datasets but is not chemistry-aware. As we are randomly splitting the dataset, we can use this package.

In [3]:
# import the function to split into train-valid-test
from fast_ml.model_development import train_valid_test_split

In [4]:
# Split the (smiles, gap) table as train:valid:test = 0.8:0.1:0.1.
# The target column ("gap") is returned separately from the features.
# A fixed random_state makes the random split reproducible, so re-running
# the cell yields the same rows in every subset.
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
    df[["smiles", "gap"]],
    target="gap",
    train_size=0.8,
    valid_size=0.1,
    test_size=0.1,
    random_state=42,
)


In [5]:
# display the features (the smiles column) of the test split
X_test

Unnamed: 0,smiles
14676,OCC#CCOC=N
78143,CC1C2C1C(=O)CC2=O
6161,CC(=O)C1CC1(C)O
64418,CC1(O)CCCC2CC12
31964,CCc1[nH]c(cn1)OC
...,...
33460,C#CC1(CO1)C1COC1
26018,c1c[nH]cc1OC(=O)N
131784,c1c(nnnn1)CCO
7176,CC(CCO)OC=N


In [6]:
# display the targets (the gap values) of the test split
y_test

14676     0.2741
78143     0.2097
6161      0.2291
64418     0.3272
31964     0.2372
           ...  
33460     0.2652
26018     0.2429
131784    0.1650
7176      0.2800
46038     0.2879
Name: gap, Length: 13389, dtype: float64

For more chemistry-aware dataset splitting, packages like [deepchem](https://deepchem.readthedocs.io/en/latest/index.html) can be used. However, the CSV dataset must be converted into a dataset class before the splitting can be performed.

Let's try splitting the dataset based on molecular weight in deepchem.

In [7]:
import deepchem as dc



As the kernel restarted, we will reload the QM9 dataset.

In [8]:
# import the pandas library again (the kernel state was lost on restart)
import pandas as pd

# Fetch the QM9 CSV from its URL into a dataframe.
# For a file uploaded to Colab, use the file name in place of the URL.
df = pd.read_csv(
    filepath_or_buffer="https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm9.csv"
)

We will use the ``smiles`` and ``gap`` values from the dataset as before and create the ``NumpyDataset`` object in deepchem. The documentation for datasets in deepchem can be found [here](https://deepchem.readthedocs.io/en/latest/api_reference/data.html#datasets).

In [9]:
# Build the deepchem dataset object from the two relevant columns:
# SMILES strings are the inputs (X) and the gap is the target (y).
# The ids argument must be supplied as well, since splitting relies on it.
dataset = dc.data.NumpyDataset.from_dataframe(
    df[["smiles", "gap"]],
    X="smiles",
    y="gap",
    ids="smiles",
)

One can look at the ``X`` and ``y`` values to ensure proper loading of the dataset.

In [10]:
# targets held by the dataset (loaded from the gap column)
dataset.y

array([[0.5048],
       [0.3399],
       [0.3615],
       ...,
       [0.2953],
       [0.3003],
       [0.3058]])

In [11]:
# inputs held by the dataset (loaded from the smiles column)
dataset.X

array([['C'],
       ['N'],
       ['O'],
       ...,
       ['C1N2C3C4C5C2C13CN45'],
       ['C1N2C3C4C5CC13C2C45'],
       ['C1N2C3C4C5OC13C2C45']], dtype=object)

We will perform a molecular-weight-based split. More documentation on splitting methods in deepchem can be found [here](https://deepchem.readthedocs.io/en/latest/api_reference/splitters.html).

In [12]:
# instantiate deepchem's molecular weight splitter
molecularweightsplitter = dc.splits.MolecularWeightSplitter()

# partition the dataset into train/valid/test with fractions 0.8/0.1/0.1
(train_dataset, valid_dataset, test_dataset) = (
    molecularweightsplitter.train_valid_test_split(
        dataset=dataset,
        frac_train=0.8,
        frac_valid=0.1,
        frac_test=0.1,
    )
)

We can convert the dataset objects back to pandas dataframe with ``to_dataframe`` for easy analysis, if needed.

In [13]:
# Convert each split to a pandas dataframe, keeping the same variable names.
# Only objects that still expose `to_dataframe` are converted, so running this
# cell a second time (when the names already hold dataframes) is a no-op
# instead of raising AttributeError.
train_dataset, valid_dataset, test_dataset = [
    split.to_dataframe() if hasattr(split, "to_dataframe") else split
    for split in (train_dataset, valid_dataset, test_dataset)
]

In [14]:
# display the test split, which the previous cell converted with to_dataframe()
test_dataset

Unnamed: 0,X,y,ids
0,CC(C#N)C(CO)CO,0.3023,CC(C#N)C(CO)CO
1,CC1COCC1NC=O,0.2746,CC1COCC1NC=O
2,CC1(CCNC(=O)O1)C,0.3080,CC1(CCNC(=O)O1)C
3,COCCC(O)CC#N,0.2886,COCCC(O)CC#N
4,CC(=NO)C1CCCO1,0.2546,CC(=NO)C1CCCO1
...,...,...,...
13384,C(C(CO)C(F)(F)F)O,0.3501,C(C(CO)C(F)(F)F)O
13385,C(COCC(F)(F)F)O,0.3385,C(COCC(F)(F)F)O
13386,CC(O)(CO)C(F)(F)F,0.3254,CC(O)(CO)C(F)(F)F
13387,CC(O)C(O)C(F)(F)F,0.3266,CC(O)C(O)C(F)(F)F
