## Concatenate Delaney's dataset with the dataset of new molecules

In [1]:
# import the necessary libraries
import numpy as np
import pandas as pd

### Read Delaney's dataset

In [2]:
# load the Delaney solubility table; the relative path is resolved against
# the directory the notebook is run from
sol = pd.read_csv(filepath_or_buffer='delaney.csv')

In [3]:
# preview the first five rows of the raw Delaney table
sol.head(n=5)

Unnamed: 0,Compound ID,measured log(solubility:mol/L),ESOL predicted log(solubility:mol/L),SMILES
0,"1,1,1,2-Tetrachloroethane",-2.18,-2.794,ClCC(Cl)(Cl)Cl
1,"1,1,1-Trichloroethane",-2.0,-2.232,CC(Cl)(Cl)Cl
2,"1,1,2,2-Tetrachloroethane",-1.74,-2.549,ClC(Cl)C(Cl)Cl
3,"1,1,2-Trichloroethane",-1.48,-1.961,ClCC(Cl)Cl
4,"1,1,2-Trichlorotrifluoroethane",-3.04,-3.077,FC(F)(Cl)C(F)(Cl)Cl


In [4]:
# select the SMILES and the experimental (measured) logS columns.
# Columns are picked by label rather than by position, so a reordered or
# extended CSV raises a KeyError instead of silently selecting the wrong data.
# .copy() makes sol_select an independent frame that can be relabelled safely.
sol_select = sol[["SMILES", "measured log(solubility:mol/L)"]].copy()

In [5]:
# relabel both columns with the short names used for the merged dataset
sol_select.columns = pd.Index(["SMILES", "exp_logS"])

In [6]:
# preview the relabelled Delaney selection
sol_select.head(n=5)

Unnamed: 0,SMILES,exp_logS
0,ClCC(Cl)(Cl)Cl,-2.18
1,CC(Cl)(Cl)Cl,-2.0
2,ClC(Cl)C(Cl)Cl,-1.74
3,ClCC(Cl)Cl,-1.48
4,FC(F)(Cl)C(F)(Cl)Cl,-3.04


### Read the new dataset

In [7]:
# read the new dataset
! ls -l ../../../00_database/2_new_compounds/new_compounds.csv
new = pd.read_csv('../../../00_database/2_new_compounds/new_compounds.csv')

-rw-r--r--@ 1 tarus  staff  6763 Nov 24 01:42 ../../../00_database/2_new_compounds/new_compounds.csv


In [8]:
# preview the first five rows of the new-compounds table
new.head(n=5)

Unnamed: 0,Compound ID,InChIKey,SMILES,S(M),logS,MW
0,CN_S001_C001,KROVJOROTYCIHS-UHFFFAOYSA-N,CC(C)CCOC1=CC2=C(C=C1)C1=CC=NC(C)=C1N2,7.4e-05,-4.12767,268.36
1,CN_S001_C002,RBLBQKRAADKOJW-UHFFFAOYSA-N,CCCN1C2=C(C=CC(OCCC(C)C)=C2)C2=CC=NC(C)=C12,6e-06,-5.19098,310.441
2,CN_S001_C003,BMWDRSIHHSOBKI-UHFFFAOYSA-N,CCCN1C2=C(C=CC(OCCC(C)C)=C2)C2=CC=[N+](CC3=CC=...,0.00264,-2.5784,401.573
3,CN_S002_C001,HNQZZUWCYONRSR-UHFFFAOYSA-N,CN1CCN(CC1)C1=NC=C(C=N1)C1=CC2=NC=CC(NC3=NC=CN...,4.4e-05,-4.35655,398.474
4,CN_S002_C002,RQVYLFUGPHKWEQ-UHFFFAOYSA-N,NCCNC1=NC=C(C=N1)C1=CC2=NC=CC(NC3=NC=CN=C3)=C2...,1.8e-05,-4.74473,358.409


In [9]:
# select the SMILES and the experimental logS columns.
# Columns are picked by label rather than by position, so a reordered or
# extended CSV raises a KeyError instead of silently selecting the wrong data.
# .copy() makes new_select an independent frame that can be relabelled safely.
new_select = new[["SMILES", "logS"]].copy()

In [10]:
# preview the selected columns of the new compounds
new_select.head(n=5)

Unnamed: 0,SMILES,logS
0,CC(C)CCOC1=CC2=C(C=C1)C1=CC=NC(C)=C1N2,-4.12767
1,CCCN1C2=C(C=CC(OCCC(C)C)=C2)C2=CC=NC(C)=C12,-5.19098
2,CCCN1C2=C(C=CC(OCCC(C)C)=C2)C2=CC=[N+](CC3=CC=...,-2.5784
3,CN1CCN(CC1)C1=NC=C(C=N1)C1=CC2=NC=CC(NC3=NC=CN...,-4.35655
4,NCCNC1=NC=C(C=N1)C1=CC2=NC=CC(NC3=NC=CN=C3)=C2...,-4.74473


In [11]:
# relabel both columns so they match the Delaney selection
new_select.columns = pd.Index(["SMILES", "exp_logS"])

In [12]:
# preview the relabelled new-compounds selection
new_select.head(n=5)

Unnamed: 0,SMILES,exp_logS
0,CC(C)CCOC1=CC2=C(C=C1)C1=CC=NC(C)=C1N2,-4.12767
1,CCCN1C2=C(C=CC(OCCC(C)C)=C2)C2=CC=NC(C)=C12,-5.19098
2,CCCN1C2=C(C=CC(OCCC(C)C)=C2)C2=CC=[N+](CC3=CC=...,-2.5784
3,CN1CCN(CC1)C1=NC=C(C=N1)C1=CC2=NC=CC(NC3=NC=CN...,-4.35655
4,NCCNC1=NC=C(C=N1)C1=CC2=NC=CC(NC3=NC=CN=C3)=C2...,-4.74473


In [13]:
# concatenate the two datasets row-wise (Delaney rows first, new compounds after).
# ignore_index=True renumbers the rows 0..n-1; without it both inputs keep
# their own 0-based labels and the result carries duplicate index values,
# which breaks label-based lookups and index-aligned assignments.
solubility_total = pd.concat([sol_select, new_select], axis=0, ignore_index=True)

In [14]:
# preview the top of the combined dataset
solubility_total.head(n=5)

Unnamed: 0,SMILES,exp_logS
0,ClCC(Cl)(Cl)Cl,-2.18
1,CC(Cl)(Cl)Cl,-2.0
2,ClC(Cl)C(Cl)Cl,-1.74
3,ClCC(Cl)Cl,-1.48
4,FC(F)(Cl)C(F)(Cl)Cl,-3.04


In [15]:
# check the number of entries in the final (total) dataset: should be 1144 + 53 = 1197
# enforce the invariant instead of relying on a visual check: a row-wise
# concatenation must keep every row of both inputs
assert len(solubility_total) == len(sol_select) + len(new_select), \
    "row count of the combined dataset does not match the sum of its inputs"
solubility_total.shape

(1197, 2)

In [16]:
# round the experimental logS to two decimals, matching the precision of
# Delaney's reference dataset
solubility_total["exp_logS"] = solubility_total["exp_logS"].round(decimals=2)

In [17]:
# write the combined dataset to disk, leaving out the DataFrame index
solubility_total.to_csv(path_or_buf=r'solubility_total.csv', index=False)