In [160]:
import numpy as np
import pandas as pd

In [161]:
### Collecting the data from different sources

#### Train Data 
data1=pd.read_csv('data/water_solubility_data.csv')
data2=pd.read_csv('data/data_paper.csv',encoding='latin1')
data5 = pd.read_excel('data/Supplementary_data.xlsx')
data6=pd.read_csv('data/dataset-not-FA.csv')

### Test data 
test_set=pd.read_csv('data/dataset-E.csv')

In [162]:
### Size of each dataset 
print(data1.shape)
print(data2.shape)
print(data5.shape)
print(data6.shape)
print(test_set.shape)

(900, 12)
(11862, 26)
(9943, 6)
(6154, 9)
(1291, 6)


In [163]:
### In order to merge the data all the dataset should have identical column name ... 
data1=data1[['Smiles','LogS']]
## Changing the column name to be the identical with other dataset 
data1=data1.rename(columns={ "Smiles": "SMILES","LogS":"Solubility"})

### Selecting the requiered column 
data2=data2[['SMILES','LogS']]
## Changing the column name to be the identical with other dataset 
data2=data2.rename(columns={ "LogS":"Solubility"})#

### Changing the column name ...

data5=data5[['SMILES','logS']]
data5 = data5.rename(columns={'logS': 'Solubility'})

data6=data6[['SMILES','Solubility']]



In [164]:
### Merging the train dataset to one dataframe 
frames = [data1,data2,data5,data6]
train_set = pd.concat(frames)
### Size of the train and test dataset before preprocess....
print('Size of the train dataset :',len(train_set))
print('Size of the test dataset :',len(test_set))


train_set.to_csv('data/train4_final.csv')
test_set.to_csv('data/test_final.csv')

Size of the train dataset : 28859
Size of the test dataset : 1291


In [165]:
train_set.reset_index(drop=True)
test_set.reset_index(drop=True)

Unnamed: 0,ID,Name,InChI,InChIKey,SMILES,Solubility
0,E-1,n-pentane,"InChI=1S/C5H12/c1-3-5-4-2/h3-5H2,1-2H3",OFBQJSOFQDEBGM-UHFFFAOYSA-N,CCCCC,-3.18
1,E-2,cyclopentane,InChI=1S/C5H10/c1-2-4-5-3-1/h1-5H2,RGSFGYAAUTVSQA-UHFFFAOYSA-N,C1CCCC1,-2.64
2,E-3,n-hexane,"InChI=1S/C6H14/c1-3-5-6-4-2/h3-6H2,1-2H3",VLKZOEOYAKHREP-UHFFFAOYSA-N,CCCCCC,-3.84
3,E-4,2-methylpentane,"InChI=1S/C6H14/c1-4-5-6(2)3/h6H,4-5H2,1-3H3",AFABGHUZZDYHJO-UHFFFAOYSA-N,CCCC(C)C,-3.74
4,E-5,"2,2-dimethylbutane","InChI=1S/C6H14/c1-5-6(2,3)4/h5H2,1-4H3",HNRMPXKDFBEGFZ-UHFFFAOYSA-N,CCC(C)(C)C,-3.55
...,...,...,...,...,...,...
1286,E-1287,malathion,InChI=1S/C10H19O6PS2/c1-5-15-9(11)7-8(10(12)16...,JXSJBGJIGXNWCI-UHFFFAOYSA-N,CCOC(=O)CC(SP(=S)(OC)OC)C(=O)OCC,-3.37
1287,E-1288,chlorpyriphos,"InChI=1S/C9H11Cl3NO3PS/c1-3-14-17(18,15-4-2)16...",SBPBAQFWLVIOKP-UHFFFAOYSA-N,CCOP(=S)(OCC)Oc1nc(Cl)c(Cl)cc1Cl,-5.49
1288,E-1289,prostaglandin_E2,InChI=1S/C20H32O5/c1-2-3-6-9-15(21)12-13-17-16...,XEYBRNLFEZDVAW-UHFFFAOYSA-N,CCCCCC(O)C=CC1C(O)CC(=O)C1CC=CCCCC(=O)O,-2.47
1289,E-1290,"p,p'-DDT",InChI=1S/C14H9Cl5/c15-11-5-1-9(2-6-11)13(14(17...,YVGGHNCTFXOJCH-UHFFFAOYSA-N,c(ccc(c1)Cl)(c1)C(c(ccc(c2)Cl)c2)C(Cl)(Cl)Cl,-7.15


In [96]:
from rdkit.Chem import MolFromSmiles as smi2mol
from rdkit.Chem import MolToSmiles as mol2smi
## Function to create canonical smiles 
def canon(smi):
    try:
        mol=smi2mol(smi, sanitize=True)
        smi_canon=mol2smi(mol, isomericSmiles=False, canonical=True)
        return(smi_canon)
    except:
        print("ERROR")
        return(smi)

In [166]:
#### Applying function to create the column with canonical smiles.  
train_set['smiles_canon'] = [canon(smi) for smi in train_set.SMILES]
test_set['smiles_canon'] = [canon(smi) for smi in test_set.SMILES]



In [168]:
test_set[0:200]

Unnamed: 0,ID,Name,InChI,InChIKey,SMILES,Solubility,smiles_canon
0,E-1,n-pentane,"InChI=1S/C5H12/c1-3-5-4-2/h3-5H2,1-2H3",OFBQJSOFQDEBGM-UHFFFAOYSA-N,CCCCC,-3.18,CCCCC
1,E-2,cyclopentane,InChI=1S/C5H10/c1-2-4-5-3-1/h1-5H2,RGSFGYAAUTVSQA-UHFFFAOYSA-N,C1CCCC1,-2.64,C1CCCC1
2,E-3,n-hexane,"InChI=1S/C6H14/c1-3-5-6-4-2/h3-6H2,1-2H3",VLKZOEOYAKHREP-UHFFFAOYSA-N,CCCCCC,-3.84,CCCCCC
3,E-4,2-methylpentane,"InChI=1S/C6H14/c1-4-5-6(2)3/h6H,4-5H2,1-3H3",AFABGHUZZDYHJO-UHFFFAOYSA-N,CCCC(C)C,-3.74,CCCC(C)C
4,E-5,"2,2-dimethylbutane","InChI=1S/C6H14/c1-5-6(2,3)4/h5H2,1-4H3",HNRMPXKDFBEGFZ-UHFFFAOYSA-N,CCC(C)(C)C,-3.55,CCC(C)(C)C
...,...,...,...,...,...,...,...
195,E-196,"2,3',4',5-PCB",InChI=1S/C12H6Cl4/c13-8-2-4-10(14)9(6-8)7-1-3-...,KENZYIHFBRWMOD-UHFFFAOYSA-N,c1cc(Cl)c(Cl)cc1c2c(Cl)ccc(Cl)c2,-7.25,Clc1ccc(Cl)c(-c2ccc(Cl)c(Cl)c2)c1
196,E-197,"2,2',4,4'-PCB",InChI=1S/C12H6Cl4/c13-7-1-3-9(11(15)5-7)10-4-2...,QORAVNMWUNPXAO-UHFFFAOYSA-N,c1cc(Cl)cc(Cl)c1c2c(Cl)cc(Cl)cc2,-6.51,Clc1ccc(-c2ccc(Cl)cc2Cl)c(Cl)c1
197,E-198,"2,2',3,3',4-PCB",InChI=1S/C12H5Cl5/c13-8-3-1-6(5-10(8)15)7-2-4-...,WIDHRBRBACOVOY-UHFFFAOYSA-N,c1cc(Cl)c(Cl)cc1c2c(Cl)c(Cl)c(Cl)cc2,-7.05,Clc1ccc(-c2ccc(Cl)c(Cl)c2Cl)cc1Cl
198,E-199,"2,2,4,6,6'-PCB",InChI=1S/C12H5Cl5/c13-6-4-9(16)12(10(17)5-6)11...,MTCPZNVSDFCBBE-UHFFFAOYSA-N,Clc1cccc(Cl)c1c2c(Cl)cc(Cl)cc2Cl,-7.32,Clc1cc(Cl)c(-c2c(Cl)cccc2Cl)c(Cl)c1


In [98]:
### Calculate  the occurence of  smiles in both train and test dataset....
train_set['occurence'] = train_set.groupby('smiles_canon')['smiles_canon'].transform('count')
test_set['occurence'] = test_set.groupby('smiles_canon')['smiles_canon'].transform('count')

In [99]:
### Taking out Unique smiles from the train and test dataset 
train_set1= train_set[train_set['occurence']==1]
print(train_set1.shape)
test_set1= test_set[test_set['occurence']==1]
print(test_set1.shape)

(13432, 4)
(1273, 8)


In [100]:
### Selecting specific column of the dataframe ..
train_set1=train_set1[['smiles_canon','Solubility','occurence']]
test_set1=test_set1[['smiles_canon','Solubility','occurence']]

In [101]:
#### Taking out the dataframe which has duplicate smiles means more than one time occurence in the dataset ...
train_set2= train_set[train_set['occurence']>1]
print(train_set2.shape)
test_set2= test_set[test_set['occurence']>1]
print(test_set2.shape)

(15427, 4)
(18, 8)


In [102]:
#Extract duplicate rows
id1 = train_set2["smiles_canon"]
train_set2=train_set2[id1.isin(id1[id1.duplicated()])].sort_values("smiles_canon")
id2 = test_set2["smiles_canon"]
test_set2=test_set2[id2.isin(id2[id2.duplicated()])].sort_values("smiles_canon")


In [103]:
print(train_set2.shape)
print(test_set2.shape)

(15427, 4)
(18, 8)


In [104]:
train_set2.reset_index(drop=True)
test_set2.reset_index(drop=True)

Unnamed: 0,ID,Name,InChI,InChIKey,SMILES,Solubility,smiles_canon,occurence
0,E-403,hydrocortisone_acetate,InChI=1S/C23H30O6/c1-13(24)29-12-19(27)23(28)9...,ITRJWOMZKQRYTA-UHFFFAOYSA-N,CC(=O)OCC(=O)C3(O)CCC4C2CCC1=CC(=O)CCC1(C)C2C(...,-4.3,CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3C(...,2
1,E-1004,Cortisone_Acetate,InChI=1S/C23H30O6/c1-13(24)29-12-19(27)23(28)9...,ITRJWOMZKQRYTA-UHFFFAOYSA-N,CC(=O)OCC(=O)C3(O)CCC4C2CCC1=CC(=O)CCC1(C)C2C(...,-4.0,CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3C(...,2
2,E-666,chenodeoxycholic_acid,InChI=1S/C24H40O4/c1-14(4-7-21(27)28)17-5-6-18...,RUDATBOHQWOJDD-UHFFFAOYSA-N,CC(CCC(O)=O)C3CCC4C2C(O)CC1CC(O)CCC1(C)C2CCC34C,-3.64,CC(CCC(=O)O)C1CCC2C3C(O)CC4CC(O)CCC4(C)C3CCC12C,2
3,E-665,hyodeoxycholic_acid,InChI=1S/C24H40O4/c1-14(4-7-21(27)28)17-5-6-18...,RUDATBOHQWOJDD-UHFFFAOYSA-N,C1C(O)CC2CC(O)C3C4CCC(C(C)CCC(=O)O)C4(C)CCC3C2...,-3.82,CC(CCC(=O)O)C1CCC2C3C(O)CC4CC(O)CCC4(C)C3CCC12C,2
4,E-402,dexamethasone,InChI=1S/C22H29FO5/c1-12-8-16-15-5-4-13-9-14(2...,UREBDLICKHMUKA-UHFFFAOYSA-N,C1(=O)C=C2CCC3C4CC(C)C(O)(C(=O)CO)C4(C)CC(O)C3...,-3.64,CC1CC2C3CCC4=CC(=O)C=CC4(C)C3(F)C(O)CC2(C)C1(O...,2
5,E-414,betamethasone,InChI=1S/C22H29FO5/c1-12-8-16-15-5-4-13-9-14(2...,UREBDLICKHMUKA-UHFFFAOYSA-N,C1=CC(=O)C=C2CCC3C4CC(C)C(O)(C(=O)CO)C4(C)CC(O...,-3.77,CC1CC2C3CCC4=CC(=O)C=CC4(C)C3(F)C(O)CC2(C)C1(O...,2
6,E-1225,"cis-1,2-Dimethylcyclohexane","InChI=1S/C8H16/c1-7-5-3-4-6-8(7)2/h7-8H,3-6H2,...",KVZJLSYJROEPSQ-UHFFFAOYSA-N,C1C(C)C(C)CCC1,-4.27,CC1CCCCC1C,2
7,E-828,"trans-1,2-Dimethylcyclohexane","InChI=1S/C8H16/c1-7-5-3-4-6-8(7)2/h7-8H,3-6H2,...",KVZJLSYJROEPSQ-UHFFFAOYSA-N,C1C(C)C(C)CCC1,-4.33,CC1CCCCC1C,2
8,E-257,"2,3-dimethyl-3-pentanol","InChI=1S/C7H16O/c1-5-7(4,8)6(2)3/h6,8H,5H2,1-4H3",RFZHJHSNHYIRNE-UHFFFAOYSA-N,CC(C)C(C)(O)CC,-0.85,CCC(C)(O)C(C)C,2
9,E-1082,"2,4-dimethyl-3-pentanol","InChI=1S/C7H16O/c1-5-7(4,8)6(2)3/h6,8H,5H2,1-4H3",RFZHJHSNHYIRNE-UHFFFAOYSA-N,CC(C)C(O)(C)CC,-1.22,CCC(C)(O)C(C)C,2


In [105]:
train_set2=train_set2[['smiles_canon','Solubility','occurence']]
test_set2=test_set2[['smiles_canon','Solubility','occurence']]

In [106]:
train_set2.reset_index(drop=True)
test_set2.reset_index(drop=True)

Unnamed: 0,smiles_canon,Solubility,occurence
0,CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3C(...,-4.3,2
1,CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3C(...,-4.0,2
2,CC(CCC(=O)O)C1CCC2C3C(O)CC4CC(O)CCC4(C)C3CCC12C,-3.64,2
3,CC(CCC(=O)O)C1CCC2C3C(O)CC4CC(O)CCC4(C)C3CCC12C,-3.82,2
4,CC1CC2C3CCC4=CC(=O)C=CC4(C)C3(F)C(O)CC2(C)C1(O...,-3.64,2
5,CC1CC2C3CCC4=CC(=O)C=CC4(C)C3(F)C(O)CC2(C)C1(O...,-3.77,2
6,CC1CCCCC1C,-4.27,2
7,CC1CCCCC1C,-4.33,2
8,CCC(C)(O)C(C)C,-0.85,2
9,CCC(C)(O)C(C)C,-1.22,2


In [107]:
print(train_set2.shape)
print(test_set2.shape)

(15427, 3)
(18, 3)


In [108]:
# Group by SMILES
grouped_train = train_set2.groupby('smiles_canon')
grouped_test = test_set2.groupby('smiles_canon')
# Filter groups where the maximum difference in solubility is less than or equal to 0.50
filtered_groups_train = grouped_train.filter(lambda x: x['Solubility'].max() - x['Solubility'].min() <= 0.50)
filtered_groups_test = grouped_test.filter(lambda x: x['Solubility'].max() - x['Solubility'].min() <= 0.50)


In [109]:
print(len(filtered_groups_train))
print(len(filtered_groups_test))

14044
18


In [110]:
### In order to eavluate to threshold followed by the dataframe we can cross check the value of difference
grouped = filtered_groups_train.groupby('smiles_canon')['Solubility'].apply(lambda x: x.sort_values().tolist())

# Calculate differences in solubility within each group
diffs = grouped.apply(lambda x: [j - i for i, j in zip(x[:-1], x[1:])])

# Create a new DataFrame with SMILES, Solubility values, and differences
result = grouped.reset_index()
result.columns = ['smiles_canon', 'Solubility_Values']

# Add the differences as a new column
result['Differences'] = diffs.values

result

Unnamed: 0,smiles_canon,Solubility_Values,Differences
0,Br.CN(C)CCC=C1c2ccccc2Sc2ccc(F)cc21,"[-5.603798043, -5.603798043]",[0.0]
1,Br.Oc1ccc2c(c1)C13CCCCC1C(C2)N(CCc1ccccc1)CC3,"[-4.833904011, -4.833904011]",[0.0]
2,BrC(Br)(Br)Br,"[-3.157037902, -3.1400001, -3.14]","[0.017037801999999935, 9.999999983634211e-08]"
3,BrC(Br)Br,"[-1.911295104, -1.91, -1.91]","[0.0012951039999999914, 0.0]"
4,BrC(Br)C(Br)Br,"[-2.72, -2.710626523]",[0.009373477000000019]
...,...,...,...
5683,c1cnoc1,"[0.3827, 0.383489897]",[0.0007898970000000394]
5684,c1coc(-c2nc3ccccc3[nH]2)c1,"[-3.414031277, -3.414]",[3.127699999971867e-05]
5685,c1nc[nH]n1,"[0.788504777, 1.005834993]",[0.2173302159999999]
5686,c1ncc2[nH]cnc2n1,"[0.619390837, 0.62]",[0.0006091630000000237]


In [111]:
# Group again by SMILES and calculate the mean of solubility
mean_solubility_df_train = filtered_groups_train.groupby('smiles_canon').mean().reset_index()
mean_solubility_df_test = filtered_groups_test.groupby('smiles_canon').mean().reset_index()

In [112]:
### Combining the dataframe with single entry and matching smiles and average the solubility values 

train_set=pd.concat([train_set1, mean_solubility_df_train], axis=0, ignore_index=True)
test_set=pd.concat([test_set1, mean_solubility_df_test], axis=0, ignore_index=True)


In [113]:
print(train_set.shape)
print(test_set.shape)

(19120, 3)
(1282, 3)


In [114]:
matching_smiles = pd.merge(train_set[['smiles_canon']], test_set[['smiles_canon']], on='smiles_canon')['smiles_canon']

# Remove the matching SMILES from both DataFrames
df1_filtered = train_set[~train_set['smiles_canon'].isin(matching_smiles)]


In [115]:
matching_smiles

0                          Sc1ncnc2nccnc12
1                             Oc1ccnc(S)n1
2                          Cc1cc(O)nc(S)n1
3        O=c1[nH]cnc2c1ncn2C1OC(CO)C(O)C1O
4       C1CCC([Sn](C2CCCCC2)C2CCCCC2)CC1.O
                       ...                
1178                              c1ccnnc1
1179                               c1ccoc1
1180                               c1ccsc1
1181                        c1cnc2ncncc2n1
1182                              c1cncnc1
Name: smiles_canon, Length: 1183, dtype: object

In [116]:
df1_filtered

Unnamed: 0,smiles_canon,Solubility,occurence
0,O=C1c2ccccc2C(=O)c2c(Cl)cccc21,-5.540000,1.0
1,NC(N)=NS(=O)(=O)c1ccc(N)cc1,-1.984970,1.0
2,CCC1C(N)CN1c1cc2c(cc1F)c(=O)c(C(=O)O)cn2C1CC1,-3.912000,1.0
3,CCC1(CC)OC(=O)c2cc([N+](=O)[O-])ccc21,-3.656000,1.0
4,O=C1C=Cc2ccccc2C1=NO,-2.937000,1.0
...,...,...,...
19115,c1cnoc1,0.383095,2.0
19116,c1coc(-c2nc3ccccc3[nH]2)c1,-3.414016,2.0
19117,c1nc[nH]n1,0.897170,2.0
19118,c1ncc2[nH]cnc2n1,0.619695,2.0


In [119]:
df_concat = pd.concat([train_set, test_set])

# Drop duplicates SMILES
df_unique = df_concat.drop_duplicates(subset='smiles_canon', keep=False)

# Split the unique rows back into two DataFrames
df1_filtered = df_unique[df_unique['smiles_canon'].isin(train_set['smiles_canon'])].reset_index(drop=True)


In [120]:
df1_filtered

Unnamed: 0,smiles_canon,Solubility,occurence
0,O=C1c2ccccc2C(=O)c2c(Cl)cccc21,-5.540000,1.0
1,NC(N)=NS(=O)(=O)c1ccc(N)cc1,-1.984970,1.0
2,CCC1C(N)CN1c1cc2c(cc1F)c(=O)c(C(=O)O)cn2C1CC1,-3.912000,1.0
3,CCC1(CC)OC(=O)c2cc([N+](=O)[O-])ccc21,-3.656000,1.0
4,O=C1C=Cc2ccccc2C1=NO,-2.937000,1.0
...,...,...,...
17932,c1cnoc1,0.383095,2.0
17933,c1coc(-c2nc3ccccc3[nH]2)c1,-3.414016,2.0
17934,c1nc[nH]n1,0.897170,2.0
17935,c1ncc2[nH]cnc2n1,0.619695,2.0


In [24]:
# savinf the combine 6 dataset 
train_set.to_csv('data/combined_data6.csv')

In [121]:
### Remove the smiles from train dataset with matching test dataset...
smiles_train_canon=train_set.smiles_canon
smiles_test_canon=test_set.smiles_canon

In [122]:
overlap1 = 0
for x in smiles_train_canon:
    if x in smiles_test_canon:
        overlap1+=1
print("%i of the train molecules are in the test set"%(overlap1))

overlap2 = 0
for x in smiles_test_canon:
    if x in smiles_train_canon:
        overlap2+=1
print("%i of the test molecules are in the train set"%(overlap2))

0 of the train molecules are in the test set
0 of the test molecules are in the train set


In [123]:
Match_rows = pd.merge(test_set, train_set, on=['smiles_canon'], how='inner')
#mergedStuff.head()
print(len(Match_rows))

1183


In [124]:
### Finding the matching smiles in the train dataset..
cond = train_set['smiles_canon'].isin(Match_rows['smiles_canon'])

#### Dropping the smiles which is same and making dataframe with uniques smiles and no smiles same in ttrain  dataset 
train_set.drop(train_set[cond].index, inplace = True)

In [125]:
print(train_set.shape)
print(test_set.shape)

(17937, 3)
(1282, 3)


In [126]:
### Cross check in order to find that any matching smiles exist in the train dataset with test dataset
Match_rows = pd.merge(test_set, train_set, on=['smiles_canon'], how='inner')
#mergedStuff.head()
print(len(Match_rows))

0


In [127]:
train_set.reset_index(drop=True)

Unnamed: 0,smiles_canon,Solubility,occurence
0,O=C1c2ccccc2C(=O)c2c(Cl)cccc21,-5.540000,1.0
1,NC(N)=NS(=O)(=O)c1ccc(N)cc1,-1.984970,1.0
2,CCC1C(N)CN1c1cc2c(cc1F)c(=O)c(C(=O)O)cn2C1CC1,-3.912000,1.0
3,CCC1(CC)OC(=O)c2cc([N+](=O)[O-])ccc21,-3.656000,1.0
4,O=C1C=Cc2ccccc2C1=NO,-2.937000,1.0
...,...,...,...
17932,c1cnoc1,0.383095,2.0
17933,c1coc(-c2nc3ccccc3[nH]2)c1,-3.414016,2.0
17934,c1nc[nH]n1,0.897170,2.0
17935,c1ncc2[nH]cnc2n1,0.619695,2.0


In [128]:
print(train_set.shape)
print(test_set.shape)

(17937, 3)
(1282, 3)


In [129]:
### Saving the data to the disk 
train_set.to_csv('data/unique_train4_new24_new.csv')
test_set.to_csv('data/unique_test_new24.csv')

In [130]:
train_smiles = set(train_set['smiles_canon'])
test_smiles = set(test_set['smiles_canon'])
overlap = train_smiles.intersection(test_smiles)

if overlap:
    print("There are overlapping smiles between the training and test datasets:")
    print(overlap)
else:
    print("No overlapping smiles between the training and test datasets.")

No overlapping smiles between the training and test datasets.


In [169]:
#### End here ....