In [50]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

## Split molecules (train/validation vs. test)

In [51]:
# Load the per-molecule table (SMILES plus its Class_by_SMARTS label); the class
# label is the basis for the stratified molecule split further down.
df_unique_smiles_info = pd.read_excel("../../../s0_prepData/s0f_analyzeData_LnAn/output_uniqueSMILES_ClassBySMARTS.xlsx")
print(df_unique_smiles_info.shape)
df_unique_smiles_info.head(3)

(295, 4)


Unnamed: 0,SMILES_ID,SMILES,Class_by_SMARTS,SMILES_DOI
0,0,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,"Ether, Amide, Diglycolamide",https://doi.org/10.1002/chem.201806161
1,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@@H](C)C(=O...,"Ether, Amide, Diglycolamide",https://doi.org/10.1002/chem.201806161
2,2,CN(C(=O)COCC(=O)N(C)c1ccccc1)c1ccccc1,"Ether, Amide",https://doi.org/10.1007/BF02349516;https://doi...


In [52]:
# Number of molecules carrying each Class_by_SMARTS label; used below to spot
# classes that are too small to stratify on.
class_counts = df_unique_smiles_info['Class_by_SMARTS'].value_counts()
class_counts

Class_by_SMARTS
Amide                                                     65
Ether, Amide, Diglycolamide                               62
Ether, Amide                                              40
Amide, Pyridine                                           32
Pyridine                                                  23
Other                                                     13
Amide, Phosphine Oxides                                    9
Phosphine Oxides                                           7
Amide, Phosphine Oxides, Organophosphine Oxides            7
Ether                                                      7
Phosphine Oxides, Pyridine                                 5
Ether, Amide, Pyridine                                     4
Ether, Pyridine                                            4
Ketone, Pyridine                                           3
Hydroxyl, Phosphine Oxides                                 3
Hydroxyl, Carboxylic Acid                                  2
Ether, A

In [53]:
# Collapse every class that has 3 or fewer molecules into one 'Rare groups'
# label; classes with more than 3 molecules keep their original label.
df_unique_smiles_info['Class_by_SMARTS_combineRare'] = [
    label if class_counts[label] > 3 else 'Rare groups'
    for label in df_unique_smiles_info['Class_by_SMARTS']
]
print(df_unique_smiles_info.shape)
df_unique_smiles_info.head(3)

(295, 5)


Unnamed: 0,SMILES_ID,SMILES,Class_by_SMARTS,SMILES_DOI,Class_by_SMARTS_combineRare
0,0,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,"Ether, Amide, Diglycolamide",https://doi.org/10.1002/chem.201806161,"Ether, Amide, Diglycolamide"
1,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@@H](C)C(=O...,"Ether, Amide, Diglycolamide",https://doi.org/10.1002/chem.201806161,"Ether, Amide, Diglycolamide"
2,2,CN(C(=O)COCC(=O)N(C)c1ccccc1)c1ccccc1,"Ether, Amide",https://doi.org/10.1007/BF02349516;https://doi...,"Ether, Amide"


In [54]:
# Class sizes after the rare classes were merged; these are the labels the
# stratified split below is based on.
class_counts_rareGroup = df_unique_smiles_info['Class_by_SMARTS_combineRare'].value_counts()
class_counts_rareGroup

Class_by_SMARTS_combineRare
Amide                                              65
Ether, Amide, Diglycolamide                        62
Ether, Amide                                       40
Amide, Pyridine                                    32
Pyridine                                           23
Rare groups                                        17
Other                                              13
Amide, Phosphine Oxides                             9
Ether                                               7
Amide, Phosphine Oxides, Organophosphine Oxides     7
Phosphine Oxides                                    7
Phosphine Oxides, Pyridine                          5
Ether, Pyridine                                     4
Ether, Amide, Pyridine                              4
Name: count, dtype: int64

In [55]:
# Hold out 5% of the molecules as the test set, stratified on the rare-merged
# class label. random_state is fixed so the split is reproducible.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.05, random_state=0)

# n_splits=1, so take the single (train, test) index pair directly.
train_idx, test_idx = next(
    splitter.split(df_unique_smiles_info, df_unique_smiles_info['Class_by_SMARTS_combineRare'])
)

# split() yields *positional* indices, so rows must be selected with .iloc.
# .loc would interpret the numbers as index labels and only gives the right
# rows by coincidence when the frame has a default 0..n-1 RangeIndex.
train_set_SMILES = df_unique_smiles_info.iloc[train_idx]
test_set_SMILES = df_unique_smiles_info.iloc[test_idx]

print(train_set_SMILES.shape)
print(test_set_SMILES.shape)
test_set_SMILES

(280, 5)
(15, 5)


Unnamed: 0,SMILES_ID,SMILES,Class_by_SMARTS,SMILES_DOI,Class_by_SMARTS_combineRare
175,175,CCCCC(CC)CP(=O)(CC(=O)N(CC(C)C)CC(C)C)c1ccccc1,"Amide, Phosphine Oxides",https://doi.org/10.1080/07366298608917877,"Amide, Phosphine Oxides"
35,35,CCCCCCCCN(CCCCCCCC)C(=O)COCC(=O)NCCNC(=O)c1cc(...,"Ether, Amide",https://doi.org/10.1039/C6DT04034A;https://doi...,"Ether, Amide"
145,145,CCCCCCCCNC(=O)c1cnccn1,Amide,https://cordis.europa.eu/project/id/211267,Amide
133,133,CCCCC(CC)CN(CC(CC)CCCC)C(=O)CN(CC(=O)N(CC(CC)C...,Amide,https://cordis.europa.eu/project/id/211267,Amide
243,243,CCCCN(CCCC)C(=O)COCC(=O)NCCCCCNC(=O)COCC(=O)N(...,"Ether, Amide",https://doi.org/10.1080/07366290802672212,"Ether, Amide"
53,53,CCCCCCCCCCCCN(CCC)C(=O)COCC(=O)N(CCC)CCCCCCCCCCCC,"Ether, Amide, Diglycolamide",https://doi.org/10.2298/JSC171109043H,"Ether, Amide, Diglycolamide"
14,14,CCCCN(CCCC)C(=O)COCC(=O)N(CCCC)CCCC,"Ether, Amide, Diglycolamide",https://doi.org/10.1007/s41365-017-0229-4;http...,"Ether, Amide, Diglycolamide"
169,169,CCCCCCCCN(CCCCCCCC)C(=O)COCP(=O)(CCCC)CCCC,"Ether, Amide, Phosphine Oxides, Organophosphin...",https://cordis.europa.eu/project/id/211267,Rare groups
228,228,CCCCNC(=S)C(CC)C(=S)Nc1cccc(NC(=S)C(CC)C(=S)NC...,Other,https://doi.org/10.1524/ract.2008.1485,Other
63,63,CCN(C(=O)c1cccc(-c2cccc(C(=O)N(CC)c3cc(C)ccc3C...,"Amide, Pyridine",https://doi.org/10.1039/C8DT03734E,"Amide, Pyridine"


In [56]:
# Class distribution of the held-out test molecules.
test_set_SMILES['Class_by_SMARTS_combineRare'].value_counts()

Class_by_SMARTS_combineRare
Amide                          3
Ether, Amide, Diglycolamide    3
Ether, Amide                   2
Amide, Pyridine                2
Amide, Phosphine Oxides        1
Rare groups                    1
Other                          1
Ether                          1
Pyridine                       1
Name: count, dtype: int64

In [57]:
# Save the non-test molecules to Excel; index=False keeps the row index out of the file.
train_set_SMILES.to_excel('output_trainset_uniqueSMILES.xlsx', index=False)

## Split entries (by held-out molecule)

In [58]:
# SMILES strings of the held-out test molecules; used later to route entries.
smiles_testDataset = test_set_SMILES['SMILES']
print(len(smiles_testDataset))
print(len(set(smiles_testDataset)))
# These two numbers should be the same: a duplicated SMILES would mean the
# molecule table is not unique per molecule. Enforce it instead of eyeballing.
assert len(set(smiles_testDataset)) == len(smiles_testDataset), \
    "Duplicate SMILES found in the test molecule set"

15
15


In [59]:
# Load the full entry-level database. header=1 takes the column names from the
# second row of the sheet (the row above it is skipped).
df = pd.read_excel("../../../s0_prepData/s0e_prepData_LnAn/db_LnAn_full.xlsx", header=1)
print(df.shape)
df.head(3)

(8075, 17)


Unnamed: 0,Entry_ID,Extractants_count,SMILES,Extractant_conc_M,Solvent_A,Solvent_B,Volume_fraction_A,Volume_fraction_B,Metal,Metal_conc_mM,Acid_type,Acid_conc_M,Temperature_K,Distribution_ratio,Log_D,DOI,Comments
0,0,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,La(III),0.01,HNO3,4.3,295.15,0.321764,-0.492462,https://doi.org/10.1002/chem.201806161,DGA LANL
1,1,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Ce(III),0.01,HNO3,4.3,295.15,0.622257,-0.20603,https://doi.org/10.1002/chem.201806161,DGA LANL
2,2,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Pr(III),0.01,HNO3,4.3,295.15,0.757525,-0.120603,https://doi.org/10.1002/chem.201806161,DGA LANL


In [60]:
def classify_D(x):
    """Bin a distribution ratio into one of four ordinal classes.

    Bins (lower bound inclusive, upper bound exclusive):
        0: x < 0.1
        1: 0.1 <= x < 1
        2: 1 <= x < 15
        3: everything else (x >= 15, and any value that does not compare
           less than 15, such as NaN)

    Parameters
    ----------
    x : number
        Distribution ratio to classify.

    Returns
    -------
    int
        Class index in {0, 1, 2, 3}.
    """
    upper_bounds = (0.1, 1, 15)
    # The first bound that x falls below determines its class.
    for class_index, bound in enumerate(upper_bounds):
        if x < bound:
            return class_index
    # Not below any bound: top class.
    return len(upper_bounds)

# Label every entry with the class (0-3) of its distribution ratio.
df['Class_index'] = df['Distribution_ratio'].apply(classify_D)
print(df.shape)
df.head()

(8075, 18)


Unnamed: 0,Entry_ID,Extractants_count,SMILES,Extractant_conc_M,Solvent_A,Solvent_B,Volume_fraction_A,Volume_fraction_B,Metal,Metal_conc_mM,Acid_type,Acid_conc_M,Temperature_K,Distribution_ratio,Log_D,DOI,Comments,Class_index
0,0,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,La(III),0.01,HNO3,4.3,295.15,0.321764,-0.492462,https://doi.org/10.1002/chem.201806161,DGA LANL,1
1,1,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Ce(III),0.01,HNO3,4.3,295.15,0.622257,-0.20603,https://doi.org/10.1002/chem.201806161,DGA LANL,1
2,2,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Pr(III),0.01,HNO3,4.3,295.15,0.757525,-0.120603,https://doi.org/10.1002/chem.201806161,DGA LANL,1
3,3,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Nd(III),0.01,HNO3,4.3,295.15,0.77526,-0.110553,https://doi.org/10.1002/chem.201806161,DGA LANL,1
4,4,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Sm(III),0.01,HNO3,4.3,295.15,0.830994,-0.080402,https://doi.org/10.1002/chem.201806161,DGA LANL,1


In [61]:
# Number of entries in each distribution-ratio class, ordered by class index.
df['Class_index'].value_counts().sort_index()

Class_index
0    1967
1    1753
2    2456
3    1899
Name: count, dtype: int64

In [62]:
def classify_smiles(smiles, smiles_info=None):
    """Return the rare-merged SMARTS class of a molecule.

    Parameters
    ----------
    smiles : str
        SMILES string to look up.
    smiles_info : pandas.DataFrame, optional
        Lookup table with 'SMILES' and 'Class_by_SMARTS_combineRare' columns.
        Defaults to the notebook-level ``df_unique_smiles_info``.

    Returns
    -------
    The 'Class_by_SMARTS_combineRare' value of the first row whose 'SMILES'
    equals ``smiles``.

    Raises
    ------
    IndexError
        If ``smiles`` does not occur in the lookup table. The exception type
        matches the previous bare ``.values[0]`` failure, but the message now
        names the offending SMILES.
    """
    if smiles_info is None:
        smiles_info = df_unique_smiles_info

    matches = smiles_info.loc[smiles_info['SMILES'] == smiles, 'Class_by_SMARTS_combineRare']
    if matches.empty:
        raise IndexError(f"SMILES {smiles!r} not found in the unique-SMILES table")

    return matches.iloc[0]

# Attach to every entry the rare-merged SMARTS class of its molecule.
df['SMILES_class'] = df['SMILES'].apply(classify_smiles)
print(df.shape)
df.head(3)

(8075, 19)


Unnamed: 0,Entry_ID,Extractants_count,SMILES,Extractant_conc_M,Solvent_A,Solvent_B,Volume_fraction_A,Volume_fraction_B,Metal,Metal_conc_mM,Acid_type,Acid_conc_M,Temperature_K,Distribution_ratio,Log_D,DOI,Comments,Class_index,SMILES_class
0,0,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,La(III),0.01,HNO3,4.3,295.15,0.321764,-0.492462,https://doi.org/10.1002/chem.201806161,DGA LANL,1,"Ether, Amide, Diglycolamide"
1,1,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Ce(III),0.01,HNO3,4.3,295.15,0.622257,-0.20603,https://doi.org/10.1002/chem.201806161,DGA LANL,1,"Ether, Amide, Diglycolamide"
2,2,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Pr(III),0.01,HNO3,4.3,295.15,0.757525,-0.120603,https://doi.org/10.1002/chem.201806161,DGA LANL,1,"Ether, Amide, Diglycolamide"


In [63]:
# Route each entry by its molecule: entries whose SMILES belongs to a held-out
# test molecule go to the test set, all remaining entries to train/validation.
is_test_entry = df['SMILES'].isin(smiles_testDataset)
testDataset_df = df[is_test_entry]
trainValDataset_df = df[~is_test_entry]
print(trainValDataset_df.shape)
print(testDataset_df.shape)

(7581, 19)
(494, 19)


In [64]:
# Distribution-ratio class counts among the test entries, ordered by class index.
testDataset_df['Class_index'].value_counts().sort_index()

Class_index
0     82
1    134
2    211
3     67
Name: count, dtype: int64

In [65]:
# Write both entry-level subsets into a single workbook, one sheet per subset,
# without the row index.
output_file = "output_original_MLdataset.xlsx"
with pd.ExcelWriter(output_file) as writer:
    for sheet_name, sheet_df in (
        ('trainVal_dataset', trainValDataset_df),
        ('test_dataset', testDataset_df),
    ):
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)