In [None]:
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [7]:
# Load the train/validation and test sheets from the same split workbook.
split_workbook = "../0_splitData/output_original_MLdataset.xlsx"
trainVal_df = pd.read_excel(split_workbook, sheet_name='trainVal_dataset')
test_df = pd.read_excel(split_workbook, sheet_name='test_dataset')

# Report (rows, columns) of each split, trainVal first.
for split_frame in (trainVal_df, test_df):
    print(split_frame.shape)

(7581, 19)
(494, 19)


In [8]:
# Stack trainVal on top of test under a fresh 0..n-1 index, so that the first
# len(trainVal_df) rows of `df` are the trainVal rows and the rest are test rows.
df = pd.concat([trainVal_df, test_df], ignore_index=True)
df.head()

Unnamed: 0,Entry_ID,Extractants_count,SMILES,Extractant_conc_M,Solvent_A,Solvent_B,Volume_fraction_A,Volume_fraction_B,Metal,Metal_conc_mM,Acid_type,Acid_conc_M,Temperature_K,Distribution_ratio,Log_D,DOI,Comments,Class_index,SMILES_class
0,0,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,La(III),0.01,HNO3,4.3,295.15,0.321764,-0.492462,https://doi.org/10.1002/chem.201806161,DGA LANL,0,"Ether, Amide, Diglycolamide"
1,1,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Ce(III),0.01,HNO3,4.3,295.15,0.622257,-0.20603,https://doi.org/10.1002/chem.201806161,DGA LANL,1,"Ether, Amide, Diglycolamide"
2,2,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Pr(III),0.01,HNO3,4.3,295.15,0.757525,-0.120603,https://doi.org/10.1002/chem.201806161,DGA LANL,1,"Ether, Amide, Diglycolamide"
3,3,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Nd(III),0.01,HNO3,4.3,295.15,0.77526,-0.110553,https://doi.org/10.1002/chem.201806161,DGA LANL,1,"Ether, Amide, Diglycolamide"
4,4,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Sm(III),0.01,HNO3,4.3,295.15,0.830994,-0.080402,https://doi.org/10.1002/chem.201806161,DGA LANL,1,"Ether, Amide, Diglycolamide"


## Prepare extractant features

In [9]:
# Build the extractant features for every row of `df`:
#   * a Morgan (circular) fingerprint bit vector, and
#   * every descriptor listed in rdkit.Chem.Descriptors.descList.
FP_RADIUS = 4
FP_NBITS = 2048

# Featurise each distinct SMILES only once; the result depends only on the
# SMILES string, so repeated rows can reuse it.
features_by_smiles = {}
for smile in df['SMILES'].unique():
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles returns None (instead of raising) when parsing fails;
        # fail here with the offending string rather than inside the fingerprint call.
        raise ValueError(f"RDKit could not parse SMILES: {smile!r}")

    # NOTE(review): GetMorganFingerprintAsBitVect appears to be deprecated in recent
    # RDKit releases in favour of rdFingerprintGenerator; kept to preserve the bits.
    circular_fp_feature = list(
        AllChem.GetMorganFingerprintAsBitVect(mol, radius=FP_RADIUS, nBits=FP_NBITS)
    )
    rdkit_desc_feature = [desc(mol) for _, desc in Descriptors.descList]
    features_by_smiles[smile] = (circular_fp_feature, rdkit_desc_feature)

# Expand back to one feature row per row of `df`, in the original row order.
circular_fps_list = [features_by_smiles[smile][0] for smile in df['SMILES']]
rdkit_descs_list = [features_by_smiles[smile][1] for smile in df['SMILES']]

circular_fp_df = pd.DataFrame(circular_fps_list, columns=[f'CircularFP_{i}' for i in range(FP_NBITS)])
rdkit_desc_df = pd.DataFrame(rdkit_descs_list, columns=[desc_name for desc_name, _ in Descriptors.descList])

# Fingerprint bits first, then descriptors, side by side.
ml_df = pd.concat([circular_fp_df, rdkit_desc_df], axis=1)
print(ml_df.shape)

(8075, 2258)


In [10]:
# Drop the 'Ipc' descriptor column, which contains very large values.
ml_df = ml_df.drop('Ipc', axis=1)
print(ml_df.shape)

(8075, 2257)


In [11]:
# Summarise the extractant concentration column: minimum, maximum, median.
extractant_conc = df['Extractant_conc_M']
min_value, max_value, median_value = (
    extractant_conc.min(),
    extractant_conc.max(),
    extractant_conc.median(),
)
print(min_value, max_value, median_value, sep='\n')

0.00011
10.5
0.1


In [12]:
# Append the extractant concentration as one more feature column.
extractant_conc_col = df[['Extractant_conc_M']]
ml_df = pd.concat([ml_df, extractant_conc_col], axis=1)
print(ml_df.shape)

(8075, 2258)


## Prepare solvent features

In [13]:
# Append the volume fractions of solvent A and solvent B.
volume_fraction_cols = df[['Volume_fraction_A', 'Volume_fraction_B']]
ml_df = pd.concat([ml_df, volume_fraction_cols], axis=1)
print(ml_df.shape)

(8075, 2260)


In [14]:
# Solvent descriptor table; the first sheet column becomes the index used for lookups.
conditions_workbook = "../../../s0_prepData/s0e_prepData_LnAn/db_LnAn_conditions_desp.xlsx"
solvents_desp_df = pd.read_excel(
    conditions_workbook,
    sheet_name='solvents_descriptors',
    index_col=0,
)
print(solvents_desp_df.shape)
solvents_desp_df.head()

(45, 7)


Unnamed: 0_level_0,Molar_mass(g/mol),Log_P,Boiling_point(K),Melting_point(K),Density(g/mL),Solubility_in_water(g/L),Data_source
Solvent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
hydrogenated tetrapropylene,170.34,4.4949,489.427778,263.538889,0.7487,4e-06,MW and logP is calculated from rdkit (check co...
chloroform,119.378,1.9864,334.816667,209.65,1.4832,7.95,MW and logP is calculated from rdkit (check co...
1-dodecanol,186.339,3.8996,532.038889,297.038889,0.831,0.004,MW and logP is calculated from rdkit (check co...
n-octane,114.232,3.3668,398.761111,216.372222,0.703,0.00066,MW and logP is calculated from rdkit (check co...
toluene,92.141,1.99502,383.761111,178.15,0.867,0.526,MW and logP is calculated from rdkit (check co...


In [15]:
# Feature column names for the two solvents: every descriptor column except the
# last two, first with an "_A" suffix and then with a "_B" suffix.
solvent_feature_cols = solvents_desp_df.columns[0:-2]
solvents_ml_columns_name_list = [
    col_name + suffix
    for suffix in ("_A", "_B")
    for col_name in solvent_feature_cols
]
solvents_ml_columns_name_list

['Molar_mass(g/mol)_A',
 'Log_P_A',
 'Boiling_point(K)_A',
 'Melting_point(K)_A',
 'Density(g/mL)_A',
 'Molar_mass(g/mol)_B',
 'Log_P_B',
 'Boiling_point(K)_B',
 'Melting_point(K)_B',
 'Density(g/mL)_B']

In [16]:
# Number of descriptor values kept per solvent (first row, last two columns dropped).
first_solvent_features = solvents_desp_df.iloc[0].iloc[0:-2]
solvents_desp_length = len(first_solvent_features)
print(solvents_desp_length)
first_solvent_features

5


Molar_mass(g/mol)        170.34
Log_P                    4.4949
Boiling_point(K)     489.427778
Melting_point(K)     263.538889
Density(g/mL)            0.7487
Name: hydrogenated tetrapropylene, dtype: object

In [17]:
# For every row, look up the descriptors of solvent A and solvent B and place
# them side by side. A missing solvent B is encoded as all zeros.
solvents_list = []

for solvent_a, solvent_b in zip(df['Solvent_A'], df['Solvent_B']):
    # First `solvents_desp_length` descriptor values of solvent A.
    features_a = list(solvents_desp_df.loc[solvent_a].iloc[0:solvents_desp_length])

    if not pd.isnull(solvent_b):
        features_b = list(solvents_desp_df.loc[solvent_b].iloc[0:solvents_desp_length])
    else:
        features_b = [0] * solvents_desp_length

    solvents_list.append(features_a + features_b)

solvents_ml_df = pd.DataFrame(solvents_list, columns=solvents_ml_columns_name_list)

ml_df = pd.concat([ml_df, solvents_ml_df], axis=1)
print(ml_df.shape)
ml_df.head()

(8075, 2270)


Unnamed: 0,CircularFP_0,CircularFP_1,CircularFP_2,CircularFP_3,CircularFP_4,CircularFP_5,CircularFP_6,CircularFP_7,CircularFP_8,CircularFP_9,...,Molar_mass(g/mol)_A,Log_P_A,Boiling_point(K)_A,Melting_point(K)_A,Density(g/mL)_A,Molar_mass(g/mol)_B,Log_P_B,Boiling_point(K)_B,Melting_point(K)_B,Density(g/mL)_B
0,1,1,0,0,0,0,0,0,0,0,...,170.34,4.4949,489.427778,263.538889,0.7487,0.0,0.0,0.0,0.0,0.0
1,1,1,0,0,0,0,0,0,0,0,...,170.34,4.4949,489.427778,263.538889,0.7487,0.0,0.0,0.0,0.0,0.0
2,1,1,0,0,0,0,0,0,0,0,...,170.34,4.4949,489.427778,263.538889,0.7487,0.0,0.0,0.0,0.0,0.0
3,1,1,0,0,0,0,0,0,0,0,...,170.34,4.4949,489.427778,263.538889,0.7487,0.0,0.0,0.0,0.0,0.0
4,1,1,0,0,0,0,0,0,0,0,...,170.34,4.4949,489.427778,263.538889,0.7487,0.0,0.0,0.0,0.0,0.0


## Prepare acid features

In [18]:
# Acid descriptor table; the first sheet column becomes the index used for lookups.
conditions_workbook = "../../../s0_prepData/s0e_prepData_LnAn/db_LnAn_conditions_desp.xlsx"
acids_desp_df = pd.read_excel(
    conditions_workbook,
    sheet_name='acids',
    index_col=0,
)
print(acids_desp_df.shape)
acids_desp_df

(9, 2)


Unnamed: 0_level_0,Dipole_moment_D,Data_source
Acid_type,Unnamed: 1_level_1,Unnamed: 2_level_1
HNO3,2.17,https://en.wikipedia.org/wiki/Nitric_acid
HCl,1.05,https://en.wikipedia.org/wiki/Hydrogen_chloride
H2SO4,2.72,https://www.wikidata.org/wiki/Q4118
citric acid,3.33,https://doi.org/10.3390/ijms22147676
lactic acid,3.624,https://doi.org/10.1038/s41598-019-56019-4
malonic acid,2.07,https://doi.org/10.1016/0009-2614(86)80129-4
tartaric acid,3.3505,"Ekincioğlu, Y., Kiliç, H.Ş. and Dereli, Ö., 20..."
HClO4,2.28,https://doi.org/10.1016/j.jms.2005.07.012
organic acid,1.4,https://doi.org/10.1021/jacsau.2c00122


In [19]:
# Summarise the acid concentration column: minimum, maximum, median.
acid_conc = df['Acid_conc_M']
min_value, max_value, median_value = (
    acid_conc.min(),
    acid_conc.max(),
    acid_conc.median(),
)
print(min_value, max_value, median_value, sep='\n')

0.0
15.0
1.5


In [20]:
# For each row, take the first descriptor column of the acid table for that
# row's acid type (one single-element list per row).
acid_feature_name = acids_desp_df.columns[0]
acids_list = [
    [acids_desp_df.loc[acid_type].iloc[0]]
    for acid_type in df['Acid_type']
]

acids_ml_df = pd.DataFrame(acids_list, columns=[acid_feature_name])

# Append the acid descriptor together with the acid concentration.
ml_df = pd.concat([ml_df, acids_ml_df, df[['Acid_conc_M']]], axis=1)
print(ml_df.shape)

(8075, 2272)


## Temperature

In [21]:
# Append the temperature column as a feature.
temperature_col = df[['Temperature_K']]
ml_df = pd.concat([ml_df, temperature_col], axis=1)
ml_df.shape

(8075, 2273)

## Prepare metal features

In [22]:
# Replace zero metal concentrations with one tenth of the smallest non-zero
# concentration, so the column no longer contains exact zeros.
nonzero_metal_conc = df.loc[df['Metal_conc_mM'] != 0, 'Metal_conc_mM']
min_value = nonzero_metal_conc.min()
zero_substitute = min_value/10
print('min/10 of Metal_conc_mM: ', zero_substitute)
df['Metal_conc_mM'] = df['Metal_conc_mM'].replace(0, zero_substitute)

min/10 of Metal_conc_mM:  1e-10


In [23]:
# Summarise the metal concentration column: minimum, maximum, median.
metal_conc = df['Metal_conc_mM']
min_value, max_value, median_value = (
    metal_conc.min(),
    metal_conc.max(),
    metal_conc.median(),
)
print(min_value, max_value, median_value, sep='\n')

1e-10
1250.0
0.0001


In [24]:
# Base-10 logarithm of the metal concentration, stored as a new column of `df`
# (np.log10 returns a new Series; the source column is left unchanged).
metal_conc_mM = df['Metal_conc_mM']
df['Log_metal_conc_mM'] = np.log10(metal_conc_mM)

In [25]:
# Append the log-scaled metal concentration as a feature.
log_metal_conc_col = df[['Log_metal_conc_mM']]
ml_df = pd.concat([ml_df, log_metal_conc_col], axis=1)
print(ml_df.shape)

(8075, 2274)


In [26]:
# Metal descriptor table; the first sheet column becomes the index used for lookups.
conditions_workbook = "../../../s0_prepData/s0e_prepData_LnAn/db_LnAn_conditions_desp.xlsx"
metals_desp_df = pd.read_excel(
    conditions_workbook,
    sheet_name='metals',
    index_col=0,
)
print(metals_desp_df.shape)
metals_desp_df.head()

(28, 12)


Unnamed: 0_level_0,Atomic_number,Melting_point_K,Boiling_point_K,Density_g/cm3,First_IE_kJ/mol,Second_IE_kJ/mol,Third_IE_kJ/mol,Matallic_radius_nm,Pauling_EN,Ionic_radius_nm,Oxidation_state,Data_source
Metal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
La(III),57,1193.15,3737.15,6.162,538.1,1067.0,1850.3,0.162,1.1,0.1172,3,"https://webelements.com/, https://pubs.acs.org..."
Ce(III),58,1068.15,3633.15,6.77,534.4,1046.9,1948.8,0.1818,1.12,0.115,3,"https://webelements.com/, https://pubs.acs.org..."
Pr(III),59,1208.15,3563.15,6.77,528.1,1017.9,2086.4,0.1824,1.13,0.113,3,"https://webelements.com/, https://pubs.acs.org..."
Nd(III),60,1297.15,3373.15,7.01,533.1,1034.3,2132.3,0.1814,1.14,0.1123,3,"https://webelements.com/, https://pubs.acs.org..."
Pm(III),61,1315.15,3273.15,7.26,538.1,1055.4,2170.0,0.1834,1.13,0.111,3,"https://webelements.com/, https://pubs.acs.org..."


In [27]:
# Number of descriptor values per metal (first row, last column dropped).
first_metal_features = metals_desp_df.iloc[0].iloc[0:-1]
metal_desp_length = len(first_metal_features)
print(metal_desp_length)

11


In [28]:
# For each row, look up that row's metal in the descriptor table and keep every
# value except the last column.
metal_feature_cols = metals_desp_df.columns[0:-1]
metal_list = [
    list(metals_desp_df.loc[metal_name].iloc[0:-1])
    for metal_name in df['Metal']
]

metals_ml_df = pd.DataFrame(metal_list, columns=metal_feature_cols)

ml_df = pd.concat([ml_df, metals_ml_df], axis=1)
ml_df.shape

(8075, 2285)

## Distribution ratio

In [29]:
# Append the 'Class_index' column from the source table.
class_index_col = df[['Class_index']]
ml_df = pd.concat([ml_df, class_index_col], axis=1)

In [30]:
# Append, in this order, the four trailing columns: SMILES, SMILES_class,
# Distribution_ratio and Log_D. Later checks slice them off with `iloc[:, 0:-4]`.
trailing_cols = ['SMILES', 'SMILES_class', 'Distribution_ratio', 'Log_D']
ml_df = pd.concat([ml_df, df[trailing_cols]], axis=1)

## Test and Check the data

In [31]:
# Check for NaN values: one boolean per column, True if any entry is missing.
has_nan = ml_df.isnull().any()

print("The column contain Nan values:")
# Print only the names of the columns that are flagged.
for column_name in has_nan[has_nan].index:
    print(column_name)

The column contain Nan values:


In [32]:
# Check that every column except the last four is numeric.
from pandas.api.types import is_numeric_dtype

feature_values = ml_df.iloc[:, :-4].values
print(is_numeric_dtype(feature_values))
# should return True

True


In [33]:
# Check for very large magnitudes.
large_value_threshold = 1e10

# Global check over every column except the last four.
feature_magnitudes = np.abs(ml_df.iloc[:, 0:-4].values)
print((feature_magnitudes > large_value_threshold).any())
# should return False

# Per-column check over all numeric columns, to name any offenders.
large_value_columns = []
for column in ml_df.columns:
    column_values = ml_df[column]
    if not is_numeric_dtype(column_values.values):
        continue
    if (np.abs(column_values) > large_value_threshold).any():
        large_value_columns.append(column)
        print(f"Column '{column}' contains values larger than {large_value_threshold}.")

print("Columns with very large values:", large_value_columns)
# should return empty list

False
Columns with very large values: []


## Feature selection

In [34]:
# Keep only the columns that have at least one entry different from zero.
has_nonzero_entry = (ml_df != 0).any(axis=0)
ml_df = ml_df.loc[:, has_nonzero_entry]
print(ml_df.shape)

(8075, 1865)


In [35]:
# Save all ML features
cols_df = pd.Series(ml_df.columns, name="ML_features")
print(cols_df.shape)
cols_df.to_excel('ML_features_all.xlsx',index=False)

(1865,)


## Output

In [36]:
# Split the feature table back into the original partitions. `df` was built as
# trainVal rows followed by test rows, so the split is purely positional.
n_trainVal = trainVal_df.shape[0]
trainValDataset_ml_df = ml_df.iloc[:n_trainVal]
testDataset_ml_df = ml_df.iloc[n_trainVal:]

# Enforce that the row counts match the earlier split instead of relying on a
# visual check of the printed shapes.
assert trainValDataset_ml_df.shape[0] == trainVal_df.shape[0], (
    f"trainVal rows {trainValDataset_ml_df.shape[0]} != expected {trainVal_df.shape[0]}"
)
assert testDataset_ml_df.shape[0] == test_df.shape[0], (
    f"test rows {testDataset_ml_df.shape[0]} != expected {test_df.shape[0]}"
)

print("trainVal size: ", trainValDataset_ml_df.shape)
print("test size: ", testDataset_ml_df.shape)
# This should match the trainVal and test size we split previously
# This should match the trainVal and test size we split previously

trainVal size:  (7581, 1865)
test size:  (494, 1865)


In [37]:
# Write each partition to its own CSV file, without the index column.
csv_outputs = (
    (trainValDataset_ml_df, 'trainVal_dataset.csv'),
    (testDataset_ml_df, 'test_dataset.csv'),
)
for partition_df, csv_name in csv_outputs:
    partition_df.to_csv(csv_name, index=False)